detail adjustment & formatting
jingyaogong committed Nov 3, 2024
1 parent c115169 commit 1fc9815
Showing 13 changed files with 23 additions and 34 deletions.
1 change: 0 additions & 1 deletion .envrc

This file was deleted.

9 changes: 0 additions & 9 deletions .gitignore

This file was deleted.

2 changes: 1 addition & 1 deletion 1-pretrain_vlm.py
@@ -173,7 +173,7 @@ def init_distributed_mode():
     if args.visual_encoder == "clip":
         lm_config = LMConfig()
     else:
-        lm_config = LMConfig(image_special_token='<'*98+'>'*98, image_ids=[30]*98+[32]*98)
+        lm_config = LMConfig(image_special_token='<' * 98 + '>' * 98, image_ids=[30] * 98 + [32] * 98)
 
     max_seq_len = lm_config.max_seq_len
     args.save_dir = os.path.join(args.out_dir)
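The non-clip branch uses a 98+98-character placeholder with matching ids, while the clip default in LMConfig uses 25+25. A hedged sanity check, not part of the commit, that the placeholder string and the hard-coded ids stay in lockstep (find_indices in model/model.py matches them token for token):

image_special_token = '<' * 98 + '>' * 98
image_ids = [30] * 98 + [32] * 98
assert len(image_special_token) == len(image_ids) == 196
# '<' is assumed to encode to id 30 and '>' to id 32, per the hard-coded list
assert image_ids == [30 if c == '<' else 32 for c in image_special_token]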
14 changes: 7 additions & 7 deletions 2-sft_vlm.py
@@ -97,9 +97,9 @@ def train_epoch(epoch, wandb):
         if (step + 1) % args.save_interval == 0 and (not ddp or dist.get_rank() == 0):
             model.eval()
             moe_path = '_moe' if lm_config.use_moe else ''
-            if args.multi: # save weights from multi-image training
+            if args.multi:  # save weights from multi-image training
                 ckp = f'{args.save_dir}/{lm_config.dim}{moe_path}_vlm_sft_multi.pth'
-            else: # save weights from single-image training
+            else:  # save weights from single-image training
                 ckp = f'{args.save_dir}/{lm_config.dim}{moe_path}_vlm_sft.pth'
 
             if isinstance(model, torch.nn.parallel.DistributedDataParallel):
@@ -197,7 +197,7 @@ def init_distributed_mode():
     if args.visual_encoder == "clip":
         lm_config = LMConfig()
     else:
-        lm_config = LMConfig(image_special_token='<'*98+'>'*98, image_ids=[30]*98+[32]*98)
+        lm_config = LMConfig(image_special_token='<' * 98 + '>' * 98, image_ids=[30] * 98 + [32] * 98)
 
     max_seq_len = lm_config.max_seq_len
     args.save_dir = os.path.join(args.out_dir)
@@ -229,12 +229,12 @@ def init_distributed_mode():
     if args.multi:
         print("Running multi-image training; this is recommended after instruction fine-tuning...")
         train_ds = SFTDataset_multi(args.data_path_multi, tokenizer, vision_model=(vision_model, preprocess),
-                                   image_special_token=lm_config.image_special_token,
-                                   max_length=max_seq_len)
+                                    image_special_token=lm_config.image_special_token,
+                                    max_length=max_seq_len)
     else:
         train_ds = SFTDataset(args.data_path, tokenizer, vision_model=(vision_model, preprocess),
-                             image_special_token=lm_config.image_special_token,
-                             max_length=max_seq_len)
+                              image_special_token=lm_config.image_special_token,
+                              max_length=max_seq_len)
     train_sampler = DistributedSampler(train_ds) if ddp else None
     train_loader = DataLoader(
         train_ds,
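For reference, a sketch of the two checkpoint paths the save hunk above produces, with assumed values (LMConfig's default dim=512, use_moe=False, and a hypothetical save_dir):

save_dir, dim, moe_path = 'out', 512, ''            # hypothetical run settings
multi_ckp = f'{save_dir}/{dim}{moe_path}_vlm_sft_multi.pth'
single_ckp = f'{save_dir}/{dim}{moe_path}_vlm_sft.pth'
print(multi_ckp)   # out/512_vlm_sft_multi.pth
print(single_ckp)  # out/512_vlm_sft.pth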
6 changes: 3 additions & 3 deletions 3-eval_chat.py
@@ -66,20 +66,20 @@ def setup_seed(seed):
     device = 'cuda:0' if torch.cuda.is_available() else 'cpu'
     dtype = 'bfloat16'
     max_seq_len = 1024
-    encoder_type="clip"
+    encoder_type = "clip"
     # lm_config = LMConfig()
     if encoder_type == "clip":
         lm_config = LMConfig()
     else:
-        lm_config = LMConfig(image_special_token='<'*98+'>'*98, image_ids=[30]*98+[32]*98)
+        lm_config = LMConfig(image_special_token='<' * 98 + '>' * 98, image_ids=[30] * 98 + [32] * 98)
     lm_config.max_seq_len = max_seq_len
     model, tokenizer, vision_model, preprocess = init_model(lm_config, device, multi)
     model.eval()
 
     # -------------------------- prompt and directory setup -----------------------------------
     if multi:
         image_dir = './dataset/eval_multi_images/bird/'
-        prompt = "<image>\n<image>\nName all the differences between these two birds."
+        prompt = f"{lm_config.image_special_token}\n{lm_config.image_special_token}\nName all the differences between these two birds."
     else:
         image_dir = './dataset/eval_images/'
         prompt = lm_config.image_special_token + '\nWhat does this image describe?'
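The prompt change here is more than cosmetic: a literal "<image>" tag never matches lm_config.image_special_token, so find_indices in model/model.py would not locate the image span at all. A minimal sketch using the non-clip values from the hunk above:

image_token = '<' * 98 + '>' * 98                   # lm_config.image_special_token
prompt = f"{image_token}\n{image_token}\nName all the differences between these two birds."
assert prompt.count(image_token) == 2               # both image slots present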
6 changes: 3 additions & 3 deletions model/LMConfig.py
@@ -7,8 +7,8 @@ class LMConfig(PretrainedConfig):
 
     def __init__(
             self,
-            dim: int = 512, # 768
-            n_layers: int = 8, # 16
+            dim: int = 512,  # 768
+            n_layers: int = 8,  # 16
             n_heads: int = 16,
             n_kv_heads: int = 8,
             vocab_size: int = 6400,
@@ -19,7 +19,7 @@ def __init__(
             dropout: float = 0.0,
             flash_attn: bool = True,
             image_special_token: str = '<' * 25 + '>' * 25,
-            image_ids=[30] * 25 + [32] * 25,
+            image_ids: List = [30] * 25 + [32] * 25,
             ####################################################
             # Here are the specific configurations of MOE
             # When use_moe is false, the following is invalid
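Two side notes on this hunk. First, the new "image_ids: List" annotation presupposes "from typing import List" at the top of LMConfig.py. Second, a list used as a default argument is evaluated once and shared by every call; a defensive variant (a sketch only, not what the commit does, with a hypothetical class name) copies it per instance:

from typing import List, Optional

class ConfigSketch:  # hypothetical stand-in for LMConfig's __init__ pattern
    def __init__(self, image_ids: Optional[List[int]] = None):
        # build a fresh list per instance instead of sharing one default object
        self.image_ids = list(image_ids) if image_ids is not None else [30] * 25 + [32] * 25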
Binary file removed model/__pycache__/LMConfig.cpython-310.pyc
Binary file removed model/__pycache__/dataset.cpython-310.pyc
Binary file removed model/__pycache__/model.cpython-310.pyc
Binary file removed model/__pycache__/vision_utils.cpython-310.pyc
2 changes: 1 addition & 1 deletion model/dataset.py
@@ -178,7 +178,7 @@ def __getitem__(self, index: int):
         image_encoders = get_img_process(image, self.preprocess)
 
         return X_tensor, Y_tensor, loss_mask_tensor, image_encoders
-
+
 
 class SFTDataset_multi(Dataset):
     def __init__(self, json_path, tokenizer, vision_model=None, max_length=1024,
9 changes: 3 additions & 6 deletions model/model.py
@@ -326,12 +326,12 @@ class Transformer(PreTrainedModel):
     config_class = LMConfig
     last_loss: Optional[torch.Tensor]
 
-    def __init__(self, params: LMConfig = None, vocab_size = 6400):
+    def __init__(self, params: LMConfig = None):
         super().__init__(params)
         if not params:
             params = LMConfig()
         self.params = params
-        self.vocab_size = vocab_size
+        self.vocab_size = params.vocab_size
         self.n_layers = params.n_layers
         # special placeholder ids for images: each image is split into M tokens, matching the count in get_img_process
         self.image_ids = params.image_ids
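Usage sketch for this change (assuming the repository root is on PYTHONPATH): vocab_size now flows from the config object, so a constructor argument can no longer disagree with it.

from model.LMConfig import LMConfig
from model.model import Transformer

cfg = LMConfig(vocab_size=6400)    # 6400 is the LMConfig default
model = Transformer(cfg)
assert model.vocab_size == cfg.vocab_size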
@@ -373,12 +373,10 @@ def count_vision_proj(self, tokens, h, image_encoders=None, seqlen=200):
         def find_indices(tokens, image_ids):
             image_ids_tensor = torch.tensor(image_ids).to(tokens.device)
             len_image_ids = len(image_ids)
-
             # during .generate, skip this right after initialization
             if len_image_ids > tokens.size(1):
                 # print(f"len_image_ids ({len_image_ids}) is greater than sequence length ({tokens.size(1)}), skipping.")
                 return None
-
             # use unfold to create a view for convenient sliding-window processing
             tokens_view = tokens.unfold(1, len_image_ids, 1)  # sliding windows along dim 1
             # check whether each sliding window equals image_ids_tensor
@@ -451,7 +449,6 @@ def forward(self, tokens: Optional[torch.Tensor] = None, targets: Optional[torch
             self.last_loss = None
 
         self.OUT.__setitem__('logits', logits)
-        # self.OUT.__setitem__('last_loss', self.last_loss)
         self.OUT.__setitem__('loss', self.last_loss)
         return self.OUT

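A toy reproduction of the sliding-window match inside find_indices, with assumed values (a 4-token image_ids placed at position 1 of a 6-token sequence):

import torch

tokens = torch.tensor([[5, 30, 30, 32, 32, 7]])    # (batch=1, seq=6), toy data
image_ids = [30, 30, 32, 32]
ids = torch.tensor(image_ids)

windows = tokens.unfold(1, len(image_ids), 1)      # shape (1, 3, 4): all length-4 windows
matches = (windows == ids).all(dim=2)              # True where a window equals image_ids
print(matches.nonzero())                           # tensor([[0, 1]]): batch 0, start index 1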
8 changes: 5 additions & 3 deletions model/vision_utils.py
@@ -40,10 +40,11 @@ def hook_fn(module, input, output):
             embeddings.append(output.last_hidden_state)
 
     # extract the image tensor from the BatchEncoding
-    if isinstance(batch_encoding, transformers.tokenization_utils_base.BatchEncoding) or isinstance(batch_encoding, transformers.feature_extraction_utils.BatchFeature):
+    if (isinstance(batch_encoding, transformers.tokenization_utils_base.BatchEncoding)
+            or isinstance(batch_encoding, transformers.feature_extraction_utils.BatchFeature)):
         image_tensor = batch_encoding['pixel_values']
     else:
-        image_tensor = batch_encoding # torch.Size([32, 4, 3, 224, 224])
+        image_tensor = batch_encoding  # torch.Size([32, 4, 3, 224, 224])
 
     # if the image tensor is already 5-D, there is no need to add an extra dimension
     if len(image_tensor.shape) == 4:
@@ -66,5 +67,6 @@ def hook_fn(module, input, output):
     hook.remove()
 
     # stack all feature vectors into a single tensor
-    all_embeddings = torch.stack(embeddings, dim=0).squeeze() # torch.Size([32, 4, 50, 768]) or torch.Size([32, 2, 196, 768])
+    all_embeddings = torch.stack(embeddings, dim=0).squeeze()
+    # torch.Size([32, 4, 50, 768]) or torch.Size([32, 2, 196, 768])
     return all_embeddings
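The wrapped two-line condition behaves identically to the original; a slightly tighter equivalent (not in the commit) passes a tuple of types to a single isinstance call:

import transformers

def is_batch_container(batch_encoding) -> bool:
    # one isinstance call with a tuple of types replaces the chained "or"
    return isinstance(batch_encoding, (transformers.tokenization_utils_base.BatchEncoding,
                                       transformers.feature_extraction_utils.BatchFeature))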
