detail adjustment & formatting
jingyaogong committed Nov 3, 2024
1 parent c115169 commit 1fc9815
Showing 13 changed files with 23 additions and 34 deletions.
1 change: 0 additions & 1 deletion .envrc

This file was deleted.

9 changes: 0 additions & 9 deletions .gitignore

This file was deleted.

2 changes: 1 addition & 1 deletion 1-pretrain_vlm.py
@@ -173,7 +173,7 @@ def init_distributed_mode():
     if args.visual_encoder == "clip":
         lm_config = LMConfig()
     else:
-        lm_config = LMConfig(image_special_token='<'*98+'>'*98, image_ids=[30]*98+[32]*98)
+        lm_config = LMConfig(image_special_token='<' * 98 + '>' * 98, image_ids=[30] * 98 + [32] * 98)
 
     max_seq_len = lm_config.max_seq_len
     args.save_dir = os.path.join(args.out_dir)
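The non-clip branch uses a 98+98-character placeholder with matching ids, while the clip default in LMConfig uses 25+25. A hedged sanity check, not part of the commit, that the placeholder string and the hard-coded ids stay in lockstep (find_indices in model/model.py matches them token for token):

image_special_token = '<' * 98 + '>' * 98
image_ids = [30] * 98 + [32] * 98
assert len(image_special_token) == len(image_ids) == 196
# '<' is assumed to encode to id 30 and '>' to id 32, per the hard-coded list
assert image_ids == [30 if c == '<' else 32 for c in image_special_token]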
14 changes: 7 additions & 7 deletions 2-sft_vlm.py
@@ -97,9 +97,9 @@ def train_epoch(epoch, wandb):
         if (step + 1) % args.save_interval == 0 and (not ddp or dist.get_rank() == 0):
             model.eval()
             moe_path = '_moe' if lm_config.use_moe else ''
-            if args.multi: # save weights from multi-image training
+            if args.multi:  # save weights from multi-image training
                 ckp = f'{args.save_dir}/{lm_config.dim}{moe_path}_vlm_sft_multi.pth'
-            else: # save weights from single-image training
+            else:  # save weights from single-image training
                 ckp = f'{args.save_dir}/{lm_config.dim}{moe_path}_vlm_sft.pth'
 
             if isinstance(model, torch.nn.parallel.DistributedDataParallel):
@@ -197,7 +197,7 @@ def init_distributed_mode():
     if args.visual_encoder == "clip":
         lm_config = LMConfig()
     else:
-        lm_config = LMConfig(image_special_token='<'*98+'>'*98, image_ids=[30]*98+[32]*98)
+        lm_config = LMConfig(image_special_token='<' * 98 + '>' * 98, image_ids=[30] * 98 + [32] * 98)
 
     max_seq_len = lm_config.max_seq_len
     args.save_dir = os.path.join(args.out_dir)
@@ -229,12 +229,12 @@ def init_distributed_mode():
     if args.multi:
         print("Running multi-image training; this is recommended after instruction fine-tuning...")
         train_ds = SFTDataset_multi(args.data_path_multi, tokenizer, vision_model=(vision_model, preprocess),
-                                   image_special_token=lm_config.image_special_token,
-                                   max_length=max_seq_len)
+                                    image_special_token=lm_config.image_special_token,
+                                    max_length=max_seq_len)
     else:
         train_ds = SFTDataset(args.data_path, tokenizer, vision_model=(vision_model, preprocess),
-                             image_special_token=lm_config.image_special_token,
-                             max_length=max_seq_len)
+                              image_special_token=lm_config.image_special_token,
+                              max_length=max_seq_len)
     train_sampler = DistributedSampler(train_ds) if ddp else None
     train_loader = DataLoader(
         train_ds,
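For reference, a sketch of the two checkpoint paths the save hunk above produces, with assumed values (LMConfig's default dim=512, use_moe=False, and a hypothetical save_dir):

save_dir, dim, moe_path = 'out', 512, ''            # hypothetical run settings
multi_ckp = f'{save_dir}/{dim}{moe_path}_vlm_sft_multi.pth'
single_ckp = f'{save_dir}/{dim}{moe_path}_vlm_sft.pth'
print(multi_ckp)   # out/512_vlm_sft_multi.pth
print(single_ckp)  # out/512_vlm_sft.pth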
6 changes: 3 additions & 3 deletions 3-eval_chat.py
@@ -66,20 +66,20 @@ def setup_seed(seed):
     device = 'cuda:0' if torch.cuda.is_available() else 'cpu'
     dtype = 'bfloat16'
     max_seq_len = 1024
-    encoder_type="clip"
+    encoder_type = "clip"
     # lm_config = LMConfig()
     if encoder_type == "clip":
         lm_config = LMConfig()
     else:
-        lm_config = LMConfig(image_special_token='<'*98+'>'*98, image_ids=[30]*98+[32]*98)
+        lm_config = LMConfig(image_special_token='<' * 98 + '>' * 98, image_ids=[30] * 98 + [32] * 98)
     lm_config.max_seq_len = max_seq_len
     model, tokenizer, vision_model, preprocess = init_model(lm_config, device, multi)
     model.eval()
 
     # -------------------------- prompt and directory setup -----------------------------------
     if multi:
         image_dir = './dataset/eval_multi_images/bird/'
-        prompt = "<image>\n<image>\nName all the differences between these two birds."
+        prompt = f"{lm_config.image_special_token}\n{lm_config.image_special_token}\nName all the differences between these two birds."
     else:
         image_dir = './dataset/eval_images/'
         prompt = lm_config.image_special_token + '\nWhat does this image describe?'
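The prompt change here is more than cosmetic: a literal "<image>" tag never matches lm_config.image_special_token, so find_indices in model/model.py would not locate the image span at all. A minimal sketch using the non-clip values from the hunk above:

image_token = '<' * 98 + '>' * 98                   # lm_config.image_special_token
prompt = f"{image_token}\n{image_token}\nName all the differences between these two birds."
assert prompt.count(image_token) == 2               # both image slots present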
6 changes: 3 additions & 3 deletions model/LMConfig.py
@@ -7,8 +7,8 @@ class LMConfig(PretrainedConfig):
 
     def __init__(
             self,
-            dim: int = 512, # 768
-            n_layers: int = 8, # 16
+            dim: int = 512,  # 768
+            n_layers: int = 8,  # 16
             n_heads: int = 16,
             n_kv_heads: int = 8,
             vocab_size: int = 6400,
@@ -19,7 +19,7 @@ def __init__(
             dropout: float = 0.0,
             flash_attn: bool = True,
             image_special_token: str = '<' * 25 + '>' * 25,
-            image_ids=[30] * 25 + [32] * 25,
+            image_ids: List = [30] * 25 + [32] * 25,
             ####################################################
             # Here are the specific configurations of MOE
             # When use_moe is false, the following is invalid
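Two side notes on this hunk. First, the new "image_ids: List" annotation presupposes "from typing import List" at the top of LMConfig.py. Second, a list used as a default argument is evaluated once and shared by every call; a defensive variant (a sketch only, not what the commit does, with a hypothetical class name) copies it per instance:

from typing import List, Optional

class ConfigSketch:  # hypothetical stand-in for LMConfig's __init__ pattern
    def __init__(self, image_ids: Optional[List[int]] = None):
        # build a fresh list per instance instead of sharing one default object
        self.image_ids = list(image_ids) if image_ids is not None else [30] * 25 + [32] * 25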
Binary file removed model/__pycache__/LMConfig.cpython-310.pyc
Binary file removed model/__pycache__/dataset.cpython-310.pyc
Binary file removed model/__pycache__/model.cpython-310.pyc
Binary file removed model/__pycache__/vision_utils.cpython-310.pyc
2 changes: 1 addition & 1 deletion model/dataset.py
@@ -178,7 +178,7 @@ def __getitem__(self, index: int):
         image_encoders = get_img_process(image, self.preprocess)
 
         return X_tensor, Y_tensor, loss_mask_tensor, image_encoders
-
+
 
 class SFTDataset_multi(Dataset):
     def __init__(self, json_path, tokenizer, vision_model=None, max_length=1024,
9 changes: 3 additions & 6 deletions model/model.py
@@ -326,12 +326,12 @@ class Transformer(PreTrainedModel):
     config_class = LMConfig
     last_loss: Optional[torch.Tensor]
 
-    def __init__(self, params: LMConfig = None, vocab_size = 6400):
+    def __init__(self, params: LMConfig = None):
         super().__init__(params)
         if not params:
             params = LMConfig()
         self.params = params
-        self.vocab_size = vocab_size
+        self.vocab_size = params.vocab_size
         self.n_layers = params.n_layers
         # special placeholder ids for images: each image is split into M tokens, matching the count in get_img_process
         self.image_ids = params.image_ids
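Usage sketch for this change (assuming the repository root is on PYTHONPATH): vocab_size now flows from the config object, so a constructor argument can no longer disagree with it.

from model.LMConfig import LMConfig
from model.model import Transformer

cfg = LMConfig(vocab_size=6400)    # 6400 is the LMConfig default
model = Transformer(cfg)
assert model.vocab_size == cfg.vocab_size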
@@ -373,12 +373,10 @@ def count_vision_proj(self, tokens, h, image_encoders=None, seqlen=200):
         def find_indices(tokens, image_ids):
             image_ids_tensor = torch.tensor(image_ids).to(tokens.device)
             len_image_ids = len(image_ids)
-
             # during .generate, skip this right after initialization
             if len_image_ids > tokens.size(1):
                 # print(f"len_image_ids ({len_image_ids}) is greater than sequence length ({tokens.size(1)}), skipping.")
                 return None
-
             # use unfold to create a view for convenient sliding-window processing
             tokens_view = tokens.unfold(1, len_image_ids, 1)  # sliding windows along dim 1
             # check whether each sliding window equals image_ids_tensor
@@ -451,7 +449,6 @@ def forward(self, tokens: Optional[torch.Tensor] = None, targets: Optional[torch
             self.last_loss = None
 
         self.OUT.__setitem__('logits', logits)
-        # self.OUT.__setitem__('last_loss', self.last_loss)
         self.OUT.__setitem__('loss', self.last_loss)
         return self.OUT

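A toy reproduction of the sliding-window match inside find_indices, with assumed values (a 4-token image_ids placed at position 1 of a 6-token sequence):

import torch

tokens = torch.tensor([[5, 30, 30, 32, 32, 7]])    # (batch=1, seq=6), toy data
image_ids = [30, 30, 32, 32]
ids = torch.tensor(image_ids)

windows = tokens.unfold(1, len(image_ids), 1)      # shape (1, 3, 4): all length-4 windows
matches = (windows == ids).all(dim=2)              # True where a window equals image_ids
print(matches.nonzero())                           # tensor([[0, 1]]): batch 0, start index 1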
8 changes: 5 additions & 3 deletions model/vision_utils.py
@@ -40,10 +40,11 @@ def hook_fn(module, input, output):
             embeddings.append(output.last_hidden_state)
 
     # extract the image tensor from the BatchEncoding
-    if isinstance(batch_encoding, transformers.tokenization_utils_base.BatchEncoding) or isinstance(batch_encoding, transformers.feature_extraction_utils.BatchFeature):
+    if (isinstance(batch_encoding, transformers.tokenization_utils_base.BatchEncoding)
+            or isinstance(batch_encoding, transformers.feature_extraction_utils.BatchFeature)):
         image_tensor = batch_encoding['pixel_values']
     else:
-        image_tensor = batch_encoding # torch.Size([32, 4, 3, 224, 224])
+        image_tensor = batch_encoding  # torch.Size([32, 4, 3, 224, 224])
 
     # if the image tensor is already 5-D, there is no need to add an extra dimension
     if len(image_tensor.shape) == 4:
@@ -66,5 +67,6 @@ def hook_fn(module, input, output):
     hook.remove()
 
     # stack all feature vectors into a single tensor
-    all_embeddings = torch.stack(embeddings, dim=0).squeeze() # torch.Size([32, 4, 50, 768]) or torch.Size([32, 2, 196, 768])
+    all_embeddings = torch.stack(embeddings, dim=0).squeeze()
+    # torch.Size([32, 4, 50, 768]) or torch.Size([32, 2, 196, 768])
     return all_embeddings
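The wrapped two-line condition behaves identically to the original; a slightly tighter equivalent (not in the commit) passes a tuple of types to a single isinstance call:

import transformers

def is_batch_container(batch_encoding) -> bool:
    # one isinstance call with a tuple of types replaces the chained "or"
    return isinstance(batch_encoding, (transformers.tokenization_utils_base.BatchEncoding,
                                       transformers.feature_extraction_utils.BatchFeature))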
