model load
Minamiyama committed Jan 22, 2025
1 parent d972ddf commit be90717
Showing 2 changed files with 48 additions and 31 deletions.
36 changes: 22 additions & 14 deletions xinference/model/llm/transformers/intern_vl.py
@@ -265,15 +265,24 @@ def _split_model(self):
if world_size == 1:
return None
model_size = f"{self.model_spec.model_size_in_billions}B"
+ model_name = f"{self.model_family.model_name.lower()}-{model_size}"
num_layers = {
- "1B": 24,
- "2B": 24,
- "4B": 32,
- "8B": 32,
- "26B": 48,
- "40B": 60,
- "76B": 80,
- }[model_size]
+ "internvl2-1B": 24,
+ "internvl2-2B": 24,
+ "internvl2-4B": 32,
+ "internvl2-8B": 32,
+ "internvl2-26B": 48,
+ "internvl2-40B": 60,
+ "internvl2-76B": 80,
+ "internvl2.5-1B": 24,
+ "internvl2.5-2B": 24,
+ "internvl2.5-4B": 36,
+ "internvl2.5-8B": 32,
+ "internvl2.5-26B": 48,
+ "internvl2.5-38B": 64,
+ "internvl2.5-78B": 80,
+ }[model_name]

# Since the first GPU will be used for ViT, treat it as half a GPU.
num_layers_per_gpu = math.ceil(num_layers / (world_size - 0.5))
num_layers_per_gpu = [num_layers_per_gpu] * world_size
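
The arithmetic above feeds a Hugging Face style device_map: GPU 0 also keeps the vision tower, so it is counted as half a GPU and receives roughly half as many language-model layers. Below is a minimal sketch of how such a map is typically assembled; it is an illustration rather than code from this commit, and the module names (vision_model, mlp1, language_model.model.layers.N, ...) follow InternVL's published split_model recipe, assumed here.

import math

def split_model_sketch(num_layers: int, world_size: int) -> dict:
    # GPU 0 also hosts the ViT, so treat it as half a GPU when dividing layers.
    per_gpu = math.ceil(num_layers / (world_size - 0.5))
    layers_per_gpu = [per_gpu] * world_size
    layers_per_gpu[0] = math.ceil(per_gpu * 0.5)

    device_map, layer_idx = {}, 0
    for gpu, count in enumerate(layers_per_gpu):
        for _ in range(count):
            if layer_idx == num_layers:
                break
            device_map[f"language_model.model.layers.{layer_idx}"] = gpu
            layer_idx += 1

    # Keep the vision tower and the embedding/output modules together on GPU 0.
    for name in (
        "vision_model",
        "mlp1",
        "language_model.model.tok_embeddings",
        "language_model.model.embed_tokens",
        "language_model.output",
        "language_model.model.norm",
        "language_model.lm_head",
        f"language_model.model.layers.{num_layers - 1}",
    ):
        device_map[name] = 0
    return device_map

# e.g. split_model_sketch(48, 4) spreads the 26B model's 48 layers over 4 GPUs,
# with GPU 0 taking the smallest share plus the vision tower.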
@@ -322,9 +331,7 @@ def load(self, **kwargs):
self._model.cuda()

self._tokenizer = AutoTokenizer.from_pretrained(
- self.model_path,
- trust_remote_code=True,
- use_fast=False,
+ self.model_path, trust_remote_code=True, use_fast=False
)

@cache_clean
@@ -340,9 +347,9 @@ def chat(
IMG_CONTEXT_TOKEN = "<IMG_CONTEXT>"

generation_config = {
- "max_new_tokens": generate_config.get("max_tokens", 1024)
- if generate_config
- else 1024,
+ "max_new_tokens": (
+ generate_config.get("max_tokens", 1024) if generate_config else 1024
+ ),
"do_sample": False,
}

@@ -458,6 +465,7 @@ def _generate_stream(self, generate_kwargs, input_ids, include_usage):
streamer = TextIteratorStreamer(
self._tokenizer, skip_prompt=True, skip_special_tokens=True, timeout=10
)

# Define the generation configuration
generate_kwargs["streamer"] = streamer
# Start the model chat in a separate thread
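
For reference, the streaming path here follows the standard transformers pattern: generation runs in a worker thread while the TextIteratorStreamer yields decoded text as it is produced. A minimal sketch of that pattern is below; model, tokenizer, and input_ids are placeholders rather than objects from this file, and the real code threads the model's own chat call instead of a bare generate().

from threading import Thread

from transformers import TextIteratorStreamer

def stream_generate(model, tokenizer, input_ids, **generate_kwargs):
    streamer = TextIteratorStreamer(
        tokenizer, skip_prompt=True, skip_special_tokens=True, timeout=10
    )
    generate_kwargs["streamer"] = streamer
    # Run generation in the background; the streamer is fed token by token.
    thread = Thread(
        target=model.generate,
        kwargs={"input_ids": input_ids, **generate_kwargs},
    )
    thread.start()
    for new_text in streamer:  # blocks until the next decoded chunk (or timeout)
        yield new_text
    thread.join()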
43 changes: 26 additions & 17 deletions xinference/thirdparty/internvl/conversation.py
@@ -227,7 +227,7 @@ def get_prompt(self) -> str:

return ret
elif self.sep_style == SeparatorStyle.INTERNVL_ZH:
- seps = [self.sep, self.sep2]
+ seps = [self.sep2, self.sep]
ret = self.system_message + seps[0]
for i, (role, message) in enumerate(self.messages):
if message:
@@ -330,6 +330,19 @@ def get_conv_template(name: str) -> Conversation:
return conv_templates[name].copy()


+ # InternVL-Chat-V1-1 template
+ register_conv_template(
+     Conversation(
+         name='internvl_zh',
+         system_template='',
+         roles=('<human>', '<bot>'),
+         sep_style=SeparatorStyle.INTERNVL_ZH,
+         sep='</s>',
+         sep2=' ',
+     )
+ )
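
The swapped seps order above ([self.sep2, self.sep]) matters for this template: with sep2=' ' and sep='</s>', a human turn is closed by a space and a bot turn by '</s>'. A hedged rendering sketch follows, assuming the INTERNVL_ZH branch of get_prompt() joins turns as role + ': ' + message + seps[i % 2] (that line is not shown in this diff) and that the Conversation class keeps FastChat-style append_message()/get_prompt() helpers.

conv = get_conv_template('internvl_zh')
conv.append_message(conv.roles[0], 'Describe the image.')  # human turn, closed by ' '
conv.append_message(conv.roles[1], None)                   # empty bot turn, left open for generation
prompt = conv.get_prompt()
# Expected shape under these assumptions:
# " <human>: Describe the image. <bot>:"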


# Both Hermes-2 and internlm2-chat are chatml-format conversation templates. The difference
# is that during training, the preprocessing function for the Hermes-2 template doesn't add
# <s> at the beginning of the tokenized sequence, while the internlm2-chat template does.
@@ -344,12 +357,6 @@ def get_conv_template(name: str) -> Conversation:
roles=('<|im_start|>user\n', '<|im_start|>assistant\n'),
sep_style=SeparatorStyle.MPT,
sep='<|im_end|>',
- stop_token_ids=[
-     2,
-     6,
-     7,
-     8,
- ],
stop_str='<|endoftext|>',
)
)
@@ -365,11 +372,6 @@ def get_conv_template(name: str) -> Conversation:
roles=('<|im_start|>user\n', '<|im_start|>assistant\n'),
sep_style=SeparatorStyle.MPT,
sep='<|im_end|>',
- stop_token_ids=[
-     2,
-     92543,
-     92542
- ]
)
)

@@ -384,10 +386,17 @@ def get_conv_template(name: str) -> Conversation:
roles=('<|user|>\n', '<|assistant|>\n'),
sep_style=SeparatorStyle.MPT,
sep='<|end|>',
- stop_token_ids=[
-     2,
-     32000,
-     32007
- ]
)
)


+ register_conv_template(
+     Conversation(
+         name='internvl2_5',
+         system_template='<|im_start|>system\n{system_message}',
+         system_message='你是书生·万象,英文名是InternVL,是由上海人工智能实验室、清华大学及多家合作单位联合开发的多模态大语言模型。',
+         roles=('<|im_start|>user\n', '<|im_start|>assistant\n'),
+         sep_style=SeparatorStyle.MPT,
+         sep='<|im_end|>\n',
+     )
+ )
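
The new internvl2_5 entry is resolved by name through get_conv_template() shown above. A short usage sketch follows; append_message() and get_prompt() are assumed from the FastChat-style Conversation class in this file, and the exact rendering depends on the MPT branch of get_prompt().

conv = get_conv_template('internvl2_5')
conv.append_message(conv.roles[0], '<image>\nDescribe this image in detail.')
conv.append_message(conv.roles[1], None)  # leave the assistant turn open for generation
prompt = conv.get_prompt()
# Roughly expected, with the MPT separator style and sep='<|im_end|>\n':
# <|im_start|>system\n{system_message}<|im_end|>\n
# <|im_start|>user\n<image>\nDescribe this image in detail.<|im_end|>\n
# <|im_start|>assistant\n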
