model load
Minamiyama committed Jan 22, 2025
1 parent d972ddf commit be90717
Showing 2 changed files with 48 additions and 31 deletions.
36 changes: 22 additions & 14 deletions xinference/model/llm/transformers/intern_vl.py
@@ -265,15 +265,24 @@ def _split_model(self):
if world_size == 1:
return None
model_size = f"{self.model_spec.model_size_in_billions}B"
+ model_name = f"{self.model_family.model_name.lower()}-{model_size}"
num_layers = {
- "1B": 24,
- "2B": 24,
- "4B": 32,
- "8B": 32,
- "26B": 48,
- "40B": 60,
- "76B": 80,
- }[model_size]
+ "internvl2-1B": 24,
+ "internvl2-2B": 24,
+ "internvl2-4B": 32,
+ "internvl2-8B": 32,
+ "internvl2-26B": 48,
+ "internvl2-40B": 60,
+ "internvl2-76B": 80,
+ "internvl2.5-1B": 24,
+ "internvl2.5-2B": 24,
+ "internvl2.5-4B": 36,
+ "internvl2.5-8B": 32,
+ "internvl2.5-26B": 48,
+ "internvl2.5-38B": 64,
+ "internvl2.5-78B": 80,
+ }[model_name]

# Since the first GPU will be used for ViT, treat it as half a GPU.
num_layers_per_gpu = math.ceil(num_layers / (world_size - 0.5))
num_layers_per_gpu = [num_layers_per_gpu] * world_size
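
The arithmetic above feeds a Hugging Face style device_map: GPU 0 also keeps the vision tower, so it is counted as half a GPU and receives roughly half as many language-model layers. Below is a minimal sketch of how such a map is typically assembled; it is an illustration rather than code from this commit, and the module names (vision_model, mlp1, language_model.model.layers.N, ...) follow InternVL's published split_model recipe, assumed here.

import math

def split_model_sketch(num_layers: int, world_size: int) -> dict:
    # GPU 0 also hosts the ViT, so treat it as half a GPU when dividing layers.
    per_gpu = math.ceil(num_layers / (world_size - 0.5))
    layers_per_gpu = [per_gpu] * world_size
    layers_per_gpu[0] = math.ceil(per_gpu * 0.5)

    device_map, layer_idx = {}, 0
    for gpu, count in enumerate(layers_per_gpu):
        for _ in range(count):
            if layer_idx == num_layers:
                break
            device_map[f"language_model.model.layers.{layer_idx}"] = gpu
            layer_idx += 1

    # Keep the vision tower and the embedding/output modules together on GPU 0.
    for name in (
        "vision_model",
        "mlp1",
        "language_model.model.tok_embeddings",
        "language_model.model.embed_tokens",
        "language_model.output",
        "language_model.model.norm",
        "language_model.lm_head",
        f"language_model.model.layers.{num_layers - 1}",
    ):
        device_map[name] = 0
    return device_map

# e.g. split_model_sketch(48, 4) spreads the 26B model's 48 layers over 4 GPUs,
# with GPU 0 taking the smallest share plus the vision tower.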
@@ -322,9 +331,7 @@ def load(self, **kwargs):
self._model.cuda()

self._tokenizer = AutoTokenizer.from_pretrained(
- self.model_path,
- trust_remote_code=True,
- use_fast=False,
+ self.model_path, trust_remote_code=True, use_fast=False
)

@cache_clean
@@ -340,9 +347,9 @@ def chat(
IMG_CONTEXT_TOKEN = "<IMG_CONTEXT>"

generation_config = {
- "max_new_tokens": generate_config.get("max_tokens", 1024)
- if generate_config
- else 1024,
+ "max_new_tokens": (
+ generate_config.get("max_tokens", 1024) if generate_config else 1024
+ ),
"do_sample": False,
}

@@ -458,6 +465,7 @@ def _generate_stream(self, generate_kwargs, input_ids, include_usage):
streamer = TextIteratorStreamer(
self._tokenizer, skip_prompt=True, skip_special_tokens=True, timeout=10
)

# Define the generation configuration
generate_kwargs["streamer"] = streamer
# Start the model chat in a separate thread
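
For reference, the streaming path here follows the standard transformers pattern: generation runs in a worker thread while the TextIteratorStreamer yields decoded text as it is produced. A minimal sketch of that pattern is below; model, tokenizer, and input_ids are placeholders rather than objects from this file, and the real code threads the model's own chat call instead of a bare generate().

from threading import Thread

from transformers import TextIteratorStreamer

def stream_generate(model, tokenizer, input_ids, **generate_kwargs):
    streamer = TextIteratorStreamer(
        tokenizer, skip_prompt=True, skip_special_tokens=True, timeout=10
    )
    generate_kwargs["streamer"] = streamer
    # Run generation in the background; the streamer is fed token by token.
    thread = Thread(
        target=model.generate,
        kwargs={"input_ids": input_ids, **generate_kwargs},
    )
    thread.start()
    for new_text in streamer:  # blocks until the next decoded chunk (or timeout)
        yield new_text
    thread.join()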
43 changes: 26 additions & 17 deletions xinference/thirdparty/internvl/conversation.py
@@ -227,7 +227,7 @@ def get_prompt(self) -> str:

return ret
elif self.sep_style == SeparatorStyle.INTERNVL_ZH:
- seps = [self.sep, self.sep2]
+ seps = [self.sep2, self.sep]
ret = self.system_message + seps[0]
for i, (role, message) in enumerate(self.messages):
if message:
@@ -330,6 +330,19 @@ def get_conv_template(name: str) -> Conversation:
return conv_templates[name].copy()


+ # InternVL-Chat-V1-1 template
+ register_conv_template(
+     Conversation(
+         name='internvl_zh',
+         system_template='',
+         roles=('<human>', '<bot>'),
+         sep_style=SeparatorStyle.INTERNVL_ZH,
+         sep='</s>',
+         sep2=' ',
+     )
+ )
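
The swapped seps order above ([self.sep2, self.sep]) matters for this template: with sep2=' ' and sep='</s>', a human turn is closed by a space and a bot turn by '</s>'. A hedged rendering sketch follows, assuming the INTERNVL_ZH branch of get_prompt() joins turns as role + ': ' + message + seps[i % 2] (that line is not shown in this diff) and that the Conversation class keeps FastChat-style append_message()/get_prompt() helpers.

conv = get_conv_template('internvl_zh')
conv.append_message(conv.roles[0], 'Describe the image.')  # human turn, closed by ' '
conv.append_message(conv.roles[1], None)                   # empty bot turn, left open for generation
prompt = conv.get_prompt()
# Expected shape under these assumptions:
# " <human>: Describe the image. <bot>:"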


# Both Hermes-2 and internlm2-chat are chatml-format conversation templates. The difference
# is that during training, the preprocessing function for the Hermes-2 template doesn't add
# <s> at the beginning of the tokenized sequence, while the internlm2-chat template does.
@@ -344,12 +357,6 @@ def get_conv_template(name: str) -> Conversation:
roles=('<|im_start|>user\n', '<|im_start|>assistant\n'),
sep_style=SeparatorStyle.MPT,
sep='<|im_end|>',
- stop_token_ids=[
-     2,
-     6,
-     7,
-     8,
- ],
stop_str='<|endoftext|>',
)
)
@@ -365,11 +372,6 @@ def get_conv_template(name: str) -> Conversation:
roles=('<|im_start|>user\n', '<|im_start|>assistant\n'),
sep_style=SeparatorStyle.MPT,
sep='<|im_end|>',
- stop_token_ids=[
-     2,
-     92543,
-     92542
- ]
)
)

@@ -384,10 +386,17 @@ def get_conv_template(name: str) -> Conversation:
roles=('<|user|>\n', '<|assistant|>\n'),
sep_style=SeparatorStyle.MPT,
sep='<|end|>',
- stop_token_ids=[
-     2,
-     32000,
-     32007
- ]
)
)


+ register_conv_template(
+     Conversation(
+         name='internvl2_5',
+         system_template='<|im_start|>system\n{system_message}',
+         system_message='你是书生·万象,英文名是InternVL,是由上海人工智能实验室、清华大学及多家合作单位联合开发的多模态大语言模型。',
+         roles=('<|im_start|>user\n', '<|im_start|>assistant\n'),
+         sep_style=SeparatorStyle.MPT,
+         sep='<|im_end|>\n',
+     )
+ )
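
The new internvl2_5 entry is resolved by name through get_conv_template() shown above. A short usage sketch follows; append_message() and get_prompt() are assumed from the FastChat-style Conversation class in this file, and the exact rendering depends on the MPT branch of get_prompt().

conv = get_conv_template('internvl2_5')
conv.append_message(conv.roles[0], '<image>\nDescribe this image in detail.')
conv.append_message(conv.roles[1], None)  # leave the assistant turn open for generation
prompt = conv.get_prompt()
# Roughly expected, with the MPT separator style and sep='<|im_end|>\n':
# <|im_start|>system\n{system_message}<|im_end|>\n
# <|im_start|>user\n<image>\nDescribe this image in detail.<|im_end|>\n
# <|im_start|>assistant\n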
