
Commit

Add Python interface for CogVLM2
黄宇扬 committed Oct 23, 2024
1 parent 62b3d05 commit d779940
Showing 2 changed files with 136 additions and 1 deletion.
103 changes: 102 additions & 1 deletion tools/fastllm_pytools/llm.py
@@ -42,6 +42,14 @@
ctypes.c_int, ctypes.POINTER(ctypes.c_int)]
fastllm_lib.launch_response_llm_model.restype = ctypes.c_int

fastllm_lib.launch_response_llm_model_multimodal.argtypes = [ctypes.c_int, ctypes.c_int, ctypes.c_void_p,
ctypes.c_char_p, ctypes.c_void_p,
ctypes.c_int, ctypes.c_bool, ctypes.c_float, ctypes.c_int,
ctypes.c_float, ctypes.c_float, ctypes.c_bool,
ctypes.c_int, ctypes.POINTER(ctypes.c_int)]
fastllm_lib.launch_response_llm_model_multimodal.restype = ctypes.c_int


fastllm_lib.add_cache_llm_model.argtypes = [ctypes.c_int, ctypes.c_int, ctypes.c_void_p]

fastllm_lib.fetch_response_llm_model.argtypes = [ctypes.c_int, ctypes.c_int]
@@ -446,6 +454,8 @@ def __init__ (self, path : str,
else:
self.model = fastllm_lib.create_llm_model_fromhf(path.encode(), fastllm_data_type_dict[dtype], int4g_groupcnt,
ctypes.c_bool(self.hf_tokenizer != None), lora.encode());
if (os.path.isfile(os.path.join(path, "config.json"))):
self.config = json.load(open(os.path.join(path, "config.json"), "r"))
else:
print("path error: ", path);
exit(0)
@@ -860,10 +870,57 @@ def launch_stream_response(self,
query: Union[str, List[Dict[str, str]]],
history: List[Tuple[str, str]] = None,
max_length: int = 8192, do_sample = True, top_p = 0.8, top_k = 1, temperature = 1.0, repeat_penalty = 1.0,
one_by_one = True, stop_token_ids: List[int] = None, add_generation_prompt = True):
one_by_one = True, stop_token_ids: List[int] = None, add_generation_prompt = True,
images: List = None):
conversation = None
if (isinstance(query, List)):
conversation = query
if (images != None):
architecture = ""
try:
architecture = self.config["architectures"][0]
except:
print("Error: can't detect architectures for this model.")
exit(0)
if (architecture == "CogVLMForCausalLM"):
image_channels = int(self.config["vision_config"]["in_channels"])
image_size = int(self.config["vision_config"]["image_size"])
configs = {
"image_channels": image_channels,
"image_height": image_size,
"image_width": image_size
}
des = json.dumps(configs)
from torchvision import transforms
transform = transforms.Compose(
[
transforms.Resize(
(image_size, image_size), interpolation=transforms.InterpolationMode.BICUBIC
),
transforms.ToTensor(),
transforms.Normalize((0.48145466, 0.4578275, 0.40821073), (0.26862954, 0.26130258, 0.27577711)),
]
)
image = transform(images[0]).reshape([-1]).tolist()
else:
print("Error: can't support architectures: " + architecture)
exit(0)

# Images were provided: take the multimodal model path
tokenizer = self.hf_tokenizer
prompt = ""
if (conversation != None and len(conversation) != 0):
prompt = tokenizer.apply_chat_template(conversation, add_generation_prompt = add_generation_prompt, tokenize = False)
else:
prompt = query if self.direct_query else self.get_prompt(query, history)
input = tokenizer.encode(prompt)
stop_token_len, stop_token_list = self.stop_token_ctypes(stop_token_ids)
handle = fastllm_lib.launch_response_llm_model_multimodal(self.model, len(input), (ctypes.c_int * len(input))(*input),
des.encode(), (ctypes.c_float * len(image))(*image),
max_length, do_sample, top_p, top_k, temperature, repeat_penalty,
False, stop_token_len, stop_token_list)
return handle

if (self.hf_tokenizer != None and hasattr(self.hf_tokenizer, "chat_template") and self.hf_tokenizer.chat_template != ""):
tokenizer = self.hf_tokenizer
type = None
@@ -902,6 +959,50 @@ def launch_stream_response(self,

def abort_handle(self, handle):
fastllm_lib.abort_response_llm_model(self.model, handle)

def stream_response_handle(self, handle):
if (self.hf_tokenizer != None and hasattr(self.hf_tokenizer, "chat_template") and self.hf_tokenizer.chat_template != ""):
tokenizer = self.hf_tokenizer
tokens = []
while True:
if not(fastllm_lib.can_fetch_response_llm_model(self.model, handle)): # busy-poll until the next token is available
continue
cur = fastllm_lib.fetch_response_llm_model(self.model, handle)
if (cur <= -1):
if (cur == -2):
yield "prompt too long"
break
tokens.append(cur)
ret = tokenizer.decode(tokens)
if (ret.encode().find(b'\xef\xbf\xbd') == -1): # no U+FFFD replacement char: the accumulated tokens decode cleanly
tokens.clear()
yield ret
else:
yield ""
if len(tokens) > 0:
yield tokenizer.decode(tokens)
else:
res = ""
ret = b''
fail_cnt = 0
while True:
if not(fastllm_lib.can_fetch_response_llm_model(self.model, handle)):
continue
ret += fastllm_lib.fetch_response_str_llm_model(self.model, handle)
cur = ""
try:
cur = ret.decode()
ret = b''
except:
fail_cnt += 1
if (fail_cnt == 20):
break
else:
continue
fail_cnt = 0
if (cur == "<flmeos>"):
break
yield cur

async def stream_response_handle_async(self, handle):
if (self.hf_tokenizer != None and hasattr(self.hf_tokenizer, "chat_template") and self.hf_tokenizer.chat_template != ""):
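A minimal usage sketch of the new interface from the Python side (not part of the commit). The import path, the model class name, and the checkpoint directory are assumptions; it also assumes the model was created with the Hugging Face tokenizer enabled and that config.json sits next to the weights, since the CogVLM branch above reads vision_config from it. Resizing and CLIP-style normalization happen inside launch_stream_response, so the caller only passes a PIL image.

# Sketch only: import path, class name, and checkpoint path are assumptions.
from PIL import Image
from fastllm_pytools import llm

model = llm.model("/path/to/cogvlm2-hf")            # hypothetical local checkpoint directory
image = Image.open("example.jpg").convert("RGB")    # preprocessing is done inside launch_stream_response

handle = model.launch_stream_response("Describe this image.",
                                      max_length = 512,
                                      images = [image])   # new parameter added by this commit
for chunk in model.stream_response_handle(handle):
    print(chunk, end = "", flush = True)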
34 changes: 34 additions & 0 deletions tools/src/pytools.cpp
@@ -382,6 +382,40 @@ extern "C" {
return model->LaunchResponseTokens(input, config);
}

DLL_EXPORT int launch_response_llm_model_multimodal(int modelId, int len, int *values,
char *multimodal_json, float *multimodal_data,
int max_length, bool do_sample, float top_p, int top_k,
float temperature, float repeat_penalty, bool output_logits,
int stop_token_len, int * stop_token_ids) {
std::vector <int> input;
for (int i = 0; i < len; i++) {
input.push_back(values[i]);
}
auto config = make_config(max_length, do_sample, top_p, top_k, temperature, repeat_penalty, output_logits, false);
for(int i = 0; i < stop_token_len; i++ ) {
config.stop_token_ids.insert(stop_token_ids[i]);
}
auto model = models.GetModel(modelId);

std::string error;
auto multimodal_config = json11::Json::parse(multimodal_json, error);
int image_channels = multimodal_config["image_channels"].int_value();
int image_height = multimodal_config["image_height"].int_value();
int image_width = multimodal_config["image_width"].int_value();

std::vector <float> imageInput;
imageInput.resize(1 * image_channels * image_height * image_width);
memcpy(&imageInput[0], multimodal_data, imageInput.size() * sizeof(float));

std::map <std::string, std::vector <fastllm::Data*> > *multimodalInput = new std::map <std::string, std::vector <fastllm::Data*> > ();
fastllm::Data *imageInputData = new fastllm::Data();
imageInputData->CopyFrom(fastllm::Data(fastllm::DataType::FLOAT32, {1, image_channels, image_height, image_width}, imageInput));
(*multimodalInput)["images"].push_back(imageInputData);

int ret = model->LaunchResponseTokens(input, config, *multimodalInput);
return ret;
}

DLL_EXPORT int fetch_response_llm_model(int modelId, int handleId) {
auto model = models.GetModel(modelId);
return model->FetchResponseTokens(handleId);
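For reference, a self-contained sketch of the buffer contract between llm.py and this entry point (not part of the commit): the C++ side copies exactly 1 * image_channels * image_height * image_width floats from multimodal_data into a FLOAT32 Data of shape {1, C, H, W} and registers it under the "images" key, so the Python side must hand over a flattened, normalized NCHW tensor of that length. The shape values and the blank test image below are placeholders; the transform mirrors the one used in launch_stream_response above.

# Sketch only: image_size / image_channels are placeholders; llm.py reads the
# real values from config["vision_config"].
from PIL import Image
from torchvision import transforms

image_size, image_channels = 1344, 3
transform = transforms.Compose([
    transforms.Resize((image_size, image_size), interpolation = transforms.InterpolationMode.BICUBIC),
    transforms.ToTensor(),      # HWC PIL image -> CHW float tensor in [0, 1]
    transforms.Normalize((0.48145466, 0.4578275, 0.40821073), (0.26862954, 0.26130258, 0.27577711)),
])
flat = transform(Image.new("RGB", (800, 600))).reshape([-1]).tolist()
assert len(flat) == 1 * image_channels * image_size * image_size   # size the memcpy on the C++ side relies on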
