From e8a690b4e3e57b51be9d70f21b15b1fc497bdfa4 Mon Sep 17 00:00:00 2001 From: Alexandre Adjahossou Date: Wed, 28 Aug 2024 18:21:21 +0000 Subject: [PATCH 01/36] DRAFT: merge TransformerClient & TransformerEmbedder into 1 class. --- .../model_client/transformers_client.py | 202 ++++++++++++++++++ 1 file changed, 202 insertions(+) diff --git a/adalflow/adalflow/components/model_client/transformers_client.py b/adalflow/adalflow/components/model_client/transformers_client.py index f681f23f..20ef1ce6 100644 --- a/adalflow/adalflow/components/model_client/transformers_client.py +++ b/adalflow/adalflow/components/model_client/transformers_client.py @@ -40,6 +40,208 @@ def average_pool(last_hidden_states: Tensor, attention_mask: Tensor) -> Tensor: return last_hidden.sum(dim=1) / attention_mask.sum(dim=1)[..., None] + + +# +# +# +# DRAFT +# +# +# +from transformers import PreTrainedModel, PreTrainedTokenizer, PreTrainedTokenizerFast + +def mean_pooling(model_output, attention_mask): + token_embeddings = model_output[0] #First element of model_output contains all token embeddings + input_mask_expanded = attention_mask.unsqueeze(-1).expand(token_embeddings.size()).float() + return torch.sum(token_embeddings * input_mask_expanded, 1) / torch.clamp(input_mask_expanded.sum(1), min=1e-9) + +class TransformerEmbeddingModelClient(ModelClient): + + # + # Model initialisation + # + def __init__( + self, + model_name: Optional[str] = None, + tokenizer_kwargs: Optional[dict] = dict(), + auto_model: Optional[type] = AutoModel, + auto_tokenizer: Optional[type] = AutoTokenizer, + custom_model: Optional[PreTrainedModel] = None, + custom_tokenizer: Optional[Union[PreTrainedTokenizer, PreTrainedTokenizerFast]] = None + ): + + super().__init__() + self.model_name = model_name + self.tokenizer_kwargs = tokenizer_kwargs + self.auto_model=auto_model, + self.auto_tokenizer=auto_tokenizer, + self.custom_model=custom_model, + self.custom_tokenizer=custom_tokenizer + + # Check if there is conflicting arguments + self.use_auto_model = auto_model is not None + self.use_auto_tokenizer = auto_tokenizer is not None + self.use_cusom_model = custom_model is not None + self.use_cusom_tokenizer = custom_tokenizer is not None + self.model_name_exit = model_name is not None + + ## arguments related to model + if self.use_auto_model and self.use_cusom_model: + raise ValueError("Cannot specify 'auto_model' and 'custom_model'.") + elif (not self.use_auto_model) and (not self.use_cusom_model): + raise ValueError("Need to specify either 'auto_model' or 'custom_model'.") + elif self.use_auto_model and (not self.model_name_exit): + raise ValueError("When 'auto_model' is specified 'model_name' must be specified too.") + + ## arguments related to tokenizer + if self.use_auto_tokenizer and self.use_cusom_tokenizer: + raise Exception("Cannot specify 'auto_tokenizer' and 'custom_tokenizer'.") + elif (not self.use_auto_tokenizer) and (not self.use_cusom_tokenizer): + raise Exception("Need to specify either'auto_tokenizer' and 'custom_tokenizer'.") + elif self.use_auto_tokenizer and (not self.model_name_exit): + raise ValueError("When 'auto_tokenizer' is specified 'model_name' must be specified too.") + + self.init_sync_client() + + def init_sync_client(self): + self.init_model( + model_name=self.model_name, + auto_model=self.auto_model, + auto_tokenizer=self.auto_tokenizer, + custom_model=self.custom_model, + custom_tokenizer=self.custom_tokenizer + ) + + @lru_cache(None) + def init_model( + self, + model_name: Optional[str] = None, + auto_model: 
Optional[type] = AutoModel, + auto_tokenizer: Optional[type] = AutoTokenizer, + custom_model: Optional[PreTrainedModel] = None, + custom_tokenizer: Optional[PreTrainedTokenizer | PreTrainedTokenizerFast] = None + ): + + try: + if self.use_auto_model: + self.model = auto_model.from_pretrained(model_name) + else: + self.model = custom_model + + if self.use_auto_tokenizer: + self.tokenizer = auto_tokenizer.from_pretrained(model_name) + else: + self.tokenizer = custom_tokenizer + + log.info(f"Done loading model {model_name}") + + except Exception as e: + log.error(f"Error loading model {model_name}: {e}") + raise e + + # + # Inference code + # + def infer_embedding( + self, + input=Union[str, List[str], List[List[str]]], + tolist: bool = True, + ): + model = self.model + + self.handle_input(input) + batch_dict = self.tokenize_inputs(input, kwargs=self.tokenizer_kwargs) + outputs = self.compute_model_outputs(batch_dict, model) + embeddings = self.compute_embeddings(outputs, batch_dict) + + # normalize embeddings + embeddings = F.normalize(embeddings, p=2, dim=1) + if tolist: + embeddings = embeddings.tolist() + return embeddings + + def handle_input(self, input: Union[str, List[str], List[List[str]]]): + if isinstance(input, str): + input = [input] + return input + + def tokenize_inputs(self, input, kwargs: Optional[dict] = dict()): + batch_dict = self.tokenizer(input, **kwargs) + return batch_dict + + def compute_model_outputs(self, batch_dict, model): + with torch.no_grad(): + outputs = model(**batch_dict) + return outputs + + def compute_embeddings(self, outputs, batch_dict): + embeddings = mean_pooling( + outputs, batch_dict["attention_mask"] + ) + return embeddings + """ + def __call__(self, **kwargs): + if "model" not in kwargs: + raise ValueError("model is required") + + if "mock" in kwargs and kwargs["mock"]: + import numpy as np + + embeddings = np.array([np.random.rand(768).tolist()]) + return embeddings + + # inference the model + return self.infer_embedding(kwargs["input"]) + """ + + # + # Preprocessing, postprocessing and call for inference code + # + def call(self, api_kwargs: Dict = {}, model_type: ModelType = ModelType.UNDEFINED): + + # I don't think it is useful anymore + # if "model" not in api_kwargs: + # raise ValueError("model must be specified in api_kwargs") + if ( + model_type == ModelType.EMBEDDER + # and "model" in api_kwargs + ): + if "mock" in api_kwargs and api_kwargs["mock"]: + import numpy as np + + embeddings = np.array([np.random.rand(768).tolist()]) + return embeddings + + # inference the model + return self.infer_embedding(api_kwargs["input"]) + + def parse_embedding_response(self, response: Any) -> EmbedderOutput: + embeddings: List[Embedding] = [] + for idx, emb in enumerate(response): + embeddings.append(Embedding(index=idx, embedding=emb)) + response = EmbedderOutput(data=embeddings) + return response + + def convert_inputs_to_api_kwargs( + self, + input: Any, # for retriever, it is a single query, + model_kwargs: dict = {}, + model_type: ModelType = ModelType.UNDEFINED, + ) -> dict: + final_model_kwargs = model_kwargs.copy() + if model_type == ModelType.EMBEDDER: + final_model_kwargs["input"] = input + return final_model_kwargs + +# +# +# +# END OF DRAFT +# +# +# + # TODO: provide a standard api for embedding and chat models used in local model SDKs class TransformerEmbedder: """Local model SDK for transformers. 
From 123396cbfcbb95a2f6cf7b3c0b7ec5f73053e161 Mon Sep 17 00:00:00 2001 From: Alexandre Adjahossou Date: Thu, 29 Aug 2024 12:26:31 +0000 Subject: [PATCH 02/36] Fixed typo. --- .../adalflow/components/model_client/transformers_client.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/adalflow/adalflow/components/model_client/transformers_client.py b/adalflow/adalflow/components/model_client/transformers_client.py index 20ef1ce6..4f1825b1 100644 --- a/adalflow/adalflow/components/model_client/transformers_client.py +++ b/adalflow/adalflow/components/model_client/transformers_client.py @@ -74,9 +74,9 @@ def __init__( super().__init__() self.model_name = model_name self.tokenizer_kwargs = tokenizer_kwargs - self.auto_model=auto_model, - self.auto_tokenizer=auto_tokenizer, - self.custom_model=custom_model, + self.auto_model=auto_model + self.auto_tokenizer=auto_tokenizer + self.custom_model=custom_model self.custom_tokenizer=custom_tokenizer # Check if there is conflicting arguments From 21217741865f90132cd9699f6b65bdbd03e61381 Mon Sep 17 00:00:00 2001 From: Alexandre Adjahossou Date: Thu, 29 Aug 2024 14:26:56 +0000 Subject: [PATCH 03/36] Added type hints to signatures + removed now useless model_type. --- .../model_client/transformers_client.py | 57 +++++++------------ 1 file changed, 21 insertions(+), 36 deletions(-) diff --git a/adalflow/adalflow/components/model_client/transformers_client.py b/adalflow/adalflow/components/model_client/transformers_client.py index 4f1825b1..5dadc4d2 100644 --- a/adalflow/adalflow/components/model_client/transformers_client.py +++ b/adalflow/adalflow/components/model_client/transformers_client.py @@ -35,7 +35,7 @@ log = logging.getLogger(__name__) -def average_pool(last_hidden_states: Tensor, attention_mask: Tensor) -> Tensor: +def average_pool(last_hidden_states: Tensor, attention_mask: list) -> Tensor: last_hidden = last_hidden_states.masked_fill(~attention_mask[..., None].bool(), 0.0) return last_hidden.sum(dim=1) / attention_mask.sum(dim=1)[..., None] @@ -51,7 +51,7 @@ def average_pool(last_hidden_states: Tensor, attention_mask: Tensor) -> Tensor: # from transformers import PreTrainedModel, PreTrainedTokenizer, PreTrainedTokenizerFast -def mean_pooling(model_output, attention_mask): +def mean_pooling(model_output: dict, attention_mask) -> Tensor: token_embeddings = model_output[0] #First element of model_output contains all token embeddings input_mask_expanded = attention_mask.unsqueeze(-1).expand(token_embeddings.size()).float() return torch.sum(token_embeddings * input_mask_expanded, 1) / torch.clamp(input_mask_expanded.sum(1), min=1e-9) @@ -147,7 +147,7 @@ def infer_embedding( self, input=Union[str, List[str], List[List[str]]], tolist: bool = True, - ): + ) -> Union[List, Tensor]: model = self.model self.handle_input(input) @@ -161,62 +161,48 @@ def infer_embedding( embeddings = embeddings.tolist() return embeddings - def handle_input(self, input: Union[str, List[str], List[List[str]]]): + def handle_input(self, input: Union[str, List[str], List[List[str]]]) -> Union[List[str], List[List[str]]]: if isinstance(input, str): input = [input] return input - def tokenize_inputs(self, input, kwargs: Optional[dict] = dict()): + def tokenize_inputs(self, input: Union[str, List[str], List[List[str]]], kwargs: Optional[dict] = dict()) -> dict: batch_dict = self.tokenizer(input, **kwargs) return batch_dict - def compute_model_outputs(self, batch_dict, model): + def compute_model_outputs(self, batch_dict: dict, model: PreTrainedModel) -> 
dict: with torch.no_grad(): outputs = model(**batch_dict) return outputs - def compute_embeddings(self, outputs, batch_dict): + def compute_embeddings(self, outputs: dict, batch_dict: dict): embeddings = mean_pooling( outputs, batch_dict["attention_mask"] ) return embeddings - """ - def __call__(self, **kwargs): - if "model" not in kwargs: - raise ValueError("model is required") - - if "mock" in kwargs and kwargs["mock"]: - import numpy as np - - embeddings = np.array([np.random.rand(768).tolist()]) - return embeddings - - # inference the model - return self.infer_embedding(kwargs["input"]) - """ # # Preprocessing, postprocessing and call for inference code # - def call(self, api_kwargs: Dict = {}, model_type: ModelType = ModelType.UNDEFINED): + def call(self, api_kwargs: Dict = {}) -> Union[List, Tensor]: # I don't think it is useful anymore # if "model" not in api_kwargs: # raise ValueError("model must be specified in api_kwargs") - if ( - model_type == ModelType.EMBEDDER - # and "model" in api_kwargs - ): - if "mock" in api_kwargs and api_kwargs["mock"]: - import numpy as np + # if ( + # model_type == ModelType.EMBEDDER + # # and "model" in api_kwargs + # ): + if "mock" in api_kwargs and api_kwargs["mock"]: + import numpy as np - embeddings = np.array([np.random.rand(768).tolist()]) - return embeddings + embeddings = np.array([np.random.rand(768).tolist()]) + return embeddings # inference the model return self.infer_embedding(api_kwargs["input"]) - def parse_embedding_response(self, response: Any) -> EmbedderOutput: + def parse_embedding_response(self, response: Union[List, Tensor]) -> EmbedderOutput: embeddings: List[Embedding] = [] for idx, emb in enumerate(response): embeddings.append(Embedding(index=idx, embedding=emb)) @@ -226,13 +212,12 @@ def parse_embedding_response(self, response: Any) -> EmbedderOutput: def convert_inputs_to_api_kwargs( self, input: Any, # for retriever, it is a single query, - model_kwargs: dict = {}, - model_type: ModelType = ModelType.UNDEFINED, + model_kwargs: dict = {} ) -> dict: final_model_kwargs = model_kwargs.copy() - if model_type == ModelType.EMBEDDER: - final_model_kwargs["input"] = input - return final_model_kwargs + # if model_type == ModelType.EMBEDDER: + final_model_kwargs["input"] = input + return final_model_kwargs # # From e2023b2984d9891f69b33f32ca39efcb185f6392 Mon Sep 17 00:00:00 2001 From: Alexandre Adjahossou Date: Thu, 29 Aug 2024 14:27:21 +0000 Subject: [PATCH 04/36] Removed now useless model_types. --- adalflow/adalflow/core/embedder.py | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/adalflow/adalflow/core/embedder.py b/adalflow/adalflow/core/embedder.py index 89aac0c5..1ce131b1 100644 --- a/adalflow/adalflow/core/embedder.py +++ b/adalflow/adalflow/core/embedder.py @@ -36,7 +36,6 @@ class Embedder(Component): - Use ``BatchEmbedder`` for automatically batching input of large size, larger than 100. 
""" - model_type: ModelType = ModelType.EMBEDDER model_client: ModelClient output_processors: Optional[Component] @@ -100,8 +99,7 @@ def _pre_call( # step 2: convert the input to the api_kwargs api_kwargs = self.model_client.convert_inputs_to_api_kwargs( input=input, - model_kwargs=composed_model_kwargs, - model_type=self.model_type, + model_kwargs=composed_model_kwargs ) log.debug(f"api_kwargs: {api_kwargs}") return api_kwargs @@ -140,7 +138,7 @@ def call( response = None try: response = self.model_client.call( - api_kwargs=api_kwargs, model_type=self.model_type + api_kwargs=api_kwargs ) except Exception as e: log.error(f"Error calling the model: {e}") From a93bd6a68311d726ed62af82d4b1b74cb33e565a Mon Sep 17 00:00:00 2001 From: Alexandre Adjahossou Date: Thu, 29 Aug 2024 14:27:50 +0000 Subject: [PATCH 05/36] Added test for TransformerEmbeddingModelClient execution. --- adalflow/tests/test_transformer_client.py | 129 +++++++++++++++------- 1 file changed, 92 insertions(+), 37 deletions(-) diff --git a/adalflow/tests/test_transformer_client.py b/adalflow/tests/test_transformer_client.py index d8562454..86111281 100644 --- a/adalflow/tests/test_transformer_client.py +++ b/adalflow/tests/test_transformer_client.py @@ -1,55 +1,110 @@ import unittest import torch - +from adalflow.components.model_client.transformers_client import TransformerEmbeddingModelClient +from adalflow.core.types import ModelType +from adalflow.core import Embedder # Set the number of threads for PyTorch, avoid segementation fault torch.set_num_threads(1) torch.set_num_interop_threads(1) -class TestTransformerModelClient(unittest.TestCase): +class TestTransformerEmbeddingModelClient(unittest.TestCase): def setUp(self) -> None: - self.query = "what is panda?" self.documents = [ "The giant panda (Ailuropoda melanoleuca), sometimes called a panda bear or simply panda, is a bear species endemic to China.", "The red panda (Ailurus fulgens), also called the lesser panda, the red bear-cat, and the red cat-bear, is a mammal native to the eastern Himalayas and southwestern China.", ] - # def test_transformer_embedder(self): - # transformer_embedder_model = "thenlper/gte-base" - # transformer_embedder_model_component = TransformerEmbedder( - # model_name=transformer_embedder_model - # ) - # print( - # f"Testing transformer embedder with model {transformer_embedder_model_component}" - # ) - # print("Testing transformer embedder") - # output = transformer_embedder_model_component( - # model=transformer_embedder_model, input="Hello world" - # ) - # print(output) - - # def test_transformer_client(self): - # transformer_client = TransformersClient() - # print("Testing transformer client") - # # run the model - # kwargs = { - # "model": "thenlper/gte-base", - # # "mock": False, - # } - # api_kwargs = transformer_client.convert_inputs_to_api_kwargs( - # input="Hello world", - # model_kwargs=kwargs, - # model_type=ModelType.EMBEDDER, - # ) - # # print(api_kwargs) - # output = transformer_client.call( - # api_kwargs=api_kwargs, model_type=ModelType.EMBEDDER - # ) - - # # print(transformer_client) - # # print(output) + def test_execution(self): + test_input = "Hello word" + embedding_model = "thenlper/gte-base" + model_kwargs = {"model": embedding_model} + tokenizer_kwargs = { + "max_length": 512, + "padding": True, + "truncation": True, + "return_tensors": 'pt' + } + model_client = TransformerEmbeddingModelClient( + model_name=embedding_model, + tokenizer_kwargs=tokenizer_kwargs + ) + print( + f"Testing model client with model 
{embedding_model}" + ) + api_kwargs = model_client.convert_inputs_to_api_kwargs(input=test_input, model_kwargs=model_kwargs) + output = model_client.call(api_kwargs=api_kwargs) + print(output) + + def test_integration_with_embedder(self): + + test_input = "Hello word" + embedding_model = "thenlper/gte-base" + model_kwargs = {"model": embedding_model} + tokenizer_kwargs = { + "max_length": 512, + "padding": True, + "truncation": True, + "return_tensors": 'pt' + } + model_client = TransformerEmbeddingModelClient( + model_name=embedding_model, + tokenizer_kwargs=tokenizer_kwargs + ) + print( + f"Testing model client with model {embedding_model}" + ) + embedder = Embedder(model_client=model_client, + model_kwargs=model_kwargs + ) + output = embedder(test_input) + print(output) + +# class TestTransformerModelClient(unittest.TestCase): +# def setUp(self) -> None: + +# self.query = "what is panda?" +# self.documents = [ +# "The giant panda (Ailuropoda melanoleuca), sometimes called a panda bear or simply panda, is a bear species endemic to China.", +# "The red panda (Ailurus fulgens), also called the lesser panda, the red bear-cat, and the red cat-bear, is a mammal native to the eastern Himalayas and southwestern China.", +# ] + +# def test_transformer_embedder(self): +# transformer_embedder_model = "thenlper/gte-base" +# transformer_embedder_model_component = TransformerEmbedder( +# model_name=transformer_embedder_model +# ) +# print( +# f"Testing transformer embedder with model {transformer_embedder_model_component}" +# ) +# print("Testing transformer embedder") +# output = transformer_embedder_model_component( +# model=transformer_embedder_model, input="Hello world" +# ) +# print(output) + +# def test_transformer_client(self): +# transformer_client = TransformersClient() +# print("Testing transformer client") +# # run the model +# kwargs = { +# "model": "thenlper/gte-base", +# # "mock": False, +# } +# api_kwargs = transformer_client.convert_inputs_to_api_kwargs( +# input="Hello world", +# model_kwargs=kwargs, +# model_type=ModelType.EMBEDDER, +# ) +# # print(api_kwargs) +# output = transformer_client.call( +# api_kwargs=api_kwargs, model_type=ModelType.EMBEDDER +# ) + + # print(transformer_client) + # print(output) # def test_transformer_reranker(self): # transformer_reranker_model = "BAAI/bge-reranker-base" From 424cbfb130e83131809ce6f9e407b6bcaa714cc4 Mon Sep 17 00:00:00 2001 From: Alexandre Adjahossou Date: Tue, 3 Sep 2024 15:18:51 +0000 Subject: [PATCH 06/36] Changed my mind. 
--- .../adalflow/components/model_client/transformers_client.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/adalflow/adalflow/components/model_client/transformers_client.py b/adalflow/adalflow/components/model_client/transformers_client.py index 5dadc4d2..5a0f544a 100644 --- a/adalflow/adalflow/components/model_client/transformers_client.py +++ b/adalflow/adalflow/components/model_client/transformers_client.py @@ -186,9 +186,9 @@ def compute_embeddings(self, outputs: dict, batch_dict: dict): # def call(self, api_kwargs: Dict = {}) -> Union[List, Tensor]: + if "model" not in api_kwargs: + raise ValueError("model must be specified in api_kwargs") # I don't think it is useful anymore - # if "model" not in api_kwargs: - # raise ValueError("model must be specified in api_kwargs") # if ( # model_type == ModelType.EMBEDDER # # and "model" in api_kwargs From ef2783d47bd8c96e10b22d9660a888e39b70098a Mon Sep 17 00:00:00 2001 From: Alexandre Adjahossou Date: Tue, 3 Sep 2024 17:12:19 +0000 Subject: [PATCH 07/36] Changed my mind. removing model type might introduce issues. --- adalflow/adalflow/core/embedder.py | 1 + 1 file changed, 1 insertion(+) diff --git a/adalflow/adalflow/core/embedder.py b/adalflow/adalflow/core/embedder.py index 1ce131b1..7dae1ca7 100644 --- a/adalflow/adalflow/core/embedder.py +++ b/adalflow/adalflow/core/embedder.py @@ -36,6 +36,7 @@ class Embedder(Component): - Use ``BatchEmbedder`` for automatically batching input of large size, larger than 100. """ + model_type: ModelType = ModelType.EMBEDDER model_client: ModelClient output_processors: Optional[Component] From 3601445968b198b0bfef64b58b31c11cf721084f Mon Sep 17 00:00:00 2001 From: Alexandre Adjahossou Date: Tue, 3 Sep 2024 17:54:51 +0000 Subject: [PATCH 08/36] Removed now useless argument. --- adalflow/adalflow/core/embedder.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/adalflow/adalflow/core/embedder.py b/adalflow/adalflow/core/embedder.py index 7dae1ca7..c43518df 100644 --- a/adalflow/adalflow/core/embedder.py +++ b/adalflow/adalflow/core/embedder.py @@ -168,7 +168,7 @@ async def acall( response = None try: response = await self.model_client.acall( - api_kwargs=api_kwargs, model_type=self.model_type + api_kwargs=api_kwargs ) except Exception as e: log.error(f"Error calling the model: {e}") From 3cdab7b4916c0d6179a46c35dc2591fceafc9f21 Mon Sep 17 00:00:00 2001 From: Alexandre Adjahossou Date: Tue, 3 Sep 2024 18:02:13 +0000 Subject: [PATCH 09/36] Removed now useless arguments. 
--- adalflow/adalflow/core/generator.py | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/adalflow/adalflow/core/generator.py b/adalflow/adalflow/core/generator.py index 02765132..220420aa 100644 --- a/adalflow/adalflow/core/generator.py +++ b/adalflow/adalflow/core/generator.py @@ -312,7 +312,6 @@ def _pre_call(self, prompt_kwargs: Dict, model_kwargs: Dict) -> Dict[str, Any]: api_kwargs = self.model_client.convert_inputs_to_api_kwargs( input=prompt_str, model_kwargs=composed_model_kwargs, - model_type=self.model_type, ) return api_kwargs @@ -329,7 +328,7 @@ def _model_client_call(self, api_kwargs: Dict, use_cache: bool = False) -> Any: return cached_completion completion = self.model_client.call( - api_kwargs=api_kwargs, model_type=self.model_type + api_kwargs=api_kwargs ) # prepare cache if use_cache: @@ -799,7 +798,7 @@ async def acall( try: completion = await self.model_client.acall( - api_kwargs=api_kwargs, model_type=self.model_type + api_kwargs=api_kwargs ) except Exception as e: log.error(f"Error calling the model: {e}") From 2d1152fd7cf6a7e599c016a482f36c098d16e43e Mon Sep 17 00:00:00 2001 From: Alexandre Adjahossou Date: Tue, 3 Sep 2024 22:02:12 +0000 Subject: [PATCH 10/36] DRAFT: merge TransformerClient and TransformerLLM in 1 class. --- .../model_client/transformers_client.py | 268 ++++++++++++++++++ 1 file changed, 268 insertions(+) diff --git a/adalflow/adalflow/components/model_client/transformers_client.py b/adalflow/adalflow/components/model_client/transformers_client.py index 5a0f544a..b032c22a 100644 --- a/adalflow/adalflow/components/model_client/transformers_client.py +++ b/adalflow/adalflow/components/model_client/transformers_client.py @@ -28,9 +28,12 @@ from transformers import ( AutoTokenizer, AutoModel, + AutoModelForCausalLM, AutoModelForSequenceClassification, + pipeline ) +from os import getenv as get_env_variable log = logging.getLogger(__name__) @@ -219,6 +222,271 @@ def convert_inputs_to_api_kwargs( final_model_kwargs["input"] = input return final_model_kwargs + +class TransformerLLMModelClient(ModelClient): + + # + # Model initialisation + # + def __init__( + self, + model_name: Optional[str] = None, + init_from: Optional[str] = "autoclass", + use_token: bool = False, + torch_dtype: Optional[Any] = torch.bfloat16, + local_files_only: Optional[bool] = False + ): + super().__init__() + + self.model_name = model_name # current model to use + self.use_token = use_token + self.torch_dtype = torch_dtype + self.init_from = init_from + self.local_files_only = local_files_only + self.model = None + if model_name is not None: + self.init_model(model_name=model_name) + + def _check_token(self, token: str): + if get_env_variable(token) is None: + warnings.warn( + f"{token} is not set. You may not be able to access the model." 
+ ) + + def _get_token_if_relevant(self) -> Union[str, bool]: + if self.use_token: + self._check_token("HF_TOKEN") + token = get_env_variable("HF_TOKEN") + else: + token = False + return token + + def _init_from_pipeline(self): + + clean_device_cache() + token = self._get_token_if_relevant() # return a token string or False + self.model = pipeline( + "text-generation", + model=self.model_name, + torch_dtype=self.torch_dtype, + device=get_device(), + token=token + ) + + def _init_from_automodelcasual_lm(self): + + token = self._get_token_if_relevant() # return a token str or False + + self.tokenizer = AutoTokenizer.from_pretrained( + self.model_name, + token=token, + local_files_only=self.local_files_only + ) + self.model = AutoModelForCausalLM.from_pretrained( + self.model_name, + torch_dtype=self.torch_dtype, + device_map="auto", + token=token, + local_files_only=self.local_files_only + ) + + @lru_cache(None) + def init_model(self, model_name: str): + + log.debug(f"Loading model {model_name}") + try: + if self.init_from == "autoclass": + self._init_from_automodelcasual_lm() + elif self.init_from == "pipeline": + self._init_from_pipeline() + else: + raise ValueError("argument 'init_from' must be one of 'autoclass' or 'pipeline'.") + except Exception as e: + log.error(f"Error loading model {model_name}: {e}") + raise e + + # + # Inference code + # + def _infer_from_pipeline( + self, + *, + model: str, + messages: Sequence[Dict[str, str]], + max_tokens: Optional[int] = None, + apply_chat_template: bool = False, + chat_template: Optional[str] = None, + chat_template_kwargs: Optional[dict] = dict(tokenize=False, add_generation_prompt=True), + **kwargs, + ): + + if not self.model: + self.init_model(model_name=model) + + log.info( + f"Start to infer model {model}, messages: {messages}, kwargs: {kwargs}" + ) + # TO DO: add default values in doc + final_kwargs = { + "max_new_tokens": max_tokens or 256, + "do_sample": True, + "temperature": kwargs.get("temperature", 0.7), + "top_k": kwargs.get("top_k", 50), + "top_p": kwargs.get("top_p", 0.95), + } + if apply_chat_template: + model_input = self._handle_input( + messages, + apply_chat_template=True, + chat_template_kwargs=chat_template_kwargs, + chat_template=chat_template + ) + else: + model_input = self._handle_input(messages) + + outputs = self.model( + model_input, + **final_kwargs, + ) + log.info(f"Outputs: {outputs}") + return outputs + + def _infer_from_automodelcasual_lm( + self, + *, + model: str, + messages: Sequence[Dict[str, str]], + max_tokens: Optional[int] = None, + max_length: Optional[int] = 8192, # model-agnostic + apply_chat_template: bool = False, + chat_template: Optional[str] = None, + chat_template_kwargs: Optional[dict] = dict(tokenize=False, add_generation_prompt=True), + **kwargs, + ): + if not self.model: + self.init_model(model_name=model) + + if apply_chat_template: + model_input = self._handle_input( + messages, + apply_chat_template=True, + chat_template_kwargs=chat_template_kwargs, + chat_template=chat_template + ) + else: + model_input = self._handle_input(messages) + + input_ids = self.tokenizer(model_input, return_tensors="pt").to( + get_device() + ) + outputs_tokens = self.model.generate(**input_ids, max_length=max_length, max_new_tokens=max_tokens, **kwargs) + outputs = [] + for output in outputs_tokens: + outputs.append(self.tokenizer.decode(output)) + return outputs + + def _handle_input( + self, + messages: Sequence[Dict[str, str]], + apply_chat_template: bool = False, + chat_template_kwargs: dict = None, + 
chat_template: Optional[str] = None, + ) -> str: + + if apply_chat_template: + if chat_template is not None: + self.tokenizer.chat_template = chat_template + prompt = self.model.tokenizer.apply_chat_template( + messages, **chat_template_kwargs + ) + return prompt + else: + text = messages[-1]["content"] + return text + + def infer_llm( + self, + *, + model: str, + messages: Sequence[Dict[str, str]], + max_tokens: Optional[int] = None, + **kwargs, + ): + + if self.init_from == "pipeline": + return self._infer_from_pipeline( + model=model, messages=messages, max_tokens=max_tokens, **kwargs + ) + else: + return self._infer_from_automodelcasual_lm( + model=model, messages=messages, max_tokens=max_tokens, **kwargs + ) + + # + # Preprocessing, postprocessing and call for inference code + # + def call(self, api_kwargs: Dict = {}): + + log.debug(f"api_kwargs: {api_kwargs}") + + if "model" not in api_kwargs: + raise ValueError("model must be specified in api_kwargs") + + model_name = api_kwargs["model"] + if (model_name != self.model_name) and (self.model_name is not None): + # need to update the model_name + log.warning(f"The model passed in 'model_kwargs' is different that the one that has been previously initialised: Updating model from {self.model_name} to {model_name}.") + self.model_name = model_name + self.init_model(model_name=model_name) + elif (model_name != self.model_name) and (self.model_name is None): + # need to initialize the model for the first time + self.model_name = model_name + self.init_model(model_name=model_name) + + + output = self.infer_llm(**api_kwargs) + return output + + def _parse_chat_completion_from_pipeline(self, completion: Any) -> str: + + text = completion[0]["generated_text"] + + pattern = r"(?<=\|assistant\|>).*" + + match = re.search(pattern, text) + + if match: + text = match.group().strip().lstrip("\\n") + return text + else: + return "" + + def _parse_chat_completion_from_automodelcasual_lm(self, completion: Any) -> GeneratorOutput: + print(f"completion: {completion}") + return completion[0] + + def parse_chat_completion(self, completion: Any) -> str: + try: + if self.init_from == "pipeline": + output = self._parse_chat_completion_from_pipeline(completion) + else: + output = self._parse_chat_completion_from_automodelcasual_lm(completion) + return GeneratorOutput(data=output, raw_response=str(completion)) + except Exception as e: + log.error(f"Error parsing chat completion: {e}") + return GeneratorOutput(data=None, raw_response=str(completion), error=e) + + def convert_inputs_to_api_kwargs( + self, + input: Any, # for retriever, it is a single query, + model_kwargs: dict = {} + ) -> dict: + final_model_kwargs = model_kwargs.copy() + assert "model" in final_model_kwargs, "model must be specified" + messages = [{"role": "system", "content": input}] + final_model_kwargs["messages"] = messages + return final_model_kwargs + # # # From d839edaaac0bb6ffcd1db52d67abfc26ffbd6929 Mon Sep 17 00:00:00 2001 From: Alexandre Adjahossou Date: Wed, 4 Sep 2024 18:49:47 +0000 Subject: [PATCH 11/36] Multiline message: Added 'tokenizer_kwargs' in TransformerLLMModelClient constructor for more flexibility. Added chat template argument in constructor for more flexibility. Added pad token check. Added tokenizer in '_infer_from_pipeline()' when chat_template is used (required). Fixed _handle_input() for 'apply_chat_template'==True. Not sure: ficed message in convert_inputs_to_api_kwargs(). 
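To make the template handling concrete, this is roughly what the chat-template
path boils down to; the zephyr checkpoint is only an example of a model that
ships its own template and is not required by this patch:

    from transformers import AutoTokenizer

    tokenizer = AutoTokenizer.from_pretrained("HuggingFaceH4/zephyr-7b-beta")
    messages = [{"role": "user", "content": "Where is Brian?"}]
    # With tokenize=False this returns a plain prompt string wrapped in the
    # model's special tokens, ready for pipeline()/generate().
    prompt = tokenizer.apply_chat_template(
        messages, tokenize=False, add_generation_prompt=True
    )
    # With tokenize=True it returns token ids instead, which is why
    # _handle_input() now decodes them back to text before generation.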
--- .../model_client/transformers_client.py | 78 +++++++++++++++---- 1 file changed, 62 insertions(+), 16 deletions(-) diff --git a/adalflow/adalflow/components/model_client/transformers_client.py b/adalflow/adalflow/components/model_client/transformers_client.py index b032c22a..b604606d 100644 --- a/adalflow/adalflow/components/model_client/transformers_client.py +++ b/adalflow/adalflow/components/model_client/transformers_client.py @@ -231,7 +231,11 @@ class TransformerLLMModelClient(ModelClient): def __init__( self, model_name: Optional[str] = None, + tokenizer_kwargs: Optional[dict] = {}, init_from: Optional[str] = "autoclass", + apply_chat_template: bool = False, + chat_template: Optional[str] = None, + chat_template_kwargs: Optional[dict] = dict(tokenize=False, add_generation_prompt=True), use_token: bool = False, torch_dtype: Optional[Any] = torch.bfloat16, local_files_only: Optional[bool] = False @@ -239,9 +243,13 @@ def __init__( super().__init__() self.model_name = model_name # current model to use + self.tokenizer_kwargs = tokenizer_kwargs self.use_token = use_token self.torch_dtype = torch_dtype self.init_from = init_from + self.apply_chat_template = apply_chat_template + self.chat_template = chat_template + self.chat_template_kwargs = chat_template_kwargs self.local_files_only = local_files_only self.model = None if model_name is not None: @@ -280,7 +288,8 @@ def _init_from_automodelcasual_lm(self): self.tokenizer = AutoTokenizer.from_pretrained( self.model_name, token=token, - local_files_only=self.local_files_only + local_files_only=self.local_files_only, + **self.tokenizer_kwargs ) self.model = AutoModelForCausalLM.from_pretrained( self.model_name, @@ -289,6 +298,13 @@ def _init_from_automodelcasual_lm(self): token=token, local_files_only=self.local_files_only ) + # Set pad token if it's not already set + if self.tokenizer.pad_token is None: + self.tokenizer.pad_token = self.tokenizer.eos_token # common fallback + self.model.config.pad_token_id = ( + self.tokenizer.eos_token_id + ) # ensure consistency in the model config + @lru_cache(None) def init_model(self, model_name: str): @@ -309,15 +325,15 @@ def init_model(self, model_name: str): # Inference code # def _infer_from_pipeline( - self, - *, - model: str, - messages: Sequence[Dict[str, str]], - max_tokens: Optional[int] = None, - apply_chat_template: bool = False, - chat_template: Optional[str] = None, - chat_template_kwargs: Optional[dict] = dict(tokenize=False, add_generation_prompt=True), - **kwargs, + self, + *, + model: str, + messages: Sequence[Dict[str, str]], + max_tokens: Optional[int] = None, + apply_chat_template: bool = False, + chat_template: Optional[str] = None, + chat_template_kwargs: Optional[dict] = dict(tokenize=False, add_generation_prompt=True), + **kwargs, ): if not self.model: @@ -335,11 +351,24 @@ def _infer_from_pipeline( "top_p": kwargs.get("top_p", 0.95), } if apply_chat_template: + self.tokenizer = AutoTokenizer.from_pretrained( + self.model_name, + token=self._get_token_if_relevant(), + local_files_only=self.local_files_only, + **self.tokenizer_kwargs + ) + # Set pad token if it's not already set + if self.tokenizer.pad_token is None: + self.tokenizer.pad_token = self.tokenizer.eos_token # common fallback + self.model.config.pad_token_id = ( + self.tokenizer.eos_token_id + ) # ensure consistency in the model config + model_input = self._handle_input( messages, apply_chat_template=True, + chat_template=chat_template, chat_template_kwargs=chat_template_kwargs, - chat_template=chat_template ) 
else: model_input = self._handle_input(messages) @@ -396,10 +425,14 @@ def _handle_input( if apply_chat_template: if chat_template is not None: self.tokenizer.chat_template = chat_template - prompt = self.model.tokenizer.apply_chat_template( + prompt = self.tokenizer.apply_chat_template( messages, **chat_template_kwargs ) - return prompt + if ("tokenize" in chat_template_kwargs) and (chat_template_kwargs["tokenize"] == True): + prompt = self.tokenizer.decode(prompt) + return prompt + else: + return prompt else: text = messages[-1]["content"] return text @@ -415,11 +448,23 @@ def infer_llm( if self.init_from == "pipeline": return self._infer_from_pipeline( - model=model, messages=messages, max_tokens=max_tokens, **kwargs + model=model, + messages=messages, + max_tokens=max_tokens, + apply_chat_template=self.apply_chat_template, + chat_template=self.chat_template, + chat_template_kwargs=self.chat_template_kwargs, + **kwargs ) else: return self._infer_from_automodelcasual_lm( - model=model, messages=messages, max_tokens=max_tokens, **kwargs + model=model, + messages=messages, + max_tokens=max_tokens, + apply_chat_template=self.apply_chat_template, + chat_template=self.chat_template, + chat_template_kwargs=self.chat_template_kwargs, + **kwargs ) # @@ -483,7 +528,8 @@ def convert_inputs_to_api_kwargs( ) -> dict: final_model_kwargs = model_kwargs.copy() assert "model" in final_model_kwargs, "model must be specified" - messages = [{"role": "system", "content": input}] + #messages = [{"role": "system", "content": input}] + messages = [{"role": "user", "content": input}] # Not sure, but it seems to make more sense final_model_kwargs["messages"] = messages return final_model_kwargs From 7daf8ae37334687ac12f47eb48338cb60e7a262c Mon Sep 17 00:00:00 2001 From: Alexandre Adjahossou Date: Fri, 6 Sep 2024 11:59:18 +0000 Subject: [PATCH 12/36] Added ests for TransformerLLMModelClient. 
--- adalflow/tests/test_transformer_client.py | 85 ++++++++++++++++++++++- 1 file changed, 83 insertions(+), 2 deletions(-) diff --git a/adalflow/tests/test_transformer_client.py b/adalflow/tests/test_transformer_client.py index 86111281..193ecbc8 100644 --- a/adalflow/tests/test_transformer_client.py +++ b/adalflow/tests/test_transformer_client.py @@ -1,8 +1,8 @@ import unittest import torch -from adalflow.components.model_client.transformers_client import TransformerEmbeddingModelClient +from adalflow.components.model_client.transformers_client import TransformerEmbeddingModelClient, TransformerLLMModelClient from adalflow.core.types import ModelType -from adalflow.core import Embedder +from adalflow.core import Embedder, Generator # Set the number of threads for PyTorch, avoid segementation fault torch.set_num_threads(1) @@ -62,6 +62,87 @@ def test_integration_with_embedder(self): output = embedder(test_input) print(output) +class TestTransformerLLMModelClient(unittest.TestCase): + + def setUp(self) -> None: + + self.model_kwargs = { + "model": "roneneldan/TinyStories-1M", + "temperature": 0.1, + "do_sample": True + } + self.tokenizer_kwargs = { + "max_length": True, + "truncation": True, + } + self.prompt_kwargs = { + "input_str": "Where is Brian?", # test input + } + self.chat_template_kwargs = { + "tokenize": False, + "add_generation_prompt": False + } + self.chat_template = """ + {%- for message in messages %} + {%- if message['role'] == 'user' %} + {{- bos_token + '[INST] ' + message['content'].strip() + ' [/INST]' }} + {%- elif message['role'] == 'system' %} + {{- '<>\\n' + message['content'].strip() + '\\n<>\\n\\n' }} + {%- elif message['role'] == 'assistant' %} + {{- '[ASST] ' + message['content'] + ' [/ASST]' + eos_token }} + {%- endif %} + {%- endfor %} + """ # Reference: https://huggingface.co/docs/transformers/main/en/chat_templating#how-do-i-create-a-chat-template + + def test_exectution(self): + model_client = TransformerLLMModelClient( + tokenizer_kwargs=self.tokenizer_kwargs, + local_files_only=False, + init_from="autoclass", + apply_chat_template=True, + chat_template=self.chat_template, + chat_template_kwargs=self.chat_template_kwargs + ) + api_kwargs = model_client.convert_inputs_to_api_kwargs(input="Where is brian?", model_kwargs=self.model_kwargs) + output = model_client.call(api_kwargs=api_kwargs) + print(output) + + def test_integration_with_generator_autoclass(self): + model_client = TransformerLLMModelClient( + tokenizer_kwargs=self.tokenizer_kwargs, + local_files_only=False, + init_from="autoclass", + apply_chat_template=True, + chat_template=self.chat_template, + chat_template_kwargs=self.chat_template_kwargs + ) + generator = Generator( + model_client=model_client, + model_kwargs=self.model_kwargs, + # prompt_kwargs=prompt_kwargs, + # output_processors=JsonParser(), + ) + output = generator(prompt_kwargs=self.prompt_kwargs) + print(output) + + def test_integration_with_generator_pipeline(self): + model_client = TransformerLLMModelClient( + tokenizer_kwargs=self.tokenizer_kwargs, + local_files_only=False, + init_from="pipeline", + apply_chat_template=True, + chat_template=self.chat_template, + chat_template_kwargs=self.chat_template_kwargs + ) + generator = Generator( + model_client=model_client, + model_kwargs=self.model_kwargs, + # prompt_kwargs=prompt_kwargs, + # output_processors=JsonParser(), + ) + output = generator(prompt_kwargs=self.prompt_kwargs) + print(output) + # class TestTransformerModelClient(unittest.TestCase): # def setUp(self) -> None: From 
a4eb3bb3d70f43cc3a99af71fc22da473641b405 Mon Sep 17 00:00:00 2001 From: Alexandre Adjahossou Date: Fri, 6 Sep 2024 12:02:11 +0000 Subject: [PATCH 13/36] Removed temporary log. --- .../adalflow/components/model_client/transformers_client.py | 3 --- 1 file changed, 3 deletions(-) diff --git a/adalflow/adalflow/components/model_client/transformers_client.py b/adalflow/adalflow/components/model_client/transformers_client.py index b604606d..df98d4f9 100644 --- a/adalflow/adalflow/components/model_client/transformers_client.py +++ b/adalflow/adalflow/components/model_client/transformers_client.py @@ -404,7 +404,6 @@ def _infer_from_automodelcasual_lm( ) else: model_input = self._handle_input(messages) - input_ids = self.tokenizer(model_input, return_tensors="pt").to( get_device() ) @@ -472,8 +471,6 @@ def infer_llm( # def call(self, api_kwargs: Dict = {}): - log.debug(f"api_kwargs: {api_kwargs}") - if "model" not in api_kwargs: raise ValueError("model must be specified in api_kwargs") From 6a9c657d0f7ae031dea15ff00ad0c345f570c35c Mon Sep 17 00:00:00 2001 From: Alexandre Adjahossou Date: Fri, 6 Sep 2024 14:53:56 +0000 Subject: [PATCH 14/36] Changed my mind: added model_type back into call(). --- adalflow/adalflow/core/embedder.py | 3 ++- adalflow/adalflow/core/generator.py | 3 ++- 2 files changed, 4 insertions(+), 2 deletions(-) diff --git a/adalflow/adalflow/core/embedder.py b/adalflow/adalflow/core/embedder.py index c43518df..c8f069a9 100644 --- a/adalflow/adalflow/core/embedder.py +++ b/adalflow/adalflow/core/embedder.py @@ -139,7 +139,8 @@ def call( response = None try: response = self.model_client.call( - api_kwargs=api_kwargs + api_kwargs=api_kwargs, + model_type=self.model_type ) except Exception as e: log.error(f"Error calling the model: {e}") diff --git a/adalflow/adalflow/core/generator.py b/adalflow/adalflow/core/generator.py index 220420aa..a89eb34b 100644 --- a/adalflow/adalflow/core/generator.py +++ b/adalflow/adalflow/core/generator.py @@ -328,7 +328,8 @@ def _model_client_call(self, api_kwargs: Dict, use_cache: bool = False) -> Any: return cached_completion completion = self.model_client.call( - api_kwargs=api_kwargs + api_kwargs=api_kwargs, + model_type=self.model_type ) # prepare cache if use_cache: From cd1823b7a4f3dc4fe59d2c02e77c33914597a751 Mon Sep 17 00:00:00 2001 From: Alexandre Adjahossou Date: Fri, 6 Sep 2024 15:00:51 +0000 Subject: [PATCH 15/36] Changed my mind about model type. See prev commit. 
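In other words: the previous commit restored call sites such as

    self.model_client.call(api_kwargs=api_kwargs, model_type=self.model_type)

in Embedder and Generator, so the merged clients accept an optional model_type
(defaulting to ModelType.UNDEFINED) in call() and convert_inputs_to_api_kwargs()
and simply ignore it, staying signature-compatible with those call sites.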
--- .../components/model_client/transformers_client.py | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/adalflow/adalflow/components/model_client/transformers_client.py b/adalflow/adalflow/components/model_client/transformers_client.py index df98d4f9..14aa0a8e 100644 --- a/adalflow/adalflow/components/model_client/transformers_client.py +++ b/adalflow/adalflow/components/model_client/transformers_client.py @@ -187,7 +187,7 @@ def compute_embeddings(self, outputs: dict, batch_dict: dict): # # Preprocessing, postprocessing and call for inference code # - def call(self, api_kwargs: Dict = {}) -> Union[List, Tensor]: + def call(self, api_kwargs: Dict = {}, model_type: Optional[ModelType]= ModelType.UNDEFINED) -> Union[List, Tensor]: if "model" not in api_kwargs: raise ValueError("model must be specified in api_kwargs") @@ -215,7 +215,8 @@ def parse_embedding_response(self, response: Union[List, Tensor]) -> EmbedderOut def convert_inputs_to_api_kwargs( self, input: Any, # for retriever, it is a single query, - model_kwargs: dict = {} + model_kwargs: dict = {}, + model_type: Optional[ModelType]= ModelType.UNDEFINED ) -> dict: final_model_kwargs = model_kwargs.copy() # if model_type == ModelType.EMBEDDER: @@ -469,7 +470,7 @@ def infer_llm( # # Preprocessing, postprocessing and call for inference code # - def call(self, api_kwargs: Dict = {}): + def call(self, api_kwargs: Dict = {}, model_type: Optional[ModelType]= ModelType.UNDEFINED): if "model" not in api_kwargs: raise ValueError("model must be specified in api_kwargs") @@ -521,7 +522,8 @@ def parse_chat_completion(self, completion: Any) -> str: def convert_inputs_to_api_kwargs( self, input: Any, # for retriever, it is a single query, - model_kwargs: dict = {} + model_kwargs: dict = {}, + model_type: Optional[ModelType]= ModelType.UNDEFINED ) -> dict: final_model_kwargs = model_kwargs.copy() assert "model" in final_model_kwargs, "model must be specified" From 2bf711a48601bf446fcee74df1065cdc7ec0301e Mon Sep 17 00:00:00 2001 From: Alexandre Adjahossou Date: Mon, 9 Sep 2024 09:40:10 +0000 Subject: [PATCH 16/36] Ensured tokenizer_kwargs has 'return_tensors' set to 'pt' by default. --- .../adalflow/components/model_client/transformers_client.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/adalflow/adalflow/components/model_client/transformers_client.py b/adalflow/adalflow/components/model_client/transformers_client.py index 14aa0a8e..9f7b7b9f 100644 --- a/adalflow/adalflow/components/model_client/transformers_client.py +++ b/adalflow/adalflow/components/model_client/transformers_client.py @@ -77,6 +77,8 @@ def __init__( super().__init__() self.model_name = model_name self.tokenizer_kwargs = tokenizer_kwargs + if "return_tensors" not in self.tokenizer_kwargs: + self.tokenizer_kwargs["return_tensors"]= "pt" self.auto_model=auto_model self.auto_tokenizer=auto_tokenizer self.custom_model=custom_model @@ -245,6 +247,8 @@ def __init__( self.model_name = model_name # current model to use self.tokenizer_kwargs = tokenizer_kwargs + if "return_tensors" not in self.tokenizer_kwargs: + self.tokenizer_kwargs["return_tensors"]= "pt" self.use_token = use_token self.torch_dtype = torch_dtype self.init_from = init_from From 570c8b11ba537ac4a2c3fee02029c361e41c504a Mon Sep 17 00:00:00 2001 From: Alexandre Adjahossou Date: Mon, 9 Sep 2024 10:07:23 +0000 Subject: [PATCH 17/36] DRAFT: merge TransformerClient and TransformerReranker in 1 class. 
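A rough usage sketch of the merged reranker client; the model name, query, and
documents echo the old reranker tests, and the tokenizer settings are only an
illustrative choice:

    client = TransformerRerankerModelClient(
        model_name="BAAI/bge-reranker-base",
        tokenizer_kwargs={"padding": True, "truncation": True, "max_length": 512},
    )
    api_kwargs = client.convert_inputs_to_api_kwargs(
        input="what is panda?",  # the query
        model_kwargs={
            "model": "BAAI/bge-reranker-base",
            "documents": [
                "The giant panda is a bear species endemic to China.",
                "The red panda is a mammal native to the eastern Himalayas.",
            ],
            "top_k": 2,
        },
    )
    # call() pops "top_k", scores each (query, document) pair, and returns the
    # indices and scores of the top_k best-matching documents.
    top_k_indices, top_k_scores = client.call(api_kwargs=api_kwargs)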
--- .../model_client/transformers_client.py | 126 ++++++++++++++++++ 1 file changed, 126 insertions(+) diff --git a/adalflow/adalflow/components/model_client/transformers_client.py b/adalflow/adalflow/components/model_client/transformers_client.py index 9f7b7b9f..74197a78 100644 --- a/adalflow/adalflow/components/model_client/transformers_client.py +++ b/adalflow/adalflow/components/model_client/transformers_client.py @@ -536,6 +536,132 @@ def convert_inputs_to_api_kwargs( final_model_kwargs["messages"] = messages return final_model_kwargs + +class TransformerRerankerModelClient(ModelClient): + + # + # Model initialisation + # + def __init__( + self, + model_name: Optional[str] = None, + tokenizer_kwargs: Optional[dict] = {}, + local_files_only: Optional[bool] = False + ): + self.model_name = model_name + self.tokenizer_kwargs = tokenizer_kwargs + if "return_tensors" not in self.tokenizer_kwargs: + self.tokenizer_kwargs["return_tensors"]= "pt" + self.local_files_only = local_files_only + if model_name is not None: + self.init_model(model_name=model_name) + + def init_model(self, model_name: str): + try: + self.tokenizer = AutoTokenizer.from_pretrained( + self.model_name, + local_files_only=self.local_files_only, + **self.tokenizer_kwargs + ) + self.model = AutoModelForSequenceClassification.from_pretrained( + self.model_name, + local_files_only=self.local_files_only + ) + # Check device availability and set the device + device = get_device() + + # Move model to the selected device + self.device = device + self.model.to(device) + self.model.eval() + # register the model + log.info(f"Done loading model {model_name}") + + except Exception as e: + log.error(f"Error loading model {model_name}: {e}") + raise e + + # + # Inference code + # + + def infer_reranker( + self, + model: str, + query: str, + documents: List[str], + ) -> List[float]: + if not self.model: + self.init_model(model_name=model) + # convert the query and documents to pair input + input = [(query, doc) for doc in documents] + + with torch.no_grad(): + + inputs = self.tokenizer( + input, + **self.tokenizer_kwargs + ) + inputs = {k: v.to(self.device) for k, v in inputs.items()} + scores = ( + self.model(**inputs, return_dict=True) + .logits.view( + -1, + ) + .float() + ) + # apply sigmoid to get the scores + scores = F.sigmoid(scores) + + scores = scores.tolist() + return scores + + # + # Preprocessing, postprocessing and call for inference code + # + def call(self, api_kwargs: Dict = {}): + + if "model" not in api_kwargs: + raise ValueError("model must be specified in api_kwargs") + + model_name = api_kwargs["model"] + if (model_name != self.model_name) and (self.model_name is not None): + # need to update the model_name + log.warning(f"The model passed in 'model_kwargs' is different that the one that has been previously initialised: Updating model from {self.model_name} to {model_name}.") + self.model_name = model_name + self.init_model(model_name=model_name) + elif (model_name != self.model_name) and (self.model_name is None): + # need to initialize the model for the first time + self.model_name = model_name + self.init_model(model_name=model_name) + + assert "query" in api_kwargs, "query is required" + assert "documents" in api_kwargs, "documents is required" + assert "top_k" in api_kwargs, "top_k is required" + + top_k = api_kwargs.pop("top_k") + scores = self.infer_reranker(**api_kwargs) + top_k_indices, top_k_scores = get_top_k_indices_scores( + scores, top_k + ) + log.warning(f"output: ({top_k_indices}, {top_k_scores})") + 
return top_k_indices, top_k_scores + + def convert_inputs_to_api_kwargs( + self, + input: Any, # for retriever, it is a single query, + model_kwargs: dict = {}, + model_type: ModelType = ModelType.UNDEFINED, + ) -> dict: + final_model_kwargs = model_kwargs.copy() + + assert "model" in final_model_kwargs, "model must be specified" + assert "documents" in final_model_kwargs, "documents must be specified" + assert "top_k" in final_model_kwargs, "top_k must be specified" + final_model_kwargs["query"] = input + return final_model_kwargs + + # # # From 881cfb6847f43a572f6cc0752fbe3c9bf5433faf Mon Sep 17 00:00:00 2001 From: Alexandre Adjahossou Date: Mon, 9 Sep 2024 10:16:02 +0000 Subject: [PATCH 18/36] Commented out old classes. --- .../model_client/transformers_client.py | 1502 ++++++++--------- 1 file changed, 743 insertions(+), 759 deletions(-) diff --git a/adalflow/adalflow/components/model_client/transformers_client.py b/adalflow/adalflow/components/model_client/transformers_client.py index 74197a78..8d97afd8 100644 --- a/adalflow/adalflow/components/model_client/transformers_client.py +++ b/adalflow/adalflow/components/model_client/transformers_client.py @@ -43,15 +43,6 @@ def average_pool(last_hidden_states: Tensor, attention_mask: list) -> Tensor: return last_hidden.sum(dim=1) / attention_mask.sum(dim=1)[..., None] - - -# -# -# -# DRAFT -# -# -# from transformers import PreTrainedModel, PreTrainedTokenizer, PreTrainedTokenizerFast def mean_pooling(model_output: dict, attention_mask) -> Tensor: @@ -662,754 +653,747 @@ def convert_inputs_to_api_kwargs( return final_model_kwargs -# -# -# -# END OF DRAFT -# -# -# - -# TODO: provide a standard api for embedding and chat models used in local model SDKs -class TransformerEmbedder: - """Local model SDK for transformers. - - - There are two ways to run transformers: - (1) model and then run model inference - (2) Pipeline and then run pipeline inference - - This file demonstrates how to - (1) create a torch model inference component: TransformerEmbedder which equalize to OpenAI(), the SyncAPIClient - (2) Convert this model inference component to LightRAG API client: TransformersClient - - The is now just an exmplary component that initialize a certain model from transformers and run inference on it. - It is not tested on all transformer models yet. It might be necessary to write one for each model. 
- - References: - - transformers: https://huggingface.co/docs/transformers/en/index - - thenlper/gte-base model:https://huggingface.co/thenlper/gte-base - """ - - models: Dict[str, type] = {} - - def __init__(self, model_name: Optional[str] = "thenlper/gte-base"): - super().__init__() - - if model_name is not None: - self.init_model(model_name=model_name) - - @lru_cache(None) - def init_model(self, model_name: str): - try: - self.tokenizer = AutoTokenizer.from_pretrained(model_name) - self.model = AutoModel.from_pretrained(model_name) - # register the model - self.models[model_name] = self.model - log.info(f"Done loading model {model_name}") - - except Exception as e: - log.error(f"Error loading model {model_name}: {e}") - raise e - - def infer_gte_base_embedding( - self, - input=Union[str, List[str]], - tolist: bool = True, - ): - model = self.models.get("thenlper/gte-base", None) - if model is None: - # initialize the model - self.init_model("thenlper/gte-base") - - if isinstance(input, str): - input = [input] - # Tokenize the input texts - batch_dict = self.tokenizer( - input, max_length=512, padding=True, truncation=True, return_tensors="pt" - ) - outputs = model(**batch_dict) - embeddings = average_pool( - outputs.last_hidden_state, batch_dict["attention_mask"] - ) - # (Optionally) normalize embeddings - embeddings = F.normalize(embeddings, p=2, dim=1) - if tolist: - embeddings = embeddings.tolist() - return embeddings - - def __call__(self, **kwargs): - if "model" not in kwargs: - raise ValueError("model is required") - - if "mock" in kwargs and kwargs["mock"]: - import numpy as np - - embeddings = np.array([np.random.rand(768).tolist()]) - return embeddings - # load files and models, cache it for the next inference - model_name = kwargs["model"] - # inference the model - if model_name == "thenlper/gte-base": - return self.infer_gte_base_embedding(kwargs["input"]) - else: - raise ValueError(f"model {model_name} is not supported") - - -def get_device(): - # Check device availability and set the device - if torch.cuda.is_available(): - device = torch.device("cuda") - log.info("Using CUDA (GPU) for inference.") - elif torch.backends.mps.is_available(): - device = torch.device("mps") - log.info("Using MPS (Apple Silicon) for inference.") - else: - device = torch.device("cpu") - log.info("Using CPU for inference.") - - return device - - -def clean_device_cache(): - import torch - - if torch.has_mps: - torch.mps.empty_cache() - - torch.mps.set_per_process_memory_fraction(1.0) - - -class TransformerReranker: - __doc__ = r"""Local model SDK for a reranker model using transformers. - - References: - - model: https://huggingface.co/BAAI/bge-reranker-base - - paper: https://arxiv.org/abs/2309.07597 - - note: - If you are using Macbook M1 series chips, you need to ensure ``torch.device("mps")`` is set. 
- """ - models: Dict[str, type] = {} - - def __init__(self, model_name: Optional[str] = "BAAI/bge-reranker-base"): - self.model_name = model_name or "BAAI/bge-reranker-base" - if model_name is not None: - self.init_model(model_name=model_name) - - def init_model(self, model_name: str): - try: - self.tokenizer = AutoTokenizer.from_pretrained(model_name) - self.model = AutoModelForSequenceClassification.from_pretrained(model_name) - # Check device availability and set the device - device = get_device() - - # Move model to the selected device - self.device = device - self.model.to(device) - self.model.eval() - # register the model - self.models[model_name] = self.model # TODO: better model registration - log.info(f"Done loading model {model_name}") - - except Exception as e: - log.error(f"Error loading model {model_name}: {e}") - raise e - - def infer_bge_reranker_base( - self, - # input=List[Tuple[str, str]], # list of pairs of the query and the candidate - query: str, - documents: List[str], - ) -> List[float]: - model = self.models.get(self.model_name, None) - if model is None: - # initialize the model - self.init_model(self.model_name) - - # convert the query and documents to pair input - input = [(query, doc) for doc in documents] - - with torch.no_grad(): - - inputs = self.tokenizer( - input, - padding=True, - truncation=True, - return_tensors="pt", - max_length=512, - ) - inputs = {k: v.to(self.device) for k, v in inputs.items()} - scores = ( - model(**inputs, return_dict=True) - .logits.view( - -1, - ) - .float() - ) - # apply sigmoid to get the scores - scores = F.sigmoid(scores) - - scores = scores.tolist() - return scores - - def __call__(self, **kwargs): - r"""Ensure "model" and "input" are in the kwargs.""" - if "model" not in kwargs: - raise ValueError("model is required") - - # if "mock" in kwargs and kwargs["mock"]: - # import numpy as np - - # scores = np.array([np.random.rand(1).tolist()]) - # return scores - # load files and models, cache it for the next inference - model_name = kwargs["model"] - # inference the model - if model_name == self.model_name: - assert "query" in kwargs, "query is required" - assert "documents" in kwargs, "documents is required" - scores = self.infer_bge_reranker_base(kwargs["query"], kwargs["documents"]) - return scores - else: - raise ValueError(f"model {model_name} is not supported") - - -class TransformerLLM: - __doc__ = r"""Local model SDK for transformers LLM. - - NOTE: - This inference component is only specific to the HuggingFaceH4/zephyr-7b-beta model. - - The example raw output: - # <|system|> - # You are a friendly chatbot who always responds in the style of a pirate. - # <|user|> - # How many helicopters can a human eat in one sitting? - # <|assistant|> - # Ah, me hearty matey! But yer question be a puzzler! A human cannot eat a helicopter in one sitting, as helicopters are not edible. They be made of metal, plastic, and other materials, not food! 
- - - References: - - model: https://huggingface.co/HuggingFaceH4/zephyr-7b-beta - - https://huggingface.co/google/gemma-2b - - https://huggingface.co/google/gemma-2-2b - - """ - models: Dict[str, type] = {} # to register the model - tokenizer: Dict[str, type] = {} - - model_to_init_func = { - "HuggingFaceH4/zephyr-7b-beta": "use_pipeline", - "google/gemma-2-2b": "use_pipeline", - } - - def __init__( - self, - model_name: Optional[str] = None, - ): - super().__init__() - - self.model_name = model_name # current model to use - - if model_name is not None and model_name not in self.models: - self.init_model(model_name=model_name) - - def _check_token(self, token: str): - import os - - if os.getenv(token) is None: - warnings.warn( - f"{token} is not set. You may not be able to access the model." - ) - - def _init_from_pipeline(self, model_name: str): - from transformers import pipeline - - clean_device_cache() - self._check_token("HF_TOKEN") - try: - import os - - pipe = pipeline( - "text-generation", - model=model_name, - torch_dtype=torch.bfloat16, - device=get_device(), - token=os.getenv("HF_TOKEN"), - ) - self.models[model_name] = pipe - except Exception as e: - log.error(f"Error loading model {model_name}: {e}") - raise e - - def _init_from_automodelcasual_lm(self, model_name: str): - try: - from transformers import AutoTokenizer, AutoModelForCausalLM - except ImportError: - raise ImportError( - "transformers is not installed. Please install it with `pip install transformers`" - ) - - try: - import os - - if os.getenv("HF_TOKEN") is None: - warnings.warn( - "HF_TOKEN is not set. You may not be able to access the model." - ) - - tokenizer = AutoTokenizer.from_pretrained( - model_name, token=os.getenv("HF_TOKEN") - ) - model = AutoModelForCausalLM.from_pretrained( - model_name, - torch_dtype=torch.bfloat16, - device_map="auto", - token=os.getenv("HF_TOKEN"), - ) - self.models[model_name] = model - self.tokenizer[model_name] = tokenizer - except Exception as e: - log.error(f"Error loading model {model_name}: {e}") - raise e - - @lru_cache(None) - def init_model(self, model_name: str): - log.debug(f"Loading model {model_name}") - - model_setup = self.model_to_init_func.get(model_name, None) - if model_setup: - if model_setup == "use_pipeline": - self._init_from_pipeline(model_name) - else: - self._init_from_automodelcasual_lm(model_name) - else: - raise ValueError(f"Model {model_name} is not supported") - - def _parse_chat_completion_from_pipeline(self, completion: Any) -> str: - - text = completion[0]["generated_text"] - - pattern = r"(?<=\|assistant\|>).*" - - match = re.search(pattern, text) - - if match: - text = match.group().strip().lstrip("\\n") - return text - else: - return "" - - def _parse_chat_completion_from_automodelcasual_lm(self, completion: Any) -> str: - print(f"completion: {completion}") - return completion[0] - - def parse_chat_completion(self, completion: Any) -> str: - model_name = self.model_name - model_setup = self.model_to_init_func.get(model_name, None) - if model_setup: - if model_setup == "use_pipeline": - return self._parse_chat_completion_from_pipeline(completion) - else: - return self._parse_chat_completion_from_automodelcasual_lm(completion) - else: - raise ValueError(f"Model {model_name} is not supported") - - def _infer_from_pipeline( - self, - *, - model: str, - messages: Sequence[Dict[str, str]], - max_tokens: Optional[int] = None, - **kwargs, - ): - if not model: - raise ValueError("Model is not provided.") - - if model not in self.models: - 
self.init_model(model_name=model) - - model_to_use = self.models[model] - log.info( - f"Start to infer model {model}, messages: {messages}, kwargs: {kwargs}" - ) - - if model == "HuggingFaceH4/zephyr-7b-beta": - - prompt = model_to_use.tokenizer.apply_chat_template( - messages, tokenize=False, add_generation_prompt=True - ) - - final_kwargs = { - "max_new_tokens": max_tokens or 256, - "do_sample": True, - "temperature": kwargs.get("temperature", 0.7), - "top_k": kwargs.get("top_k", 50), - "top_p": kwargs.get("top_p", 0.95), - } - outputs = model_to_use(prompt, **final_kwargs) - elif model == "google/gemma-2-2b": - final_kwargs = { - "max_new_tokens": max_tokens or 256, - "do_sample": True, - "temperature": kwargs.get("temperature", 0.7), - "top_k": kwargs.get("top_k", 50), - "top_p": kwargs.get("top_p", 0.95), - } - text = messages[0]["content"] - outputs = model_to_use( - text, - **final_kwargs, - ) - - log.info(f"Outputs: {outputs}") - return outputs - - def _infer_from_automodelcasual_lm( - self, - *, - model: str, - messages: Sequence[Dict[str, str]], - max_length: Optional[int] = 8192, # model-agnostic - **kwargs, - ): - if not model: - raise ValueError("Model is not provided.") - if model not in self.models: - self.init_model(model_name=model) - model_to_use = self.models[model] - tokenizer_to_use = self.tokenizer[model] - - input_ids = tokenizer_to_use(messages[0]["content"], return_tensors="pt").to( - get_device() - ) - print(input_ids) - outputs_tokens = model_to_use.generate(**input_ids, max_length=max_length) - outputs = [] - for i, output in enumerate(outputs_tokens): - outputs.append(tokenizer_to_use.decode(output)) - return outputs - - def infer_llm( - self, - *, - model: str, - messages: Sequence[Dict[str, str]], - max_tokens: Optional[int] = None, - **kwargs, - ): - # TODO: generalize the code for more models - model_setup = self.model_to_init_func.get(model, None) - if model_setup: - if model_setup == "use_pipeline": - return self._infer_from_pipeline( - model=model, messages=messages, max_tokens=max_tokens, **kwargs - ) - else: - return self._infer_from_automodelcasual_lm( - model=model, messages=messages, max_tokens=max_tokens, **kwargs - ) - else: - raise ValueError(f"Model {model} is not supported") - - def __call__(self, **kwargs): - r"""Ensure "model" and "input" are in the kwargs.""" - log.debug(f"kwargs: {kwargs}") - if "model" not in kwargs: - raise ValueError("model is required") - - if "messages" not in kwargs: - raise ValueError("messages is required") - - model_name = kwargs["model"] - if model_name != self.model_name: - # need to initialize the model and update the model_name - self.model_name = model_name - self.init_model(model_name=model_name) - - output = self.infer_llm(**kwargs) - return output - - -class TransformersClient(ModelClient): - __doc__ = r"""LightRAG API client for transformers. - - Use: ``ls ~/.cache/huggingface/hub `` to see the cached models. - - Some modeles are gated, you will need to their page to get the access token. - Find how to apply tokens here: https://huggingface.co/docs/hub/security-tokens - Once you have a token and have access, put the token in the environment variable HF_TOKEN. 
- """ - - support_models = { - "thenlper/gte-base": { - "type": ModelType.EMBEDDER, - }, - "BAAI/bge-reranker-base": { - "type": ModelType.RERANKER, - }, - "HuggingFaceH4/zephyr-7b-beta": {"type": ModelType.LLM}, - "google/gemma-2-2b": {"type": ModelType.LLM}, - } - - def __init__(self, model_name: Optional[str] = None) -> None: - super().__init__() - self._model_name = model_name - if self._model_name: - assert ( - self._model_name in self.support_models - ), f"model {self._model_name} is not supported" - if self._model_name == "thenlper/gte-base": - self.sync_client = self.init_sync_client() - elif self._model_name == "BAAI/bge-reranker-base": - self.reranker_client = self.init_reranker_client() - elif self._model_name == "HuggingFaceH4/zephyr-7b-beta": - self.llm_client = self.init_llm_client() - self.async_client = None - - def init_sync_client(self): - return TransformerEmbedder() - - def init_reranker_client(self): - return TransformerReranker() - - def init_llm_client(self): - return TransformerLLM() - - def set_llm_client(self, llm_client: object): - r"""Allow user to pass a custom llm client. Here is an example of a custom llm client: - - Ensure you have parse_chat_completion and __call__ methods which will be applied to api_kwargs specified in transform_client.call(). - - .. code-block:: python - - class CustomizeLLM: - - def __init__(self) -> None: - pass - - def parse_chat_completion(self, completion: Any) -> str: - return completion - - def __call__(self, messages: Sequence[Dict[str, str]], model: str, **kwargs): - from transformers import AutoTokenizer, AutoModelForCausalLM - - tokenizer = AutoTokenizer.from_pretrained( - "deepseek-ai/deepseek-coder-1.3b-instruct", trust_remote_code=True - ) - model = AutoModelForCausalLM.from_pretrained( - "deepseek-ai/deepseek-coder-1.3b-instruct", - trust_remote_code=True, - torch_dtype=torch.bfloat16, - ).to(get_device()) - messages = [ - {"role": "user", "content": "write a quick sort algorithm in python."} - ] - inputs = tokenizer.apply_chat_template( - messages, add_generation_prompt=True, return_tensors="pt" - ).to(model.device) - # tokenizer.eos_token_id is the id of <|EOT|> token - outputs = model.generate( - inputs, - max_new_tokens=512, - do_sample=False, - top_k=50, - top_p=0.95, - num_return_sequences=1, - eos_token_id=tokenizer.eos_token_id, - ) - print( - tokenizer.decode(outputs[0][len(inputs[0]) :], skip_special_tokens=True) - ) - decoded_outputs = [] - for output in outputs: - decoded_outputs.append( - tokenizer.decode(output[len(inputs[0]) :], skip_special_tokens=True) - ) - return decoded_outputs - - llm_client = CustomizeLLM() - transformer_client.set_llm_client(llm_client) - # use in the generator - generator = Generator( - model_client=transformer_client, - model_kwargs=model_kwargs, - prompt_kwargs=prompt_kwargs, - ...) 
- - """ - self.llm_client = llm_client - - def parse_embedding_response(self, response: Any) -> EmbedderOutput: - embeddings: List[Embedding] = [] - for idx, emb in enumerate(response): - embeddings.append(Embedding(index=idx, embedding=emb)) - response = EmbedderOutput(data=embeddings) - return response - - def parse_chat_completion(self, completion: Any) -> GeneratorOutput: - try: - output = self.llm_client.parse_chat_completion(completion) - - return GeneratorOutput(data=output, raw_response=str(completion)) - except Exception as e: - log.error(f"Error parsing chat completion: {e}") - return GeneratorOutput(data=None, raw_response=str(completion), error=e) - - def call(self, api_kwargs: Dict = {}, model_type: ModelType = ModelType.UNDEFINED): - if "model" not in api_kwargs: - raise ValueError("model must be specified in api_kwargs") - if api_kwargs["model"] not in self.support_models: - raise ValueError(f"model {api_kwargs['model']} is not supported") - - if ( - model_type == ModelType.EMBEDDER - and "model" in api_kwargs - and api_kwargs["model"] == "thenlper/gte-base" - ): - if self.sync_client is None: - self.sync_client = self.init_sync_client() - return self.sync_client(**api_kwargs) - elif ( # reranker - model_type == ModelType.RERANKER - and "model" in api_kwargs - and api_kwargs["model"] == "BAAI/bge-reranker-base" - ): - if not hasattr(self, "reranker_client") or self.reranker_client is None: - self.reranker_client = self.init_reranker_client() - scores = self.reranker_client(**api_kwargs) - top_k_indices, top_k_scores = get_top_k_indices_scores( - scores, api_kwargs["top_k"] - ) - return top_k_indices, top_k_scores - elif model_type == ModelType.LLM and "model" in api_kwargs: # LLM - if not hasattr(self, "llm_client") or self.llm_client is None: - self.llm_client = self.init_llm_client() - response = self.llm_client(**api_kwargs) - return response - else: - raise ValueError(f"model_type {model_type} is not supported") - - def convert_inputs_to_api_kwargs( - self, - input: Any, # for retriever, it is a single query, - model_kwargs: dict = {}, - model_type: ModelType = ModelType.UNDEFINED, - ) -> dict: - final_model_kwargs = model_kwargs.copy() - if model_type == ModelType.EMBEDDER: - final_model_kwargs["input"] = input - return final_model_kwargs - elif model_type == ModelType.RERANKER: - assert "model" in final_model_kwargs, "model must be specified" - assert "documents" in final_model_kwargs, "documents must be specified" - assert "top_k" in final_model_kwargs, "top_k must be specified" - final_model_kwargs["query"] = input - return final_model_kwargs - elif model_type == ModelType.LLM: - assert "model" in final_model_kwargs, "model must be specified" - messages = [{"role": "system", "content": input}] - final_model_kwargs["messages"] = messages - return final_model_kwargs - else: - raise ValueError(f"model_type {model_type} is not supported") - - -if __name__ == "__main__": - from adalflow.core import Generator - - import adalflow as adal - - adal.setup_env() - - rag_template = r""" -You are a helpful assistant. - -Your task is to answer the query that may or may not come with context information. -When context is provided, you should stick to the context and less on your prior knowledge to answer the query. 
- - - - {{input_str}} - - {% if context_str %} - - {{context_str}} - - {% endif %} - -""" - - template = """{{input_str}}""" - - model_kwargs = { - "model": "google/gemma-2-2b", - "temperature": 1, - "stream": False, - } - prompt_kwargs = { - "input_str": "Where is Brian?", - # "context_str": "Brian is in the kitchen.", - } - prompt_kwargs = { - "input_str": "What is the capital of France?", - } - - class CustomizeLLM: - - def __init__(self) -> None: - pass - - def parse_chat_completion(self, completion: Any) -> str: - return completion[0] - - def __call__(self, messages: Sequence[Dict[str, str]], model: str, **kwargs): - r"""take api key""" - from transformers import AutoTokenizer, AutoModelForCausalLM - - tokenizer = AutoTokenizer.from_pretrained( - "deepseek-ai/deepseek-coder-1.3b-instruct", trust_remote_code=True - ) - model = AutoModelForCausalLM.from_pretrained( - "deepseek-ai/deepseek-coder-1.3b-instruct", - trust_remote_code=True, - torch_dtype=torch.bfloat16, - ).to(get_device()) - messages = [ - {"role": "user", "content": "write a quick sort algorithm in python."} - ] - inputs = tokenizer.apply_chat_template( - messages, add_generation_prompt=True, return_tensors="pt" - ).to(model.device) - # tokenizer.eos_token_id is the id of <|EOT|> token - outputs = model.generate( - inputs, - max_new_tokens=512, - do_sample=False, - top_k=50, - top_p=0.95, - num_return_sequences=1, - eos_token_id=tokenizer.eos_token_id, - ) - - decoded_outputs = [] - for output in outputs: - decoded_outputs.append( - tokenizer.decode(output[len(inputs[0]) :], skip_special_tokens=True) - ) - return decoded_outputs - - transformer_client = TransformersClient() - transformer_client.set_llm_client(CustomizeLLM()) - generator = Generator( - model_client=transformer_client, - model_kwargs=model_kwargs, - # prompt_kwargs=prompt_kwargs, - template=template, - # output_processors=JsonParser(), - ) - - output = generator(prompt_kwargs=prompt_kwargs) - print(output) +# # TODO: provide a standard api for embedding and chat models used in local model SDKs +# class TransformerEmbedder: +# """Local model SDK for transformers. + + +# There are two ways to run transformers: +# (1) model and then run model inference +# (2) Pipeline and then run pipeline inference + +# This file demonstrates how to +# (1) create a torch model inference component: TransformerEmbedder which equalize to OpenAI(), the SyncAPIClient +# (2) Convert this model inference component to LightRAG API client: TransformersClient + +# The is now just an exmplary component that initialize a certain model from transformers and run inference on it. +# It is not tested on all transformer models yet. It might be necessary to write one for each model. 
+ +# References: +# - transformers: https://huggingface.co/docs/transformers/en/index +# - thenlper/gte-base model:https://huggingface.co/thenlper/gte-base +# """ + +# models: Dict[str, type] = {} + +# def __init__(self, model_name: Optional[str] = "thenlper/gte-base"): +# super().__init__() + +# if model_name is not None: +# self.init_model(model_name=model_name) + +# @lru_cache(None) +# def init_model(self, model_name: str): +# try: +# self.tokenizer = AutoTokenizer.from_pretrained(model_name) +# self.model = AutoModel.from_pretrained(model_name) +# # register the model +# self.models[model_name] = self.model +# log.info(f"Done loading model {model_name}") + +# except Exception as e: +# log.error(f"Error loading model {model_name}: {e}") +# raise e + +# def infer_gte_base_embedding( +# self, +# input=Union[str, List[str]], +# tolist: bool = True, +# ): +# model = self.models.get("thenlper/gte-base", None) +# if model is None: +# # initialize the model +# self.init_model("thenlper/gte-base") + +# if isinstance(input, str): +# input = [input] +# # Tokenize the input texts +# batch_dict = self.tokenizer( +# input, max_length=512, padding=True, truncation=True, return_tensors="pt" +# ) +# outputs = model(**batch_dict) +# embeddings = average_pool( +# outputs.last_hidden_state, batch_dict["attention_mask"] +# ) +# # (Optionally) normalize embeddings +# embeddings = F.normalize(embeddings, p=2, dim=1) +# if tolist: +# embeddings = embeddings.tolist() +# return embeddings + +# def __call__(self, **kwargs): +# if "model" not in kwargs: +# raise ValueError("model is required") + +# if "mock" in kwargs and kwargs["mock"]: +# import numpy as np + +# embeddings = np.array([np.random.rand(768).tolist()]) +# return embeddings +# # load files and models, cache it for the next inference +# model_name = kwargs["model"] +# # inference the model +# if model_name == "thenlper/gte-base": +# return self.infer_gte_base_embedding(kwargs["input"]) +# else: +# raise ValueError(f"model {model_name} is not supported") + + +# def get_device(): +# # Check device availability and set the device +# if torch.cuda.is_available(): +# device = torch.device("cuda") +# log.info("Using CUDA (GPU) for inference.") +# elif torch.backends.mps.is_available(): +# device = torch.device("mps") +# log.info("Using MPS (Apple Silicon) for inference.") +# else: +# device = torch.device("cpu") +# log.info("Using CPU for inference.") + +# return device + + +# def clean_device_cache(): +# import torch + +# if torch.has_mps: +# torch.mps.empty_cache() + +# torch.mps.set_per_process_memory_fraction(1.0) + + +# class TransformerReranker: +# __doc__ = r"""Local model SDK for a reranker model using transformers. + +# References: +# - model: https://huggingface.co/BAAI/bge-reranker-base +# - paper: https://arxiv.org/abs/2309.07597 + +# note: +# If you are using Macbook M1 series chips, you need to ensure ``torch.device("mps")`` is set. 
+# """ +# models: Dict[str, type] = {} + +# def __init__(self, model_name: Optional[str] = "BAAI/bge-reranker-base"): +# self.model_name = model_name or "BAAI/bge-reranker-base" +# if model_name is not None: +# self.init_model(model_name=model_name) + +# def init_model(self, model_name: str): +# try: +# self.tokenizer = AutoTokenizer.from_pretrained(model_name) +# self.model = AutoModelForSequenceClassification.from_pretrained(model_name) +# # Check device availability and set the device +# device = get_device() + +# # Move model to the selected device +# self.device = device +# self.model.to(device) +# self.model.eval() +# # register the model +# self.models[model_name] = self.model # TODO: better model registration +# log.info(f"Done loading model {model_name}") + +# except Exception as e: +# log.error(f"Error loading model {model_name}: {e}") +# raise e + +# def infer_bge_reranker_base( +# self, +# # input=List[Tuple[str, str]], # list of pairs of the query and the candidate +# query: str, +# documents: List[str], +# ) -> List[float]: +# model = self.models.get(self.model_name, None) +# if model is None: +# # initialize the model +# self.init_model(self.model_name) + +# # convert the query and documents to pair input +# input = [(query, doc) for doc in documents] + +# with torch.no_grad(): + +# inputs = self.tokenizer( +# input, +# padding=True, +# truncation=True, +# return_tensors="pt", +# max_length=512, +# ) +# inputs = {k: v.to(self.device) for k, v in inputs.items()} +# scores = ( +# model(**inputs, return_dict=True) +# .logits.view( +# -1, +# ) +# .float() +# ) +# # apply sigmoid to get the scores +# scores = F.sigmoid(scores) + +# scores = scores.tolist() +# return scores + +# def __call__(self, **kwargs): +# r"""Ensure "model" and "input" are in the kwargs.""" +# if "model" not in kwargs: +# raise ValueError("model is required") + +# # if "mock" in kwargs and kwargs["mock"]: +# # import numpy as np + +# # scores = np.array([np.random.rand(1).tolist()]) +# # return scores +# # load files and models, cache it for the next inference +# model_name = kwargs["model"] +# # inference the model +# if model_name == self.model_name: +# assert "query" in kwargs, "query is required" +# assert "documents" in kwargs, "documents is required" +# scores = self.infer_bge_reranker_base(kwargs["query"], kwargs["documents"]) +# return scores +# else: +# raise ValueError(f"model {model_name} is not supported") + + +# class TransformerLLM: +# __doc__ = r"""Local model SDK for transformers LLM. + +# NOTE: +# This inference component is only specific to the HuggingFaceH4/zephyr-7b-beta model. + +# The example raw output: +# # <|system|> +# # You are a friendly chatbot who always responds in the style of a pirate. +# # <|user|> +# # How many helicopters can a human eat in one sitting? +# # <|assistant|> +# # Ah, me hearty matey! But yer question be a puzzler! A human cannot eat a helicopter in one sitting, as helicopters are not edible. They be made of metal, plastic, and other materials, not food! 
+ + +# References: +# - model: https://huggingface.co/HuggingFaceH4/zephyr-7b-beta +# - https://huggingface.co/google/gemma-2b +# - https://huggingface.co/google/gemma-2-2b + +# """ +# models: Dict[str, type] = {} # to register the model +# tokenizer: Dict[str, type] = {} + +# model_to_init_func = { +# "HuggingFaceH4/zephyr-7b-beta": "use_pipeline", +# "google/gemma-2-2b": "use_pipeline", +# } + +# def __init__( +# self, +# model_name: Optional[str] = None, +# ): +# super().__init__() + +# self.model_name = model_name # current model to use + +# if model_name is not None and model_name not in self.models: +# self.init_model(model_name=model_name) + +# def _check_token(self, token: str): +# import os + +# if os.getenv(token) is None: +# warnings.warn( +# f"{token} is not set. You may not be able to access the model." +# ) + +# def _init_from_pipeline(self, model_name: str): +# from transformers import pipeline + +# clean_device_cache() +# self._check_token("HF_TOKEN") +# try: +# import os + +# pipe = pipeline( +# "text-generation", +# model=model_name, +# torch_dtype=torch.bfloat16, +# device=get_device(), +# token=os.getenv("HF_TOKEN"), +# ) +# self.models[model_name] = pipe +# except Exception as e: +# log.error(f"Error loading model {model_name}: {e}") +# raise e + +# def _init_from_automodelcasual_lm(self, model_name: str): +# try: +# from transformers import AutoTokenizer, AutoModelForCausalLM +# except ImportError: +# raise ImportError( +# "transformers is not installed. Please install it with `pip install transformers`" +# ) + +# try: +# import os + +# if os.getenv("HF_TOKEN") is None: +# warnings.warn( +# "HF_TOKEN is not set. You may not be able to access the model." +# ) + +# tokenizer = AutoTokenizer.from_pretrained( +# model_name, token=os.getenv("HF_TOKEN") +# ) +# model = AutoModelForCausalLM.from_pretrained( +# model_name, +# torch_dtype=torch.bfloat16, +# device_map="auto", +# token=os.getenv("HF_TOKEN"), +# ) +# self.models[model_name] = model +# self.tokenizer[model_name] = tokenizer +# except Exception as e: +# log.error(f"Error loading model {model_name}: {e}") +# raise e + +# @lru_cache(None) +# def init_model(self, model_name: str): +# log.debug(f"Loading model {model_name}") + +# model_setup = self.model_to_init_func.get(model_name, None) +# if model_setup: +# if model_setup == "use_pipeline": +# self._init_from_pipeline(model_name) +# else: +# self._init_from_automodelcasual_lm(model_name) +# else: +# raise ValueError(f"Model {model_name} is not supported") + +# def _parse_chat_completion_from_pipeline(self, completion: Any) -> str: + +# text = completion[0]["generated_text"] + +# pattern = r"(?<=\|assistant\|>).*" + +# match = re.search(pattern, text) + +# if match: +# text = match.group().strip().lstrip("\\n") +# return text +# else: +# return "" + +# def _parse_chat_completion_from_automodelcasual_lm(self, completion: Any) -> str: +# print(f"completion: {completion}") +# return completion[0] + +# def parse_chat_completion(self, completion: Any) -> str: +# model_name = self.model_name +# model_setup = self.model_to_init_func.get(model_name, None) +# if model_setup: +# if model_setup == "use_pipeline": +# return self._parse_chat_completion_from_pipeline(completion) +# else: +# return self._parse_chat_completion_from_automodelcasual_lm(completion) +# else: +# raise ValueError(f"Model {model_name} is not supported") + +# def _infer_from_pipeline( +# self, +# *, +# model: str, +# messages: Sequence[Dict[str, str]], +# max_tokens: Optional[int] = None, +# **kwargs, +# 
): +# if not model: +# raise ValueError("Model is not provided.") + +# if model not in self.models: +# self.init_model(model_name=model) + +# model_to_use = self.models[model] + +# log.info( +# f"Start to infer model {model}, messages: {messages}, kwargs: {kwargs}" +# ) + +# if model == "HuggingFaceH4/zephyr-7b-beta": + +# prompt = model_to_use.tokenizer.apply_chat_template( +# messages, tokenize=False, add_generation_prompt=True +# ) + +# final_kwargs = { +# "max_new_tokens": max_tokens or 256, +# "do_sample": True, +# "temperature": kwargs.get("temperature", 0.7), +# "top_k": kwargs.get("top_k", 50), +# "top_p": kwargs.get("top_p", 0.95), +# } +# outputs = model_to_use(prompt, **final_kwargs) +# elif model == "google/gemma-2-2b": +# final_kwargs = { +# "max_new_tokens": max_tokens or 256, +# "do_sample": True, +# "temperature": kwargs.get("temperature", 0.7), +# "top_k": kwargs.get("top_k", 50), +# "top_p": kwargs.get("top_p", 0.95), +# } +# text = messages[0]["content"] +# outputs = model_to_use( +# text, +# **final_kwargs, +# ) + +# log.info(f"Outputs: {outputs}") +# return outputs + +# def _infer_from_automodelcasual_lm( +# self, +# *, +# model: str, +# messages: Sequence[Dict[str, str]], +# max_length: Optional[int] = 8192, # model-agnostic +# **kwargs, +# ): +# if not model: +# raise ValueError("Model is not provided.") +# if model not in self.models: +# self.init_model(model_name=model) +# model_to_use = self.models[model] +# tokenizer_to_use = self.tokenizer[model] + +# input_ids = tokenizer_to_use(messages[0]["content"], return_tensors="pt").to( +# get_device() +# ) +# print(input_ids) +# outputs_tokens = model_to_use.generate(**input_ids, max_length=max_length) +# outputs = [] +# for i, output in enumerate(outputs_tokens): +# outputs.append(tokenizer_to_use.decode(output)) +# return outputs + +# def infer_llm( +# self, +# *, +# model: str, +# messages: Sequence[Dict[str, str]], +# max_tokens: Optional[int] = None, +# **kwargs, +# ): +# # TODO: generalize the code for more models +# model_setup = self.model_to_init_func.get(model, None) +# if model_setup: +# if model_setup == "use_pipeline": +# return self._infer_from_pipeline( +# model=model, messages=messages, max_tokens=max_tokens, **kwargs +# ) +# else: +# return self._infer_from_automodelcasual_lm( +# model=model, messages=messages, max_tokens=max_tokens, **kwargs +# ) +# else: +# raise ValueError(f"Model {model} is not supported") + +# def __call__(self, **kwargs): +# r"""Ensure "model" and "input" are in the kwargs.""" +# log.debug(f"kwargs: {kwargs}") +# if "model" not in kwargs: +# raise ValueError("model is required") + +# if "messages" not in kwargs: +# raise ValueError("messages is required") + +# model_name = kwargs["model"] +# if model_name != self.model_name: +# # need to initialize the model and update the model_name +# self.model_name = model_name +# self.init_model(model_name=model_name) + +# output = self.infer_llm(**kwargs) +# return output + + +# class TransformersClient(ModelClient): +# __doc__ = r"""LightRAG API client for transformers. + +# Use: ``ls ~/.cache/huggingface/hub `` to see the cached models. + +# Some modeles are gated, you will need to their page to get the access token. +# Find how to apply tokens here: https://huggingface.co/docs/hub/security-tokens +# Once you have a token and have access, put the token in the environment variable HF_TOKEN. 
+# """ + +# support_models = { +# "thenlper/gte-base": { +# "type": ModelType.EMBEDDER, +# }, +# "BAAI/bge-reranker-base": { +# "type": ModelType.RERANKER, +# }, +# "HuggingFaceH4/zephyr-7b-beta": {"type": ModelType.LLM}, +# "google/gemma-2-2b": {"type": ModelType.LLM}, +# } + +# def __init__(self, model_name: Optional[str] = None) -> None: +# super().__init__() +# self._model_name = model_name +# if self._model_name: +# assert ( +# self._model_name in self.support_models +# ), f"model {self._model_name} is not supported" +# if self._model_name == "thenlper/gte-base": +# self.sync_client = self.init_sync_client() +# elif self._model_name == "BAAI/bge-reranker-base": +# self.reranker_client = self.init_reranker_client() +# elif self._model_name == "HuggingFaceH4/zephyr-7b-beta": +# self.llm_client = self.init_llm_client() +# self.async_client = None + +# def init_sync_client(self): +# return TransformerEmbedder() + +# def init_reranker_client(self): +# return TransformerReranker() + +# def init_llm_client(self): +# return TransformerLLM() + +# def set_llm_client(self, llm_client: object): +# r"""Allow user to pass a custom llm client. Here is an example of a custom llm client: + +# Ensure you have parse_chat_completion and __call__ methods which will be applied to api_kwargs specified in transform_client.call(). + +# .. code-block:: python + +# class CustomizeLLM: + +# def __init__(self) -> None: +# pass + +# def parse_chat_completion(self, completion: Any) -> str: +# return completion + +# def __call__(self, messages: Sequence[Dict[str, str]], model: str, **kwargs): +# from transformers import AutoTokenizer, AutoModelForCausalLM + +# tokenizer = AutoTokenizer.from_pretrained( +# "deepseek-ai/deepseek-coder-1.3b-instruct", trust_remote_code=True +# ) +# model = AutoModelForCausalLM.from_pretrained( +# "deepseek-ai/deepseek-coder-1.3b-instruct", +# trust_remote_code=True, +# torch_dtype=torch.bfloat16, +# ).to(get_device()) +# messages = [ +# {"role": "user", "content": "write a quick sort algorithm in python."} +# ] +# inputs = tokenizer.apply_chat_template( +# messages, add_generation_prompt=True, return_tensors="pt" +# ).to(model.device) +# # tokenizer.eos_token_id is the id of <|EOT|> token +# outputs = model.generate( +# inputs, +# max_new_tokens=512, +# do_sample=False, +# top_k=50, +# top_p=0.95, +# num_return_sequences=1, +# eos_token_id=tokenizer.eos_token_id, +# ) +# print( +# tokenizer.decode(outputs[0][len(inputs[0]) :], skip_special_tokens=True) +# ) +# decoded_outputs = [] +# for output in outputs: +# decoded_outputs.append( +# tokenizer.decode(output[len(inputs[0]) :], skip_special_tokens=True) +# ) +# return decoded_outputs + +# llm_client = CustomizeLLM() +# transformer_client.set_llm_client(llm_client) +# # use in the generator +# generator = Generator( +# model_client=transformer_client, +# model_kwargs=model_kwargs, +# prompt_kwargs=prompt_kwargs, +# ...) 
+ +# """ +# self.llm_client = llm_client + +# def parse_embedding_response(self, response: Any) -> EmbedderOutput: +# embeddings: List[Embedding] = [] +# for idx, emb in enumerate(response): +# embeddings.append(Embedding(index=idx, embedding=emb)) +# response = EmbedderOutput(data=embeddings) +# return response + +# def parse_chat_completion(self, completion: Any) -> GeneratorOutput: +# try: +# output = self.llm_client.parse_chat_completion(completion) + +# return GeneratorOutput(data=output, raw_response=str(completion)) +# except Exception as e: +# log.error(f"Error parsing chat completion: {e}") +# return GeneratorOutput(data=None, raw_response=str(completion), error=e) + +# def call(self, api_kwargs: Dict = {}, model_type: ModelType = ModelType.UNDEFINED): +# if "model" not in api_kwargs: +# raise ValueError("model must be specified in api_kwargs") +# if api_kwargs["model"] not in self.support_models: +# raise ValueError(f"model {api_kwargs['model']} is not supported") + +# if ( +# model_type == ModelType.EMBEDDER +# and "model" in api_kwargs +# and api_kwargs["model"] == "thenlper/gte-base" +# ): +# if self.sync_client is None: +# self.sync_client = self.init_sync_client() +# return self.sync_client(**api_kwargs) +# elif ( # reranker +# model_type == ModelType.RERANKER +# and "model" in api_kwargs +# and api_kwargs["model"] == "BAAI/bge-reranker-base" +# ): +# if not hasattr(self, "reranker_client") or self.reranker_client is None: +# self.reranker_client = self.init_reranker_client() +# scores = self.reranker_client(**api_kwargs) +# top_k_indices, top_k_scores = get_top_k_indices_scores( +# scores, api_kwargs["top_k"] +# ) +# return top_k_indices, top_k_scores +# elif model_type == ModelType.LLM and "model" in api_kwargs: # LLM +# if not hasattr(self, "llm_client") or self.llm_client is None: +# self.llm_client = self.init_llm_client() +# response = self.llm_client(**api_kwargs) +# return response +# else: +# raise ValueError(f"model_type {model_type} is not supported") + +# def convert_inputs_to_api_kwargs( +# self, +# input: Any, # for retriever, it is a single query, +# model_kwargs: dict = {}, +# model_type: ModelType = ModelType.UNDEFINED, +# ) -> dict: +# final_model_kwargs = model_kwargs.copy() +# if model_type == ModelType.EMBEDDER: +# final_model_kwargs["input"] = input +# return final_model_kwargs +# elif model_type == ModelType.RERANKER: +# assert "model" in final_model_kwargs, "model must be specified" +# assert "documents" in final_model_kwargs, "documents must be specified" +# assert "top_k" in final_model_kwargs, "top_k must be specified" +# final_model_kwargs["query"] = input +# return final_model_kwargs +# elif model_type == ModelType.LLM: +# assert "model" in final_model_kwargs, "model must be specified" +# messages = [{"role": "system", "content": input}] +# final_model_kwargs["messages"] = messages +# return final_model_kwargs +# else: +# raise ValueError(f"model_type {model_type} is not supported") + + +# if __name__ == "__main__": +# from adalflow.core import Generator + +# import adalflow as adal + +# adal.setup_env() + +# rag_template = r""" +# You are a helpful assistant. + +# Your task is to answer the query that may or may not come with context information. +# When context is provided, you should stick to the context and less on your prior knowledge to answer the query. 
+# +# +# +# {{input_str}} +# +# {% if context_str %} +# +# {{context_str}} +# +# {% endif %} +# +# """ + +# template = """{{input_str}}""" + +# model_kwargs = { +# "model": "google/gemma-2-2b", +# "temperature": 1, +# "stream": False, +# } +# prompt_kwargs = { +# "input_str": "Where is Brian?", +# # "context_str": "Brian is in the kitchen.", +# } +# prompt_kwargs = { +# "input_str": "What is the capital of France?", +# } + +# class CustomizeLLM: + +# def __init__(self) -> None: +# pass + +# def parse_chat_completion(self, completion: Any) -> str: +# return completion[0] + +# def __call__(self, messages: Sequence[Dict[str, str]], model: str, **kwargs): +# r"""take api key""" +# from transformers import AutoTokenizer, AutoModelForCausalLM + +# tokenizer = AutoTokenizer.from_pretrained( +# "deepseek-ai/deepseek-coder-1.3b-instruct", trust_remote_code=True +# ) +# model = AutoModelForCausalLM.from_pretrained( +# "deepseek-ai/deepseek-coder-1.3b-instruct", +# trust_remote_code=True, +# torch_dtype=torch.bfloat16, +# ).to(get_device()) +# messages = [ +# {"role": "user", "content": "write a quick sort algorithm in python."} +# ] +# inputs = tokenizer.apply_chat_template( +# messages, add_generation_prompt=True, return_tensors="pt" +# ).to(model.device) +# # tokenizer.eos_token_id is the id of <|EOT|> token +# outputs = model.generate( +# inputs, +# max_new_tokens=512, +# do_sample=False, +# top_k=50, +# top_p=0.95, +# num_return_sequences=1, +# eos_token_id=tokenizer.eos_token_id, +# ) + +# decoded_outputs = [] +# for output in outputs: +# decoded_outputs.append( +# tokenizer.decode(output[len(inputs[0]) :], skip_special_tokens=True) +# ) +# return decoded_outputs + +# transformer_client = TransformersClient() +# transformer_client.set_llm_client(CustomizeLLM()) +# generator = Generator( +# model_client=transformer_client, +# model_kwargs=model_kwargs, +# # prompt_kwargs=prompt_kwargs, +# template=template, +# # output_processors=JsonParser(), +# ) + +# output = generator(prompt_kwargs=prompt_kwargs) +# print(output) From bd610d24c4758b8cb9db892e7db2b5b30dcfe7d8 Mon Sep 17 00:00:00 2001 From: Alexandre Adjahossou Date: Mon, 9 Sep 2024 12:42:31 +0000 Subject: [PATCH 19/36] Added tests for TransformerRerankerModelClient. --- adalflow/tests/test_transformer_client.py | 64 ++++++++++++++++++++++- 1 file changed, 63 insertions(+), 1 deletion(-) diff --git a/adalflow/tests/test_transformer_client.py b/adalflow/tests/test_transformer_client.py index 193ecbc8..bc5aaaf6 100644 --- a/adalflow/tests/test_transformer_client.py +++ b/adalflow/tests/test_transformer_client.py @@ -1,6 +1,6 @@ import unittest import torch -from adalflow.components.model_client.transformers_client import TransformerEmbeddingModelClient, TransformerLLMModelClient +from adalflow.components.model_client.transformers_client import TransformerEmbeddingModelClient, TransformerLLMModelClient, TransformerRerankerModelClient from adalflow.core.types import ModelType from adalflow.core import Embedder, Generator @@ -143,6 +143,68 @@ def test_integration_with_generator_pipeline(self): output = generator(prompt_kwargs=self.prompt_kwargs) print(output) +class TestTransformerModelClient(unittest.TestCase): + def setUp(self) -> None: + + self.query = "what is panda?" 
+ self.documents = [ + "The giant panda (Ailuropoda melanoleuca), sometimes called a panda bear or simply panda, is a bear species endemic to China.", + "The red panda (Ailurus fulgens), also called the lesser panda, the red bear-cat, and the red cat-bear, is a mammal native to the eastern Himalayas and southwestern China.", + ] + + def test_execution(self): + transformer_reranker_model = "BAAI/bge-reranker-base" + transformer_reranker_model_client = TransformerRerankerModelClient( + tokenizer_kwargs={"padding": True} + ) + print( + f"Testing TransformerRerankerModelClient with model {transformer_reranker_model}" + ) + + model_kwargs = { + "model": transformer_reranker_model, + "documents": self.documents, + "top_k": 2, + } + + api_kwargs = transformer_reranker_model_client.convert_inputs_to_api_kwargs(self.query, model_kwargs=model_kwargs) + output = transformer_reranker_model_client.call(api_kwargs) + # assert output is a list of list with length 2 + self.assertEqual(len(output), 2) + self.assertEqual(type(output[0]), list) + self.assertEqual(type(output[1]), list) + # assert output[0] is a list of int of length top_k + tok_k = model_kwargs["top_k"] + self.assertTrue(all([isinstance(elmt, int) for elmt in output[0]])) + self.assertEqual(len(output[0]), tok_k) + # assert output[1] is a list of float of length top_k + tok_k = model_kwargs["top_k"] + self.assertTrue(all([isinstance(elmt, float) for elmt in output[1]])) + self.assertEqual(len(output[1]), tok_k) + + def test_transformer_reranker_client(self): + transformer_reranker_client = TransformerRerankerModelClient( + tokenizer_kwargs={"padding": True} + ) + print("Testing transformer reranker client") + # run the model + kwargs = { + "model": "BAAI/bge-reranker-base", + "documents": self.documents, + "top_k": 2, + } + api_kwargs = transformer_reranker_client.convert_inputs_to_api_kwargs( + input=self.query, + model_kwargs=kwargs, + + ) + print(api_kwargs) + self.assertEqual(api_kwargs["model"], "BAAI/bge-reranker-base") + output = transformer_reranker_client.call( + api_kwargs=api_kwargs + ) + self.assertEqual(type(output), tuple) + # class TestTransformerModelClient(unittest.TestCase): # def setUp(self) -> None: From 25fe83438d1981ea35b26d5cd713e47ddba1d8dc Mon Sep 17 00:00:00 2001 From: Alexandre Adjahossou Date: Mon, 9 Sep 2024 13:50:37 +0000 Subject: [PATCH 20/36] Add test for llm response + remove test for old class. --- adalflow/tests/test_transformer_client.py | 133 ++++------------------ 1 file changed, 22 insertions(+), 111 deletions(-) diff --git a/adalflow/tests/test_transformer_client.py b/adalflow/tests/test_transformer_client.py index bc5aaaf6..a1a3e658 100644 --- a/adalflow/tests/test_transformer_client.py +++ b/adalflow/tests/test_transformer_client.py @@ -107,6 +107,28 @@ def test_exectution(self): output = model_client.call(api_kwargs=api_kwargs) print(output) + def test_response(self): + + """Test the TransformerLLM model with roneneldan/TinyStories-1M for generating a response.""" + model_client = TransformerLLMModelClient( + ) + + # Define a sample input + input_text = "Hello, what's the weather today?" 
+ + # Test generating a response, providing the 'model' keyword + # response = transformer_llm_model_component(input=input_text, model=transformer_llm_model) + api_kwargs = model_client.convert_inputs_to_api_kwargs(input_text, self.model_kwargs) + response = model_client.call(api_kwargs) + + # Check if the response is valid + self.assertIsInstance(response, list, "The response should be a list.") + self.assertTrue(all([isinstance(elmt, str) for elmt in response]), "all elements in the response list should be strings.") + self.assertTrue(len(response) > 0, "The response should not be empty.") + + # Optionally, print the response for visual verification during testing + print(f"Generated response: {response}") + def test_integration_with_generator_autoclass(self): model_client = TransformerLLMModelClient( tokenizer_kwargs=self.tokenizer_kwargs, @@ -205,117 +227,6 @@ def test_transformer_reranker_client(self): ) self.assertEqual(type(output), tuple) -# class TestTransformerModelClient(unittest.TestCase): -# def setUp(self) -> None: - -# self.query = "what is panda?" -# self.documents = [ -# "The giant panda (Ailuropoda melanoleuca), sometimes called a panda bear or simply panda, is a bear species endemic to China.", -# "The red panda (Ailurus fulgens), also called the lesser panda, the red bear-cat, and the red cat-bear, is a mammal native to the eastern Himalayas and southwestern China.", -# ] - -# def test_transformer_embedder(self): -# transformer_embedder_model = "thenlper/gte-base" -# transformer_embedder_model_component = TransformerEmbedder( -# model_name=transformer_embedder_model -# ) -# print( -# f"Testing transformer embedder with model {transformer_embedder_model_component}" -# ) -# print("Testing transformer embedder") -# output = transformer_embedder_model_component( -# model=transformer_embedder_model, input="Hello world" -# ) -# print(output) - -# def test_transformer_client(self): -# transformer_client = TransformersClient() -# print("Testing transformer client") -# # run the model -# kwargs = { -# "model": "thenlper/gte-base", -# # "mock": False, -# } -# api_kwargs = transformer_client.convert_inputs_to_api_kwargs( -# input="Hello world", -# model_kwargs=kwargs, -# model_type=ModelType.EMBEDDER, -# ) -# # print(api_kwargs) -# output = transformer_client.call( -# api_kwargs=api_kwargs, model_type=ModelType.EMBEDDER -# ) - - # print(transformer_client) - # print(output) - - # def test_transformer_reranker(self): - # transformer_reranker_model = "BAAI/bge-reranker-base" - # transformer_reranker_model_component = TransformerReranker() - # # print( - # # f"Testing transformer reranker with model {transformer_reranker_model_component}" - # # ) - - # model_kwargs = { - # "model": transformer_reranker_model, - # "documents": self.documents, - # "query": self.query, - # "top_k": 2, - # } - - # output = transformer_reranker_model_component( - # **model_kwargs, - # ) - # # assert output is a list of float with length 2 - # self.assertEqual(len(output), 2) - # self.assertEqual(type(output[0]), float) - - # def test_transformer_reranker_client(self): - # transformer_reranker_client = TransformersClient( - # model_name="BAAI/bge-reranker-base" - # ) - # print("Testing transformer reranker client") - # # run the model - # kwargs = { - # "model": "BAAI/bge-reranker-base", - # "documents": self.documents, - # "top_k": 2, - # } - # api_kwargs = transformer_reranker_client.convert_inputs_to_api_kwargs( - # input=self.query, - # model_kwargs=kwargs, - # model_type=ModelType.RERANKER, - # ) - # 
print(api_kwargs) - # self.assertEqual(api_kwargs["model"], "BAAI/bge-reranker-base") - # output = transformer_reranker_client.call( - # api_kwargs=api_kwargs, model_type=ModelType.RERANKER - # ) - # self.assertEqual(type(output), tuple) - - # def test_transformer_llm_response(self): - # from adalflow.components.model_client.transformers_client import TransformerLLM - - # """Test the TransformerLLM model with zephyr-7b-beta for generating a response.""" - # transformer_llm_model = "HuggingFaceH4/zephyr-7b-beta" - # transformer_llm_model_component = TransformerLLM( - # model_name=transformer_llm_model - # ) - - # # Define a sample input - # input_text = "Hello, what's the weather today?" - - # # Test generating a response, providing the 'model' keyword - # # response = transformer_llm_model_component(input=input_text, model=transformer_llm_model) - # response = transformer_llm_model_component(input_text=input_text) - - # # Check if the response is valid - # self.assertIsInstance(response, str, "The response should be a string.") - # self.assertTrue(len(response) > 0, "The response should not be empty.") - - # # Optionally, print the response for visual verification during testing - # print(f"Generated response: {response}") - if __name__ == "__main__": unittest.main() From e3b3b25c2584701cffae1f4ca8be44311bfdfcd7 Mon Sep 17 00:00:00 2001 From: Alexandre Adjahossou Date: Mon, 9 Sep 2024 13:51:03 +0000 Subject: [PATCH 21/36] Fixed test class name. --- adalflow/tests/test_transformer_client.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/adalflow/tests/test_transformer_client.py b/adalflow/tests/test_transformer_client.py index a1a3e658..c0b649fc 100644 --- a/adalflow/tests/test_transformer_client.py +++ b/adalflow/tests/test_transformer_client.py @@ -165,7 +165,7 @@ def test_integration_with_generator_pipeline(self): output = generator(prompt_kwargs=self.prompt_kwargs) print(output) -class TestTransformerModelClient(unittest.TestCase): +class TransformerRerankerModelClient(unittest.TestCase): def setUp(self) -> None: self.query = "what is panda?" From 2c6ee8ceeb89d41ed404b1b7fcba0c5007d3c3a8 Mon Sep 17 00:00:00 2001 From: Alexandre Adjahossou Date: Mon, 9 Sep 2024 13:53:45 +0000 Subject: [PATCH 22/36] Multiline message: Moved get_device andclean_device_cache at top of file. Allow user to specify autoclasses for Reranker models. 
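
get_device() and clean_device_cache() now live at module level so every client can share them, and TransformerRerankerModelClient no longer hard-codes its transformers classes: the constructor takes auto_model and auto_tokenizer, defaulting to AutoModelForSequenceClassification and AutoTokenizer. A minimal usage sketch of the new constructor, following the signature in this patch (checkpoint name and documents are only illustrative):

    from transformers import AutoModelForSequenceClassification, AutoTokenizer

    from adalflow.components.model_client.transformers_client import (
        TransformerRerankerModelClient,
    )

    # Defaults are unchanged; any compatible sequence-classification class and
    # tokenizer class can be passed in instead of the stock Auto* classes.
    reranker_client = TransformerRerankerModelClient(
        auto_model=AutoModelForSequenceClassification,
        auto_tokenizer=AutoTokenizer,
        tokenizer_kwargs={"padding": True},
    )

    api_kwargs = reranker_client.convert_inputs_to_api_kwargs(
        input="what is panda?",
        model_kwargs={
            "model": "BAAI/bge-reranker-base",
            "documents": ["doc one", "doc two"],
            "top_k": 2,
        },
    )
    # call() returns (top_k_indices, top_k_scores), as asserted in the tests.
    top_k_indices, top_k_scores = reranker_client.call(api_kwargs)

Swapping in a different sequence-classification or tokenizer class (for example a custom subclass) now needs no change to the client itself.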
--- .../model_client/transformers_client.py | 57 ++++++++++--------- 1 file changed, 30 insertions(+), 27 deletions(-) diff --git a/adalflow/adalflow/components/model_client/transformers_client.py b/adalflow/adalflow/components/model_client/transformers_client.py index 8d97afd8..846f997d 100644 --- a/adalflow/adalflow/components/model_client/transformers_client.py +++ b/adalflow/adalflow/components/model_client/transformers_client.py @@ -50,6 +50,30 @@ def mean_pooling(model_output: dict, attention_mask) -> Tensor: input_mask_expanded = attention_mask.unsqueeze(-1).expand(token_embeddings.size()).float() return torch.sum(token_embeddings * input_mask_expanded, 1) / torch.clamp(input_mask_expanded.sum(1), min=1e-9) +def get_device(): + # Check device availability and set the device + if torch.cuda.is_available(): + device = torch.device("cuda") + log.info("Using CUDA (GPU) for inference.") + elif torch.backends.mps.is_available(): + device = torch.device("mps") + log.info("Using MPS (Apple Silicon) for inference.") + else: + device = torch.device("cpu") + log.info("Using CPU for inference.") + + return device + + +def clean_device_cache(): + import torch + + if torch.backends.mps.is_built(): + torch.mps.empty_cache() + + torch.mps.set_per_process_memory_fraction(1.0) + + class TransformerEmbeddingModelClient(ModelClient): # @@ -536,9 +560,13 @@ class TransformerRerankerModelClient(ModelClient): def __init__( self, model_name: Optional[str] = None, + auto_model: Optional[type] = AutoModelForSequenceClassification, + auto_tokenizer: Optional[type] = AutoTokenizer, tokenizer_kwargs: Optional[dict] = {}, local_files_only: Optional[bool] = False ): + self.auto_model = auto_model + self.auto_tokenizer= auto_tokenizer self.model_name = model_name self.tokenizer_kwargs = tokenizer_kwargs if "return_tensors" not in self.tokenizer_kwargs: @@ -549,12 +577,12 @@ def __init__( def init_model(self, model_name: str): try: - self.tokenizer = AutoTokenizer.from_pretrained( + self.tokenizer = self.auto_tokenizer.from_pretrained( self.model_name, local_files_only=self.local_files_only, **self.tokenizer_kwargs ) - self.model = AutoModelForSequenceClassification.from_pretrained( + self.model = self.auto_model.from_pretrained( self.model_name, local_files_only=self.local_files_only ) @@ -739,31 +767,6 @@ def convert_inputs_to_api_kwargs( # else: # raise ValueError(f"model {model_name} is not supported") - -# def get_device(): -# # Check device availability and set the device -# if torch.cuda.is_available(): -# device = torch.device("cuda") -# log.info("Using CUDA (GPU) for inference.") -# elif torch.backends.mps.is_available(): -# device = torch.device("mps") -# log.info("Using MPS (Apple Silicon) for inference.") -# else: -# device = torch.device("cpu") -# log.info("Using CPU for inference.") - -# return device - - -# def clean_device_cache(): -# import torch - -# if torch.has_mps: -# torch.mps.empty_cache() - -# torch.mps.set_per_process_memory_fraction(1.0) - - # class TransformerReranker: # __doc__ = r"""Local model SDK for a reranker model using transformers. From 1bd854538b9a186c52cffcc846a604ecd67e400d Mon Sep 17 00:00:00 2001 From: Alexandre Adjahossou Date: Mon, 9 Sep 2024 14:03:18 +0000 Subject: [PATCH 23/36] Deleted code for the old TransformerClientClass. 
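
The commented-out TransformerEmbedder, TransformerReranker, TransformerLLM and TransformersClient blocks are dead code now that the dedicated clients cover the same ground: TransformerEmbeddingModelClient for embeddings, TransformerRerankerModelClient for reranking, TransformerLLMModelClient for generation. A rough usage sketch of the replacements, mirroring the patterns exercised in the updated tests; model names are illustrative and the model_kwargs contents are assumptions:

    from adalflow.components.model_client.transformers_client import (
        TransformerLLMModelClient,
        TransformerRerankerModelClient,
    )

    # Generation: was TransformersClient wrapping TransformerLLM.
    llm_client = TransformerLLMModelClient()
    llm_kwargs = llm_client.convert_inputs_to_api_kwargs(
        "What is the capital of France?", {"model": "google/gemma-2-2b"}
    )
    responses = llm_client.call(llm_kwargs)  # list of generated strings

    # Reranking: was TransformersClient wrapping TransformerReranker.
    reranker_client = TransformerRerankerModelClient(
        tokenizer_kwargs={"padding": True}
    )
    rerank_kwargs = reranker_client.convert_inputs_to_api_kwargs(
        input="what is panda?",
        model_kwargs={
            "model": "BAAI/bge-reranker-base",
            "documents": ["doc one", "doc two"],
            "top_k": 2,
        },
    )
    top_k_indices, top_k_scores = reranker_client.call(rerank_kwargs)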
--- .../model_client/transformers_client.py | 721 ------------------ 1 file changed, 721 deletions(-) diff --git a/adalflow/adalflow/components/model_client/transformers_client.py b/adalflow/adalflow/components/model_client/transformers_client.py index 846f997d..19890eba 100644 --- a/adalflow/adalflow/components/model_client/transformers_client.py +++ b/adalflow/adalflow/components/model_client/transformers_client.py @@ -679,724 +679,3 @@ def convert_inputs_to_api_kwargs( assert "top_k" in final_model_kwargs, "top_k must be specified" final_model_kwargs["query"] = input return final_model_kwargs - - - -# # TODO: provide a standard api for embedding and chat models used in local model SDKs -# class TransformerEmbedder: -# """Local model SDK for transformers. - - -# There are two ways to run transformers: -# (1) model and then run model inference -# (2) Pipeline and then run pipeline inference - -# This file demonstrates how to -# (1) create a torch model inference component: TransformerEmbedder which equalize to OpenAI(), the SyncAPIClient -# (2) Convert this model inference component to LightRAG API client: TransformersClient - -# The is now just an exmplary component that initialize a certain model from transformers and run inference on it. -# It is not tested on all transformer models yet. It might be necessary to write one for each model. - -# References: -# - transformers: https://huggingface.co/docs/transformers/en/index -# - thenlper/gte-base model:https://huggingface.co/thenlper/gte-base -# """ - -# models: Dict[str, type] = {} - -# def __init__(self, model_name: Optional[str] = "thenlper/gte-base"): -# super().__init__() - -# if model_name is not None: -# self.init_model(model_name=model_name) - -# @lru_cache(None) -# def init_model(self, model_name: str): -# try: -# self.tokenizer = AutoTokenizer.from_pretrained(model_name) -# self.model = AutoModel.from_pretrained(model_name) -# # register the model -# self.models[model_name] = self.model -# log.info(f"Done loading model {model_name}") - -# except Exception as e: -# log.error(f"Error loading model {model_name}: {e}") -# raise e - -# def infer_gte_base_embedding( -# self, -# input=Union[str, List[str]], -# tolist: bool = True, -# ): -# model = self.models.get("thenlper/gte-base", None) -# if model is None: -# # initialize the model -# self.init_model("thenlper/gte-base") - -# if isinstance(input, str): -# input = [input] -# # Tokenize the input texts -# batch_dict = self.tokenizer( -# input, max_length=512, padding=True, truncation=True, return_tensors="pt" -# ) -# outputs = model(**batch_dict) -# embeddings = average_pool( -# outputs.last_hidden_state, batch_dict["attention_mask"] -# ) -# # (Optionally) normalize embeddings -# embeddings = F.normalize(embeddings, p=2, dim=1) -# if tolist: -# embeddings = embeddings.tolist() -# return embeddings - -# def __call__(self, **kwargs): -# if "model" not in kwargs: -# raise ValueError("model is required") - -# if "mock" in kwargs and kwargs["mock"]: -# import numpy as np - -# embeddings = np.array([np.random.rand(768).tolist()]) -# return embeddings -# # load files and models, cache it for the next inference -# model_name = kwargs["model"] -# # inference the model -# if model_name == "thenlper/gte-base": -# return self.infer_gte_base_embedding(kwargs["input"]) -# else: -# raise ValueError(f"model {model_name} is not supported") - -# class TransformerReranker: -# __doc__ = r"""Local model SDK for a reranker model using transformers. 
- -# References: -# - model: https://huggingface.co/BAAI/bge-reranker-base -# - paper: https://arxiv.org/abs/2309.07597 - -# note: -# If you are using Macbook M1 series chips, you need to ensure ``torch.device("mps")`` is set. -# """ -# models: Dict[str, type] = {} - -# def __init__(self, model_name: Optional[str] = "BAAI/bge-reranker-base"): -# self.model_name = model_name or "BAAI/bge-reranker-base" -# if model_name is not None: -# self.init_model(model_name=model_name) - -# def init_model(self, model_name: str): -# try: -# self.tokenizer = AutoTokenizer.from_pretrained(model_name) -# self.model = AutoModelForSequenceClassification.from_pretrained(model_name) -# # Check device availability and set the device -# device = get_device() - -# # Move model to the selected device -# self.device = device -# self.model.to(device) -# self.model.eval() -# # register the model -# self.models[model_name] = self.model # TODO: better model registration -# log.info(f"Done loading model {model_name}") - -# except Exception as e: -# log.error(f"Error loading model {model_name}: {e}") -# raise e - -# def infer_bge_reranker_base( -# self, -# # input=List[Tuple[str, str]], # list of pairs of the query and the candidate -# query: str, -# documents: List[str], -# ) -> List[float]: -# model = self.models.get(self.model_name, None) -# if model is None: -# # initialize the model -# self.init_model(self.model_name) - -# # convert the query and documents to pair input -# input = [(query, doc) for doc in documents] - -# with torch.no_grad(): - -# inputs = self.tokenizer( -# input, -# padding=True, -# truncation=True, -# return_tensors="pt", -# max_length=512, -# ) -# inputs = {k: v.to(self.device) for k, v in inputs.items()} -# scores = ( -# model(**inputs, return_dict=True) -# .logits.view( -# -1, -# ) -# .float() -# ) -# # apply sigmoid to get the scores -# scores = F.sigmoid(scores) - -# scores = scores.tolist() -# return scores - -# def __call__(self, **kwargs): -# r"""Ensure "model" and "input" are in the kwargs.""" -# if "model" not in kwargs: -# raise ValueError("model is required") - -# # if "mock" in kwargs and kwargs["mock"]: -# # import numpy as np - -# # scores = np.array([np.random.rand(1).tolist()]) -# # return scores -# # load files and models, cache it for the next inference -# model_name = kwargs["model"] -# # inference the model -# if model_name == self.model_name: -# assert "query" in kwargs, "query is required" -# assert "documents" in kwargs, "documents is required" -# scores = self.infer_bge_reranker_base(kwargs["query"], kwargs["documents"]) -# return scores -# else: -# raise ValueError(f"model {model_name} is not supported") - - -# class TransformerLLM: -# __doc__ = r"""Local model SDK for transformers LLM. - -# NOTE: -# This inference component is only specific to the HuggingFaceH4/zephyr-7b-beta model. - -# The example raw output: -# # <|system|> -# # You are a friendly chatbot who always responds in the style of a pirate. -# # <|user|> -# # How many helicopters can a human eat in one sitting? -# # <|assistant|> -# # Ah, me hearty matey! But yer question be a puzzler! A human cannot eat a helicopter in one sitting, as helicopters are not edible. They be made of metal, plastic, and other materials, not food! 
- - -# References: -# - model: https://huggingface.co/HuggingFaceH4/zephyr-7b-beta -# - https://huggingface.co/google/gemma-2b -# - https://huggingface.co/google/gemma-2-2b - -# """ -# models: Dict[str, type] = {} # to register the model -# tokenizer: Dict[str, type] = {} - -# model_to_init_func = { -# "HuggingFaceH4/zephyr-7b-beta": "use_pipeline", -# "google/gemma-2-2b": "use_pipeline", -# } - -# def __init__( -# self, -# model_name: Optional[str] = None, -# ): -# super().__init__() - -# self.model_name = model_name # current model to use - -# if model_name is not None and model_name not in self.models: -# self.init_model(model_name=model_name) - -# def _check_token(self, token: str): -# import os - -# if os.getenv(token) is None: -# warnings.warn( -# f"{token} is not set. You may not be able to access the model." -# ) - -# def _init_from_pipeline(self, model_name: str): -# from transformers import pipeline - -# clean_device_cache() -# self._check_token("HF_TOKEN") -# try: -# import os - -# pipe = pipeline( -# "text-generation", -# model=model_name, -# torch_dtype=torch.bfloat16, -# device=get_device(), -# token=os.getenv("HF_TOKEN"), -# ) -# self.models[model_name] = pipe -# except Exception as e: -# log.error(f"Error loading model {model_name}: {e}") -# raise e - -# def _init_from_automodelcasual_lm(self, model_name: str): -# try: -# from transformers import AutoTokenizer, AutoModelForCausalLM -# except ImportError: -# raise ImportError( -# "transformers is not installed. Please install it with `pip install transformers`" -# ) - -# try: -# import os - -# if os.getenv("HF_TOKEN") is None: -# warnings.warn( -# "HF_TOKEN is not set. You may not be able to access the model." -# ) - -# tokenizer = AutoTokenizer.from_pretrained( -# model_name, token=os.getenv("HF_TOKEN") -# ) -# model = AutoModelForCausalLM.from_pretrained( -# model_name, -# torch_dtype=torch.bfloat16, -# device_map="auto", -# token=os.getenv("HF_TOKEN"), -# ) -# self.models[model_name] = model -# self.tokenizer[model_name] = tokenizer -# except Exception as e: -# log.error(f"Error loading model {model_name}: {e}") -# raise e - -# @lru_cache(None) -# def init_model(self, model_name: str): -# log.debug(f"Loading model {model_name}") - -# model_setup = self.model_to_init_func.get(model_name, None) -# if model_setup: -# if model_setup == "use_pipeline": -# self._init_from_pipeline(model_name) -# else: -# self._init_from_automodelcasual_lm(model_name) -# else: -# raise ValueError(f"Model {model_name} is not supported") - -# def _parse_chat_completion_from_pipeline(self, completion: Any) -> str: - -# text = completion[0]["generated_text"] - -# pattern = r"(?<=\|assistant\|>).*" - -# match = re.search(pattern, text) - -# if match: -# text = match.group().strip().lstrip("\\n") -# return text -# else: -# return "" - -# def _parse_chat_completion_from_automodelcasual_lm(self, completion: Any) -> str: -# print(f"completion: {completion}") -# return completion[0] - -# def parse_chat_completion(self, completion: Any) -> str: -# model_name = self.model_name -# model_setup = self.model_to_init_func.get(model_name, None) -# if model_setup: -# if model_setup == "use_pipeline": -# return self._parse_chat_completion_from_pipeline(completion) -# else: -# return self._parse_chat_completion_from_automodelcasual_lm(completion) -# else: -# raise ValueError(f"Model {model_name} is not supported") - -# def _infer_from_pipeline( -# self, -# *, -# model: str, -# messages: Sequence[Dict[str, str]], -# max_tokens: Optional[int] = None, -# **kwargs, -# 
): -# if not model: -# raise ValueError("Model is not provided.") - -# if model not in self.models: -# self.init_model(model_name=model) - -# model_to_use = self.models[model] - -# log.info( -# f"Start to infer model {model}, messages: {messages}, kwargs: {kwargs}" -# ) - -# if model == "HuggingFaceH4/zephyr-7b-beta": - -# prompt = model_to_use.tokenizer.apply_chat_template( -# messages, tokenize=False, add_generation_prompt=True -# ) - -# final_kwargs = { -# "max_new_tokens": max_tokens or 256, -# "do_sample": True, -# "temperature": kwargs.get("temperature", 0.7), -# "top_k": kwargs.get("top_k", 50), -# "top_p": kwargs.get("top_p", 0.95), -# } -# outputs = model_to_use(prompt, **final_kwargs) -# elif model == "google/gemma-2-2b": -# final_kwargs = { -# "max_new_tokens": max_tokens or 256, -# "do_sample": True, -# "temperature": kwargs.get("temperature", 0.7), -# "top_k": kwargs.get("top_k", 50), -# "top_p": kwargs.get("top_p", 0.95), -# } -# text = messages[0]["content"] -# outputs = model_to_use( -# text, -# **final_kwargs, -# ) - -# log.info(f"Outputs: {outputs}") -# return outputs - -# def _infer_from_automodelcasual_lm( -# self, -# *, -# model: str, -# messages: Sequence[Dict[str, str]], -# max_length: Optional[int] = 8192, # model-agnostic -# **kwargs, -# ): -# if not model: -# raise ValueError("Model is not provided.") -# if model not in self.models: -# self.init_model(model_name=model) -# model_to_use = self.models[model] -# tokenizer_to_use = self.tokenizer[model] - -# input_ids = tokenizer_to_use(messages[0]["content"], return_tensors="pt").to( -# get_device() -# ) -# print(input_ids) -# outputs_tokens = model_to_use.generate(**input_ids, max_length=max_length) -# outputs = [] -# for i, output in enumerate(outputs_tokens): -# outputs.append(tokenizer_to_use.decode(output)) -# return outputs - -# def infer_llm( -# self, -# *, -# model: str, -# messages: Sequence[Dict[str, str]], -# max_tokens: Optional[int] = None, -# **kwargs, -# ): -# # TODO: generalize the code for more models -# model_setup = self.model_to_init_func.get(model, None) -# if model_setup: -# if model_setup == "use_pipeline": -# return self._infer_from_pipeline( -# model=model, messages=messages, max_tokens=max_tokens, **kwargs -# ) -# else: -# return self._infer_from_automodelcasual_lm( -# model=model, messages=messages, max_tokens=max_tokens, **kwargs -# ) -# else: -# raise ValueError(f"Model {model} is not supported") - -# def __call__(self, **kwargs): -# r"""Ensure "model" and "input" are in the kwargs.""" -# log.debug(f"kwargs: {kwargs}") -# if "model" not in kwargs: -# raise ValueError("model is required") - -# if "messages" not in kwargs: -# raise ValueError("messages is required") - -# model_name = kwargs["model"] -# if model_name != self.model_name: -# # need to initialize the model and update the model_name -# self.model_name = model_name -# self.init_model(model_name=model_name) - -# output = self.infer_llm(**kwargs) -# return output - - -# class TransformersClient(ModelClient): -# __doc__ = r"""LightRAG API client for transformers. - -# Use: ``ls ~/.cache/huggingface/hub `` to see the cached models. - -# Some modeles are gated, you will need to their page to get the access token. -# Find how to apply tokens here: https://huggingface.co/docs/hub/security-tokens -# Once you have a token and have access, put the token in the environment variable HF_TOKEN. 
-# """ - -# support_models = { -# "thenlper/gte-base": { -# "type": ModelType.EMBEDDER, -# }, -# "BAAI/bge-reranker-base": { -# "type": ModelType.RERANKER, -# }, -# "HuggingFaceH4/zephyr-7b-beta": {"type": ModelType.LLM}, -# "google/gemma-2-2b": {"type": ModelType.LLM}, -# } - -# def __init__(self, model_name: Optional[str] = None) -> None: -# super().__init__() -# self._model_name = model_name -# if self._model_name: -# assert ( -# self._model_name in self.support_models -# ), f"model {self._model_name} is not supported" -# if self._model_name == "thenlper/gte-base": -# self.sync_client = self.init_sync_client() -# elif self._model_name == "BAAI/bge-reranker-base": -# self.reranker_client = self.init_reranker_client() -# elif self._model_name == "HuggingFaceH4/zephyr-7b-beta": -# self.llm_client = self.init_llm_client() -# self.async_client = None - -# def init_sync_client(self): -# return TransformerEmbedder() - -# def init_reranker_client(self): -# return TransformerReranker() - -# def init_llm_client(self): -# return TransformerLLM() - -# def set_llm_client(self, llm_client: object): -# r"""Allow user to pass a custom llm client. Here is an example of a custom llm client: - -# Ensure you have parse_chat_completion and __call__ methods which will be applied to api_kwargs specified in transform_client.call(). - -# .. code-block:: python - -# class CustomizeLLM: - -# def __init__(self) -> None: -# pass - -# def parse_chat_completion(self, completion: Any) -> str: -# return completion - -# def __call__(self, messages: Sequence[Dict[str, str]], model: str, **kwargs): -# from transformers import AutoTokenizer, AutoModelForCausalLM - -# tokenizer = AutoTokenizer.from_pretrained( -# "deepseek-ai/deepseek-coder-1.3b-instruct", trust_remote_code=True -# ) -# model = AutoModelForCausalLM.from_pretrained( -# "deepseek-ai/deepseek-coder-1.3b-instruct", -# trust_remote_code=True, -# torch_dtype=torch.bfloat16, -# ).to(get_device()) -# messages = [ -# {"role": "user", "content": "write a quick sort algorithm in python."} -# ] -# inputs = tokenizer.apply_chat_template( -# messages, add_generation_prompt=True, return_tensors="pt" -# ).to(model.device) -# # tokenizer.eos_token_id is the id of <|EOT|> token -# outputs = model.generate( -# inputs, -# max_new_tokens=512, -# do_sample=False, -# top_k=50, -# top_p=0.95, -# num_return_sequences=1, -# eos_token_id=tokenizer.eos_token_id, -# ) -# print( -# tokenizer.decode(outputs[0][len(inputs[0]) :], skip_special_tokens=True) -# ) -# decoded_outputs = [] -# for output in outputs: -# decoded_outputs.append( -# tokenizer.decode(output[len(inputs[0]) :], skip_special_tokens=True) -# ) -# return decoded_outputs - -# llm_client = CustomizeLLM() -# transformer_client.set_llm_client(llm_client) -# # use in the generator -# generator = Generator( -# model_client=transformer_client, -# model_kwargs=model_kwargs, -# prompt_kwargs=prompt_kwargs, -# ...) 
- -# """ -# self.llm_client = llm_client - -# def parse_embedding_response(self, response: Any) -> EmbedderOutput: -# embeddings: List[Embedding] = [] -# for idx, emb in enumerate(response): -# embeddings.append(Embedding(index=idx, embedding=emb)) -# response = EmbedderOutput(data=embeddings) -# return response - -# def parse_chat_completion(self, completion: Any) -> GeneratorOutput: -# try: -# output = self.llm_client.parse_chat_completion(completion) - -# return GeneratorOutput(data=output, raw_response=str(completion)) -# except Exception as e: -# log.error(f"Error parsing chat completion: {e}") -# return GeneratorOutput(data=None, raw_response=str(completion), error=e) - -# def call(self, api_kwargs: Dict = {}, model_type: ModelType = ModelType.UNDEFINED): -# if "model" not in api_kwargs: -# raise ValueError("model must be specified in api_kwargs") -# if api_kwargs["model"] not in self.support_models: -# raise ValueError(f"model {api_kwargs['model']} is not supported") - -# if ( -# model_type == ModelType.EMBEDDER -# and "model" in api_kwargs -# and api_kwargs["model"] == "thenlper/gte-base" -# ): -# if self.sync_client is None: -# self.sync_client = self.init_sync_client() -# return self.sync_client(**api_kwargs) -# elif ( # reranker -# model_type == ModelType.RERANKER -# and "model" in api_kwargs -# and api_kwargs["model"] == "BAAI/bge-reranker-base" -# ): -# if not hasattr(self, "reranker_client") or self.reranker_client is None: -# self.reranker_client = self.init_reranker_client() -# scores = self.reranker_client(**api_kwargs) -# top_k_indices, top_k_scores = get_top_k_indices_scores( -# scores, api_kwargs["top_k"] -# ) -# return top_k_indices, top_k_scores -# elif model_type == ModelType.LLM and "model" in api_kwargs: # LLM -# if not hasattr(self, "llm_client") or self.llm_client is None: -# self.llm_client = self.init_llm_client() -# response = self.llm_client(**api_kwargs) -# return response -# else: -# raise ValueError(f"model_type {model_type} is not supported") - -# def convert_inputs_to_api_kwargs( -# self, -# input: Any, # for retriever, it is a single query, -# model_kwargs: dict = {}, -# model_type: ModelType = ModelType.UNDEFINED, -# ) -> dict: -# final_model_kwargs = model_kwargs.copy() -# if model_type == ModelType.EMBEDDER: -# final_model_kwargs["input"] = input -# return final_model_kwargs -# elif model_type == ModelType.RERANKER: -# assert "model" in final_model_kwargs, "model must be specified" -# assert "documents" in final_model_kwargs, "documents must be specified" -# assert "top_k" in final_model_kwargs, "top_k must be specified" -# final_model_kwargs["query"] = input -# return final_model_kwargs -# elif model_type == ModelType.LLM: -# assert "model" in final_model_kwargs, "model must be specified" -# messages = [{"role": "system", "content": input}] -# final_model_kwargs["messages"] = messages -# return final_model_kwargs -# else: -# raise ValueError(f"model_type {model_type} is not supported") - - -# if __name__ == "__main__": -# from adalflow.core import Generator - -# import adalflow as adal - -# adal.setup_env() - -# rag_template = r""" -# You are a helpful assistant. - -# Your task is to answer the query that may or may not come with context information. -# When context is provided, you should stick to the context and less on your prior knowledge to answer the query. 
-# -# -# -# {{input_str}} -# -# {% if context_str %} -# -# {{context_str}} -# -# {% endif %} -# -# """ - -# template = """{{input_str}}""" - -# model_kwargs = { -# "model": "google/gemma-2-2b", -# "temperature": 1, -# "stream": False, -# } -# prompt_kwargs = { -# "input_str": "Where is Brian?", -# # "context_str": "Brian is in the kitchen.", -# } -# prompt_kwargs = { -# "input_str": "What is the capital of France?", -# } - -# class CustomizeLLM: - -# def __init__(self) -> None: -# pass - -# def parse_chat_completion(self, completion: Any) -> str: -# return completion[0] - -# def __call__(self, messages: Sequence[Dict[str, str]], model: str, **kwargs): -# r"""take api key""" -# from transformers import AutoTokenizer, AutoModelForCausalLM - -# tokenizer = AutoTokenizer.from_pretrained( -# "deepseek-ai/deepseek-coder-1.3b-instruct", trust_remote_code=True -# ) -# model = AutoModelForCausalLM.from_pretrained( -# "deepseek-ai/deepseek-coder-1.3b-instruct", -# trust_remote_code=True, -# torch_dtype=torch.bfloat16, -# ).to(get_device()) -# messages = [ -# {"role": "user", "content": "write a quick sort algorithm in python."} -# ] -# inputs = tokenizer.apply_chat_template( -# messages, add_generation_prompt=True, return_tensors="pt" -# ).to(model.device) -# # tokenizer.eos_token_id is the id of <|EOT|> token -# outputs = model.generate( -# inputs, -# max_new_tokens=512, -# do_sample=False, -# top_k=50, -# top_p=0.95, -# num_return_sequences=1, -# eos_token_id=tokenizer.eos_token_id, -# ) - -# decoded_outputs = [] -# for output in outputs: -# decoded_outputs.append( -# tokenizer.decode(output[len(inputs[0]) :], skip_special_tokens=True) -# ) -# return decoded_outputs - -# transformer_client = TransformersClient() -# transformer_client.set_llm_client(CustomizeLLM()) -# generator = Generator( -# model_client=transformer_client, -# model_kwargs=model_kwargs, -# # prompt_kwargs=prompt_kwargs, -# template=template, -# # output_processors=JsonParser(), -# ) - -# output = generator(prompt_kwargs=prompt_kwargs) -# print(output) From 0059bbfc57093c8bb2a0b289f919b0db70ec68bf Mon Sep 17 00:00:00 2001 From: Alexandre Adjahossou Date: Mon, 9 Sep 2024 14:04:46 +0000 Subject: [PATCH 24/36] Added __doc__ for the client classes. --- .../model_client/transformers_client.py | 21 +++++++++++++++++++ 1 file changed, 21 insertions(+) diff --git a/adalflow/adalflow/components/model_client/transformers_client.py b/adalflow/adalflow/components/model_client/transformers_client.py index 19890eba..f1ea0f7e 100644 --- a/adalflow/adalflow/components/model_client/transformers_client.py +++ b/adalflow/adalflow/components/model_client/transformers_client.py @@ -75,7 +75,14 @@ def clean_device_cache(): class TransformerEmbeddingModelClient(ModelClient): + __doc__ = r"""LightRAG API client for embedding models using HuggingFace's transformers library. + Use: ``ls ~/.cache/huggingface/hub `` to see the cached models. + + Some modeles are gated, you will need to their page to get the access token. + Find how to apply tokens here: https://huggingface.co/docs/hub/security-tokens + Once you have a token and have access, put the token in the environment variable HF_TOKEN. + """ # # Model initialisation # @@ -242,7 +249,14 @@ def convert_inputs_to_api_kwargs( class TransformerLLMModelClient(ModelClient): + __doc__ = r"""LightRAG API client for text generation models using HuggingFace's transformers library. + + Use: ``ls ~/.cache/huggingface/hub `` to see the cached models. 
+ Some modeles are gated, you will need to their page to get the access token. + Find how to apply tokens here: https://huggingface.co/docs/hub/security-tokens + Once you have a token and have access, put the token in the environment variable HF_TOKEN. + """ # # Model initialisation # @@ -553,7 +567,14 @@ def convert_inputs_to_api_kwargs( class TransformerRerankerModelClient(ModelClient): + __doc__ = r"""LightRAG API client for reranker (cross-encoder) models using HuggingFace's transformers library. + + Use: ``ls ~/.cache/huggingface/hub `` to see the cached models. + Some modeles are gated, you will need to their page to get the access token. + Find how to apply tokens here: https://huggingface.co/docs/hub/security-tokens + Once you have a token and have access, put the token in the environment variable HF_TOKEN. + """ # # Model initialisation # From 53d5384a75d2cea238c120fa9b2a18718e900a78 Mon Sep 17 00:00:00 2001 From: Alexandre Adjahossou Date: Tue, 10 Sep 2024 10:19:10 +0000 Subject: [PATCH 25/36] Formatting. --- .../model_client/transformers_client.py | 47 +++++++++++++------ 1 file changed, 33 insertions(+), 14 deletions(-) diff --git a/adalflow/adalflow/components/model_client/transformers_client.py b/adalflow/adalflow/components/model_client/transformers_client.py index f1ea0f7e..f6f8b239 100644 --- a/adalflow/adalflow/components/model_client/transformers_client.py +++ b/adalflow/adalflow/components/model_client/transformers_client.py @@ -6,7 +6,6 @@ import re import warnings - from adalflow.core.model_client import ModelClient from adalflow.core.types import GeneratorOutput, ModelType, Embedding, EmbedderOutput from adalflow.core.functional import get_top_k_indices_scores @@ -14,27 +13,29 @@ # optional import from adalflow.utils.lazy_import import safe_import, OptionalPackages - -transformers = safe_import( - OptionalPackages.TRANSFORMERS.value[0], OptionalPackages.TRANSFORMERS.value[1] -) -torch = safe_import(OptionalPackages.TORCH.value[0], OptionalPackages.TORCH.value[1]) - -import torch - import torch.nn.functional as F from torch import Tensor +import torch from transformers import ( + PreTrainedModel, + PreTrainedTokenizer, + PreTrainedTokenizerFast, AutoTokenizer, AutoModel, AutoModelForCausalLM, AutoModelForSequenceClassification, pipeline ) - from os import getenv as get_env_variable +transformers = safe_import( + OptionalPackages.TRANSFORMERS.value[0], + OptionalPackages.TRANSFORMERS.value[1] +) +torch = safe_import(OptionalPackages.TORCH.value[0], OptionalPackages.TORCH.value[1]) + + log = logging.getLogger(__name__) @@ -43,13 +44,12 @@ def average_pool(last_hidden_states: Tensor, attention_mask: list) -> Tensor: return last_hidden.sum(dim=1) / attention_mask.sum(dim=1)[..., None] -from transformers import PreTrainedModel, PreTrainedTokenizer, PreTrainedTokenizerFast - def mean_pooling(model_output: dict, attention_mask) -> Tensor: token_embeddings = model_output[0] #First element of model_output contains all token embeddings input_mask_expanded = attention_mask.unsqueeze(-1).expand(token_embeddings.size()).float() return torch.sum(token_embeddings * input_mask_expanded, 1) / torch.clamp(input_mask_expanded.sum(1), min=1e-9) + def get_device(): # Check device availability and set the device if torch.cuda.is_available(): @@ -131,6 +131,7 @@ def __init__( self.init_sync_client() + def init_sync_client(self): self.init_model( model_name=self.model_name, @@ -140,6 +141,7 @@ def init_sync_client(self): custom_tokenizer=self.custom_tokenizer ) + @lru_cache(None) def 
init_model( self, @@ -188,20 +190,24 @@ def infer_embedding( embeddings = embeddings.tolist() return embeddings + def handle_input(self, input: Union[str, List[str], List[List[str]]]) -> Union[List[str], List[List[str]]]: if isinstance(input, str): input = [input] return input + def tokenize_inputs(self, input: Union[str, List[str], List[List[str]]], kwargs: Optional[dict] = dict()) -> dict: batch_dict = self.tokenizer(input, **kwargs) return batch_dict + def compute_model_outputs(self, batch_dict: dict, model: PreTrainedModel) -> dict: with torch.no_grad(): outputs = model(**batch_dict) return outputs + def compute_embeddings(self, outputs: dict, batch_dict: dict): embeddings = mean_pooling( outputs, batch_dict["attention_mask"] @@ -229,6 +235,7 @@ def call(self, api_kwargs: Dict = {}, model_type: Optional[ModelType]= ModelType # inference the model return self.infer_embedding(api_kwargs["input"]) + def parse_embedding_response(self, response: Union[List, Tensor]) -> EmbedderOutput: embeddings: List[Embedding] = [] for idx, emb in enumerate(response): @@ -236,6 +243,7 @@ def parse_embedding_response(self, response: Union[List, Tensor]) -> EmbedderOut response = EmbedderOutput(data=embeddings) return response + def convert_inputs_to_api_kwargs( self, input: Any, # for retriever, it is a single query, @@ -289,12 +297,14 @@ def __init__( if model_name is not None: self.init_model(model_name=model_name) + def _check_token(self, token: str): if get_env_variable(token) is None: warnings.warn( f"{token} is not set. You may not be able to access the model." ) + def _get_token_if_relevant(self) -> Union[str, bool]: if self.use_token: self._check_token("HF_TOKEN") @@ -303,6 +313,7 @@ def _get_token_if_relevant(self) -> Union[str, bool]: token = False return token + def _init_from_pipeline(self): clean_device_cache() @@ -315,6 +326,7 @@ def _init_from_pipeline(self): token=token ) + def _init_from_automodelcasual_lm(self): token = self._get_token_if_relevant() # return a token str or False @@ -414,6 +426,7 @@ def _infer_from_pipeline( log.info(f"Outputs: {outputs}") return outputs + def _infer_from_automodelcasual_lm( self, *, @@ -447,6 +460,7 @@ def _infer_from_automodelcasual_lm( outputs.append(self.tokenizer.decode(output)) return outputs + def _handle_input( self, messages: Sequence[Dict[str, str]], @@ -470,6 +484,7 @@ def _handle_input( text = messages[-1]["content"] return text + def infer_llm( self, *, @@ -523,12 +538,11 @@ def call(self, api_kwargs: Dict = {}, model_type: Optional[ModelType]= ModelType output = self.infer_llm(**api_kwargs) return output + def _parse_chat_completion_from_pipeline(self, completion: Any) -> str: text = completion[0]["generated_text"] - pattern = r"(?<=\|assistant\|>).*" - match = re.search(pattern, text) if match: @@ -537,10 +551,12 @@ def _parse_chat_completion_from_pipeline(self, completion: Any) -> str: else: return "" + def _parse_chat_completion_from_automodelcasual_lm(self, completion: Any) -> GeneratorOutput: print(f"completion: {completion}") return completion[0] + def parse_chat_completion(self, completion: Any) -> str: try: if self.init_from == "pipeline": @@ -552,6 +568,7 @@ def parse_chat_completion(self, completion: Any) -> str: log.error(f"Error parsing chat completion: {e}") return GeneratorOutput(data=None, raw_response=str(completion), error=e) + def convert_inputs_to_api_kwargs( self, input: Any, # for retriever, it is a single query, @@ -596,6 +613,7 @@ def __init__( if model_name is not None: self.init_model(model_name=model_name) + def 
init_model(self, model_name: str): try: self.tokenizer = self.auto_tokenizer.from_pretrained( @@ -687,6 +705,7 @@ def call(self, api_kwargs: Dict = {}): log.warning(f"output: ({top_k_indices}, {top_k_scores})") return top_k_indices, top_k_scores + def convert_inputs_to_api_kwargs( self, input: Any, # for retriever, it is a single query, From be8041c67cdfc35dde0e09bf8267ad2ec7486f15 Mon Sep 17 00:00:00 2001 From: Alexandre Adjahossou Date: Tue, 10 Sep 2024 10:58:59 +0000 Subject: [PATCH 26/36] Added example for transformers_client module + fixed import. --- tutorials/model_client.ipynb | 170 ++++++++++++++++++++++++++++++++++- 1 file changed, 167 insertions(+), 3 deletions(-) diff --git a/tutorials/model_client.ipynb b/tutorials/model_client.ipynb index 60ea6585..f3f302d9 100644 --- a/tutorials/model_client.ipynb +++ b/tutorials/model_client.ipynb @@ -24,9 +24,9 @@ } ], "source": [ - "from lightrag.components.model_client import OpenAIClient\n", - "from lightrag.core.types import ModelType\n", - "from lightrag.utils import setup_env\n", + "from adalflow.components.model_client import OpenAIClient\n", + "from adalflow.core.types import ModelType\n", + "from adalflow.utils import setup_env\n", "\n", "openai_client = OpenAIClient()\n", "\n", @@ -61,6 +61,170 @@ "print(f\"reponse_embedder_output: {reponse_embedder_output}\")\n" ] }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "For local models, we can use the client classes from the `transformers_client` module:\n", + "- TransformerEmbeddingModelClient\n", + "- TransformerLLMModelClient\n", + "- TransformerRerankerModelClient" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "metadata": {}, + "outputs": [ + { + "ename": "ModuleNotFoundError", + "evalue": "No module named 'adalflow'", + "output_type": "error", + "traceback": [ + "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", + "\u001b[0;31mModuleNotFoundError\u001b[0m Traceback (most recent call last)", + "\u001b[1;32m/workspaces/transformer_client/AdalFlow/tutorials/model_client.ipynb Cell 4\u001b[0m line \u001b[0;36m1\n\u001b[0;32m----> 1\u001b[0m \u001b[39mfrom\u001b[39;00m \u001b[39madalflow\u001b[39;00m\u001b[39m.\u001b[39;00m\u001b[39mcomponents\u001b[39;00m\u001b[39m.\u001b[39;00m\u001b[39mmodel_client\u001b[39;00m\u001b[39m.\u001b[39;00m\u001b[39mtransformers_client\u001b[39;00m \u001b[39mimport\u001b[39;00m (\n\u001b[1;32m 2\u001b[0m TransformerEmbeddingModelClient,\n\u001b[1;32m 3\u001b[0m TransformerLLMModelClient,\n\u001b[1;32m 4\u001b[0m TransformerRerankerModelClient\n\u001b[1;32m 5\u001b[0m )\n", + "\u001b[0;31mModuleNotFoundError\u001b[0m: No module named 'adalflow'" + ] + } + ], + "source": [ + "from adalflow.components.model_client.transformers_client import (\n", + " TransformerEmbeddingModelClient,\n", + " TransformerLLMModelClient,\n", + " TransformerRerankerModelClient\n", + " )\n" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "metadata": {}, + "outputs": [], + "source": [ + "query = \"Where is Brian?\"\n", + "documents = [\n", + " \"Brian is in the kitchen.\",\n", + " \"I love Adalflow.\",\n", + " \"Brian too.\"\n", + "]" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "metadata": {}, + "outputs": [ + { + "ename": "NameError", + "evalue": "name 'TransformerEmbeddingModelClient' is not defined", + "output_type": "error", + "traceback": [ + "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", + 
"\u001b[0;31mNameError\u001b[0m Traceback (most recent call last)", + "\u001b[1;32m/workspaces/transformer_client/AdalFlow/tutorials/model_client.ipynb Cell 6\u001b[0m line \u001b[0;36m9\n\u001b[1;32m 2\u001b[0m model_kwargs \u001b[39m=\u001b[39m {\u001b[39m\"\u001b[39m\u001b[39mmodel\u001b[39m\u001b[39m\"\u001b[39m: embedding_model}\n\u001b[1;32m 3\u001b[0m tokenizer_kwargs \u001b[39m=\u001b[39m {\n\u001b[1;32m 4\u001b[0m \u001b[39m\"\u001b[39m\u001b[39mmax_length\u001b[39m\u001b[39m\"\u001b[39m: \u001b[39m512\u001b[39m,\n\u001b[1;32m 5\u001b[0m \u001b[39m\"\u001b[39m\u001b[39mpadding\u001b[39m\u001b[39m\"\u001b[39m: \u001b[39mTrue\u001b[39;00m,\n\u001b[1;32m 6\u001b[0m \u001b[39m\"\u001b[39m\u001b[39mtruncation\u001b[39m\u001b[39m\"\u001b[39m: \u001b[39mTrue\u001b[39;00m,\n\u001b[1;32m 7\u001b[0m \u001b[39m\"\u001b[39m\u001b[39mreturn_tensors\u001b[39m\u001b[39m\"\u001b[39m: \u001b[39m'\u001b[39m\u001b[39mpt\u001b[39m\u001b[39m'\u001b[39m\n\u001b[1;32m 8\u001b[0m }\n\u001b[0;32m----> 9\u001b[0m model_client \u001b[39m=\u001b[39m TransformerEmbeddingModelClient(\n\u001b[1;32m 10\u001b[0m model_name\u001b[39m=\u001b[39membedding_model,\n\u001b[1;32m 11\u001b[0m tokenizer_kwargs\u001b[39m=\u001b[39mtokenizer_kwargs\n\u001b[1;32m 12\u001b[0m )\n\u001b[1;32m 13\u001b[0m \u001b[39mprint\u001b[39m(\n\u001b[1;32m 14\u001b[0m \u001b[39mf\u001b[39m\u001b[39m\"\u001b[39m\u001b[39mTesting model client with model \u001b[39m\u001b[39m{\u001b[39;00membedding_model\u001b[39m}\u001b[39;00m\u001b[39m\"\u001b[39m\n\u001b[1;32m 15\u001b[0m )\n\u001b[1;32m 16\u001b[0m api_kwargs \u001b[39m=\u001b[39m model_client\u001b[39m.\u001b[39mconvert_inputs_to_api_kwargs(\u001b[39minput\u001b[39m\u001b[39m=\u001b[39mquery, model_kwargs\u001b[39m=\u001b[39mmodel_kwargs)\n", + "\u001b[0;31mNameError\u001b[0m: name 'TransformerEmbeddingModelClient' is not defined" + ] + } + ], + "source": [ + "embedding_model = \"thenlper/gte-base\"\n", + "model_kwargs = {\"model\": embedding_model}\n", + "tokenizer_kwargs = {\n", + " \"max_length\": 512,\n", + " \"padding\": True,\n", + " \"truncation\": True,\n", + " \"return_tensors\": 'pt'\n", + "}\n", + "model_client = TransformerEmbeddingModelClient(\n", + " model_name=embedding_model,\n", + " tokenizer_kwargs=tokenizer_kwargs\n", + ")\n", + "print(\n", + " f\"Testing model client with model {embedding_model}\"\n", + ")\n", + "api_kwargs = model_client.convert_inputs_to_api_kwargs(input=query, model_kwargs=model_kwargs)\n", + "print(f\"api_kwargs: {api_kwargs}\")\n", + "output = model_client.call(api_kwargs=api_kwargs)\n", + "print(output)" + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "metadata": {}, + "outputs": [ + { + "ename": "NameError", + "evalue": "name 'TransformerLLMModelClient' is not defined", + "output_type": "error", + "traceback": [ + "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", + "\u001b[0;31mNameError\u001b[0m Traceback (most recent call last)", + "\u001b[1;32m/workspaces/transformer_client/AdalFlow/tutorials/model_client.ipynb Cell 7\u001b[0m line \u001b[0;36m1\n\u001b[1;32m 6\u001b[0m tokenizer_kwargs \u001b[39m=\u001b[39m {\n\u001b[1;32m 7\u001b[0m \u001b[39m\"\u001b[39m\u001b[39mmax_length\u001b[39m\u001b[39m\"\u001b[39m: \u001b[39mTrue\u001b[39;00m,\n\u001b[1;32m 8\u001b[0m \u001b[39m\"\u001b[39m\u001b[39mtruncation\u001b[39m\u001b[39m\"\u001b[39m: \u001b[39mTrue\u001b[39;00m,\n\u001b[1;32m 9\u001b[0m }\n\u001b[1;32m 10\u001b[0m prompt_kwargs \u001b[39m=\u001b[39m {\n\u001b[1;32m 
11\u001b[0m \u001b[39m\"\u001b[39m\u001b[39minput_str\u001b[39m\u001b[39m\"\u001b[39m: query, \n\u001b[1;32m 12\u001b[0m }\n\u001b[0;32m---> 13\u001b[0m model_client \u001b[39m=\u001b[39m TransformerLLMModelClient(\n\u001b[1;32m 14\u001b[0m tokenizer_kwargs\u001b[39m=\u001b[39mtokenizer_kwargs,\n\u001b[1;32m 15\u001b[0m local_files_only\u001b[39m=\u001b[39m\u001b[39mFalse\u001b[39;00m,\n\u001b[1;32m 16\u001b[0m init_from\u001b[39m=\u001b[39m\u001b[39m\"\u001b[39m\u001b[39mautoclass\u001b[39m\u001b[39m\"\u001b[39m,\n\u001b[1;32m 17\u001b[0m )\n\u001b[1;32m 18\u001b[0m api_kwargs \u001b[39m=\u001b[39m model_client\u001b[39m.\u001b[39mconvert_inputs_to_api_kwargs(\u001b[39minput\u001b[39m\u001b[39m=\u001b[39mquery, model_kwargs\u001b[39m=\u001b[39mmodel_kwargs)\n\u001b[1;32m 19\u001b[0m \u001b[39mprint\u001b[39m(\u001b[39mf\u001b[39m\u001b[39m\"\u001b[39m\u001b[39mapi_kwargs: \u001b[39m\u001b[39m{\u001b[39;00mapi_kwargs\u001b[39m}\u001b[39;00m\u001b[39m\"\u001b[39m)\n", + "\u001b[0;31mNameError\u001b[0m: name 'TransformerLLMModelClient' is not defined" + ] + } + ], + "source": [ + "model_kwargs = {\n", + " \"model\": \"roneneldan/TinyStories-1M\",\n", + " \"temperature\": 0.1,\n", + " \"do_sample\": True\n", + "}\n", + "tokenizer_kwargs = {\n", + " \"max_length\": True,\n", + " \"truncation\": True,\n", + "}\n", + "prompt_kwargs = {\n", + " \"input_str\": query, \n", + "}\n", + "model_client = TransformerLLMModelClient(\n", + " tokenizer_kwargs=tokenizer_kwargs,\n", + " local_files_only=False,\n", + " init_from=\"autoclass\",\n", + " )\n", + "api_kwargs = model_client.convert_inputs_to_api_kwargs(input=query, model_kwargs=model_kwargs)\n", + "print(f\"api_kwargs: {api_kwargs}\")\n", + "output = model_client.call(api_kwargs=api_kwargs)\n", + "print(f\"reponse_embedder_output: {reponse_embedder_output}\")" + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "metadata": {}, + "outputs": [ + { + "ename": "NameError", + "evalue": "name 'TransformerRerankerModelClient' is not defined", + "output_type": "error", + "traceback": [ + "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", + "\u001b[0;31mNameError\u001b[0m Traceback (most recent call last)", + "\u001b[1;32m/workspaces/transformer_client/AdalFlow/tutorials/model_client.ipynb Cell 8\u001b[0m line \u001b[0;36m2\n\u001b[1;32m 1\u001b[0m transformer_reranker_model \u001b[39m=\u001b[39m \u001b[39m\"\u001b[39m\u001b[39mBAAI/bge-reranker-base\u001b[39m\u001b[39m\"\u001b[39m\n\u001b[0;32m----> 2\u001b[0m model_client \u001b[39m=\u001b[39m TransformerRerankerModelClient(\n\u001b[1;32m 3\u001b[0m tokenizer_kwargs\u001b[39m=\u001b[39m{\u001b[39m\"\u001b[39m\u001b[39mpadding\u001b[39m\u001b[39m\"\u001b[39m: \u001b[39mTrue\u001b[39;00m}\n\u001b[1;32m 4\u001b[0m )\n\u001b[1;32m 6\u001b[0m model_kwargs \u001b[39m=\u001b[39m {\n\u001b[1;32m 7\u001b[0m \u001b[39m\"\u001b[39m\u001b[39mmodel\u001b[39m\u001b[39m\"\u001b[39m: transformer_reranker_model,\n\u001b[1;32m 8\u001b[0m \u001b[39m\"\u001b[39m\u001b[39mdocuments\u001b[39m\u001b[39m\"\u001b[39m: documents,\n\u001b[1;32m 9\u001b[0m \u001b[39m\"\u001b[39m\u001b[39mtop_k\u001b[39m\u001b[39m\"\u001b[39m: \u001b[39m2\u001b[39m,\n\u001b[1;32m 10\u001b[0m }\n\u001b[1;32m 12\u001b[0m api_kwargs \u001b[39m=\u001b[39m model_client\u001b[39m.\u001b[39mconvert_inputs_to_api_kwargs(query, model_kwargs\u001b[39m=\u001b[39mmodel_kwargs)\n", + "\u001b[0;31mNameError\u001b[0m: name 'TransformerRerankerModelClient' is not defined" + ] + } + ], + "source": [ + 
"transformer_reranker_model = \"BAAI/bge-reranker-base\"\n", + "model_client = TransformerRerankerModelClient(\n", + " tokenizer_kwargs={\"padding\": True}\n", + ")\n", + "\n", + "model_kwargs = {\n", + " \"model\": transformer_reranker_model,\n", + " \"documents\": documents,\n", + " \"top_k\": 2,\n", + "}\n", + "\n", + "api_kwargs = model_client.convert_inputs_to_api_kwargs(query, model_kwargs=model_kwargs)\n", + "print(f\"api_kwargs: {api_kwargs}\")\n", + "output = model_client.call(api_kwargs)\n", + "print(f\"reponse_embedder_output: {reponse_embedder_output}\")" + ] + }, { "cell_type": "markdown", "metadata": {}, From f4abfeb9683e266a9208602cb74cf4b3aee04965 Mon Sep 17 00:00:00 2001 From: Alexandre Adjahossou Date: Tue, 10 Sep 2024 11:54:20 +0000 Subject: [PATCH 27/36] Restored originial file. --- adalflow/adalflow/core/embedder.py | 10 +++--- adalflow/adalflow/core/generator.py | 51 ++++++++++++++++------------- 2 files changed, 34 insertions(+), 27 deletions(-) diff --git a/adalflow/adalflow/core/embedder.py b/adalflow/adalflow/core/embedder.py index c8f069a9..588347ba 100644 --- a/adalflow/adalflow/core/embedder.py +++ b/adalflow/adalflow/core/embedder.py @@ -100,7 +100,8 @@ def _pre_call( # step 2: convert the input to the api_kwargs api_kwargs = self.model_client.convert_inputs_to_api_kwargs( input=input, - model_kwargs=composed_model_kwargs + model_kwargs=composed_model_kwargs, + model_type=self.model_type, ) log.debug(f"api_kwargs: {api_kwargs}") return api_kwargs @@ -139,8 +140,7 @@ def call( response = None try: response = self.model_client.call( - api_kwargs=api_kwargs, - model_type=self.model_type + api_kwargs=api_kwargs, model_type=self.model_type ) except Exception as e: log.error(f"Error calling the model: {e}") @@ -169,7 +169,7 @@ async def acall( response = None try: response = await self.model_client.acall( - api_kwargs=api_kwargs + api_kwargs=api_kwargs, model_type=self.model_type ) except Exception as e: log.error(f"Error calling the model: {e}") @@ -231,4 +231,4 @@ def call( input=batch_input, model_kwargs=model_kwargs ) embeddings.append(batch_output) - return embeddings + return embeddings \ No newline at end of file diff --git a/adalflow/adalflow/core/generator.py b/adalflow/adalflow/core/generator.py index a89eb34b..7868b44e 100644 --- a/adalflow/adalflow/core/generator.py +++ b/adalflow/adalflow/core/generator.py @@ -6,7 +6,6 @@ import json from typing import Any, Dict, Optional, Union, Callable, Tuple, List -from copy import deepcopy import logging @@ -110,11 +109,6 @@ def __init__( ) template = template or DEFAULT_LIGHTRAG_SYSTEM_PROMPT - try: - prompt_kwargs = deepcopy(prompt_kwargs) - except Exception as e: - log.warning(f"Error copying the prompt_kwargs: {e}") - prompt_kwargs = prompt_kwargs # Cache model_str = ( @@ -125,8 +119,6 @@ def __init__( ) self.cache_path = os.path.join(_cache_path, f"cache_{model_str}.db") - print(f"cache_path: {self.cache_path}") - CachedEngine.__init__(self, cache_path=self.cache_path) Component.__init__(self) GradComponent.__init__(self) @@ -167,6 +159,10 @@ def __init__( } self._teacher: Optional["Generator"] = None + def get_cache_path(self) -> str: + r"""Get the cache path for the generator.""" + return self.cache_path + @staticmethod def _get_default_mapping( output: "GeneratorOutput" = None, @@ -269,11 +265,9 @@ def _compose_model_kwargs(self, **model_kwargs) -> Dict: return combined_model_kwargs def print_prompt(self, **kwargs) -> str: - # prompt_kwargs_str = _convert_prompt_kwargs_to_str(kwargs) return 
self.prompt.print_prompt(**kwargs) def get_prompt(self, **kwargs) -> str: - # prompt_kwargs_str = _convert_prompt_kwargs_to_str(kwargs) return self.prompt.call(**kwargs) def _extra_repr(self) -> str: @@ -312,6 +306,7 @@ def _pre_call(self, prompt_kwargs: Dict, model_kwargs: Dict) -> Dict[str, Any]: api_kwargs = self.model_client.convert_inputs_to_api_kwargs( input=prompt_str, model_kwargs=composed_model_kwargs, + model_type=self.model_type, ) return api_kwargs @@ -328,8 +323,7 @@ def _model_client_call(self, api_kwargs: Dict, use_cache: bool = False) -> Any: return cached_completion completion = self.model_client.call( - api_kwargs=api_kwargs, - model_type=self.model_type + api_kwargs=api_kwargs, model_type=self.model_type ) # prepare cache if use_cache: @@ -420,8 +414,12 @@ def forward( if self.mock_output: output = GeneratorOutput(data=self.mock_output_data) else: - if self.teacher_mode: + if self.teacher_mode and not isinstance(self, BackwardEngine): if not self._teacher: + print( + f"prompt_kwargs: {prompt_kwargs}, model_kwargs: {model_kwargs}" + ) + print(f"names: {self.name}") raise ValueError("Teacher generator is not set.") log.info(f"Using teacher: {self._teacher}") input_args = { @@ -706,7 +704,6 @@ def _run_callbacks( model_kwargs=model_kwargs, ) if output.error: - print(f"call back on failure: {output}") self.trigger_callbacks( "on_failure", output=output, @@ -799,7 +796,7 @@ async def acall( try: completion = await self.model_client.acall( - api_kwargs=api_kwargs + api_kwargs=api_kwargs, model_type=self.model_type ) except Exception as e: log.error(f"Error calling the model: {e}") @@ -830,9 +827,23 @@ def __call__(self, *args, **kwargs) -> Union[GeneratorOutputType, Any]: return self.call(*args, **kwargs) def _extra_repr(self) -> str: + # Create the string for model_kwargs s = f"model_kwargs={self.model_kwargs}, " + + # Create the string for trainable prompt_kwargs + prompt_kwargs_repr = [ + k + for k, v in self.prompt_kwargs.items() + if isinstance(v, Parameter) and v.requires_opt + ] + + s += f"trainable_prompt_kwargs={prompt_kwargs_repr}" return s + def to_dict(self) -> Dict[str, Any]: + r"""Convert the generator to a dictionary.""" + # exclude default functions + @staticmethod def failure_message_to_backward_engine( gradient_response: GeneratorOutput, @@ -854,6 +865,8 @@ def __init__(self, **kwargs): kwargs = {} kwargs["template"] = FEEDBACK_ENGINE_TEMPLATE super().__init__(**kwargs) + self.name = "BackwardEngine" + self.teacher_mode = False @staticmethod def failure_message_to_optimizer( @@ -954,7 +967,6 @@ def create_teacher_generator( call_logger = GeneratorCallLogger(save_dir="traces") def on_complete(output, input, prompt_kwargs, model_kwargs, logger_call: Callable): - print(f"on_complet output: {output}") logger_call( output=output, input=input, @@ -963,13 +975,9 @@ def on_complete(output, input, prompt_kwargs, model_kwargs, logger_call: Callabl ) for model in [llama3_model, gpt_3_model, gemini_model, claude_model]: - print(f"""model: {model["model_kwargs"]["model"]}""") generator = Generator(**model) - print("_kwargs: ", generator._kwargs) - teacher = create_teacher_generator(generator, **claude_model) - print(f"teacher: {teacher}") call_logger.register_generator("generator", "generator_call") # setup the callback @@ -983,8 +991,7 @@ def on_complete(output, input, prompt_kwargs, model_kwargs, logger_call: Callabl "input_str": "Hello, world!", } ) - print(f"output: {output}") break # test the backward engine - # TODO: test ollama and transformer client to update the 
change + # TODO: test ollama and transformer client to update the change \ No newline at end of file From 39f2ae2f7ec69596e26498d2e2640ce6794313b6 Mon Sep 17 00:00:00 2001 From: Alexandre Adjahossou Date: Wed, 11 Sep 2024 08:26:19 +0000 Subject: [PATCH 28/36] Fixed test class name. --- adalflow/tests/test_transformer_client.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/adalflow/tests/test_transformer_client.py b/adalflow/tests/test_transformer_client.py index c0b649fc..63ba019d 100644 --- a/adalflow/tests/test_transformer_client.py +++ b/adalflow/tests/test_transformer_client.py @@ -165,7 +165,7 @@ def test_integration_with_generator_pipeline(self): output = generator(prompt_kwargs=self.prompt_kwargs) print(output) -class TransformerRerankerModelClient(unittest.TestCase): +class TestTransformerRerankerModelClient(unittest.TestCase): def setUp(self) -> None: self.query = "what is panda?" From 5481ee78794e16c2d02e3529f611c4fd610d764f Mon Sep 17 00:00:00 2001 From: Alexandre Adjahossou Date: Wed, 11 Sep 2024 08:42:01 +0000 Subject: [PATCH 29/36] Added kwargs for model and tokenizer init. --- .../model_client/transformers_client.py | 39 +++++++++++++------ 1 file changed, 28 insertions(+), 11 deletions(-) diff --git a/adalflow/adalflow/components/model_client/transformers_client.py b/adalflow/adalflow/components/model_client/transformers_client.py index f6f8b239..b0050a75 100644 --- a/adalflow/adalflow/components/model_client/transformers_client.py +++ b/adalflow/adalflow/components/model_client/transformers_client.py @@ -90,6 +90,8 @@ def __init__( self, model_name: Optional[str] = None, tokenizer_kwargs: Optional[dict] = dict(), + auto_model_kargs: Optional[dict] = dict(), + auto_tokenizer_kwargs: Optional[dict] = dict(), auto_model: Optional[type] = AutoModel, auto_tokenizer: Optional[type] = AutoTokenizer, custom_model: Optional[PreTrainedModel] = None, @@ -99,6 +101,8 @@ def __init__( super().__init__() self.model_name = model_name self.tokenizer_kwargs = tokenizer_kwargs + self.auto_model_kargs = auto_model_kargs + self.auto_tokenizer_kwargs = auto_tokenizer_kwargs if "return_tensors" not in self.tokenizer_kwargs: self.tokenizer_kwargs["return_tensors"]= "pt" self.auto_model=auto_model @@ -135,6 +139,8 @@ def __init__( def init_sync_client(self): self.init_model( model_name=self.model_name, + auto_model_kargs=self.auto_model_kargs, + auto_tokenizer_kwargs=self.auto_tokenizer_kwargs, auto_model=self.auto_model, auto_tokenizer=self.auto_tokenizer, custom_model=self.custom_model, @@ -146,6 +152,8 @@ def init_sync_client(self): def init_model( self, model_name: Optional[str] = None, + auto_model_kargs: Optional[dict] = dict(), + auto_tokenizer_kwargs: Optional[dict] = dict(), auto_model: Optional[type] = AutoModel, auto_tokenizer: Optional[type] = AutoTokenizer, custom_model: Optional[PreTrainedModel] = None, @@ -154,12 +162,12 @@ def init_model( try: if self.use_auto_model: - self.model = auto_model.from_pretrained(model_name) + self.model = auto_model.from_pretrained(model_name, **auto_model_kargs) else: self.model = custom_model if self.use_auto_tokenizer: - self.tokenizer = auto_tokenizer.from_pretrained(model_name) + self.tokenizer = auto_tokenizer.from_pretrained(model_name, **auto_tokenizer_kwargs) else: self.tokenizer = custom_tokenizer @@ -271,7 +279,9 @@ class TransformerLLMModelClient(ModelClient): def __init__( self, model_name: Optional[str] = None, - tokenizer_kwargs: Optional[dict] = {}, + tokenizer_decode_kwargs: Optional[dict] = {}, + auto_model_kargs: 
Optional[dict] = dict(), + auto_tokenizer_kwargs: Optional[dict] = dict(), init_from: Optional[str] = "autoclass", apply_chat_template: bool = False, chat_template: Optional[str] = None, @@ -283,7 +293,9 @@ def __init__( super().__init__() self.model_name = model_name # current model to use - self.tokenizer_kwargs = tokenizer_kwargs + self.tokenizer_decode_kwargs = tokenizer_decode_kwargs + self.auto_model_kargs = auto_model_kargs + self.auto_tokenizer_kwargs = auto_tokenizer_kwargs if "return_tensors" not in self.tokenizer_kwargs: self.tokenizer_kwargs["return_tensors"]= "pt" self.use_token = use_token @@ -335,14 +347,15 @@ def _init_from_automodelcasual_lm(self): self.model_name, token=token, local_files_only=self.local_files_only, - **self.tokenizer_kwargs + **self.auto_tokenizer_kwargs ) self.model = AutoModelForCausalLM.from_pretrained( self.model_name, torch_dtype=self.torch_dtype, device_map="auto", token=token, - local_files_only=self.local_files_only + local_files_only=self.local_files_only, + **self.auto_model_kargs ) # Set pad token if it's not already set if self.tokenizer.pad_token is None: @@ -401,7 +414,6 @@ def _infer_from_pipeline( self.model_name, token=self._get_token_if_relevant(), local_files_only=self.local_files_only, - **self.tokenizer_kwargs ) # Set pad token if it's not already set if self.tokenizer.pad_token is None: @@ -476,7 +488,7 @@ def _handle_input( messages, **chat_template_kwargs ) if ("tokenize" in chat_template_kwargs) and (chat_template_kwargs["tokenize"] == True): - prompt = self.tokenizer.decode(prompt) + prompt = self.tokenizer.decode(prompt, **self.tokenizer_decode_kwargs) return prompt else: return prompt @@ -598,12 +610,16 @@ class TransformerRerankerModelClient(ModelClient): def __init__( self, model_name: Optional[str] = None, + tokenizer_kwargs: Optional[dict] = {}, + auto_model_kargs: Optional[dict] = dict(), + auto_tokenizer_kwargs: Optional[dict] = dict(), auto_model: Optional[type] = AutoModelForSequenceClassification, auto_tokenizer: Optional[type] = AutoTokenizer, - tokenizer_kwargs: Optional[dict] = {}, local_files_only: Optional[bool] = False ): self.auto_model = auto_model + self.auto_model_kargs = auto_model_kargs + self.auto_tokenizer_kwargs = auto_tokenizer_kwargs self.auto_tokenizer= auto_tokenizer self.model_name = model_name self.tokenizer_kwargs = tokenizer_kwargs @@ -619,11 +635,12 @@ def init_model(self, model_name: str): self.tokenizer = self.auto_tokenizer.from_pretrained( self.model_name, local_files_only=self.local_files_only, - **self.tokenizer_kwargs + **self.auto_tokenizer_kwargs ) self.model = self.auto_model.from_pretrained( self.model_name, - local_files_only=self.local_files_only + local_files_only=self.local_files_only, + **self.auto_model_kargs ) # Check device availability and set the device device = get_device() From eab39cdebbd780054846ed8f31201efaa645cd05 Mon Sep 17 00:00:00 2001 From: Alexandre Adjahossou Date: Wed, 11 Sep 2024 09:29:10 +0000 Subject: [PATCH 30/36] Fixed typo. 
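
The typo is the keyword name `auto_model_kargs`, which this patch renames to `auto_model_kwargs` in all three client classes. With the corrected names, a client can be constructed roughly as in the sketch below (a minimal, illustrative example only: the kwargs values are placeholders and are not part of this patch; the dicts are simply forwarded to the underlying `from_pretrained` calls):

    from adalflow.components.model_client.transformers_client import (
        TransformerEmbeddingModelClient,
    )

    # Illustrative values only: auto_model_kwargs / auto_tokenizer_kwargs are
    # forwarded to AutoModel.from_pretrained / AutoTokenizer.from_pretrained,
    # while tokenizer_kwargs is applied when the inputs are tokenized.
    client = TransformerEmbeddingModelClient(
        model_name="thenlper/gte-base",
        auto_model_kwargs={"trust_remote_code": False},
        auto_tokenizer_kwargs={"use_fast": True},
        tokenizer_kwargs={"max_length": 512, "padding": True, "truncation": True},
    )
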
--- .../model_client/transformers_client.py | 22 +++++++++---------- 1 file changed, 11 insertions(+), 11 deletions(-) diff --git a/adalflow/adalflow/components/model_client/transformers_client.py b/adalflow/adalflow/components/model_client/transformers_client.py index b0050a75..8a3d06ab 100644 --- a/adalflow/adalflow/components/model_client/transformers_client.py +++ b/adalflow/adalflow/components/model_client/transformers_client.py @@ -90,7 +90,7 @@ def __init__( self, model_name: Optional[str] = None, tokenizer_kwargs: Optional[dict] = dict(), - auto_model_kargs: Optional[dict] = dict(), + auto_model_kwargs: Optional[dict] = dict(), auto_tokenizer_kwargs: Optional[dict] = dict(), auto_model: Optional[type] = AutoModel, auto_tokenizer: Optional[type] = AutoTokenizer, @@ -101,7 +101,7 @@ def __init__( super().__init__() self.model_name = model_name self.tokenizer_kwargs = tokenizer_kwargs - self.auto_model_kargs = auto_model_kargs + self.auto_model_kwargs = auto_model_kwargs self.auto_tokenizer_kwargs = auto_tokenizer_kwargs if "return_tensors" not in self.tokenizer_kwargs: self.tokenizer_kwargs["return_tensors"]= "pt" @@ -139,7 +139,7 @@ def __init__( def init_sync_client(self): self.init_model( model_name=self.model_name, - auto_model_kargs=self.auto_model_kargs, + auto_model_kwargs=self.auto_model_kwargs, auto_tokenizer_kwargs=self.auto_tokenizer_kwargs, auto_model=self.auto_model, auto_tokenizer=self.auto_tokenizer, @@ -152,7 +152,7 @@ def init_sync_client(self): def init_model( self, model_name: Optional[str] = None, - auto_model_kargs: Optional[dict] = dict(), + auto_model_kwargs: Optional[dict] = dict(), auto_tokenizer_kwargs: Optional[dict] = dict(), auto_model: Optional[type] = AutoModel, auto_tokenizer: Optional[type] = AutoTokenizer, @@ -162,7 +162,7 @@ def init_model( try: if self.use_auto_model: - self.model = auto_model.from_pretrained(model_name, **auto_model_kargs) + self.model = auto_model.from_pretrained(model_name, **auto_model_kwargs) else: self.model = custom_model @@ -280,7 +280,7 @@ def __init__( self, model_name: Optional[str] = None, tokenizer_decode_kwargs: Optional[dict] = {}, - auto_model_kargs: Optional[dict] = dict(), + auto_model_kwargs: Optional[dict] = dict(), auto_tokenizer_kwargs: Optional[dict] = dict(), init_from: Optional[str] = "autoclass", apply_chat_template: bool = False, @@ -294,7 +294,7 @@ def __init__( self.model_name = model_name # current model to use self.tokenizer_decode_kwargs = tokenizer_decode_kwargs - self.auto_model_kargs = auto_model_kargs + self.auto_model_kwargs = auto_model_kwargs self.auto_tokenizer_kwargs = auto_tokenizer_kwargs if "return_tensors" not in self.tokenizer_kwargs: self.tokenizer_kwargs["return_tensors"]= "pt" @@ -355,7 +355,7 @@ def _init_from_automodelcasual_lm(self): device_map="auto", token=token, local_files_only=self.local_files_only, - **self.auto_model_kargs + **self.auto_model_kwargs ) # Set pad token if it's not already set if self.tokenizer.pad_token is None: @@ -611,14 +611,14 @@ def __init__( self, model_name: Optional[str] = None, tokenizer_kwargs: Optional[dict] = {}, - auto_model_kargs: Optional[dict] = dict(), + auto_model_kwargs: Optional[dict] = dict(), auto_tokenizer_kwargs: Optional[dict] = dict(), auto_model: Optional[type] = AutoModelForSequenceClassification, auto_tokenizer: Optional[type] = AutoTokenizer, local_files_only: Optional[bool] = False ): self.auto_model = auto_model - self.auto_model_kargs = auto_model_kargs + self.auto_model_kwargs = auto_model_kwargs self.auto_tokenizer_kwargs = 
auto_tokenizer_kwargs self.auto_tokenizer= auto_tokenizer self.model_name = model_name @@ -640,7 +640,7 @@ def init_model(self, model_name: str): self.model = self.auto_model.from_pretrained( self.model_name, local_files_only=self.local_files_only, - **self.auto_model_kargs + **self.auto_model_kwargs ) # Check device availability and set the device device = get_device() From dcd3a7aec73480ad718385fa079915441c03da60 Mon Sep 17 00:00:00 2001 From: Alexandre Adjahossou Date: Wed, 11 Sep 2024 10:53:06 +0000 Subject: [PATCH 31/36] Fixed missing tokenizer_kwargs in TransformerLLMModelClient. --- .../adalflow/components/model_client/transformers_client.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/adalflow/adalflow/components/model_client/transformers_client.py b/adalflow/adalflow/components/model_client/transformers_client.py index 8a3d06ab..b8c4cc03 100644 --- a/adalflow/adalflow/components/model_client/transformers_client.py +++ b/adalflow/adalflow/components/model_client/transformers_client.py @@ -280,6 +280,7 @@ def __init__( self, model_name: Optional[str] = None, tokenizer_decode_kwargs: Optional[dict] = {}, + tokenizer_kwargs: Optional[dict] = {}, auto_model_kwargs: Optional[dict] = dict(), auto_tokenizer_kwargs: Optional[dict] = dict(), init_from: Optional[str] = "autoclass", @@ -294,6 +295,7 @@ def __init__( self.model_name = model_name # current model to use self.tokenizer_decode_kwargs = tokenizer_decode_kwargs + self.tokenizer_kwargs = tokenizer_kwargs self.auto_model_kwargs = auto_model_kwargs self.auto_tokenizer_kwargs = auto_tokenizer_kwargs if "return_tensors" not in self.tokenizer_kwargs: @@ -463,7 +465,7 @@ def _infer_from_automodelcasual_lm( ) else: model_input = self._handle_input(messages) - input_ids = self.tokenizer(model_input, return_tensors="pt").to( + input_ids = self.tokenizer(model_input, **self.tokenizer_kwargs).to( get_device() ) outputs_tokens = self.model.generate(**input_ids, max_length=max_length, max_new_tokens=max_tokens, **kwargs) From 8e39e01dde4516b0ceac213db00be7933097abf3 Mon Sep 17 00:00:00 2001 From: Alexandre Adjahossou Date: Wed, 11 Sep 2024 11:15:08 +0000 Subject: [PATCH 32/36] Addded local_files_only to TransformerEmbeddingModelClient --- .../components/model_client/transformers_client.py | 14 ++++++++++++-- 1 file changed, 12 insertions(+), 2 deletions(-) diff --git a/adalflow/adalflow/components/model_client/transformers_client.py b/adalflow/adalflow/components/model_client/transformers_client.py index b8c4cc03..c0159890 100644 --- a/adalflow/adalflow/components/model_client/transformers_client.py +++ b/adalflow/adalflow/components/model_client/transformers_client.py @@ -94,6 +94,7 @@ def __init__( auto_tokenizer_kwargs: Optional[dict] = dict(), auto_model: Optional[type] = AutoModel, auto_tokenizer: Optional[type] = AutoTokenizer, + local_files_only: Optional[bool] = False, custom_model: Optional[PreTrainedModel] = None, custom_tokenizer: Optional[Union[PreTrainedTokenizer, PreTrainedTokenizerFast]] = None ): @@ -107,6 +108,7 @@ def __init__( self.tokenizer_kwargs["return_tensors"]= "pt" self.auto_model=auto_model self.auto_tokenizer=auto_tokenizer + self.local_files_only = local_files_only self.custom_model=custom_model self.custom_tokenizer=custom_tokenizer @@ -162,12 +164,20 @@ def init_model( try: if self.use_auto_model: - self.model = auto_model.from_pretrained(model_name, **auto_model_kwargs) + self.model = auto_model.from_pretrained( + model_name, + local_files_only=self.local_files_only, + **auto_model_kwargs + 
) else: self.model = custom_model if self.use_auto_tokenizer: - self.tokenizer = auto_tokenizer.from_pretrained(model_name, **auto_tokenizer_kwargs) + self.tokenizer = auto_tokenizer.from_pretrained( + model_name, + local_files_only=self.local_files_only, + **auto_tokenizer_kwargs + ) else: self.tokenizer = custom_tokenizer From 92faa910fda107a3748ef49ac670411d3dc2b060 Mon Sep 17 00:00:00 2001 From: Alexandre Adjahossou Date: Wed, 11 Sep 2024 11:42:42 +0000 Subject: [PATCH 33/36] Fixed mutable default arguments. --- .../model_client/transformers_client.py | 76 ++++++++++--------- 1 file changed, 41 insertions(+), 35 deletions(-) diff --git a/adalflow/adalflow/components/model_client/transformers_client.py b/adalflow/adalflow/components/model_client/transformers_client.py index c0159890..cfb354e9 100644 --- a/adalflow/adalflow/components/model_client/transformers_client.py +++ b/adalflow/adalflow/components/model_client/transformers_client.py @@ -89,9 +89,9 @@ class TransformerEmbeddingModelClient(ModelClient): def __init__( self, model_name: Optional[str] = None, - tokenizer_kwargs: Optional[dict] = dict(), - auto_model_kwargs: Optional[dict] = dict(), - auto_tokenizer_kwargs: Optional[dict] = dict(), + tokenizer_kwargs: Optional[dict] = None, + auto_model_kwargs: Optional[dict] = None, + auto_tokenizer_kwargs: Optional[dict] = None, auto_model: Optional[type] = AutoModel, auto_tokenizer: Optional[type] = AutoTokenizer, local_files_only: Optional[bool] = False, @@ -101,9 +101,9 @@ def __init__( super().__init__() self.model_name = model_name - self.tokenizer_kwargs = tokenizer_kwargs - self.auto_model_kwargs = auto_model_kwargs - self.auto_tokenizer_kwargs = auto_tokenizer_kwargs + self.tokenizer_kwargs = tokenizer_kwargs or dict() + self.auto_model_kwargs = auto_model_kwargs or dict() + self.auto_tokenizer_kwargs = auto_tokenizer_kwargs or dict() if "return_tensors" not in self.tokenizer_kwargs: self.tokenizer_kwargs["return_tensors"]= "pt" self.auto_model=auto_model @@ -154,14 +154,16 @@ def init_sync_client(self): def init_model( self, model_name: Optional[str] = None, - auto_model_kwargs: Optional[dict] = dict(), - auto_tokenizer_kwargs: Optional[dict] = dict(), + auto_model_kwargs: Optional[dict] = None, + auto_tokenizer_kwargs: Optional[dict] = None, auto_model: Optional[type] = AutoModel, auto_tokenizer: Optional[type] = AutoTokenizer, custom_model: Optional[PreTrainedModel] = None, custom_tokenizer: Optional[PreTrainedTokenizer | PreTrainedTokenizerFast] = None ): + self.auto_model_kwargs = auto_model_kwargs or dict() + self.auto_tokenizer_kwargs = auto_tokenizer_kwargs or dict() try: if self.use_auto_model: self.model = auto_model.from_pretrained( @@ -215,7 +217,8 @@ def handle_input(self, input: Union[str, List[str], List[List[str]]]) -> Union[L return input - def tokenize_inputs(self, input: Union[str, List[str], List[List[str]]], kwargs: Optional[dict] = dict()) -> dict: + def tokenize_inputs(self, input: Union[str, List[str], List[List[str]]], kwargs: Optional[dict] = None) -> dict: + kwargs = kwargs or dict() batch_dict = self.tokenizer(input, **kwargs) return batch_dict @@ -235,8 +238,9 @@ def compute_embeddings(self, outputs: dict, batch_dict: dict): # # Preprocessing, postprocessing and call for inference code # - def call(self, api_kwargs: Dict = {}, model_type: Optional[ModelType]= ModelType.UNDEFINED) -> Union[List, Tensor]: - + def call(self, api_kwargs: Dict = None, model_type: Optional[ModelType]= ModelType.UNDEFINED) -> Union[List, Tensor]: + + api_kwargs = 
api_kwargs or dict() if "model" not in api_kwargs: raise ValueError("model must be specified in api_kwargs") # I don't think it is useful anymore @@ -289,14 +293,14 @@ class TransformerLLMModelClient(ModelClient): def __init__( self, model_name: Optional[str] = None, - tokenizer_decode_kwargs: Optional[dict] = {}, - tokenizer_kwargs: Optional[dict] = {}, - auto_model_kwargs: Optional[dict] = dict(), - auto_tokenizer_kwargs: Optional[dict] = dict(), + tokenizer_decode_kwargs: Optional[dict] = None, + tokenizer_kwargs: Optional[dict] = None, + auto_model_kwargs: Optional[dict] = None, + auto_tokenizer_kwargs: Optional[dict] = None, init_from: Optional[str] = "autoclass", apply_chat_template: bool = False, chat_template: Optional[str] = None, - chat_template_kwargs: Optional[dict] = dict(tokenize=False, add_generation_prompt=True), + chat_template_kwargs: Optional[dict] = None, use_token: bool = False, torch_dtype: Optional[Any] = torch.bfloat16, local_files_only: Optional[bool] = False @@ -304,10 +308,10 @@ def __init__( super().__init__() self.model_name = model_name # current model to use - self.tokenizer_decode_kwargs = tokenizer_decode_kwargs - self.tokenizer_kwargs = tokenizer_kwargs - self.auto_model_kwargs = auto_model_kwargs - self.auto_tokenizer_kwargs = auto_tokenizer_kwargs + self.tokenizer_decode_kwargs = tokenizer_decode_kwargs or dict() + self.tokenizer_kwargs = tokenizer_kwargs or dict() + self.auto_model_kwargs = auto_model_kwargs or dict() + self.auto_tokenizer_kwargs = auto_tokenizer_kwargs or dict() if "return_tensors" not in self.tokenizer_kwargs: self.tokenizer_kwargs["return_tensors"]= "pt" self.use_token = use_token @@ -315,7 +319,7 @@ def __init__( self.init_from = init_from self.apply_chat_template = apply_chat_template self.chat_template = chat_template - self.chat_template_kwargs = chat_template_kwargs + self.chat_template_kwargs = chat_template_kwargs or dict(tokenize=False, add_generation_prompt=True) self.local_files_only = local_files_only self.model = None if model_name is not None: @@ -403,7 +407,7 @@ def _infer_from_pipeline( max_tokens: Optional[int] = None, apply_chat_template: bool = False, chat_template: Optional[str] = None, - chat_template_kwargs: Optional[dict] = dict(tokenize=False, add_generation_prompt=True), + chat_template_kwargs: Optional[dict] = None, **kwargs, ): @@ -460,7 +464,7 @@ def _infer_from_automodelcasual_lm( max_length: Optional[int] = 8192, # model-agnostic apply_chat_template: bool = False, chat_template: Optional[str] = None, - chat_template_kwargs: Optional[dict] = dict(tokenize=False, add_generation_prompt=True), + chat_template_kwargs: Optional[dict] = None, **kwargs, ): if not self.model: @@ -542,8 +546,8 @@ def infer_llm( # # Preprocessing, postprocessing and call for inference code # - def call(self, api_kwargs: Dict = {}, model_type: Optional[ModelType]= ModelType.UNDEFINED): - + def call(self, api_kwargs: Dict = None, model_type: Optional[ModelType]= ModelType.UNDEFINED): + api_kwargs = api_kwargs or dict() if "model" not in api_kwargs: raise ValueError("model must be specified in api_kwargs") @@ -596,9 +600,10 @@ def parse_chat_completion(self, completion: Any) -> str: def convert_inputs_to_api_kwargs( self, input: Any, # for retriever, it is a single query, - model_kwargs: dict = {}, + model_kwargs: dict = None, model_type: Optional[ModelType]= ModelType.UNDEFINED ) -> dict: + model_kwargs = model_kwargs or dict() final_model_kwargs = model_kwargs.copy() assert "model" in final_model_kwargs, "model must be specified" 
#messages = [{"role": "system", "content": input}] @@ -622,19 +627,19 @@ class TransformerRerankerModelClient(ModelClient): def __init__( self, model_name: Optional[str] = None, - tokenizer_kwargs: Optional[dict] = {}, - auto_model_kwargs: Optional[dict] = dict(), - auto_tokenizer_kwargs: Optional[dict] = dict(), + tokenizer_kwargs: Optional[dict] = None, + auto_model_kwargs: Optional[dict] = None, + auto_tokenizer_kwargs: Optional[dict] = None, auto_model: Optional[type] = AutoModelForSequenceClassification, auto_tokenizer: Optional[type] = AutoTokenizer, local_files_only: Optional[bool] = False ): self.auto_model = auto_model - self.auto_model_kwargs = auto_model_kwargs - self.auto_tokenizer_kwargs = auto_tokenizer_kwargs + self.auto_model_kwargs = auto_model_kwargs or dict() + self.auto_tokenizer_kwargs = auto_tokenizer_kwargs or dict() self.auto_tokenizer= auto_tokenizer self.model_name = model_name - self.tokenizer_kwargs = tokenizer_kwargs + self.tokenizer_kwargs = tokenizer_kwargs or dict() if "return_tensors" not in self.tokenizer_kwargs: self.tokenizer_kwargs["return_tensors"]= "pt" self.local_files_only = local_files_only @@ -706,8 +711,8 @@ def infer_reranker( # # Preprocessing, postprocessing and call for inference code # - def call(self, api_kwargs: Dict = {}): - + def call(self, api_kwargs: Dict = None): + api_kwargs = api_kwargs or dict() if "model" not in api_kwargs: raise ValueError("model must be specified in api_kwargs") @@ -738,9 +743,10 @@ def call(self, api_kwargs: Dict = {}): def convert_inputs_to_api_kwargs( self, input: Any, # for retriever, it is a single query, - model_kwargs: dict = {}, + model_kwargs: dict = None, model_type: ModelType = ModelType.UNDEFINED, ) -> dict: + model_kwargs = model_kwargs or dict() final_model_kwargs = model_kwargs.copy() assert "model" in final_model_kwargs, "model must be specified" From 6e5f109cad0972832a87c3c367908a5740ecef3a Mon Sep 17 00:00:00 2001 From: Alexandre Adjahossou Date: Wed, 11 Sep 2024 12:16:03 +0000 Subject: [PATCH 34/36] Removed dict args that were conflicting with @lru_cache. 
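functools.lru_cache builds its cache key by hashing every call argument, and dicts are unhashable, so passing auto_model_kwargs / auto_tokenizer_kwargs into the @lru_cache-decorated init_model raises "TypeError: unhashable type: 'dict'" as soon as the cache is consulted. init_model therefore now reads those kwargs from self (where __init__ already stores them) instead of taking them as parameters. A minimal standalone sketch of the failure mode follows; the function name and the kwargs dict are illustrative only, not the client code:

    from functools import lru_cache

    @lru_cache(None)
    def cached_init(model_name: str, model_kwargs: dict):
        # stand-in for an @lru_cache-decorated loader that takes a dict argument
        return model_name, model_kwargs

    try:
        cached_init("thenlper/gte-base", {"torch_dtype": "auto"})
    except TypeError as err:
        # lru_cache hashes the arguments to build its cache key, and dicts cannot be hashed
        print(err)  # unhashable type: 'dict'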
--- .../components/model_client/transformers_client.py | 10 ++-------- 1 file changed, 2 insertions(+), 8 deletions(-) diff --git a/adalflow/adalflow/components/model_client/transformers_client.py b/adalflow/adalflow/components/model_client/transformers_client.py index cfb354e9..482f50ad 100644 --- a/adalflow/adalflow/components/model_client/transformers_client.py +++ b/adalflow/adalflow/components/model_client/transformers_client.py @@ -141,8 +141,6 @@ def __init__( def init_sync_client(self): self.init_model( model_name=self.model_name, - auto_model_kwargs=self.auto_model_kwargs, - auto_tokenizer_kwargs=self.auto_tokenizer_kwargs, auto_model=self.auto_model, auto_tokenizer=self.auto_tokenizer, custom_model=self.custom_model, @@ -154,22 +152,18 @@ def init_sync_client(self): def init_model( self, model_name: Optional[str] = None, - auto_model_kwargs: Optional[dict] = None, - auto_tokenizer_kwargs: Optional[dict] = None, auto_model: Optional[type] = AutoModel, auto_tokenizer: Optional[type] = AutoTokenizer, custom_model: Optional[PreTrainedModel] = None, custom_tokenizer: Optional[PreTrainedTokenizer | PreTrainedTokenizerFast] = None ): - self.auto_model_kwargs = auto_model_kwargs or dict() - self.auto_tokenizer_kwargs = auto_tokenizer_kwargs or dict() try: if self.use_auto_model: self.model = auto_model.from_pretrained( model_name, local_files_only=self.local_files_only, - **auto_model_kwargs + **self.auto_model_kwargs ) else: self.model = custom_model @@ -178,7 +172,7 @@ def init_model( self.tokenizer = auto_tokenizer.from_pretrained( model_name, local_files_only=self.local_files_only, - **auto_tokenizer_kwargs + **self.auto_tokenizer_kwargs ) else: self.tokenizer = custom_tokenizer From 508079d20e3fa0cf552426e856888976699bb52f Mon Sep 17 00:00:00 2001 From: Alexandre Adjahossou Date: Wed, 11 Sep 2024 15:16:22 +0000 Subject: [PATCH 35/36] Added tests to check transformer_client compatibility with different models. 
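The module ends with a unittest.main() guard, so it can be run directly with `python adalflow/tests/test_transformers_models.py` once the optional model dependencies are installed. Condensed from the embedding tests below, a typical exercise of the client looks roughly like this (the model name and tokenizer kwargs are taken from the tests; treat it as a smoke-test sketch rather than reference usage):

    from adalflow.components.model_client.transformers_client import (
        TransformerEmbeddingModelClient,
    )

    embedding_model = "thenlper/gte-base"
    client = TransformerEmbeddingModelClient(
        model_name=embedding_model,
        auto_tokenizer_kwargs={
            "max_length": 512,
            "padding": True,
            "truncation": True,
            "return_tensors": "pt",
        },
    )
    api_kwargs = client.convert_inputs_to_api_kwargs(
        input="Hello world", model_kwargs={"model": embedding_model}
    )
    # the tests only print the output; they do not assert on its shape yet
    output = client.call(api_kwargs=api_kwargs)
    print(output)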
---
 adalflow/tests/test_transformers_models.py | 186 +++++++++++++++++++++
 1 file changed, 186 insertions(+)
 create mode 100644 adalflow/tests/test_transformers_models.py

diff --git a/adalflow/tests/test_transformers_models.py b/adalflow/tests/test_transformers_models.py
new file mode 100644
index 00000000..fc553f6e
--- /dev/null
+++ b/adalflow/tests/test_transformers_models.py
@@ -0,0 +1,186 @@
+"""This tests the new transformer_client's compatibility with several models hosted on HuggingFace."""
+import unittest
+import torch
+from adalflow.components.model_client.transformers_client import TransformerEmbeddingModelClient, TransformerLLMModelClient, TransformerRerankerModelClient
+from transformers import AutoModelForSequenceClassification
+
+class TestEmbeddingModels(unittest.TestCase):
+    def setUp(self) -> None:
+        self.test_input = "Hello world"
+        self.auto_tokenizer_kwargs = {
+            "max_length": 512,
+            "padding": True,
+            "truncation": True,
+            "return_tensors": 'pt'
+        }
+    def test_thenhelper_gte_base(self):
+        embedding_model = "thenlper/gte-base"
+        model_kwargs = {"model": embedding_model}
+
+        model_client = TransformerEmbeddingModelClient(
+            model_name=embedding_model,
+            auto_tokenizer_kwargs=self.auto_tokenizer_kwargs
+        )
+        print(
+            f"Testing model client with model {embedding_model}"
+        )
+        api_kwargs = model_client.convert_inputs_to_api_kwargs(input=self.test_input, model_kwargs=model_kwargs)
+        output = model_client.call(api_kwargs=api_kwargs)
+        print(output)
+
+    def test_jina_embeddings_V2_small_en(self):
+        embedding_model = "jinaai/jina-embeddings-v2-small-en"
+        model_kwargs = {"model": embedding_model}
+        model_client = TransformerEmbeddingModelClient(
+            model_name=embedding_model,
+            auto_tokenizer_kwargs=self.auto_tokenizer_kwargs
+        )
+        print(
+            f"Testing model client with model {embedding_model}"
+        )
+        api_kwargs = model_client.convert_inputs_to_api_kwargs(input=self.test_input, model_kwargs=model_kwargs)
+        output = model_client.call(api_kwargs=api_kwargs)
+        print(output)
+
+    def test_t5_small_standard_bahasa_cased(self):
+        embedding_model = "mesolitica/t5-small-standard-bahasa-cased"
+        model_kwargs = {"model": embedding_model}
+
+        # Subclass TransformerEmbeddingModelClient to adapt the class to an encoder-decoder architecture
+        class T5SmallStandardBahasaCased(TransformerEmbeddingModelClient):
+
+            def compute_model_outputs(self, batch_dict: dict, model) -> dict:
+                print(batch_dict)
+                with torch.no_grad():
+                    outputs = model.encoder(**batch_dict)
+                return outputs
+
+
+
+        model_client = T5SmallStandardBahasaCased(
+            model_name=embedding_model,
+            auto_tokenizer_kwargs=self.auto_tokenizer_kwargs
+        )
+        print(
+            f"Testing model client with model {embedding_model}"
+        )
+        api_kwargs = model_client.convert_inputs_to_api_kwargs(input=self.test_input, model_kwargs=model_kwargs)
+        output = model_client.call(api_kwargs=api_kwargs)
+        print(output)
+
+    def test_sentence_transformers_all_miniLM_L6_V2(self):
+        embedding_model = "sentence-transformers/all-MiniLM-L6-v2"
+        model_kwargs = {"model": embedding_model}
+
+        model_client = TransformerEmbeddingModelClient(
+            model_name=embedding_model,
+            auto_tokenizer_kwargs=self.auto_tokenizer_kwargs
+        )
+        print(
+            f"Testing model client with model {embedding_model}"
+        )
+        api_kwargs = model_client.convert_inputs_to_api_kwargs(input=self.test_input, model_kwargs=model_kwargs)
+        output = model_client.call(api_kwargs=api_kwargs)
+        print(output)
+
+class TestLLMModels(unittest.TestCase):
+    """This class has 'accelerate' as a dependency for both tests.
+    You might need to run the following command in the terminal.
+    `pip install accelerate`
+    """
+    def setUp(self) -> None:
+        self.input_text = "Where is Brian?"
+        self.auto_tokenizer_kwargs = {}
+        self.model_kwargs = {
+            "temperature": 0.1,
+            "do_sample": True
+        }
+        self.tokenizer_decode_kwargs = {
+            "max_length": True,
+            "truncation": True,
+        }
+        self.prompt_kwargs = {
+            "input_str": "Where is Brian?", # test input
+        }
+
+    def test_roneneld_tiny_stories_1M(self):
+        self.model_kwargs["model"] = "roneneldan/TinyStories-1M"
+        model_client = TransformerLLMModelClient(
+            auto_tokenizer_kwargs=self.auto_tokenizer_kwargs,
+            local_files_only=False,
+            init_from="autoclass",
+        )
+        print(
+            f"Testing model client with model {'roneneldan/TinyStories-1M'}"
+        )
+        api_kwargs = model_client.convert_inputs_to_api_kwargs(input=self.input_text, model_kwargs=self.model_kwargs)
+        output = model_client.call(api_kwargs=api_kwargs)
+        print(output)
+
+    def test_nickypro_tinyllama_15m(self):
+        self.model_kwargs["model"] = "nickypro/tinyllama-15M"
+        model_client = TransformerLLMModelClient(
+            auto_tokenizer_kwargs=self.auto_tokenizer_kwargs,
+            local_files_only=False,
+            init_from="autoclass",
+        )
+        print(
+            f"Testing model client with model {'nickypro/tinyllama-15M'}"
+        )
+        api_kwargs = model_client.convert_inputs_to_api_kwargs(input=self.input_text, model_kwargs=self.model_kwargs)
+        output = model_client.call(api_kwargs=api_kwargs)
+        print(output)
+
+class TestRerankerModel(unittest.TestCase):
+    """This class has 'sentencepiece' as a dependency.
+    You might need to run the following command in the terminal.
+    `pip install transformers[sentencepiece]`
+    """
+    def setUp(self) -> None:
+        self.query = "Where is Brian."
+        self.documents = [
+            "Brian is in the Kitchen.",
+            "Brian loves Adalflow.",
+            "Adalflow is a python library, not some food inside the kitchen.",
+        ]
+        self.model_kwargs = {
+            "documents": self.documents,
+            "top_k": 2,
+        }
+
+    def test_jinja_reranker_V1_tiny_en(self):
+        self.model_kwargs["model"] = "jinaai/jina-reranker-v1-tiny-en"
+        model_client = TransformerRerankerModelClient(
+            tokenizer_kwargs={"padding": True},
+            auto_model_kwargs={"num_labels": 1}
+        )
+        print(
+            f"Testing model client with model jinaai/jina-reranker-v1-tiny-en"
+        )
+        api_kwargs = model_client.convert_inputs_to_api_kwargs(self.query, model_kwargs=self.model_kwargs)
+        output = model_client.call(api_kwargs)
+
+    def test_baai_bge_reranker_base(self):
+        self.model_kwargs["model"] = "BAAI/bge-reranker-base"
+        model_client = TransformerRerankerModelClient(
+            tokenizer_kwargs={"padding": True},
+        )
+        print(
+            f"Testing model client with model BAAI/bge-reranker-base"
+        )
+        api_kwargs = model_client.convert_inputs_to_api_kwargs(self.query, model_kwargs=self.model_kwargs)
+        output = model_client.call(api_kwargs)
+
+    def test_cross_encoder_ms_marco_minilm_L_2_V2(self):
+        self.model_kwargs["model"] = "cross-encoder/ms-marco-MiniLM-L-2-v2"
+        model_client = TransformerRerankerModelClient(
+            tokenizer_kwargs={"padding": True},
+        )
+        print(
+            f"Testing model client with model cross-encoder/ms-marco-MiniLM-L-2-v2"
+        )
+        api_kwargs = model_client.convert_inputs_to_api_kwargs(self.query, model_kwargs=self.model_kwargs)
+        output = model_client.call(api_kwargs)
+
+if __name__ == "__main__":
+    unittest.main(verbosity=6)
\ No newline at end of file

From c8fe73a94f74f441eda20e6c3ffe23b511ff7662 Mon Sep 17 00:00:00 2001
From: Alexandre Adjahossou
Date: Wed, 11 Sep 2024 15:42:48 +0000
Subject: [PATCH 36/36] Formatting.
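This patch only reflows the module; no behaviour change is intended. For reference, the reranker client touched here is driven by the tests added in the previous patch roughly as below (condensed from test_transformers_models.py; the model name, documents and top_k come straight from those tests):

    from adalflow.components.model_client.transformers_client import (
        TransformerRerankerModelClient,
    )

    client = TransformerRerankerModelClient(tokenizer_kwargs={"padding": True})
    model_kwargs = {
        "model": "BAAI/bge-reranker-base",
        "documents": [
            "Brian is in the Kitchen.",
            "Brian loves Adalflow.",
            "Adalflow is a python library, not some food inside the kitchen.",
        ],
        "top_k": 2,
    }
    api_kwargs = client.convert_inputs_to_api_kwargs(
        "Where is Brian.", model_kwargs=model_kwargs
    )
    # call() reranks the documents against the query and returns (top_k_indices, top_k_scores)
    top_k_indices, top_k_scores = client.call(api_kwargs)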
--- .../model_client/transformers_client.py | 242 ++++++++++-------- 1 file changed, 130 insertions(+), 112 deletions(-) diff --git a/adalflow/adalflow/components/model_client/transformers_client.py b/adalflow/adalflow/components/model_client/transformers_client.py index 482f50ad..7ef85ab0 100644 --- a/adalflow/adalflow/components/model_client/transformers_client.py +++ b/adalflow/adalflow/components/model_client/transformers_client.py @@ -25,13 +25,12 @@ AutoModel, AutoModelForCausalLM, AutoModelForSequenceClassification, - pipeline + pipeline, ) from os import getenv as get_env_variable transformers = safe_import( - OptionalPackages.TRANSFORMERS.value[0], - OptionalPackages.TRANSFORMERS.value[1] + OptionalPackages.TRANSFORMERS.value[0], OptionalPackages.TRANSFORMERS.value[1] ) torch = safe_import(OptionalPackages.TORCH.value[0], OptionalPackages.TORCH.value[1]) @@ -45,9 +44,15 @@ def average_pool(last_hidden_states: Tensor, attention_mask: list) -> Tensor: def mean_pooling(model_output: dict, attention_mask) -> Tensor: - token_embeddings = model_output[0] #First element of model_output contains all token embeddings - input_mask_expanded = attention_mask.unsqueeze(-1).expand(token_embeddings.size()).float() - return torch.sum(token_embeddings * input_mask_expanded, 1) / torch.clamp(input_mask_expanded.sum(1), min=1e-9) + token_embeddings = model_output[ + 0 + ] # First element of model_output contains all token embeddings + input_mask_expanded = ( + attention_mask.unsqueeze(-1).expand(token_embeddings.size()).float() + ) + return torch.sum(token_embeddings * input_mask_expanded, 1) / torch.clamp( + input_mask_expanded.sum(1), min=1e-9 + ) def get_device(): @@ -83,21 +88,24 @@ class TransformerEmbeddingModelClient(ModelClient): Find how to apply tokens here: https://huggingface.co/docs/hub/security-tokens Once you have a token and have access, put the token in the environment variable HF_TOKEN. 
""" + # # Model initialisation # def __init__( - self, - model_name: Optional[str] = None, - tokenizer_kwargs: Optional[dict] = None, - auto_model_kwargs: Optional[dict] = None, - auto_tokenizer_kwargs: Optional[dict] = None, - auto_model: Optional[type] = AutoModel, - auto_tokenizer: Optional[type] = AutoTokenizer, - local_files_only: Optional[bool] = False, - custom_model: Optional[PreTrainedModel] = None, - custom_tokenizer: Optional[Union[PreTrainedTokenizer, PreTrainedTokenizerFast]] = None - ): + self, + model_name: Optional[str] = None, + tokenizer_kwargs: Optional[dict] = None, + auto_model_kwargs: Optional[dict] = None, + auto_tokenizer_kwargs: Optional[dict] = None, + auto_model: Optional[type] = AutoModel, + auto_tokenizer: Optional[type] = AutoTokenizer, + local_files_only: Optional[bool] = False, + custom_model: Optional[PreTrainedModel] = None, + custom_tokenizer: Optional[ + Union[PreTrainedTokenizer, PreTrainedTokenizerFast] + ] = None, + ): super().__init__() self.model_name = model_name @@ -105,12 +113,12 @@ def __init__( self.auto_model_kwargs = auto_model_kwargs or dict() self.auto_tokenizer_kwargs = auto_tokenizer_kwargs or dict() if "return_tensors" not in self.tokenizer_kwargs: - self.tokenizer_kwargs["return_tensors"]= "pt" - self.auto_model=auto_model - self.auto_tokenizer=auto_tokenizer + self.tokenizer_kwargs["return_tensors"] = "pt" + self.auto_model = auto_model + self.auto_tokenizer = auto_tokenizer self.local_files_only = local_files_only - self.custom_model=custom_model - self.custom_tokenizer=custom_tokenizer + self.custom_model = custom_model + self.custom_tokenizer = custom_tokenizer # Check if there is conflicting arguments self.use_auto_model = auto_model is not None @@ -125,28 +133,32 @@ def __init__( elif (not self.use_auto_model) and (not self.use_cusom_model): raise ValueError("Need to specify either 'auto_model' or 'custom_model'.") elif self.use_auto_model and (not self.model_name_exit): - raise ValueError("When 'auto_model' is specified 'model_name' must be specified too.") - + raise ValueError( + "When 'auto_model' is specified 'model_name' must be specified too." + ) + ## arguments related to tokenizer if self.use_auto_tokenizer and self.use_cusom_tokenizer: raise Exception("Cannot specify 'auto_tokenizer' and 'custom_tokenizer'.") elif (not self.use_auto_tokenizer) and (not self.use_cusom_tokenizer): - raise Exception("Need to specify either'auto_tokenizer' and 'custom_tokenizer'.") + raise Exception( + "Need to specify either'auto_tokenizer' and 'custom_tokenizer'." + ) elif self.use_auto_tokenizer and (not self.model_name_exit): - raise ValueError("When 'auto_tokenizer' is specified 'model_name' must be specified too.") + raise ValueError( + "When 'auto_tokenizer' is specified 'model_name' must be specified too." 
+ ) self.init_sync_client() - def init_sync_client(self): self.init_model( model_name=self.model_name, auto_model=self.auto_model, auto_tokenizer=self.auto_tokenizer, custom_model=self.custom_model, - custom_tokenizer=self.custom_tokenizer - ) - + custom_tokenizer=self.custom_tokenizer, + ) @lru_cache(None) def init_model( @@ -155,16 +167,18 @@ def init_model( auto_model: Optional[type] = AutoModel, auto_tokenizer: Optional[type] = AutoTokenizer, custom_model: Optional[PreTrainedModel] = None, - custom_tokenizer: Optional[PreTrainedTokenizer | PreTrainedTokenizerFast] = None - ): + custom_tokenizer: Optional[ + PreTrainedTokenizer | PreTrainedTokenizerFast + ] = None, + ): try: if self.use_auto_model: self.model = auto_model.from_pretrained( model_name, local_files_only=self.local_files_only, - **self.auto_model_kwargs - ) + **self.auto_model_kwargs, + ) else: self.model = custom_model @@ -172,8 +186,8 @@ def init_model( self.tokenizer = auto_tokenizer.from_pretrained( model_name, local_files_only=self.local_files_only, - **self.auto_tokenizer_kwargs - ) + **self.auto_tokenizer_kwargs, + ) else: self.tokenizer = custom_tokenizer @@ -204,35 +218,39 @@ def infer_embedding( embeddings = embeddings.tolist() return embeddings - - def handle_input(self, input: Union[str, List[str], List[List[str]]]) -> Union[List[str], List[List[str]]]: + def handle_input( + self, input: Union[str, List[str], List[List[str]]] + ) -> Union[List[str], List[List[str]]]: if isinstance(input, str): input = [input] return input - - def tokenize_inputs(self, input: Union[str, List[str], List[List[str]]], kwargs: Optional[dict] = None) -> dict: + def tokenize_inputs( + self, + input: Union[str, List[str], List[List[str]]], + kwargs: Optional[dict] = None, + ) -> dict: kwargs = kwargs or dict() batch_dict = self.tokenizer(input, **kwargs) return batch_dict - def compute_model_outputs(self, batch_dict: dict, model: PreTrainedModel) -> dict: with torch.no_grad(): outputs = model(**batch_dict) return outputs - def compute_embeddings(self, outputs: dict, batch_dict: dict): - embeddings = mean_pooling( - outputs, batch_dict["attention_mask"] - ) + embeddings = mean_pooling(outputs, batch_dict["attention_mask"]) return embeddings # # Preprocessing, postprocessing and call for inference code # - def call(self, api_kwargs: Dict = None, model_type: Optional[ModelType]= ModelType.UNDEFINED) -> Union[List, Tensor]: + def call( + self, + api_kwargs: Dict = None, + model_type: Optional[ModelType] = ModelType.UNDEFINED, + ) -> Union[List, Tensor]: api_kwargs = api_kwargs or dict() if "model" not in api_kwargs: @@ -251,7 +269,6 @@ def call(self, api_kwargs: Dict = None, model_type: Optional[ModelType]= ModelTy # inference the model return self.infer_embedding(api_kwargs["input"]) - def parse_embedding_response(self, response: Union[List, Tensor]) -> EmbedderOutput: embeddings: List[Embedding] = [] for idx, emb in enumerate(response): @@ -259,12 +276,11 @@ def parse_embedding_response(self, response: Union[List, Tensor]) -> EmbedderOut response = EmbedderOutput(data=embeddings) return response - def convert_inputs_to_api_kwargs( self, input: Any, # for retriever, it is a single query, model_kwargs: dict = {}, - model_type: Optional[ModelType]= ModelType.UNDEFINED + model_type: Optional[ModelType] = ModelType.UNDEFINED, ) -> dict: final_model_kwargs = model_kwargs.copy() # if model_type == ModelType.EMBEDDER: @@ -281,6 +297,7 @@ class TransformerLLMModelClient(ModelClient): Find how to apply tokens here: 
https://huggingface.co/docs/hub/security-tokens Once you have a token and have access, put the token in the environment variable HF_TOKEN. """ + # # Model initialisation # @@ -297,7 +314,7 @@ def __init__( chat_template_kwargs: Optional[dict] = None, use_token: bool = False, torch_dtype: Optional[Any] = torch.bfloat16, - local_files_only: Optional[bool] = False + local_files_only: Optional[bool] = False, ): super().__init__() @@ -307,57 +324,55 @@ def __init__( self.auto_model_kwargs = auto_model_kwargs or dict() self.auto_tokenizer_kwargs = auto_tokenizer_kwargs or dict() if "return_tensors" not in self.tokenizer_kwargs: - self.tokenizer_kwargs["return_tensors"]= "pt" + self.tokenizer_kwargs["return_tensors"] = "pt" self.use_token = use_token self.torch_dtype = torch_dtype self.init_from = init_from self.apply_chat_template = apply_chat_template self.chat_template = chat_template - self.chat_template_kwargs = chat_template_kwargs or dict(tokenize=False, add_generation_prompt=True) + self.chat_template_kwargs = chat_template_kwargs or dict( + tokenize=False, add_generation_prompt=True + ) self.local_files_only = local_files_only self.model = None if model_name is not None: self.init_model(model_name=model_name) - def _check_token(self, token: str): if get_env_variable(token) is None: warnings.warn( f"{token} is not set. You may not be able to access the model." ) - def _get_token_if_relevant(self) -> Union[str, bool]: if self.use_token: self._check_token("HF_TOKEN") token = get_env_variable("HF_TOKEN") else: - token = False + token = False return token - def _init_from_pipeline(self): clean_device_cache() - token = self._get_token_if_relevant() # return a token string or False + token = self._get_token_if_relevant() # return a token string or False self.model = pipeline( "text-generation", model=self.model_name, torch_dtype=self.torch_dtype, device=get_device(), - token=token + token=token, ) - def _init_from_automodelcasual_lm(self): - token = self._get_token_if_relevant() # return a token str or False + token = self._get_token_if_relevant() # return a token str or False self.tokenizer = AutoTokenizer.from_pretrained( self.model_name, token=token, local_files_only=self.local_files_only, - **self.auto_tokenizer_kwargs + **self.auto_tokenizer_kwargs, ) self.model = AutoModelForCausalLM.from_pretrained( self.model_name, @@ -365,7 +380,7 @@ def _init_from_automodelcasual_lm(self): device_map="auto", token=token, local_files_only=self.local_files_only, - **self.auto_model_kwargs + **self.auto_model_kwargs, ) # Set pad token if it's not already set if self.tokenizer.pad_token is None: @@ -374,18 +389,19 @@ def _init_from_automodelcasual_lm(self): self.tokenizer.eos_token_id ) # ensure consistency in the model config - @lru_cache(None) def init_model(self, model_name: str): - log.debug(f"Loading model {model_name}") + log.debug(f"Loading model {model_name}") try: if self.init_from == "autoclass": self._init_from_automodelcasual_lm() elif self.init_from == "pipeline": self._init_from_pipeline() else: - raise ValueError("argument 'init_from' must be one of 'autoclass' or 'pipeline'.") + raise ValueError( + "argument 'init_from' must be one of 'autoclass' or 'pipeline'." 
+ ) except Exception as e: log.error(f"Error loading model {model_name}: {e}") raise e @@ -437,7 +453,7 @@ def _infer_from_pipeline( apply_chat_template=True, chat_template=chat_template, chat_template_kwargs=chat_template_kwargs, - ) + ) else: model_input = self._handle_input(messages) @@ -448,7 +464,6 @@ def _infer_from_pipeline( log.info(f"Outputs: {outputs}") return outputs - def _infer_from_automodelcasual_lm( self, *, @@ -469,27 +484,28 @@ def _infer_from_automodelcasual_lm( messages, apply_chat_template=True, chat_template_kwargs=chat_template_kwargs, - chat_template=chat_template - ) + chat_template=chat_template, + ) else: - model_input = self._handle_input(messages) + model_input = self._handle_input(messages) input_ids = self.tokenizer(model_input, **self.tokenizer_kwargs).to( get_device() ) - outputs_tokens = self.model.generate(**input_ids, max_length=max_length, max_new_tokens=max_tokens, **kwargs) + outputs_tokens = self.model.generate( + **input_ids, max_length=max_length, max_new_tokens=max_tokens, **kwargs + ) outputs = [] for output in outputs_tokens: outputs.append(self.tokenizer.decode(output)) return outputs - def _handle_input( - self, - messages: Sequence[Dict[str, str]], - apply_chat_template: bool = False, - chat_template_kwargs: dict = None, - chat_template: Optional[str] = None, - ) -> str: + self, + messages: Sequence[Dict[str, str]], + apply_chat_template: bool = False, + chat_template_kwargs: dict = None, + chat_template: Optional[str] = None, + ) -> str: if apply_chat_template: if chat_template is not None: @@ -497,7 +513,9 @@ def _handle_input( prompt = self.tokenizer.apply_chat_template( messages, **chat_template_kwargs ) - if ("tokenize" in chat_template_kwargs) and (chat_template_kwargs["tokenize"] == True): + if ("tokenize" in chat_template_kwargs) and ( + chat_template_kwargs["tokenize"] == True + ): prompt = self.tokenizer.decode(prompt, **self.tokenizer_decode_kwargs) return prompt else: @@ -506,7 +524,6 @@ def _handle_input( text = messages[-1]["content"] return text - def infer_llm( self, *, @@ -524,7 +541,7 @@ def infer_llm( apply_chat_template=self.apply_chat_template, chat_template=self.chat_template, chat_template_kwargs=self.chat_template_kwargs, - **kwargs + **kwargs, ) else: return self._infer_from_automodelcasual_lm( @@ -534,13 +551,17 @@ def infer_llm( apply_chat_template=self.apply_chat_template, chat_template=self.chat_template, chat_template_kwargs=self.chat_template_kwargs, - **kwargs + **kwargs, ) # # Preprocessing, postprocessing and call for inference code # - def call(self, api_kwargs: Dict = None, model_type: Optional[ModelType]= ModelType.UNDEFINED): + def call( + self, + api_kwargs: Dict = None, + model_type: Optional[ModelType] = ModelType.UNDEFINED, + ): api_kwargs = api_kwargs or dict() if "model" not in api_kwargs: raise ValueError("model must be specified in api_kwargs") @@ -548,7 +569,9 @@ def call(self, api_kwargs: Dict = None, model_type: Optional[ModelType]= ModelTy model_name = api_kwargs["model"] if (model_name != self.model_name) and (self.model_name is not None): # need to update the model_name - log.warning(f"The model passed in 'model_kwargs' is different that the one that has been previously initialised: Updating model from {self.model_name} to {model_name}.") + log.warning( + f"The model passed in 'model_kwargs' is different that the one that has been previously initialised: Updating model from {self.model_name} to {model_name}." 
+ ) self.model_name = model_name self.init_model(model_name=model_name) elif (model_name != self.model_name) and (self.model_name is None): @@ -556,11 +579,9 @@ def call(self, api_kwargs: Dict = None, model_type: Optional[ModelType]= ModelTy self.model_name = model_name self.init_model(model_name=model_name) - output = self.infer_llm(**api_kwargs) return output - def _parse_chat_completion_from_pipeline(self, completion: Any) -> str: text = completion[0]["generated_text"] @@ -573,12 +594,12 @@ def _parse_chat_completion_from_pipeline(self, completion: Any) -> str: else: return "" - - def _parse_chat_completion_from_automodelcasual_lm(self, completion: Any) -> GeneratorOutput: + def _parse_chat_completion_from_automodelcasual_lm( + self, completion: Any + ) -> GeneratorOutput: print(f"completion: {completion}") return completion[0] - def parse_chat_completion(self, completion: Any) -> str: try: if self.init_from == "pipeline": @@ -590,18 +611,19 @@ def parse_chat_completion(self, completion: Any) -> str: log.error(f"Error parsing chat completion: {e}") return GeneratorOutput(data=None, raw_response=str(completion), error=e) - def convert_inputs_to_api_kwargs( self, input: Any, # for retriever, it is a single query, model_kwargs: dict = None, - model_type: Optional[ModelType]= ModelType.UNDEFINED + model_type: Optional[ModelType] = ModelType.UNDEFINED, ) -> dict: model_kwargs = model_kwargs or dict() final_model_kwargs = model_kwargs.copy() assert "model" in final_model_kwargs, "model must be specified" - #messages = [{"role": "system", "content": input}] - messages = [{"role": "user", "content": input}] # Not sure, but it seems to make more sense + # messages = [{"role": "system", "content": input}] + messages = [ + {"role": "user", "content": input} + ] # Not sure, but it seems to make more sense final_model_kwargs["messages"] = messages return final_model_kwargs @@ -615,6 +637,7 @@ class TransformerRerankerModelClient(ModelClient): Find how to apply tokens here: https://huggingface.co/docs/hub/security-tokens Once you have a token and have access, put the token in the environment variable HF_TOKEN. 
""" + # # Model initialisation # @@ -626,32 +649,31 @@ def __init__( auto_tokenizer_kwargs: Optional[dict] = None, auto_model: Optional[type] = AutoModelForSequenceClassification, auto_tokenizer: Optional[type] = AutoTokenizer, - local_files_only: Optional[bool] = False + local_files_only: Optional[bool] = False, ): self.auto_model = auto_model self.auto_model_kwargs = auto_model_kwargs or dict() self.auto_tokenizer_kwargs = auto_tokenizer_kwargs or dict() - self.auto_tokenizer= auto_tokenizer + self.auto_tokenizer = auto_tokenizer self.model_name = model_name self.tokenizer_kwargs = tokenizer_kwargs or dict() if "return_tensors" not in self.tokenizer_kwargs: - self.tokenizer_kwargs["return_tensors"]= "pt" + self.tokenizer_kwargs["return_tensors"] = "pt" self.local_files_only = local_files_only if model_name is not None: self.init_model(model_name=model_name) - def init_model(self, model_name: str): try: self.tokenizer = self.auto_tokenizer.from_pretrained( - self.model_name, - local_files_only=self.local_files_only, - **self.auto_tokenizer_kwargs + self.model_name, + local_files_only=self.local_files_only, + **self.auto_tokenizer_kwargs, ) self.model = self.auto_model.from_pretrained( - self.model_name, - local_files_only=self.local_files_only, - **self.auto_model_kwargs + self.model_name, + local_files_only=self.local_files_only, + **self.auto_model_kwargs, ) # Check device availability and set the device device = get_device() @@ -684,10 +706,7 @@ def infer_reranker( with torch.no_grad(): - inputs = self.tokenizer( - input, - **self.tokenizer_kwargs - ) + inputs = self.tokenizer(input, **self.tokenizer_kwargs) inputs = {k: v.to(self.device) for k, v in inputs.items()} scores = ( self.model(**inputs, return_dict=True) @@ -713,7 +732,9 @@ def call(self, api_kwargs: Dict = None): model_name = api_kwargs["model"] if (model_name != self.model_name) and (self.model_name is not None): # need to update the model_name - log.warning(f"The model passed in 'model_kwargs' is different that the one that has been previously initialised: Updating model from {self.model_name} to {model_name}.") + log.warning( + f"The model passed in 'model_kwargs' is different that the one that has been previously initialised: Updating model from {self.model_name} to {model_name}." + ) self.model_name = model_name self.init_model(model_name=model_name) elif (model_name != self.model_name) and (self.model_name is None): @@ -727,13 +748,10 @@ def call(self, api_kwargs: Dict = None): top_k = api_kwargs.pop("top_k") scores = self.infer_reranker(**api_kwargs) - top_k_indices, top_k_scores = get_top_k_indices_scores( - scores, top_k - ) + top_k_indices, top_k_scores = get_top_k_indices_scores(scores, top_k) log.warning(f"output: ({top_k_indices}, {top_k_scores})") return top_k_indices, top_k_scores - def convert_inputs_to_api_kwargs( self, input: Any, # for retriever, it is a single query,