From e8a690b4e3e57b51be9d70f21b15b1fc497bdfa4 Mon Sep 17 00:00:00 2001 From: Alexandre Adjahossou Date: Wed, 28 Aug 2024 18:21:21 +0000 Subject: [PATCH 01/36] DRAFT: merge TransformerClient & TransformerEmbedder into 1 class. --- .../model_client/transformers_client.py | 202 ++++++++++++++++++ 1 file changed, 202 insertions(+) diff --git a/adalflow/adalflow/components/model_client/transformers_client.py b/adalflow/adalflow/components/model_client/transformers_client.py index f681f23f..20ef1ce6 100644 --- a/adalflow/adalflow/components/model_client/transformers_client.py +++ b/adalflow/adalflow/components/model_client/transformers_client.py @@ -40,6 +40,208 @@ def average_pool(last_hidden_states: Tensor, attention_mask: Tensor) -> Tensor: return last_hidden.sum(dim=1) / attention_mask.sum(dim=1)[..., None] + + +# +# +# +# DRAFT +# +# +# +from transformers import PreTrainedModel, PreTrainedTokenizer, PreTrainedTokenizerFast + +def mean_pooling(model_output, attention_mask): + token_embeddings = model_output[0] #First element of model_output contains all token embeddings + input_mask_expanded = attention_mask.unsqueeze(-1).expand(token_embeddings.size()).float() + return torch.sum(token_embeddings * input_mask_expanded, 1) / torch.clamp(input_mask_expanded.sum(1), min=1e-9) + +class TransformerEmbeddingModelClient(ModelClient): + + # + # Model initialisation + # + def __init__( + self, + model_name: Optional[str] = None, + tokenizer_kwargs: Optional[dict] = dict(), + auto_model: Optional[type] = AutoModel, + auto_tokenizer: Optional[type] = AutoTokenizer, + custom_model: Optional[PreTrainedModel] = None, + custom_tokenizer: Optional[Union[PreTrainedTokenizer, PreTrainedTokenizerFast]] = None + ): + + super().__init__() + self.model_name = model_name + self.tokenizer_kwargs = tokenizer_kwargs + self.auto_model=auto_model, + self.auto_tokenizer=auto_tokenizer, + self.custom_model=custom_model, + self.custom_tokenizer=custom_tokenizer + + # Check if there is conflicting arguments + self.use_auto_model = auto_model is not None + self.use_auto_tokenizer = auto_tokenizer is not None + self.use_cusom_model = custom_model is not None + self.use_cusom_tokenizer = custom_tokenizer is not None + self.model_name_exit = model_name is not None + + ## arguments related to model + if self.use_auto_model and self.use_cusom_model: + raise ValueError("Cannot specify 'auto_model' and 'custom_model'.") + elif (not self.use_auto_model) and (not self.use_cusom_model): + raise ValueError("Need to specify either 'auto_model' or 'custom_model'.") + elif self.use_auto_model and (not self.model_name_exit): + raise ValueError("When 'auto_model' is specified 'model_name' must be specified too.") + + ## arguments related to tokenizer + if self.use_auto_tokenizer and self.use_cusom_tokenizer: + raise Exception("Cannot specify 'auto_tokenizer' and 'custom_tokenizer'.") + elif (not self.use_auto_tokenizer) and (not self.use_cusom_tokenizer): + raise Exception("Need to specify either'auto_tokenizer' and 'custom_tokenizer'.") + elif self.use_auto_tokenizer and (not self.model_name_exit): + raise ValueError("When 'auto_tokenizer' is specified 'model_name' must be specified too.") + + self.init_sync_client() + + def init_sync_client(self): + self.init_model( + model_name=self.model_name, + auto_model=self.auto_model, + auto_tokenizer=self.auto_tokenizer, + custom_model=self.custom_model, + custom_tokenizer=self.custom_tokenizer + ) + + @lru_cache(None) + def init_model( + self, + model_name: Optional[str] = None, + auto_model: 
Optional[type] = AutoModel, + auto_tokenizer: Optional[type] = AutoTokenizer, + custom_model: Optional[PreTrainedModel] = None, + custom_tokenizer: Optional[PreTrainedTokenizer | PreTrainedTokenizerFast] = None + ): + + try: + if self.use_auto_model: + self.model = auto_model.from_pretrained(model_name) + else: + self.model = custom_model + + if self.use_auto_tokenizer: + self.tokenizer = auto_tokenizer.from_pretrained(model_name) + else: + self.tokenizer = custom_tokenizer + + log.info(f"Done loading model {model_name}") + + except Exception as e: + log.error(f"Error loading model {model_name}: {e}") + raise e + + # + # Inference code + # + def infer_embedding( + self, + input=Union[str, List[str], List[List[str]]], + tolist: bool = True, + ): + model = self.model + + self.handle_input(input) + batch_dict = self.tokenize_inputs(input, kwargs=self.tokenizer_kwargs) + outputs = self.compute_model_outputs(batch_dict, model) + embeddings = self.compute_embeddings(outputs, batch_dict) + + # normalize embeddings + embeddings = F.normalize(embeddings, p=2, dim=1) + if tolist: + embeddings = embeddings.tolist() + return embeddings + + def handle_input(self, input: Union[str, List[str], List[List[str]]]): + if isinstance(input, str): + input = [input] + return input + + def tokenize_inputs(self, input, kwargs: Optional[dict] = dict()): + batch_dict = self.tokenizer(input, **kwargs) + return batch_dict + + def compute_model_outputs(self, batch_dict, model): + with torch.no_grad(): + outputs = model(**batch_dict) + return outputs + + def compute_embeddings(self, outputs, batch_dict): + embeddings = mean_pooling( + outputs, batch_dict["attention_mask"] + ) + return embeddings + """ + def __call__(self, **kwargs): + if "model" not in kwargs: + raise ValueError("model is required") + + if "mock" in kwargs and kwargs["mock"]: + import numpy as np + + embeddings = np.array([np.random.rand(768).tolist()]) + return embeddings + + # inference the model + return self.infer_embedding(kwargs["input"]) + """ + + # + # Preprocessing, postprocessing and call for inference code + # + def call(self, api_kwargs: Dict = {}, model_type: ModelType = ModelType.UNDEFINED): + + # I don't think it is useful anymore + # if "model" not in api_kwargs: + # raise ValueError("model must be specified in api_kwargs") + if ( + model_type == ModelType.EMBEDDER + # and "model" in api_kwargs + ): + if "mock" in api_kwargs and api_kwargs["mock"]: + import numpy as np + + embeddings = np.array([np.random.rand(768).tolist()]) + return embeddings + + # inference the model + return self.infer_embedding(api_kwargs["input"]) + + def parse_embedding_response(self, response: Any) -> EmbedderOutput: + embeddings: List[Embedding] = [] + for idx, emb in enumerate(response): + embeddings.append(Embedding(index=idx, embedding=emb)) + response = EmbedderOutput(data=embeddings) + return response + + def convert_inputs_to_api_kwargs( + self, + input: Any, # for retriever, it is a single query, + model_kwargs: dict = {}, + model_type: ModelType = ModelType.UNDEFINED, + ) -> dict: + final_model_kwargs = model_kwargs.copy() + if model_type == ModelType.EMBEDDER: + final_model_kwargs["input"] = input + return final_model_kwargs + +# +# +# +# END OF DRAFT +# +# +# + # TODO: provide a standard api for embedding and chat models used in local model SDKs class TransformerEmbedder: """Local model SDK for transformers. 
From 123396cbfcbb95a2f6cf7b3c0b7ec5f73053e161 Mon Sep 17 00:00:00 2001 From: Alexandre Adjahossou Date: Thu, 29 Aug 2024 12:26:31 +0000 Subject: [PATCH 02/36] Fixed typo. --- .../adalflow/components/model_client/transformers_client.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/adalflow/adalflow/components/model_client/transformers_client.py b/adalflow/adalflow/components/model_client/transformers_client.py index 20ef1ce6..4f1825b1 100644 --- a/adalflow/adalflow/components/model_client/transformers_client.py +++ b/adalflow/adalflow/components/model_client/transformers_client.py @@ -74,9 +74,9 @@ def __init__( super().__init__() self.model_name = model_name self.tokenizer_kwargs = tokenizer_kwargs - self.auto_model=auto_model, - self.auto_tokenizer=auto_tokenizer, - self.custom_model=custom_model, + self.auto_model=auto_model + self.auto_tokenizer=auto_tokenizer + self.custom_model=custom_model self.custom_tokenizer=custom_tokenizer # Check if there is conflicting arguments From 21217741865f90132cd9699f6b65bdbd03e61381 Mon Sep 17 00:00:00 2001 From: Alexandre Adjahossou Date: Thu, 29 Aug 2024 14:26:56 +0000 Subject: [PATCH 03/36] Added type hints to signatures + removed now useless model_type. --- .../model_client/transformers_client.py | 57 +++++++------------ 1 file changed, 21 insertions(+), 36 deletions(-) diff --git a/adalflow/adalflow/components/model_client/transformers_client.py b/adalflow/adalflow/components/model_client/transformers_client.py index 4f1825b1..5dadc4d2 100644 --- a/adalflow/adalflow/components/model_client/transformers_client.py +++ b/adalflow/adalflow/components/model_client/transformers_client.py @@ -35,7 +35,7 @@ log = logging.getLogger(__name__) -def average_pool(last_hidden_states: Tensor, attention_mask: Tensor) -> Tensor: +def average_pool(last_hidden_states: Tensor, attention_mask: list) -> Tensor: last_hidden = last_hidden_states.masked_fill(~attention_mask[..., None].bool(), 0.0) return last_hidden.sum(dim=1) / attention_mask.sum(dim=1)[..., None] @@ -51,7 +51,7 @@ def average_pool(last_hidden_states: Tensor, attention_mask: Tensor) -> Tensor: # from transformers import PreTrainedModel, PreTrainedTokenizer, PreTrainedTokenizerFast -def mean_pooling(model_output, attention_mask): +def mean_pooling(model_output: dict, attention_mask) -> Tensor: token_embeddings = model_output[0] #First element of model_output contains all token embeddings input_mask_expanded = attention_mask.unsqueeze(-1).expand(token_embeddings.size()).float() return torch.sum(token_embeddings * input_mask_expanded, 1) / torch.clamp(input_mask_expanded.sum(1), min=1e-9) @@ -147,7 +147,7 @@ def infer_embedding( self, input=Union[str, List[str], List[List[str]]], tolist: bool = True, - ): + ) -> Union[List, Tensor]: model = self.model self.handle_input(input) @@ -161,62 +161,48 @@ def infer_embedding( embeddings = embeddings.tolist() return embeddings - def handle_input(self, input: Union[str, List[str], List[List[str]]]): + def handle_input(self, input: Union[str, List[str], List[List[str]]]) -> Union[List[str], List[List[str]]]: if isinstance(input, str): input = [input] return input - def tokenize_inputs(self, input, kwargs: Optional[dict] = dict()): + def tokenize_inputs(self, input: Union[str, List[str], List[List[str]]], kwargs: Optional[dict] = dict()) -> dict: batch_dict = self.tokenizer(input, **kwargs) return batch_dict - def compute_model_outputs(self, batch_dict, model): + def compute_model_outputs(self, batch_dict: dict, model: PreTrainedModel) -> 
dict: with torch.no_grad(): outputs = model(**batch_dict) return outputs - def compute_embeddings(self, outputs, batch_dict): + def compute_embeddings(self, outputs: dict, batch_dict: dict): embeddings = mean_pooling( outputs, batch_dict["attention_mask"] ) return embeddings - """ - def __call__(self, **kwargs): - if "model" not in kwargs: - raise ValueError("model is required") - - if "mock" in kwargs and kwargs["mock"]: - import numpy as np - - embeddings = np.array([np.random.rand(768).tolist()]) - return embeddings - - # inference the model - return self.infer_embedding(kwargs["input"]) - """ # # Preprocessing, postprocessing and call for inference code # - def call(self, api_kwargs: Dict = {}, model_type: ModelType = ModelType.UNDEFINED): + def call(self, api_kwargs: Dict = {}) -> Union[List, Tensor]: # I don't think it is useful anymore # if "model" not in api_kwargs: # raise ValueError("model must be specified in api_kwargs") - if ( - model_type == ModelType.EMBEDDER - # and "model" in api_kwargs - ): - if "mock" in api_kwargs and api_kwargs["mock"]: - import numpy as np + # if ( + # model_type == ModelType.EMBEDDER + # # and "model" in api_kwargs + # ): + if "mock" in api_kwargs and api_kwargs["mock"]: + import numpy as np - embeddings = np.array([np.random.rand(768).tolist()]) - return embeddings + embeddings = np.array([np.random.rand(768).tolist()]) + return embeddings # inference the model return self.infer_embedding(api_kwargs["input"]) - def parse_embedding_response(self, response: Any) -> EmbedderOutput: + def parse_embedding_response(self, response: Union[List, Tensor]) -> EmbedderOutput: embeddings: List[Embedding] = [] for idx, emb in enumerate(response): embeddings.append(Embedding(index=idx, embedding=emb)) @@ -226,13 +212,12 @@ def parse_embedding_response(self, response: Any) -> EmbedderOutput: def convert_inputs_to_api_kwargs( self, input: Any, # for retriever, it is a single query, - model_kwargs: dict = {}, - model_type: ModelType = ModelType.UNDEFINED, + model_kwargs: dict = {} ) -> dict: final_model_kwargs = model_kwargs.copy() - if model_type == ModelType.EMBEDDER: - final_model_kwargs["input"] = input - return final_model_kwargs + # if model_type == ModelType.EMBEDDER: + final_model_kwargs["input"] = input + return final_model_kwargs # # From e2023b2984d9891f69b33f32ca39efcb185f6392 Mon Sep 17 00:00:00 2001 From: Alexandre Adjahossou Date: Thu, 29 Aug 2024 14:27:21 +0000 Subject: [PATCH 04/36] Removed now useless model_types. --- adalflow/adalflow/core/embedder.py | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/adalflow/adalflow/core/embedder.py b/adalflow/adalflow/core/embedder.py index 89aac0c5..1ce131b1 100644 --- a/adalflow/adalflow/core/embedder.py +++ b/adalflow/adalflow/core/embedder.py @@ -36,7 +36,6 @@ class Embedder(Component): - Use ``BatchEmbedder`` for automatically batching input of large size, larger than 100. 
""" - model_type: ModelType = ModelType.EMBEDDER model_client: ModelClient output_processors: Optional[Component] @@ -100,8 +99,7 @@ def _pre_call( # step 2: convert the input to the api_kwargs api_kwargs = self.model_client.convert_inputs_to_api_kwargs( input=input, - model_kwargs=composed_model_kwargs, - model_type=self.model_type, + model_kwargs=composed_model_kwargs ) log.debug(f"api_kwargs: {api_kwargs}") return api_kwargs @@ -140,7 +138,7 @@ def call( response = None try: response = self.model_client.call( - api_kwargs=api_kwargs, model_type=self.model_type + api_kwargs=api_kwargs ) except Exception as e: log.error(f"Error calling the model: {e}") From a93bd6a68311d726ed62af82d4b1b74cb33e565a Mon Sep 17 00:00:00 2001 From: Alexandre Adjahossou Date: Thu, 29 Aug 2024 14:27:50 +0000 Subject: [PATCH 05/36] Added test for TransformerEmbeddingModelClient execution. --- adalflow/tests/test_transformer_client.py | 129 +++++++++++++++------- 1 file changed, 92 insertions(+), 37 deletions(-) diff --git a/adalflow/tests/test_transformer_client.py b/adalflow/tests/test_transformer_client.py index d8562454..86111281 100644 --- a/adalflow/tests/test_transformer_client.py +++ b/adalflow/tests/test_transformer_client.py @@ -1,55 +1,110 @@ import unittest import torch - +from adalflow.components.model_client.transformers_client import TransformerEmbeddingModelClient +from adalflow.core.types import ModelType +from adalflow.core import Embedder # Set the number of threads for PyTorch, avoid segementation fault torch.set_num_threads(1) torch.set_num_interop_threads(1) -class TestTransformerModelClient(unittest.TestCase): +class TestTransformerEmbeddingModelClient(unittest.TestCase): def setUp(self) -> None: - self.query = "what is panda?" self.documents = [ "The giant panda (Ailuropoda melanoleuca), sometimes called a panda bear or simply panda, is a bear species endemic to China.", "The red panda (Ailurus fulgens), also called the lesser panda, the red bear-cat, and the red cat-bear, is a mammal native to the eastern Himalayas and southwestern China.", ] - # def test_transformer_embedder(self): - # transformer_embedder_model = "thenlper/gte-base" - # transformer_embedder_model_component = TransformerEmbedder( - # model_name=transformer_embedder_model - # ) - # print( - # f"Testing transformer embedder with model {transformer_embedder_model_component}" - # ) - # print("Testing transformer embedder") - # output = transformer_embedder_model_component( - # model=transformer_embedder_model, input="Hello world" - # ) - # print(output) - - # def test_transformer_client(self): - # transformer_client = TransformersClient() - # print("Testing transformer client") - # # run the model - # kwargs = { - # "model": "thenlper/gte-base", - # # "mock": False, - # } - # api_kwargs = transformer_client.convert_inputs_to_api_kwargs( - # input="Hello world", - # model_kwargs=kwargs, - # model_type=ModelType.EMBEDDER, - # ) - # # print(api_kwargs) - # output = transformer_client.call( - # api_kwargs=api_kwargs, model_type=ModelType.EMBEDDER - # ) - - # # print(transformer_client) - # # print(output) + def test_execution(self): + test_input = "Hello word" + embedding_model = "thenlper/gte-base" + model_kwargs = {"model": embedding_model} + tokenizer_kwargs = { + "max_length": 512, + "padding": True, + "truncation": True, + "return_tensors": 'pt' + } + model_client = TransformerEmbeddingModelClient( + model_name=embedding_model, + tokenizer_kwargs=tokenizer_kwargs + ) + print( + f"Testing model client with model 
{embedding_model}" + ) + api_kwargs = model_client.convert_inputs_to_api_kwargs(input=test_input, model_kwargs=model_kwargs) + output = model_client.call(api_kwargs=api_kwargs) + print(output) + + def test_integration_with_embedder(self): + + test_input = "Hello word" + embedding_model = "thenlper/gte-base" + model_kwargs = {"model": embedding_model} + tokenizer_kwargs = { + "max_length": 512, + "padding": True, + "truncation": True, + "return_tensors": 'pt' + } + model_client = TransformerEmbeddingModelClient( + model_name=embedding_model, + tokenizer_kwargs=tokenizer_kwargs + ) + print( + f"Testing model client with model {embedding_model}" + ) + embedder = Embedder(model_client=model_client, + model_kwargs=model_kwargs + ) + output = embedder(test_input) + print(output) + +# class TestTransformerModelClient(unittest.TestCase): +# def setUp(self) -> None: + +# self.query = "what is panda?" +# self.documents = [ +# "The giant panda (Ailuropoda melanoleuca), sometimes called a panda bear or simply panda, is a bear species endemic to China.", +# "The red panda (Ailurus fulgens), also called the lesser panda, the red bear-cat, and the red cat-bear, is a mammal native to the eastern Himalayas and southwestern China.", +# ] + +# def test_transformer_embedder(self): +# transformer_embedder_model = "thenlper/gte-base" +# transformer_embedder_model_component = TransformerEmbedder( +# model_name=transformer_embedder_model +# ) +# print( +# f"Testing transformer embedder with model {transformer_embedder_model_component}" +# ) +# print("Testing transformer embedder") +# output = transformer_embedder_model_component( +# model=transformer_embedder_model, input="Hello world" +# ) +# print(output) + +# def test_transformer_client(self): +# transformer_client = TransformersClient() +# print("Testing transformer client") +# # run the model +# kwargs = { +# "model": "thenlper/gte-base", +# # "mock": False, +# } +# api_kwargs = transformer_client.convert_inputs_to_api_kwargs( +# input="Hello world", +# model_kwargs=kwargs, +# model_type=ModelType.EMBEDDER, +# ) +# # print(api_kwargs) +# output = transformer_client.call( +# api_kwargs=api_kwargs, model_type=ModelType.EMBEDDER +# ) + + # print(transformer_client) + # print(output) # def test_transformer_reranker(self): # transformer_reranker_model = "BAAI/bge-reranker-base" From 424cbfb130e83131809ce6f9e407b6bcaa714cc4 Mon Sep 17 00:00:00 2001 From: Alexandre Adjahossou Date: Tue, 3 Sep 2024 15:18:51 +0000 Subject: [PATCH 06/36] Changed my mind. 
--- .../adalflow/components/model_client/transformers_client.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/adalflow/adalflow/components/model_client/transformers_client.py b/adalflow/adalflow/components/model_client/transformers_client.py index 5dadc4d2..5a0f544a 100644 --- a/adalflow/adalflow/components/model_client/transformers_client.py +++ b/adalflow/adalflow/components/model_client/transformers_client.py @@ -186,9 +186,9 @@ def compute_embeddings(self, outputs: dict, batch_dict: dict): # def call(self, api_kwargs: Dict = {}) -> Union[List, Tensor]: + if "model" not in api_kwargs: + raise ValueError("model must be specified in api_kwargs") # I don't think it is useful anymore - # if "model" not in api_kwargs: - # raise ValueError("model must be specified in api_kwargs") # if ( # model_type == ModelType.EMBEDDER # # and "model" in api_kwargs From ef2783d47bd8c96e10b22d9660a888e39b70098a Mon Sep 17 00:00:00 2001 From: Alexandre Adjahossou Date: Tue, 3 Sep 2024 17:12:19 +0000 Subject: [PATCH 07/36] Changed my mind. removing model type might introduce issues. --- adalflow/adalflow/core/embedder.py | 1 + 1 file changed, 1 insertion(+) diff --git a/adalflow/adalflow/core/embedder.py b/adalflow/adalflow/core/embedder.py index 1ce131b1..7dae1ca7 100644 --- a/adalflow/adalflow/core/embedder.py +++ b/adalflow/adalflow/core/embedder.py @@ -36,6 +36,7 @@ class Embedder(Component): - Use ``BatchEmbedder`` for automatically batching input of large size, larger than 100. """ + model_type: ModelType = ModelType.EMBEDDER model_client: ModelClient output_processors: Optional[Component] From 3601445968b198b0bfef64b58b31c11cf721084f Mon Sep 17 00:00:00 2001 From: Alexandre Adjahossou Date: Tue, 3 Sep 2024 17:54:51 +0000 Subject: [PATCH 08/36] Removed now useless argument. --- adalflow/adalflow/core/embedder.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/adalflow/adalflow/core/embedder.py b/adalflow/adalflow/core/embedder.py index 7dae1ca7..c43518df 100644 --- a/adalflow/adalflow/core/embedder.py +++ b/adalflow/adalflow/core/embedder.py @@ -168,7 +168,7 @@ async def acall( response = None try: response = await self.model_client.acall( - api_kwargs=api_kwargs, model_type=self.model_type + api_kwargs=api_kwargs ) except Exception as e: log.error(f"Error calling the model: {e}") From 3cdab7b4916c0d6179a46c35dc2591fceafc9f21 Mon Sep 17 00:00:00 2001 From: Alexandre Adjahossou Date: Tue, 3 Sep 2024 18:02:13 +0000 Subject: [PATCH 09/36] Removed now useless arguments. 
--- adalflow/adalflow/core/generator.py | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/adalflow/adalflow/core/generator.py b/adalflow/adalflow/core/generator.py index 02765132..220420aa 100644 --- a/adalflow/adalflow/core/generator.py +++ b/adalflow/adalflow/core/generator.py @@ -312,7 +312,6 @@ def _pre_call(self, prompt_kwargs: Dict, model_kwargs: Dict) -> Dict[str, Any]: api_kwargs = self.model_client.convert_inputs_to_api_kwargs( input=prompt_str, model_kwargs=composed_model_kwargs, - model_type=self.model_type, ) return api_kwargs @@ -329,7 +328,7 @@ def _model_client_call(self, api_kwargs: Dict, use_cache: bool = False) -> Any: return cached_completion completion = self.model_client.call( - api_kwargs=api_kwargs, model_type=self.model_type + api_kwargs=api_kwargs ) # prepare cache if use_cache: @@ -799,7 +798,7 @@ async def acall( try: completion = await self.model_client.acall( - api_kwargs=api_kwargs, model_type=self.model_type + api_kwargs=api_kwargs ) except Exception as e: log.error(f"Error calling the model: {e}") From 2d1152fd7cf6a7e599c016a482f36c098d16e43e Mon Sep 17 00:00:00 2001 From: Alexandre Adjahossou Date: Tue, 3 Sep 2024 22:02:12 +0000 Subject: [PATCH 10/36] DRAFT: merge TransformerClient and TransformerLLM in 1 class. --- .../model_client/transformers_client.py | 268 ++++++++++++++++++ 1 file changed, 268 insertions(+) diff --git a/adalflow/adalflow/components/model_client/transformers_client.py b/adalflow/adalflow/components/model_client/transformers_client.py index 5a0f544a..b032c22a 100644 --- a/adalflow/adalflow/components/model_client/transformers_client.py +++ b/adalflow/adalflow/components/model_client/transformers_client.py @@ -28,9 +28,12 @@ from transformers import ( AutoTokenizer, AutoModel, + AutoModelForCausalLM, AutoModelForSequenceClassification, + pipeline ) +from os import getenv as get_env_variable log = logging.getLogger(__name__) @@ -219,6 +222,271 @@ def convert_inputs_to_api_kwargs( final_model_kwargs["input"] = input return final_model_kwargs + +class TransformerLLMModelClient(ModelClient): + + # + # Model initialisation + # + def __init__( + self, + model_name: Optional[str] = None, + init_from: Optional[str] = "autoclass", + use_token: bool = False, + torch_dtype: Optional[Any] = torch.bfloat16, + local_files_only: Optional[bool] = False + ): + super().__init__() + + self.model_name = model_name # current model to use + self.use_token = use_token + self.torch_dtype = torch_dtype + self.init_from = init_from + self.local_files_only = local_files_only + self.model = None + if model_name is not None: + self.init_model(model_name=model_name) + + def _check_token(self, token: str): + if get_env_variable(token) is None: + warnings.warn( + f"{token} is not set. You may not be able to access the model." 
+ ) + + def _get_token_if_relevant(self) -> Union[str, bool]: + if self.use_token: + self._check_token("HF_TOKEN") + token = get_env_variable("HF_TOKEN") + else: + token = False + return token + + def _init_from_pipeline(self): + + clean_device_cache() + token = self._get_token_if_relevant() # return a token string or False + self.model = pipeline( + "text-generation", + model=self.model_name, + torch_dtype=self.torch_dtype, + device=get_device(), + token=token + ) + + def _init_from_automodelcasual_lm(self): + + token = self._get_token_if_relevant() # return a token str or False + + self.tokenizer = AutoTokenizer.from_pretrained( + self.model_name, + token=token, + local_files_only=self.local_files_only + ) + self.model = AutoModelForCausalLM.from_pretrained( + self.model_name, + torch_dtype=self.torch_dtype, + device_map="auto", + token=token, + local_files_only=self.local_files_only + ) + + @lru_cache(None) + def init_model(self, model_name: str): + + log.debug(f"Loading model {model_name}") + try: + if self.init_from == "autoclass": + self._init_from_automodelcasual_lm() + elif self.init_from == "pipeline": + self._init_from_pipeline() + else: + raise ValueError("argument 'init_from' must be one of 'autoclass' or 'pipeline'.") + except Exception as e: + log.error(f"Error loading model {model_name}: {e}") + raise e + + # + # Inference code + # + def _infer_from_pipeline( + self, + *, + model: str, + messages: Sequence[Dict[str, str]], + max_tokens: Optional[int] = None, + apply_chat_template: bool = False, + chat_template: Optional[str] = None, + chat_template_kwargs: Optional[dict] = dict(tokenize=False, add_generation_prompt=True), + **kwargs, + ): + + if not self.model: + self.init_model(model_name=model) + + log.info( + f"Start to infer model {model}, messages: {messages}, kwargs: {kwargs}" + ) + # TO DO: add default values in doc + final_kwargs = { + "max_new_tokens": max_tokens or 256, + "do_sample": True, + "temperature": kwargs.get("temperature", 0.7), + "top_k": kwargs.get("top_k", 50), + "top_p": kwargs.get("top_p", 0.95), + } + if apply_chat_template: + model_input = self._handle_input( + messages, + apply_chat_template=True, + chat_template_kwargs=chat_template_kwargs, + chat_template=chat_template + ) + else: + model_input = self._handle_input(messages) + + outputs = self.model( + model_input, + **final_kwargs, + ) + log.info(f"Outputs: {outputs}") + return outputs + + def _infer_from_automodelcasual_lm( + self, + *, + model: str, + messages: Sequence[Dict[str, str]], + max_tokens: Optional[int] = None, + max_length: Optional[int] = 8192, # model-agnostic + apply_chat_template: bool = False, + chat_template: Optional[str] = None, + chat_template_kwargs: Optional[dict] = dict(tokenize=False, add_generation_prompt=True), + **kwargs, + ): + if not self.model: + self.init_model(model_name=model) + + if apply_chat_template: + model_input = self._handle_input( + messages, + apply_chat_template=True, + chat_template_kwargs=chat_template_kwargs, + chat_template=chat_template + ) + else: + model_input = self._handle_input(messages) + + input_ids = self.tokenizer(model_input, return_tensors="pt").to( + get_device() + ) + outputs_tokens = self.model.generate(**input_ids, max_length=max_length, max_new_tokens=max_tokens, **kwargs) + outputs = [] + for output in outputs_tokens: + outputs.append(self.tokenizer.decode(output)) + return outputs + + def _handle_input( + self, + messages: Sequence[Dict[str, str]], + apply_chat_template: bool = False, + chat_template_kwargs: dict = None, + 
chat_template: Optional[str] = None, + ) -> str: + + if apply_chat_template: + if chat_template is not None: + self.tokenizer.chat_template = chat_template + prompt = self.model.tokenizer.apply_chat_template( + messages, **chat_template_kwargs + ) + return prompt + else: + text = messages[-1]["content"] + return text + + def infer_llm( + self, + *, + model: str, + messages: Sequence[Dict[str, str]], + max_tokens: Optional[int] = None, + **kwargs, + ): + + if self.init_from == "pipeline": + return self._infer_from_pipeline( + model=model, messages=messages, max_tokens=max_tokens, **kwargs + ) + else: + return self._infer_from_automodelcasual_lm( + model=model, messages=messages, max_tokens=max_tokens, **kwargs + ) + + # + # Preprocessing, postprocessing and call for inference code + # + def call(self, api_kwargs: Dict = {}): + + log.debug(f"api_kwargs: {api_kwargs}") + + if "model" not in api_kwargs: + raise ValueError("model must be specified in api_kwargs") + + model_name = api_kwargs["model"] + if (model_name != self.model_name) and (self.model_name is not None): + # need to update the model_name + log.warning(f"The model passed in 'model_kwargs' is different that the one that has been previously initialised: Updating model from {self.model_name} to {model_name}.") + self.model_name = model_name + self.init_model(model_name=model_name) + elif (model_name != self.model_name) and (self.model_name is None): + # need to initialize the model for the first time + self.model_name = model_name + self.init_model(model_name=model_name) + + + output = self.infer_llm(**api_kwargs) + return output + + def _parse_chat_completion_from_pipeline(self, completion: Any) -> str: + + text = completion[0]["generated_text"] + + pattern = r"(?<=\|assistant\|>).*" + + match = re.search(pattern, text) + + if match: + text = match.group().strip().lstrip("\\n") + return text + else: + return "" + + def _parse_chat_completion_from_automodelcasual_lm(self, completion: Any) -> GeneratorOutput: + print(f"completion: {completion}") + return completion[0] + + def parse_chat_completion(self, completion: Any) -> str: + try: + if self.init_from == "pipeline": + output = self._parse_chat_completion_from_pipeline(completion) + else: + output = self._parse_chat_completion_from_automodelcasual_lm(completion) + return GeneratorOutput(data=output, raw_response=str(completion)) + except Exception as e: + log.error(f"Error parsing chat completion: {e}") + return GeneratorOutput(data=None, raw_response=str(completion), error=e) + + def convert_inputs_to_api_kwargs( + self, + input: Any, # for retriever, it is a single query, + model_kwargs: dict = {} + ) -> dict: + final_model_kwargs = model_kwargs.copy() + assert "model" in final_model_kwargs, "model must be specified" + messages = [{"role": "system", "content": input}] + final_model_kwargs["messages"] = messages + return final_model_kwargs + # # # From d839edaaac0bb6ffcd1db52d67abfc26ffbd6929 Mon Sep 17 00:00:00 2001 From: Alexandre Adjahossou Date: Wed, 4 Sep 2024 18:49:47 +0000 Subject: [PATCH 11/36] Multiline message: Added 'tokenizer_kwargs' in TransformerLLMModelClient constructor for more flexibility. Added chat template argument in constructor for more flexibility. Added pad token check. Added tokenizer in '_infer_from_pipeline()' when chat_template is used (required). Fixed _handle_input() for 'apply_chat_template'==True. Not sure: ficed message in convert_inputs_to_api_kwargs(). 
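To make the template handling concrete, this is roughly what the chat-template
path boils down to; the zephyr checkpoint is only an example of a model that
ships its own template and is not required by this patch:

    from transformers import AutoTokenizer

    tokenizer = AutoTokenizer.from_pretrained("HuggingFaceH4/zephyr-7b-beta")
    messages = [{"role": "user", "content": "Where is Brian?"}]
    # With tokenize=False this returns a plain prompt string wrapped in the
    # model's special tokens, ready for pipeline()/generate().
    prompt = tokenizer.apply_chat_template(
        messages, tokenize=False, add_generation_prompt=True
    )
    # With tokenize=True it returns token ids instead, which is why
    # _handle_input() now decodes them back to text before generation.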
--- .../model_client/transformers_client.py | 78 +++++++++++++++---- 1 file changed, 62 insertions(+), 16 deletions(-) diff --git a/adalflow/adalflow/components/model_client/transformers_client.py b/adalflow/adalflow/components/model_client/transformers_client.py index b032c22a..b604606d 100644 --- a/adalflow/adalflow/components/model_client/transformers_client.py +++ b/adalflow/adalflow/components/model_client/transformers_client.py @@ -231,7 +231,11 @@ class TransformerLLMModelClient(ModelClient): def __init__( self, model_name: Optional[str] = None, + tokenizer_kwargs: Optional[dict] = {}, init_from: Optional[str] = "autoclass", + apply_chat_template: bool = False, + chat_template: Optional[str] = None, + chat_template_kwargs: Optional[dict] = dict(tokenize=False, add_generation_prompt=True), use_token: bool = False, torch_dtype: Optional[Any] = torch.bfloat16, local_files_only: Optional[bool] = False @@ -239,9 +243,13 @@ def __init__( super().__init__() self.model_name = model_name # current model to use + self.tokenizer_kwargs = tokenizer_kwargs self.use_token = use_token self.torch_dtype = torch_dtype self.init_from = init_from + self.apply_chat_template = apply_chat_template + self.chat_template = chat_template + self.chat_template_kwargs = chat_template_kwargs self.local_files_only = local_files_only self.model = None if model_name is not None: @@ -280,7 +288,8 @@ def _init_from_automodelcasual_lm(self): self.tokenizer = AutoTokenizer.from_pretrained( self.model_name, token=token, - local_files_only=self.local_files_only + local_files_only=self.local_files_only, + **self.tokenizer_kwargs ) self.model = AutoModelForCausalLM.from_pretrained( self.model_name, @@ -289,6 +298,13 @@ def _init_from_automodelcasual_lm(self): token=token, local_files_only=self.local_files_only ) + # Set pad token if it's not already set + if self.tokenizer.pad_token is None: + self.tokenizer.pad_token = self.tokenizer.eos_token # common fallback + self.model.config.pad_token_id = ( + self.tokenizer.eos_token_id + ) # ensure consistency in the model config + @lru_cache(None) def init_model(self, model_name: str): @@ -309,15 +325,15 @@ def init_model(self, model_name: str): # Inference code # def _infer_from_pipeline( - self, - *, - model: str, - messages: Sequence[Dict[str, str]], - max_tokens: Optional[int] = None, - apply_chat_template: bool = False, - chat_template: Optional[str] = None, - chat_template_kwargs: Optional[dict] = dict(tokenize=False, add_generation_prompt=True), - **kwargs, + self, + *, + model: str, + messages: Sequence[Dict[str, str]], + max_tokens: Optional[int] = None, + apply_chat_template: bool = False, + chat_template: Optional[str] = None, + chat_template_kwargs: Optional[dict] = dict(tokenize=False, add_generation_prompt=True), + **kwargs, ): if not self.model: @@ -335,11 +351,24 @@ def _infer_from_pipeline( "top_p": kwargs.get("top_p", 0.95), } if apply_chat_template: + self.tokenizer = AutoTokenizer.from_pretrained( + self.model_name, + token=self._get_token_if_relevant(), + local_files_only=self.local_files_only, + **self.tokenizer_kwargs + ) + # Set pad token if it's not already set + if self.tokenizer.pad_token is None: + self.tokenizer.pad_token = self.tokenizer.eos_token # common fallback + self.model.config.pad_token_id = ( + self.tokenizer.eos_token_id + ) # ensure consistency in the model config + model_input = self._handle_input( messages, apply_chat_template=True, + chat_template=chat_template, chat_template_kwargs=chat_template_kwargs, - chat_template=chat_template ) 
else: model_input = self._handle_input(messages) @@ -396,10 +425,14 @@ def _handle_input( if apply_chat_template: if chat_template is not None: self.tokenizer.chat_template = chat_template - prompt = self.model.tokenizer.apply_chat_template( + prompt = self.tokenizer.apply_chat_template( messages, **chat_template_kwargs ) - return prompt + if ("tokenize" in chat_template_kwargs) and (chat_template_kwargs["tokenize"] == True): + prompt = self.tokenizer.decode(prompt) + return prompt + else: + return prompt else: text = messages[-1]["content"] return text @@ -415,11 +448,23 @@ def infer_llm( if self.init_from == "pipeline": return self._infer_from_pipeline( - model=model, messages=messages, max_tokens=max_tokens, **kwargs + model=model, + messages=messages, + max_tokens=max_tokens, + apply_chat_template=self.apply_chat_template, + chat_template=self.chat_template, + chat_template_kwargs=self.chat_template_kwargs, + **kwargs ) else: return self._infer_from_automodelcasual_lm( - model=model, messages=messages, max_tokens=max_tokens, **kwargs + model=model, + messages=messages, + max_tokens=max_tokens, + apply_chat_template=self.apply_chat_template, + chat_template=self.chat_template, + chat_template_kwargs=self.chat_template_kwargs, + **kwargs ) # @@ -483,7 +528,8 @@ def convert_inputs_to_api_kwargs( ) -> dict: final_model_kwargs = model_kwargs.copy() assert "model" in final_model_kwargs, "model must be specified" - messages = [{"role": "system", "content": input}] + #messages = [{"role": "system", "content": input}] + messages = [{"role": "user", "content": input}] # Not sure, but it seems to make more sense final_model_kwargs["messages"] = messages return final_model_kwargs From 7daf8ae37334687ac12f47eb48338cb60e7a262c Mon Sep 17 00:00:00 2001 From: Alexandre Adjahossou Date: Fri, 6 Sep 2024 11:59:18 +0000 Subject: [PATCH 12/36] Added ests for TransformerLLMModelClient. 
--- adalflow/tests/test_transformer_client.py | 85 ++++++++++++++++++++++- 1 file changed, 83 insertions(+), 2 deletions(-) diff --git a/adalflow/tests/test_transformer_client.py b/adalflow/tests/test_transformer_client.py index 86111281..193ecbc8 100644 --- a/adalflow/tests/test_transformer_client.py +++ b/adalflow/tests/test_transformer_client.py @@ -1,8 +1,8 @@ import unittest import torch -from adalflow.components.model_client.transformers_client import TransformerEmbeddingModelClient +from adalflow.components.model_client.transformers_client import TransformerEmbeddingModelClient, TransformerLLMModelClient from adalflow.core.types import ModelType -from adalflow.core import Embedder +from adalflow.core import Embedder, Generator # Set the number of threads for PyTorch, avoid segementation fault torch.set_num_threads(1) @@ -62,6 +62,87 @@ def test_integration_with_embedder(self): output = embedder(test_input) print(output) +class TestTransformerLLMModelClient(unittest.TestCase): + + def setUp(self) -> None: + + self.model_kwargs = { + "model": "roneneldan/TinyStories-1M", + "temperature": 0.1, + "do_sample": True + } + self.tokenizer_kwargs = { + "max_length": True, + "truncation": True, + } + self.prompt_kwargs = { + "input_str": "Where is Brian?", # test input + } + self.chat_template_kwargs = { + "tokenize": False, + "add_generation_prompt": False + } + self.chat_template = """ + {%- for message in messages %} + {%- if message['role'] == 'user' %} + {{- bos_token + '[INST] ' + message['content'].strip() + ' [/INST]' }} + {%- elif message['role'] == 'system' %} + {{- '<>\\n' + message['content'].strip() + '\\n<>\\n\\n' }} + {%- elif message['role'] == 'assistant' %} + {{- '[ASST] ' + message['content'] + ' [/ASST]' + eos_token }} + {%- endif %} + {%- endfor %} + """ # Reference: https://huggingface.co/docs/transformers/main/en/chat_templating#how-do-i-create-a-chat-template + + def test_exectution(self): + model_client = TransformerLLMModelClient( + tokenizer_kwargs=self.tokenizer_kwargs, + local_files_only=False, + init_from="autoclass", + apply_chat_template=True, + chat_template=self.chat_template, + chat_template_kwargs=self.chat_template_kwargs + ) + api_kwargs = model_client.convert_inputs_to_api_kwargs(input="Where is brian?", model_kwargs=self.model_kwargs) + output = model_client.call(api_kwargs=api_kwargs) + print(output) + + def test_integration_with_generator_autoclass(self): + model_client = TransformerLLMModelClient( + tokenizer_kwargs=self.tokenizer_kwargs, + local_files_only=False, + init_from="autoclass", + apply_chat_template=True, + chat_template=self.chat_template, + chat_template_kwargs=self.chat_template_kwargs + ) + generator = Generator( + model_client=model_client, + model_kwargs=self.model_kwargs, + # prompt_kwargs=prompt_kwargs, + # output_processors=JsonParser(), + ) + output = generator(prompt_kwargs=self.prompt_kwargs) + print(output) + + def test_integration_with_generator_pipeline(self): + model_client = TransformerLLMModelClient( + tokenizer_kwargs=self.tokenizer_kwargs, + local_files_only=False, + init_from="pipeline", + apply_chat_template=True, + chat_template=self.chat_template, + chat_template_kwargs=self.chat_template_kwargs + ) + generator = Generator( + model_client=model_client, + model_kwargs=self.model_kwargs, + # prompt_kwargs=prompt_kwargs, + # output_processors=JsonParser(), + ) + output = generator(prompt_kwargs=self.prompt_kwargs) + print(output) + # class TestTransformerModelClient(unittest.TestCase): # def setUp(self) -> None: From 
a4eb3bb3d70f43cc3a99af71fc22da473641b405 Mon Sep 17 00:00:00 2001 From: Alexandre Adjahossou Date: Fri, 6 Sep 2024 12:02:11 +0000 Subject: [PATCH 13/36] Removed temporary log. --- .../adalflow/components/model_client/transformers_client.py | 3 --- 1 file changed, 3 deletions(-) diff --git a/adalflow/adalflow/components/model_client/transformers_client.py b/adalflow/adalflow/components/model_client/transformers_client.py index b604606d..df98d4f9 100644 --- a/adalflow/adalflow/components/model_client/transformers_client.py +++ b/adalflow/adalflow/components/model_client/transformers_client.py @@ -404,7 +404,6 @@ def _infer_from_automodelcasual_lm( ) else: model_input = self._handle_input(messages) - input_ids = self.tokenizer(model_input, return_tensors="pt").to( get_device() ) @@ -472,8 +471,6 @@ def infer_llm( # def call(self, api_kwargs: Dict = {}): - log.debug(f"api_kwargs: {api_kwargs}") - if "model" not in api_kwargs: raise ValueError("model must be specified in api_kwargs") From 6a9c657d0f7ae031dea15ff00ad0c345f570c35c Mon Sep 17 00:00:00 2001 From: Alexandre Adjahossou Date: Fri, 6 Sep 2024 14:53:56 +0000 Subject: [PATCH 14/36] Changed my mind: added model_type back into call(). --- adalflow/adalflow/core/embedder.py | 3 ++- adalflow/adalflow/core/generator.py | 3 ++- 2 files changed, 4 insertions(+), 2 deletions(-) diff --git a/adalflow/adalflow/core/embedder.py b/adalflow/adalflow/core/embedder.py index c43518df..c8f069a9 100644 --- a/adalflow/adalflow/core/embedder.py +++ b/adalflow/adalflow/core/embedder.py @@ -139,7 +139,8 @@ def call( response = None try: response = self.model_client.call( - api_kwargs=api_kwargs + api_kwargs=api_kwargs, + model_type=self.model_type ) except Exception as e: log.error(f"Error calling the model: {e}") diff --git a/adalflow/adalflow/core/generator.py b/adalflow/adalflow/core/generator.py index 220420aa..a89eb34b 100644 --- a/adalflow/adalflow/core/generator.py +++ b/adalflow/adalflow/core/generator.py @@ -328,7 +328,8 @@ def _model_client_call(self, api_kwargs: Dict, use_cache: bool = False) -> Any: return cached_completion completion = self.model_client.call( - api_kwargs=api_kwargs + api_kwargs=api_kwargs, + model_type=self.model_type ) # prepare cache if use_cache: From cd1823b7a4f3dc4fe59d2c02e77c33914597a751 Mon Sep 17 00:00:00 2001 From: Alexandre Adjahossou Date: Fri, 6 Sep 2024 15:00:51 +0000 Subject: [PATCH 15/36] Changed my mind about model type. See prev commit. 
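In other words: the previous commit restored call sites such as

    self.model_client.call(api_kwargs=api_kwargs, model_type=self.model_type)

in Embedder and Generator, so the merged clients accept an optional model_type
(defaulting to ModelType.UNDEFINED) in call() and convert_inputs_to_api_kwargs()
and simply ignore it, staying signature-compatible with those call sites.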
--- .../components/model_client/transformers_client.py | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/adalflow/adalflow/components/model_client/transformers_client.py b/adalflow/adalflow/components/model_client/transformers_client.py index df98d4f9..14aa0a8e 100644 --- a/adalflow/adalflow/components/model_client/transformers_client.py +++ b/adalflow/adalflow/components/model_client/transformers_client.py @@ -187,7 +187,7 @@ def compute_embeddings(self, outputs: dict, batch_dict: dict): # # Preprocessing, postprocessing and call for inference code # - def call(self, api_kwargs: Dict = {}) -> Union[List, Tensor]: + def call(self, api_kwargs: Dict = {}, model_type: Optional[ModelType]= ModelType.UNDEFINED) -> Union[List, Tensor]: if "model" not in api_kwargs: raise ValueError("model must be specified in api_kwargs") @@ -215,7 +215,8 @@ def parse_embedding_response(self, response: Union[List, Tensor]) -> EmbedderOut def convert_inputs_to_api_kwargs( self, input: Any, # for retriever, it is a single query, - model_kwargs: dict = {} + model_kwargs: dict = {}, + model_type: Optional[ModelType]= ModelType.UNDEFINED ) -> dict: final_model_kwargs = model_kwargs.copy() # if model_type == ModelType.EMBEDDER: @@ -469,7 +470,7 @@ def infer_llm( # # Preprocessing, postprocessing and call for inference code # - def call(self, api_kwargs: Dict = {}): + def call(self, api_kwargs: Dict = {}, model_type: Optional[ModelType]= ModelType.UNDEFINED): if "model" not in api_kwargs: raise ValueError("model must be specified in api_kwargs") @@ -521,7 +522,8 @@ def parse_chat_completion(self, completion: Any) -> str: def convert_inputs_to_api_kwargs( self, input: Any, # for retriever, it is a single query, - model_kwargs: dict = {} + model_kwargs: dict = {}, + model_type: Optional[ModelType]= ModelType.UNDEFINED ) -> dict: final_model_kwargs = model_kwargs.copy() assert "model" in final_model_kwargs, "model must be specified" From 2bf711a48601bf446fcee74df1065cdc7ec0301e Mon Sep 17 00:00:00 2001 From: Alexandre Adjahossou Date: Mon, 9 Sep 2024 09:40:10 +0000 Subject: [PATCH 16/36] Ensured tokenizer_kwargs has 'return_tensors' set to 'pt' by default. --- .../adalflow/components/model_client/transformers_client.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/adalflow/adalflow/components/model_client/transformers_client.py b/adalflow/adalflow/components/model_client/transformers_client.py index 14aa0a8e..9f7b7b9f 100644 --- a/adalflow/adalflow/components/model_client/transformers_client.py +++ b/adalflow/adalflow/components/model_client/transformers_client.py @@ -77,6 +77,8 @@ def __init__( super().__init__() self.model_name = model_name self.tokenizer_kwargs = tokenizer_kwargs + if "return_tensors" not in self.tokenizer_kwargs: + self.tokenizer_kwargs["return_tensors"]= "pt" self.auto_model=auto_model self.auto_tokenizer=auto_tokenizer self.custom_model=custom_model @@ -245,6 +247,8 @@ def __init__( self.model_name = model_name # current model to use self.tokenizer_kwargs = tokenizer_kwargs + if "return_tensors" not in self.tokenizer_kwargs: + self.tokenizer_kwargs["return_tensors"]= "pt" self.use_token = use_token self.torch_dtype = torch_dtype self.init_from = init_from From 570c8b11ba537ac4a2c3fee02029c361e41c504a Mon Sep 17 00:00:00 2001 From: Alexandre Adjahossou Date: Mon, 9 Sep 2024 10:07:23 +0000 Subject: [PATCH 17/36] DRAFT: merge TransformerClient and TransformerReranker in 1 class. 
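A rough usage sketch of the merged reranker client; the model name, query, and
documents echo the old reranker tests, and the tokenizer settings are only an
illustrative choice:

    client = TransformerRerankerModelClient(
        model_name="BAAI/bge-reranker-base",
        tokenizer_kwargs={"padding": True, "truncation": True, "max_length": 512},
    )
    api_kwargs = client.convert_inputs_to_api_kwargs(
        input="what is panda?",  # the query
        model_kwargs={
            "model": "BAAI/bge-reranker-base",
            "documents": [
                "The giant panda is a bear species endemic to China.",
                "The red panda is a mammal native to the eastern Himalayas.",
            ],
            "top_k": 2,
        },
    )
    # call() pops "top_k", scores each (query, document) pair, and returns the
    # indices and scores of the top_k best-matching documents.
    top_k_indices, top_k_scores = client.call(api_kwargs=api_kwargs)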
--- .../model_client/transformers_client.py | 126 ++++++++++++++++++ 1 file changed, 126 insertions(+) diff --git a/adalflow/adalflow/components/model_client/transformers_client.py b/adalflow/adalflow/components/model_client/transformers_client.py index 9f7b7b9f..74197a78 100644 --- a/adalflow/adalflow/components/model_client/transformers_client.py +++ b/adalflow/adalflow/components/model_client/transformers_client.py @@ -536,6 +536,132 @@ def convert_inputs_to_api_kwargs( final_model_kwargs["messages"] = messages return final_model_kwargs + +class TransformerRerankerModelClient(ModelClient): + + # + # Model initialisation + # + def __init__( + self, + model_name: Optional[str] = None, + tokenizer_kwargs: Optional[dict] = {}, + local_files_only: Optional[bool] = False + ): + self.model_name = model_name + self.tokenizer_kwargs = tokenizer_kwargs + if "return_tensors" not in self.tokenizer_kwargs: + self.tokenizer_kwargs["return_tensors"]= "pt" + self.local_files_only = local_files_only + if model_name is not None: + self.init_model(model_name=model_name) + + def init_model(self, model_name: str): + try: + self.tokenizer = AutoTokenizer.from_pretrained( + self.model_name, + local_files_only=self.local_files_only, + **self.tokenizer_kwargs + ) + self.model = AutoModelForSequenceClassification.from_pretrained( + self.model_name, + local_files_only=self.local_files_only + ) + # Check device availability and set the device + device = get_device() + + # Move model to the selected device + self.device = device + self.model.to(device) + self.model.eval() + # register the model + log.info(f"Done loading model {model_name}") + + except Exception as e: + log.error(f"Error loading model {model_name}: {e}") + raise e + + # + # Inference code + # + + def infer_reranker( + self, + model: str, + query: str, + documents: List[str], + ) -> List[float]: + if not self.model: + self.init_model(model_name=model) + # convert the query and documents to pair input + input = [(query, doc) for doc in documents] + + with torch.no_grad(): + + inputs = self.tokenizer( + input, + **self.tokenizer_kwargs + ) + inputs = {k: v.to(self.device) for k, v in inputs.items()} + scores = ( + self.model(**inputs, return_dict=True) + .logits.view( + -1, + ) + .float() + ) + # apply sigmoid to get the scores + scores = F.sigmoid(scores) + + scores = scores.tolist() + return scores + + # + # Preprocessing, postprocessing and call for inference code + # + def call(self, api_kwargs: Dict = {}): + + if "model" not in api_kwargs: + raise ValueError("model must be specified in api_kwargs") + + model_name = api_kwargs["model"] + if (model_name != self.model_name) and (self.model_name is not None): + # need to update the model_name + log.warning(f"The model passed in 'model_kwargs' is different that the one that has been previously initialised: Updating model from {self.model_name} to {model_name}.") + self.model_name = model_name + self.init_model(model_name=model_name) + elif (model_name != self.model_name) and (self.model_name is None): + # need to initialize the model for the first time + self.model_name = model_name + self.init_model(model_name=model_name) + + assert "query" in api_kwargs, "query is required" + assert "documents" in api_kwargs, "documents is required" + assert "top_k" in api_kwargs, "top_k is required" + + top_k = api_kwargs.pop("top_k") + scores = self.infer_reranker(**api_kwargs) + top_k_indices, top_k_scores = get_top_k_indices_scores( + scores, top_k + ) + log.warning(f"output: ({top_k_indices}, {top_k_scores})") + 
return top_k_indices, top_k_scores + + def convert_inputs_to_api_kwargs( + self, + input: Any, # for retriever, it is a single query, + model_kwargs: dict = {}, + model_type: ModelType = ModelType.UNDEFINED, + ) -> dict: + final_model_kwargs = model_kwargs.copy() + + assert "model" in final_model_kwargs, "model must be specified" + assert "documents" in final_model_kwargs, "documents must be specified" + assert "top_k" in final_model_kwargs, "top_k must be specified" + final_model_kwargs["query"] = input + return final_model_kwargs + + # # # From 881cfb6847f43a572f6cc0752fbe3c9bf5433faf Mon Sep 17 00:00:00 2001 From: Alexandre Adjahossou Date: Mon, 9 Sep 2024 10:16:02 +0000 Subject: [PATCH 18/36] Commented out old classes. --- .../model_client/transformers_client.py | 1502 ++++++++--------- 1 file changed, 743 insertions(+), 759 deletions(-) diff --git a/adalflow/adalflow/components/model_client/transformers_client.py b/adalflow/adalflow/components/model_client/transformers_client.py index 74197a78..8d97afd8 100644 --- a/adalflow/adalflow/components/model_client/transformers_client.py +++ b/adalflow/adalflow/components/model_client/transformers_client.py @@ -43,15 +43,6 @@ def average_pool(last_hidden_states: Tensor, attention_mask: list) -> Tensor: return last_hidden.sum(dim=1) / attention_mask.sum(dim=1)[..., None] - - -# -# -# -# DRAFT -# -# -# from transformers import PreTrainedModel, PreTrainedTokenizer, PreTrainedTokenizerFast def mean_pooling(model_output: dict, attention_mask) -> Tensor: @@ -662,754 +653,747 @@ def convert_inputs_to_api_kwargs( return final_model_kwargs -# -# -# -# END OF DRAFT -# -# -# - -# TODO: provide a standard api for embedding and chat models used in local model SDKs -class TransformerEmbedder: - """Local model SDK for transformers. - - - There are two ways to run transformers: - (1) model and then run model inference - (2) Pipeline and then run pipeline inference - - This file demonstrates how to - (1) create a torch model inference component: TransformerEmbedder which equalize to OpenAI(), the SyncAPIClient - (2) Convert this model inference component to LightRAG API client: TransformersClient - - The is now just an exmplary component that initialize a certain model from transformers and run inference on it. - It is not tested on all transformer models yet. It might be necessary to write one for each model. 
- - References: - - transformers: https://huggingface.co/docs/transformers/en/index - - thenlper/gte-base model:https://huggingface.co/thenlper/gte-base - """ - - models: Dict[str, type] = {} - - def __init__(self, model_name: Optional[str] = "thenlper/gte-base"): - super().__init__() - - if model_name is not None: - self.init_model(model_name=model_name) - - @lru_cache(None) - def init_model(self, model_name: str): - try: - self.tokenizer = AutoTokenizer.from_pretrained(model_name) - self.model = AutoModel.from_pretrained(model_name) - # register the model - self.models[model_name] = self.model - log.info(f"Done loading model {model_name}") - - except Exception as e: - log.error(f"Error loading model {model_name}: {e}") - raise e - - def infer_gte_base_embedding( - self, - input=Union[str, List[str]], - tolist: bool = True, - ): - model = self.models.get("thenlper/gte-base", None) - if model is None: - # initialize the model - self.init_model("thenlper/gte-base") - - if isinstance(input, str): - input = [input] - # Tokenize the input texts - batch_dict = self.tokenizer( - input, max_length=512, padding=True, truncation=True, return_tensors="pt" - ) - outputs = model(**batch_dict) - embeddings = average_pool( - outputs.last_hidden_state, batch_dict["attention_mask"] - ) - # (Optionally) normalize embeddings - embeddings = F.normalize(embeddings, p=2, dim=1) - if tolist: - embeddings = embeddings.tolist() - return embeddings - - def __call__(self, **kwargs): - if "model" not in kwargs: - raise ValueError("model is required") - - if "mock" in kwargs and kwargs["mock"]: - import numpy as np - - embeddings = np.array([np.random.rand(768).tolist()]) - return embeddings - # load files and models, cache it for the next inference - model_name = kwargs["model"] - # inference the model - if model_name == "thenlper/gte-base": - return self.infer_gte_base_embedding(kwargs["input"]) - else: - raise ValueError(f"model {model_name} is not supported") - - -def get_device(): - # Check device availability and set the device - if torch.cuda.is_available(): - device = torch.device("cuda") - log.info("Using CUDA (GPU) for inference.") - elif torch.backends.mps.is_available(): - device = torch.device("mps") - log.info("Using MPS (Apple Silicon) for inference.") - else: - device = torch.device("cpu") - log.info("Using CPU for inference.") - - return device - - -def clean_device_cache(): - import torch - - if torch.has_mps: - torch.mps.empty_cache() - - torch.mps.set_per_process_memory_fraction(1.0) - - -class TransformerReranker: - __doc__ = r"""Local model SDK for a reranker model using transformers. - - References: - - model: https://huggingface.co/BAAI/bge-reranker-base - - paper: https://arxiv.org/abs/2309.07597 - - note: - If you are using Macbook M1 series chips, you need to ensure ``torch.device("mps")`` is set. 
- """ - models: Dict[str, type] = {} - - def __init__(self, model_name: Optional[str] = "BAAI/bge-reranker-base"): - self.model_name = model_name or "BAAI/bge-reranker-base" - if model_name is not None: - self.init_model(model_name=model_name) - - def init_model(self, model_name: str): - try: - self.tokenizer = AutoTokenizer.from_pretrained(model_name) - self.model = AutoModelForSequenceClassification.from_pretrained(model_name) - # Check device availability and set the device - device = get_device() - - # Move model to the selected device - self.device = device - self.model.to(device) - self.model.eval() - # register the model - self.models[model_name] = self.model # TODO: better model registration - log.info(f"Done loading model {model_name}") - - except Exception as e: - log.error(f"Error loading model {model_name}: {e}") - raise e - - def infer_bge_reranker_base( - self, - # input=List[Tuple[str, str]], # list of pairs of the query and the candidate - query: str, - documents: List[str], - ) -> List[float]: - model = self.models.get(self.model_name, None) - if model is None: - # initialize the model - self.init_model(self.model_name) - - # convert the query and documents to pair input - input = [(query, doc) for doc in documents] - - with torch.no_grad(): - - inputs = self.tokenizer( - input, - padding=True, - truncation=True, - return_tensors="pt", - max_length=512, - ) - inputs = {k: v.to(self.device) for k, v in inputs.items()} - scores = ( - model(**inputs, return_dict=True) - .logits.view( - -1, - ) - .float() - ) - # apply sigmoid to get the scores - scores = F.sigmoid(scores) - - scores = scores.tolist() - return scores - - def __call__(self, **kwargs): - r"""Ensure "model" and "input" are in the kwargs.""" - if "model" not in kwargs: - raise ValueError("model is required") - - # if "mock" in kwargs and kwargs["mock"]: - # import numpy as np - - # scores = np.array([np.random.rand(1).tolist()]) - # return scores - # load files and models, cache it for the next inference - model_name = kwargs["model"] - # inference the model - if model_name == self.model_name: - assert "query" in kwargs, "query is required" - assert "documents" in kwargs, "documents is required" - scores = self.infer_bge_reranker_base(kwargs["query"], kwargs["documents"]) - return scores - else: - raise ValueError(f"model {model_name} is not supported") - - -class TransformerLLM: - __doc__ = r"""Local model SDK for transformers LLM. - - NOTE: - This inference component is only specific to the HuggingFaceH4/zephyr-7b-beta model. - - The example raw output: - # <|system|> - # You are a friendly chatbot who always responds in the style of a pirate. - # <|user|> - # How many helicopters can a human eat in one sitting? - # <|assistant|> - # Ah, me hearty matey! But yer question be a puzzler! A human cannot eat a helicopter in one sitting, as helicopters are not edible. They be made of metal, plastic, and other materials, not food! 
- - - References: - - model: https://huggingface.co/HuggingFaceH4/zephyr-7b-beta - - https://huggingface.co/google/gemma-2b - - https://huggingface.co/google/gemma-2-2b - - """ - models: Dict[str, type] = {} # to register the model - tokenizer: Dict[str, type] = {} - - model_to_init_func = { - "HuggingFaceH4/zephyr-7b-beta": "use_pipeline", - "google/gemma-2-2b": "use_pipeline", - } - - def __init__( - self, - model_name: Optional[str] = None, - ): - super().__init__() - - self.model_name = model_name # current model to use - - if model_name is not None and model_name not in self.models: - self.init_model(model_name=model_name) - - def _check_token(self, token: str): - import os - - if os.getenv(token) is None: - warnings.warn( - f"{token} is not set. You may not be able to access the model." - ) - - def _init_from_pipeline(self, model_name: str): - from transformers import pipeline - - clean_device_cache() - self._check_token("HF_TOKEN") - try: - import os - - pipe = pipeline( - "text-generation", - model=model_name, - torch_dtype=torch.bfloat16, - device=get_device(), - token=os.getenv("HF_TOKEN"), - ) - self.models[model_name] = pipe - except Exception as e: - log.error(f"Error loading model {model_name}: {e}") - raise e - - def _init_from_automodelcasual_lm(self, model_name: str): - try: - from transformers import AutoTokenizer, AutoModelForCausalLM - except ImportError: - raise ImportError( - "transformers is not installed. Please install it with `pip install transformers`" - ) - - try: - import os - - if os.getenv("HF_TOKEN") is None: - warnings.warn( - "HF_TOKEN is not set. You may not be able to access the model." - ) - - tokenizer = AutoTokenizer.from_pretrained( - model_name, token=os.getenv("HF_TOKEN") - ) - model = AutoModelForCausalLM.from_pretrained( - model_name, - torch_dtype=torch.bfloat16, - device_map="auto", - token=os.getenv("HF_TOKEN"), - ) - self.models[model_name] = model - self.tokenizer[model_name] = tokenizer - except Exception as e: - log.error(f"Error loading model {model_name}: {e}") - raise e - - @lru_cache(None) - def init_model(self, model_name: str): - log.debug(f"Loading model {model_name}") - - model_setup = self.model_to_init_func.get(model_name, None) - if model_setup: - if model_setup == "use_pipeline": - self._init_from_pipeline(model_name) - else: - self._init_from_automodelcasual_lm(model_name) - else: - raise ValueError(f"Model {model_name} is not supported") - - def _parse_chat_completion_from_pipeline(self, completion: Any) -> str: - - text = completion[0]["generated_text"] - - pattern = r"(?<=\|assistant\|>).*" - - match = re.search(pattern, text) - - if match: - text = match.group().strip().lstrip("\\n") - return text - else: - return "" - - def _parse_chat_completion_from_automodelcasual_lm(self, completion: Any) -> str: - print(f"completion: {completion}") - return completion[0] - - def parse_chat_completion(self, completion: Any) -> str: - model_name = self.model_name - model_setup = self.model_to_init_func.get(model_name, None) - if model_setup: - if model_setup == "use_pipeline": - return self._parse_chat_completion_from_pipeline(completion) - else: - return self._parse_chat_completion_from_automodelcasual_lm(completion) - else: - raise ValueError(f"Model {model_name} is not supported") - - def _infer_from_pipeline( - self, - *, - model: str, - messages: Sequence[Dict[str, str]], - max_tokens: Optional[int] = None, - **kwargs, - ): - if not model: - raise ValueError("Model is not provided.") - - if model not in self.models: - 
self.init_model(model_name=model) - - model_to_use = self.models[model] - log.info( - f"Start to infer model {model}, messages: {messages}, kwargs: {kwargs}" - ) - - if model == "HuggingFaceH4/zephyr-7b-beta": - - prompt = model_to_use.tokenizer.apply_chat_template( - messages, tokenize=False, add_generation_prompt=True - ) - - final_kwargs = { - "max_new_tokens": max_tokens or 256, - "do_sample": True, - "temperature": kwargs.get("temperature", 0.7), - "top_k": kwargs.get("top_k", 50), - "top_p": kwargs.get("top_p", 0.95), - } - outputs = model_to_use(prompt, **final_kwargs) - elif model == "google/gemma-2-2b": - final_kwargs = { - "max_new_tokens": max_tokens or 256, - "do_sample": True, - "temperature": kwargs.get("temperature", 0.7), - "top_k": kwargs.get("top_k", 50), - "top_p": kwargs.get("top_p", 0.95), - } - text = messages[0]["content"] - outputs = model_to_use( - text, - **final_kwargs, - ) - - log.info(f"Outputs: {outputs}") - return outputs - - def _infer_from_automodelcasual_lm( - self, - *, - model: str, - messages: Sequence[Dict[str, str]], - max_length: Optional[int] = 8192, # model-agnostic - **kwargs, - ): - if not model: - raise ValueError("Model is not provided.") - if model not in self.models: - self.init_model(model_name=model) - model_to_use = self.models[model] - tokenizer_to_use = self.tokenizer[model] - - input_ids = tokenizer_to_use(messages[0]["content"], return_tensors="pt").to( - get_device() - ) - print(input_ids) - outputs_tokens = model_to_use.generate(**input_ids, max_length=max_length) - outputs = [] - for i, output in enumerate(outputs_tokens): - outputs.append(tokenizer_to_use.decode(output)) - return outputs - - def infer_llm( - self, - *, - model: str, - messages: Sequence[Dict[str, str]], - max_tokens: Optional[int] = None, - **kwargs, - ): - # TODO: generalize the code for more models - model_setup = self.model_to_init_func.get(model, None) - if model_setup: - if model_setup == "use_pipeline": - return self._infer_from_pipeline( - model=model, messages=messages, max_tokens=max_tokens, **kwargs - ) - else: - return self._infer_from_automodelcasual_lm( - model=model, messages=messages, max_tokens=max_tokens, **kwargs - ) - else: - raise ValueError(f"Model {model} is not supported") - - def __call__(self, **kwargs): - r"""Ensure "model" and "input" are in the kwargs.""" - log.debug(f"kwargs: {kwargs}") - if "model" not in kwargs: - raise ValueError("model is required") - - if "messages" not in kwargs: - raise ValueError("messages is required") - - model_name = kwargs["model"] - if model_name != self.model_name: - # need to initialize the model and update the model_name - self.model_name = model_name - self.init_model(model_name=model_name) - - output = self.infer_llm(**kwargs) - return output - - -class TransformersClient(ModelClient): - __doc__ = r"""LightRAG API client for transformers. - - Use: ``ls ~/.cache/huggingface/hub `` to see the cached models. - - Some modeles are gated, you will need to their page to get the access token. - Find how to apply tokens here: https://huggingface.co/docs/hub/security-tokens - Once you have a token and have access, put the token in the environment variable HF_TOKEN. 
- """ - - support_models = { - "thenlper/gte-base": { - "type": ModelType.EMBEDDER, - }, - "BAAI/bge-reranker-base": { - "type": ModelType.RERANKER, - }, - "HuggingFaceH4/zephyr-7b-beta": {"type": ModelType.LLM}, - "google/gemma-2-2b": {"type": ModelType.LLM}, - } - - def __init__(self, model_name: Optional[str] = None) -> None: - super().__init__() - self._model_name = model_name - if self._model_name: - assert ( - self._model_name in self.support_models - ), f"model {self._model_name} is not supported" - if self._model_name == "thenlper/gte-base": - self.sync_client = self.init_sync_client() - elif self._model_name == "BAAI/bge-reranker-base": - self.reranker_client = self.init_reranker_client() - elif self._model_name == "HuggingFaceH4/zephyr-7b-beta": - self.llm_client = self.init_llm_client() - self.async_client = None - - def init_sync_client(self): - return TransformerEmbedder() - - def init_reranker_client(self): - return TransformerReranker() - - def init_llm_client(self): - return TransformerLLM() - - def set_llm_client(self, llm_client: object): - r"""Allow user to pass a custom llm client. Here is an example of a custom llm client: - - Ensure you have parse_chat_completion and __call__ methods which will be applied to api_kwargs specified in transform_client.call(). - - .. code-block:: python - - class CustomizeLLM: - - def __init__(self) -> None: - pass - - def parse_chat_completion(self, completion: Any) -> str: - return completion - - def __call__(self, messages: Sequence[Dict[str, str]], model: str, **kwargs): - from transformers import AutoTokenizer, AutoModelForCausalLM - - tokenizer = AutoTokenizer.from_pretrained( - "deepseek-ai/deepseek-coder-1.3b-instruct", trust_remote_code=True - ) - model = AutoModelForCausalLM.from_pretrained( - "deepseek-ai/deepseek-coder-1.3b-instruct", - trust_remote_code=True, - torch_dtype=torch.bfloat16, - ).to(get_device()) - messages = [ - {"role": "user", "content": "write a quick sort algorithm in python."} - ] - inputs = tokenizer.apply_chat_template( - messages, add_generation_prompt=True, return_tensors="pt" - ).to(model.device) - # tokenizer.eos_token_id is the id of <|EOT|> token - outputs = model.generate( - inputs, - max_new_tokens=512, - do_sample=False, - top_k=50, - top_p=0.95, - num_return_sequences=1, - eos_token_id=tokenizer.eos_token_id, - ) - print( - tokenizer.decode(outputs[0][len(inputs[0]) :], skip_special_tokens=True) - ) - decoded_outputs = [] - for output in outputs: - decoded_outputs.append( - tokenizer.decode(output[len(inputs[0]) :], skip_special_tokens=True) - ) - return decoded_outputs - - llm_client = CustomizeLLM() - transformer_client.set_llm_client(llm_client) - # use in the generator - generator = Generator( - model_client=transformer_client, - model_kwargs=model_kwargs, - prompt_kwargs=prompt_kwargs, - ...) 
- - """ - self.llm_client = llm_client - - def parse_embedding_response(self, response: Any) -> EmbedderOutput: - embeddings: List[Embedding] = [] - for idx, emb in enumerate(response): - embeddings.append(Embedding(index=idx, embedding=emb)) - response = EmbedderOutput(data=embeddings) - return response - - def parse_chat_completion(self, completion: Any) -> GeneratorOutput: - try: - output = self.llm_client.parse_chat_completion(completion) - - return GeneratorOutput(data=output, raw_response=str(completion)) - except Exception as e: - log.error(f"Error parsing chat completion: {e}") - return GeneratorOutput(data=None, raw_response=str(completion), error=e) - - def call(self, api_kwargs: Dict = {}, model_type: ModelType = ModelType.UNDEFINED): - if "model" not in api_kwargs: - raise ValueError("model must be specified in api_kwargs") - if api_kwargs["model"] not in self.support_models: - raise ValueError(f"model {api_kwargs['model']} is not supported") - - if ( - model_type == ModelType.EMBEDDER - and "model" in api_kwargs - and api_kwargs["model"] == "thenlper/gte-base" - ): - if self.sync_client is None: - self.sync_client = self.init_sync_client() - return self.sync_client(**api_kwargs) - elif ( # reranker - model_type == ModelType.RERANKER - and "model" in api_kwargs - and api_kwargs["model"] == "BAAI/bge-reranker-base" - ): - if not hasattr(self, "reranker_client") or self.reranker_client is None: - self.reranker_client = self.init_reranker_client() - scores = self.reranker_client(**api_kwargs) - top_k_indices, top_k_scores = get_top_k_indices_scores( - scores, api_kwargs["top_k"] - ) - return top_k_indices, top_k_scores - elif model_type == ModelType.LLM and "model" in api_kwargs: # LLM - if not hasattr(self, "llm_client") or self.llm_client is None: - self.llm_client = self.init_llm_client() - response = self.llm_client(**api_kwargs) - return response - else: - raise ValueError(f"model_type {model_type} is not supported") - - def convert_inputs_to_api_kwargs( - self, - input: Any, # for retriever, it is a single query, - model_kwargs: dict = {}, - model_type: ModelType = ModelType.UNDEFINED, - ) -> dict: - final_model_kwargs = model_kwargs.copy() - if model_type == ModelType.EMBEDDER: - final_model_kwargs["input"] = input - return final_model_kwargs - elif model_type == ModelType.RERANKER: - assert "model" in final_model_kwargs, "model must be specified" - assert "documents" in final_model_kwargs, "documents must be specified" - assert "top_k" in final_model_kwargs, "top_k must be specified" - final_model_kwargs["query"] = input - return final_model_kwargs - elif model_type == ModelType.LLM: - assert "model" in final_model_kwargs, "model must be specified" - messages = [{"role": "system", "content": input}] - final_model_kwargs["messages"] = messages - return final_model_kwargs - else: - raise ValueError(f"model_type {model_type} is not supported") - - -if __name__ == "__main__": - from adalflow.core import Generator - - import adalflow as adal - - adal.setup_env() - - rag_template = r""" -You are a helpful assistant. - -Your task is to answer the query that may or may not come with context information. -When context is provided, you should stick to the context and less on your prior knowledge to answer the query. 
- - - - {{input_str}} - - {% if context_str %} - - {{context_str}} - - {% endif %} - -""" - - template = """{{input_str}}""" - - model_kwargs = { - "model": "google/gemma-2-2b", - "temperature": 1, - "stream": False, - } - prompt_kwargs = { - "input_str": "Where is Brian?", - # "context_str": "Brian is in the kitchen.", - } - prompt_kwargs = { - "input_str": "What is the capital of France?", - } - - class CustomizeLLM: - - def __init__(self) -> None: - pass - - def parse_chat_completion(self, completion: Any) -> str: - return completion[0] - - def __call__(self, messages: Sequence[Dict[str, str]], model: str, **kwargs): - r"""take api key""" - from transformers import AutoTokenizer, AutoModelForCausalLM - - tokenizer = AutoTokenizer.from_pretrained( - "deepseek-ai/deepseek-coder-1.3b-instruct", trust_remote_code=True - ) - model = AutoModelForCausalLM.from_pretrained( - "deepseek-ai/deepseek-coder-1.3b-instruct", - trust_remote_code=True, - torch_dtype=torch.bfloat16, - ).to(get_device()) - messages = [ - {"role": "user", "content": "write a quick sort algorithm in python."} - ] - inputs = tokenizer.apply_chat_template( - messages, add_generation_prompt=True, return_tensors="pt" - ).to(model.device) - # tokenizer.eos_token_id is the id of <|EOT|> token - outputs = model.generate( - inputs, - max_new_tokens=512, - do_sample=False, - top_k=50, - top_p=0.95, - num_return_sequences=1, - eos_token_id=tokenizer.eos_token_id, - ) - - decoded_outputs = [] - for output in outputs: - decoded_outputs.append( - tokenizer.decode(output[len(inputs[0]) :], skip_special_tokens=True) - ) - return decoded_outputs - - transformer_client = TransformersClient() - transformer_client.set_llm_client(CustomizeLLM()) - generator = Generator( - model_client=transformer_client, - model_kwargs=model_kwargs, - # prompt_kwargs=prompt_kwargs, - template=template, - # output_processors=JsonParser(), - ) - - output = generator(prompt_kwargs=prompt_kwargs) - print(output) +# # TODO: provide a standard api for embedding and chat models used in local model SDKs +# class TransformerEmbedder: +# """Local model SDK for transformers. + + +# There are two ways to run transformers: +# (1) model and then run model inference +# (2) Pipeline and then run pipeline inference + +# This file demonstrates how to +# (1) create a torch model inference component: TransformerEmbedder which equalize to OpenAI(), the SyncAPIClient +# (2) Convert this model inference component to LightRAG API client: TransformersClient + +# The is now just an exmplary component that initialize a certain model from transformers and run inference on it. +# It is not tested on all transformer models yet. It might be necessary to write one for each model. 
+ +# References: +# - transformers: https://huggingface.co/docs/transformers/en/index +# - thenlper/gte-base model:https://huggingface.co/thenlper/gte-base +# """ + +# models: Dict[str, type] = {} + +# def __init__(self, model_name: Optional[str] = "thenlper/gte-base"): +# super().__init__() + +# if model_name is not None: +# self.init_model(model_name=model_name) + +# @lru_cache(None) +# def init_model(self, model_name: str): +# try: +# self.tokenizer = AutoTokenizer.from_pretrained(model_name) +# self.model = AutoModel.from_pretrained(model_name) +# # register the model +# self.models[model_name] = self.model +# log.info(f"Done loading model {model_name}") + +# except Exception as e: +# log.error(f"Error loading model {model_name}: {e}") +# raise e + +# def infer_gte_base_embedding( +# self, +# input=Union[str, List[str]], +# tolist: bool = True, +# ): +# model = self.models.get("thenlper/gte-base", None) +# if model is None: +# # initialize the model +# self.init_model("thenlper/gte-base") + +# if isinstance(input, str): +# input = [input] +# # Tokenize the input texts +# batch_dict = self.tokenizer( +# input, max_length=512, padding=True, truncation=True, return_tensors="pt" +# ) +# outputs = model(**batch_dict) +# embeddings = average_pool( +# outputs.last_hidden_state, batch_dict["attention_mask"] +# ) +# # (Optionally) normalize embeddings +# embeddings = F.normalize(embeddings, p=2, dim=1) +# if tolist: +# embeddings = embeddings.tolist() +# return embeddings + +# def __call__(self, **kwargs): +# if "model" not in kwargs: +# raise ValueError("model is required") + +# if "mock" in kwargs and kwargs["mock"]: +# import numpy as np + +# embeddings = np.array([np.random.rand(768).tolist()]) +# return embeddings +# # load files and models, cache it for the next inference +# model_name = kwargs["model"] +# # inference the model +# if model_name == "thenlper/gte-base": +# return self.infer_gte_base_embedding(kwargs["input"]) +# else: +# raise ValueError(f"model {model_name} is not supported") + + +# def get_device(): +# # Check device availability and set the device +# if torch.cuda.is_available(): +# device = torch.device("cuda") +# log.info("Using CUDA (GPU) for inference.") +# elif torch.backends.mps.is_available(): +# device = torch.device("mps") +# log.info("Using MPS (Apple Silicon) for inference.") +# else: +# device = torch.device("cpu") +# log.info("Using CPU for inference.") + +# return device + + +# def clean_device_cache(): +# import torch + +# if torch.has_mps: +# torch.mps.empty_cache() + +# torch.mps.set_per_process_memory_fraction(1.0) + + +# class TransformerReranker: +# __doc__ = r"""Local model SDK for a reranker model using transformers. + +# References: +# - model: https://huggingface.co/BAAI/bge-reranker-base +# - paper: https://arxiv.org/abs/2309.07597 + +# note: +# If you are using Macbook M1 series chips, you need to ensure ``torch.device("mps")`` is set. 
+# """ +# models: Dict[str, type] = {} + +# def __init__(self, model_name: Optional[str] = "BAAI/bge-reranker-base"): +# self.model_name = model_name or "BAAI/bge-reranker-base" +# if model_name is not None: +# self.init_model(model_name=model_name) + +# def init_model(self, model_name: str): +# try: +# self.tokenizer = AutoTokenizer.from_pretrained(model_name) +# self.model = AutoModelForSequenceClassification.from_pretrained(model_name) +# # Check device availability and set the device +# device = get_device() + +# # Move model to the selected device +# self.device = device +# self.model.to(device) +# self.model.eval() +# # register the model +# self.models[model_name] = self.model # TODO: better model registration +# log.info(f"Done loading model {model_name}") + +# except Exception as e: +# log.error(f"Error loading model {model_name}: {e}") +# raise e + +# def infer_bge_reranker_base( +# self, +# # input=List[Tuple[str, str]], # list of pairs of the query and the candidate +# query: str, +# documents: List[str], +# ) -> List[float]: +# model = self.models.get(self.model_name, None) +# if model is None: +# # initialize the model +# self.init_model(self.model_name) + +# # convert the query and documents to pair input +# input = [(query, doc) for doc in documents] + +# with torch.no_grad(): + +# inputs = self.tokenizer( +# input, +# padding=True, +# truncation=True, +# return_tensors="pt", +# max_length=512, +# ) +# inputs = {k: v.to(self.device) for k, v in inputs.items()} +# scores = ( +# model(**inputs, return_dict=True) +# .logits.view( +# -1, +# ) +# .float() +# ) +# # apply sigmoid to get the scores +# scores = F.sigmoid(scores) + +# scores = scores.tolist() +# return scores + +# def __call__(self, **kwargs): +# r"""Ensure "model" and "input" are in the kwargs.""" +# if "model" not in kwargs: +# raise ValueError("model is required") + +# # if "mock" in kwargs and kwargs["mock"]: +# # import numpy as np + +# # scores = np.array([np.random.rand(1).tolist()]) +# # return scores +# # load files and models, cache it for the next inference +# model_name = kwargs["model"] +# # inference the model +# if model_name == self.model_name: +# assert "query" in kwargs, "query is required" +# assert "documents" in kwargs, "documents is required" +# scores = self.infer_bge_reranker_base(kwargs["query"], kwargs["documents"]) +# return scores +# else: +# raise ValueError(f"model {model_name} is not supported") + + +# class TransformerLLM: +# __doc__ = r"""Local model SDK for transformers LLM. + +# NOTE: +# This inference component is only specific to the HuggingFaceH4/zephyr-7b-beta model. + +# The example raw output: +# # <|system|> +# # You are a friendly chatbot who always responds in the style of a pirate. +# # <|user|> +# # How many helicopters can a human eat in one sitting? +# # <|assistant|> +# # Ah, me hearty matey! But yer question be a puzzler! A human cannot eat a helicopter in one sitting, as helicopters are not edible. They be made of metal, plastic, and other materials, not food! 
+ + +# References: +# - model: https://huggingface.co/HuggingFaceH4/zephyr-7b-beta +# - https://huggingface.co/google/gemma-2b +# - https://huggingface.co/google/gemma-2-2b + +# """ +# models: Dict[str, type] = {} # to register the model +# tokenizer: Dict[str, type] = {} + +# model_to_init_func = { +# "HuggingFaceH4/zephyr-7b-beta": "use_pipeline", +# "google/gemma-2-2b": "use_pipeline", +# } + +# def __init__( +# self, +# model_name: Optional[str] = None, +# ): +# super().__init__() + +# self.model_name = model_name # current model to use + +# if model_name is not None and model_name not in self.models: +# self.init_model(model_name=model_name) + +# def _check_token(self, token: str): +# import os + +# if os.getenv(token) is None: +# warnings.warn( +# f"{token} is not set. You may not be able to access the model." +# ) + +# def _init_from_pipeline(self, model_name: str): +# from transformers import pipeline + +# clean_device_cache() +# self._check_token("HF_TOKEN") +# try: +# import os + +# pipe = pipeline( +# "text-generation", +# model=model_name, +# torch_dtype=torch.bfloat16, +# device=get_device(), +# token=os.getenv("HF_TOKEN"), +# ) +# self.models[model_name] = pipe +# except Exception as e: +# log.error(f"Error loading model {model_name}: {e}") +# raise e + +# def _init_from_automodelcasual_lm(self, model_name: str): +# try: +# from transformers import AutoTokenizer, AutoModelForCausalLM +# except ImportError: +# raise ImportError( +# "transformers is not installed. Please install it with `pip install transformers`" +# ) + +# try: +# import os + +# if os.getenv("HF_TOKEN") is None: +# warnings.warn( +# "HF_TOKEN is not set. You may not be able to access the model." +# ) + +# tokenizer = AutoTokenizer.from_pretrained( +# model_name, token=os.getenv("HF_TOKEN") +# ) +# model = AutoModelForCausalLM.from_pretrained( +# model_name, +# torch_dtype=torch.bfloat16, +# device_map="auto", +# token=os.getenv("HF_TOKEN"), +# ) +# self.models[model_name] = model +# self.tokenizer[model_name] = tokenizer +# except Exception as e: +# log.error(f"Error loading model {model_name}: {e}") +# raise e + +# @lru_cache(None) +# def init_model(self, model_name: str): +# log.debug(f"Loading model {model_name}") + +# model_setup = self.model_to_init_func.get(model_name, None) +# if model_setup: +# if model_setup == "use_pipeline": +# self._init_from_pipeline(model_name) +# else: +# self._init_from_automodelcasual_lm(model_name) +# else: +# raise ValueError(f"Model {model_name} is not supported") + +# def _parse_chat_completion_from_pipeline(self, completion: Any) -> str: + +# text = completion[0]["generated_text"] + +# pattern = r"(?<=\|assistant\|>).*" + +# match = re.search(pattern, text) + +# if match: +# text = match.group().strip().lstrip("\\n") +# return text +# else: +# return "" + +# def _parse_chat_completion_from_automodelcasual_lm(self, completion: Any) -> str: +# print(f"completion: {completion}") +# return completion[0] + +# def parse_chat_completion(self, completion: Any) -> str: +# model_name = self.model_name +# model_setup = self.model_to_init_func.get(model_name, None) +# if model_setup: +# if model_setup == "use_pipeline": +# return self._parse_chat_completion_from_pipeline(completion) +# else: +# return self._parse_chat_completion_from_automodelcasual_lm(completion) +# else: +# raise ValueError(f"Model {model_name} is not supported") + +# def _infer_from_pipeline( +# self, +# *, +# model: str, +# messages: Sequence[Dict[str, str]], +# max_tokens: Optional[int] = None, +# **kwargs, +# 
): +# if not model: +# raise ValueError("Model is not provided.") + +# if model not in self.models: +# self.init_model(model_name=model) + +# model_to_use = self.models[model] + +# log.info( +# f"Start to infer model {model}, messages: {messages}, kwargs: {kwargs}" +# ) + +# if model == "HuggingFaceH4/zephyr-7b-beta": + +# prompt = model_to_use.tokenizer.apply_chat_template( +# messages, tokenize=False, add_generation_prompt=True +# ) + +# final_kwargs = { +# "max_new_tokens": max_tokens or 256, +# "do_sample": True, +# "temperature": kwargs.get("temperature", 0.7), +# "top_k": kwargs.get("top_k", 50), +# "top_p": kwargs.get("top_p", 0.95), +# } +# outputs = model_to_use(prompt, **final_kwargs) +# elif model == "google/gemma-2-2b": +# final_kwargs = { +# "max_new_tokens": max_tokens or 256, +# "do_sample": True, +# "temperature": kwargs.get("temperature", 0.7), +# "top_k": kwargs.get("top_k", 50), +# "top_p": kwargs.get("top_p", 0.95), +# } +# text = messages[0]["content"] +# outputs = model_to_use( +# text, +# **final_kwargs, +# ) + +# log.info(f"Outputs: {outputs}") +# return outputs + +# def _infer_from_automodelcasual_lm( +# self, +# *, +# model: str, +# messages: Sequence[Dict[str, str]], +# max_length: Optional[int] = 8192, # model-agnostic +# **kwargs, +# ): +# if not model: +# raise ValueError("Model is not provided.") +# if model not in self.models: +# self.init_model(model_name=model) +# model_to_use = self.models[model] +# tokenizer_to_use = self.tokenizer[model] + +# input_ids = tokenizer_to_use(messages[0]["content"], return_tensors="pt").to( +# get_device() +# ) +# print(input_ids) +# outputs_tokens = model_to_use.generate(**input_ids, max_length=max_length) +# outputs = [] +# for i, output in enumerate(outputs_tokens): +# outputs.append(tokenizer_to_use.decode(output)) +# return outputs + +# def infer_llm( +# self, +# *, +# model: str, +# messages: Sequence[Dict[str, str]], +# max_tokens: Optional[int] = None, +# **kwargs, +# ): +# # TODO: generalize the code for more models +# model_setup = self.model_to_init_func.get(model, None) +# if model_setup: +# if model_setup == "use_pipeline": +# return self._infer_from_pipeline( +# model=model, messages=messages, max_tokens=max_tokens, **kwargs +# ) +# else: +# return self._infer_from_automodelcasual_lm( +# model=model, messages=messages, max_tokens=max_tokens, **kwargs +# ) +# else: +# raise ValueError(f"Model {model} is not supported") + +# def __call__(self, **kwargs): +# r"""Ensure "model" and "input" are in the kwargs.""" +# log.debug(f"kwargs: {kwargs}") +# if "model" not in kwargs: +# raise ValueError("model is required") + +# if "messages" not in kwargs: +# raise ValueError("messages is required") + +# model_name = kwargs["model"] +# if model_name != self.model_name: +# # need to initialize the model and update the model_name +# self.model_name = model_name +# self.init_model(model_name=model_name) + +# output = self.infer_llm(**kwargs) +# return output + + +# class TransformersClient(ModelClient): +# __doc__ = r"""LightRAG API client for transformers. + +# Use: ``ls ~/.cache/huggingface/hub `` to see the cached models. + +# Some modeles are gated, you will need to their page to get the access token. +# Find how to apply tokens here: https://huggingface.co/docs/hub/security-tokens +# Once you have a token and have access, put the token in the environment variable HF_TOKEN. 
+# """ + +# support_models = { +# "thenlper/gte-base": { +# "type": ModelType.EMBEDDER, +# }, +# "BAAI/bge-reranker-base": { +# "type": ModelType.RERANKER, +# }, +# "HuggingFaceH4/zephyr-7b-beta": {"type": ModelType.LLM}, +# "google/gemma-2-2b": {"type": ModelType.LLM}, +# } + +# def __init__(self, model_name: Optional[str] = None) -> None: +# super().__init__() +# self._model_name = model_name +# if self._model_name: +# assert ( +# self._model_name in self.support_models +# ), f"model {self._model_name} is not supported" +# if self._model_name == "thenlper/gte-base": +# self.sync_client = self.init_sync_client() +# elif self._model_name == "BAAI/bge-reranker-base": +# self.reranker_client = self.init_reranker_client() +# elif self._model_name == "HuggingFaceH4/zephyr-7b-beta": +# self.llm_client = self.init_llm_client() +# self.async_client = None + +# def init_sync_client(self): +# return TransformerEmbedder() + +# def init_reranker_client(self): +# return TransformerReranker() + +# def init_llm_client(self): +# return TransformerLLM() + +# def set_llm_client(self, llm_client: object): +# r"""Allow user to pass a custom llm client. Here is an example of a custom llm client: + +# Ensure you have parse_chat_completion and __call__ methods which will be applied to api_kwargs specified in transform_client.call(). + +# .. code-block:: python + +# class CustomizeLLM: + +# def __init__(self) -> None: +# pass + +# def parse_chat_completion(self, completion: Any) -> str: +# return completion + +# def __call__(self, messages: Sequence[Dict[str, str]], model: str, **kwargs): +# from transformers import AutoTokenizer, AutoModelForCausalLM + +# tokenizer = AutoTokenizer.from_pretrained( +# "deepseek-ai/deepseek-coder-1.3b-instruct", trust_remote_code=True +# ) +# model = AutoModelForCausalLM.from_pretrained( +# "deepseek-ai/deepseek-coder-1.3b-instruct", +# trust_remote_code=True, +# torch_dtype=torch.bfloat16, +# ).to(get_device()) +# messages = [ +# {"role": "user", "content": "write a quick sort algorithm in python."} +# ] +# inputs = tokenizer.apply_chat_template( +# messages, add_generation_prompt=True, return_tensors="pt" +# ).to(model.device) +# # tokenizer.eos_token_id is the id of <|EOT|> token +# outputs = model.generate( +# inputs, +# max_new_tokens=512, +# do_sample=False, +# top_k=50, +# top_p=0.95, +# num_return_sequences=1, +# eos_token_id=tokenizer.eos_token_id, +# ) +# print( +# tokenizer.decode(outputs[0][len(inputs[0]) :], skip_special_tokens=True) +# ) +# decoded_outputs = [] +# for output in outputs: +# decoded_outputs.append( +# tokenizer.decode(output[len(inputs[0]) :], skip_special_tokens=True) +# ) +# return decoded_outputs + +# llm_client = CustomizeLLM() +# transformer_client.set_llm_client(llm_client) +# # use in the generator +# generator = Generator( +# model_client=transformer_client, +# model_kwargs=model_kwargs, +# prompt_kwargs=prompt_kwargs, +# ...) 
+ +# """ +# self.llm_client = llm_client + +# def parse_embedding_response(self, response: Any) -> EmbedderOutput: +# embeddings: List[Embedding] = [] +# for idx, emb in enumerate(response): +# embeddings.append(Embedding(index=idx, embedding=emb)) +# response = EmbedderOutput(data=embeddings) +# return response + +# def parse_chat_completion(self, completion: Any) -> GeneratorOutput: +# try: +# output = self.llm_client.parse_chat_completion(completion) + +# return GeneratorOutput(data=output, raw_response=str(completion)) +# except Exception as e: +# log.error(f"Error parsing chat completion: {e}") +# return GeneratorOutput(data=None, raw_response=str(completion), error=e) + +# def call(self, api_kwargs: Dict = {}, model_type: ModelType = ModelType.UNDEFINED): +# if "model" not in api_kwargs: +# raise ValueError("model must be specified in api_kwargs") +# if api_kwargs["model"] not in self.support_models: +# raise ValueError(f"model {api_kwargs['model']} is not supported") + +# if ( +# model_type == ModelType.EMBEDDER +# and "model" in api_kwargs +# and api_kwargs["model"] == "thenlper/gte-base" +# ): +# if self.sync_client is None: +# self.sync_client = self.init_sync_client() +# return self.sync_client(**api_kwargs) +# elif ( # reranker +# model_type == ModelType.RERANKER +# and "model" in api_kwargs +# and api_kwargs["model"] == "BAAI/bge-reranker-base" +# ): +# if not hasattr(self, "reranker_client") or self.reranker_client is None: +# self.reranker_client = self.init_reranker_client() +# scores = self.reranker_client(**api_kwargs) +# top_k_indices, top_k_scores = get_top_k_indices_scores( +# scores, api_kwargs["top_k"] +# ) +# return top_k_indices, top_k_scores +# elif model_type == ModelType.LLM and "model" in api_kwargs: # LLM +# if not hasattr(self, "llm_client") or self.llm_client is None: +# self.llm_client = self.init_llm_client() +# response = self.llm_client(**api_kwargs) +# return response +# else: +# raise ValueError(f"model_type {model_type} is not supported") + +# def convert_inputs_to_api_kwargs( +# self, +# input: Any, # for retriever, it is a single query, +# model_kwargs: dict = {}, +# model_type: ModelType = ModelType.UNDEFINED, +# ) -> dict: +# final_model_kwargs = model_kwargs.copy() +# if model_type == ModelType.EMBEDDER: +# final_model_kwargs["input"] = input +# return final_model_kwargs +# elif model_type == ModelType.RERANKER: +# assert "model" in final_model_kwargs, "model must be specified" +# assert "documents" in final_model_kwargs, "documents must be specified" +# assert "top_k" in final_model_kwargs, "top_k must be specified" +# final_model_kwargs["query"] = input +# return final_model_kwargs +# elif model_type == ModelType.LLM: +# assert "model" in final_model_kwargs, "model must be specified" +# messages = [{"role": "system", "content": input}] +# final_model_kwargs["messages"] = messages +# return final_model_kwargs +# else: +# raise ValueError(f"model_type {model_type} is not supported") + + +# if __name__ == "__main__": +# from adalflow.core import Generator + +# import adalflow as adal + +# adal.setup_env() + +# rag_template = r""" +# You are a helpful assistant. + +# Your task is to answer the query that may or may not come with context information. +# When context is provided, you should stick to the context and less on your prior knowledge to answer the query. 
+# +# +# +# {{input_str}} +# +# {% if context_str %} +# +# {{context_str}} +# +# {% endif %} +# +# """ + +# template = """{{input_str}}""" + +# model_kwargs = { +# "model": "google/gemma-2-2b", +# "temperature": 1, +# "stream": False, +# } +# prompt_kwargs = { +# "input_str": "Where is Brian?", +# # "context_str": "Brian is in the kitchen.", +# } +# prompt_kwargs = { +# "input_str": "What is the capital of France?", +# } + +# class CustomizeLLM: + +# def __init__(self) -> None: +# pass + +# def parse_chat_completion(self, completion: Any) -> str: +# return completion[0] + +# def __call__(self, messages: Sequence[Dict[str, str]], model: str, **kwargs): +# r"""take api key""" +# from transformers import AutoTokenizer, AutoModelForCausalLM + +# tokenizer = AutoTokenizer.from_pretrained( +# "deepseek-ai/deepseek-coder-1.3b-instruct", trust_remote_code=True +# ) +# model = AutoModelForCausalLM.from_pretrained( +# "deepseek-ai/deepseek-coder-1.3b-instruct", +# trust_remote_code=True, +# torch_dtype=torch.bfloat16, +# ).to(get_device()) +# messages = [ +# {"role": "user", "content": "write a quick sort algorithm in python."} +# ] +# inputs = tokenizer.apply_chat_template( +# messages, add_generation_prompt=True, return_tensors="pt" +# ).to(model.device) +# # tokenizer.eos_token_id is the id of <|EOT|> token +# outputs = model.generate( +# inputs, +# max_new_tokens=512, +# do_sample=False, +# top_k=50, +# top_p=0.95, +# num_return_sequences=1, +# eos_token_id=tokenizer.eos_token_id, +# ) + +# decoded_outputs = [] +# for output in outputs: +# decoded_outputs.append( +# tokenizer.decode(output[len(inputs[0]) :], skip_special_tokens=True) +# ) +# return decoded_outputs + +# transformer_client = TransformersClient() +# transformer_client.set_llm_client(CustomizeLLM()) +# generator = Generator( +# model_client=transformer_client, +# model_kwargs=model_kwargs, +# # prompt_kwargs=prompt_kwargs, +# template=template, +# # output_processors=JsonParser(), +# ) + +# output = generator(prompt_kwargs=prompt_kwargs) +# print(output) From bd610d24c4758b8cb9db892e7db2b5b30dcfe7d8 Mon Sep 17 00:00:00 2001 From: Alexandre Adjahossou Date: Mon, 9 Sep 2024 12:42:31 +0000 Subject: [PATCH 19/36] Added tests for TransformerRerankerModelClient. --- adalflow/tests/test_transformer_client.py | 64 ++++++++++++++++++++++- 1 file changed, 63 insertions(+), 1 deletion(-) diff --git a/adalflow/tests/test_transformer_client.py b/adalflow/tests/test_transformer_client.py index 193ecbc8..bc5aaaf6 100644 --- a/adalflow/tests/test_transformer_client.py +++ b/adalflow/tests/test_transformer_client.py @@ -1,6 +1,6 @@ import unittest import torch -from adalflow.components.model_client.transformers_client import TransformerEmbeddingModelClient, TransformerLLMModelClient +from adalflow.components.model_client.transformers_client import TransformerEmbeddingModelClient, TransformerLLMModelClient, TransformerRerankerModelClient from adalflow.core.types import ModelType from adalflow.core import Embedder, Generator @@ -143,6 +143,68 @@ def test_integration_with_generator_pipeline(self): output = generator(prompt_kwargs=self.prompt_kwargs) print(output) +class TestTransformerModelClient(unittest.TestCase): + def setUp(self) -> None: + + self.query = "what is panda?" 
+ self.documents = [ + "The giant panda (Ailuropoda melanoleuca), sometimes called a panda bear or simply panda, is a bear species endemic to China.", + "The red panda (Ailurus fulgens), also called the lesser panda, the red bear-cat, and the red cat-bear, is a mammal native to the eastern Himalayas and southwestern China.", + ] + + def test_execution(self): + transformer_reranker_model = "BAAI/bge-reranker-base" + transformer_reranker_model_client = TransformerRerankerModelClient( + tokenizer_kwargs={"padding": True} + ) + print( + f"Testing TransformerRerankerModelClient with model {transformer_reranker_model}" + ) + + model_kwargs = { + "model": transformer_reranker_model, + "documents": self.documents, + "top_k": 2, + } + + api_kwargs = transformer_reranker_model_client.convert_inputs_to_api_kwargs(self.query, model_kwargs=model_kwargs) + output = transformer_reranker_model_client.call(api_kwargs) + # assert output is a list of list with length 2 + self.assertEqual(len(output), 2) + self.assertEqual(type(output[0]), list) + self.assertEqual(type(output[1]), list) + # assert output[0] is a list of int of length top_k + tok_k = model_kwargs["top_k"] + self.assertTrue(all([isinstance(elmt, int) for elmt in output[0]])) + self.assertEqual(len(output[0]), tok_k) + # assert output[1] is a list of float of length top_k + tok_k = model_kwargs["top_k"] + self.assertTrue(all([isinstance(elmt, float) for elmt in output[1]])) + self.assertEqual(len(output[1]), tok_k) + + def test_transformer_reranker_client(self): + transformer_reranker_client = TransformerRerankerModelClient( + tokenizer_kwargs={"padding": True} + ) + print("Testing transformer reranker client") + # run the model + kwargs = { + "model": "BAAI/bge-reranker-base", + "documents": self.documents, + "top_k": 2, + } + api_kwargs = transformer_reranker_client.convert_inputs_to_api_kwargs( + input=self.query, + model_kwargs=kwargs, + + ) + print(api_kwargs) + self.assertEqual(api_kwargs["model"], "BAAI/bge-reranker-base") + output = transformer_reranker_client.call( + api_kwargs=api_kwargs + ) + self.assertEqual(type(output), tuple) + # class TestTransformerModelClient(unittest.TestCase): # def setUp(self) -> None: From 25fe83438d1981ea35b26d5cd713e47ddba1d8dc Mon Sep 17 00:00:00 2001 From: Alexandre Adjahossou Date: Mon, 9 Sep 2024 13:50:37 +0000 Subject: [PATCH 20/36] Add test for llm response + remove test for old class. --- adalflow/tests/test_transformer_client.py | 133 ++++------------------ 1 file changed, 22 insertions(+), 111 deletions(-) diff --git a/adalflow/tests/test_transformer_client.py b/adalflow/tests/test_transformer_client.py index bc5aaaf6..a1a3e658 100644 --- a/adalflow/tests/test_transformer_client.py +++ b/adalflow/tests/test_transformer_client.py @@ -107,6 +107,28 @@ def test_exectution(self): output = model_client.call(api_kwargs=api_kwargs) print(output) + def test_response(self): + + """Test the TransformerLLM model with roneneldan/TinyStories-1M for generating a response.""" + model_client = TransformerLLMModelClient( + ) + + # Define a sample input + input_text = "Hello, what's the weather today?" 
+ + # Test generating a response, providing the 'model' keyword + # response = transformer_llm_model_component(input=input_text, model=transformer_llm_model) + api_kwargs = model_client.convert_inputs_to_api_kwargs(input_text, self.model_kwargs) + response = model_client.call(api_kwargs) + + # Check if the response is valid + self.assertIsInstance(response, list, "The response should be a list.") + self.assertTrue(all([isinstance(elmt, str) for elmt in response]), "all elements in the response list should be strings.") + self.assertTrue(len(response) > 0, "The response should not be empty.") + + # Optionally, print the response for visual verification during testing + print(f"Generated response: {response}") + def test_integration_with_generator_autoclass(self): model_client = TransformerLLMModelClient( tokenizer_kwargs=self.tokenizer_kwargs, @@ -205,117 +227,6 @@ def test_transformer_reranker_client(self): ) self.assertEqual(type(output), tuple) -# class TestTransformerModelClient(unittest.TestCase): -# def setUp(self) -> None: - -# self.query = "what is panda?" -# self.documents = [ -# "The giant panda (Ailuropoda melanoleuca), sometimes called a panda bear or simply panda, is a bear species endemic to China.", -# "The red panda (Ailurus fulgens), also called the lesser panda, the red bear-cat, and the red cat-bear, is a mammal native to the eastern Himalayas and southwestern China.", -# ] - -# def test_transformer_embedder(self): -# transformer_embedder_model = "thenlper/gte-base" -# transformer_embedder_model_component = TransformerEmbedder( -# model_name=transformer_embedder_model -# ) -# print( -# f"Testing transformer embedder with model {transformer_embedder_model_component}" -# ) -# print("Testing transformer embedder") -# output = transformer_embedder_model_component( -# model=transformer_embedder_model, input="Hello world" -# ) -# print(output) - -# def test_transformer_client(self): -# transformer_client = TransformersClient() -# print("Testing transformer client") -# # run the model -# kwargs = { -# "model": "thenlper/gte-base", -# # "mock": False, -# } -# api_kwargs = transformer_client.convert_inputs_to_api_kwargs( -# input="Hello world", -# model_kwargs=kwargs, -# model_type=ModelType.EMBEDDER, -# ) -# # print(api_kwargs) -# output = transformer_client.call( -# api_kwargs=api_kwargs, model_type=ModelType.EMBEDDER -# ) - - # print(transformer_client) - # print(output) - - # def test_transformer_reranker(self): - # transformer_reranker_model = "BAAI/bge-reranker-base" - # transformer_reranker_model_component = TransformerReranker() - # # print( - # # f"Testing transformer reranker with model {transformer_reranker_model_component}" - # # ) - - # model_kwargs = { - # "model": transformer_reranker_model, - # "documents": self.documents, - # "query": self.query, - # "top_k": 2, - # } - - # output = transformer_reranker_model_component( - # **model_kwargs, - # ) - # # assert output is a list of float with length 2 - # self.assertEqual(len(output), 2) - # self.assertEqual(type(output[0]), float) - - # def test_transformer_reranker_client(self): - # transformer_reranker_client = TransformersClient( - # model_name="BAAI/bge-reranker-base" - # ) - # print("Testing transformer reranker client") - # # run the model - # kwargs = { - # "model": "BAAI/bge-reranker-base", - # "documents": self.documents, - # "top_k": 2, - # } - # api_kwargs = transformer_reranker_client.convert_inputs_to_api_kwargs( - # input=self.query, - # model_kwargs=kwargs, - # model_type=ModelType.RERANKER, - # ) - # 
print(api_kwargs) - # self.assertEqual(api_kwargs["model"], "BAAI/bge-reranker-base") - # output = transformer_reranker_client.call( - # api_kwargs=api_kwargs, model_type=ModelType.RERANKER - # ) - # self.assertEqual(type(output), tuple) - - # def test_transformer_llm_response(self): - # from adalflow.components.model_client.transformers_client import TransformerLLM - - # """Test the TransformerLLM model with zephyr-7b-beta for generating a response.""" - # transformer_llm_model = "HuggingFaceH4/zephyr-7b-beta" - # transformer_llm_model_component = TransformerLLM( - # model_name=transformer_llm_model - # ) - - # # Define a sample input - # input_text = "Hello, what's the weather today?" - - # # Test generating a response, providing the 'model' keyword - # # response = transformer_llm_model_component(input=input_text, model=transformer_llm_model) - # response = transformer_llm_model_component(input_text=input_text) - - # # Check if the response is valid - # self.assertIsInstance(response, str, "The response should be a string.") - # self.assertTrue(len(response) > 0, "The response should not be empty.") - - # # Optionally, print the response for visual verification during testing - # print(f"Generated response: {response}") - if __name__ == "__main__": unittest.main() From e3b3b25c2584701cffae1f4ca8be44311bfdfcd7 Mon Sep 17 00:00:00 2001 From: Alexandre Adjahossou Date: Mon, 9 Sep 2024 13:51:03 +0000 Subject: [PATCH 21/36] Fixed test class name. --- adalflow/tests/test_transformer_client.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/adalflow/tests/test_transformer_client.py b/adalflow/tests/test_transformer_client.py index a1a3e658..c0b649fc 100644 --- a/adalflow/tests/test_transformer_client.py +++ b/adalflow/tests/test_transformer_client.py @@ -165,7 +165,7 @@ def test_integration_with_generator_pipeline(self): output = generator(prompt_kwargs=self.prompt_kwargs) print(output) -class TestTransformerModelClient(unittest.TestCase): +class TransformerRerankerModelClient(unittest.TestCase): def setUp(self) -> None: self.query = "what is panda?" From 2c6ee8ceeb89d41ed404b1b7fcba0c5007d3c3a8 Mon Sep 17 00:00:00 2001 From: Alexandre Adjahossou Date: Mon, 9 Sep 2024 13:53:45 +0000 Subject: [PATCH 22/36] Multiline message: Moved get_device andclean_device_cache at top of file. Allow user to specify autoclasses for Reranker models. 
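
get_device() and clean_device_cache() now live at module level so every client can share them, and TransformerRerankerModelClient no longer hard-codes its transformers classes: the constructor takes auto_model and auto_tokenizer, defaulting to AutoModelForSequenceClassification and AutoTokenizer. A minimal usage sketch of the new constructor, following the signature in this patch (checkpoint name and documents are only illustrative):

    from transformers import AutoModelForSequenceClassification, AutoTokenizer

    from adalflow.components.model_client.transformers_client import (
        TransformerRerankerModelClient,
    )

    # Defaults are unchanged; any compatible sequence-classification class and
    # tokenizer class can be passed in instead of the stock Auto* classes.
    reranker_client = TransformerRerankerModelClient(
        auto_model=AutoModelForSequenceClassification,
        auto_tokenizer=AutoTokenizer,
        tokenizer_kwargs={"padding": True},
    )

    api_kwargs = reranker_client.convert_inputs_to_api_kwargs(
        input="what is panda?",
        model_kwargs={
            "model": "BAAI/bge-reranker-base",
            "documents": ["doc one", "doc two"],
            "top_k": 2,
        },
    )
    # call() returns (top_k_indices, top_k_scores), as asserted in the tests.
    top_k_indices, top_k_scores = reranker_client.call(api_kwargs)

Swapping in a different sequence-classification or tokenizer class (for example a custom subclass) now needs no change to the client itself.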
--- .../model_client/transformers_client.py | 57 ++++++++++--------- 1 file changed, 30 insertions(+), 27 deletions(-) diff --git a/adalflow/adalflow/components/model_client/transformers_client.py b/adalflow/adalflow/components/model_client/transformers_client.py index 8d97afd8..846f997d 100644 --- a/adalflow/adalflow/components/model_client/transformers_client.py +++ b/adalflow/adalflow/components/model_client/transformers_client.py @@ -50,6 +50,30 @@ def mean_pooling(model_output: dict, attention_mask) -> Tensor: input_mask_expanded = attention_mask.unsqueeze(-1).expand(token_embeddings.size()).float() return torch.sum(token_embeddings * input_mask_expanded, 1) / torch.clamp(input_mask_expanded.sum(1), min=1e-9) +def get_device(): + # Check device availability and set the device + if torch.cuda.is_available(): + device = torch.device("cuda") + log.info("Using CUDA (GPU) for inference.") + elif torch.backends.mps.is_available(): + device = torch.device("mps") + log.info("Using MPS (Apple Silicon) for inference.") + else: + device = torch.device("cpu") + log.info("Using CPU for inference.") + + return device + + +def clean_device_cache(): + import torch + + if torch.backends.mps.is_built(): + torch.mps.empty_cache() + + torch.mps.set_per_process_memory_fraction(1.0) + + class TransformerEmbeddingModelClient(ModelClient): # @@ -536,9 +560,13 @@ class TransformerRerankerModelClient(ModelClient): def __init__( self, model_name: Optional[str] = None, + auto_model: Optional[type] = AutoModelForSequenceClassification, + auto_tokenizer: Optional[type] = AutoTokenizer, tokenizer_kwargs: Optional[dict] = {}, local_files_only: Optional[bool] = False ): + self.auto_model = auto_model + self.auto_tokenizer= auto_tokenizer self.model_name = model_name self.tokenizer_kwargs = tokenizer_kwargs if "return_tensors" not in self.tokenizer_kwargs: @@ -549,12 +577,12 @@ def __init__( def init_model(self, model_name: str): try: - self.tokenizer = AutoTokenizer.from_pretrained( + self.tokenizer = self.auto_tokenizer.from_pretrained( self.model_name, local_files_only=self.local_files_only, **self.tokenizer_kwargs ) - self.model = AutoModelForSequenceClassification.from_pretrained( + self.model = self.auto_model.from_pretrained( self.model_name, local_files_only=self.local_files_only ) @@ -739,31 +767,6 @@ def convert_inputs_to_api_kwargs( # else: # raise ValueError(f"model {model_name} is not supported") - -# def get_device(): -# # Check device availability and set the device -# if torch.cuda.is_available(): -# device = torch.device("cuda") -# log.info("Using CUDA (GPU) for inference.") -# elif torch.backends.mps.is_available(): -# device = torch.device("mps") -# log.info("Using MPS (Apple Silicon) for inference.") -# else: -# device = torch.device("cpu") -# log.info("Using CPU for inference.") - -# return device - - -# def clean_device_cache(): -# import torch - -# if torch.has_mps: -# torch.mps.empty_cache() - -# torch.mps.set_per_process_memory_fraction(1.0) - - # class TransformerReranker: # __doc__ = r"""Local model SDK for a reranker model using transformers. From 1bd854538b9a186c52cffcc846a604ecd67e400d Mon Sep 17 00:00:00 2001 From: Alexandre Adjahossou Date: Mon, 9 Sep 2024 14:03:18 +0000 Subject: [PATCH 23/36] Deleted code for the old TransformerClientClass. 
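
The commented-out TransformerEmbedder, TransformerReranker, TransformerLLM and TransformersClient blocks are dead code now that the dedicated clients cover the same ground: TransformerEmbeddingModelClient for embeddings, TransformerRerankerModelClient for reranking, TransformerLLMModelClient for generation. A rough usage sketch of the replacements, mirroring the patterns exercised in the updated tests; model names are illustrative and the model_kwargs contents are assumptions:

    from adalflow.components.model_client.transformers_client import (
        TransformerLLMModelClient,
        TransformerRerankerModelClient,
    )

    # Generation: was TransformersClient wrapping TransformerLLM.
    llm_client = TransformerLLMModelClient()
    llm_kwargs = llm_client.convert_inputs_to_api_kwargs(
        "What is the capital of France?", {"model": "google/gemma-2-2b"}
    )
    responses = llm_client.call(llm_kwargs)  # list of generated strings

    # Reranking: was TransformersClient wrapping TransformerReranker.
    reranker_client = TransformerRerankerModelClient(
        tokenizer_kwargs={"padding": True}
    )
    rerank_kwargs = reranker_client.convert_inputs_to_api_kwargs(
        input="what is panda?",
        model_kwargs={
            "model": "BAAI/bge-reranker-base",
            "documents": ["doc one", "doc two"],
            "top_k": 2,
        },
    )
    top_k_indices, top_k_scores = reranker_client.call(rerank_kwargs)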
--- .../model_client/transformers_client.py | 721 ------------------ 1 file changed, 721 deletions(-) diff --git a/adalflow/adalflow/components/model_client/transformers_client.py b/adalflow/adalflow/components/model_client/transformers_client.py index 846f997d..19890eba 100644 --- a/adalflow/adalflow/components/model_client/transformers_client.py +++ b/adalflow/adalflow/components/model_client/transformers_client.py @@ -679,724 +679,3 @@ def convert_inputs_to_api_kwargs( assert "top_k" in final_model_kwargs, "top_k must be specified" final_model_kwargs["query"] = input return final_model_kwargs - - - -# # TODO: provide a standard api for embedding and chat models used in local model SDKs -# class TransformerEmbedder: -# """Local model SDK for transformers. - - -# There are two ways to run transformers: -# (1) model and then run model inference -# (2) Pipeline and then run pipeline inference - -# This file demonstrates how to -# (1) create a torch model inference component: TransformerEmbedder which equalize to OpenAI(), the SyncAPIClient -# (2) Convert this model inference component to LightRAG API client: TransformersClient - -# The is now just an exmplary component that initialize a certain model from transformers and run inference on it. -# It is not tested on all transformer models yet. It might be necessary to write one for each model. - -# References: -# - transformers: https://huggingface.co/docs/transformers/en/index -# - thenlper/gte-base model:https://huggingface.co/thenlper/gte-base -# """ - -# models: Dict[str, type] = {} - -# def __init__(self, model_name: Optional[str] = "thenlper/gte-base"): -# super().__init__() - -# if model_name is not None: -# self.init_model(model_name=model_name) - -# @lru_cache(None) -# def init_model(self, model_name: str): -# try: -# self.tokenizer = AutoTokenizer.from_pretrained(model_name) -# self.model = AutoModel.from_pretrained(model_name) -# # register the model -# self.models[model_name] = self.model -# log.info(f"Done loading model {model_name}") - -# except Exception as e: -# log.error(f"Error loading model {model_name}: {e}") -# raise e - -# def infer_gte_base_embedding( -# self, -# input=Union[str, List[str]], -# tolist: bool = True, -# ): -# model = self.models.get("thenlper/gte-base", None) -# if model is None: -# # initialize the model -# self.init_model("thenlper/gte-base") - -# if isinstance(input, str): -# input = [input] -# # Tokenize the input texts -# batch_dict = self.tokenizer( -# input, max_length=512, padding=True, truncation=True, return_tensors="pt" -# ) -# outputs = model(**batch_dict) -# embeddings = average_pool( -# outputs.last_hidden_state, batch_dict["attention_mask"] -# ) -# # (Optionally) normalize embeddings -# embeddings = F.normalize(embeddings, p=2, dim=1) -# if tolist: -# embeddings = embeddings.tolist() -# return embeddings - -# def __call__(self, **kwargs): -# if "model" not in kwargs: -# raise ValueError("model is required") - -# if "mock" in kwargs and kwargs["mock"]: -# import numpy as np - -# embeddings = np.array([np.random.rand(768).tolist()]) -# return embeddings -# # load files and models, cache it for the next inference -# model_name = kwargs["model"] -# # inference the model -# if model_name == "thenlper/gte-base": -# return self.infer_gte_base_embedding(kwargs["input"]) -# else: -# raise ValueError(f"model {model_name} is not supported") - -# class TransformerReranker: -# __doc__ = r"""Local model SDK for a reranker model using transformers. 
- -# References: -# - model: https://huggingface.co/BAAI/bge-reranker-base -# - paper: https://arxiv.org/abs/2309.07597 - -# note: -# If you are using Macbook M1 series chips, you need to ensure ``torch.device("mps")`` is set. -# """ -# models: Dict[str, type] = {} - -# def __init__(self, model_name: Optional[str] = "BAAI/bge-reranker-base"): -# self.model_name = model_name or "BAAI/bge-reranker-base" -# if model_name is not None: -# self.init_model(model_name=model_name) - -# def init_model(self, model_name: str): -# try: -# self.tokenizer = AutoTokenizer.from_pretrained(model_name) -# self.model = AutoModelForSequenceClassification.from_pretrained(model_name) -# # Check device availability and set the device -# device = get_device() - -# # Move model to the selected device -# self.device = device -# self.model.to(device) -# self.model.eval() -# # register the model -# self.models[model_name] = self.model # TODO: better model registration -# log.info(f"Done loading model {model_name}") - -# except Exception as e: -# log.error(f"Error loading model {model_name}: {e}") -# raise e - -# def infer_bge_reranker_base( -# self, -# # input=List[Tuple[str, str]], # list of pairs of the query and the candidate -# query: str, -# documents: List[str], -# ) -> List[float]: -# model = self.models.get(self.model_name, None) -# if model is None: -# # initialize the model -# self.init_model(self.model_name) - -# # convert the query and documents to pair input -# input = [(query, doc) for doc in documents] - -# with torch.no_grad(): - -# inputs = self.tokenizer( -# input, -# padding=True, -# truncation=True, -# return_tensors="pt", -# max_length=512, -# ) -# inputs = {k: v.to(self.device) for k, v in inputs.items()} -# scores = ( -# model(**inputs, return_dict=True) -# .logits.view( -# -1, -# ) -# .float() -# ) -# # apply sigmoid to get the scores -# scores = F.sigmoid(scores) - -# scores = scores.tolist() -# return scores - -# def __call__(self, **kwargs): -# r"""Ensure "model" and "input" are in the kwargs.""" -# if "model" not in kwargs: -# raise ValueError("model is required") - -# # if "mock" in kwargs and kwargs["mock"]: -# # import numpy as np - -# # scores = np.array([np.random.rand(1).tolist()]) -# # return scores -# # load files and models, cache it for the next inference -# model_name = kwargs["model"] -# # inference the model -# if model_name == self.model_name: -# assert "query" in kwargs, "query is required" -# assert "documents" in kwargs, "documents is required" -# scores = self.infer_bge_reranker_base(kwargs["query"], kwargs["documents"]) -# return scores -# else: -# raise ValueError(f"model {model_name} is not supported") - - -# class TransformerLLM: -# __doc__ = r"""Local model SDK for transformers LLM. - -# NOTE: -# This inference component is only specific to the HuggingFaceH4/zephyr-7b-beta model. - -# The example raw output: -# # <|system|> -# # You are a friendly chatbot who always responds in the style of a pirate. -# # <|user|> -# # How many helicopters can a human eat in one sitting? -# # <|assistant|> -# # Ah, me hearty matey! But yer question be a puzzler! A human cannot eat a helicopter in one sitting, as helicopters are not edible. They be made of metal, plastic, and other materials, not food! 
- - -# References: -# - model: https://huggingface.co/HuggingFaceH4/zephyr-7b-beta -# - https://huggingface.co/google/gemma-2b -# - https://huggingface.co/google/gemma-2-2b - -# """ -# models: Dict[str, type] = {} # to register the model -# tokenizer: Dict[str, type] = {} - -# model_to_init_func = { -# "HuggingFaceH4/zephyr-7b-beta": "use_pipeline", -# "google/gemma-2-2b": "use_pipeline", -# } - -# def __init__( -# self, -# model_name: Optional[str] = None, -# ): -# super().__init__() - -# self.model_name = model_name # current model to use - -# if model_name is not None and model_name not in self.models: -# self.init_model(model_name=model_name) - -# def _check_token(self, token: str): -# import os - -# if os.getenv(token) is None: -# warnings.warn( -# f"{token} is not set. You may not be able to access the model." -# ) - -# def _init_from_pipeline(self, model_name: str): -# from transformers import pipeline - -# clean_device_cache() -# self._check_token("HF_TOKEN") -# try: -# import os - -# pipe = pipeline( -# "text-generation", -# model=model_name, -# torch_dtype=torch.bfloat16, -# device=get_device(), -# token=os.getenv("HF_TOKEN"), -# ) -# self.models[model_name] = pipe -# except Exception as e: -# log.error(f"Error loading model {model_name}: {e}") -# raise e - -# def _init_from_automodelcasual_lm(self, model_name: str): -# try: -# from transformers import AutoTokenizer, AutoModelForCausalLM -# except ImportError: -# raise ImportError( -# "transformers is not installed. Please install it with `pip install transformers`" -# ) - -# try: -# import os - -# if os.getenv("HF_TOKEN") is None: -# warnings.warn( -# "HF_TOKEN is not set. You may not be able to access the model." -# ) - -# tokenizer = AutoTokenizer.from_pretrained( -# model_name, token=os.getenv("HF_TOKEN") -# ) -# model = AutoModelForCausalLM.from_pretrained( -# model_name, -# torch_dtype=torch.bfloat16, -# device_map="auto", -# token=os.getenv("HF_TOKEN"), -# ) -# self.models[model_name] = model -# self.tokenizer[model_name] = tokenizer -# except Exception as e: -# log.error(f"Error loading model {model_name}: {e}") -# raise e - -# @lru_cache(None) -# def init_model(self, model_name: str): -# log.debug(f"Loading model {model_name}") - -# model_setup = self.model_to_init_func.get(model_name, None) -# if model_setup: -# if model_setup == "use_pipeline": -# self._init_from_pipeline(model_name) -# else: -# self._init_from_automodelcasual_lm(model_name) -# else: -# raise ValueError(f"Model {model_name} is not supported") - -# def _parse_chat_completion_from_pipeline(self, completion: Any) -> str: - -# text = completion[0]["generated_text"] - -# pattern = r"(?<=\|assistant\|>).*" - -# match = re.search(pattern, text) - -# if match: -# text = match.group().strip().lstrip("\\n") -# return text -# else: -# return "" - -# def _parse_chat_completion_from_automodelcasual_lm(self, completion: Any) -> str: -# print(f"completion: {completion}") -# return completion[0] - -# def parse_chat_completion(self, completion: Any) -> str: -# model_name = self.model_name -# model_setup = self.model_to_init_func.get(model_name, None) -# if model_setup: -# if model_setup == "use_pipeline": -# return self._parse_chat_completion_from_pipeline(completion) -# else: -# return self._parse_chat_completion_from_automodelcasual_lm(completion) -# else: -# raise ValueError(f"Model {model_name} is not supported") - -# def _infer_from_pipeline( -# self, -# *, -# model: str, -# messages: Sequence[Dict[str, str]], -# max_tokens: Optional[int] = None, -# **kwargs, -# 
): -# if not model: -# raise ValueError("Model is not provided.") - -# if model not in self.models: -# self.init_model(model_name=model) - -# model_to_use = self.models[model] - -# log.info( -# f"Start to infer model {model}, messages: {messages}, kwargs: {kwargs}" -# ) - -# if model == "HuggingFaceH4/zephyr-7b-beta": - -# prompt = model_to_use.tokenizer.apply_chat_template( -# messages, tokenize=False, add_generation_prompt=True -# ) - -# final_kwargs = { -# "max_new_tokens": max_tokens or 256, -# "do_sample": True, -# "temperature": kwargs.get("temperature", 0.7), -# "top_k": kwargs.get("top_k", 50), -# "top_p": kwargs.get("top_p", 0.95), -# } -# outputs = model_to_use(prompt, **final_kwargs) -# elif model == "google/gemma-2-2b": -# final_kwargs = { -# "max_new_tokens": max_tokens or 256, -# "do_sample": True, -# "temperature": kwargs.get("temperature", 0.7), -# "top_k": kwargs.get("top_k", 50), -# "top_p": kwargs.get("top_p", 0.95), -# } -# text = messages[0]["content"] -# outputs = model_to_use( -# text, -# **final_kwargs, -# ) - -# log.info(f"Outputs: {outputs}") -# return outputs - -# def _infer_from_automodelcasual_lm( -# self, -# *, -# model: str, -# messages: Sequence[Dict[str, str]], -# max_length: Optional[int] = 8192, # model-agnostic -# **kwargs, -# ): -# if not model: -# raise ValueError("Model is not provided.") -# if model not in self.models: -# self.init_model(model_name=model) -# model_to_use = self.models[model] -# tokenizer_to_use = self.tokenizer[model] - -# input_ids = tokenizer_to_use(messages[0]["content"], return_tensors="pt").to( -# get_device() -# ) -# print(input_ids) -# outputs_tokens = model_to_use.generate(**input_ids, max_length=max_length) -# outputs = [] -# for i, output in enumerate(outputs_tokens): -# outputs.append(tokenizer_to_use.decode(output)) -# return outputs - -# def infer_llm( -# self, -# *, -# model: str, -# messages: Sequence[Dict[str, str]], -# max_tokens: Optional[int] = None, -# **kwargs, -# ): -# # TODO: generalize the code for more models -# model_setup = self.model_to_init_func.get(model, None) -# if model_setup: -# if model_setup == "use_pipeline": -# return self._infer_from_pipeline( -# model=model, messages=messages, max_tokens=max_tokens, **kwargs -# ) -# else: -# return self._infer_from_automodelcasual_lm( -# model=model, messages=messages, max_tokens=max_tokens, **kwargs -# ) -# else: -# raise ValueError(f"Model {model} is not supported") - -# def __call__(self, **kwargs): -# r"""Ensure "model" and "input" are in the kwargs.""" -# log.debug(f"kwargs: {kwargs}") -# if "model" not in kwargs: -# raise ValueError("model is required") - -# if "messages" not in kwargs: -# raise ValueError("messages is required") - -# model_name = kwargs["model"] -# if model_name != self.model_name: -# # need to initialize the model and update the model_name -# self.model_name = model_name -# self.init_model(model_name=model_name) - -# output = self.infer_llm(**kwargs) -# return output - - -# class TransformersClient(ModelClient): -# __doc__ = r"""LightRAG API client for transformers. - -# Use: ``ls ~/.cache/huggingface/hub `` to see the cached models. - -# Some modeles are gated, you will need to their page to get the access token. -# Find how to apply tokens here: https://huggingface.co/docs/hub/security-tokens -# Once you have a token and have access, put the token in the environment variable HF_TOKEN. 
-# """ - -# support_models = { -# "thenlper/gte-base": { -# "type": ModelType.EMBEDDER, -# }, -# "BAAI/bge-reranker-base": { -# "type": ModelType.RERANKER, -# }, -# "HuggingFaceH4/zephyr-7b-beta": {"type": ModelType.LLM}, -# "google/gemma-2-2b": {"type": ModelType.LLM}, -# } - -# def __init__(self, model_name: Optional[str] = None) -> None: -# super().__init__() -# self._model_name = model_name -# if self._model_name: -# assert ( -# self._model_name in self.support_models -# ), f"model {self._model_name} is not supported" -# if self._model_name == "thenlper/gte-base": -# self.sync_client = self.init_sync_client() -# elif self._model_name == "BAAI/bge-reranker-base": -# self.reranker_client = self.init_reranker_client() -# elif self._model_name == "HuggingFaceH4/zephyr-7b-beta": -# self.llm_client = self.init_llm_client() -# self.async_client = None - -# def init_sync_client(self): -# return TransformerEmbedder() - -# def init_reranker_client(self): -# return TransformerReranker() - -# def init_llm_client(self): -# return TransformerLLM() - -# def set_llm_client(self, llm_client: object): -# r"""Allow user to pass a custom llm client. Here is an example of a custom llm client: - -# Ensure you have parse_chat_completion and __call__ methods which will be applied to api_kwargs specified in transform_client.call(). - -# .. code-block:: python - -# class CustomizeLLM: - -# def __init__(self) -> None: -# pass - -# def parse_chat_completion(self, completion: Any) -> str: -# return completion - -# def __call__(self, messages: Sequence[Dict[str, str]], model: str, **kwargs): -# from transformers import AutoTokenizer, AutoModelForCausalLM - -# tokenizer = AutoTokenizer.from_pretrained( -# "deepseek-ai/deepseek-coder-1.3b-instruct", trust_remote_code=True -# ) -# model = AutoModelForCausalLM.from_pretrained( -# "deepseek-ai/deepseek-coder-1.3b-instruct", -# trust_remote_code=True, -# torch_dtype=torch.bfloat16, -# ).to(get_device()) -# messages = [ -# {"role": "user", "content": "write a quick sort algorithm in python."} -# ] -# inputs = tokenizer.apply_chat_template( -# messages, add_generation_prompt=True, return_tensors="pt" -# ).to(model.device) -# # tokenizer.eos_token_id is the id of <|EOT|> token -# outputs = model.generate( -# inputs, -# max_new_tokens=512, -# do_sample=False, -# top_k=50, -# top_p=0.95, -# num_return_sequences=1, -# eos_token_id=tokenizer.eos_token_id, -# ) -# print( -# tokenizer.decode(outputs[0][len(inputs[0]) :], skip_special_tokens=True) -# ) -# decoded_outputs = [] -# for output in outputs: -# decoded_outputs.append( -# tokenizer.decode(output[len(inputs[0]) :], skip_special_tokens=True) -# ) -# return decoded_outputs - -# llm_client = CustomizeLLM() -# transformer_client.set_llm_client(llm_client) -# # use in the generator -# generator = Generator( -# model_client=transformer_client, -# model_kwargs=model_kwargs, -# prompt_kwargs=prompt_kwargs, -# ...) 
- -# """ -# self.llm_client = llm_client - -# def parse_embedding_response(self, response: Any) -> EmbedderOutput: -# embeddings: List[Embedding] = [] -# for idx, emb in enumerate(response): -# embeddings.append(Embedding(index=idx, embedding=emb)) -# response = EmbedderOutput(data=embeddings) -# return response - -# def parse_chat_completion(self, completion: Any) -> GeneratorOutput: -# try: -# output = self.llm_client.parse_chat_completion(completion) - -# return GeneratorOutput(data=output, raw_response=str(completion)) -# except Exception as e: -# log.error(f"Error parsing chat completion: {e}") -# return GeneratorOutput(data=None, raw_response=str(completion), error=e) - -# def call(self, api_kwargs: Dict = {}, model_type: ModelType = ModelType.UNDEFINED): -# if "model" not in api_kwargs: -# raise ValueError("model must be specified in api_kwargs") -# if api_kwargs["model"] not in self.support_models: -# raise ValueError(f"model {api_kwargs['model']} is not supported") - -# if ( -# model_type == ModelType.EMBEDDER -# and "model" in api_kwargs -# and api_kwargs["model"] == "thenlper/gte-base" -# ): -# if self.sync_client is None: -# self.sync_client = self.init_sync_client() -# return self.sync_client(**api_kwargs) -# elif ( # reranker -# model_type == ModelType.RERANKER -# and "model" in api_kwargs -# and api_kwargs["model"] == "BAAI/bge-reranker-base" -# ): -# if not hasattr(self, "reranker_client") or self.reranker_client is None: -# self.reranker_client = self.init_reranker_client() -# scores = self.reranker_client(**api_kwargs) -# top_k_indices, top_k_scores = get_top_k_indices_scores( -# scores, api_kwargs["top_k"] -# ) -# return top_k_indices, top_k_scores -# elif model_type == ModelType.LLM and "model" in api_kwargs: # LLM -# if not hasattr(self, "llm_client") or self.llm_client is None: -# self.llm_client = self.init_llm_client() -# response = self.llm_client(**api_kwargs) -# return response -# else: -# raise ValueError(f"model_type {model_type} is not supported") - -# def convert_inputs_to_api_kwargs( -# self, -# input: Any, # for retriever, it is a single query, -# model_kwargs: dict = {}, -# model_type: ModelType = ModelType.UNDEFINED, -# ) -> dict: -# final_model_kwargs = model_kwargs.copy() -# if model_type == ModelType.EMBEDDER: -# final_model_kwargs["input"] = input -# return final_model_kwargs -# elif model_type == ModelType.RERANKER: -# assert "model" in final_model_kwargs, "model must be specified" -# assert "documents" in final_model_kwargs, "documents must be specified" -# assert "top_k" in final_model_kwargs, "top_k must be specified" -# final_model_kwargs["query"] = input -# return final_model_kwargs -# elif model_type == ModelType.LLM: -# assert "model" in final_model_kwargs, "model must be specified" -# messages = [{"role": "system", "content": input}] -# final_model_kwargs["messages"] = messages -# return final_model_kwargs -# else: -# raise ValueError(f"model_type {model_type} is not supported") - - -# if __name__ == "__main__": -# from adalflow.core import Generator - -# import adalflow as adal - -# adal.setup_env() - -# rag_template = r""" -# You are a helpful assistant. - -# Your task is to answer the query that may or may not come with context information. -# When context is provided, you should stick to the context and less on your prior knowledge to answer the query. 
-# -# -# -# {{input_str}} -# -# {% if context_str %} -# -# {{context_str}} -# -# {% endif %} -# -# """ - -# template = """{{input_str}}""" - -# model_kwargs = { -# "model": "google/gemma-2-2b", -# "temperature": 1, -# "stream": False, -# } -# prompt_kwargs = { -# "input_str": "Where is Brian?", -# # "context_str": "Brian is in the kitchen.", -# } -# prompt_kwargs = { -# "input_str": "What is the capital of France?", -# } - -# class CustomizeLLM: - -# def __init__(self) -> None: -# pass - -# def parse_chat_completion(self, completion: Any) -> str: -# return completion[0] - -# def __call__(self, messages: Sequence[Dict[str, str]], model: str, **kwargs): -# r"""take api key""" -# from transformers import AutoTokenizer, AutoModelForCausalLM - -# tokenizer = AutoTokenizer.from_pretrained( -# "deepseek-ai/deepseek-coder-1.3b-instruct", trust_remote_code=True -# ) -# model = AutoModelForCausalLM.from_pretrained( -# "deepseek-ai/deepseek-coder-1.3b-instruct", -# trust_remote_code=True, -# torch_dtype=torch.bfloat16, -# ).to(get_device()) -# messages = [ -# {"role": "user", "content": "write a quick sort algorithm in python."} -# ] -# inputs = tokenizer.apply_chat_template( -# messages, add_generation_prompt=True, return_tensors="pt" -# ).to(model.device) -# # tokenizer.eos_token_id is the id of <|EOT|> token -# outputs = model.generate( -# inputs, -# max_new_tokens=512, -# do_sample=False, -# top_k=50, -# top_p=0.95, -# num_return_sequences=1, -# eos_token_id=tokenizer.eos_token_id, -# ) - -# decoded_outputs = [] -# for output in outputs: -# decoded_outputs.append( -# tokenizer.decode(output[len(inputs[0]) :], skip_special_tokens=True) -# ) -# return decoded_outputs - -# transformer_client = TransformersClient() -# transformer_client.set_llm_client(CustomizeLLM()) -# generator = Generator( -# model_client=transformer_client, -# model_kwargs=model_kwargs, -# # prompt_kwargs=prompt_kwargs, -# template=template, -# # output_processors=JsonParser(), -# ) - -# output = generator(prompt_kwargs=prompt_kwargs) -# print(output) From 0059bbfc57093c8bb2a0b289f919b0db70ec68bf Mon Sep 17 00:00:00 2001 From: Alexandre Adjahossou Date: Mon, 9 Sep 2024 14:04:46 +0000 Subject: [PATCH 24/36] Added __doc__ for the client classes. --- .../model_client/transformers_client.py | 21 +++++++++++++++++++ 1 file changed, 21 insertions(+) diff --git a/adalflow/adalflow/components/model_client/transformers_client.py b/adalflow/adalflow/components/model_client/transformers_client.py index 19890eba..f1ea0f7e 100644 --- a/adalflow/adalflow/components/model_client/transformers_client.py +++ b/adalflow/adalflow/components/model_client/transformers_client.py @@ -75,7 +75,14 @@ def clean_device_cache(): class TransformerEmbeddingModelClient(ModelClient): + __doc__ = r"""LightRAG API client for embedding models using HuggingFace's transformers library. + Use: ``ls ~/.cache/huggingface/hub `` to see the cached models. + + Some modeles are gated, you will need to their page to get the access token. + Find how to apply tokens here: https://huggingface.co/docs/hub/security-tokens + Once you have a token and have access, put the token in the environment variable HF_TOKEN. + """ # # Model initialisation # @@ -242,7 +249,14 @@ def convert_inputs_to_api_kwargs( class TransformerLLMModelClient(ModelClient): + __doc__ = r"""LightRAG API client for text generation models using HuggingFace's transformers library. + + Use: ``ls ~/.cache/huggingface/hub `` to see the cached models. 
+ Some modeles are gated, you will need to their page to get the access token. + Find how to apply tokens here: https://huggingface.co/docs/hub/security-tokens + Once you have a token and have access, put the token in the environment variable HF_TOKEN. + """ # # Model initialisation # @@ -553,7 +567,14 @@ def convert_inputs_to_api_kwargs( class TransformerRerankerModelClient(ModelClient): + __doc__ = r"""LightRAG API client for reranker (cross-encoder) models using HuggingFace's transformers library. + + Use: ``ls ~/.cache/huggingface/hub `` to see the cached models. + Some modeles are gated, you will need to their page to get the access token. + Find how to apply tokens here: https://huggingface.co/docs/hub/security-tokens + Once you have a token and have access, put the token in the environment variable HF_TOKEN. + """ # # Model initialisation # From 53d5384a75d2cea238c120fa9b2a18718e900a78 Mon Sep 17 00:00:00 2001 From: Alexandre Adjahossou Date: Tue, 10 Sep 2024 10:19:10 +0000 Subject: [PATCH 25/36] Formatting. --- .../model_client/transformers_client.py | 47 +++++++++++++------ 1 file changed, 33 insertions(+), 14 deletions(-) diff --git a/adalflow/adalflow/components/model_client/transformers_client.py b/adalflow/adalflow/components/model_client/transformers_client.py index f1ea0f7e..f6f8b239 100644 --- a/adalflow/adalflow/components/model_client/transformers_client.py +++ b/adalflow/adalflow/components/model_client/transformers_client.py @@ -6,7 +6,6 @@ import re import warnings - from adalflow.core.model_client import ModelClient from adalflow.core.types import GeneratorOutput, ModelType, Embedding, EmbedderOutput from adalflow.core.functional import get_top_k_indices_scores @@ -14,27 +13,29 @@ # optional import from adalflow.utils.lazy_import import safe_import, OptionalPackages - -transformers = safe_import( - OptionalPackages.TRANSFORMERS.value[0], OptionalPackages.TRANSFORMERS.value[1] -) -torch = safe_import(OptionalPackages.TORCH.value[0], OptionalPackages.TORCH.value[1]) - -import torch - import torch.nn.functional as F from torch import Tensor +import torch from transformers import ( + PreTrainedModel, + PreTrainedTokenizer, + PreTrainedTokenizerFast, AutoTokenizer, AutoModel, AutoModelForCausalLM, AutoModelForSequenceClassification, pipeline ) - from os import getenv as get_env_variable +transformers = safe_import( + OptionalPackages.TRANSFORMERS.value[0], + OptionalPackages.TRANSFORMERS.value[1] +) +torch = safe_import(OptionalPackages.TORCH.value[0], OptionalPackages.TORCH.value[1]) + + log = logging.getLogger(__name__) @@ -43,13 +44,12 @@ def average_pool(last_hidden_states: Tensor, attention_mask: list) -> Tensor: return last_hidden.sum(dim=1) / attention_mask.sum(dim=1)[..., None] -from transformers import PreTrainedModel, PreTrainedTokenizer, PreTrainedTokenizerFast - def mean_pooling(model_output: dict, attention_mask) -> Tensor: token_embeddings = model_output[0] #First element of model_output contains all token embeddings input_mask_expanded = attention_mask.unsqueeze(-1).expand(token_embeddings.size()).float() return torch.sum(token_embeddings * input_mask_expanded, 1) / torch.clamp(input_mask_expanded.sum(1), min=1e-9) + def get_device(): # Check device availability and set the device if torch.cuda.is_available(): @@ -131,6 +131,7 @@ def __init__( self.init_sync_client() + def init_sync_client(self): self.init_model( model_name=self.model_name, @@ -140,6 +141,7 @@ def init_sync_client(self): custom_tokenizer=self.custom_tokenizer ) + @lru_cache(None) def 
init_model( self, @@ -188,20 +190,24 @@ def infer_embedding( embeddings = embeddings.tolist() return embeddings + def handle_input(self, input: Union[str, List[str], List[List[str]]]) -> Union[List[str], List[List[str]]]: if isinstance(input, str): input = [input] return input + def tokenize_inputs(self, input: Union[str, List[str], List[List[str]]], kwargs: Optional[dict] = dict()) -> dict: batch_dict = self.tokenizer(input, **kwargs) return batch_dict + def compute_model_outputs(self, batch_dict: dict, model: PreTrainedModel) -> dict: with torch.no_grad(): outputs = model(**batch_dict) return outputs + def compute_embeddings(self, outputs: dict, batch_dict: dict): embeddings = mean_pooling( outputs, batch_dict["attention_mask"] @@ -229,6 +235,7 @@ def call(self, api_kwargs: Dict = {}, model_type: Optional[ModelType]= ModelType # inference the model return self.infer_embedding(api_kwargs["input"]) + def parse_embedding_response(self, response: Union[List, Tensor]) -> EmbedderOutput: embeddings: List[Embedding] = [] for idx, emb in enumerate(response): @@ -236,6 +243,7 @@ def parse_embedding_response(self, response: Union[List, Tensor]) -> EmbedderOut response = EmbedderOutput(data=embeddings) return response + def convert_inputs_to_api_kwargs( self, input: Any, # for retriever, it is a single query, @@ -289,12 +297,14 @@ def __init__( if model_name is not None: self.init_model(model_name=model_name) + def _check_token(self, token: str): if get_env_variable(token) is None: warnings.warn( f"{token} is not set. You may not be able to access the model." ) + def _get_token_if_relevant(self) -> Union[str, bool]: if self.use_token: self._check_token("HF_TOKEN") @@ -303,6 +313,7 @@ def _get_token_if_relevant(self) -> Union[str, bool]: token = False return token + def _init_from_pipeline(self): clean_device_cache() @@ -315,6 +326,7 @@ def _init_from_pipeline(self): token=token ) + def _init_from_automodelcasual_lm(self): token = self._get_token_if_relevant() # return a token str or False @@ -414,6 +426,7 @@ def _infer_from_pipeline( log.info(f"Outputs: {outputs}") return outputs + def _infer_from_automodelcasual_lm( self, *, @@ -447,6 +460,7 @@ def _infer_from_automodelcasual_lm( outputs.append(self.tokenizer.decode(output)) return outputs + def _handle_input( self, messages: Sequence[Dict[str, str]], @@ -470,6 +484,7 @@ def _handle_input( text = messages[-1]["content"] return text + def infer_llm( self, *, @@ -523,12 +538,11 @@ def call(self, api_kwargs: Dict = {}, model_type: Optional[ModelType]= ModelType output = self.infer_llm(**api_kwargs) return output + def _parse_chat_completion_from_pipeline(self, completion: Any) -> str: text = completion[0]["generated_text"] - pattern = r"(?<=\|assistant\|>).*" - match = re.search(pattern, text) if match: @@ -537,10 +551,12 @@ def _parse_chat_completion_from_pipeline(self, completion: Any) -> str: else: return "" + def _parse_chat_completion_from_automodelcasual_lm(self, completion: Any) -> GeneratorOutput: print(f"completion: {completion}") return completion[0] + def parse_chat_completion(self, completion: Any) -> str: try: if self.init_from == "pipeline": @@ -552,6 +568,7 @@ def parse_chat_completion(self, completion: Any) -> str: log.error(f"Error parsing chat completion: {e}") return GeneratorOutput(data=None, raw_response=str(completion), error=e) + def convert_inputs_to_api_kwargs( self, input: Any, # for retriever, it is a single query, @@ -596,6 +613,7 @@ def __init__( if model_name is not None: self.init_model(model_name=model_name) + def 
init_model(self, model_name: str): try: self.tokenizer = self.auto_tokenizer.from_pretrained( @@ -687,6 +705,7 @@ def call(self, api_kwargs: Dict = {}): log.warning(f"output: ({top_k_indices}, {top_k_scores})") return top_k_indices, top_k_scores + def convert_inputs_to_api_kwargs( self, input: Any, # for retriever, it is a single query, From be8041c67cdfc35dde0e09bf8267ad2ec7486f15 Mon Sep 17 00:00:00 2001 From: Alexandre Adjahossou Date: Tue, 10 Sep 2024 10:58:59 +0000 Subject: [PATCH 26/36] Added example for transformers_client module + fixed import. --- tutorials/model_client.ipynb | 170 ++++++++++++++++++++++++++++++++++- 1 file changed, 167 insertions(+), 3 deletions(-) diff --git a/tutorials/model_client.ipynb b/tutorials/model_client.ipynb index 60ea6585..f3f302d9 100644 --- a/tutorials/model_client.ipynb +++ b/tutorials/model_client.ipynb @@ -24,9 +24,9 @@ } ], "source": [ - "from lightrag.components.model_client import OpenAIClient\n", - "from lightrag.core.types import ModelType\n", - "from lightrag.utils import setup_env\n", + "from adalflow.components.model_client import OpenAIClient\n", + "from adalflow.core.types import ModelType\n", + "from adalflow.utils import setup_env\n", "\n", "openai_client = OpenAIClient()\n", "\n", @@ -61,6 +61,170 @@ "print(f\"reponse_embedder_output: {reponse_embedder_output}\")\n" ] }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "For local models, we can use the client classes from the `transformers_client` module:\n", + "- TransformerEmbeddingModelClient\n", + "- TransformerLLMModelClient\n", + "- TransformerRerankerModelClient" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "metadata": {}, + "outputs": [ + { + "ename": "ModuleNotFoundError", + "evalue": "No module named 'adalflow'", + "output_type": "error", + "traceback": [ + "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", + "\u001b[0;31mModuleNotFoundError\u001b[0m Traceback (most recent call last)", + "\u001b[1;32m/workspaces/transformer_client/AdalFlow/tutorials/model_client.ipynb Cell 4\u001b[0m line \u001b[0;36m1\n\u001b[0;32m----> 1\u001b[0m \u001b[39mfrom\u001b[39;00m \u001b[39madalflow\u001b[39;00m\u001b[39m.\u001b[39;00m\u001b[39mcomponents\u001b[39;00m\u001b[39m.\u001b[39;00m\u001b[39mmodel_client\u001b[39;00m\u001b[39m.\u001b[39;00m\u001b[39mtransformers_client\u001b[39;00m \u001b[39mimport\u001b[39;00m (\n\u001b[1;32m 2\u001b[0m TransformerEmbeddingModelClient,\n\u001b[1;32m 3\u001b[0m TransformerLLMModelClient,\n\u001b[1;32m 4\u001b[0m TransformerRerankerModelClient\n\u001b[1;32m 5\u001b[0m )\n", + "\u001b[0;31mModuleNotFoundError\u001b[0m: No module named 'adalflow'" + ] + } + ], + "source": [ + "from adalflow.components.model_client.transformers_client import (\n", + " TransformerEmbeddingModelClient,\n", + " TransformerLLMModelClient,\n", + " TransformerRerankerModelClient\n", + " )\n" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "metadata": {}, + "outputs": [], + "source": [ + "query = \"Where is Brian?\"\n", + "documents = [\n", + " \"Brian is in the kitchen.\",\n", + " \"I love Adalflow.\",\n", + " \"Brian too.\"\n", + "]" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "metadata": {}, + "outputs": [ + { + "ename": "NameError", + "evalue": "name 'TransformerEmbeddingModelClient' is not defined", + "output_type": "error", + "traceback": [ + "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", + 
"\u001b[0;31mNameError\u001b[0m Traceback (most recent call last)", + "\u001b[1;32m/workspaces/transformer_client/AdalFlow/tutorials/model_client.ipynb Cell 6\u001b[0m line \u001b[0;36m9\n\u001b[1;32m 2\u001b[0m model_kwargs \u001b[39m=\u001b[39m {\u001b[39m\"\u001b[39m\u001b[39mmodel\u001b[39m\u001b[39m\"\u001b[39m: embedding_model}\n\u001b[1;32m 3\u001b[0m tokenizer_kwargs \u001b[39m=\u001b[39m {\n\u001b[1;32m 4\u001b[0m \u001b[39m\"\u001b[39m\u001b[39mmax_length\u001b[39m\u001b[39m\"\u001b[39m: \u001b[39m512\u001b[39m,\n\u001b[1;32m 5\u001b[0m \u001b[39m\"\u001b[39m\u001b[39mpadding\u001b[39m\u001b[39m\"\u001b[39m: \u001b[39mTrue\u001b[39;00m,\n\u001b[1;32m 6\u001b[0m \u001b[39m\"\u001b[39m\u001b[39mtruncation\u001b[39m\u001b[39m\"\u001b[39m: \u001b[39mTrue\u001b[39;00m,\n\u001b[1;32m 7\u001b[0m \u001b[39m\"\u001b[39m\u001b[39mreturn_tensors\u001b[39m\u001b[39m\"\u001b[39m: \u001b[39m'\u001b[39m\u001b[39mpt\u001b[39m\u001b[39m'\u001b[39m\n\u001b[1;32m 8\u001b[0m }\n\u001b[0;32m----> 9\u001b[0m model_client \u001b[39m=\u001b[39m TransformerEmbeddingModelClient(\n\u001b[1;32m 10\u001b[0m model_name\u001b[39m=\u001b[39membedding_model,\n\u001b[1;32m 11\u001b[0m tokenizer_kwargs\u001b[39m=\u001b[39mtokenizer_kwargs\n\u001b[1;32m 12\u001b[0m )\n\u001b[1;32m 13\u001b[0m \u001b[39mprint\u001b[39m(\n\u001b[1;32m 14\u001b[0m \u001b[39mf\u001b[39m\u001b[39m\"\u001b[39m\u001b[39mTesting model client with model \u001b[39m\u001b[39m{\u001b[39;00membedding_model\u001b[39m}\u001b[39;00m\u001b[39m\"\u001b[39m\n\u001b[1;32m 15\u001b[0m )\n\u001b[1;32m 16\u001b[0m api_kwargs \u001b[39m=\u001b[39m model_client\u001b[39m.\u001b[39mconvert_inputs_to_api_kwargs(\u001b[39minput\u001b[39m\u001b[39m=\u001b[39mquery, model_kwargs\u001b[39m=\u001b[39mmodel_kwargs)\n", + "\u001b[0;31mNameError\u001b[0m: name 'TransformerEmbeddingModelClient' is not defined" + ] + } + ], + "source": [ + "embedding_model = \"thenlper/gte-base\"\n", + "model_kwargs = {\"model\": embedding_model}\n", + "tokenizer_kwargs = {\n", + " \"max_length\": 512,\n", + " \"padding\": True,\n", + " \"truncation\": True,\n", + " \"return_tensors\": 'pt'\n", + "}\n", + "model_client = TransformerEmbeddingModelClient(\n", + " model_name=embedding_model,\n", + " tokenizer_kwargs=tokenizer_kwargs\n", + ")\n", + "print(\n", + " f\"Testing model client with model {embedding_model}\"\n", + ")\n", + "api_kwargs = model_client.convert_inputs_to_api_kwargs(input=query, model_kwargs=model_kwargs)\n", + "print(f\"api_kwargs: {api_kwargs}\")\n", + "output = model_client.call(api_kwargs=api_kwargs)\n", + "print(output)" + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "metadata": {}, + "outputs": [ + { + "ename": "NameError", + "evalue": "name 'TransformerLLMModelClient' is not defined", + "output_type": "error", + "traceback": [ + "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", + "\u001b[0;31mNameError\u001b[0m Traceback (most recent call last)", + "\u001b[1;32m/workspaces/transformer_client/AdalFlow/tutorials/model_client.ipynb Cell 7\u001b[0m line \u001b[0;36m1\n\u001b[1;32m 6\u001b[0m tokenizer_kwargs \u001b[39m=\u001b[39m {\n\u001b[1;32m 7\u001b[0m \u001b[39m\"\u001b[39m\u001b[39mmax_length\u001b[39m\u001b[39m\"\u001b[39m: \u001b[39mTrue\u001b[39;00m,\n\u001b[1;32m 8\u001b[0m \u001b[39m\"\u001b[39m\u001b[39mtruncation\u001b[39m\u001b[39m\"\u001b[39m: \u001b[39mTrue\u001b[39;00m,\n\u001b[1;32m 9\u001b[0m }\n\u001b[1;32m 10\u001b[0m prompt_kwargs \u001b[39m=\u001b[39m {\n\u001b[1;32m 
11\u001b[0m \u001b[39m\"\u001b[39m\u001b[39minput_str\u001b[39m\u001b[39m\"\u001b[39m: query, \n\u001b[1;32m 12\u001b[0m }\n\u001b[0;32m---> 13\u001b[0m model_client \u001b[39m=\u001b[39m TransformerLLMModelClient(\n\u001b[1;32m 14\u001b[0m tokenizer_kwargs\u001b[39m=\u001b[39mtokenizer_kwargs,\n\u001b[1;32m 15\u001b[0m local_files_only\u001b[39m=\u001b[39m\u001b[39mFalse\u001b[39;00m,\n\u001b[1;32m 16\u001b[0m init_from\u001b[39m=\u001b[39m\u001b[39m\"\u001b[39m\u001b[39mautoclass\u001b[39m\u001b[39m\"\u001b[39m,\n\u001b[1;32m 17\u001b[0m )\n\u001b[1;32m 18\u001b[0m api_kwargs \u001b[39m=\u001b[39m model_client\u001b[39m.\u001b[39mconvert_inputs_to_api_kwargs(\u001b[39minput\u001b[39m\u001b[39m=\u001b[39mquery, model_kwargs\u001b[39m=\u001b[39mmodel_kwargs)\n\u001b[1;32m 19\u001b[0m \u001b[39mprint\u001b[39m(\u001b[39mf\u001b[39m\u001b[39m\"\u001b[39m\u001b[39mapi_kwargs: \u001b[39m\u001b[39m{\u001b[39;00mapi_kwargs\u001b[39m}\u001b[39;00m\u001b[39m\"\u001b[39m)\n", + "\u001b[0;31mNameError\u001b[0m: name 'TransformerLLMModelClient' is not defined" + ] + } + ], + "source": [ + "model_kwargs = {\n", + " \"model\": \"roneneldan/TinyStories-1M\",\n", + " \"temperature\": 0.1,\n", + " \"do_sample\": True\n", + "}\n", + "tokenizer_kwargs = {\n", + " \"max_length\": True,\n", + " \"truncation\": True,\n", + "}\n", + "prompt_kwargs = {\n", + " \"input_str\": query, \n", + "}\n", + "model_client = TransformerLLMModelClient(\n", + " tokenizer_kwargs=tokenizer_kwargs,\n", + " local_files_only=False,\n", + " init_from=\"autoclass\",\n", + " )\n", + "api_kwargs = model_client.convert_inputs_to_api_kwargs(input=query, model_kwargs=model_kwargs)\n", + "print(f\"api_kwargs: {api_kwargs}\")\n", + "output = model_client.call(api_kwargs=api_kwargs)\n", + "print(f\"reponse_embedder_output: {reponse_embedder_output}\")" + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "metadata": {}, + "outputs": [ + { + "ename": "NameError", + "evalue": "name 'TransformerRerankerModelClient' is not defined", + "output_type": "error", + "traceback": [ + "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", + "\u001b[0;31mNameError\u001b[0m Traceback (most recent call last)", + "\u001b[1;32m/workspaces/transformer_client/AdalFlow/tutorials/model_client.ipynb Cell 8\u001b[0m line \u001b[0;36m2\n\u001b[1;32m 1\u001b[0m transformer_reranker_model \u001b[39m=\u001b[39m \u001b[39m\"\u001b[39m\u001b[39mBAAI/bge-reranker-base\u001b[39m\u001b[39m\"\u001b[39m\n\u001b[0;32m----> 2\u001b[0m model_client \u001b[39m=\u001b[39m TransformerRerankerModelClient(\n\u001b[1;32m 3\u001b[0m tokenizer_kwargs\u001b[39m=\u001b[39m{\u001b[39m\"\u001b[39m\u001b[39mpadding\u001b[39m\u001b[39m\"\u001b[39m: \u001b[39mTrue\u001b[39;00m}\n\u001b[1;32m 4\u001b[0m )\n\u001b[1;32m 6\u001b[0m model_kwargs \u001b[39m=\u001b[39m {\n\u001b[1;32m 7\u001b[0m \u001b[39m\"\u001b[39m\u001b[39mmodel\u001b[39m\u001b[39m\"\u001b[39m: transformer_reranker_model,\n\u001b[1;32m 8\u001b[0m \u001b[39m\"\u001b[39m\u001b[39mdocuments\u001b[39m\u001b[39m\"\u001b[39m: documents,\n\u001b[1;32m 9\u001b[0m \u001b[39m\"\u001b[39m\u001b[39mtop_k\u001b[39m\u001b[39m\"\u001b[39m: \u001b[39m2\u001b[39m,\n\u001b[1;32m 10\u001b[0m }\n\u001b[1;32m 12\u001b[0m api_kwargs \u001b[39m=\u001b[39m model_client\u001b[39m.\u001b[39mconvert_inputs_to_api_kwargs(query, model_kwargs\u001b[39m=\u001b[39mmodel_kwargs)\n", + "\u001b[0;31mNameError\u001b[0m: name 'TransformerRerankerModelClient' is not defined" + ] + } + ], + "source": [ + 
"transformer_reranker_model = \"BAAI/bge-reranker-base\"\n", + "model_client = TransformerRerankerModelClient(\n", + " tokenizer_kwargs={\"padding\": True}\n", + ")\n", + "\n", + "model_kwargs = {\n", + " \"model\": transformer_reranker_model,\n", + " \"documents\": documents,\n", + " \"top_k\": 2,\n", + "}\n", + "\n", + "api_kwargs = model_client.convert_inputs_to_api_kwargs(query, model_kwargs=model_kwargs)\n", + "print(f\"api_kwargs: {api_kwargs}\")\n", + "output = model_client.call(api_kwargs)\n", + "print(f\"reponse_embedder_output: {reponse_embedder_output}\")" + ] + }, { "cell_type": "markdown", "metadata": {}, From f4abfeb9683e266a9208602cb74cf4b3aee04965 Mon Sep 17 00:00:00 2001 From: Alexandre Adjahossou Date: Tue, 10 Sep 2024 11:54:20 +0000 Subject: [PATCH 27/36] Restored originial file. --- adalflow/adalflow/core/embedder.py | 10 +++--- adalflow/adalflow/core/generator.py | 51 ++++++++++++++++------------- 2 files changed, 34 insertions(+), 27 deletions(-) diff --git a/adalflow/adalflow/core/embedder.py b/adalflow/adalflow/core/embedder.py index c8f069a9..588347ba 100644 --- a/adalflow/adalflow/core/embedder.py +++ b/adalflow/adalflow/core/embedder.py @@ -100,7 +100,8 @@ def _pre_call( # step 2: convert the input to the api_kwargs api_kwargs = self.model_client.convert_inputs_to_api_kwargs( input=input, - model_kwargs=composed_model_kwargs + model_kwargs=composed_model_kwargs, + model_type=self.model_type, ) log.debug(f"api_kwargs: {api_kwargs}") return api_kwargs @@ -139,8 +140,7 @@ def call( response = None try: response = self.model_client.call( - api_kwargs=api_kwargs, - model_type=self.model_type + api_kwargs=api_kwargs, model_type=self.model_type ) except Exception as e: log.error(f"Error calling the model: {e}") @@ -169,7 +169,7 @@ async def acall( response = None try: response = await self.model_client.acall( - api_kwargs=api_kwargs + api_kwargs=api_kwargs, model_type=self.model_type ) except Exception as e: log.error(f"Error calling the model: {e}") @@ -231,4 +231,4 @@ def call( input=batch_input, model_kwargs=model_kwargs ) embeddings.append(batch_output) - return embeddings + return embeddings \ No newline at end of file diff --git a/adalflow/adalflow/core/generator.py b/adalflow/adalflow/core/generator.py index a89eb34b..7868b44e 100644 --- a/adalflow/adalflow/core/generator.py +++ b/adalflow/adalflow/core/generator.py @@ -6,7 +6,6 @@ import json from typing import Any, Dict, Optional, Union, Callable, Tuple, List -from copy import deepcopy import logging @@ -110,11 +109,6 @@ def __init__( ) template = template or DEFAULT_LIGHTRAG_SYSTEM_PROMPT - try: - prompt_kwargs = deepcopy(prompt_kwargs) - except Exception as e: - log.warning(f"Error copying the prompt_kwargs: {e}") - prompt_kwargs = prompt_kwargs # Cache model_str = ( @@ -125,8 +119,6 @@ def __init__( ) self.cache_path = os.path.join(_cache_path, f"cache_{model_str}.db") - print(f"cache_path: {self.cache_path}") - CachedEngine.__init__(self, cache_path=self.cache_path) Component.__init__(self) GradComponent.__init__(self) @@ -167,6 +159,10 @@ def __init__( } self._teacher: Optional["Generator"] = None + def get_cache_path(self) -> str: + r"""Get the cache path for the generator.""" + return self.cache_path + @staticmethod def _get_default_mapping( output: "GeneratorOutput" = None, @@ -269,11 +265,9 @@ def _compose_model_kwargs(self, **model_kwargs) -> Dict: return combined_model_kwargs def print_prompt(self, **kwargs) -> str: - # prompt_kwargs_str = _convert_prompt_kwargs_to_str(kwargs) return 
self.prompt.print_prompt(**kwargs) def get_prompt(self, **kwargs) -> str: - # prompt_kwargs_str = _convert_prompt_kwargs_to_str(kwargs) return self.prompt.call(**kwargs) def _extra_repr(self) -> str: @@ -312,6 +306,7 @@ def _pre_call(self, prompt_kwargs: Dict, model_kwargs: Dict) -> Dict[str, Any]: api_kwargs = self.model_client.convert_inputs_to_api_kwargs( input=prompt_str, model_kwargs=composed_model_kwargs, + model_type=self.model_type, ) return api_kwargs @@ -328,8 +323,7 @@ def _model_client_call(self, api_kwargs: Dict, use_cache: bool = False) -> Any: return cached_completion completion = self.model_client.call( - api_kwargs=api_kwargs, - model_type=self.model_type + api_kwargs=api_kwargs, model_type=self.model_type ) # prepare cache if use_cache: @@ -420,8 +414,12 @@ def forward( if self.mock_output: output = GeneratorOutput(data=self.mock_output_data) else: - if self.teacher_mode: + if self.teacher_mode and not isinstance(self, BackwardEngine): if not self._teacher: + print( + f"prompt_kwargs: {prompt_kwargs}, model_kwargs: {model_kwargs}" + ) + print(f"names: {self.name}") raise ValueError("Teacher generator is not set.") log.info(f"Using teacher: {self._teacher}") input_args = { @@ -706,7 +704,6 @@ def _run_callbacks( model_kwargs=model_kwargs, ) if output.error: - print(f"call back on failure: {output}") self.trigger_callbacks( "on_failure", output=output, @@ -799,7 +796,7 @@ async def acall( try: completion = await self.model_client.acall( - api_kwargs=api_kwargs + api_kwargs=api_kwargs, model_type=self.model_type ) except Exception as e: log.error(f"Error calling the model: {e}") @@ -830,9 +827,23 @@ def __call__(self, *args, **kwargs) -> Union[GeneratorOutputType, Any]: return self.call(*args, **kwargs) def _extra_repr(self) -> str: + # Create the string for model_kwargs s = f"model_kwargs={self.model_kwargs}, " + + # Create the string for trainable prompt_kwargs + prompt_kwargs_repr = [ + k + for k, v in self.prompt_kwargs.items() + if isinstance(v, Parameter) and v.requires_opt + ] + + s += f"trainable_prompt_kwargs={prompt_kwargs_repr}" return s + def to_dict(self) -> Dict[str, Any]: + r"""Convert the generator to a dictionary.""" + # exclude default functions + @staticmethod def failure_message_to_backward_engine( gradient_response: GeneratorOutput, @@ -854,6 +865,8 @@ def __init__(self, **kwargs): kwargs = {} kwargs["template"] = FEEDBACK_ENGINE_TEMPLATE super().__init__(**kwargs) + self.name = "BackwardEngine" + self.teacher_mode = False @staticmethod def failure_message_to_optimizer( @@ -954,7 +967,6 @@ def create_teacher_generator( call_logger = GeneratorCallLogger(save_dir="traces") def on_complete(output, input, prompt_kwargs, model_kwargs, logger_call: Callable): - print(f"on_complet output: {output}") logger_call( output=output, input=input, @@ -963,13 +975,9 @@ def on_complete(output, input, prompt_kwargs, model_kwargs, logger_call: Callabl ) for model in [llama3_model, gpt_3_model, gemini_model, claude_model]: - print(f"""model: {model["model_kwargs"]["model"]}""") generator = Generator(**model) - print("_kwargs: ", generator._kwargs) - teacher = create_teacher_generator(generator, **claude_model) - print(f"teacher: {teacher}") call_logger.register_generator("generator", "generator_call") # setup the callback @@ -983,8 +991,7 @@ def on_complete(output, input, prompt_kwargs, model_kwargs, logger_call: Callabl "input_str": "Hello, world!", } ) - print(f"output: {output}") break # test the backward engine - # TODO: test ollama and transformer client to update the 
change + # TODO: test ollama and transformer client to update the change \ No newline at end of file From 39f2ae2f7ec69596e26498d2e2640ce6794313b6 Mon Sep 17 00:00:00 2001 From: Alexandre Adjahossou Date: Wed, 11 Sep 2024 08:26:19 +0000 Subject: [PATCH 28/36] Fixed test class name. --- adalflow/tests/test_transformer_client.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/adalflow/tests/test_transformer_client.py b/adalflow/tests/test_transformer_client.py index c0b649fc..63ba019d 100644 --- a/adalflow/tests/test_transformer_client.py +++ b/adalflow/tests/test_transformer_client.py @@ -165,7 +165,7 @@ def test_integration_with_generator_pipeline(self): output = generator(prompt_kwargs=self.prompt_kwargs) print(output) -class TransformerRerankerModelClient(unittest.TestCase): +class TestTransformerRerankerModelClient(unittest.TestCase): def setUp(self) -> None: self.query = "what is panda?" From 5481ee78794e16c2d02e3529f611c4fd610d764f Mon Sep 17 00:00:00 2001 From: Alexandre Adjahossou Date: Wed, 11 Sep 2024 08:42:01 +0000 Subject: [PATCH 29/36] Added kwargs for model and tokenizer init. --- .../model_client/transformers_client.py | 39 +++++++++++++------ 1 file changed, 28 insertions(+), 11 deletions(-) diff --git a/adalflow/adalflow/components/model_client/transformers_client.py b/adalflow/adalflow/components/model_client/transformers_client.py index f6f8b239..b0050a75 100644 --- a/adalflow/adalflow/components/model_client/transformers_client.py +++ b/adalflow/adalflow/components/model_client/transformers_client.py @@ -90,6 +90,8 @@ def __init__( self, model_name: Optional[str] = None, tokenizer_kwargs: Optional[dict] = dict(), + auto_model_kargs: Optional[dict] = dict(), + auto_tokenizer_kwargs: Optional[dict] = dict(), auto_model: Optional[type] = AutoModel, auto_tokenizer: Optional[type] = AutoTokenizer, custom_model: Optional[PreTrainedModel] = None, @@ -99,6 +101,8 @@ def __init__( super().__init__() self.model_name = model_name self.tokenizer_kwargs = tokenizer_kwargs + self.auto_model_kargs = auto_model_kargs + self.auto_tokenizer_kwargs = auto_tokenizer_kwargs if "return_tensors" not in self.tokenizer_kwargs: self.tokenizer_kwargs["return_tensors"]= "pt" self.auto_model=auto_model @@ -135,6 +139,8 @@ def __init__( def init_sync_client(self): self.init_model( model_name=self.model_name, + auto_model_kargs=self.auto_model_kargs, + auto_tokenizer_kwargs=self.auto_tokenizer_kwargs, auto_model=self.auto_model, auto_tokenizer=self.auto_tokenizer, custom_model=self.custom_model, @@ -146,6 +152,8 @@ def init_sync_client(self): def init_model( self, model_name: Optional[str] = None, + auto_model_kargs: Optional[dict] = dict(), + auto_tokenizer_kwargs: Optional[dict] = dict(), auto_model: Optional[type] = AutoModel, auto_tokenizer: Optional[type] = AutoTokenizer, custom_model: Optional[PreTrainedModel] = None, @@ -154,12 +162,12 @@ def init_model( try: if self.use_auto_model: - self.model = auto_model.from_pretrained(model_name) + self.model = auto_model.from_pretrained(model_name, **auto_model_kargs) else: self.model = custom_model if self.use_auto_tokenizer: - self.tokenizer = auto_tokenizer.from_pretrained(model_name) + self.tokenizer = auto_tokenizer.from_pretrained(model_name, **auto_tokenizer_kwargs) else: self.tokenizer = custom_tokenizer @@ -271,7 +279,9 @@ class TransformerLLMModelClient(ModelClient): def __init__( self, model_name: Optional[str] = None, - tokenizer_kwargs: Optional[dict] = {}, + tokenizer_decode_kwargs: Optional[dict] = {}, + auto_model_kargs: 
Optional[dict] = dict(), + auto_tokenizer_kwargs: Optional[dict] = dict(), init_from: Optional[str] = "autoclass", apply_chat_template: bool = False, chat_template: Optional[str] = None, @@ -283,7 +293,9 @@ def __init__( super().__init__() self.model_name = model_name # current model to use - self.tokenizer_kwargs = tokenizer_kwargs + self.tokenizer_decode_kwargs = tokenizer_decode_kwargs + self.auto_model_kargs = auto_model_kargs + self.auto_tokenizer_kwargs = auto_tokenizer_kwargs if "return_tensors" not in self.tokenizer_kwargs: self.tokenizer_kwargs["return_tensors"]= "pt" self.use_token = use_token @@ -335,14 +347,15 @@ def _init_from_automodelcasual_lm(self): self.model_name, token=token, local_files_only=self.local_files_only, - **self.tokenizer_kwargs + **self.auto_tokenizer_kwargs ) self.model = AutoModelForCausalLM.from_pretrained( self.model_name, torch_dtype=self.torch_dtype, device_map="auto", token=token, - local_files_only=self.local_files_only + local_files_only=self.local_files_only, + **self.auto_model_kargs ) # Set pad token if it's not already set if self.tokenizer.pad_token is None: @@ -401,7 +414,6 @@ def _infer_from_pipeline( self.model_name, token=self._get_token_if_relevant(), local_files_only=self.local_files_only, - **self.tokenizer_kwargs ) # Set pad token if it's not already set if self.tokenizer.pad_token is None: @@ -476,7 +488,7 @@ def _handle_input( messages, **chat_template_kwargs ) if ("tokenize" in chat_template_kwargs) and (chat_template_kwargs["tokenize"] == True): - prompt = self.tokenizer.decode(prompt) + prompt = self.tokenizer.decode(prompt, **self.tokenizer_decode_kwargs) return prompt else: return prompt @@ -598,12 +610,16 @@ class TransformerRerankerModelClient(ModelClient): def __init__( self, model_name: Optional[str] = None, + tokenizer_kwargs: Optional[dict] = {}, + auto_model_kargs: Optional[dict] = dict(), + auto_tokenizer_kwargs: Optional[dict] = dict(), auto_model: Optional[type] = AutoModelForSequenceClassification, auto_tokenizer: Optional[type] = AutoTokenizer, - tokenizer_kwargs: Optional[dict] = {}, local_files_only: Optional[bool] = False ): self.auto_model = auto_model + self.auto_model_kargs = auto_model_kargs + self.auto_tokenizer_kwargs = auto_tokenizer_kwargs self.auto_tokenizer= auto_tokenizer self.model_name = model_name self.tokenizer_kwargs = tokenizer_kwargs @@ -619,11 +635,12 @@ def init_model(self, model_name: str): self.tokenizer = self.auto_tokenizer.from_pretrained( self.model_name, local_files_only=self.local_files_only, - **self.tokenizer_kwargs + **self.auto_tokenizer_kwargs ) self.model = self.auto_model.from_pretrained( self.model_name, - local_files_only=self.local_files_only + local_files_only=self.local_files_only, + **self.auto_model_kargs ) # Check device availability and set the device device = get_device() From eab39cdebbd780054846ed8f31201efaa645cd05 Mon Sep 17 00:00:00 2001 From: Alexandre Adjahossou Date: Wed, 11 Sep 2024 09:29:10 +0000 Subject: [PATCH 30/36] Fixed typo. 
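
The typo is the keyword name `auto_model_kargs`, which this patch renames to `auto_model_kwargs` in all three client classes. With the corrected names, a client can be constructed roughly as in the sketch below (a minimal, illustrative example only: the kwargs values are placeholders and are not part of this patch; the dicts are simply forwarded to the underlying `from_pretrained` calls):

    from adalflow.components.model_client.transformers_client import (
        TransformerEmbeddingModelClient,
    )

    # Illustrative values only: auto_model_kwargs / auto_tokenizer_kwargs are
    # forwarded to AutoModel.from_pretrained / AutoTokenizer.from_pretrained,
    # while tokenizer_kwargs is applied when the inputs are tokenized.
    client = TransformerEmbeddingModelClient(
        model_name="thenlper/gte-base",
        auto_model_kwargs={"trust_remote_code": False},
        auto_tokenizer_kwargs={"use_fast": True},
        tokenizer_kwargs={"max_length": 512, "padding": True, "truncation": True},
    )
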
--- .../model_client/transformers_client.py | 22 +++++++++---------- 1 file changed, 11 insertions(+), 11 deletions(-) diff --git a/adalflow/adalflow/components/model_client/transformers_client.py b/adalflow/adalflow/components/model_client/transformers_client.py index b0050a75..8a3d06ab 100644 --- a/adalflow/adalflow/components/model_client/transformers_client.py +++ b/adalflow/adalflow/components/model_client/transformers_client.py @@ -90,7 +90,7 @@ def __init__( self, model_name: Optional[str] = None, tokenizer_kwargs: Optional[dict] = dict(), - auto_model_kargs: Optional[dict] = dict(), + auto_model_kwargs: Optional[dict] = dict(), auto_tokenizer_kwargs: Optional[dict] = dict(), auto_model: Optional[type] = AutoModel, auto_tokenizer: Optional[type] = AutoTokenizer, @@ -101,7 +101,7 @@ def __init__( super().__init__() self.model_name = model_name self.tokenizer_kwargs = tokenizer_kwargs - self.auto_model_kargs = auto_model_kargs + self.auto_model_kwargs = auto_model_kwargs self.auto_tokenizer_kwargs = auto_tokenizer_kwargs if "return_tensors" not in self.tokenizer_kwargs: self.tokenizer_kwargs["return_tensors"]= "pt" @@ -139,7 +139,7 @@ def __init__( def init_sync_client(self): self.init_model( model_name=self.model_name, - auto_model_kargs=self.auto_model_kargs, + auto_model_kwargs=self.auto_model_kwargs, auto_tokenizer_kwargs=self.auto_tokenizer_kwargs, auto_model=self.auto_model, auto_tokenizer=self.auto_tokenizer, @@ -152,7 +152,7 @@ def init_sync_client(self): def init_model( self, model_name: Optional[str] = None, - auto_model_kargs: Optional[dict] = dict(), + auto_model_kwargs: Optional[dict] = dict(), auto_tokenizer_kwargs: Optional[dict] = dict(), auto_model: Optional[type] = AutoModel, auto_tokenizer: Optional[type] = AutoTokenizer, @@ -162,7 +162,7 @@ def init_model( try: if self.use_auto_model: - self.model = auto_model.from_pretrained(model_name, **auto_model_kargs) + self.model = auto_model.from_pretrained(model_name, **auto_model_kwargs) else: self.model = custom_model @@ -280,7 +280,7 @@ def __init__( self, model_name: Optional[str] = None, tokenizer_decode_kwargs: Optional[dict] = {}, - auto_model_kargs: Optional[dict] = dict(), + auto_model_kwargs: Optional[dict] = dict(), auto_tokenizer_kwargs: Optional[dict] = dict(), init_from: Optional[str] = "autoclass", apply_chat_template: bool = False, @@ -294,7 +294,7 @@ def __init__( self.model_name = model_name # current model to use self.tokenizer_decode_kwargs = tokenizer_decode_kwargs - self.auto_model_kargs = auto_model_kargs + self.auto_model_kwargs = auto_model_kwargs self.auto_tokenizer_kwargs = auto_tokenizer_kwargs if "return_tensors" not in self.tokenizer_kwargs: self.tokenizer_kwargs["return_tensors"]= "pt" @@ -355,7 +355,7 @@ def _init_from_automodelcasual_lm(self): device_map="auto", token=token, local_files_only=self.local_files_only, - **self.auto_model_kargs + **self.auto_model_kwargs ) # Set pad token if it's not already set if self.tokenizer.pad_token is None: @@ -611,14 +611,14 @@ def __init__( self, model_name: Optional[str] = None, tokenizer_kwargs: Optional[dict] = {}, - auto_model_kargs: Optional[dict] = dict(), + auto_model_kwargs: Optional[dict] = dict(), auto_tokenizer_kwargs: Optional[dict] = dict(), auto_model: Optional[type] = AutoModelForSequenceClassification, auto_tokenizer: Optional[type] = AutoTokenizer, local_files_only: Optional[bool] = False ): self.auto_model = auto_model - self.auto_model_kargs = auto_model_kargs + self.auto_model_kwargs = auto_model_kwargs self.auto_tokenizer_kwargs = 
auto_tokenizer_kwargs self.auto_tokenizer= auto_tokenizer self.model_name = model_name @@ -640,7 +640,7 @@ def init_model(self, model_name: str): self.model = self.auto_model.from_pretrained( self.model_name, local_files_only=self.local_files_only, - **self.auto_model_kargs + **self.auto_model_kwargs ) # Check device availability and set the device device = get_device() From dcd3a7aec73480ad718385fa079915441c03da60 Mon Sep 17 00:00:00 2001 From: Alexandre Adjahossou Date: Wed, 11 Sep 2024 10:53:06 +0000 Subject: [PATCH 31/36] Fixed missing tokenizer_kwargs in TransformerLLMModelClient. --- .../adalflow/components/model_client/transformers_client.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/adalflow/adalflow/components/model_client/transformers_client.py b/adalflow/adalflow/components/model_client/transformers_client.py index 8a3d06ab..b8c4cc03 100644 --- a/adalflow/adalflow/components/model_client/transformers_client.py +++ b/adalflow/adalflow/components/model_client/transformers_client.py @@ -280,6 +280,7 @@ def __init__( self, model_name: Optional[str] = None, tokenizer_decode_kwargs: Optional[dict] = {}, + tokenizer_kwargs: Optional[dict] = {}, auto_model_kwargs: Optional[dict] = dict(), auto_tokenizer_kwargs: Optional[dict] = dict(), init_from: Optional[str] = "autoclass", @@ -294,6 +295,7 @@ def __init__( self.model_name = model_name # current model to use self.tokenizer_decode_kwargs = tokenizer_decode_kwargs + self.tokenizer_kwargs = tokenizer_kwargs self.auto_model_kwargs = auto_model_kwargs self.auto_tokenizer_kwargs = auto_tokenizer_kwargs if "return_tensors" not in self.tokenizer_kwargs: @@ -463,7 +465,7 @@ def _infer_from_automodelcasual_lm( ) else: model_input = self._handle_input(messages) - input_ids = self.tokenizer(model_input, return_tensors="pt").to( + input_ids = self.tokenizer(model_input, **self.tokenizer_kwargs).to( get_device() ) outputs_tokens = self.model.generate(**input_ids, max_length=max_length, max_new_tokens=max_tokens, **kwargs) From 8e39e01dde4516b0ceac213db00be7933097abf3 Mon Sep 17 00:00:00 2001 From: Alexandre Adjahossou Date: Wed, 11 Sep 2024 11:15:08 +0000 Subject: [PATCH 32/36] Addded local_files_only to TransformerEmbeddingModelClient --- .../components/model_client/transformers_client.py | 14 ++++++++++++-- 1 file changed, 12 insertions(+), 2 deletions(-) diff --git a/adalflow/adalflow/components/model_client/transformers_client.py b/adalflow/adalflow/components/model_client/transformers_client.py index b8c4cc03..c0159890 100644 --- a/adalflow/adalflow/components/model_client/transformers_client.py +++ b/adalflow/adalflow/components/model_client/transformers_client.py @@ -94,6 +94,7 @@ def __init__( auto_tokenizer_kwargs: Optional[dict] = dict(), auto_model: Optional[type] = AutoModel, auto_tokenizer: Optional[type] = AutoTokenizer, + local_files_only: Optional[bool] = False, custom_model: Optional[PreTrainedModel] = None, custom_tokenizer: Optional[Union[PreTrainedTokenizer, PreTrainedTokenizerFast]] = None ): @@ -107,6 +108,7 @@ def __init__( self.tokenizer_kwargs["return_tensors"]= "pt" self.auto_model=auto_model self.auto_tokenizer=auto_tokenizer + self.local_files_only = local_files_only self.custom_model=custom_model self.custom_tokenizer=custom_tokenizer @@ -162,12 +164,20 @@ def init_model( try: if self.use_auto_model: - self.model = auto_model.from_pretrained(model_name, **auto_model_kwargs) + self.model = auto_model.from_pretrained( + model_name, + local_files_only=self.local_files_only, + **auto_model_kwargs + 
) else: self.model = custom_model if self.use_auto_tokenizer: - self.tokenizer = auto_tokenizer.from_pretrained(model_name, **auto_tokenizer_kwargs) + self.tokenizer = auto_tokenizer.from_pretrained( + model_name, + local_files_only=self.local_files_only, + **auto_tokenizer_kwargs + ) else: self.tokenizer = custom_tokenizer From 92faa910fda107a3748ef49ac670411d3dc2b060 Mon Sep 17 00:00:00 2001 From: Alexandre Adjahossou Date: Wed, 11 Sep 2024 11:42:42 +0000 Subject: [PATCH 33/36] Fixed mutable default arguments. --- .../model_client/transformers_client.py | 76 ++++++++++--------- 1 file changed, 41 insertions(+), 35 deletions(-) diff --git a/adalflow/adalflow/components/model_client/transformers_client.py b/adalflow/adalflow/components/model_client/transformers_client.py index c0159890..cfb354e9 100644 --- a/adalflow/adalflow/components/model_client/transformers_client.py +++ b/adalflow/adalflow/components/model_client/transformers_client.py @@ -89,9 +89,9 @@ class TransformerEmbeddingModelClient(ModelClient): def __init__( self, model_name: Optional[str] = None, - tokenizer_kwargs: Optional[dict] = dict(), - auto_model_kwargs: Optional[dict] = dict(), - auto_tokenizer_kwargs: Optional[dict] = dict(), + tokenizer_kwargs: Optional[dict] = None, + auto_model_kwargs: Optional[dict] = None, + auto_tokenizer_kwargs: Optional[dict] = None, auto_model: Optional[type] = AutoModel, auto_tokenizer: Optional[type] = AutoTokenizer, local_files_only: Optional[bool] = False, @@ -101,9 +101,9 @@ def __init__( super().__init__() self.model_name = model_name - self.tokenizer_kwargs = tokenizer_kwargs - self.auto_model_kwargs = auto_model_kwargs - self.auto_tokenizer_kwargs = auto_tokenizer_kwargs + self.tokenizer_kwargs = tokenizer_kwargs or dict() + self.auto_model_kwargs = auto_model_kwargs or dict() + self.auto_tokenizer_kwargs = auto_tokenizer_kwargs or dict() if "return_tensors" not in self.tokenizer_kwargs: self.tokenizer_kwargs["return_tensors"]= "pt" self.auto_model=auto_model @@ -154,14 +154,16 @@ def init_sync_client(self): def init_model( self, model_name: Optional[str] = None, - auto_model_kwargs: Optional[dict] = dict(), - auto_tokenizer_kwargs: Optional[dict] = dict(), + auto_model_kwargs: Optional[dict] = None, + auto_tokenizer_kwargs: Optional[dict] = None, auto_model: Optional[type] = AutoModel, auto_tokenizer: Optional[type] = AutoTokenizer, custom_model: Optional[PreTrainedModel] = None, custom_tokenizer: Optional[PreTrainedTokenizer | PreTrainedTokenizerFast] = None ): + self.auto_model_kwargs = auto_model_kwargs or dict() + self.auto_tokenizer_kwargs = auto_tokenizer_kwargs or dict() try: if self.use_auto_model: self.model = auto_model.from_pretrained( @@ -215,7 +217,8 @@ def handle_input(self, input: Union[str, List[str], List[List[str]]]) -> Union[L return input - def tokenize_inputs(self, input: Union[str, List[str], List[List[str]]], kwargs: Optional[dict] = dict()) -> dict: + def tokenize_inputs(self, input: Union[str, List[str], List[List[str]]], kwargs: Optional[dict] = None) -> dict: + kwargs = kwargs or dict() batch_dict = self.tokenizer(input, **kwargs) return batch_dict @@ -235,8 +238,9 @@ def compute_embeddings(self, outputs: dict, batch_dict: dict): # # Preprocessing, postprocessing and call for inference code # - def call(self, api_kwargs: Dict = {}, model_type: Optional[ModelType]= ModelType.UNDEFINED) -> Union[List, Tensor]: - + def call(self, api_kwargs: Dict = None, model_type: Optional[ModelType]= ModelType.UNDEFINED) -> Union[List, Tensor]: + + api_kwargs = 
api_kwargs or dict() if "model" not in api_kwargs: raise ValueError("model must be specified in api_kwargs") # I don't think it is useful anymore @@ -289,14 +293,14 @@ class TransformerLLMModelClient(ModelClient): def __init__( self, model_name: Optional[str] = None, - tokenizer_decode_kwargs: Optional[dict] = {}, - tokenizer_kwargs: Optional[dict] = {}, - auto_model_kwargs: Optional[dict] = dict(), - auto_tokenizer_kwargs: Optional[dict] = dict(), + tokenizer_decode_kwargs: Optional[dict] = None, + tokenizer_kwargs: Optional[dict] = None, + auto_model_kwargs: Optional[dict] = None, + auto_tokenizer_kwargs: Optional[dict] = None, init_from: Optional[str] = "autoclass", apply_chat_template: bool = False, chat_template: Optional[str] = None, - chat_template_kwargs: Optional[dict] = dict(tokenize=False, add_generation_prompt=True), + chat_template_kwargs: Optional[dict] = None, use_token: bool = False, torch_dtype: Optional[Any] = torch.bfloat16, local_files_only: Optional[bool] = False @@ -304,10 +308,10 @@ def __init__( super().__init__() self.model_name = model_name # current model to use - self.tokenizer_decode_kwargs = tokenizer_decode_kwargs - self.tokenizer_kwargs = tokenizer_kwargs - self.auto_model_kwargs = auto_model_kwargs - self.auto_tokenizer_kwargs = auto_tokenizer_kwargs + self.tokenizer_decode_kwargs = tokenizer_decode_kwargs or dict() + self.tokenizer_kwargs = tokenizer_kwargs or dict() + self.auto_model_kwargs = auto_model_kwargs or dict() + self.auto_tokenizer_kwargs = auto_tokenizer_kwargs or dict() if "return_tensors" not in self.tokenizer_kwargs: self.tokenizer_kwargs["return_tensors"]= "pt" self.use_token = use_token @@ -315,7 +319,7 @@ def __init__( self.init_from = init_from self.apply_chat_template = apply_chat_template self.chat_template = chat_template - self.chat_template_kwargs = chat_template_kwargs + self.chat_template_kwargs = chat_template_kwargs or dict(tokenize=False, add_generation_prompt=True) self.local_files_only = local_files_only self.model = None if model_name is not None: @@ -403,7 +407,7 @@ def _infer_from_pipeline( max_tokens: Optional[int] = None, apply_chat_template: bool = False, chat_template: Optional[str] = None, - chat_template_kwargs: Optional[dict] = dict(tokenize=False, add_generation_prompt=True), + chat_template_kwargs: Optional[dict] = None, **kwargs, ): @@ -460,7 +464,7 @@ def _infer_from_automodelcasual_lm( max_length: Optional[int] = 8192, # model-agnostic apply_chat_template: bool = False, chat_template: Optional[str] = None, - chat_template_kwargs: Optional[dict] = dict(tokenize=False, add_generation_prompt=True), + chat_template_kwargs: Optional[dict] = None, **kwargs, ): if not self.model: @@ -542,8 +546,8 @@ def infer_llm( # # Preprocessing, postprocessing and call for inference code # - def call(self, api_kwargs: Dict = {}, model_type: Optional[ModelType]= ModelType.UNDEFINED): - + def call(self, api_kwargs: Dict = None, model_type: Optional[ModelType]= ModelType.UNDEFINED): + api_kwargs = api_kwargs or dict() if "model" not in api_kwargs: raise ValueError("model must be specified in api_kwargs") @@ -596,9 +600,10 @@ def parse_chat_completion(self, completion: Any) -> str: def convert_inputs_to_api_kwargs( self, input: Any, # for retriever, it is a single query, - model_kwargs: dict = {}, + model_kwargs: dict = None, model_type: Optional[ModelType]= ModelType.UNDEFINED ) -> dict: + model_kwargs = model_kwargs or dict() final_model_kwargs = model_kwargs.copy() assert "model" in final_model_kwargs, "model must be specified" 
#messages = [{"role": "system", "content": input}] @@ -622,19 +627,19 @@ class TransformerRerankerModelClient(ModelClient): def __init__( self, model_name: Optional[str] = None, - tokenizer_kwargs: Optional[dict] = {}, - auto_model_kwargs: Optional[dict] = dict(), - auto_tokenizer_kwargs: Optional[dict] = dict(), + tokenizer_kwargs: Optional[dict] = None, + auto_model_kwargs: Optional[dict] = None, + auto_tokenizer_kwargs: Optional[dict] = None, auto_model: Optional[type] = AutoModelForSequenceClassification, auto_tokenizer: Optional[type] = AutoTokenizer, local_files_only: Optional[bool] = False ): self.auto_model = auto_model - self.auto_model_kwargs = auto_model_kwargs - self.auto_tokenizer_kwargs = auto_tokenizer_kwargs + self.auto_model_kwargs = auto_model_kwargs or dict() + self.auto_tokenizer_kwargs = auto_tokenizer_kwargs or dict() self.auto_tokenizer= auto_tokenizer self.model_name = model_name - self.tokenizer_kwargs = tokenizer_kwargs + self.tokenizer_kwargs = tokenizer_kwargs or dict() if "return_tensors" not in self.tokenizer_kwargs: self.tokenizer_kwargs["return_tensors"]= "pt" self.local_files_only = local_files_only @@ -706,8 +711,8 @@ def infer_reranker( # # Preprocessing, postprocessing and call for inference code # - def call(self, api_kwargs: Dict = {}): - + def call(self, api_kwargs: Dict = None): + api_kwargs = api_kwargs or dict() if "model" not in api_kwargs: raise ValueError("model must be specified in api_kwargs") @@ -738,9 +743,10 @@ def call(self, api_kwargs: Dict = {}): def convert_inputs_to_api_kwargs( self, input: Any, # for retriever, it is a single query, - model_kwargs: dict = {}, + model_kwargs: dict = None, model_type: ModelType = ModelType.UNDEFINED, ) -> dict: + model_kwargs = model_kwargs or dict() final_model_kwargs = model_kwargs.copy() assert "model" in final_model_kwargs, "model must be specified" From 6e5f109cad0972832a87c3c367908a5740ecef3a Mon Sep 17 00:00:00 2001 From: Alexandre Adjahossou Date: Wed, 11 Sep 2024 12:16:03 +0000 Subject: [PATCH 34/36] Removed dict args that were conflicting with @lru_cache. 
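functools.lru_cache builds its cache key by hashing every call argument, and dicts are unhashable, so passing auto_model_kwargs / auto_tokenizer_kwargs into the @lru_cache-decorated init_model raises "TypeError: unhashable type: 'dict'" as soon as the cache is consulted. init_model therefore now reads those kwargs from self (where __init__ already stores them) instead of taking them as parameters. A minimal standalone sketch of the failure mode follows; the function name and the kwargs dict are illustrative only, not the client code:

    from functools import lru_cache

    @lru_cache(None)
    def cached_init(model_name: str, model_kwargs: dict):
        # stand-in for an @lru_cache-decorated loader that takes a dict argument
        return model_name, model_kwargs

    try:
        cached_init("thenlper/gte-base", {"torch_dtype": "auto"})
    except TypeError as err:
        # lru_cache hashes the arguments to build its cache key, and dicts cannot be hashed
        print(err)  # unhashable type: 'dict'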
--- .../components/model_client/transformers_client.py | 10 ++-------- 1 file changed, 2 insertions(+), 8 deletions(-) diff --git a/adalflow/adalflow/components/model_client/transformers_client.py b/adalflow/adalflow/components/model_client/transformers_client.py index cfb354e9..482f50ad 100644 --- a/adalflow/adalflow/components/model_client/transformers_client.py +++ b/adalflow/adalflow/components/model_client/transformers_client.py @@ -141,8 +141,6 @@ def __init__( def init_sync_client(self): self.init_model( model_name=self.model_name, - auto_model_kwargs=self.auto_model_kwargs, - auto_tokenizer_kwargs=self.auto_tokenizer_kwargs, auto_model=self.auto_model, auto_tokenizer=self.auto_tokenizer, custom_model=self.custom_model, @@ -154,22 +152,18 @@ def init_sync_client(self): def init_model( self, model_name: Optional[str] = None, - auto_model_kwargs: Optional[dict] = None, - auto_tokenizer_kwargs: Optional[dict] = None, auto_model: Optional[type] = AutoModel, auto_tokenizer: Optional[type] = AutoTokenizer, custom_model: Optional[PreTrainedModel] = None, custom_tokenizer: Optional[PreTrainedTokenizer | PreTrainedTokenizerFast] = None ): - self.auto_model_kwargs = auto_model_kwargs or dict() - self.auto_tokenizer_kwargs = auto_tokenizer_kwargs or dict() try: if self.use_auto_model: self.model = auto_model.from_pretrained( model_name, local_files_only=self.local_files_only, - **auto_model_kwargs + **self.auto_model_kwargs ) else: self.model = custom_model @@ -178,7 +172,7 @@ def init_model( self.tokenizer = auto_tokenizer.from_pretrained( model_name, local_files_only=self.local_files_only, - **auto_tokenizer_kwargs + **self.auto_tokenizer_kwargs ) else: self.tokenizer = custom_tokenizer From 508079d20e3fa0cf552426e856888976699bb52f Mon Sep 17 00:00:00 2001 From: Alexandre Adjahossou Date: Wed, 11 Sep 2024 15:16:22 +0000 Subject: [PATCH 35/36] Added tests to check transformer_client compatibility with different models. 
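The module ends with a unittest.main() guard, so it can be run directly with `python adalflow/tests/test_transformers_models.py` once the optional model dependencies are installed. Condensed from the embedding tests below, a typical exercise of the client looks roughly like this (the model name and tokenizer kwargs are taken from the tests; treat it as a smoke-test sketch rather than reference usage):

    from adalflow.components.model_client.transformers_client import (
        TransformerEmbeddingModelClient,
    )

    embedding_model = "thenlper/gte-base"
    client = TransformerEmbeddingModelClient(
        model_name=embedding_model,
        auto_tokenizer_kwargs={
            "max_length": 512,
            "padding": True,
            "truncation": True,
            "return_tensors": "pt",
        },
    )
    api_kwargs = client.convert_inputs_to_api_kwargs(
        input="Hello world", model_kwargs={"model": embedding_model}
    )
    # the tests only print the output; they do not assert on its shape yet
    output = client.call(api_kwargs=api_kwargs)
    print(output)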
---
 adalflow/tests/test_transformers_models.py | 186 +++++++++++++++++++++
 1 file changed, 186 insertions(+)
 create mode 100644 adalflow/tests/test_transformers_models.py

diff --git a/adalflow/tests/test_transformers_models.py b/adalflow/tests/test_transformers_models.py
new file mode 100644
index 00000000..fc553f6e
--- /dev/null
+++ b/adalflow/tests/test_transformers_models.py
@@ -0,0 +1,186 @@
+"""This tests the new transformer_client's compatibility with several models hosted on HuggingFace."""
+import unittest
+import torch
+from adalflow.components.model_client.transformers_client import TransformerEmbeddingModelClient, TransformerLLMModelClient, TransformerRerankerModelClient
+from transformers import AutoModelForSequenceClassification
+
+class TestEmbeddingModels(unittest.TestCase):
+    def setUp(self) -> None:
+        self.test_input = "Hello world"
+        self.auto_tokenizer_kwargs = {
+            "max_length": 512,
+            "padding": True,
+            "truncation": True,
+            "return_tensors": 'pt'
+        }
+    def test_thenhelper_gte_base(self):
+        embedding_model = "thenlper/gte-base"
+        model_kwargs = {"model": embedding_model}
+
+        model_client = TransformerEmbeddingModelClient(
+            model_name=embedding_model,
+            auto_tokenizer_kwargs=self.auto_tokenizer_kwargs
+        )
+        print(
+            f"Testing model client with model {embedding_model}"
+        )
+        api_kwargs = model_client.convert_inputs_to_api_kwargs(input=self.test_input, model_kwargs=model_kwargs)
+        output = model_client.call(api_kwargs=api_kwargs)
+        print(output)
+
+    def test_jina_embeddings_V2_small_en(self):
+        embedding_model = "jinaai/jina-embeddings-v2-small-en"
+        model_kwargs = {"model": embedding_model}
+        model_client = TransformerEmbeddingModelClient(
+            model_name=embedding_model,
+            auto_tokenizer_kwargs=self.auto_tokenizer_kwargs
+        )
+        print(
+            f"Testing model client with model {embedding_model}"
+        )
+        api_kwargs = model_client.convert_inputs_to_api_kwargs(input=self.test_input, model_kwargs=model_kwargs)
+        output = model_client.call(api_kwargs=api_kwargs)
+        print(output)
+
+    def test_t5_small_standard_bahasa_cased(self):
+        embedding_model = "mesolitica/t5-small-standard-bahasa-cased"
+        model_kwargs = {"model": embedding_model}
+
+        # Subclass TransformerEmbeddingModelClient to adapt the class to an encoder-decoder architecture
+        class T5SmallStandardBahasaCased(TransformerEmbeddingModelClient):
+
+            def compute_model_outputs(self, batch_dict: dict, model) -> dict:
+                print(batch_dict)
+                with torch.no_grad():
+                    outputs = model.encoder(**batch_dict)
+                return outputs
+
+
+
+        model_client = T5SmallStandardBahasaCased(
+            model_name=embedding_model,
+            auto_tokenizer_kwargs=self.auto_tokenizer_kwargs
+        )
+        print(
+            f"Testing model client with model {embedding_model}"
+        )
+        api_kwargs = model_client.convert_inputs_to_api_kwargs(input=self.test_input, model_kwargs=model_kwargs)
+        output = model_client.call(api_kwargs=api_kwargs)
+        print(output)
+
+    def test_sentence_transformers_all_miniLM_L6_V2(self):
+        embedding_model = "sentence-transformers/all-MiniLM-L6-v2"
+        model_kwargs = {"model": embedding_model}
+
+        model_client = TransformerEmbeddingModelClient(
+            model_name=embedding_model,
+            auto_tokenizer_kwargs=self.auto_tokenizer_kwargs
+        )
+        print(
+            f"Testing model client with model {embedding_model}"
+        )
+        api_kwargs = model_client.convert_inputs_to_api_kwargs(input=self.test_input, model_kwargs=model_kwargs)
+        output = model_client.call(api_kwargs=api_kwargs)
+        print(output)
+
+class TestLLMModels(unittest.TestCase):
+    """This class has 'accelerate' as a dependency for both tests.
+    You might need to run the following command in the terminal.
+    `pip install accelerate`
+    """
+    def setUp(self) -> None:
+        self.input_text = "Where is Brian?"
+        self.auto_tokenizer_kwargs = {}
+        self.model_kwargs = {
+            "temperature": 0.1,
+            "do_sample": True
+        }
+        self.tokenizer_decode_kwargs = {
+            "max_length": True,
+            "truncation": True,
+        }
+        self.prompt_kwargs = {
+            "input_str": "Where is Brian?", # test input
+        }
+
+    def test_roneneld_tiny_stories_1M(self):
+        self.model_kwargs["model"] = "roneneldan/TinyStories-1M"
+        model_client = TransformerLLMModelClient(
+            auto_tokenizer_kwargs=self.auto_tokenizer_kwargs,
+            local_files_only=False,
+            init_from="autoclass",
+        )
+        print(
+            f"Testing model client with model {'roneneldan/TinyStories-1M'}"
+        )
+        api_kwargs = model_client.convert_inputs_to_api_kwargs(input=self.input_text, model_kwargs=self.model_kwargs)
+        output = model_client.call(api_kwargs=api_kwargs)
+        print(output)
+
+    def test_nickypro_tinyllama_15m(self):
+        self.model_kwargs["model"] = "nickypro/tinyllama-15M"
+        model_client = TransformerLLMModelClient(
+            auto_tokenizer_kwargs=self.auto_tokenizer_kwargs,
+            local_files_only=False,
+            init_from="autoclass",
+        )
+        print(
+            f"Testing model client with model {'nickypro/tinyllama-15M'}"
+        )
+        api_kwargs = model_client.convert_inputs_to_api_kwargs(input=self.input_text, model_kwargs=self.model_kwargs)
+        output = model_client.call(api_kwargs=api_kwargs)
+        print(output)
+
+class TestRerankerModel(unittest.TestCase):
+    """This class has 'sentencepiece' as a dependency.
+    You might need to run the following command in the terminal.
+    `pip install transformers[sentencepiece]`
+    """
+    def setUp(self) -> None:
+        self.query = "Where is Brian."
+        self.documents = [
+            "Brian is in the Kitchen.",
+            "Brian loves Adalflow.",
+            "Adalflow is a python library, not some food inside the kitchen.",
+        ]
+        self.model_kwargs = {
+            "documents": self.documents,
+            "top_k": 2,
+        }
+
+    def test_jinja_reranker_V1_tiny_en(self):
+        self.model_kwargs["model"] = "jinaai/jina-reranker-v1-tiny-en"
+        model_client = TransformerRerankerModelClient(
+            tokenizer_kwargs={"padding": True},
+            auto_model_kwargs={"num_labels": 1}
+        )
+        print(
+            f"Testing model client with model jinaai/jina-reranker-v1-tiny-en"
+        )
+        api_kwargs = model_client.convert_inputs_to_api_kwargs(self.query, model_kwargs=self.model_kwargs)
+        output = model_client.call(api_kwargs)
+
+    def test_baai_bge_reranker_base(self):
+        self.model_kwargs["model"] = "BAAI/bge-reranker-base"
+        model_client = TransformerRerankerModelClient(
+            tokenizer_kwargs={"padding": True},
+        )
+        print(
+            f"Testing model client with model BAAI/bge-reranker-base"
+        )
+        api_kwargs = model_client.convert_inputs_to_api_kwargs(self.query, model_kwargs=self.model_kwargs)
+        output = model_client.call(api_kwargs)
+
+    def test_cross_encoder_ms_marco_minilm_L_2_V2(self):
+        self.model_kwargs["model"] = "cross-encoder/ms-marco-MiniLM-L-2-v2"
+        model_client = TransformerRerankerModelClient(
+            tokenizer_kwargs={"padding": True},
+        )
+        print(
+            f"Testing model client with model cross-encoder/ms-marco-MiniLM-L-2-v2"
+        )
+        api_kwargs = model_client.convert_inputs_to_api_kwargs(self.query, model_kwargs=self.model_kwargs)
+        output = model_client.call(api_kwargs)
+
+if __name__ == "__main__":
+    unittest.main(verbosity=6)
\ No newline at end of file

From c8fe73a94f74f441eda20e6c3ffe23b511ff7662 Mon Sep 17 00:00:00 2001
From: Alexandre Adjahossou
Date: Wed, 11 Sep 2024 15:42:48 +0000
Subject: [PATCH 36/36] Formatting.
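This patch only reflows the module; no behaviour change is intended. For reference, the reranker client touched here is driven by the tests added in the previous patch roughly as below (condensed from test_transformers_models.py; the model name, documents and top_k come straight from those tests):

    from adalflow.components.model_client.transformers_client import (
        TransformerRerankerModelClient,
    )

    client = TransformerRerankerModelClient(tokenizer_kwargs={"padding": True})
    model_kwargs = {
        "model": "BAAI/bge-reranker-base",
        "documents": [
            "Brian is in the Kitchen.",
            "Brian loves Adalflow.",
            "Adalflow is a python library, not some food inside the kitchen.",
        ],
        "top_k": 2,
    }
    api_kwargs = client.convert_inputs_to_api_kwargs(
        "Where is Brian.", model_kwargs=model_kwargs
    )
    # call() reranks the documents against the query and returns (top_k_indices, top_k_scores)
    top_k_indices, top_k_scores = client.call(api_kwargs)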
--- .../model_client/transformers_client.py | 242 ++++++++++-------- 1 file changed, 130 insertions(+), 112 deletions(-) diff --git a/adalflow/adalflow/components/model_client/transformers_client.py b/adalflow/adalflow/components/model_client/transformers_client.py index 482f50ad..7ef85ab0 100644 --- a/adalflow/adalflow/components/model_client/transformers_client.py +++ b/adalflow/adalflow/components/model_client/transformers_client.py @@ -25,13 +25,12 @@ AutoModel, AutoModelForCausalLM, AutoModelForSequenceClassification, - pipeline + pipeline, ) from os import getenv as get_env_variable transformers = safe_import( - OptionalPackages.TRANSFORMERS.value[0], - OptionalPackages.TRANSFORMERS.value[1] + OptionalPackages.TRANSFORMERS.value[0], OptionalPackages.TRANSFORMERS.value[1] ) torch = safe_import(OptionalPackages.TORCH.value[0], OptionalPackages.TORCH.value[1]) @@ -45,9 +44,15 @@ def average_pool(last_hidden_states: Tensor, attention_mask: list) -> Tensor: def mean_pooling(model_output: dict, attention_mask) -> Tensor: - token_embeddings = model_output[0] #First element of model_output contains all token embeddings - input_mask_expanded = attention_mask.unsqueeze(-1).expand(token_embeddings.size()).float() - return torch.sum(token_embeddings * input_mask_expanded, 1) / torch.clamp(input_mask_expanded.sum(1), min=1e-9) + token_embeddings = model_output[ + 0 + ] # First element of model_output contains all token embeddings + input_mask_expanded = ( + attention_mask.unsqueeze(-1).expand(token_embeddings.size()).float() + ) + return torch.sum(token_embeddings * input_mask_expanded, 1) / torch.clamp( + input_mask_expanded.sum(1), min=1e-9 + ) def get_device(): @@ -83,21 +88,24 @@ class TransformerEmbeddingModelClient(ModelClient): Find how to apply tokens here: https://huggingface.co/docs/hub/security-tokens Once you have a token and have access, put the token in the environment variable HF_TOKEN. 
""" + # # Model initialisation # def __init__( - self, - model_name: Optional[str] = None, - tokenizer_kwargs: Optional[dict] = None, - auto_model_kwargs: Optional[dict] = None, - auto_tokenizer_kwargs: Optional[dict] = None, - auto_model: Optional[type] = AutoModel, - auto_tokenizer: Optional[type] = AutoTokenizer, - local_files_only: Optional[bool] = False, - custom_model: Optional[PreTrainedModel] = None, - custom_tokenizer: Optional[Union[PreTrainedTokenizer, PreTrainedTokenizerFast]] = None - ): + self, + model_name: Optional[str] = None, + tokenizer_kwargs: Optional[dict] = None, + auto_model_kwargs: Optional[dict] = None, + auto_tokenizer_kwargs: Optional[dict] = None, + auto_model: Optional[type] = AutoModel, + auto_tokenizer: Optional[type] = AutoTokenizer, + local_files_only: Optional[bool] = False, + custom_model: Optional[PreTrainedModel] = None, + custom_tokenizer: Optional[ + Union[PreTrainedTokenizer, PreTrainedTokenizerFast] + ] = None, + ): super().__init__() self.model_name = model_name @@ -105,12 +113,12 @@ def __init__( self.auto_model_kwargs = auto_model_kwargs or dict() self.auto_tokenizer_kwargs = auto_tokenizer_kwargs or dict() if "return_tensors" not in self.tokenizer_kwargs: - self.tokenizer_kwargs["return_tensors"]= "pt" - self.auto_model=auto_model - self.auto_tokenizer=auto_tokenizer + self.tokenizer_kwargs["return_tensors"] = "pt" + self.auto_model = auto_model + self.auto_tokenizer = auto_tokenizer self.local_files_only = local_files_only - self.custom_model=custom_model - self.custom_tokenizer=custom_tokenizer + self.custom_model = custom_model + self.custom_tokenizer = custom_tokenizer # Check if there is conflicting arguments self.use_auto_model = auto_model is not None @@ -125,28 +133,32 @@ def __init__( elif (not self.use_auto_model) and (not self.use_cusom_model): raise ValueError("Need to specify either 'auto_model' or 'custom_model'.") elif self.use_auto_model and (not self.model_name_exit): - raise ValueError("When 'auto_model' is specified 'model_name' must be specified too.") - + raise ValueError( + "When 'auto_model' is specified 'model_name' must be specified too." + ) + ## arguments related to tokenizer if self.use_auto_tokenizer and self.use_cusom_tokenizer: raise Exception("Cannot specify 'auto_tokenizer' and 'custom_tokenizer'.") elif (not self.use_auto_tokenizer) and (not self.use_cusom_tokenizer): - raise Exception("Need to specify either'auto_tokenizer' and 'custom_tokenizer'.") + raise Exception( + "Need to specify either'auto_tokenizer' and 'custom_tokenizer'." + ) elif self.use_auto_tokenizer and (not self.model_name_exit): - raise ValueError("When 'auto_tokenizer' is specified 'model_name' must be specified too.") + raise ValueError( + "When 'auto_tokenizer' is specified 'model_name' must be specified too." 
+ ) self.init_sync_client() - def init_sync_client(self): self.init_model( model_name=self.model_name, auto_model=self.auto_model, auto_tokenizer=self.auto_tokenizer, custom_model=self.custom_model, - custom_tokenizer=self.custom_tokenizer - ) - + custom_tokenizer=self.custom_tokenizer, + ) @lru_cache(None) def init_model( @@ -155,16 +167,18 @@ def init_model( auto_model: Optional[type] = AutoModel, auto_tokenizer: Optional[type] = AutoTokenizer, custom_model: Optional[PreTrainedModel] = None, - custom_tokenizer: Optional[PreTrainedTokenizer | PreTrainedTokenizerFast] = None - ): + custom_tokenizer: Optional[ + PreTrainedTokenizer | PreTrainedTokenizerFast + ] = None, + ): try: if self.use_auto_model: self.model = auto_model.from_pretrained( model_name, local_files_only=self.local_files_only, - **self.auto_model_kwargs - ) + **self.auto_model_kwargs, + ) else: self.model = custom_model @@ -172,8 +186,8 @@ def init_model( self.tokenizer = auto_tokenizer.from_pretrained( model_name, local_files_only=self.local_files_only, - **self.auto_tokenizer_kwargs - ) + **self.auto_tokenizer_kwargs, + ) else: self.tokenizer = custom_tokenizer @@ -204,35 +218,39 @@ def infer_embedding( embeddings = embeddings.tolist() return embeddings - - def handle_input(self, input: Union[str, List[str], List[List[str]]]) -> Union[List[str], List[List[str]]]: + def handle_input( + self, input: Union[str, List[str], List[List[str]]] + ) -> Union[List[str], List[List[str]]]: if isinstance(input, str): input = [input] return input - - def tokenize_inputs(self, input: Union[str, List[str], List[List[str]]], kwargs: Optional[dict] = None) -> dict: + def tokenize_inputs( + self, + input: Union[str, List[str], List[List[str]]], + kwargs: Optional[dict] = None, + ) -> dict: kwargs = kwargs or dict() batch_dict = self.tokenizer(input, **kwargs) return batch_dict - def compute_model_outputs(self, batch_dict: dict, model: PreTrainedModel) -> dict: with torch.no_grad(): outputs = model(**batch_dict) return outputs - def compute_embeddings(self, outputs: dict, batch_dict: dict): - embeddings = mean_pooling( - outputs, batch_dict["attention_mask"] - ) + embeddings = mean_pooling(outputs, batch_dict["attention_mask"]) return embeddings # # Preprocessing, postprocessing and call for inference code # - def call(self, api_kwargs: Dict = None, model_type: Optional[ModelType]= ModelType.UNDEFINED) -> Union[List, Tensor]: + def call( + self, + api_kwargs: Dict = None, + model_type: Optional[ModelType] = ModelType.UNDEFINED, + ) -> Union[List, Tensor]: api_kwargs = api_kwargs or dict() if "model" not in api_kwargs: @@ -251,7 +269,6 @@ def call(self, api_kwargs: Dict = None, model_type: Optional[ModelType]= ModelTy # inference the model return self.infer_embedding(api_kwargs["input"]) - def parse_embedding_response(self, response: Union[List, Tensor]) -> EmbedderOutput: embeddings: List[Embedding] = [] for idx, emb in enumerate(response): @@ -259,12 +276,11 @@ def parse_embedding_response(self, response: Union[List, Tensor]) -> EmbedderOut response = EmbedderOutput(data=embeddings) return response - def convert_inputs_to_api_kwargs( self, input: Any, # for retriever, it is a single query, model_kwargs: dict = {}, - model_type: Optional[ModelType]= ModelType.UNDEFINED + model_type: Optional[ModelType] = ModelType.UNDEFINED, ) -> dict: final_model_kwargs = model_kwargs.copy() # if model_type == ModelType.EMBEDDER: @@ -281,6 +297,7 @@ class TransformerLLMModelClient(ModelClient): Find how to apply tokens here: 
https://huggingface.co/docs/hub/security-tokens Once you have a token and have access, put the token in the environment variable HF_TOKEN. """ + # # Model initialisation # @@ -297,7 +314,7 @@ def __init__( chat_template_kwargs: Optional[dict] = None, use_token: bool = False, torch_dtype: Optional[Any] = torch.bfloat16, - local_files_only: Optional[bool] = False + local_files_only: Optional[bool] = False, ): super().__init__() @@ -307,57 +324,55 @@ def __init__( self.auto_model_kwargs = auto_model_kwargs or dict() self.auto_tokenizer_kwargs = auto_tokenizer_kwargs or dict() if "return_tensors" not in self.tokenizer_kwargs: - self.tokenizer_kwargs["return_tensors"]= "pt" + self.tokenizer_kwargs["return_tensors"] = "pt" self.use_token = use_token self.torch_dtype = torch_dtype self.init_from = init_from self.apply_chat_template = apply_chat_template self.chat_template = chat_template - self.chat_template_kwargs = chat_template_kwargs or dict(tokenize=False, add_generation_prompt=True) + self.chat_template_kwargs = chat_template_kwargs or dict( + tokenize=False, add_generation_prompt=True + ) self.local_files_only = local_files_only self.model = None if model_name is not None: self.init_model(model_name=model_name) - def _check_token(self, token: str): if get_env_variable(token) is None: warnings.warn( f"{token} is not set. You may not be able to access the model." ) - def _get_token_if_relevant(self) -> Union[str, bool]: if self.use_token: self._check_token("HF_TOKEN") token = get_env_variable("HF_TOKEN") else: - token = False + token = False return token - def _init_from_pipeline(self): clean_device_cache() - token = self._get_token_if_relevant() # return a token string or False + token = self._get_token_if_relevant() # return a token string or False self.model = pipeline( "text-generation", model=self.model_name, torch_dtype=self.torch_dtype, device=get_device(), - token=token + token=token, ) - def _init_from_automodelcasual_lm(self): - token = self._get_token_if_relevant() # return a token str or False + token = self._get_token_if_relevant() # return a token str or False self.tokenizer = AutoTokenizer.from_pretrained( self.model_name, token=token, local_files_only=self.local_files_only, - **self.auto_tokenizer_kwargs + **self.auto_tokenizer_kwargs, ) self.model = AutoModelForCausalLM.from_pretrained( self.model_name, @@ -365,7 +380,7 @@ def _init_from_automodelcasual_lm(self): device_map="auto", token=token, local_files_only=self.local_files_only, - **self.auto_model_kwargs + **self.auto_model_kwargs, ) # Set pad token if it's not already set if self.tokenizer.pad_token is None: @@ -374,18 +389,19 @@ def _init_from_automodelcasual_lm(self): self.tokenizer.eos_token_id ) # ensure consistency in the model config - @lru_cache(None) def init_model(self, model_name: str): - log.debug(f"Loading model {model_name}") + log.debug(f"Loading model {model_name}") try: if self.init_from == "autoclass": self._init_from_automodelcasual_lm() elif self.init_from == "pipeline": self._init_from_pipeline() else: - raise ValueError("argument 'init_from' must be one of 'autoclass' or 'pipeline'.") + raise ValueError( + "argument 'init_from' must be one of 'autoclass' or 'pipeline'." 
+ ) except Exception as e: log.error(f"Error loading model {model_name}: {e}") raise e @@ -437,7 +453,7 @@ def _infer_from_pipeline( apply_chat_template=True, chat_template=chat_template, chat_template_kwargs=chat_template_kwargs, - ) + ) else: model_input = self._handle_input(messages) @@ -448,7 +464,6 @@ def _infer_from_pipeline( log.info(f"Outputs: {outputs}") return outputs - def _infer_from_automodelcasual_lm( self, *, @@ -469,27 +484,28 @@ def _infer_from_automodelcasual_lm( messages, apply_chat_template=True, chat_template_kwargs=chat_template_kwargs, - chat_template=chat_template - ) + chat_template=chat_template, + ) else: - model_input = self._handle_input(messages) + model_input = self._handle_input(messages) input_ids = self.tokenizer(model_input, **self.tokenizer_kwargs).to( get_device() ) - outputs_tokens = self.model.generate(**input_ids, max_length=max_length, max_new_tokens=max_tokens, **kwargs) + outputs_tokens = self.model.generate( + **input_ids, max_length=max_length, max_new_tokens=max_tokens, **kwargs + ) outputs = [] for output in outputs_tokens: outputs.append(self.tokenizer.decode(output)) return outputs - def _handle_input( - self, - messages: Sequence[Dict[str, str]], - apply_chat_template: bool = False, - chat_template_kwargs: dict = None, - chat_template: Optional[str] = None, - ) -> str: + self, + messages: Sequence[Dict[str, str]], + apply_chat_template: bool = False, + chat_template_kwargs: dict = None, + chat_template: Optional[str] = None, + ) -> str: if apply_chat_template: if chat_template is not None: @@ -497,7 +513,9 @@ def _handle_input( prompt = self.tokenizer.apply_chat_template( messages, **chat_template_kwargs ) - if ("tokenize" in chat_template_kwargs) and (chat_template_kwargs["tokenize"] == True): + if ("tokenize" in chat_template_kwargs) and ( + chat_template_kwargs["tokenize"] == True + ): prompt = self.tokenizer.decode(prompt, **self.tokenizer_decode_kwargs) return prompt else: @@ -506,7 +524,6 @@ def _handle_input( text = messages[-1]["content"] return text - def infer_llm( self, *, @@ -524,7 +541,7 @@ def infer_llm( apply_chat_template=self.apply_chat_template, chat_template=self.chat_template, chat_template_kwargs=self.chat_template_kwargs, - **kwargs + **kwargs, ) else: return self._infer_from_automodelcasual_lm( @@ -534,13 +551,17 @@ def infer_llm( apply_chat_template=self.apply_chat_template, chat_template=self.chat_template, chat_template_kwargs=self.chat_template_kwargs, - **kwargs + **kwargs, ) # # Preprocessing, postprocessing and call for inference code # - def call(self, api_kwargs: Dict = None, model_type: Optional[ModelType]= ModelType.UNDEFINED): + def call( + self, + api_kwargs: Dict = None, + model_type: Optional[ModelType] = ModelType.UNDEFINED, + ): api_kwargs = api_kwargs or dict() if "model" not in api_kwargs: raise ValueError("model must be specified in api_kwargs") @@ -548,7 +569,9 @@ def call(self, api_kwargs: Dict = None, model_type: Optional[ModelType]= ModelTy model_name = api_kwargs["model"] if (model_name != self.model_name) and (self.model_name is not None): # need to update the model_name - log.warning(f"The model passed in 'model_kwargs' is different that the one that has been previously initialised: Updating model from {self.model_name} to {model_name}.") + log.warning( + f"The model passed in 'model_kwargs' is different that the one that has been previously initialised: Updating model from {self.model_name} to {model_name}." 
+ ) self.model_name = model_name self.init_model(model_name=model_name) elif (model_name != self.model_name) and (self.model_name is None): @@ -556,11 +579,9 @@ def call(self, api_kwargs: Dict = None, model_type: Optional[ModelType]= ModelTy self.model_name = model_name self.init_model(model_name=model_name) - output = self.infer_llm(**api_kwargs) return output - def _parse_chat_completion_from_pipeline(self, completion: Any) -> str: text = completion[0]["generated_text"] @@ -573,12 +594,12 @@ def _parse_chat_completion_from_pipeline(self, completion: Any) -> str: else: return "" - - def _parse_chat_completion_from_automodelcasual_lm(self, completion: Any) -> GeneratorOutput: + def _parse_chat_completion_from_automodelcasual_lm( + self, completion: Any + ) -> GeneratorOutput: print(f"completion: {completion}") return completion[0] - def parse_chat_completion(self, completion: Any) -> str: try: if self.init_from == "pipeline": @@ -590,18 +611,19 @@ def parse_chat_completion(self, completion: Any) -> str: log.error(f"Error parsing chat completion: {e}") return GeneratorOutput(data=None, raw_response=str(completion), error=e) - def convert_inputs_to_api_kwargs( self, input: Any, # for retriever, it is a single query, model_kwargs: dict = None, - model_type: Optional[ModelType]= ModelType.UNDEFINED + model_type: Optional[ModelType] = ModelType.UNDEFINED, ) -> dict: model_kwargs = model_kwargs or dict() final_model_kwargs = model_kwargs.copy() assert "model" in final_model_kwargs, "model must be specified" - #messages = [{"role": "system", "content": input}] - messages = [{"role": "user", "content": input}] # Not sure, but it seems to make more sense + # messages = [{"role": "system", "content": input}] + messages = [ + {"role": "user", "content": input} + ] # Not sure, but it seems to make more sense final_model_kwargs["messages"] = messages return final_model_kwargs @@ -615,6 +637,7 @@ class TransformerRerankerModelClient(ModelClient): Find how to apply tokens here: https://huggingface.co/docs/hub/security-tokens Once you have a token and have access, put the token in the environment variable HF_TOKEN. 
""" + # # Model initialisation # @@ -626,32 +649,31 @@ def __init__( auto_tokenizer_kwargs: Optional[dict] = None, auto_model: Optional[type] = AutoModelForSequenceClassification, auto_tokenizer: Optional[type] = AutoTokenizer, - local_files_only: Optional[bool] = False + local_files_only: Optional[bool] = False, ): self.auto_model = auto_model self.auto_model_kwargs = auto_model_kwargs or dict() self.auto_tokenizer_kwargs = auto_tokenizer_kwargs or dict() - self.auto_tokenizer= auto_tokenizer + self.auto_tokenizer = auto_tokenizer self.model_name = model_name self.tokenizer_kwargs = tokenizer_kwargs or dict() if "return_tensors" not in self.tokenizer_kwargs: - self.tokenizer_kwargs["return_tensors"]= "pt" + self.tokenizer_kwargs["return_tensors"] = "pt" self.local_files_only = local_files_only if model_name is not None: self.init_model(model_name=model_name) - def init_model(self, model_name: str): try: self.tokenizer = self.auto_tokenizer.from_pretrained( - self.model_name, - local_files_only=self.local_files_only, - **self.auto_tokenizer_kwargs + self.model_name, + local_files_only=self.local_files_only, + **self.auto_tokenizer_kwargs, ) self.model = self.auto_model.from_pretrained( - self.model_name, - local_files_only=self.local_files_only, - **self.auto_model_kwargs + self.model_name, + local_files_only=self.local_files_only, + **self.auto_model_kwargs, ) # Check device availability and set the device device = get_device() @@ -684,10 +706,7 @@ def infer_reranker( with torch.no_grad(): - inputs = self.tokenizer( - input, - **self.tokenizer_kwargs - ) + inputs = self.tokenizer(input, **self.tokenizer_kwargs) inputs = {k: v.to(self.device) for k, v in inputs.items()} scores = ( self.model(**inputs, return_dict=True) @@ -713,7 +732,9 @@ def call(self, api_kwargs: Dict = None): model_name = api_kwargs["model"] if (model_name != self.model_name) and (self.model_name is not None): # need to update the model_name - log.warning(f"The model passed in 'model_kwargs' is different that the one that has been previously initialised: Updating model from {self.model_name} to {model_name}.") + log.warning( + f"The model passed in 'model_kwargs' is different that the one that has been previously initialised: Updating model from {self.model_name} to {model_name}." + ) self.model_name = model_name self.init_model(model_name=model_name) elif (model_name != self.model_name) and (self.model_name is None): @@ -727,13 +748,10 @@ def call(self, api_kwargs: Dict = None): top_k = api_kwargs.pop("top_k") scores = self.infer_reranker(**api_kwargs) - top_k_indices, top_k_scores = get_top_k_indices_scores( - scores, top_k - ) + top_k_indices, top_k_scores = get_top_k_indices_scores(scores, top_k) log.warning(f"output: ({top_k_indices}, {top_k_scores})") return top_k_indices, top_k_scores - def convert_inputs_to_api_kwargs( self, input: Any, # for retriever, it is a single query,