Merge pull request #316 from SylphAI-Inc/main

[release]

Sylph-AI authored Jan 16, 2025
2 parents 86072d7 + 13021df commit 6dc2511
Showing 73 changed files with 6,648 additions and 1,970 deletions.
4 changes: 2 additions & 2 deletions .github/workflows/python-test.yml
@@ -14,7 +14,7 @@ jobs:
python-version: ['3.9', '3.10', '3.11', '3.12']

steps:
-      - uses: actions/checkout@v3 # Updated to the latest version
+      - uses: actions/checkout@v4 # Updated to the latest version
- name: Set up Python ${{ matrix.python-version }}
uses: actions/setup-python@v4 # Updated to the latest version
with:
@@ -37,7 +37,7 @@ jobs:
poetry run pytest
- name: Upload pytest results as an artifact (optional)
-        uses: actions/upload-artifact@v3 # Updated to the latest version
+        uses: actions/upload-artifact@v4 # Updated to the latest version
if: always() # Always run this step to ensure test results are saved even if previous steps fail
with:
name: pytest-results
15 changes: 14 additions & 1 deletion adalflow/CHANGELOG.md
@@ -1,10 +1,23 @@
-## [0.2.7] - TO Be Released
+## [0.2.7] - 2025-01-16

### Added
- Completed `Memory` with `call` and `add_dialog_turn` methods.
- Integrated `LanceDB` into the `Retriever`.
- Multi-modal support (image input and generation) in `OpenAIClient`, along with tests.
- `ComponentList` to support a list of components registered in a component, plus `test_componentlist` to test it (see the sketch at the end of this entry).

### Improved
- Better diagnosis report for `Trainer.diagnose`.
- `BedrockAPIClient`: added more setup details; it is still experimental.
- `AzureAPIClient`: added more setup details; it is still experimental.
- `Retriever` class:
  - Support data id (field).
- `GradComponent`: Support pass-through gradient for the `forward` method.

### Optimization
- Aggregated all backward engine prompts in `backward_engine_prompt`.
- Added `TGDData` so the optimizer supports reasoning when proposing a new prompt.
- Added `sequential_order` in the `Trainer` to support a sequential training order; reorganized the trainer code.
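
For context, a minimal sketch of how the new `ComponentList` can be used, assuming it mirrors the registration behavior of `Sequential`; the `Pipeline` class below is hypothetical, not part of this release:

from adalflow.core.component import Component
from adalflow.core.container import ComponentList

class Pipeline(Component):
    """Hypothetical component that registers a list of sub-components."""

    def __init__(self, steps):
        super().__init__()
        # ComponentList registers each element so it is tracked like any
        # attribute-assigned sub-component.
        self.steps = ComponentList(steps)

    def call(self, x):
        for step in self.steps:
            x = step(x)
        return x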
## [0.2.6] - 2024-11-25
### Improved
- Add a default `max_tokens=512` to the `AnthropicAPIClient` to avoid errors when the user does not provide `max_tokens` in the prompt.
14 changes: 12 additions & 2 deletions adalflow/adalflow/__init__.py
@@ -1,7 +1,7 @@
__version__ = "0.2.6"
__version__ = "0.2.7"

from adalflow.core.component import Component, fun_to_component
-from adalflow.core.container import Sequential
+from adalflow.core.container import Sequential, ComponentList
from adalflow.core.base_data_class import DataClass, DataClassFormatType, required_field

from adalflow.optim.grad_component import GradComponent
@@ -63,6 +63,10 @@
BedrockAPIClient,
)

# data pipeline
from adalflow.components.data_process.text_splitter import TextSplitter
from adalflow.components.data_process.data_components import ToEmbeddings

__all__ = [
"Component",
"fun_to_component",
@@ -72,7 +76,10 @@
"required_field",
# Container
"Sequential",
"ComponentList",
# Grad Component
"GradComponent",
# Functional Component
"ModelClient",
"Generator",
"Embedder",
@@ -99,6 +106,9 @@
"JsonOutputParser",
"ListOutputParser",
"DataClassParser",
# Data Pipeline
"TextSplitter",
"ToEmbeddings",
# Types
"GeneratorOutput",
"EmbedderOutput",
185 changes: 175 additions & 10 deletions adalflow/adalflow/components/model_client/openai_client.py
@@ -1,6 +1,7 @@
"""OpenAI ModelClient integration."""

import os
import base64
from typing import (
Dict,
Sequence,
@@ -35,6 +36,7 @@
from openai.types import (
Completion,
CreateEmbeddingResponse,
Image,
)
from openai.types.chat import ChatCompletionChunk, ChatCompletion

@@ -99,7 +101,7 @@ def get_probabilities(completion: ChatCompletion) -> List[List[TokenLogProb]]:
class OpenAIClient(ModelClient):
__doc__ = r"""A component wrapper for the OpenAI API client.
-Support both embedding and chat completion API.
+Support both embedding and chat completion API, including multimodal capabilities.
Users can (1) simply use the ``Embedder`` and ``Generator`` components by passing ``OpenAIClient()`` as the model_client,
or (2) use this as an example to create their own API client, or extend this class (copying and modifying the code) in their own project.
@@ -110,6 +112,17 @@ class OpenAIClient(ModelClient):
Instead
- use :ref:`OutputParser<components-output_parsers>` for response parsing and formatting.
For multimodal inputs, provide images in model_kwargs["images"] as a path, URL, or list of them.
The model must support vision capabilities (e.g., gpt-4o, gpt-4o-mini, o1, o1-mini).
For image generation, use model_type=ModelType.IMAGE_GENERATION and provide:
- model: "dall-e-3" or "dall-e-2"
- prompt: Text description of the image to generate
- size: "1024x1024", "1024x1792", or "1792x1024" for DALL-E 3; "256x256", "512x512", or "1024x1024" for DALL-E 2
- quality: "standard" or "hd" (DALL-E 3 only)
- n: Number of images to generate (1 for DALL-E 3, 1-10 for DALL-E 2)
- response_format: "url" or "b64_json"
Args:
api_key (Optional[str], optional): OpenAI API key. Defaults to None.
chat_completion_parser (Callable[[Completion], Any], optional): A function to parse the chat completion to a str. Defaults to None.
@@ -118,6 +131,8 @@ class OpenAIClient(ModelClient):
References:
- Embeddings models: https://platform.openai.com/docs/guides/embeddings
- Chat models: https://platform.openai.com/docs/guides/text-generation
- Vision models: https://platform.openai.com/docs/guides/vision
- Image models: https://platform.openai.com/docs/guides/images
- OpenAI docs: https://platform.openai.com/docs/introduction
"""

@@ -200,7 +215,7 @@ def track_completion_usage(
def parse_embedding_response(
self, response: CreateEmbeddingResponse
) -> EmbedderOutput:
r"""Parse the embedding response to a structure LightRAG components can understand.
r"""Parse the embedding response to a structure Adalflow components can understand.
Should be called in ``Embedder``.
"""
@@ -218,7 +233,20 @@ def convert_inputs_to_api_kwargs(
) -> Dict:
r"""
Specify the API input type and output api_kwargs that will be used in _call and _acall methods.
-Convert the Component's standard input, and system_input(chat model) and model_kwargs into API-specific format
+Convert the Component's standard input and system_input (chat model), together with model_kwargs, into the API-specific format.
For multimodal inputs, images can be provided in model_kwargs["images"] as a string path, URL, or list of them.
The model specified in model_kwargs["model"] must support multimodal capabilities when using images.
Args:
input: The input text or messages to process
model_kwargs: Additional parameters including:
- images: Optional image source(s) as path, URL, or list of them
- detail: Image detail level ('auto', 'low', or 'high'), defaults to 'auto'
- model: The model to use (must support multimodal inputs if images are provided)
model_type: The type of model (EMBEDDER, LLM, or IMAGE_GENERATION)
Returns:
Dict: API-specific kwargs for the model call
"""

final_model_kwargs = model_kwargs.copy()
@@ -232,6 +260,8 @@ def convert_inputs_to_api_kwargs(
elif model_type == ModelType.LLM:
# convert input to messages
messages: List[Dict[str, str]] = []
images = final_model_kwargs.pop("images", None)
detail = final_model_kwargs.pop("detail", "auto")

if self._input_type == "messages":
system_start_tag = "<START_OF_SYSTEM_PROMPT>"
@@ -248,19 +278,74 @@
if match:
system_prompt = match.group(1)
input_str = match.group(2)

else:
print("No match found.")
if system_prompt and input_str:
messages.append({"role": "system", "content": system_prompt})
messages.append({"role": "user", "content": input_str})
if images:
content = [{"type": "text", "text": input_str}]
if isinstance(images, (str, dict)):
images = [images]
for img in images:
content.append(self._prepare_image_content(img, detail))
messages.append({"role": "user", "content": content})
else:
messages.append({"role": "user", "content": input_str})
if len(messages) == 0:
messages.append({"role": "system", "content": input})
if images:
content = [{"type": "text", "text": input}]
if isinstance(images, (str, dict)):
images = [images]
for img in images:
content.append(self._prepare_image_content(img, detail))
messages.append({"role": "user", "content": content})
else:
messages.append({"role": "system", "content": input})
final_model_kwargs["messages"] = messages
elif model_type == ModelType.IMAGE_GENERATION:
# For image generation, input is the prompt
final_model_kwargs["prompt"] = input
# Ensure model is specified
if "model" not in final_model_kwargs:
raise ValueError("model must be specified for image generation")
# Set defaults for DALL-E 3 if not specified
final_model_kwargs["size"] = final_model_kwargs.get("size", "1024x1024")
final_model_kwargs["quality"] = final_model_kwargs.get(
"quality", "standard"
)
final_model_kwargs["n"] = final_model_kwargs.get("n", 1)
final_model_kwargs["response_format"] = final_model_kwargs.get(
"response_format", "url"
)

# Handle image edits and variations
image = final_model_kwargs.get("image")
if isinstance(image, str) and os.path.isfile(image):
final_model_kwargs["image"] = self._encode_image(image)

mask = final_model_kwargs.get("mask")
if isinstance(mask, str) and os.path.isfile(mask):
final_model_kwargs["mask"] = self._encode_image(mask)
else:
raise ValueError(f"model_type {model_type} is not supported")
return final_model_kwargs
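
For reference, a sketch of the api_kwargs this method produces for a multimodal LLM call (all values are illustrative):

api_kwargs = {
    "model": "gpt-4o",
    "messages": [
        {"role": "system", "content": "<system prompt>"},
        {
            "role": "user",
            "content": [
                {"type": "text", "text": "<user question>"},
                {
                    "type": "image_url",
                    "image_url": {"url": "https://example.com/photo.jpg", "detail": "auto"},
                },
            ],
        },
    ],
}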

def parse_image_generation_response(self, response: List[Image]) -> GeneratorOutput:
"""Parse the image generation response into a GeneratorOutput."""
try:
# Extract URLs or base64 data from the response
data = [img.url or img.b64_json for img in response]
# For single image responses, unwrap from list
if len(data) == 1:
data = data[0]
return GeneratorOutput(
data=data,
raw_response=str(response),
)
except Exception as e:
log.error(f"Error parsing image generation response: {e}")
return GeneratorOutput(data=None, error=str(e), raw_response=str(response))

@backoff.on_exception(
backoff.expo,
(
@@ -285,6 +370,19 @@ def call(self, api_kwargs: Dict = {}, model_type: ModelType = ModelType.UNDEFINE
self.chat_completion_parser = handle_streaming_response
return self.sync_client.chat.completions.create(**api_kwargs)
return self.sync_client.chat.completions.create(**api_kwargs)
elif model_type == ModelType.IMAGE_GENERATION:
# Determine which image API to call based on the presence of image/mask
if "image" in api_kwargs:
if "mask" in api_kwargs:
# Image edit
response = self.sync_client.images.edit(**api_kwargs)
else:
# Image variation
response = self.sync_client.images.create_variation(**api_kwargs)
else:
# Image generation
response = self.sync_client.images.generate(**api_kwargs)
return response.data
else:
raise ValueError(f"model_type {model_type} is not supported")
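
The dispatch above selects among the three image endpoints purely by which keys appear in api_kwargs: "image" plus "mask" routes to an edit, "image" alone to a variation, and neither to a plain generation. A sketch of the edit case (file names are hypothetical):

client = OpenAIClient()
api_kwargs = client.convert_inputs_to_api_kwargs(
    input="Add a wizard hat to the cat",
    model_kwargs={"model": "dall-e-2", "image": "cat.png", "mask": "hat_area.png",
                  "size": "512x512", "n": 1},
    model_type=ModelType.IMAGE_GENERATION,
)
images = client.call(api_kwargs=api_kwargs, model_type=ModelType.IMAGE_GENERATION)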

@@ -311,6 +409,21 @@ async def acall(
return await self.async_client.embeddings.create(**api_kwargs)
elif model_type == ModelType.LLM:
return await self.async_client.chat.completions.create(**api_kwargs)
elif model_type == ModelType.IMAGE_GENERATION:
# Determine which image API to call based on the presence of image/mask
if "image" in api_kwargs:
if "mask" in api_kwargs:
# Image edit
response = await self.async_client.images.edit(**api_kwargs)
else:
# Image variation
response = await self.async_client.images.create_variation(
**api_kwargs
)
else:
# Image generation
response = await self.async_client.images.generate(**api_kwargs)
return response.data
else:
raise ValueError(f"model_type {model_type} is not supported")

@@ -332,22 +445,74 @@ def to_dict(self) -> Dict[str, Any]:
output = super().to_dict(exclude=exclude)
return output

def _encode_image(self, image_path: str) -> str:
"""Encode image to base64 string.
Args:
image_path: Path to image file.
Returns:
Base64 encoded image string.
Raises:
ValueError: If the file cannot be read or doesn't exist.
"""
try:
with open(image_path, "rb") as image_file:
return base64.b64encode(image_file.read()).decode("utf-8")
except FileNotFoundError:
raise ValueError(f"Image file not found: {image_path}")
except PermissionError:
raise ValueError(f"Permission denied when reading image file: {image_path}")
except Exception as e:
raise ValueError(f"Error encoding image {image_path}: {str(e)}")

def _prepare_image_content(
self, image_source: Union[str, Dict[str, Any]], detail: str = "auto"
) -> Dict[str, Any]:
"""Prepare image content for API request.
Args:
image_source: Either a path to local image or a URL.
detail: Image detail level ('auto', 'low', or 'high').
Returns:
Formatted image content for API request.
"""
if isinstance(image_source, str):
if image_source.startswith(("http://", "https://")):
return {
"type": "image_url",
"image_url": {"url": image_source, "detail": detail},
}
else:
base64_image = self._encode_image(image_source)
return {
"type": "image_url",
"image_url": {
"url": f"data:image/jpeg;base64,{base64_image}",
"detail": detail,
},
}
return image_source
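
A sketch of what _prepare_image_content returns for the two string cases (base64 payload abbreviated):

client = OpenAIClient()

client._prepare_image_content("https://example.com/cat.png")
# -> {"type": "image_url",
#     "image_url": {"url": "https://example.com/cat.png", "detail": "auto"}}

client._prepare_image_content("cat.png")  # hypothetical local file
# -> {"type": "image_url",
#     "image_url": {"url": "data:image/jpeg;base64,/9j/4AAQ...", "detail": "auto"}}

Note that local files are always declared as image/jpeg in the data URL, regardless of the actual file type.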


# Example usage:
# if __name__ == "__main__":
# from adalflow.core import Generator
# from adalflow.utils import setup_env, get_logger

#
# log = get_logger(level="DEBUG")

#
# setup_env()
# prompt_kwargs = {"input_str": "What is the meaning of life?"}

#
# gen = Generator(
# model_client=OpenAIClient(),
# model_kwargs={"model": "gpt-3.5-turbo", "stream": True},
# )
# gen_response = gen(prompt_kwargs)
# print(f"gen_response: {gen_response}")

#
# for genout in gen_response.data:
# print(f"genout: {genout}")
3 changes: 2 additions & 1 deletion adalflow/adalflow/core/__init__.py
@@ -1,7 +1,7 @@
from .base_data_class import DataClass, required_field, DataClassFormatType

from .component import Component, FunComponent, fun_to_component
-from .container import Sequential
+from .container import Sequential, ComponentList
from .db import LocalDB
from .default_prompt_template import DEFAULT_ADALFLOW_SYSTEM_PROMPT
from .embedder import Embedder, BatchEmbedder
@@ -50,6 +50,7 @@
"LocalDB",
"Component",
"Sequential",
"ComponentList",
"FunComponent",
"fun_to_component",
"DataClass",
(Diffs for the remaining changed files are omitted here.)