diff --git a/SETUP.md b/SETUP.md new file mode 100644 index 00000000..887eff68 --- /dev/null +++ b/SETUP.md @@ -0,0 +1,3 @@ +# Create a kernel + +```poetry run python -m ipykernel install --user --name my-project-kernel``` diff --git a/adalflow/CHANGELOG.md b/adalflow/CHANGELOG.md index 05bbbbd8..315814a6 100644 --- a/adalflow/CHANGELOG.md +++ b/adalflow/CHANGELOG.md @@ -1,3 +1,8 @@ +## [0.2.5] - 2024-10-28 + +### Fixed +- `DataClassParser` nested data class parsing where we have to use `from_dict(json_dict)` instead of `(**json_dict)` to parse the nested data class. + ## [0.2.4] - 2024-10-27 ### Added diff --git a/adalflow/adalflow/__init__.py b/adalflow/adalflow/__init__.py index 184dc514..af90187b 100644 --- a/adalflow/adalflow/__init__.py +++ b/adalflow/adalflow/__init__.py @@ -1,4 +1,4 @@ -__version__ = "0.2.4" +__version__ = "0.2.5" from adalflow.core.component import Component, fun_to_component from adalflow.core.container import Sequential @@ -15,6 +15,8 @@ ) from adalflow.core.model_client import ModelClient from adalflow.core.embedder import Embedder + +# parser from adalflow.core.string_parser import ( YamlParser, JsonParser, @@ -30,7 +32,10 @@ ListOutputParser, ) from adalflow.components.output_parsers.dataclass_parser import DataClassParser + from adalflow.core.prompt_builder import Prompt + +# optimization from adalflow.optim import ( Optimizer, DemoOptimizer, diff --git a/adalflow/adalflow/components/output_parsers/dataclass_parser.py b/adalflow/adalflow/components/output_parsers/dataclass_parser.py index 057f97d9..6d2e56dd 100644 --- a/adalflow/adalflow/components/output_parsers/dataclass_parser.py +++ b/adalflow/adalflow/components/output_parsers/dataclass_parser.py @@ -1,4 +1,4 @@ -"""DataClassParser will help users convert a dataclass to prompt""" +"""DataClassParser will help users interact with LLMs even better than JsonOutputParser and YamlOutputParser with DataClass.""" from dataclasses import is_dataclass from typing import Any, Literal, 
List, Optional @@ -43,9 +43,46 @@ class DataClassParser(Component): - __doc__ = ( - r"""This is similar to Dspy's signature but more controllable and flexible.""" - ) + __doc__ = r"""Made the structured output even simpler compared with JsonOutputParser and YamlOutputParser. + + 1. Understands __input_fields__ and __output_fields__ from the DataClass (no need to use include/exclude to decide fields). + 2. User can choose to save the `task_desc` in the DataClass and use it in the prompt. + + Example: + + .. code-block:: python + + @dataclass + class BasicQAOutput(adal.DataClass): + explanation: str = field( + metadata={"desc": "A brief explanation of the concept in one sentence."} + ) + example: str = field( + metadata={"desc": "An example of the concept in a sentence."} + ) + # Control output fields order + __output_fields__ = ["explanation", "example"] + + # Define the template using jinja2 syntax + qa_template = " + You are a helpful assistant. + + {{output_format_str}} + + + {{input_str}} " + + parser = adal.DataClassParser(data_class=BasicQAOutput, return_data_class=True) + + # Set up the generator with model, template, and parser + self.generator = adal.Generator( + model_client=model_client, + model_kwargs=model_kwargs, + template=qa_template, + prompt_kwargs={"output_format_str": parser.get_output_format_str()}, + output_processors=parser, + ) + """ def __init__( self, @@ -132,10 +169,10 @@ def get_examples_str( def call(self, input: str) -> Any: r"""Parse the output string to the desired format and return the parsed output.""" try: - output = self._output_processor(input) + output_dict = self._output_processor(input) if self._return_data_class: - return self._data_class(**output) - return output + return self._data_class.from_dict(output_dict) + return output_dict except Exception as e: log.error(f"Error at parsing output: {e}") raise ValueError(f"Error: {e}") diff --git a/adalflow/adalflow/components/output_parsers/outputs.py 
b/adalflow/adalflow/components/output_parsers/outputs.py index b38f63a4..1f4ff652 100644 --- a/adalflow/adalflow/components/output_parsers/outputs.py +++ b/adalflow/adalflow/components/output_parsers/outputs.py @@ -1,4 +1,11 @@ -"""The most commonly used output parsers for the Generator.""" +"""The most commonly used output parsers for the Generator. + +Includes: +- YamlOutputParser: YAML output parser using dataclass for schema extraction. +- JsonOutputParser: JSON output parser using dataclass for schema extraction. +- ListOutputParser: List output parser to parse list of objects from the string. +- BooleanOutputParser: Boolean output parser to parse boolean values from the string. +""" from dataclasses import is_dataclass from typing import Dict, Any, Optional, List diff --git a/adalflow/pyproject.toml b/adalflow/pyproject.toml index c2705364..49a6cbe8 100644 --- a/adalflow/pyproject.toml +++ b/adalflow/pyproject.toml @@ -1,8 +1,8 @@ [tool.poetry] name = "adalflow" -version = "0.2.4" -description = "The Library to Build and Auto-optimize Any LLM Task Pipeline" +version = "0.2.5" +description = "The Library to Build and Auto-optimize LLM Applications" authors = ["Li Yin "] readme = "README.md" repository = "https://github.com/SylphAI-Inc/AdalFlow" diff --git a/adalflow/tests/test_data_class_parser.py b/adalflow/tests/test_data_class_parser.py new file mode 100644 index 00000000..b87fde4c --- /dev/null +++ b/adalflow/tests/test_data_class_parser.py @@ -0,0 +1,142 @@ +import unittest +from dataclasses import dataclass, field +from typing import List +from adalflow.core.base_data_class import DataClass +from adalflow.components.output_parsers.dataclass_parser import DataClassParser + + +# Define a basic DataClass for testing +@dataclass +class BasicOutput(DataClass): + explanation: str = field( + metadata={"desc": "A brief explanation of the concept in one sentence."} + ) + example: str = field(metadata={"desc": "An example of the concept in a sentence."}) + 
__output_fields__ = ["explanation", "example"] + + +# Define a nested DataClass for testing +@dataclass +class NestedOutput(DataClass): + title: str + description: str + items: List[str] + __output_fields__ = ["title", "description", "items"] + + +class TestDataClassParser(unittest.TestCase): + + def setUp(self): + self.basic_data_class = BasicOutput + self.nested_data_class = NestedOutput + self.basic_parser = DataClassParser( + data_class=self.basic_data_class, return_data_class=True, format_type="json" + ) + self.nested_parser = DataClassParser( + data_class=self.nested_data_class, + return_data_class=True, + format_type="yaml", + ) + + def test_basic_data_class_json(self): + input_instance = BasicOutput( + explanation="This is a test.", example="Example sentence." + ) + input_str = self.basic_parser.get_input_str(input_instance) + self.assertIn("This is a test.", input_str) + self.assertIn("Example sentence.", input_str) + + output_format_str = self.basic_parser.get_output_format_str() + self.assertIn("explanation", output_format_str) + self.assertIn("example", output_format_str) + + output = self.basic_parser.call( + '{"explanation": "Test explanation", "example": "Test example."}' + ) + self.assertIsInstance(output, BasicOutput) + + def test_basic_data_class_yaml(self): + self.yaml_parser = DataClassParser( + data_class=self.basic_data_class, return_data_class=True, format_type="yaml" + ) + input_instance = BasicOutput( + explanation="This is a test.", example="Example sentence." 
+ ) + input_str = self.yaml_parser.get_input_str(input_instance) + self.assertIn("This is a test.", input_str) + + self.assertIn("Example sentence.", input_str) + + output_format_str = self.yaml_parser.get_output_format_str() + self.assertIn("explanation", output_format_str) + self.assertIn("example", output_format_str) + + output = self.yaml_parser.call( + """explanation: Test explanation +example: Test example.""" + ) + print(f"output: {output}") + self.assertIsInstance(output, BasicOutput) + + def test_nested_data_class_json(self): + input_instance = NestedOutput( + title="Title", description="Description", items=["Item 1", "Item 2"] + ) + input_str = self.nested_parser.get_input_str(input_instance) + self.assertIn("Title", input_str) + self.assertIn("Description", input_str) + self.assertIn("Item 1", input_str) + self.assertIn("Item 2", input_str) + + output_format_str = self.nested_parser.get_output_format_str() + self.assertIn("title", output_format_str) + self.assertIn("description", output_format_str) + self.assertIn("items", output_format_str) + + output = self.nested_parser.call( + """title: Nested Title +description: Nested description +items: + - Item 1 + - Item 2""" + ) + self.assertIsInstance(output, NestedOutput) + + def test_nested_data_class_yaml(self): + self.nested_parser._format_type = "yaml" + input_instance = NestedOutput( + title="Title", description="Description", items=["Item 1", "Item 2"] + ) + input_str = self.nested_parser.get_input_str(input_instance) + self.assertIn("Title", input_str) + self.assertIn("Description", input_str) + self.assertIn("Item 1", input_str) + self.assertIn("Item 2", input_str) + + output_format_str = self.nested_parser.get_output_format_str() + self.assertIn("title", output_format_str) + self.assertIn("description", output_format_str) + self.assertIn("items", output_format_str) + + output = self.nested_parser.call( + """title: Nested Title +description: Nested description +items: + - Item 1 + - Item 2""" + ) + 
self.assertIsInstance(output, NestedOutput) + + def test_invalid_data_class(self): + with self.assertRaises(ValueError): + DataClassParser(data_class=dict) # dict is not a dataclass + + def test_invalid_format_type(self): + with self.assertRaises(ValueError): + DataClassParser( + data_class=self.basic_data_class, format_type="xml" + ) # Invalid format type + + +if __name__ == "__main__": + unittest.main() diff --git a/adalflow/tests/test_output_parser.py b/adalflow/tests/test_output_parser.py index b9502f77..a2b529dc 100644 --- a/adalflow/tests/test_output_parser.py +++ b/adalflow/tests/test_output_parser.py @@ -13,6 +13,8 @@ class User(DataClass): id: int = field(default=1, metadata={"description": "User ID"}) name: str = field(default="John", metadata={"description": "User name"}) + __input_fields__ = ["id", "name"] + class TestOutputParsers(unittest.TestCase): diff --git a/docs/source/apis/components/index.rst b/docs/source/apis/components/index.rst index 893e7483..fce07dc1 100644 --- a/docs/source/apis/components/index.rst +++ b/docs/source/apis/components/index.rst @@ -49,6 +49,7 @@ Output Parsers .. autosummary:: components.output_parsers.outputs + components.output_parsers.dataclass_parser Agent ~~~~~~~~~~~~~~~~~~~~ diff --git a/docs/source/tutorials/base_data_class.rst b/docs/source/tutorials/base_data_class.rst index da78f58e..081533eb 100644 --- a/docs/source/tutorials/base_data_class.rst +++ b/docs/source/tutorials/base_data_class.rst @@ -1,4 +1,15 @@ .. _core-base_data_class_note: + + +.. raw:: html + +
+ + Try Quickstart in Colab + + +
+ DataClass ============ @@ -7,10 +18,10 @@ DataClass .. `Li Yin `_ -In `PyTorch`, ``Tensor`` is the data type used in ``Module`` and ``Optimizer`` across the library. -Tensor wraps a multi-dimensional matrix to better support its operations and computations. + In LLM applications, data constantly needs to interact with LLMs in the form of strings via prompt and be parsed back to structured data from LLMs' text prediction. :class:`DataClass` is designed to ease this data interaction with LLMs via prompt(input) and to parse the text prediction(output). +It is even more convenient to use together with :ref:`components-output_parser_note`. .. figure:: /_static/images/dataclass.png :align: center @@ -61,11 +72,13 @@ Here is how users typically use the ``dataclasses`` module: We also made the effort to provide more control: 1. **Keep the ordering of your data fields.** We provided :func:`required_field` with ``default_factory`` to mark the field as required even if it is after optional fields. We also has to do customization to preserve their ordering while being converted to dictionary, json and yaml string. -2. **Exclude some fields from the output.** All serialization methods support `exclude` parameter to exclude some fields even for nested dataclasses. -3. **Allow nested dataclasses, lists, and dictionaries.** All methods support nested dataclasses, lists, and dictionaries. +2. **Signal the output/input fields.** We allow you to use ``__output_fields__`` and ``__input_fields__`` to explicitly signal the output and input fields. (1) It can be a subset of the fields in the data class. (2) You can specify the ordering in the `__output_fields__`. +3. **Exclude some fields from the output.** All serialization methods support `exclude` parameter to exclude some fields even for nested dataclasses. +4. **Allow nested dataclasses, lists, and dictionaries.** All methods support nested dataclasses, lists, and dictionaries. +5. 
**Easy to use with Output parser.** It works well with output parsers such as ``JsonOutputParser``, ``YamlOutputParser``, and ``DataClassParser``. You can refer to :ref:`components-output_parser_note` for more details. -Describing the Data Format +Describing the Data Format (Data Class) ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ .. list-table:: @@ -74,6 +87,10 @@ Describing the Data Format * - **Name** - **Description** + * - ``__input_fields__`` + - A list of fields that are input fields. + * - ``__output_fields__`` + - Used more often than ``__input_fields__``. A list of fields that are output fields. (1) It can be a subset of the fields in the data class. (2) You can specify the ordering in the `__output_fields__`. (3) Works well and only with :class:`DataClassParser`. * - ``to_schema(cls, exclude) -> Dict`` - Generate a JSON schema which is more detailed than the signature. * - ``to_schema_str(cls, exclude) -> str`` @@ -227,7 +244,7 @@ As you can see, it handles the nested dataclass `Question` and the required fiel .. note:: - ``Optional`` type hint will not affect the field's required status. You can use this to work with static type checkers such as `mypy` if you want to. + ``Optional`` type hint will not affect the field's required status. We recommend you not to use it in the `dataclasses` module especially when you are nesting many levels of dataclasses. It might end up confusing the LLMs. **Signature** @@ -600,7 +617,10 @@ You can simply do a bit customization to map the dataset's key to the field name If you are looking for data types we used to support each component or any other class like `Optimizer`, you can check out the :ref:`core.types` file. 
- +About __output_fields__ +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +Though you can use `exclude` in the :class:`JsonOutputParser` to exclude some fields from the output, it is less readable and less convenient than +directly using `__output_fields__` in the data class to signal the output fields and working directly with :class:`DataClassParser`. .. admonition:: References :class: highlight @@ -616,7 +636,9 @@ You can simply do a bit customization to map the dataset's key to the field name - :class:`core.base_data_class.DataClassFormatType` - :func:`core.functional.custom_asdict` - :ref:`core.base_data_class` - + - :class:`core.base_data_class.required_field` + - :class:`components.output_parsers.outputs.JsonOutputParser` + - :class:`components.output_parsers.dataclass_parser.DataClassParser` .. Document .. ------------ diff --git a/docs/source/tutorials/output_parsers.rst b/docs/source/tutorials/output_parsers.rst index c619998b..0df19e09 100644 --- a/docs/source/tutorials/output_parsers.rst +++ b/docs/source/tutorials/output_parsers.rst @@ -1,7 +1,26 @@ +.. _components-output_parser_note: + +.. raw:: html + + + Parser ============= -Parser is the `interpreter` of the LLM output. +Parser is the `interpreter` of the LLM output. We have three types of parsers: + +- **String Parsers**: they simply convert the string to the desired data type. They are located at :ref:`core.string_parser`. +- **Output Parsers**: they orchestrate the parsing and output formatting (in YAML, JSON, and more) process. They are located at :ref:`components.output_parsers.outputs`. :class:`JsonOutputParser` and :class:`YamlOutputParser` can work with :ref:`DataClass` for structured output. +- **DataClass Parser**: On top of `YamlOutputParser` and `JsonOutputParser`, :class:`DataClassParser` is the most compatible to work with :ref:`DataClass` for structured output. @@ -140,7 +159,44 @@ Thus, ``JsonOutputParser`` and ``YamlOutputParser`` both takes the following arg - ``data_class``: the ``DataClass`` type. 
- ``examples``: the examples of the data class instance if you want to show the examples in the prompt. -- ``exclude``: the fields to exclude from both the data format and the examples. +- ``exclude``: the fields to exclude from both the data format and the examples, a way to tell the ``format_instructions`` on which is the output field from the data class. + +DataClass Parser +~~~~~~~~~~~~~~~~~~~~ +To make things even easier for the developers, we created :class:`DataClassParser` which +understands `__input_fields__` and `__output_fields__` of the `DataClass`, and it is especially helpful to work on a training dataset where we will have both inputs and outputs. +Users do not have to use `exclude/include` fields to specify the output fields, it will automatically understand the output fields from the `DataClass` instance. + +Below is an overview of its key components and functionalities. + +.. list-table:: + :header-rows: 1 + :widths: 20 20 60 + + * - Method + - Description + - Details + * - ``__init__(data_class: DataClass, return_data_class: bool = False, format_type: Literal["yaml", "json"] = "json")`` + - Initializes the DataClassParser + - Takes a DataClass type, whether to return the DataClass instance after parsing, and the output format type (JSON or YAML). + * - ``get_input_format_str() -> str`` + - Returns formatted instructions for input data + - Provides a string representation of the input fields defined in the DataClass. + * - ``get_output_format_str() -> str`` + - Returns formatted instructions for output data + - Generates a schema string for the output fields of the DataClass. + * - ``get_input_str(input: DataClass) -> str`` + - Formats the input data as a string + - Converts a DataClass instance to either JSON or YAML based on the specified format type. + * - ``get_task_desc_str() -> str`` + - Returns the task description string + - Retrieves the task description associated with the DataClass, useful for context in LLM prompts. 
+ * - ``get_examples_str(examples: List[DataClass], include: Optional[IncludeType] = None, exclude: Optional[ExcludeType] = None) -> str`` + - Formats a list of example DataClass instances + - Generates a formatted string representation of examples, adhering to the specified ``include/exclude`` parameters. + * - ``call(input: str) -> Any`` + - Parses the output string to the desired format and returns parsed output + - Handles both JSON and YAML parsing, converting to the corresponding DataClass if specified. .. TODO: a summary table and a diagram @@ -148,7 +204,8 @@ Parser in Action ------------------ All of the parsers are quite straightforward to use. -**BooleanParser** +BooleanParser +~~~~~~~~~~~~~~~~~~ .. code-block:: python @@ -181,7 +238,9 @@ The printout will be: Boolean parsers will not work for '1', '0', 'yes', 'no' as they are not the standard boolean values. -**IntParser** + +IntParser +~~~~~~~~~~~~~~~~~~ .. code-block:: python @@ -210,7 +269,9 @@ The printout will be: ``IntParser`` will return the integer value of the first number in the string, even if it is a float. -**FloatParser** + +FloatParser +~~~~~~~~~~~~~~~~~~ .. code-block:: python @@ -240,7 +301,9 @@ The printout will be: ``FloatParser`` will return the float value of the first number in the string, even if it is an integer. -**ListParser** + +ListParser +~~~~~~~~~~~~~~~~~~ .. code-block:: python @@ -263,7 +326,9 @@ The output will be: ['key', 2] [{'key': 'value'}, {'key': 'value'}] -**JsonParser** + +JsonParser +~~~~~~~~~~~~~~~~~~ Even though it can work on lists, it is better to only use it for dictionaries. @@ -294,7 +359,9 @@ The output will be: ['key', 2] [{'key': 'value'}, {'key': 'value'}] -**YamlParser** + +YamlParser +~~~~~~~~~~~~~~~~~~ Though it works almost on all of the previous examples, it is better to use it for yaml formatted dictionaries. 
@@ -344,7 +411,9 @@ And we will demonstrate how to use ``JsonOutputParser`` and ``YamlOutputParser`` user_example = User(id=1, name="John") -**JsonOutputParser** + +JsonOutputParser +~~~~~~~~~~~~~~~~~~ Here is how to use ``JsonOutputParser``: @@ -416,7 +485,9 @@ The output will be: {'id': 2, 'name': 'Jane'} -**YamlOutputParser** + +YamlOutputParser +~~~~~~~~~~~~~~~~~~ The steps are totally the same as the ``JsonOutputParser``. @@ -496,6 +567,147 @@ The output will be: .. .. [1] Jinja2: https://jinja.palletsprojects.com/en/3.1.x/ .. .. [2] Llama3 special tokens: https://llama.meta.com/docs/model-cards-and-prompt-formats/meta-llama-3/ +DataClassParser in Action +-------------------------- + +First, let's create a new data class with both input and output fields. + +.. code-block:: python + + @dataclass + class SampleDataClass(DataClass): + description: str = field(metadata={"desc": "A sample description"}) + category: str = field(metadata={"desc": "Category of the sample"}) + value: int = field(metadata={"desc": "A sample integer value"}) + status: str = field(metadata={"desc": "Status of the sample"}) + + __input_fields__ = [ + "description", + "category", + ] # Define which fields are input fields + __output_fields__ = ["value", "status"] # Define which fields are output fields + + +Now, let's create a parser that will use the `SampleDataClass` to parse the output JSON string back to the data class instance. + +.. code-block:: python + + from adalflow.components.output_parsers import DataClassParser + + parser = DataClassParser(data_class=SampleDataClass, return_data_class=True, format_type="json") + +Let's view the structure of the parser using `print(parser)`. + +The output will be: + +.. 
code-block:: + + DataClassParser( + data_class=SampleDataClass, format_type=json, return_data_class=True, input_fields=['description', 'category'], output_fields=['value', 'status'] + (_output_processor): JsonParser() + (output_format_prompt): Prompt( + template: Your output should be formatted as a standard JSON instance with the following schema: + ``` + {{schema}} + ``` + -Make sure to always enclose the JSON output in triple backticks (```). Please do not add anything other than valid JSON output! + -Use double quotes for the keys and string values. + -DO NOT mistaken the "properties" and "type" in the schema as the actual fields in the JSON output. + -Follow the JSON formatting conventions., prompt_variables: ['schema'] + ) + ) + +You can get the output and input format strings using the following methods: + +.. code-block:: python + + print(parser.get_input_format_str()) + print(parser.get_output_format_str()) + +The output for the output format string will be: + +.. code-block:: + + Your output should be formatted as a standard JSON instance with the following schema: + ``` + { + "value": " (int) (required)", + "status": " (str) (required)" + } + ``` + -Make sure to always enclose the JSON output in triple backticks (```). Please do not add anything other than valid JSON output! + -Use double quotes for the keys and string values. + -DO NOT mistaken the "properties" and "type" in the schema as the actual fields in the JSON output. + -Follow the JSON formatting conventions. + +The input format string will be: + +.. code-block:: + + { + "description": " (str) (required)", + "category": " (str) (required)" + } + +Convert a json string to a data class instance: + +.. code-block:: python + + user_input = '{"description": "Parsed description", "category": "Sample Category", "value": 100, "status": "active"}' + parsed_instance = parser.call(user_input) + + print(parsed_instance) + +The output will be: + +.. 
code-block:: python + + SampleDataClass(description='Parsed description', category='Sample Category', value=100, status='active') + +Try the examples string: + +.. code-block:: python + + samples = [ + SampleDataClass( + description="Sample description", + category="Sample category", + value=100, + status="active", + ), + SampleDataClass( + description="Another description", + category="Another category", + value=200, + status="inactive", + ), + ] + + examples_str = parser.get_examples_str(examples=samples) + print(examples_str) + +The output will be: + +.. code-block:: python + + examples_str: + { + "description": "Sample description", + "category": "Sample category", + "value": 100, + "status": "active" + } + __________ + { + "description": "Another description", + "category": "Another category", + "value": 200, + "status": "inactive" + } + __________ + + + .. admonition:: API References :class: highlight @@ -507,3 +719,5 @@ The output will be: - :class:`components.output_parsers.outputs.OutputParser` - :class:`components.output_parsers.outputs.BooleanOutputParser` - :class:`components.output_parsers.outputs.ListOutputParser` + - :class:`components.output_parsers.dataclass_parser.DataClassParser` + - :class:`core.base_data_class.DataClass` diff --git a/notebooks/adalflow_colab_template.ipynb b/notebooks/adalflow_colab_template.ipynb index 480d5b1a..191bbf08 100644 --- a/notebooks/adalflow_colab_template.ipynb +++ b/notebooks/adalflow_colab_template.ipynb @@ -5,7 +5,7 @@ "metadata": {}, "source": [ "# 🤗 Welcome to AdalFlow!\n", - "## The PyTorch library to auto-optimize any LLM task pipelines\n", + "## The library to build & auto-optimize any LLM task pipelines\n", "\n", "Thanks for trying us out, we're here to provide you with the best LLM application development experience you can dream of 😊 any questions or concerns you may have, [come talk to us on discord,](https://discord.gg/ezzszrRZvT) we're always here to help! 
⭐ Star us on Github ⭐\n", "\n", @@ -20,6 +20,10 @@ "\n", "Common use cases along with the auto-optimization: check out [Use cases](https://adalflow.sylph.ai/use_cases/index.html).\n", "\n", + "# Author\n", + "\n", + "This notebook was created by community contributor [Name](Replace_to_github_or_other_social_account).\n", + "\n", "# Outline\n", "\n", "This is a quick introduction of what AdalFlow is capable of. We will cover:\n", diff --git a/notebooks/tutorials/adalflow_dataclasses.ipynb b/notebooks/tutorials/adalflow_dataclasses.ipynb new file mode 100644 index 00000000..5218f5e7 --- /dev/null +++ b/notebooks/tutorials/adalflow_dataclasses.ipynb @@ -0,0 +1,963 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": { + "id": "hGLYrUwBmvUD" + }, + "source": [ + "\n", + " \"Open\n", + "\n" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "gHK6HFngl6iP" + }, + "source": [ + "# 🤗 Welcome to AdalFlow!\n", + "## The library to build & auto-optimize any LLM task pipelines\n", + "\n", + "Thanks for trying us out, we're here to provide you with the best LLM application development experience you can dream of 😊 any questions or concerns you may have, [come talk to us on discord,](https://discord.gg/ezzszrRZvT) we're always here to help! ⭐ Star us on Github ⭐\n", + "\n", + "\n", + "# Quick Links\n", + "\n", + "Github repo: https://github.com/SylphAI-Inc/AdalFlow\n", + "\n", + "Full Tutorials: https://adalflow.sylph.ai/index.html#.\n", + "\n", + "Deep dive on each API: check out the [developer notes](https://adalflow.sylph.ai/tutorials/index.html).\n", + "\n", + "Common use cases along with the auto-optimization: check out [Use cases](https://adalflow.sylph.ai/use_cases/index.html).\n", + "\n", + "# Author\n", + "\n", + "This notebook was created by community contributor [Ajith](https://github.com/ajithvcoder).\n", + "\n", + "# Outline\n", + "\n", + "This is a quick introduction of what AdalFlow is capable of. 
We will cover:\n", + "\n", + "* How to use `DataClass` with `DataClassParser`.\n", + "* How to nest dataclasses; we will test both one and two levels of nesting.\n", + "\n", + "**Next: Try our [auto-optimization](https://colab.research.google.com/drive/1n3mHUWekTEYHiBdYBTw43TKlPN41A9za?usp=sharing)**\n", + "\n", + "\n", + "# Installation\n", + "\n", + "1. Use `pip` to install the `adalflow` Python package. We will need `openai` and `groq` from the extra packages.\n", + "\n", + " ```bash\n", + " pip install adalflow[openai,groq]\n", + " ```\n", + "2. Set up the `openai` and `groq` API keys in the environment variables" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "nqe-vxB1BCux" + }, + "source": [ + "### Install adalflow" + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "metadata": { + "id": "ZaaevxNH9JMQ" + }, + "outputs": [], + "source": [ + "# Install adalflow with necessary dependencies\n", + "from IPython.display import clear_output\n", + "\n", + "!pip install -U adalflow[openai,groq]\n", + "\n", + "clear_output()" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "NGE70aZ8BLuf" + }, + "source": [ + "### Set Environment Variables\n", + "\n", + "Note: Enter your API keys in the cell below" + ] + }, + { + "cell_type": "code", + "execution_count": 23, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "j2xmGr_99YDq", + "outputId": "c3d1e0b7-9072-412e-fed1-4578404357be" + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Overwriting .env\n" + ] + } + ], + "source": [ + "%%writefile .env\n", + "\n", + "OPENAI_API_KEY=\"PASTE-OPENAI_API_KEY_HERE\"\n", + "GROQ_API_KEY=\"PASTE-GROQ_API_KEY-HERE\"" + ] + }, + { + "cell_type": "code", + "execution_count": 15, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "API keys have been set.\n" + ] + } + ], + "source": [ + "# or more securely\n", + "\n", + "import os\n", + "\n", + 
"from getpass import getpass\n", + "\n", + "# Prompt user to enter their API keys securely\n", + "groq_api_key = getpass(\"Please enter your GROQ API key: \")\n", + "openai_api_key = getpass(\"Please enter your OpenAI API key: \")\n", + "\n", + "\n", + "# Set environment variables\n", + "os.environ['GROQ_API_KEY'] = groq_api_key\n", + "os.environ['OPENAI_API_KEY'] = openai_api_key\n", + "\n", + "print(\"API keys have been set.\")" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "ZxBkm77uBZpl" + }, + "source": [ + "### Import necessary libraries" + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "metadata": { + "id": "wOAiKg899Z2u" + }, + "outputs": [], + "source": [ + "# Import required libraries\n", + "from dataclasses import dataclass, field\n", + "from typing import List, Dict\n", + "import adalflow as adal\n", + "from adalflow.components.model_client import GroqAPIClient\n", + "from adalflow.utils import setup_env" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "'0.2.4'" + ] + }, + "execution_count": 2, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "adal.__version__" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "bTzgyp6S9bnH" + }, + "outputs": [], + "source": [ + "# Load environment variables - Make sure to have OPENAI_API_KEY in .env file and .env is present in current folder\n", + "setup_env(\".env\")" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "MBW5viOG9hM8" + }, + "source": [ + "### Basic Vannila Example" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": { + "id": "YA4pAIek9ewc" + }, + "outputs": [], + "source": [ + "# Define the output structure using dataclass\n", + "@dataclass\n", + "class BasicQAOutput(adal.DataClass):\n", + " explanation: str = field(\n", + " metadata={\"desc\": \"A brief explanation of the concept in one 
sentence.\"}\n", + " )\n", + " example: str = field(\n", + " metadata={\"desc\": \"An example of the concept in a sentence.\"}\n", + " )\n", + " # Control output fields order\n", + " __output_fields__ = [\"explanation\", \"example\"]\n", + "\n", + "# Define the template using jinja2 syntax\n", + "qa_template = r\"\"\"\n", + "You are a helpful assistant.\n", + "\n", + "{{output_format_str}}\n", + "\n", + "\n", + " {{input_str}} \"\"\"" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": { + "id": "x4__jnbP9luN" + }, + "outputs": [], + "source": [ + "# Define the QA component\n", + "class QA(adal.Component):\n", + " def __init__(self, model_client: adal.ModelClient, model_kwargs: Dict):\n", + " super().__init__()\n", + "\n", + " # Initialize the parser with the output dataclass\n", + " parser = adal.DataClassParser(data_class=BasicQAOutput, return_data_class=True)\n", + "\n", + " # Set up the generator with model, template, and parser\n", + " self.generator = adal.Generator(\n", + " model_client=model_client,\n", + " model_kwargs=model_kwargs,\n", + " template=qa_template,\n", + " prompt_kwargs={\"output_format_str\": parser.get_output_format_str()},\n", + " output_processors=parser,\n", + " )\n", + "\n", + " def call(self, query: str):\n", + " \"\"\"Synchronous call to generate response\"\"\"\n", + " return self.generator.call({\"input_str\": query})\n", + "\n", + " async def acall(self, query: str):\n", + " \"\"\"Asynchronous call to generate response\"\"\"\n", + " return await self.generator.acall({\"input_str\": query})\n" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": { + "id": "TVi3rGvs9nte" + }, + "outputs": [], + "source": [ + "# Example usage\n", + "def run_basic_example():\n", + " # Instantiate the QA class with Groq model\n", + " qa = QA(\n", + " model_client=GroqAPIClient(),\n", + " model_kwargs={\"model\": \"llama3-8b-8192\"},\n", + " )\n", + "\n", + " # Print the QA instance details\n", + " print(qa)\n", 
+ "\n", + " # Test the QA system\n", + " response = qa(\"What is LLM?\")\n", + " print(\"\\nResponse:\")\n", + " print(response)\n", + " print(f\"BasicQAOutput: {response.data}\")\n", + " print(f\"Explanation: {response.data.explanation}\")\n", + " print(f\"Example: {response.data.example}\")" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "QA(\n", + " (generator): Generator(\n", + " model_kwargs={'model': 'llama3-8b-8192'}, trainable_prompt_kwargs=[]\n", + " (prompt): Prompt(\n", + " template: \n", + " You are a helpful assistant.\n", + " \n", + " {{output_format_str}}\n", + " \n", + " \n", + " {{input_str}} , prompt_kwargs: {'output_format_str': 'Your output should be formatted as a standard JSON instance with the following schema:\\n```\\n{\\n \"explanation\": \"A brief explanation of the concept in one sentence. (str) (required)\",\\n \"example\": \"An example of the concept in a sentence. (str) (required)\"\\n}\\n```\\n-Make sure to always enclose the JSON output in triple backticks (```). Please do not add anything other than valid JSON output!\\n-Use double quotes for the keys and string values.\\n-DO NOT mistaken the \"properties\" and \"type\" in the schema as the actual fields in the JSON output.\\n-Follow the JSON formatting conventions.'}, prompt_variables: ['input_str', 'output_format_str']\n", + " )\n", + " (model_client): GroqAPIClient()\n", + " (output_processors): DataClassParser(\n", + " data_class=BasicQAOutput, format_type=json, return_data_class=True, input_fields=[], output_fields=['explanation', 'example']\n", + " (_output_processor): JsonParser()\n", + " (output_format_prompt): Prompt(\n", + " template: Your output should be formatted as a standard JSON instance with the following schema:\n", + " ```\n", + " {{schema}}\n", + " ```\n", + " -Make sure to always enclose the JSON output in triple backticks (```). 
Please do not add anything other than valid JSON output!\n", + " -Use double quotes for the keys and string values.\n", + " -DO NOT mistaken the \"properties\" and \"type\" in the schema as the actual fields in the JSON output.\n", + " -Follow the JSON formatting conventions., prompt_variables: ['schema']\n", + " )\n", + " )\n", + " )\n", + ")\n", + "\n", + "Response:\n", + "GeneratorOutput(id=None, data=BasicQAOutput(explanation='Large Language Model (LLM) is a type of artificial intelligence designed to process and generate human-like language', example='The new LLM-powered chatbot was able to understand and respond to complex user queries with high accuracy'), error=None, usage=CompletionUsage(completion_tokens=60, prompt_tokens=174, total_tokens=234), raw_response='```\\n{\\n \"explanation\": \"Large Language Model (LLM) is a type of artificial intelligence designed to process and generate human-like language\",\\n \"example\": \"The new LLM-powered chatbot was able to understand and respond to complex user queries with high accuracy\"\\n}\\n```', metadata=None)\n", + "BasicQAOutput: BasicQAOutput(explanation='Large Language Model (LLM) is a type of artificial intelligence designed to process and generate human-like language', example='The new LLM-powered chatbot was able to understand and respond to complex user queries with high accuracy')\n", + "Explanation: Large Language Model (LLM) is a type of artificial intelligence designed to process and generate human-like language\n", + "Example: The new LLM-powered chatbot was able to understand and respond to complex user queries with high accuracy\n" + ] + } + ], + "source": [ + "run_basic_example()" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "1n7edLQ19ql8" + }, + "source": [ + "### Example 1 - Movie analysis data class" + ] + }, + { + "cell_type": "code", + "execution_count": 13, + "metadata": { + "id": "5Arp4-Dq9u49" + }, + "outputs": [], + "source": [ + "# 1. 
Basic DataClass with different field types\n", + "@dataclass\n", + "class MovieReview(adal.DataClass):\n", + " title: str = field(\n", + " metadata={\"desc\": \"The title of the movie\"}\n", + " )\n", + " rating: float = field(\n", + " metadata={\n", + " \"desc\": \"Rating from 1.0 to 10.0\",\n", + " \"min\": 1.0,\n", + " \"max\": 10.0\n", + " }\n", + " )\n", + " pros: List[str] = field(\n", + " default_factory=list,\n", + " metadata={\"desc\": \"List of positive points about the movie\"}\n", + " )\n", + " cons: List[str] = field(\n", + " default_factory=list,\n", + " metadata={\"desc\": \"List of negative points about the movie\"}\n", + " )\n", + "\n", + " __output_fields__ = [\"title\", \"rating\", \"pros\", \"cons\"]\n" + ] + }, + { + "cell_type": "code", + "execution_count": 14, + "metadata": { + "id": "VLbRUzXg9yP0" + }, + "outputs": [], + "source": [ + "\n", + "@dataclass\n", + "class Actor(adal.DataClass):\n", + " name: str = field(metadata={\"desc\": \"Actor's full name\"})\n", + " role: str = field(metadata={\"desc\": \"Character name in the movie\"})" + ] + }, + { + "cell_type": "code", + "execution_count": 15, + "metadata": { + "id": "7MUcu0tk91l4" + }, + "outputs": [], + "source": [ + "# 2. 
Nested DataClass example\n", + "\n", + "# Have both MovieReview and Actor nested in DetailedMovieReview\n", + "\n", + "@dataclass\n", + "class DetailedMovieReview(adal.DataClass):\n", + " basic_review: MovieReview\n", + " cast: List[Actor] = field(\n", + " default_factory=list,\n", + " metadata={\"desc\": \"List of main actors in the movie\"}\n", + " )\n", + " genre: List[str] = field(\n", + " default_factory=list,\n", + " metadata={\"desc\": \"List of genres for the movie\"}\n", + " )\n", + " recommend: bool = field(\n", + " default_factory=str,\n", + " metadata={\"desc\": \"Whether you would recommend this movie\"}\n", + " )\n", + "\n", + " __output_fields__ = [\"basic_review\", \"cast\", \"genre\", \"recommend\"]" + ] + }, + { + "cell_type": "code", + "execution_count": 16, + "metadata": {}, + "outputs": [], + "source": [ + "# Example template for movie review\n", + "movie_review_template = r\"\"\"\n", + "You are a professional movie critic. Analyze the given movie and provide a detailed review.\n", + "\n", + "{{output_format_str}}\n", + "\n", + "\n", + " Review this movie: {{movie_title}} \"\"\"" + ] + }, + { + "cell_type": "code", + "execution_count": 17, + "metadata": {}, + "outputs": [], + "source": [ + "# Create the MovieReviewer component with MovieAnalysis data class\n", + "class MovieReviewer(adal.Component):\n", + " def __init__(self, model_client: adal.ModelClient, model_kwargs: Dict, data_class: adal.DataClass):\n", + " super().__init__()\n", + " self.additional_structure_prompt = \"Dont use 'type' and 'properties' in output directly give as dict\"\n", + " parser = adal.DataClassParser(\n", + " data_class=data_class,\n", + " return_data_class=True\n", + " )\n", + " self.generator = adal.Generator(\n", + " model_client=model_client,\n", + " model_kwargs=model_kwargs,\n", + " template=movie_review_template,\n", + " prompt_kwargs={\"output_format_str\": parser.get_output_format_str() + self.additional_structure_prompt},\n", + " 
output_processors=parser,\n", + " )\n", + "\n", + " def call(self, movie_title: str):\n", + " return self.generator.call({\"movie_title\": movie_title})" + ] + }, + { + "cell_type": "code", + "execution_count": 18, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "DetailedMovieReview: DetailedMovieReview(basic_review=MovieReview(title='The Matrix', rating=8.5, pros=['Groundbreaking special effects', 'Intriguing story with complex themes', 'Well-developed characters', 'Excellent world-building'], cons=['Pacing can be slow in some parts']), cast=[Actor(name='Keanu Reeves', role='Neo'), Actor(name='Laurence Fishburne', role='Morpheus'), Actor(name='Carrie-Anne Moss', role='Trinity')], genre=['Science Fiction', 'Action'], recommend=True)\n", + "BasicReview: MovieReview(title='The Matrix', rating=8.5, pros=['Groundbreaking special effects', 'Intriguing story with complex themes', 'Well-developed characters', 'Excellent world-building'], cons=['Pacing can be slow in some parts'])\n", + "Cast: [Actor(name='Keanu Reeves', role='Neo'), Actor(name='Laurence Fishburne', role='Morpheus'), Actor(name='Carrie-Anne Moss', role='Trinity')]\n" + ] + } + ], + "source": [ + "# test the data class with one level of nesting\n", + "\n", + "reviewer = MovieReviewer(\n", + " model_client=GroqAPIClient(),\n", + " model_kwargs={\"model\": \"llama3-8b-8192\"},\n", + " data_class=DetailedMovieReview\n", + ")\n", + "\n", + "response = reviewer(\"The Matrix\")\n", + "print(f\"DetailedMovieReview: {response.data}\")\n", + "print(f\"BasicReview: {response.data.basic_review}\")\n", + "print(f\"Cast: {response.data.cast}\")" + ] + }, + { + "cell_type": "code", + "execution_count": 19, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "DetailedMovieReview: DetailedMovieReview(basic_review=MovieReview(title='The Matrix', rating=9.0, pros=['Innovative special effects and action sequences', 
'Thought-provoking storyline', 'Engaging cyberpunk aesthetic', 'Strong performances from the cast', 'Iconic fight choreography'], cons=['Complex narrative that may confuse some viewers', 'Some dated CGI when compared to modern standards']), cast=[Actor(name='Keanu Reeves', role='Neo'), Actor(name='Laurence Fishburne', role='Morpheus'), Actor(name='Carrie-Anne Moss', role='Trinity'), Actor(name='Hugo Weaving', role='Agent Smith')], genre=['Science Fiction', 'Action', 'Adventure'], recommend=True)\n", + "BasicReview: MovieReview(title='The Matrix', rating=9.0, pros=['Innovative special effects and action sequences', 'Thought-provoking storyline', 'Engaging cyberpunk aesthetic', 'Strong performances from the cast', 'Iconic fight choreography'], cons=['Complex narrative that may confuse some viewers', 'Some dated CGI when compared to modern standards'])\n", + "Cast: [Actor(name='Keanu Reeves', role='Neo'), Actor(name='Laurence Fishburne', role='Morpheus'), Actor(name='Carrie-Anne Moss', role='Trinity'), Actor(name='Hugo Weaving', role='Agent Smith')]\n" + ] + } + ], + "source": [ + "# try use openai model\n", + "reviewer = MovieReviewer(\n", + " model_client=adal.OpenAIClient(),\n", + " model_kwargs={\"model\": \"gpt-4o\"},\n", + " data_class=DetailedMovieReview\n", + ")\n", + "response = reviewer(\"The Matrix\")\n", + "print(f\"DetailedMovieReview: {response.data}\")\n", + "print(f\"BasicReview: {response.data.basic_review}\")\n", + "print(f\"Cast: {response.data.cast}\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "We see that both models handle one level of nested dataclasses quite well, and the output ordering follows the order specified in `__output_fields__`." + ] + }, + { + "cell_type": "code", + "execution_count": 20, + "metadata": { + "id": "ekr4v8Xg93en" + }, + "outputs": [], + "source": [ + "# 3. 
second level nested dataclass\n", + "\n", + "@dataclass\n", + "class MovieAnalysis(adal.DataClass):\n", + " review: DetailedMovieReview\n", + " box_office: float = field(\n", + " default=None,\n", + " metadata={\"desc\": \"Box office earnings in millions of dollars\"}\n", + " )\n", + " awards: Dict[str, int] = field(\n", + " default=None,\n", + " metadata={\"desc\": \"Dictionary of award categories and number of wins\"}\n", + " )\n", + "\n", + " __output_fields__ = [\"review\", \"box_office\", \"awards\"]" + ] + }, + { + "cell_type": "code", + "execution_count": 25, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "MovieAnalysis: MovieAnalysis(review=DetailedMovieReview(basic_review=MovieReview(title='The Matrix', rating=9.5, pros=['Innovative concept', 'Mind-bending plot', 'Impressive action sequences'], cons=['Some overly complex dialogue', 'Ending leaves room for interpretation']), cast=[Actor(name='Keanu Reeves', role='Neo'), Actor(name='Laurence Fishburne', role='Morpheus'), Actor(name='Carrie-Anne Moss', role='Trinity')], genre=['Action', 'Science Fiction'], recommend=True), box_office=463.5, awards={'Best Visual Effects': 4, 'Best Film Editing': 2, 'Best Sound': 1})\n", + "DetailedMovieReview: DetailedMovieReview(basic_review=MovieReview(title='The Matrix', rating=9.5, pros=['Innovative concept', 'Mind-bending plot', 'Impressive action sequences'], cons=['Some overly complex dialogue', 'Ending leaves room for interpretation']), cast=[Actor(name='Keanu Reeves', role='Neo'), Actor(name='Laurence Fishburne', role='Morpheus'), Actor(name='Carrie-Anne Moss', role='Trinity')], genre=['Action', 'Science Fiction'], recommend=True)\n", + "BasicReview: MovieReview(title='The Matrix', rating=9.5, pros=['Innovative concept', 'Mind-bending plot', 'Impressive action sequences'], cons=['Some overly complex dialogue', 'Ending leaves room for interpretation'])\n", + "Cast: [Actor(name='Keanu Reeves', role='Neo'), 
Actor(name='Laurence Fishburne', role='Morpheus'), Actor(name='Carrie-Anne Moss', role='Trinity')]\n" + ] + } + ], + "source": [ + "# test the data class with two levels of nested dataclass\n", + "\n", + "# gpt-3.5-turbo model\n", + "\n", + "analysis = MovieReviewer(\n", + " model_client=adal.OpenAIClient(),\n", + " model_kwargs={\"model\": \"gpt-3.5-turbo\"},\n", + " data_class=MovieAnalysis\n", + ")\n", + "\n", + "response = analysis(\"The Matrix\")\n", + "print(f\"MovieAnalysis: {response.data}\")\n", + "print(f\"DetailedMovieReview: {response.data.review}\")\n", + "print(f\"BasicReview: {response.data.review.basic_review}\")\n", + "print(f\"Cast: {response.data.review.cast}\")" + ] + }, + { + "cell_type": "code", + "execution_count": 24, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "MovieAnalysis: MovieAnalysis(review=DetailedMovieReview(basic_review=MovieReview(title='The Matrix', rating=9.5, pros=['Groundbreaking special effects', 'Thought-provoking themes', 'Innovative storyline', 'Strong performances from the cast'], cons=['Somewhat slow pacing in parts']), cast=[Actor(name='Keanu Reeves', role='Neo'), Actor(name='Laurence Fishburne', role='Morpheus'), Actor(name='Carrie-Anne Moss', role='Trinity')], genre=['Science Fiction', 'Action', 'Adventure'], recommend=True), box_office=463.5, awards={'Academy Awards': 4, 'MTV Movie Awards': 10, 'Saturn Awards': 7})\n", + "DetailedMovieReview: DetailedMovieReview(basic_review=MovieReview(title='The Matrix', rating=9.5, pros=['Groundbreaking special effects', 'Thought-provoking themes', 'Innovative storyline', 'Strong performances from the cast'], cons=['Somewhat slow pacing in parts']), cast=[Actor(name='Keanu Reeves', role='Neo'), Actor(name='Laurence Fishburne', role='Morpheus'), Actor(name='Carrie-Anne Moss', role='Trinity')], genre=['Science Fiction', 'Action', 'Adventure'], recommend=True)\n", + "BasicReview: MovieReview(title='The Matrix', rating=9.5, 
pros=['Groundbreaking special effects', 'Thought-provoking themes', 'Innovative storyline', 'Strong performances from the cast'], cons=['Somewhat slow pacing in parts'])\n", + "Cast: [Actor(name='Keanu Reeves', role='Neo'), Actor(name='Laurence Fishburne', role='Morpheus'), Actor(name='Carrie-Anne Moss', role='Trinity')]\n" + ] + } + ], + "source": [ + "# test the data class with two levels of nested dataclass\n", + "\n", + "analysis = MovieReviewer(\n", + " model_client=GroqAPIClient(),\n", + " model_kwargs={\"model\": \"llama3-8b-8192\"},\n", + " data_class=MovieAnalysis\n", + ")\n", + "\n", + "response = analysis(\"The Matrix\")\n", + "print(f\"MovieAnalysis: {response.data}\")\n", + "print(f\"DetailedMovieReview: {response.data.review}\")\n", + "print(f\"BasicReview: {response.data.review.basic_review}\")\n", + "print(f\"Cast: {response.data.review.cast}\")" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "pSTrf8_t-DCx" + }, + "source": [ + "### Example 2: Song Review\n", + "Note: Song Review is modified by keeping Example 1 - Movie Review as a reference so that we would know how to use DataClasses for similar purposes" + ] + }, + { + "cell_type": "code", + "execution_count": 26, + "metadata": { + "id": "7g9bUa0q-B6Y" + }, + "outputs": [], + "source": [ + "# 1. Basic DataClass with different field types\n", + "@dataclass\n", + "class SongReview(adal.DataClass):\n", + " title: str = field(\n", + " metadata={\"desc\": \"The title of the song\"}\n", + " )\n", + " album: str = field(\n", + " metadata={\"desc\": \"The album of the song\"}\n", + " )\n", + " ranking: int = field(\n", + " metadata={\n", + " \"desc\": \"Billboard peak ranking from 1 to 200\",\n", + " \"min\": 1,\n", + " \"max\": 200\n", + " }\n", + " )\n", + " streaming: Dict[str, int] = field(\n", + " default_factory=list,\n", + " metadata={\"desc\": \"Dict of lastest approximate streaming count in spotify and in youtube. 
Gives the count in millions\"}\n", + " )\n", + " pros: List[str] = field(\n", + " default_factory=list,\n", + " metadata={\"desc\": \"List of positive points about the song\"}\n", + " )\n", + " cons: List[str] = field(\n", + " default_factory=list,\n", + " metadata={\"desc\": \"List of negative points about the song\"}\n", + " )\n", + "\n", + " __output_fields__ = [\"title\", \"rating\", \"streaming\", \"pros\", \"cons\"]\n" + ] + }, + { + "cell_type": "code", + "execution_count": 27, + "metadata": { + "id": "UGhMRZht-HiB" + }, + "outputs": [], + "source": [ + "\n", + "@dataclass\n", + "class Artist(adal.DataClass):\n", + " name: str = field(metadata={\"desc\": \"Artist's full name\"})\n", + " role: str = field(metadata={\"desc\": \"Artist's role in the song\"})" + ] + }, + { + "cell_type": "code", + "execution_count": 28, + "metadata": { + "id": "sfNWgPYN-JAj" + }, + "outputs": [], + "source": [ + "# 2. Nested DataClass example\n", + "\n", + "@dataclass\n", + "class DetailedSongReview(adal.DataClass):\n", + " basic_review: SongReview = field(\n", + " default=SongReview, metadata={\"desc\": \"basic Song review details\"}\n", + " )\n", + " cast: List[Artist] = field(\n", + " default_factory=list,\n", + " metadata={\"desc\": \"List of main singer, lyrisist and musicians in the song\"}\n", + " )\n", + " genre: List[str] = field(\n", + " default_factory=list,\n", + " metadata={\"desc\": \"List of genres for the song\"}\n", + " )\n", + " recommend: bool = field(\n", + " default_factory=str,\n", + " metadata={\"desc\": \"Whether you would recommend this song\"}\n", + " )\n", + "\n", + " __output_fields__ = [\"basic_review\", \"cast\", \"genre\", \"recommend\"]" + ] + }, + { + "cell_type": "code", + "execution_count": 29, + "metadata": { + "id": "HG8rtCd8-K7t" + }, + "outputs": [], + "source": [ + "# 3. 
two levels of nesting dataclass\n", + "\n", + "# all these fields as we use default, it is optional, so \n", + "# llm might not output that field if they dont have information\n", + "\n", + "@dataclass\n", + "class SongAnalysis(adal.DataClass):\n", + " review: DetailedSongReview = field(\n", + " default=DetailedSongReview, metadata={\"desc\": \"Song review details\"}\n", + " )\n", + " duration: float = field(\n", + " default=None,\n", + " metadata={\"desc\": \"Duration of the song\"}\n", + " )\n", + " awards: Dict[str, int] = field(\n", + " default=None,\n", + " metadata={\"desc\": \"Dictionary of award categories and number of wins\"}\n", + " )\n", + "\n", + " __output_fields__ = [\"review\", \"duration\", \"awards\"]" + ] + }, + { + "cell_type": "code", + "execution_count": 30, + "metadata": { + "id": "v3mNeyz7-MpY" + }, + "outputs": [], + "source": [ + "# Example template for song review\n", + "song_review_template = r\"\"\"\n", + "You are a professional song critic. Analyze the given song and provide a detailed review.\n", + "\n", + "{{output_format_str}}\n", + "\n", + "\n", + " Review this song: {{song_title}} \"\"\"\n" + ] + }, + { + "cell_type": "code", + "execution_count": 31, + "metadata": { + "id": "X2eifXOU-OrE" + }, + "outputs": [], + "source": [ + "# Create the SongReviewer component with SongAnalysis data class\n", + "class SongReviewer(adal.Component):\n", + " def __init__(self, model_client: adal.ModelClient, model_kwargs: Dict):\n", + " super().__init__()\n", + " self.additional_structure_prompt = \"Dont use 'type' and 'properties' in output directly give as dict\"\n", + " parser = adal.DataClassParser(\n", + " data_class=SongAnalysis,\n", + " return_data_class=False,\n", + " format_type=\"json\"\n", + " )\n", + " self.generator = adal.Generator(\n", + " model_client=model_client,\n", + " model_kwargs=model_kwargs,\n", + " template=song_review_template,\n", + " prompt_kwargs={\"output_format_str\": parser.get_output_format_str() + 
self.additional_structure_prompt },\n", + " output_processors=parser,\n", + " )\n", + "\n", + " def call(self, song_title: str):\n", + " return self.generator.call({\"song_title\": song_title})" + ] + }, + { + "cell_type": "code", + "execution_count": 36, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "SongAnalysis: {'review': {'basic_review': {'title': 'Shape of You', 'album': '÷ (Divide)', 'ranking': 7, 'streaming': {'spotify': 4.5, 'youtube': 2.5}, 'pros': ['Catchy beat', 'Catchy melody', 'Funky rhythm', 'Great lyrics'], 'cons': ['Some may find the lyrics objectifying', 'Not typically my cup of tea']}, 'cast': [{'name': 'Ed Sheeran', 'role': 'Lead vocals, songwriting'}], 'genre': ['Pop', 'Dance', 'Electro'], 'recommend': True}, 'duration': 3.53}\n" + ] + } + ], + "source": [ + "analysis = SongReviewer(\n", + " model_client=GroqAPIClient(),\n", + " model_kwargs={\"model\": \"llama3-8b-8192\"},\n", + ")\n", + "\n", + "response = analysis(\"Shape of you\")\n", + "print(f\"SongAnalysis: {response.data}\")\n", + "\n", + "# this time as we set `return_data_class` to False in the parser, we get the output as dict" + ] + }, + { + "cell_type": "code", + "execution_count": 38, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Song Title: Shape of You\n", + "Album: ÷ (Divide)\n", + "Ranking: 7\n", + "- spotify - 4.5 million views\n", + "- youtube - 2.5 million views\n", + "\n", + "Pros:\n", + "- Catchy beat\n", + "- Catchy melody\n", + "- Funky rhythm\n", + "- Great lyrics\n", + "\n", + "Artist's:\n", + "- Ed Sheeran as Lead vocals, songwriting\n", + "\n", + "Genere: \n", + " Pop \n", + " Dance \n", + " Electro \n", + "\n", + "Duration: 3.53 minutes\n" + ] + } + ], + "source": [ + "# Access nested data\n", + "analysis = response.data\n", + "print(f\"Song Title: {analysis['review']['basic_review']['title']}\")\n", + "print(f\"Album: 
{analysis['review']['basic_review']['album']}\")\n", + "print(f\"Ranking: {analysis['review']['basic_review']['ranking']}\")\n", + "\n", + "for platform, views in analysis['review']['basic_review']['streaming'].items():\n", + " print(f\"- {platform} - {views} million views\")\n", + "print(\"\\nPros:\")\n", + "for pro in analysis['review'][\"basic_review\"][\"pros\"]:\n", + " print(f\"- {pro}\")\n", + "\n", + "print(\"\\nArtist's:\")\n", + "for actor in analysis['review'][\"cast\"]:\n", + " print(f\"- {actor['name']} as {actor['role']}\")\n", + "\n", + "if analysis['review']['genre']:\n", + " print(f\"\\nGenere: \")\n", + " for genre in analysis['review']['genre']:\n", + " print(f\" {genre} \")\n", + "\n", + "if analysis['duration']:\n", + " print(f\"\\nDuration: {analysis['duration']} minutes\")\n", + "\n", + "if hasattr(analysis, 'awards') and analysis['awards']:\n", + " print(\"\\nAwards:\")\n", + " for category, count in analysis['awards'].items():\n", + " print(f\"- {category}: {count}\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "TODOs:\n", + "1. Add `JsonOutputParser` and `YamlOutputParser` to this notebook." + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "BLAF5qTEmoyW" + }, + "source": [ + "# Issues and feedback\n", + "\n", + "If you encounter any issues, please report them here: [GitHub Issues](https://github.com/SylphAI-Inc/LightRAG/issues).\n", + "\n", + "For feedback, you can use either the [GitHub discussions](https://github.com/SylphAI-Inc/LightRAG/discussions) or [Discord](https://discord.gg/ezzszrRZvT)." 
+ ] + } + ], + "metadata": { + "colab": { + "collapsed_sections": [ + "nqe-vxB1BCux", + "NGE70aZ8BLuf" + ], + "provenance": [] + }, + "kernelspec": { + "display_name": "my-project-kernel", + "language": "python", + "name": "my-project-kernel" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.12.4" + } + }, + "nbformat": 4, + "nbformat_minor": 0 +} diff --git a/tutorials/parser_note.py b/tutorials/parser_note.py index fdc23fce..80c2c009 100644 --- a/tutorials/parser_note.py +++ b/tutorials/parser_note.py @@ -271,6 +271,62 @@ class User(DataClass): print(parsed_user) +def dataclass_parser(): + from dataclasses import dataclass, field + from adalflow.components.output_parsers import DataClassParser + from adalflow.core import DataClass + + @dataclass + class SampleDataClass(DataClass): + description: str = field(metadata={"description": "A sample description"}) + category: str = field(metadata={"description": "Category of the sample"}) + value: int = field(metadata={"description": "A sample integer value"}) + status: str = field(metadata={"description": "Status of the sample"}) + + __input_fields__ = [ + "description", + "category", + ] # Define which fields are input fields + __output_fields__ = ["value", "status"] # Define which fields are output fields + + # Initialize the DataClassParser with SampleDataClass + parser = DataClassParser( + data_class=SampleDataClass, return_data_class=True, format_type="json" + ) + print("DataClassParser instance created:\n", parser) + + # Get formatted instructions for the output format + output_format_str = parser.get_output_format_str() + print("\nOutput format string:\n", output_format_str) + + # Get formatted instructions for the input format + input_format_str = parser.get_input_format_str() + print("\nInput format string:\n", 
input_format_str) + + # Parse a sample JSON string + user_input = '{"description": "Parsed description", "category": "Sample Category", "value": 100, "status": "active"}' + parsed_instance = parser.call(user_input) + print("\nParsed DataClass instance:\n", parsed_instance) + + samples = [ + SampleDataClass( + description="Sample description", + category="Sample category", + value=100, + status="active", + ), + SampleDataClass( + description="Another description", + category="Another category", + value=200, + status="inactive", + ), + ] + + examples_str = parser.get_examples_str(examples=samples) + print(f"examples_str: {examples_str}") + + if __name__ == "__main__": examples_of_different_ways_to_parse_string() int_parser() @@ -281,3 +337,4 @@ class User(DataClass): yaml_parser() json_output_parser() yaml_output_parser() + dataclass_parser()