From 7b0f548827f27bc86c1e491bba7085bbf12bb347 Mon Sep 17 00:00:00 2001 From: Jason Liu Date: Thu, 15 Feb 2024 14:40:04 -0500 Subject: [PATCH 1/3] add langsmith examples --- examples/batch-classification/run.py | 28 +--- .../batch-classification/run_langsmith.py | 109 +++++++++++++++ examples/extract-table/run_vision.py | 118 ++++++++++------- .../extract-table/run_vision_langsmith.py | 124 ++++++++++++++++++ 4 files changed, 308 insertions(+), 71 deletions(-) create mode 100644 examples/batch-classification/run_langsmith.py create mode 100644 examples/extract-table/run_vision_langsmith.py diff --git a/examples/batch-classification/run.py b/examples/batch-classification/run.py index 4a1193062..3a48c1fa5 100644 --- a/examples/batch-classification/run.py +++ b/examples/batch-classification/run.py @@ -7,53 +7,43 @@ from typing import List from enum import Enum - -client = instructor.patch(AsyncOpenAI(), mode=instructor.Mode.TOOLS) +client = AsyncOpenAI() +client = instructor.patch(client, mode=instructor.Mode.TOOLS) sem = asyncio.Semaphore(5) class QuestionType(Enum): - CONTENT_OWNERSHIP = "CONTENT_OWNERSHIP" CONTACT = "CONTACT" TIMELINE_QUERY = "TIMELINE_QUERY" DOCUMENT_SEARCH = "DOCUMENT_SEARCH" COMPARE_CONTRAST = "COMPARE_CONTRAST" - MEETING_TRANSCRIPTS = "MEETING_TRANSCRIPTS" EMAIL = "EMAIL" PHOTOS = "PHOTOS" - HOW_DOES_THIS_WORK = "HOW_DOES_THIS_WORK" - NEEDLE_IN_HAYSTACK = "NEEDLE_IN_HAYSTACK" SUMMARY = "SUMMARY" -ALLOWED_TYPES = [t.value for t in QuestionType] - - # You can add more instructions and examples in the description # or you can put it in the prompt in `messages=[...]` class QuestionClassification(BaseModel): """ Predict the type of question that is being asked. - Here are some tips on how to predict the question type: - - CONTENT_OWNERSHIP: "Who owns the a certain piece of content?" CONTACT: Searches for some contact information. TIMELINE_QUERY: "When did something happen? DOCUMENT_SEARCH: "Find me a document" COMPARE_CONTRAST: "Compare and contrast two things" - MEETING_TRANSCRIPTS: "Find me a transcript of a meeting, or a soemthing said in a meeting" EMAIL: "Find me an email, search for an email" PHOTOS: "Find me a photo, search for a photo" - HOW_DOES_THIS_WORK: "How does this question /answer product work?" - NEEDLE_IN_HAYSTACK: "Find me something specific in a large amount of data" SUMMARY: "Summarize a large amount of data" """ # If you want only one classification, just change it to # `classification: QuestionType` rather than `classifications: List[QuestionType]`` + chain_of_thought: str = Field( + ..., description="The chain of thought that led to the classification" + ) classification: List[QuestionType] = Field( - description=f"An accuracy and correct prediction predicted class of question. Only allowed types: {ALLOWED_TYPES}, should be used", + description=f"An accuracy and correct prediction predicted class of question. Only allowed types: {[t.value for t in QuestionType]}, should be used", ) @field_validator("classification", mode="before") @@ -64,7 +54,6 @@ def validate_classification(cls, v): return v -# Modify the classify function async def classify(data: str) -> QuestionClassification: async with sem: # some simple rate limiting return data, await client.chat.completions.create( @@ -100,9 +89,6 @@ async def main( if __name__ == "__main__": import asyncio - path = "./data.jsonl" - # Obviously we might want to big query or - # load this from a file or something??? questions = [ "What was that ai app that i saw on the news the other day?", "Can you find the trainline booking email?", @@ -114,4 +100,4 @@ async def main( "Tell me about todays meeting and how it relates to the email on Monday", ] - asyncio.run(main(questions, path_to_jsonl=path)) + asyncio.run(main(questions)) diff --git a/examples/batch-classification/run_langsmith.py b/examples/batch-classification/run_langsmith.py new file mode 100644 index 000000000..44fe8240a --- /dev/null +++ b/examples/batch-classification/run_langsmith.py @@ -0,0 +1,109 @@ +import json +import instructor +import asyncio + +from langsmith import traceable +from langsmith.wrappers import wrap_openai + +from openai import AsyncOpenAI +from pydantic import BaseModel, Field, field_validator +from typing import List +from enum import Enum + +client = wrap_openai(AsyncOpenAI()) +client = instructor.patch(client, mode=instructor.Mode.TOOLS) +sem = asyncio.Semaphore(5) + + +class QuestionType(Enum): + CONTACT = "CONTACT" + TIMELINE_QUERY = "TIMELINE_QUERY" + DOCUMENT_SEARCH = "DOCUMENT_SEARCH" + COMPARE_CONTRAST = "COMPARE_CONTRAST" + EMAIL = "EMAIL" + PHOTOS = "PHOTOS" + SUMMARY = "SUMMARY" + + +# You can add more instructions and examples in the description +# or you can put it in the prompt in `messages=[...]` +class QuestionClassification(BaseModel): + """ + Predict the type of question that is being asked. + Here are some tips on how to predict the question type: + CONTACT: Searches for some contact information. + TIMELINE_QUERY: "When did something happen? + DOCUMENT_SEARCH: "Find me a document" + COMPARE_CONTRAST: "Compare and contrast two things" + EMAIL: "Find me an email, search for an email" + PHOTOS: "Find me a photo, search for a photo" + SUMMARY: "Summarize a large amount of data" + """ + + # If you want only one classification, just change it to + # `classification: QuestionType` rather than `classifications: List[QuestionType]`` + chain_of_thought: str = Field( + ..., description="The chain of thought that led to the classification" + ) + classification: List[QuestionType] = Field( + description=f"An accuracy and correct prediction predicted class of question. Only allowed types: {[t.value for t in QuestionType]}, should be used", + ) + + @field_validator("classification", mode="before") + def validate_classification(cls, v): + # sometimes the API returns a single value, just make sure it's a list + if not isinstance(v, list): + v = [v] + return v + + +# Modify the classify function +@traceable(name="classify-question") +async def classify(data: str) -> QuestionClassification: + async with sem: # some simple rate limiting + return data, await client.chat.completions.create( + model="gpt-4", + response_model=QuestionClassification, + max_retries=2, + messages=[ + { + "role": "user", + "content": f"Classify the following question: {data}", + }, + ], + ) + + +@traceable(name="main") +async def main( + questions: List[str], *, path_to_jsonl: str = None +) -> List[QuestionClassification]: + tasks = [classify(question) for question in questions] + for task in asyncio.as_completed(tasks): + question, label = await task + resp = { + "question": question, + "classification": [c.value for c in label.classification], + } + print(resp) + if path_to_jsonl: + with open(path_to_jsonl, "a") as f: + json_dump = json.dumps(resp) + f.write(json_dump + "\n") + + +if __name__ == "__main__": + import asyncio + + questions = [ + "What was that ai app that i saw on the news the other day?", + "Can you find the trainline booking email?", + "What was the book I saw on amazon yesturday?", + "Can you speak german?", + "Do you have access to the meeting transcripts?", + "what are the recent sites I visited?", + "what did I do on Monday?", + "Tell me about todays meeting and how it relates to the email on Monday", + ] + + asyncio.run(main(questions)) diff --git a/examples/extract-table/run_vision.py b/examples/extract-table/run_vision.py index 986478ddc..4bcbd88c8 100644 --- a/examples/extract-table/run_vision.py +++ b/examples/extract-table/run_vision.py @@ -1,6 +1,6 @@ from openai import OpenAI from io import StringIO -from typing import Annotated, Any, Iterable +from typing import Annotated, Any, List from pydantic import ( BaseModel, BeforeValidator, @@ -8,11 +8,12 @@ InstanceOf, WithJsonSchema, ) -import pandas as pd import instructor +import pandas as pd -client = instructor.patch(OpenAI(), mode=instructor.function_calls.Mode.MD_JSON) +client = OpenAI() +client = instructor.patch(client, mode=instructor.function_calls.Mode.MD_JSON) def md_to_df(data: Any) -> Any: @@ -51,53 +52,70 @@ class Table(BaseModel): dataframe: MarkdownDataFrame -tables = client.chat.completions.create( - model="gpt-4-vision-preview", - max_tokens=1000, - response_model=Iterable[Table], - messages=[ - { - "role": "user", - "content": [ - { - "type": "text", - "text": "Describe this data accurately as a table in markdown format.", - }, - { - "type": "image_url", - "image_url": { - # "url": "https://a.storyblok.com/f/47007/2400x1260/f816b031cb/uk-ireland-in-three-charts_chart_a.png/m/2880x0", - # "url": "https://a.storyblok.com/f/47007/2400x2000/bf383abc3c/231031_uk-ireland-in-three-charts_table_v01_b.png/m/2880x0", - # "url": "https://a.storyblok.com/f/47007/4800x2766/1688e25601/230629_attoptinratesmidyear_blog_chart02_v01.png/m/2880x0" - "url": "https://a.storyblok.com/f/47007/2400x1260/934d294894/uk-ireland-in-three-charts_chart_b.png/m/2880x0" - }, - }, +class MultipleTables(BaseModel): + tables: List[Table] + + +example = MultipleTables( + tables=[ + Table( + caption="This is a caption", + dataframe=pd.DataFrame( { - "type": "text", - "text": """ - First take a moment to reason about the best set of headers for the tables. - Write a good h1 for the image above. Then follow up with a short description of the what the data is about. - Then for each table you identified, write a h2 tag that is a descriptive title of the table. - Then follow up with a short description of the what the data is about. - Lastly, produce the markdown table for each table you identified. - """, - }, - ], - } - ], + "Chart A": [10, 40], + "Chart B": [20, 50], + "Chart C": [30, 60], + } + ), + ) + ] ) -for table in tables: - print(table.caption) - print(table.dataframe) - print() - """ - D1 App Retention Rates July 2023 (Ireland & U.K.) - Ireland UK - Category - Education 14% 12% - Entertainment 13% 11% - Games 26% 25% - Social 27% 18% - Utilities 11% 9% - """ + +def extract(url: str) -> MultipleTables: + tables = client.chat.completions.create( + model="gpt-4-vision-preview", + max_tokens=4000, + response_model=MultipleTables, + messages=[ + { + "role": "user", + "content": [ + { + "type": "text", + "text": f"Describe this data accurately as a table in markdown format. {example.model_dump_json(indent=2)}", + }, + { + "type": "image_url", + "image_url": {"url": url}, + }, + { + "type": "text", + "text": """ + First take a moment to reason about the best set of headers for the tables. + Write a good h1 for the image above. Then follow up with a short description of the what the data is about. + Then for each table you identified, write a h2 tag that is a descriptive title of the table. + Then follow up with a short description of the what the data is about. + Lastly, produce the markdown table for each table you identified. + + + Make sure to escape the markdown table properly, and make sure to include the caption and the dataframe. + including escaping all the newlines and quotes. Only return a markdown table in dataframe, nothing else. + """, + }, + ], + } + ], + ) + return tables.model_dump() + + +urls = [ + "https://a.storyblok.com/f/47007/2400x1260/f816b031cb/uk-ireland-in-three-charts_chart_a.png/m/2880x0", + "https://a.storyblok.com/f/47007/2400x2000/bf383abc3c/231031_uk-ireland-in-three-charts_table_v01_b.png/m/2880x0", +] + + +for url in urls: + tables = extract(url) + print(tables) diff --git a/examples/extract-table/run_vision_langsmith.py b/examples/extract-table/run_vision_langsmith.py new file mode 100644 index 000000000..65141938c --- /dev/null +++ b/examples/extract-table/run_vision_langsmith.py @@ -0,0 +1,124 @@ +from openai import OpenAI +from io import StringIO +from typing import Annotated, Any, List +from pydantic import ( + BaseModel, + BeforeValidator, + PlainSerializer, + InstanceOf, + WithJsonSchema, +) +import instructor +import pandas as pd +from langsmith.wrappers import wrap_openai +from langsmith import traceable + + +client = wrap_openai(OpenAI()) +client = instructor.patch(client, mode=instructor.function_calls.Mode.MD_JSON) + + +def md_to_df(data: Any) -> Any: + if isinstance(data, str): + return ( + pd.read_csv( + StringIO(data), # Get rid of whitespaces + sep="|", + index_col=1, + ) + .dropna(axis=1, how="all") + .iloc[1:] + .map(lambda x: x.strip()) + ) + return data + + +MarkdownDataFrame = Annotated[ + InstanceOf[pd.DataFrame], + BeforeValidator(md_to_df), + PlainSerializer(lambda x: x.to_markdown()), + WithJsonSchema( + { + "type": "string", + "description": """ + The markdown representation of the table, + each one should be tidy, do not try to join tables + that should be seperate""", + } + ), +] + + +class Table(BaseModel): + caption: str + dataframe: MarkdownDataFrame + + +class MultipleTables(BaseModel): + tables: List[Table] + + +example = MultipleTables( + tables=[ + Table( + caption="This is a caption", + dataframe=pd.DataFrame( + { + "Chart A": [10, 40], + "Chart B": [20, 50], + "Chart C": [30, 60], + } + ), + ) + ] +) + + +@traceable(name="extract-table") +def extract(url: str) -> MultipleTables: + tables = client.chat.completions.create( + model="gpt-4-vision-preview", + max_tokens=4000, + response_model=MultipleTables, + messages=[ + { + "role": "user", + "content": [ + { + "type": "text", + "text": f"Describe this data accurately as a table in markdown format. {example.model_dump_json(indent=2)}", + }, + { + "type": "image_url", + "image_url": {"url": url}, + }, + { + "type": "text", + "text": """ + First take a moment to reason about the best set of headers for the tables. + Write a good h1 for the image above. Then follow up with a short description of the what the data is about. + Then for each table you identified, write a h2 tag that is a descriptive title of the table. + Then follow up with a short description of the what the data is about. + Lastly, produce the markdown table for each table you identified. + + + Make sure to escape the markdown table properly, and make sure to include the caption and the dataframe. + including escaping all the newlines and quotes. Only return a markdown table in dataframe, nothing else. + """, + }, + ], + } + ], + ) + return tables.model_dump() + + +urls = [ + "https://a.storyblok.com/f/47007/2400x1260/f816b031cb/uk-ireland-in-three-charts_chart_a.png/m/2880x0", + "https://a.storyblok.com/f/47007/2400x2000/bf383abc3c/231031_uk-ireland-in-three-charts_table_v01_b.png/m/2880x0", +] + + +for url in urls: + tables = extract(url) + print(tables) From ea39ff3d174d307e21b3dfc87e81d1b0b36e0b05 Mon Sep 17 00:00:00 2001 From: Jason Liu Date: Thu, 15 Feb 2024 14:46:17 -0500 Subject: [PATCH 2/3] bump --- examples/batch-classification/run-cache.py | 123 +++++------------- .../batch-classification/run_langsmith.py | 14 +- 2 files changed, 38 insertions(+), 99 deletions(-) diff --git a/examples/batch-classification/run-cache.py b/examples/batch-classification/run-cache.py index 4e99926e5..774a8f4d8 100644 --- a/examples/batch-classification/run-cache.py +++ b/examples/batch-classification/run-cache.py @@ -1,4 +1,3 @@ -import json import instructor import asyncio @@ -6,96 +5,44 @@ from pydantic import BaseModel, Field, field_validator from typing import List from enum import Enum -import diskcache -import os -import inspect -import functools -client = instructor.patch(AsyncOpenAI(), mode=instructor.Mode.TOOLS) +client = AsyncOpenAI() +client = instructor.patch(client, mode=instructor.Mode.TOOLS) sem = asyncio.Semaphore(5) -pwd = os.getcwd() -cache = diskcache.Cache(pwd) - - -def instructor_cache(func): - """Cache a function that returns a Pydantic model""" - return_type = inspect.signature(func).return_annotation # - if not issubclass(return_type, BaseModel): # - raise ValueError("The return type must be a Pydantic model") - - @functools.wraps(func) - async def wrapper(*args, **kwargs): - key = f"{func.__name__}-{functools._make_key(args, kwargs, typed=False)}" - if (cached := cache.get(key)) is not None: - # Deserialize from JSON based on the return type - return return_type.model_validate_json(cached) - - result = await func(*args, **kwargs) - # Call the function and cache its result - - serialized_result = result.model_dump_json() - cache.set(key, serialized_result) - - return result - - @functools.wraps(func) - def sync_wrapper(*args, **kwargs): - key = f"{func.__name__}-{functools._make_key(args, kwargs, typed=False)}" - if (cached := cache.get(key)) is not None: - return return_type.model_validate_json(cached) - - result = func(*args, **kwargs) - serialized_result = result.model_dump_json() - cache.set(key, serialized_result) - - return result - - return wrapper if inspect.iscoroutinefunction(func) else sync_wrapper - class QuestionType(Enum): - CONTENT_OWNERSHIP = "CONTENT_OWNERSHIP" CONTACT = "CONTACT" TIMELINE_QUERY = "TIMELINE_QUERY" DOCUMENT_SEARCH = "DOCUMENT_SEARCH" COMPARE_CONTRAST = "COMPARE_CONTRAST" - MEETING_TRANSCRIPTS = "MEETING_TRANSCRIPTS" EMAIL = "EMAIL" PHOTOS = "PHOTOS" - HOW_DOES_THIS_WORK = "HOW_DOES_THIS_WORK" - NEEDLE_IN_HAYSTACK = "NEEDLE_IN_HAYSTACK" SUMMARY = "SUMMARY" -ALLOWED_TYPES = [t.value for t in QuestionType] - - # You can add more instructions and examples in the description # or you can put it in the prompt in `messages=[...]` class QuestionClassification(BaseModel): """ Predict the type of question that is being asked. - Here are some tips on how to predict the question type: - - CONTENT_OWNERSHIP: "Who owns the a certain piece of content?" CONTACT: Searches for some contact information. TIMELINE_QUERY: "When did something happen? DOCUMENT_SEARCH: "Find me a document" COMPARE_CONTRAST: "Compare and contrast two things" - MEETING_TRANSCRIPTS: "Find me a transcript of a meeting, or a soemthing said in a meeting" EMAIL: "Find me an email, search for an email" PHOTOS: "Find me a photo, search for a photo" - HOW_DOES_THIS_WORK: "How does this question /answer product work?" - NEEDLE_IN_HAYSTACK: "Find me something specific in a large amount of data" SUMMARY: "Summarize a large amount of data" """ # If you want only one classification, just change it to # `classification: QuestionType` rather than `classifications: List[QuestionType]`` + chain_of_thought: str = Field( + ..., description="The chain of thought that led to the classification" + ) classification: List[QuestionType] = Field( - description=f"An accuracy and correct prediction predicted class of question. Only allowed types: {ALLOWED_TYPES}, should be used", + description=f"An accuracy and correct prediction predicted class of question. Only allowed types: {[t.value for t in QuestionType]}, should be used", ) @field_validator("classification", mode="before") @@ -106,52 +53,48 @@ def validate_classification(cls, v): return v -@instructor_cache -async def classify_question(user_question: str) -> QuestionClassification: - return await client.chat.completions.create( - model="gpt-4", - response_model=QuestionClassification, - max_retries=2, - messages=[ - { - "role": "user", - "content": f"Classify the following question: {user_question}", - }, - ], - ) - - +# Modify the classify function async def classify(data: str) -> QuestionClassification: async with sem: # some simple rate limiting - return data, await classify_question(data) - - -async def main( - questions: List[str], *, path_to_jsonl: str = None -) -> List[QuestionClassification]: + return data, await client.chat.completions.create( + model="gpt-4", + response_model=QuestionClassification, + max_retries=2, + messages=[ + { + "role": "user", + "content": f"Classify the following question: {data}", + }, + ], + ) + + +async def main(questions: List[str]): tasks = [classify(question) for question in questions] + resps = [] for task in asyncio.as_completed(tasks): question, label = await task resp = { "question": question, "classification": [c.value for c in label.classification], + "chain_of_thought": label.chain_of_thought, } - print(resp) - if path_to_jsonl: - with open(path_to_jsonl, "a") as f: - json_dump = json.dumps(resp) - f.write(json_dump + "\n") + resps.append(resp) + return resps if __name__ == "__main__": import asyncio - path = "./data.jsonl" - questions = [ "What was that ai app that i saw on the news the other day?", - "What was that ai app that i saw on the news the other day?", - "What was that ai app that i saw on the news the other day?", + "Can you find the trainline booking email?", + "What was the book I saw on amazon yesturday?", + "Can you speak german?", + "Do you have access to the meeting transcripts?", + "what are the recent sites I visited?", + "what did I do on Monday?", + "Tell me about todays meeting and how it relates to the email on Monday", ] - asyncio.run(main(questions, path_to_jsonl=path)) + asyncio.run(main(questions)) diff --git a/examples/batch-classification/run_langsmith.py b/examples/batch-classification/run_langsmith.py index 44fe8240a..6c8b1fa08 100644 --- a/examples/batch-classification/run_langsmith.py +++ b/examples/batch-classification/run_langsmith.py @@ -1,4 +1,3 @@ -import json import instructor import asyncio @@ -75,21 +74,18 @@ async def classify(data: str) -> QuestionClassification: @traceable(name="main") -async def main( - questions: List[str], *, path_to_jsonl: str = None -) -> List[QuestionClassification]: +async def main(questions: List[str]): tasks = [classify(question) for question in questions] + resps = [] for task in asyncio.as_completed(tasks): question, label = await task resp = { "question": question, "classification": [c.value for c in label.classification], + "chain_of_thought": label.chain_of_thought, } - print(resp) - if path_to_jsonl: - with open(path_to_jsonl, "a") as f: - json_dump = json.dumps(resp) - f.write(json_dump + "\n") + resps.append(resp) + return resps if __name__ == "__main__": From 4354bd8bc3dd4799fddb2d768f231fedc4f4ebd8 Mon Sep 17 00:00:00 2001 From: Jason Liu Date: Thu, 15 Feb 2024 14:47:44 -0500 Subject: [PATCH 3/3] remove main --- examples/batch-classification/run_langsmith.py | 1 - 1 file changed, 1 deletion(-) diff --git a/examples/batch-classification/run_langsmith.py b/examples/batch-classification/run_langsmith.py index 6c8b1fa08..030e249b1 100644 --- a/examples/batch-classification/run_langsmith.py +++ b/examples/batch-classification/run_langsmith.py @@ -73,7 +73,6 @@ async def classify(data: str) -> QuestionClassification: ) -@traceable(name="main") async def main(questions: List[str]): tasks = [classify(question) for question in questions] resps = []