-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathlesson-1-5.py
105 lines (79 loc) · 4.19 KB
/
lesson-1-5.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
# Lesson 1.5: Text summarization with OpenAI Chat API
# the Langchain library includes a ChatOpenAI class that wraps the OpenAI API and provides additional functionality
# it's a great library for working with other LLM models and has lots of out of the box
# functionality for working with data
from dotenv import load_dotenv
from rich.console import Console
from rich.markdown import Markdown
from langchain_openai import ChatOpenAI
import requests
from bs4 import BeautifulSoup
from langchain_community.document_loaders import PyPDFLoader
from langchain_core.prompts import ChatPromptTemplate
from langchain_core.output_parsers import StrOutputParser
from langchain_text_splitters import TokenTextSplitter
# This is a website with pdfs of Congressional legislation that will be reviewed for the current week
# Or you can use a different website with pdfs of text that you want to summarize, like Supreme Court oral arguments:
# https://www.supremecourt.gov/oral_arguments/argument_transcript/2024
START_URL = "https://docs.house.gov/floor/"
MODEL = "gpt-4o" # using gpt-4o-mini would be faster, but less accurate, especially for this refinement scenario
TOKEN_MAX = 60000 # Open AI context window is 128000 tokens, set to half to leave room for additional context
# This took me several iterations until I found a prompt that gave me the desired results
SYSTEM_PROMPT = """ You are a journalist that reads through Congressional legislation being considered for the week
and publishes a news article informing the public. This conversation
is an iterative refinement of the news article where additional information and legislation is continually added.
Respond in markdown syntax and stylize the text. Be as detailed as possible on each point of legislation being reviewed.
"""
# Here I'm using markdown to structure the article draft and additional information to review.
# We want the model to be able to distinguish between the prompt and the additional data we
# are passing into it. I'm hoping the code block markdown will help with that.
# This prompt is designed to be used iteratively to refine the article draft.
REDUCE_PROMPT = reduce_template = """
## Article Draft
```markdown
{refinedSummary}
```
## Additional information and legislation to review and add to the article
```
{context}
```
"""
load_dotenv()
llm = ChatOpenAI(model=MODEL)
# The chat prompt template is a reusable way to structure the conversation with the model
reduce_prompt = ChatPromptTemplate([("system", SYSTEM_PROMPT),("human", REDUCE_PROMPT)])
reduce_chain = reduce_prompt | llm | StrOutputParser()
# TokenTextSplitter is a utility to split text into chunks of a certain size,
# in this case we are splitting the text into chunks based on a max token size
tokenSplitter = TokenTextSplitter(chunk_size=TOKEN_MAX, chunk_overlap=0, model_name=MODEL)
console = Console()
# Function to get all the pdf links from a given url, let's us crawl the site for pdfs
def get_pdf_links(url):
response = requests.get(url)
soup = BeautifulSoup(response.content, 'html.parser')
pdf_links = [a['href'] for a in soup.find_all('a', href=True) if a['href'].endswith('.pdf')]
return pdf_links
def main():
## Get all the pdf links from the website
pdf_links = get_pdf_links(START_URL)
refinedSummary = ""
for link in pdf_links:
console.print(f"Loading document from {link}")
loader = PyPDFLoader(link)
docs = loader.load()
# use our token splitter to make sure we don't exceed the token limit
# if the pdf is too large this will automatically split it into chunks
docsSplit = tokenSplitter.split_documents(docs)
console.print(f"Loaded {len(docs)} documents")
console.print("Generating and refining summary for pdf ...")
# Summarize and refine
for doc in docsSplit:
context = doc.page_content
response = reduce_chain.invoke({"context": context, "refinedSummary": refinedSummary})
refinedSummary = response
console.print(Markdown("# Refined summary so far:"))
console.print(Markdown(response))
console.print(Markdown("# Final summary:"))
console.print(Markdown(refinedSummary))
if __name__ == "__main__":
main()