forked from alessandroamenta/bookSummarizer
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathbookChat.py
148 lines (124 loc) · 6.4 KB
/
bookChat.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
# Standard library imports
import os
import tempfile
# Third-party imports
from dotenv import load_dotenv
from langchain.prompts import PromptTemplate
from langchain.document_loaders import PyPDFLoader
from langchain.document_loaders import UnstructuredEPubLoader
from langchain.schema import Document
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.chat_models import ChatOpenAI
from langchain.vectorstores import FAISS
from langchain.embeddings import OpenAIEmbeddings
from langchain.chains.summarize import load_summarize_chain
import numpy as np
from sklearn.cluster import KMeans
from sklearn.manifold import TSNE
import matplotlib.pyplot as plt
from warnings import simplefilter
from langchain.callbacks import get_openai_callback
# Populate the process environment via python-dotenv, then read the OpenAI
# key from it. The value is None when OPENAI_API_KEY is not defined.
load_dotenv()
openai_api_key = os.getenv('OPENAI_API_KEY')
# File extensions that _load_book_text knows how to parse.
_SUPPORTED_EXTENSIONS = (".pdf", ".epub")

# Prompt used for each selected chunk (the "map" step).
_MAP_PROMPT = """
You are provided with a passage from a book. Your task is to produce a comprehensive summary of this passage. Ensure accuracy and avoid adding any interpretations or extra details not present in the original text. The summary should be at least three paragraphs long and fully capture the essence of the passage.
```{text}```
SUMMARY:
"""

# Prompt used to merge the per-chunk summaries (the "reduce" step).
_COMBINE_PROMPT = """
You are presented with a series of summarized sections from a book. Your task is to weave these summaries into a single, cohesive, and verbose summary. The reader should be able to understand the main events or points of the book from your summary. Ensure you retain the accuracy of the content and present it in a clear and engaging manner.
```{text}```
COHESIVE SUMMARY:
"""


def _load_book_text(file_obj, file_extension):
    """Return the full text of an uploaded ``.pdf`` or ``.epub`` book.

    The upload is copied to a temporary file because the loaders take a
    filesystem path. The temporary file is always removed, even when a
    loader raises.

    Raises:
        ValueError: If ``file_extension`` is not ``.pdf`` or ``.epub``.
    """
    # Reject unknown formats before touching the disk so nothing is left behind.
    if file_extension not in _SUPPORTED_EXTENSIONS:
        raise ValueError(f"Unsupported file extension: {file_extension}")

    temp_file = tempfile.NamedTemporaryFile(delete=False, suffix=file_extension)
    temp_path = temp_file.name
    try:
        # Close (and thereby flush) the handle before a loader reopens the
        # path; an open, unflushed handle can expose a truncated file and
        # blocks reopening/removal on Windows.
        with temp_file:
            temp_file.write(file_obj.read())

        if file_extension == ".pdf":
            pages = PyPDFLoader(temp_path).load()
            text = "".join(page.page_content for page in pages)
        else:
            elements = UnstructuredEPubLoader(temp_path).load()
            text = "\n".join(element.page_content for element in elements)
    finally:
        os.remove(temp_path)

    return text.replace('\t', ' ')


def _closest_chunk_indices(vectors, num_clusters):
    """Cluster chunk embeddings and return the chunk index nearest each centroid.

    The result is sorted so the chunks keep their order in the book, and
    de-duplicated so a chunk that is nearest to several centroids is only
    summarised once.
    """
    vectors_array = np.asarray(vectors)
    kmeans = KMeans(n_clusters=num_clusters, random_state=42).fit(vectors_array)
    # To inspect the clustering, project `vectors_array` with TSNE and
    # scatter-plot the result coloured by `kmeans.labels_`.
    closest = set()
    for center in kmeans.cluster_centers_:
        distances = np.linalg.norm(vectors_array - center, axis=1)
        closest.add(int(np.argmin(distances)))
    return sorted(closest)


def generate_summary(uploaded_file, openai_api_key: str, num_clusters: int = 11, verbose: bool = False) -> str:
    """Generate a summary for a given book.

    The book is split into overlapping chunks, the chunks are embedded and
    clustered, the chunk closest to each cluster centre is summarised with
    gpt-3.5-turbo-16k, and gpt-4 merges those summaries into the final text.

    Args:
        uploaded_file: Binary file-like object exposing ``read()`` and a
            ``name`` attribute whose extension is ``.pdf`` or ``.epub``.
        openai_api_key: API key passed to the chat and embedding clients.
        num_clusters: Maximum number of chunks to summarise; lowered to the
            number of chunks when the book is short.
        verbose: Print progress, token counts and the final cost report.

    Returns:
        The combined summary produced by the final chain.

    Raises:
        ValueError: If ``num_clusters`` is below 1, the file extension is
            unsupported, or no text chunks could be produced from the book.
    """
    # Fail fast on bad arguments, before any file or API work is done.
    if num_clusters < 1:
        raise ValueError(f"num_clusters must be at least 1, got {num_clusters}")

    file_extension = os.path.splitext(uploaded_file.name)[1].lower()
    text = _load_book_text(uploaded_file, file_extension)

    llm3_turbo = ChatOpenAI(temperature=0, openai_api_key=openai_api_key, max_tokens=1000, model='gpt-3.5-turbo-16k')
    if verbose:
        # Token counting is only needed for this report, so skip it otherwise.
        print(f"This book has {llm3_turbo.get_num_tokens(text)} tokens in it")

    text_splitter = RecursiveCharacterTextSplitter(separators=["\n\n", "\n", "\t"], chunk_size=10000, chunk_overlap=3000)
    docs = text_splitter.create_documents([text])
    if not docs:
        raise ValueError("No text could be extracted from the uploaded book")
    num_documents = len(docs)
    if verbose:
        print(f"Now our book is split up into {num_documents} documents")

    # KMeans cannot form more clusters than there are chunks.
    num_clusters = min(num_clusters, num_documents)
    embeddings = OpenAIEmbeddings(openai_api_key=openai_api_key)
    vectors = embeddings.embed_documents([doc.page_content for doc in docs])
    selected_indices = _closest_chunk_indices(vectors, num_clusters)

    # Map step: the chain does not depend on the chunk, so build it once.
    map_prompt_template = PromptTemplate(template=_MAP_PROMPT, input_variables=["text"])
    map_chain = load_summarize_chain(llm=llm3_turbo, chain_type="stuff", prompt=map_prompt_template)
    summary_list = []
    for i, chunk_index in enumerate(selected_indices):
        doc = docs[chunk_index]
        if verbose:
            print(f"Chunk #{i} token count: {llm3_turbo.get_num_tokens(doc.page_content)}")
            print(f"Using llm3_turbo for chunk #{i}")
        chunk_summary = map_chain.run([doc])
        summary_list.append(chunk_summary)
        if verbose:
            print(f"Summary #{i} (chunk #{chunk_index}) - Preview: {chunk_summary[:250]} \n")

    summaries = Document(page_content="\n".join(summary_list))
    if verbose:
        print(f"Your total summary has {llm3_turbo.get_num_tokens(summaries.page_content)} tokens")

    # Reduce step: merge the chunk summaries with gpt-4.
    llm4 = ChatOpenAI(temperature=0, openai_api_key=openai_api_key, max_tokens=3000, model='gpt-4', request_timeout=120)
    combine_prompt_template = PromptTemplate(template=_COMBINE_PROMPT, input_variables=["text"])
    # NOTE: the callback wraps only this final call, so the printed cost
    # does not include the embedding or per-chunk summary requests.
    with get_openai_callback() as cost:
        reduce_chain = load_summarize_chain(llm=llm4, chain_type="stuff", prompt=combine_prompt_template)
        output = reduce_chain.run([summaries])
    if verbose:
        print(output)
        print(cost)
    return output
# testing
if __name__ == '__main__':
    # Manual smoke test: summarise one book from disk and print the result.
    import sys

    # An optional first command-line argument overrides the sample book path.
    book_path = sys.argv[1] if len(sys.argv) > 1 else "../Happy_Sexy_Millionaire.epub"
    openai_api_key = os.environ.get('OPENAI_API_KEY')
    if not openai_api_key:
        # Stop here with a clear message instead of failing inside the API client.
        raise SystemExit("OPENAI_API_KEY is not set; export it or add it to a .env file.")
    # Mimic Streamlit's uploaded file behavior using open: the handle offers
    # both read() and a name attribute, which generate_summary relies on.
    with open(book_path, 'rb') as uploaded_file:
        summary = generate_summary(uploaded_file, openai_api_key, verbose=True)
    print(summary)
#cost with gpt 3.5 4k context: Total Cost (USD): $0.12513
#cost with gpt 3.5 16k context: Total Cost (USD): $0.13191