Skip to content

Commit

Permalink
chore: fix app contents + summary printing
Browse files Browse the repository at this point in the history
  • Loading branch information
dvdblk committed Nov 26, 2023
1 parent 5647247 commit 8b47a26
Show file tree
Hide file tree
Showing 6 changed files with 150 additions and 63 deletions.
10 changes: 6 additions & 4 deletions app/gui/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -67,20 +67,22 @@ def display_section_tree(_document: Document, summaries: dict):

def add_hierarchy_tree(section, level=0):
result_markdown = '<div id="tree">'
should_add_expander = summaries.get(section.id) or section.subsections
# should_add_expander = summaries.get(section.id) or section.subsections
# always add expander
should_add_expander = True
if should_add_expander:
# Add details tag
result_markdown += "<details><summary>"

# Add section title and page number
result_markdown += f'<span id="treeTitle">{section.title}</span><span id="pageNr">{section.starting_page}</span>'
result_markdown += f'<span id="treeTitle">{section.title}</span><span id="pageNr">{section.starting_page+1}</span>'

if should_add_expander:
# Close details tag
result_markdown += "</summary>"

if summary := summaries.get(section.id):
result_markdown += f"<blockquote>{summary}</blockquote>"
summary = summaries.get(section.id)
result_markdown += f"<blockquote>Section summary: {summary or 'This section has no standalone text in its paragraphs.'}</blockquote>"

if section.subsections:
result_markdown += "<ul>"
Expand Down
12 changes: 9 additions & 3 deletions app/llm.py
Original file line number Diff line number Diff line change
Expand Up @@ -112,7 +112,6 @@ def wrapper(self, *args, **kwargs):
self.n_prompt_tokens += cb.prompt_tokens
self.n_completion_tokens += cb.completion_tokens
self.total_cost += cb.total_cost
print(cb)
return result

return wrapper
Expand Down Expand Up @@ -156,18 +155,25 @@ def create_summaries_chain(self, sections: List[Section]) -> SectionSummaryDict:
SectionSummaryOutput, self.llm, create_summaries_prompt_template
)
# Generate summaries for each section
i = 0
for section in sections:
section_text = section.text
section_text = section.paragraph_text

# Check if we need to call the API (only if text exists)
if len(section_text) > 0:
response = summary_runnable.invoke(
{"section_title": section.title_clean, "section_text": section.text}
{"section_title": section.title_clean, "section_text": section_text}
)
summary_dict[section.id] = response.summary
i += 1
else:
summary_dict[section.id] = None

if i == 20:
break

return summary_dict

@track_costs
def generic_question_chain(
self,
Expand Down
160 changes: 111 additions & 49 deletions app/main.py
Original file line number Diff line number Diff line change
Expand Up @@ -26,18 +26,33 @@ def init_state():
def query_llm(question, model):
    """Send a one-off question to the LLM via the session's prompt executor
    and render the answer in the Streamlit app.

    Args:
        question: Free-form user question about the loaded document.
        model: OpenAI model name. Currently unused here (the executor already
            holds the configured LLM); kept for interface compatibility.
    """
    if prompt_executor := st.session_state.prompt_executor:
        # NOTE(review): `.temp()` looks like a placeholder API name — confirm
        # it is the intended entry point on the prompt executor.
        result = prompt_executor.temp(question)
        st.write(result)
    else:
        # Previously this silently did nothing (and leaked debug prints of
        # cost/token counters to stdout); surface the problem to the user.
        st.warning("Please configure a model in the sidebar first.")


@st.cache_data
def get_summaries(_document, file_path):
    """Generate per-section summaries for the current document and store them
    in ``st.session_state.summaries_dict``.

    The result is cached by Streamlit: ``file_path`` is the cache key, while
    ``_document`` (leading underscore) is deliberately excluded from hashing.

    Args:
        _document: AdobeExtractAPIManager.get_document() object
        file_path: path to the file (not used within the method, only used as
            the st.cache_data caching key)
    """
    # Guard clause: without a configured prompt executor there is no LLM to
    # call.  (The original code warned "Please select a document first." here,
    # which was misleading — the document may well be selected already.)
    if not (prompt_executor := st.session_state.prompt_executor):
        st.warning("Please configure a model in the sidebar first.")
        return

    if document := st.session_state.current_document:
        summaries = prompt_executor.create_summaries_chain(document.all_sections)
        st.session_state.summaries_dict = summaries
    else:
        st.warning("Please select a document first.")


def main():
st.set_page_config(page_title="OECD Policy Explorer", page_icon="💼")

# Sidebar - Expander: Document selection
with st.sidebar.expander("📖 Document", expanded=True):
st.write(
"Upload any policy document in PDF format. To change the file simply upload a new one and click on 'Process' again."
"Upload any policy document in PDF format. To change the file simply upload a new one and click on 'Extract' again."
)
# Store the uploaded PDF document in a list
raw_pdf_document = st.file_uploader(
Expand All @@ -46,9 +61,15 @@ def main():
accept_multiple_files=False,
help="📁 Allowed document file format: '.pdf'",
)
print(raw_pdf_document)
if st.button("Process", type="primary"):
with st.spinner("Processing..."):

if st.button(
"Extract",
type="primary",
help="🤖 Extracts text and structural information from the selected document.",
):
with st.spinner(
"Processing... (this might take around 2-3 minutes if the document is new)"
):
# TODO: Identify if the PDF document is new (hash the first few and last pages + page count)
# TODO: Identify the language of the document (load with pypdf2 and use langdetect)

Expand All @@ -60,10 +81,16 @@ def main():
raw_pdf_document.getvalue(), input_file_name=raw_pdf_document.name
)
st.session_state.current_document = document
st.write("Done processing selected document!")
st.write(f"Document has {len(document.subsections)} main sections.")
print(document.subsections, document.title)
display_pdf(raw_pdf_document)
st.session_state.uploaded_file = raw_pdf_document
if "summaries_dict" in st.session_state:
del st.session_state.summaries_dict

if "uploaded_file" in st.session_state and "current_document" in st.session_state:
doc = st.session_state.current_document
st.write(
f"Document has {doc.n_pages} pages of extracted text and {len(doc.subsections)} main sections."
)
display_pdf(st.session_state.uploaded_file)

# Sidebar - Expander: LLM Options
with st.sidebar.expander("⚙️ LLModel options"):
Expand All @@ -89,10 +116,14 @@ def main():
st.session_state.oai_model = model
# init / update prompt_executor
if "prompt_executor" not in st.session_state:
st.session_state["prompt_executor"] = OpenAIPromptExecutor(llm=ChatOpenAI(model=model))
st.session_state["prompt_executor"] = OpenAIPromptExecutor(
llm=ChatOpenAI(model=model, temperature=0, timeout=10)
)
else:
if model != st.session_state.prompt_executor.llm:
st.session_state.prompt_executor.llm = ChatOpenAI(model=model)
st.session_state.prompt_executor.llm = ChatOpenAI(
model=model, temperature=0, timeout=10
)

st.caption(
"Please refer to the [OpenAI Models documentation](https://platform.openai.com/docs/models/) for more information."
Expand All @@ -117,54 +148,85 @@ def main():

st.title("OECD Policy Doc Explorer 🔎")

analysis_tab, qna_tab = st.tabs(["📊 Analysis", "💬 QnA"])

with analysis_tab:
# Section tree with summaries
st.subheader("Document overview", help="💡 Sections are provided by Adobe Extract API.")
st.caption("Click on a section to see the summary and reveal respective subsections.")
if "current_document" in st.session_state:
display_section_tree(_document=st.session_state.current_document, summaries={})
st.caption(
"Summaries are generated only for the paragraphs in the section (not including paragraphs from subsections)."
)
with st.container():
df = pd.read_csv("data/UK_34_binary_datasheet.csv")
placeholder = st.empty()

if "summaries_dict" not in st.session_state:
with placeholder.container():
if "current_document" not in st.session_state:
st.write("Please upload (and extract) a document in the sidebar first.")
else:
st.write(
"Document loaded! If you want to start the analysis or QnA, generating a summary is required."
)
st.caption(
"Note: This operation already costs money and it might take a while so please be patient :)"
)
if st.button("Analyze", type="primary"):
with st.spinner("Analyzing document..."):
get_summaries(st.session_state.current_document, raw_pdf_document)

st.write("☑️ Success!")
else:
with placeholder.container():
analysis_tab, qna_tab = st.tabs(["📊 Analysis", "💬 QnA"])

with analysis_tab:
# Section tree with summaries
st.subheader(
"Document overview",
help="💡 Sections are provided by Adobe Extract API.",
)
st.write("Click on a section to see the summary and reveal respective subsections.")
if "current_document" in st.session_state:
display_section_tree(
_document=st.session_state.current_document,
summaries=st.session_state.summaries_dict or {},
)
st.caption(
"Summaries are generated only for the paragraphs in the section (not including paragraphs from subsections)."
)
st.caption(
'⚠️ It is in your best interest to verify the summaries contain some meaningful text before proceeding to the QnA tab. A small number of documents are not OCR\'d correctly and thus might be relatively empty, resulting in a lot of "table of contents" or "references" summaries.'
)
with st.container():
df = pd.read_csv("data/UK_34_binary_datasheet.csv")

import json
import json

sheet = json.load(open("data/binary_datasheet.json"))
existing_stis = [key for key in list(sheet.keys()) if sheet[key]["general"] == "1"]
sheet = json.load(open("data/binary_datasheet.json"))
existing_stis = [
key for key in list(sheet.keys()) if sheet[key]["general"] == "1"
]

st.selectbox("Select STIs", existing_stis)
st.selectbox("Select STIs", existing_stis)

st.dataframe(df)
st.dataframe(df)

with st.expander("Mini-report #1: Summary of the document"):
st.write("<Summarized document goes here>")
with st.expander("Mini-report #1: Summary of the document"):
st.write("<Summarized document goes here>")

with st.expander("Mini-report #2: ..."):
st.write("")
with st.expander("Mini-report #2: ..."):
st.write("")

with qna_tab:
st.write(
"Here you can ask any questions about the document and get answers from the model."
)
st.markdown(
"> Note that this is not a chatbot, but a question answering system without memory. Each question is treated independently."
)
with qna_tab:
st.write(
"Here you can ask any questions about the document and get answers from the model."
)
st.markdown(
"> Note that this is not a chatbot, but a question answering system without conversation memory. Each question is treated independently of the previous ones."
)

chat_input = st.text_input("Ask a question here")
chat_input = st.text_input("Ask a question here")

if chat_input:
query_llm(chat_input, st.session_state.oai_model)
if chat_input:
query_llm(chat_input, st.session_state.oai_model)

chat_question_choice_pressed = st.button("Examples")
chat_question_choice_pressed = st.button("Examples")

if chat_question_choice_pressed:
selected_question = st.selectbox(
"Select a question", ["What are the skills mentioned in this document?"]
)
if chat_question_choice_pressed:
selected_question = st.selectbox(
"Select a question", ["What are the skills mentioned in this document?"]
)


if __name__ == "__main__":
Expand Down
14 changes: 14 additions & 0 deletions app/preprocessing/adobe/model.py
Original file line number Diff line number Diff line change
Expand Up @@ -160,6 +160,11 @@ def title_clean(self) -> Optional[str]:
"""Return a cleaned version of the title (without section number)"""
return re.sub(r"^(\d+\.?)+", "", self.title).lstrip()

@property
def paragraph_text(self) -> str:
    """Concatenate the non-empty paragraph texts of this section, one per line."""
    texts = (para.text for para in self.paragraphs)
    return "\n".join(text for text in texts if text)

@property
def text(self) -> Optional[str]:
"""Create a simple linear text representation of the document"""
Expand Down Expand Up @@ -188,10 +193,19 @@ class Document(Section):

def __init__(
    self,
    file_path: str,
    title: Optional[str] = None,
    pages: Optional[Set[int]] = None,
    paragraphs: Optional[List[str]] = None,
    subsections: Optional[List["Section"]] = None,
    parent: Optional["Section"] = None,
) -> None:
    """Initialize a Document, the root section of a parsed PDF.

    Args:
        file_path: Path of the source PDF on disk (e.g. ``.../UK_02.pdf``).
        title: Optional document title.
        pages: Optional set of page numbers covered by the document.
        paragraphs: Optional list of the document's own paragraphs.
        subsections: Optional list of top-level child sections.
        parent: Parent section; normally ``None`` for a root document.

    The section id is fixed to ``"root"`` and the section kind to
    ``"document"`` via the ``super().__init__`` call — positional argument
    order must match the base ``Section`` constructor.
    """
    # The file_path of the document (.../UK_02.pdf)
    self.file_path = file_path
    super().__init__("root", title, pages, "document", paragraphs, subsections, parent)

@property
def n_pages(self) -> int:
    """Return the number of pages in the document.

    NOTE(review): this assumes the last entry of ``all_sections`` contains
    the highest page number — confirm sections are ordered by page.  Pages
    also appear to be 0-indexed elsewhere in the app (the GUI renders
    ``starting_page + 1``), so this may be the last page *index* rather
    than a true page count — verify against a known document.
    """
    # max() gets the highest page directly; the previous
    # sorted(...)[-1] sorted the whole set just to read one element.
    return max(self.all_sections[-1].pages)
2 changes: 1 addition & 1 deletion app/preprocessing/adobe/parser.py
Original file line number Diff line number Diff line change
Expand Up @@ -25,7 +25,7 @@ def adobe_extracted_pdf_to_document(self, extracted_pdf: AdobeExtractedPDF) -> D
# "H2": "subsection",
# "H3": "subsubsection",
# }
document = Document()
document = Document(file_path=extracted_pdf.file_path)

curr_section = document
section_to_insert_header = {}
Expand Down
15 changes: 9 additions & 6 deletions app/prompts.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,7 @@
_human_type_tip_message = HumanMessage(content="Tip: Make sure to answer in the correct format")

_CREATE_SUMMARIES_SYSTEM_PROMPT = """
You're an expert policy analyst that is analyzing an economic policy document. Your goal is to summarize a given section text of a document in no more than 15-20 sentences. Don't make a longer summary than the original text.
You're an expert policy analyst that is analyzing an economic policy document. Your goal is to summarize a given section text of a document with 13-20 sentences.
The section text will be given to you in the following json format:
```json
Expand All @@ -31,8 +31,9 @@
4. mention any discussion of funding, investments or budget allocations.
5. in the summary, make sure to mention whether there is a certain future need for any skills or technologies
6. mention any explicit skill needs that are mentioned in the text.
7. if the section is a table of contents or an index, just return "table of contents" as the summary
8. if the entire section contains only publication citations, don't summarize it just return "references" as the summary.
7. if the entire section is a table of contents (e.g. line after line of headings followed by page number) just return "table of contents" as the summary
8. if the entire section contains only publication citations and nothing else, just return "references" as the summary.
9. make a shorter summary than the original section text
"""

_CREATE_SUMMARIES_INPUT_PROMPT = """
Expand All @@ -50,9 +51,11 @@

# Chat prompt for the section-summary chain: the system message carries the
# summarization rules, the human message carries the section JSON payload.
# Tuple form ("role", template_string) is used so that template variables in
# the strings are interpolated by LangChain; plain SystemMessage/HumanMessage
# objects would treat their content as literal text.  Dead commented-out
# Message-object variants from before the migration have been removed.
create_summaries_prompt_template = ChatPromptTemplate.from_messages(
    [
        ("system", _CREATE_SUMMARIES_SYSTEM_PROMPT),
        ("human", _CREATE_SUMMARIES_INPUT_PROMPT),
    ]
)

Expand Down

0 comments on commit 8b47a26

Please sign in to comment.