diff --git a/app/gui/utils.py b/app/gui/utils.py
index 1379d8c..b454349 100644
--- a/app/gui/utils.py
+++ b/app/gui/utils.py
@@ -67,20 +67,22 @@ def display_section_tree(_document: Document, summaries: dict):
def add_hierarchy_tree(section, level=0):
result_markdown = '\n'
- should_add_expander = summaries.get(section.id) or section.subsections
+ # should_add_expander = summaries.get(section.id) or section.subsections
+ # always add expander
+ should_add_expander = True
if should_add_expander:
# Add details tag
result_markdown += "
"
# Add section title and page number
- result_markdown += f'{section.title}{section.starting_page}'
+ result_markdown += f'{section.title}{section.starting_page+1}'
if should_add_expander:
# Close details tag
result_markdown += "
"
- if summary := summaries.get(section.id):
- result_markdown += f"{summary}\n"
+ summary = summaries.get(section.id)
+ result_markdown += f"Section summary: {summary or 'This section has no standalone text in its paragraphs.'}\n"
if section.subsections:
result_markdown += ""
diff --git a/app/llm.py b/app/llm.py
index de1a73a..dea17b2 100644
--- a/app/llm.py
+++ b/app/llm.py
@@ -112,7 +112,6 @@ def wrapper(self, *args, **kwargs):
self.n_prompt_tokens += cb.prompt_tokens
self.n_completion_tokens += cb.completion_tokens
self.total_cost += cb.total_cost
- print(cb)
return result
return wrapper
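
For context, the cb whose print statement is removed here is presumably LangChain's OpenAI cost callback. A sketch of how a decorator like track_costs is typically assembled, assuming get_openai_callback:

    from functools import wraps
    from langchain_community.callbacks import get_openai_callback

    def track_costs(func):
        """Accumulate OpenAI token counts and cost on the executor instance."""
        @wraps(func)
        def wrapper(self, *args, **kwargs):
            # The context manager collects token usage for every OpenAI call
            # made inside the wrapped method.
            with get_openai_callback() as cb:
                result = func(self, *args, **kwargs)
            self.n_prompt_tokens += cb.prompt_tokens
            self.n_completion_tokens += cb.completion_tokens
            self.total_cost += cb.total_cost
            return result
        return wrapper
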
@@ -156,18 +155,25 @@ def create_summaries_chain(self, sections: List[Section]) -> SectionSummaryDict:
SectionSummaryOutput, self.llm, create_summaries_prompt_template
)
# Generate summaries for each section
+ i = 0
for section in sections:
- section_text = section.text
+ section_text = section.paragraph_text
# Check if we need to call the API (only if text exists)
if len(section_text) > 0:
response = summary_runnable.invoke(
- {"section_title": section.title_clean, "section_text": section.text}
+ {"section_title": section.title_clean, "section_text": section_text}
)
summary_dict[section.id] = response.summary
+ i += 1
else:
summary_dict[section.id] = None
+ # Safety cap: stop after summarizing 20 sections
+ if i == 20:
+ break
+
+ return summary_dict
+
@track_costs
def generic_question_chain(
self,
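
The summary_runnable built at the top of this hunk pairs a Pydantic schema (SectionSummaryOutput) with the prompt template. A sketch of one way to wire that up, assuming LangChain's structured-output support (the repo's actual helper may differ):

    from pydantic import BaseModel, Field

    class SectionSummaryOutput(BaseModel):
        # Assumed schema: the loop above only relies on a .summary field.
        summary: str = Field(description="Summary of the section text")

    # Prompt template piped into a model that parses its reply into the schema.
    summary_runnable = create_summaries_prompt_template | llm.with_structured_output(
        SectionSummaryOutput
    )
    response = summary_runnable.invoke(
        {"section_title": section.title_clean, "section_text": section.paragraph_text}
    )
    print(response.summary)
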
diff --git a/app/main.py b/app/main.py
index 6cd1055..3e52fa1 100644
--- a/app/main.py
+++ b/app/main.py
@@ -26,18 +26,33 @@ def init_state():
def query_llm(question, model):
if prompt_executor := st.session_state.prompt_executor:
result = prompt_executor.temp(question)
- print(prompt_executor.total_cost)
- print(prompt_executor.n_prompt_tokens)
st.write(result)
+@st.cache_data
+def get_summaries(_document, file_path):
+ """
+ Args:
+ _document: AdobeExtractAPIManager.get_document() object
+ file_path: path to the file (not used within the method, only used as the st.cache_data caching key)
+ """
+ if prompt_executor := st.session_state.prompt_executor:
+ if document := st.session_state.current_document:
+ summaries = prompt_executor.create_summaries_chain(document.all_sections)
+ st.session_state.summaries_dict = summaries
+ else:
+ st.warning("Please select a document first.")
+ else:
+ st.warning("Please select a document first.")
+
+
def main():
st.set_page_config(page_title="OECD Policy Explorer", page_icon="💼")
# Sidebar - Expander: Document selection
with st.sidebar.expander("📖 Document", expanded=True):
st.write(
- "Upload any policy document in PDF format. To change the file simply upload a new one and click on 'Process' again."
+ "Upload any policy document in PDF format. To change the file simply upload a new one and click on 'Extract' again."
)
# Store the uploaded PDF document in a list
raw_pdf_document = st.file_uploader(
@@ -46,9 +61,15 @@ def main():
accept_multiple_files=False,
help="📁 Allowed document file format: '.pdf'",
)
- print(raw_pdf_document)
- if st.button("Process", type="primary"):
- with st.spinner("Processing..."):
+
+ if st.button(
+ "Extract",
+ type="primary",
+ help="🤖 Extracts text and structural information from the selected document.",
+ ):
+ with st.spinner(
+ "Processing... (this might take around 2-3 minutes if the document is new)"
+ ):
# TODO: Identify if the PDF document is new (hash the first few and last pages + page count)
# TODO: Identify the language of the document (load with pypdf2 and use langdetect)
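
A sketch of the first TODO above (fingerprinting a PDF so re-uploads of a known document can skip re-extraction), using pypdf; the function name and hashing scheme are illustrative:

    import hashlib
    from io import BytesIO
    from pypdf import PdfReader

    def quick_pdf_fingerprint(raw_bytes: bytes) -> str:
        # Hash the page count plus the text of the first and last pages,
        # a cheap stand-in for hashing the whole file.
        reader = PdfReader(BytesIO(raw_bytes))
        h = hashlib.sha256()
        h.update(str(len(reader.pages)).encode())
        h.update((reader.pages[0].extract_text() or "").encode())
        h.update((reader.pages[-1].extract_text() or "").encode())
        return h.hexdigest()
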
@@ -60,10 +81,16 @@ def main():
raw_pdf_document.getvalue(), input_file_name=raw_pdf_document.name
)
st.session_state.current_document = document
- st.write("Done processing selected document!")
- st.write(f"Document has {len(document.subsections)} main sections.")
- print(document.subsections, document.title)
- display_pdf(raw_pdf_document)
+ st.session_state.uploaded_file = raw_pdf_document
+ if "summaries_dict" in st.session_state:
+ del st.session_state.summaries_dict
+
+ if "uploaded_file" in st.session_state and "current_document" in st.session_state:
+ doc = st.session_state.current_document
+ st.write(
+ f"Document has {doc.n_pages} pages of extracted text and {len(doc.subsections)} main sections."
+ )
+ display_pdf(st.session_state.uploaded_file)
# Sidebar - Expander: LLM Options
with st.sidebar.expander("⚙️ LLModel options"):
@@ -89,10 +116,14 @@ def main():
st.session_state.oai_model = model
# init / update prompt_executor
if "prompt_executor" not in st.session_state:
- st.session_state["prompt_executor"] = OpenAIPromptExecutor(llm=ChatOpenAI(model=model))
+ st.session_state["prompt_executor"] = OpenAIPromptExecutor(
+ llm=ChatOpenAI(model=model, temperature=0, timeout=10)
+ )
else:
if model != st.session_state.prompt_executor.llm:
- st.session_state.prompt_executor.llm = ChatOpenAI(model=model)
+ st.session_state.prompt_executor.llm = ChatOpenAI(
+ model=model, temperature=0, timeout=10
+ )
st.caption(
"Please refer to the [OpenAI Models documentation](https://platform.openai.com/docs/models/) for more information."
@@ -117,54 +148,85 @@ def main():
st.title("OECD Policy Doc Explorer 🔎")
- analysis_tab, qna_tab = st.tabs(["📊 Analysis", "💬 QnA"])
-
- with analysis_tab:
- # Section tree with summaries
- st.subheader("Document overview", help="💡 Sections are provided by Adobe Extract API.")
- st.caption("Click on a section to see the summary and reveal respective subsections.")
- if "current_document" in st.session_state:
- display_section_tree(_document=st.session_state.current_document, summaries={})
- st.caption(
- "Summaries are generated only for the paragraphs in the section (not including paragraphs from subsections)."
- )
- with st.container():
- df = pd.read_csv("data/UK_34_binary_datasheet.csv")
+ placeholder = st.empty()
+
+ if "summaries_dict" not in st.session_state:
+ with placeholder.container():
+ if "current_document" not in st.session_state:
+ st.write("Please upload (and extract) a document in the sidebar first.")
+ else:
+ st.write(
+ "Document loaded! If you want to start the analysis or QnA, generating a summary is required."
+ )
+ st.caption(
+ "Note: This operation already costs money and it might take a while so please be patient :)"
+ )
+ if st.button("Analyze", type="primary"):
+ with st.spinner("Analyzing document..."):
+ get_summaries(st.session_state.current_document, raw_pdf_document)
+
+ st.write("☑️ Success!")
+ else:
+ with placeholder.container():
+ analysis_tab, qna_tab = st.tabs(["📊 Analysis", "💬 QnA"])
+
+ with analysis_tab:
+ # Section tree with summaries
+ st.subheader(
+ "Document overview",
+ help="💡 Sections are provided by Adobe Extract API.",
+ )
+ st.write("Click on a section to see the summary and reveal respective subsections.")
+ if "current_document" in st.session_state:
+ display_section_tree(
+ _document=st.session_state.current_document,
+ summaries=st.session_state.summaries_dict or {},
+ )
+ st.caption(
+ "Summaries are generated only for the paragraphs in the section (not including paragraphs from subsections)."
+ )
+ st.caption(
+ '⚠️ It is in your best interest to verify the summaries contain some meaningful text before proceeding to the QnA tab. A small number of documents are not OCR\'d correctly and thus might be relatively empty, resulting in a lot of "table of contents" or "references" summaries.'
+ )
+ with st.container():
+ df = pd.read_csv("data/UK_34_binary_datasheet.csv")
- import json
+ import json
- sheet = json.load(open("data/binary_datasheet.json"))
- existing_stis = [key for key in list(sheet.keys()) if sheet[key]["general"] == "1"]
+ sheet = json.load(open("data/binary_datasheet.json"))
+ existing_stis = [
+ key for key in list(sheet.keys()) if sheet[key]["general"] == "1"
+ ]
- st.selectbox("Select STIs", existing_stis)
+ st.selectbox("Select STIs", existing_stis)
- st.dataframe(df)
+ st.dataframe(df)
- with st.expander("Mini-report #1: Summary of the document"):
- st.write("")
+ with st.expander("Mini-report #1: Summary of the document"):
+ st.write("")
- with st.expander("Mini-report #2: ..."):
- st.write("")
+ with st.expander("Mini-report #2: ..."):
+ st.write("")
- with qna_tab:
- st.write(
- "Here you can ask any questions about the document and get answers from the model."
- )
- st.markdown(
- "> Note that this is not a chatbot, but a question answering system without memory. Each question is treated independently."
- )
+ with qna_tab:
+ st.write(
+ "Here you can ask any questions about the document and get answers from the model."
+ )
+ st.markdown(
+ "> Note that this is not a chatbot, but a question answering system without conversation memory. Each question is treated independently of the previous ones."
+ )
- chat_input = st.text_input("Ask a question here")
+ chat_input = st.text_input("Ask a question here")
- if chat_input:
- query_llm(chat_input, st.session_state.oai_model)
+ if chat_input:
+ query_llm(chat_input, st.session_state.oai_model)
- chat_question_choice_pressed = st.button("Examples")
+ chat_question_choice_pressed = st.button("Examples")
- if chat_question_choice_pressed:
- selected_question = st.selectbox(
- "Select a question", ["What are the skills mentioned in this document?"]
- )
+ if chat_question_choice_pressed:
+ selected_question = st.selectbox(
+ "Select a question", ["What are the skills mentioned in this document?"]
+ )
if __name__ == "__main__":
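
The @st.cache_data usage in get_summaries relies on a Streamlit convention: parameters whose names start with an underscore are excluded from the cache key. That is why the unhashable _document object is paired with file_path, which alone identifies the cache entry. A sketch of the pattern:

    import streamlit as st

    @st.cache_data
    def expensive_step(_document, file_path: str):
        # Streamlit does not hash "_document" (leading underscore), so
        # file_path alone keys the cache; two documents must never share
        # a file_path or their results would collide.
        return analyze(_document)  # hypothetical workload
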
diff --git a/app/preprocessing/adobe/model.py b/app/preprocessing/adobe/model.py
index 21aa807..d7ec772 100644
--- a/app/preprocessing/adobe/model.py
+++ b/app/preprocessing/adobe/model.py
@@ -160,6 +160,11 @@ def title_clean(self) -> Optional[str]:
"""Return a cleaned version of the title (without section number)"""
return re.sub(r"^(\d+\.?)+", "", self.title).lstrip()
+ @property
+ def paragraph_text(self) -> str:
+ """Return the text of all paragraphs in the section"""
+ return "\n".join([p.text for p in self.paragraphs if p.text])
+
@property
def text(self) -> Optional[str]:
"""Create a simple linear text representation of the document"""
@@ -188,10 +193,19 @@ class Document(Section):
def __init__(
self,
+ file_path: str,
title: Optional[str] = None,
pages: Optional[Set[int]] = None,
paragraphs: Optional[List[str]] = None,
subsections: Optional[List["Section"]] = None,
parent: Optional["Section"] = None,
) -> None:
+ # The file_path of the document (.../UK_02.pdf)
+ self.file_path = file_path
super().__init__("root", title, pages, "document", paragraphs, subsections, parent)
+
+ @property
+ def n_pages(self) -> int:
+ """Return the number of pages in the document"""
+ # Assumes the last section in reading order contains the document's final page
+ return max(self.all_sections[-1].pages)
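
One caveat on n_pages: it trusts the last entry of all_sections to carry the document's highest page number, which can undercount if a trailing page has no section of its own. A defensive variant (sketch) scans every section instead:

    @property
    def n_pages(self) -> int:
        # Take the maximum page seen anywhere in the section tree rather
        # than trusting the last section to contain the final page.
        return max(max(s.pages) for s in self.all_sections if s.pages)
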
diff --git a/app/preprocessing/adobe/parser.py b/app/preprocessing/adobe/parser.py
index 02d7197..cba4092 100644
--- a/app/preprocessing/adobe/parser.py
+++ b/app/preprocessing/adobe/parser.py
@@ -25,7 +25,7 @@ def adobe_extracted_pdf_to_document(self, extracted_pdf: AdobeExtractedPDF) -> D
# "H2": "subsection",
# "H3": "subsubsection",
# }
- document = Document()
+ document = Document(file_path=extracted_pdf.file_path)
curr_section = document
section_to_insert_header = {}
diff --git a/app/prompts.py b/app/prompts.py
index 1dd1be9..1e8d6e6 100644
--- a/app/prompts.py
+++ b/app/prompts.py
@@ -12,7 +12,7 @@
_human_type_tip_message = HumanMessage(content="Tip: Make sure to answer in the correct format")
_CREATE_SUMMARIES_SYSTEM_PROMPT = """
-You're an expert policy analyst that is analyzing an economic policy document. Your goal is to summarize a given section text of a document in no more than 15-20 sentences. Don't make a longer summary than the original text.
+You're an expert policy analyst analyzing an economic policy document. Your goal is to summarize a given section of the document in 13-20 sentences.
The section text will be given to you in the following json format:
```json
@@ -31,8 +31,9 @@
4. mention any discussion of funding, investments or budget allocations.
5. in the summary, make sure to mention whether there is a certain future need for any skills or technologies
6. mention any explicit skill needs that are mentioned in the text.
-7. if the section is a table of contents or an index, just return "table of contents" as the summary
-8. if the entire section contains only publication citations, don't summarize it just return "references" as the summary.
+7. if the entire section is a table of contents (e.g. line after line of headings followed by page numbers), just return "table of contents" as the summary.
+8. if the entire section contains only publication citations and nothing else, just return "references" as the summary.
+9. make the summary shorter than the original section text.
"""
_CREATE_SUMMARIES_INPUT_PROMPT = """
@@ -50,9 +51,11 @@
create_summaries_prompt_template = ChatPromptTemplate.from_messages(
[
- SystemMessage(content=_CREATE_SUMMARIES_SYSTEM_PROMPT),
- HumanMessage(content=_CREATE_SUMMARIES_INPUT_PROMPT),
- _human_type_tip_message,
+ # SystemMessage(content=_CREATE_SUMMARIES_SYSTEM_PROMPT),
+ ("system", _CREATE_SUMMARIES_SYSTEM_PROMPT),
+ # HumanMessage(content=_CREATE_SUMMARIES_INPUT_PROMPT),
+ ("human", _CREATE_SUMMARIES_INPUT_PROMPT),
+ # _human_type_tip_message,
]
)
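
The switch from message instances to ("system", ...) / ("human", ...) tuples matters in LangChain: a SystemMessage or HumanMessage instance is inserted verbatim, so {section_title}-style placeholders are never filled, whereas the tuple form builds a message template whose variables are interpolated at invoke time. A minimal illustration:

    from langchain_core.prompts import ChatPromptTemplate

    template = ChatPromptTemplate.from_messages([
        ("system", "You summarize policy documents."),
        # Tuple form creates a HumanMessagePromptTemplate, so {section_title}
        # is treated as an input variable and substituted at invoke time.
        ("human", "Summarize the section titled {section_title}."),
    ])
    print(template.invoke({"section_title": "Funding"}))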