Skip to content

Commit

Permalink
chore: fix app contents + summary printing
Browse files Browse the repository at this point in the history
  • Loading branch information
dvdblk committed Nov 26, 2023
1 parent 5647247 commit 8b47a26
Show file tree
Hide file tree
Showing 6 changed files with 150 additions and 63 deletions.
10 changes: 6 additions & 4 deletions app/gui/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -67,20 +67,22 @@ def display_section_tree(_document: Document, summaries: dict):

def add_hierarchy_tree(section, level=0):
result_markdown = '<div id="tree">'
should_add_expander = summaries.get(section.id) or section.subsections
# should_add_expander = summaries.get(section.id) or section.subsections
# always add expander
should_add_expander = True
if should_add_expander:
# Add details tag
result_markdown += "<details><summary>"

# Add section title and page number
result_markdown += f'<span id="treeTitle">{section.title}</span><span id="pageNr">{section.starting_page}</span>'
result_markdown += f'<span id="treeTitle">{section.title}</span><span id="pageNr">{section.starting_page+1}</span>'

if should_add_expander:
# Close details tag
result_markdown += "</summary>"

if summary := summaries.get(section.id):
result_markdown += f"<blockquote>{summary}</blockquote>"
summary = summaries.get(section.id)
result_markdown += f"<blockquote>Section summary: {summary or 'This section has no standalone text in its paragraphs.'}</blockquote>"

if section.subsections:
result_markdown += "<ul>"
Expand Down
12 changes: 9 additions & 3 deletions app/llm.py
Original file line number Diff line number Diff line change
Expand Up @@ -112,7 +112,6 @@ def wrapper(self, *args, **kwargs):
self.n_prompt_tokens += cb.prompt_tokens
self.n_completion_tokens += cb.completion_tokens
self.total_cost += cb.total_cost
print(cb)
return result

return wrapper
Expand Down Expand Up @@ -156,18 +155,25 @@ def create_summaries_chain(self, sections: List[Section]) -> SectionSummaryDict:
SectionSummaryOutput, self.llm, create_summaries_prompt_template
)
# Generate summaries for each section
i = 0
for section in sections:
section_text = section.text
section_text = section.paragraph_text

# Check if we need to call the API (only if text exists)
if len(section_text) > 0:
response = summary_runnable.invoke(
{"section_title": section.title_clean, "section_text": section.text}
{"section_title": section.title_clean, "section_text": section_text}
)
summary_dict[section.id] = response.summary
i += 1
else:
summary_dict[section.id] = None

if i == 20:
break

return summary_dict

@track_costs
def generic_question_chain(
self,
Expand Down
160 changes: 111 additions & 49 deletions app/main.py
Original file line number Diff line number Diff line change
Expand Up @@ -26,18 +26,33 @@ def init_state():
def query_llm(question, model):
    """Send a one-off question to the LLM via the session's prompt executor
    and render the answer in the Streamlit app.

    Args:
        question: Free-form user question about the loaded document.
        model: OpenAI model name. Currently unused here (the executor already
            holds the configured LLM); kept for interface compatibility.
    """
    if prompt_executor := st.session_state.prompt_executor:
        # NOTE(review): `.temp()` looks like a placeholder API name — confirm
        # it is the intended entry point on the prompt executor.
        result = prompt_executor.temp(question)
        st.write(result)
    else:
        # Previously this silently did nothing (and leaked debug prints of
        # cost/token counters to stdout); surface the problem to the user.
        st.warning("Please configure a model in the sidebar first.")


@st.cache_data
def get_summaries(_document, file_path):
    """Generate per-section summaries for the current document and store them
    in ``st.session_state.summaries_dict``.

    The result is cached by Streamlit: ``file_path`` is the cache key, while
    ``_document`` (leading underscore) is deliberately excluded from hashing.

    Args:
        _document: AdobeExtractAPIManager.get_document() object
        file_path: path to the file (not used within the method, only used as
            the st.cache_data caching key)
    """
    # Guard clause: without a configured prompt executor there is no LLM to
    # call.  (The original code warned "Please select a document first." here,
    # which was misleading — the document may well be selected already.)
    if not (prompt_executor := st.session_state.prompt_executor):
        st.warning("Please configure a model in the sidebar first.")
        return

    if document := st.session_state.current_document:
        summaries = prompt_executor.create_summaries_chain(document.all_sections)
        st.session_state.summaries_dict = summaries
    else:
        st.warning("Please select a document first.")


def main():
st.set_page_config(page_title="OECD Policy Explorer", page_icon="💼")

# Sidebar - Expander: Document selection
with st.sidebar.expander("📖 Document", expanded=True):
st.write(
"Upload any policy document in PDF format. To change the file simply upload a new one and click on 'Process' again."
"Upload any policy document in PDF format. To change the file simply upload a new one and click on 'Extract' again."
)
# Store the uploaded PDF document in a list
raw_pdf_document = st.file_uploader(
Expand All @@ -46,9 +61,15 @@ def main():
accept_multiple_files=False,
help="📁 Allowed document file format: '.pdf'",
)
print(raw_pdf_document)
if st.button("Process", type="primary"):
with st.spinner("Processing..."):

if st.button(
"Extract",
type="primary",
help="🤖 Extracts text and structural information from the selected document.",
):
with st.spinner(
"Processing... (this might take around 2-3 minutes if the document is new)"
):
# TODO: Identify if the PDF document is new (hash the first few and last pages + page count)
# TODO: Identify the language of the document (load with pypdf2 and use langdetect)

Expand All @@ -60,10 +81,16 @@ def main():
raw_pdf_document.getvalue(), input_file_name=raw_pdf_document.name
)
st.session_state.current_document = document
st.write("Done processing selected document!")
st.write(f"Document has {len(document.subsections)} main sections.")
print(document.subsections, document.title)
display_pdf(raw_pdf_document)
st.session_state.uploaded_file = raw_pdf_document
if "summaries_dict" in st.session_state:
del st.session_state.summaries_dict

if "uploaded_file" in st.session_state and "current_document" in st.session_state:
doc = st.session_state.current_document
st.write(
f"Document has {doc.n_pages} pages of extracted text and {len(doc.subsections)} main sections."
)
display_pdf(st.session_state.uploaded_file)

# Sidebar - Expander: LLM Options
with st.sidebar.expander("⚙️ LLModel options"):
Expand All @@ -89,10 +116,14 @@ def main():
st.session_state.oai_model = model
# init / update prompt_executor
if "prompt_executor" not in st.session_state:
st.session_state["prompt_executor"] = OpenAIPromptExecutor(llm=ChatOpenAI(model=model))
st.session_state["prompt_executor"] = OpenAIPromptExecutor(
llm=ChatOpenAI(model=model, temperature=0, timeout=10)
)
else:
if model != st.session_state.prompt_executor.llm:
st.session_state.prompt_executor.llm = ChatOpenAI(model=model)
st.session_state.prompt_executor.llm = ChatOpenAI(
model=model, temperature=0, timeout=10
)

st.caption(
"Please refer to the [OpenAI Models documentation](https://platform.openai.com/docs/models/) for more information."
Expand All @@ -117,54 +148,85 @@ def main():

st.title("OECD Policy Doc Explorer 🔎")

analysis_tab, qna_tab = st.tabs(["📊 Analysis", "💬 QnA"])

with analysis_tab:
# Section tree with summaries
st.subheader("Document overview", help="💡 Sections are provided by Adobe Extract API.")
st.caption("Click on a section to see the summary and reveal respective subsections.")
if "current_document" in st.session_state:
display_section_tree(_document=st.session_state.current_document, summaries={})
st.caption(
"Summaries are generated only for the paragraphs in the section (not including paragraphs from subsections)."
)
with st.container():
df = pd.read_csv("data/UK_34_binary_datasheet.csv")
placeholder = st.empty()

if "summaries_dict" not in st.session_state:
with placeholder.container():
if "current_document" not in st.session_state:
st.write("Please upload (and extract) a document in the sidebar first.")
else:
st.write(
"Document loaded! If you want to start the analysis or QnA, generating a summary is required."
)
st.caption(
"Note: This operation already costs money and it might take a while so please be patient :)"
)
if st.button("Analyze", type="primary"):
with st.spinner("Analyzing document..."):
get_summaries(st.session_state.current_document, raw_pdf_document)

st.write("☑️ Success!")
else:
with placeholder.container():
analysis_tab, qna_tab = st.tabs(["📊 Analysis", "💬 QnA"])

with analysis_tab:
# Section tree with summaries
st.subheader(
"Document overview",
help="💡 Sections are provided by Adobe Extract API.",
)
st.write("Click on a section to see the summary and reveal respective subsections.")
if "current_document" in st.session_state:
display_section_tree(
_document=st.session_state.current_document,
summaries=st.session_state.summaries_dict or {},
)
st.caption(
"Summaries are generated only for the paragraphs in the section (not including paragraphs from subsections)."
)
st.caption(
'⚠️ It is in your best interest to verify the summaries contain some meaningful text before proceeding to the QnA tab. A small number of documents are not OCR\'d correctly and thus might be relatively empty, resulting in a lot of "table of contents" or "references" summaries.'
)
with st.container():
df = pd.read_csv("data/UK_34_binary_datasheet.csv")

import json
import json

sheet = json.load(open("data/binary_datasheet.json"))
existing_stis = [key for key in list(sheet.keys()) if sheet[key]["general"] == "1"]
sheet = json.load(open("data/binary_datasheet.json"))
existing_stis = [
key for key in list(sheet.keys()) if sheet[key]["general"] == "1"
]

st.selectbox("Select STIs", existing_stis)
st.selectbox("Select STIs", existing_stis)

st.dataframe(df)
st.dataframe(df)

with st.expander("Mini-report #1: Summary of the document"):
st.write("<Summarized document goes here>")
with st.expander("Mini-report #1: Summary of the document"):
st.write("<Summarized document goes here>")

with st.expander("Mini-report #2: ..."):
st.write("")
with st.expander("Mini-report #2: ..."):
st.write("")

with qna_tab:
st.write(
"Here you can ask any questions about the document and get answers from the model."
)
st.markdown(
"> Note that this is not a chatbot, but a question answering system without memory. Each question is treated independently."
)
with qna_tab:
st.write(
"Here you can ask any questions about the document and get answers from the model."
)
st.markdown(
"> Note that this is not a chatbot, but a question answering system without conversation memory. Each question is treated independently of the previous ones."
)

chat_input = st.text_input("Ask a question here")
chat_input = st.text_input("Ask a question here")

if chat_input:
query_llm(chat_input, st.session_state.oai_model)
if chat_input:
query_llm(chat_input, st.session_state.oai_model)

chat_question_choice_pressed = st.button("Examples")
chat_question_choice_pressed = st.button("Examples")

if chat_question_choice_pressed:
selected_question = st.selectbox(
"Select a question", ["What are the skills mentioned in this document?"]
)
if chat_question_choice_pressed:
selected_question = st.selectbox(
"Select a question", ["What are the skills mentioned in this document?"]
)


if __name__ == "__main__":
Expand Down
14 changes: 14 additions & 0 deletions app/preprocessing/adobe/model.py
Original file line number Diff line number Diff line change
Expand Up @@ -160,6 +160,11 @@ def title_clean(self) -> Optional[str]:
"""Return a cleaned version of the title (without section number)"""
return re.sub(r"^(\d+\.?)+", "", self.title).lstrip()

@property
def paragraph_text(self) -> str:
    """Concatenate the non-empty paragraph texts of this section, one per line."""
    texts = (para.text for para in self.paragraphs)
    return "\n".join(text for text in texts if text)

@property
def text(self) -> Optional[str]:
"""Create a simple linear text representation of the document"""
Expand Down Expand Up @@ -188,10 +193,19 @@ class Document(Section):

def __init__(
    self,
    file_path: str,
    title: Optional[str] = None,
    pages: Optional[Set[int]] = None,
    paragraphs: Optional[List[str]] = None,
    subsections: Optional[List["Section"]] = None,
    parent: Optional["Section"] = None,
) -> None:
    """Initialize a Document, the root section of a parsed PDF.

    Args:
        file_path: Path of the source PDF on disk (e.g. ``.../UK_02.pdf``).
        title: Optional document title.
        pages: Optional set of page numbers covered by the document.
        paragraphs: Optional list of the document's own paragraphs.
        subsections: Optional list of top-level child sections.
        parent: Parent section; normally ``None`` for a root document.

    The section id is fixed to ``"root"`` and the section kind to
    ``"document"`` via the ``super().__init__`` call — positional argument
    order must match the base ``Section`` constructor.
    """
    # The file_path of the document (.../UK_02.pdf)
    self.file_path = file_path
    super().__init__("root", title, pages, "document", paragraphs, subsections, parent)

@property
def n_pages(self) -> int:
    """Return the number of pages in the document.

    NOTE(review): this assumes the last entry of ``all_sections`` contains
    the highest page number — confirm sections are ordered by page.  Pages
    also appear to be 0-indexed elsewhere in the app (the GUI renders
    ``starting_page + 1``), so this may be the last page *index* rather
    than a true page count — verify against a known document.
    """
    # max() gets the highest page directly; the previous
    # sorted(...)[-1] sorted the whole set just to read one element.
    return max(self.all_sections[-1].pages)
2 changes: 1 addition & 1 deletion app/preprocessing/adobe/parser.py
Original file line number Diff line number Diff line change
Expand Up @@ -25,7 +25,7 @@ def adobe_extracted_pdf_to_document(self, extracted_pdf: AdobeExtractedPDF) -> D
# "H2": "subsection",
# "H3": "subsubsection",
# }
document = Document()
document = Document(file_path=extracted_pdf.file_path)

curr_section = document
section_to_insert_header = {}
Expand Down
15 changes: 9 additions & 6 deletions app/prompts.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,7 @@
_human_type_tip_message = HumanMessage(content="Tip: Make sure to answer in the correct format")

_CREATE_SUMMARIES_SYSTEM_PROMPT = """
You're an expert policy analyst that is analyzing an economic policy document. Your goal is to summarize a given section text of a document in no more than 15-20 sentences. Don't make a longer summary than the original text.
You're an expert policy analyst that is analyzing an economic policy document. Your goal is to summarize a given section text of a document with 13-20 sentences.
The section text will be given to you in the following json format:
```json
Expand All @@ -31,8 +31,9 @@
4. mention any discussion of funding, investments or budget allocations.
5. in the summary, make sure to mention whether there is a certain future need for any skills or technologies
6. mention any explicit skill needs that are mentioned in the text.
7. if the section is a table of contents or an index, just return "table of contents" as the summary
8. if the entire section contains only publication citations, don't summarize it just return "references" as the summary.
7. if the entire section is a table of contents (e.g. line after line of headings followed by page number) just return "table of contents" as the summary
8. if the entire section contains only publication citations and nothing else, just return "references" as the summary.
9. make a shorter summary than the original section text
"""

_CREATE_SUMMARIES_INPUT_PROMPT = """
Expand All @@ -50,9 +51,11 @@

# Chat prompt for the section-summary chain: the system message carries the
# summarization rules, the human message carries the section JSON payload.
# Tuple form ("role", template_string) is used so that template variables in
# the strings are interpolated by LangChain; plain SystemMessage/HumanMessage
# objects would treat their content as literal text.  Dead commented-out
# Message-object variants from before the migration have been removed.
create_summaries_prompt_template = ChatPromptTemplate.from_messages(
    [
        ("system", _CREATE_SUMMARIES_SYSTEM_PROMPT),
        ("human", _CREATE_SUMMARIES_INPUT_PROMPT),
    ]
)

Expand Down

0 comments on commit 8b47a26

Please sign in to comment.