From 8b47a26eeefdcce2a4d8e6d56c91bb74e70e9f27 Mon Sep 17 00:00:00 2001 From: dvdblk Date: Mon, 20 Nov 2023 18:27:45 +0100 Subject: [PATCH] chore: fix app contents + summary printing --- app/gui/utils.py | 10 +- app/llm.py | 12 ++- app/main.py | 160 +++++++++++++++++++++--------- app/preprocessing/adobe/model.py | 14 +++ app/preprocessing/adobe/parser.py | 2 +- app/prompts.py | 15 +-- 6 files changed, 150 insertions(+), 63 deletions(-) diff --git a/app/gui/utils.py b/app/gui/utils.py index 1379d8c..b454349 100644 --- a/app/gui/utils.py +++ b/app/gui/utils.py @@ -67,20 +67,22 @@ def display_section_tree(_document: Document, summaries: dict): def add_hierarchy_tree(section, level=0): result_markdown = '
'
-        should_add_expander = summaries.get(section.id) or section.subsections
+        # should_add_expander = summaries.get(section.id) or section.subsections
+        # always add expander
+        should_add_expander = True
         if should_add_expander:
             # Add details tag
             result_markdown += "<details><summary>"
         # Add section title and page number
-        result_markdown += f'{section.title} <em>p. {section.starting_page}</em>'
+        result_markdown += f'{section.title} <em>p. {section.starting_page+1}</em>'
         if should_add_expander:
             # Close details tag
             result_markdown += "</summary>"

-        if summary := summaries.get(section.id):
-            result_markdown += f"<blockquote>{summary}</blockquote>"
+        summary = summaries.get(section.id)
+        result_markdown += f"<blockquote>Section summary: {summary or 'This section has no standalone text in its paragraphs.'}</blockquote>"
         if section.subsections:
             result_markdown += "<br>&nbsp;&nbsp;&nbsp;&nbsp;
    " diff --git a/app/llm.py b/app/llm.py index de1a73a..dea17b2 100644 --- a/app/llm.py +++ b/app/llm.py @@ -112,7 +112,6 @@ def wrapper(self, *args, **kwargs): self.n_prompt_tokens += cb.prompt_tokens self.n_completion_tokens += cb.completion_tokens self.total_cost += cb.total_cost - print(cb) return result return wrapper @@ -156,18 +155,25 @@ def create_summaries_chain(self, sections: List[Section]) -> SectionSummaryDict: SectionSummaryOutput, self.llm, create_summaries_prompt_template ) # Generate summaries for each section + i = 0 for section in sections: - section_text = section.text + section_text = section.paragraph_text # Check if we need to call the API (only if text exists) if len(section_text) > 0: response = summary_runnable.invoke( - {"section_title": section.title_clean, "section_text": section.text} + {"section_title": section.title_clean, "section_text": section_text} ) summary_dict[section.id] = response.summary + i += 1 else: summary_dict[section.id] = None + if i == 20: + break + + return summary_dict + @track_costs def generic_question_chain( self, diff --git a/app/main.py b/app/main.py index 6cd1055..3e52fa1 100644 --- a/app/main.py +++ b/app/main.py @@ -26,18 +26,33 @@ def init_state(): def query_llm(question, model): if prompt_executor := st.session_state.prompt_executor: result = prompt_executor.temp(question) - print(prompt_executor.total_cost) - print(prompt_executor.n_prompt_tokens) st.write(result) +@st.cache_data +def get_summaries(_document, file_path): + """ + Args: + _document: AdobeExtractAPIManager.get_document() object + file_path: path to the file (not used within the method, only used as the st.cache_data caching key) + """ + if prompt_executor := st.session_state.prompt_executor: + if document := st.session_state.current_document: + summaries = prompt_executor.create_summaries_chain(document.all_sections) + st.session_state.summaries_dict = summaries + else: + st.warning("Please select a document first.") + else: + st.warning("Please select a document first.") + + def main(): st.set_page_config(page_title="OECD Policy Explorer", page_icon="💼") # Sidebar - Expander: Document selection with st.sidebar.expander("📖 Document", expanded=True): st.write( - "Upload any policy document in PDF format. To change the file simply upload a new one and click on 'Process' again." + "Upload any policy document in PDF format. To change the file simply upload a new one and click on 'Extract' again." ) # Store the uploaded PDF document in a list raw_pdf_document = st.file_uploader( @@ -46,9 +61,15 @@ def main(): accept_multiple_files=False, help="📁 Allowed document file format: '.pdf'", ) - print(raw_pdf_document) - if st.button("Process", type="primary"): - with st.spinner("Processing..."): + + if st.button( + "Extract", + type="primary", + help="🤖 Extracts text and structural information from the selected document.", + ): + with st.spinner( + "Processing... 
(this might take around 2-3 minutes if the document is new)" + ): # TODO: Identify if the PDF document is new (hash the first few and last pages + page count) # TODO: Identify the language of the document (load with pypdf2 and use langdetect) @@ -60,10 +81,16 @@ def main(): raw_pdf_document.getvalue(), input_file_name=raw_pdf_document.name ) st.session_state.current_document = document - st.write("Done processing selected document!") - st.write(f"Document has {len(document.subsections)} main sections.") - print(document.subsections, document.title) - display_pdf(raw_pdf_document) + st.session_state.uploaded_file = raw_pdf_document + if "summaries_dict" in st.session_state: + del st.session_state.summaries_dict + + if "uploaded_file" in st.session_state and "current_document" in st.session_state: + doc = st.session_state.current_document + st.write( + f"Document has {doc.n_pages} pages of extracted text and {len(doc.subsections)} main sections." + ) + display_pdf(st.session_state.uploaded_file) # Sidebar - Expander: LLM Options with st.sidebar.expander("⚙️ LLModel options"): @@ -89,10 +116,14 @@ def main(): st.session_state.oai_model = model # init / update prompt_executor if "prompt_executor" not in st.session_state: - st.session_state["prompt_executor"] = OpenAIPromptExecutor(llm=ChatOpenAI(model=model)) + st.session_state["prompt_executor"] = OpenAIPromptExecutor( + llm=ChatOpenAI(model=model, temperature=0, timeout=10) + ) else: if model != st.session_state.prompt_executor.llm: - st.session_state.prompt_executor.llm = ChatOpenAI(model=model) + st.session_state.prompt_executor.llm = ChatOpenAI( + model=model, temperature=0, timeout=10 + ) st.caption( "Please refer to the [OpenAI Models documentation](https://platform.openai.com/docs/models/) for more information." @@ -117,54 +148,85 @@ def main(): st.title("OECD Policy Doc Explorer 🔎") - analysis_tab, qna_tab = st.tabs(["📊 Analysis", "💬 QnA"]) - - with analysis_tab: - # Section tree with summaries - st.subheader("Document overview", help="💡 Sections are provided by Adobe Extract API.") - st.caption("Click on a section to see the summary and reveal respective subsections.") - if "current_document" in st.session_state: - display_section_tree(_document=st.session_state.current_document, summaries={}) - st.caption( - "Summaries are generated only for the paragraphs in the section (not including paragraphs from subsections)." - ) - with st.container(): - df = pd.read_csv("data/UK_34_binary_datasheet.csv") + placeholder = st.empty() + + if "summaries_dict" not in st.session_state: + with placeholder.container(): + if "current_document" not in st.session_state: + st.write("Please upload (and extract) a document in the sidebar first.") + else: + st.write( + "Document loaded! If you want to start the analysis or QnA, generating a summary is required." 
+ ) + st.caption( + "Note: This operation already costs money and it might take a while so please be patient :)" + ) + if st.button("Analyze", type="primary"): + with st.spinner("Analyzing document..."): + get_summaries(st.session_state.current_document, raw_pdf_document) + + st.write("☑️ Success!") + else: + with placeholder.container(): + analysis_tab, qna_tab = st.tabs(["📊 Analysis", "💬 QnA"]) + + with analysis_tab: + # Section tree with summaries + st.subheader( + "Document overview", + help="💡 Sections are provided by Adobe Extract API.", + ) + st.write("Click on a section to see the summary and reveal respective subsections.") + if "current_document" in st.session_state: + display_section_tree( + _document=st.session_state.current_document, + summaries=st.session_state.summaries_dict or {}, + ) + st.caption( + "Summaries are generated only for the paragraphs in the section (not including paragraphs from subsections)." + ) + st.caption( + '⚠️ It is in your best interest to verify the summaries contain some meaningful text before proceeding to the QnA tab. A small number of documents are not OCR\'d correctly and thus might be relatively empty, resulting in a lot of "table of contents" or "references" summaries.' + ) + with st.container(): + df = pd.read_csv("data/UK_34_binary_datasheet.csv") - import json + import json - sheet = json.load(open("data/binary_datasheet.json")) - existing_stis = [key for key in list(sheet.keys()) if sheet[key]["general"] == "1"] + sheet = json.load(open("data/binary_datasheet.json")) + existing_stis = [ + key for key in list(sheet.keys()) if sheet[key]["general"] == "1" + ] - st.selectbox("Select STIs", existing_stis) + st.selectbox("Select STIs", existing_stis) - st.dataframe(df) + st.dataframe(df) - with st.expander("Mini-report #1: Summary of the document"): - st.write("") + with st.expander("Mini-report #1: Summary of the document"): + st.write("") - with st.expander("Mini-report #2: ..."): - st.write("") + with st.expander("Mini-report #2: ..."): + st.write("") - with qna_tab: - st.write( - "Here you can ask any questions about the document and get answers from the model." - ) - st.markdown( - "> Note that this is not a chatbot, but a question answering system without memory. Each question is treated independently." - ) + with qna_tab: + st.write( + "Here you can ask any questions about the document and get answers from the model." + ) + st.markdown( + "> Note that this is not a chatbot, but a question answering system without conversation memory. Each question is treated independently of the previous ones." 
+                )

-            chat_input = st.text_input("Ask a question here")
+                chat_input = st.text_input("Ask a question here")

-            if chat_input:
-                query_llm(chat_input, st.session_state.oai_model)
+                if chat_input:
+                    query_llm(chat_input, st.session_state.oai_model)

-            chat_question_choice_pressed = st.button("Examples")
+                chat_question_choice_pressed = st.button("Examples")

-            if chat_question_choice_pressed:
-                selected_question = st.selectbox(
-                    "Select a question", ["What are the skills mentioned in this document?"]
-                )
+                if chat_question_choice_pressed:
+                    selected_question = st.selectbox(
+                        "Select a question", ["What are the skills mentioned in this document?"]
+                    )


 if __name__ == "__main__":
diff --git a/app/preprocessing/adobe/model.py b/app/preprocessing/adobe/model.py
index 21aa807..d7ec772 100644
--- a/app/preprocessing/adobe/model.py
+++ b/app/preprocessing/adobe/model.py
@@ -160,6 +160,11 @@ def title_clean(self) -> Optional[str]:
         """Return a cleaned version of the title (without section number)"""
         return re.sub(r"^(\d+\.?)+", "", self.title).lstrip()

+    @property
+    def paragraph_text(self) -> str:
+        """Return the text of all paragraphs in the section"""
+        return "\n".join([p.text for p in self.paragraphs if p.text])
+
     @property
     def text(self) -> Optional[str]:
         """Create a simple linear text representation of the document"""
@@ -188,10 +193,19 @@ class Document(Section):

     def __init__(
         self,
+        file_path: str,
         title: Optional[str] = None,
         pages: Optional[Set[int]] = None,
         paragraphs: Optional[List[str]] = None,
         subsections: Optional[List["Section"]] = None,
         parent: Optional["Section"] = None,
     ) -> None:
+        # The file_path of the document (.../UK_02.pdf)
+        self.file_path = file_path
         super().__init__("root", title, pages, "document", paragraphs, subsections, parent)
+
+    @property
+    def n_pages(self) -> int:
+        """Return the number of pages in the document"""
+        # The last nested subsection holds the highest page; pages are 0-indexed, hence the +1
+        return sorted(self.all_sections[-1].pages)[-1] + 1
diff --git a/app/preprocessing/adobe/parser.py b/app/preprocessing/adobe/parser.py
index 02d7197..cba4092 100644
--- a/app/preprocessing/adobe/parser.py
+++ b/app/preprocessing/adobe/parser.py
@@ -25,7 +25,7 @@ def adobe_extracted_pdf_to_document(self, extracted_pdf: AdobeExtractedPDF) -> D
         #     "H2": "subsection",
         #     "H3": "subsubsection",
         # }
-        document = Document()
+        document = Document(file_path=extracted_pdf.file_path)
         curr_section = document
         section_to_insert_header = {}

diff --git a/app/prompts.py b/app/prompts.py
index 1dd1be9..1e8d6e6 100644
--- a/app/prompts.py
+++ b/app/prompts.py
@@ -12,7 +12,7 @@
 _human_type_tip_message = HumanMessage(content="Tip: Make sure to answer in the correct format")

 _CREATE_SUMMARIES_SYSTEM_PROMPT = """
-You're an expert policy analyst that is analyzing an economic policy document. Your goal is to summarize a given section text of a document in no more than 15-20 sentences. Don't make a longer summary than the original text.
+You're an expert policy analyst who is analyzing an economic policy document. Your goal is to summarize a given section text of a document in 13-20 sentences.

 The section text will be given to you in the following json format:

 ```json
@@ -31,8 +31,9 @@
 4. mention any discussion of funding, investments or budget allocations.
 5. in the summary, make sure to mention whether there is a certain future need for any skills or technologies
 6. mention any explicit skill needs that are mentioned in the text.
-7. if the section is a table of contents or an index, just return "table of contents" as the summary
-8. if the entire section contains only publication citations, don't summarize it just return "references" as the summary.
+7. if the entire section is a table of contents (e.g. line after line of headings, each followed by a page number), just return "table of contents" as the summary
+8. if the entire section contains only publication citations and nothing else, just return "references" as the summary.
+9. keep the summary shorter than the original section text
 """

 _CREATE_SUMMARIES_INPUT_PROMPT = """
@@ -50,9 +51,11 @@

 create_summaries_prompt_template = ChatPromptTemplate.from_messages(
     [
-        SystemMessage(content=_CREATE_SUMMARIES_SYSTEM_PROMPT),
-        HumanMessage(content=_CREATE_SUMMARIES_INPUT_PROMPT),
-        _human_type_tip_message,
+        # SystemMessage(content=_CREATE_SUMMARIES_SYSTEM_PROMPT),
+        ("system", _CREATE_SUMMARIES_SYSTEM_PROMPT),
+        # HumanMessage(content=_CREATE_SUMMARIES_INPUT_PROMPT),
+        ("human", _CREATE_SUMMARIES_INPUT_PROMPT),
+        # _human_type_tip_message,
     ]
 )
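
--
Post-diff note (illustrative, not part of the patch): the sketch below shows how the pieces this
change touches fit together end-to-end, outside Streamlit. get_document(), create_summaries_chain(),
paragraph_text, n_pages, title_clean and all_sections are used exactly as in the diff above; the
import paths, the AdobeExtractAPIManager constructor arguments and the "gpt-3.5-turbo" model name
are assumptions from context, not confirmed by the diff.

    # sketch.py -- hedged usage sketch under the stated assumptions
    from langchain.chat_models import ChatOpenAI  # import path assumed for this era of langchain

    from app.llm import OpenAIPromptExecutor  # module path assumed
    from app.preprocessing.adobe.manager import AdobeExtractAPIManager  # module path assumed

    # Constructor arguments assumed; Adobe credentials presumably come from env/config.
    extract_manager = AdobeExtractAPIManager()

    with open("data/UK_02.pdf", "rb") as f:
        # Same call shape as in app/main.py: raw PDF bytes plus the original file name.
        document = extract_manager.get_document(f.read(), input_file_name="UK_02.pdf")

    # file_path and n_pages are the attributes this patch adds to Document.
    print(f"{document.file_path}: {document.n_pages} pages, {len(document.subsections)} main sections")

    executor = OpenAIPromptExecutor(llm=ChatOpenAI(model="gpt-3.5-turbo", temperature=0, timeout=10))

    # create_summaries_chain() now summarizes section.paragraph_text (the section's own
    # paragraphs, not its subsections) and stops after 20 API calls, so a long document
    # incurs a bounded cost; sections it skips map to None.
    summaries = executor.create_summaries_chain(document.all_sections)
    for section in document.all_sections:
        print(section.title_clean, "->", summaries.get(section.id))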