vishwasg217 · vikassrini · Jul 10, 2024
diff --git a/pages/2_🗂️_Annual_Report_Analyzer.py b/pages/2_🗂️_Annual_Report_Analyzer.py
@@ -6,14 +6,24 @@
 from llama_index.embeddings.langchain import LangchainEmbedding
 from llama_index.core.schema import Document
 from llama_index.core.node_parser import UnstructuredElementNodeParser
-
+from ragas.metrics import (
+    answer_relevancy,
+    faithfulness,
+    context_recall,
+    context_precision,
+)
+from datasets import Dataset
+from ragas import evaluate
+import pandas as pd
 from src.utils import get_model
 
 from src.fields2 import (
     fiscal_year, fiscal_year_attributes,
     strat_outlook, strat_outlook_attributes,
     risk_management, risk_management_attributes,
-    innovation, innovation_attributes
+    innovation, innovation_attributes,fiscal_year_questions,
+    strat_outlook_questions,risk_management_questions,
+    innovation_questions
 )
 
 import streamlit as st
@@ -66,6 +76,8 @@ def get_vector_index(nodes, vector_store):
 
 
 
+
+
 def generate_insight(engine, insight_name, section_name, output_format):
 
     with open("prompts/report.prompt", "r") as f:
@@ -79,41 +91,63 @@ def generate_insight(engine, insight_name, section_name, output_format):
     formatted_input = prompt_template.format(insight_name=insight_name, section_name=section_name, output_format=output_format)
     print(formatted_input)
     response = engine.query(formatted_input)
-    return response.response
+    # print(len(response.source_nodes))
+    return response
 
 
 
 def report_insights(engine, section_name, fields_to_include, section_num):
+    """Generate the enabled insights of one section plus its ragas evaluation data.
+
+    Returns ({"insights": {field: answer}}, rageseval); rageseval holds parallel "answer",
+    "contexts", "question" and "ground_truth" lists (ground truth comes from engine2).
+    """
     fields = None
     attribs = None
+    questions = None
 
     if section_num == 1:
         fields = fiscal_year
         attribs = fiscal_year_attributes
+        questions = fiscal_year_questions
     elif section_num == 2:
         fields = strat_outlook
         attribs = strat_outlook_attributes
+        questions = strat_outlook_questions
     elif section_num == 3:
         fields = risk_management
         attribs = risk_management_attributes
+        questions = risk_management_questions
     elif section_num == 4:
         fields = innovation
         attribs = innovation_attributes
-
+        questions = innovation_questions
+    rageseval = {"answer": [], "contexts": [], "question": [], "ground_truth": []}
     ins = {}
+    engine2 = get_query_engine(st.session_state.index.as_query_engine(similarity_top_k=3), "openai-1")
     for i, field in enumerate(attribs):
         if fields_to_include[i]:
             response = generate_insight(engine, field, section_name, str({field: fields[field]}))
-            ins[field] = response
-
+            gtobject = generate_insight(engine2, field, section_name, str({field: fields[field]}))
+            ins[field] = response.response
+            rageseval["answer"].append(response.response)
+            # Keep every retrieved chunk; no source nodes gives [] instead of IndexError.
+            context = [source.node.text for source in response.source_nodes]
+            rageseval["contexts"].append(context)
+            rageseval["question"].append(questions[field])
+            rageseval["ground_truth"].append(gtobject.response)
+
+    print("Printing ragesval : ")
+    print(rageseval)
     return {
         "insights": ins
-    }
+    },rageseval
 
-def get_query_engine(engine):
-    llm = get_model("openai")
+def get_query_engine(engine,model_name):
+    llm = get_model(model_name)
     service_context = ServiceContext.from_defaults(llm=llm)
 
+
     query_engine_tools = [
         QueryEngineTool(
             query_engine=engine,
@@ -213,24 +247,36 @@ def get_query_engine(engine):
                 innovation_focus = st.toggle("Innovation Focus")
 
                 innovation_and_rd_list = [r_and_d_activities, innovation_focus]
-
-
+        results={}
+        result=None
         with col2:
             if st.button("Analyze Report"):
-                engine = get_query_engine(st.session_state.index.as_query_engine(similarity_top_k=3))
+                engine = get_query_engine(st.session_state.index.as_query_engine(similarity_top_k=3),"openai")
+                print(st.session_state.index.service_context)
                 start_time = time.time()
 
                 with st.status("**Analyzing Report...**"):
-
                     if any(fiscal_year_highlights_list):
                         st.write("Fiscal Year Highlights...")
-
                         for i, insight in enumerate(fiscal_year_attributes):
                             if st.session_state[insight]:
                                 fiscal_year_highlights_list[i] = False
-
-                        response = report_insights(engine, "Fiscal Year Highlights", fiscal_year_highlights_list, 1)
-
+                                print(insight)
+                        response,ragesval = report_insights(engine, "Fiscal Year Highlights", fiscal_year_highlights_list, 1)
+                        dataframe=pd.DataFrame(ragesval)
+                        df=Dataset.from_dict(dataframe)
+                        result = evaluate(
+                                    df,
+                                    metrics=[
+                                        context_precision,
+                                        faithfulness,
+                                        answer_relevancy,
+                                        context_recall,
+                                    ],
+                                )
+                        results["Fiscal Year Highlights"]=result
+                        print("Printing Results: ")
+                        print(results)
                         for key, value in response["insights"].items():
                             st.session_state[key] = value
 
@@ -240,8 +286,21 @@ def get_query_engine(engine):
                         for i, insight in enumerate(strat_outlook_attributes):
                             if st.session_state[insight]:
                                 strategy_outlook_future_direction_list[i] = False
-                        response = report_insights(engine, "Strategy Outlook and Future Direction", strategy_outlook_future_direction_list, 2)
-
+                        response,ragesval = report_insights(engine, "Strategy Outlook and Future Direction", strategy_outlook_future_direction_list, 2)
+                        dataframe=pd.DataFrame(ragesval)
+                        df=Dataset.from_dict(dataframe)
+                        result = evaluate(
+                                    df,
+                                    metrics=[
+                                        context_precision,
+                                        faithfulness,
+                                        answer_relevancy,
+                                        context_recall,
+                                    ],
+                                )
+                        results["Strategy Outlook and Future Direction"]=result
+                        print("Printing Results: ")
+                        print(results)
                         for key, value in response["insights"].items():
                             st.session_state[key] = value
 
@@ -253,8 +312,21 @@ def get_query_engine(engine):
                             if st.session_state[insight]:
                                 risk_management_list[i] = False
 
-                        response = report_insights(engine, "Risk Management", risk_management_list, 3)
-
+                        response,ragesval = report_insights(engine, "Risk Management", risk_management_list, 3)
+                        dataframe=pd.DataFrame(ragesval)
+                        df=Dataset.from_dict(dataframe)
+                        result = evaluate(
+                                    df,
+                                    metrics=[
+                                        context_precision,
+                                        faithfulness,
+                                        answer_relevancy,
+                                        context_recall,
+                                    ],
+                                )
+                        results["Risk Management"]=result
+                        print("Printing Results: ")
+                        print(results)
                         for key, value in response["insights"].items():
                             st.session_state[key] = value
 
@@ -265,7 +337,22 @@ def get_query_engine(engine):
                             if st.session_state[insight]:
                                 innovation_and_rd_list[i] = False
 
-                        response = report_insights(engine, "Innovation and R&D", innovation_and_rd_list, 4)
+                        response,ragesval = report_insights(engine, "Innovation and R&D", innovation_and_rd_list, 4)
+                        dataframe=pd.DataFrame(ragesval)
+                        df=Dataset.from_dict(dataframe)
+                        result = evaluate(
+                                    df,
+                                    metrics=[
+                                        context_precision,
+                                        faithfulness,
+                                        answer_relevancy,
+                                        context_recall,
+                                    ],
+                                )
+                        results["Innovation and R&D"]=result
+                        print("Printing Results: ")
+                        print(results)
+
                         st.session_state.innovation_and_rd = response
 
                         for key, value in response["insights"].items():
@@ -314,7 +401,8 @@ def get_query_engine(engine):
                     st.error("This insight has not been generated")
                 # st.write("### Milestone Achievements")
                 # st.write(str(st.session_state.fiscal_year_highlights.milestone_achievements))
-
+                if "Fiscal Year Highlights" in results:  # section may not have been evaluated in this run
+                    st.write(results["Fiscal Year Highlights"].to_pandas())
             with tab2:
                 st.write("## Strategy Outlook and Future Direction")
                 try:
@@ -347,6 +435,8 @@ def get_query_engine(engine):
                             st.error("Product Roadmap insight has not been generated")
                 except:
                     st.error("This insight has not been generated")
+                if "Strategy Outlook and Future Direction" in results:  # section may not have been evaluated in this run
+                    st.write(results["Strategy Outlook and Future Direction"].to_pandas())
 
             with tab3:
                 st.write("## Risk Management")
@@ -369,7 +459,8 @@ def get_query_engine(engine):
                             st.error("Risk Mitigation insight has not been generated")
                 except:
                     st.error("This insight has not been generated")
-
+                if "Risk Management" in results:  # section may not have been evaluated in this run
+                    st.write(results["Risk Management"].to_pandas())
 
             with tab4:
                 st.write("## Innovation and R&D")
@@ -392,3 +483,5 @@ def get_query_engine(engine):
                             st.error("Innovation Focus insight has not been generated")
                 except:
                     st.error("This insight has not been generated")
+                if "Innovation and R&D" in results:  # section may not have been evaluated in this run
+                    st.write(results["Innovation and R&D"].to_pandas())
diff --git a/src/fields2.py b/src/fields2.py
@@ -38,6 +38,10 @@
 
 fiscal_year_attributes = ["performance_highlights", "major_events", "challenges_encountered"]
 
+fiscal_year_questions = {"performance_highlights": "What are the performance highlights?",
+                         "major_events": "Describe the major events and their impact ?",
+                         "challenges_encountered": "What are the challenges encountered in this financial year ?"}
+
 strat_outlook = {
     "strategic_initiatives": "The company's primary objectives and growth strategies for the upcoming years.",
     "market_outlook": "Insights into the broader market, competitive landscape, and industry trends the company anticipates.",
@@ -46,16 +50,26 @@
 
 strat_outlook_attributes = ["strategic_initiatives", "market_outlook", "product_roadmap"]
 
+strat_outlook_questions = {"strategic_initiatives": "What are the Strategic initiatives being discussed for the fiscal year",
+                           "market_outlook": "What is the market outlook for the fiscal year",
+                           "product_roadmap": "Elaborate on the product roadmap for the fiscal year"}
+
 risk_management = {
     "risk_factors": "Primary risks the company acknowledges.",
     "risk_mitigation": "Strategies for managing these risks."
 }
 
 risk_management_attributes = ["risk_factors", "risk_mitigation"]
 
+risk_management_questions = {"risk_factors": "What are the risk factors faced by the company ?",
+                             "risk_mitigation": "What are the measures being taken to mitigate the risks ? "}
+
 innovation = {
     "r_and_d_activities": "Overview of the company's focus on research and development, major achievements, or breakthroughs.",
     "innovation_focus": "Mention of new technologies, patents, or areas of research the company is diving into."
 }
 
-innovation_attributes = ["r_and_d_activities", "innovation_focus"]
+innovation_attributes = ["r_and_d_activities", "innovation_focus"]
+
+innovation_questions = {"r_and_d_activities": "Provide an overview on all the Research and Development activities, breakthroughs and achievements",
+                        "innovation_focus": "What are the new technologies, patents and areas of research the company is working on ?"}
diff --git a/src/utils.py b/src/utils.py
@@ -42,6 +42,8 @@ def get_model(model_name):
     OPENAI_API_KEY = os.environ.get("OPENAI_API_KEY")
     if model_name == "openai":
         model = ChatOpenAI(api_key=OPENAI_API_KEY, model="gpt-3.5-turbo")
+    else:
+        model=ChatOpenAI(api_key=OPENAI_API_KEY, model="gpt-4o")
     return model
 
 def process_pdf(pdfs):