Merge pull request #22 from geemi725/version2

Version2
geemi725 · Jan 20, 2025 · 34e7c84 · 34e7c84
2 parents 8b9305f + 7833506
commit 34e7c84
Show file tree

Hide file tree

Showing 8 changed files with 178 additions and 75 deletions.
diff --git a/.gitignore b/.gitignore
@@ -11,9 +11,12 @@ ipython_config.py
 *.png
 *.txt
 *.pyc
+*.DS_Store
 
 # pyenv
 .python-version
+.env
+.venv
 
 # Installer logs
 pip-log.txt

diff --git a/app.py b/app.py
@@ -6,6 +6,7 @@
 import os
 import streamlit as st
 import sys
+
 __import__("pysqlite3")
 sys.modules["sqlite3"] = sys.modules.pop("pysqlite3")
 load_dotenv()
@@ -23,11 +24,13 @@
     unsafe_allow_html=True,
 )
 
+
 def on_api_key_change():
     api_key = ss.get("api_key") or os.getenv("OPENAI_API_KEY")
     os.environ["OPENAI_API_KEY"] = api_key
     openai.api_key = api_key
 
+
 def save_uploadfile(uploadedfile):
     dirpath = os.path.join("data", "lit_dir")
     if os.path.exists(dirpath):
@@ -36,19 +39,22 @@ def save_uploadfile(uploadedfile):
     with open(os.path.join(dirpath, uploadedfile.name), "wb") as f:
         f.write(uploadedfile.getbuffer())
 
+
 st.write(
     "## Xpert AI: Extract human interpretable structure-property relationships from raw data"
 )
 st.write(
     """XpertAI trains a surrogate model to your dataset and extracts impactful features from your dataset using XAI tools.
-Currently, GPT-4 model is used to generate natural language explanations."""
+Currently, the GPT-4o model is used to generate natural language explanations."""
 )
 
+
 def run_autofill():
     st.session_state.auto_target = "toxicity of small molecules"
     st.session_state.auto_df = "tests/toxicity_sample_data.csv"
     st.experimental_rerun()
 
+
 auto_target = st.session_state.get("auto_target", None)
 auto_arxiv = st.session_state.get("auto_arxiv", None)
 
@@ -93,7 +99,8 @@ def run_autofill():
         "### Provide literature to generate scientific explanations! \nIf you don't provide literature, you will receive an explanation based on XAI tools."
     )
     lit_files = st.file_uploader(
-        "Upload your literature library here (Suggested):", accept_multiple_files=True
+        "Upload your literature here. Files must be in `pdf` format (Suggested):",
+        accept_multiple_files=True,
     )
     arxiv_keywords = st.text_input(
         "If you want to scrape arxiv, provide keywords for arxiv scraping:",
@@ -112,7 +119,7 @@ def run_autofill():
     st.markdown(
         """**Make sure to add your OpenAPI key**. 
                 You can download the input dataset after the explanation is generated.
-                Literature parsing is not used here."""
+                Literature is not scraped in this case."""
     )
 
     auto_button = st.button("Test Run", on_click=run_autofill)
@@ -174,25 +181,31 @@ def run_autofill():
 
     with st.spinner("Please wait...:computer: :speech_balloon:"):
         # read literature
+        lit_files_given = False
         if lit_files:
+            lit_files_given = True
             for file in lit_files:
                 save_uploadfile(file)
                 try:
                     vector_db(
                         lit_file=os.path.join("./data/lit_dir", file.name),
                         try_meta_data=True,
+                        clean=True,
                     )
                 except BaseException:
                     st.write("vectordb failed!!")
 
         # scrape arxiv.org
-
-        elif arxiv_keywords:
-            arg_dict_arxiv = {"key_words": arxiv_keywords, "max_papers": max_papers}
+        if arxiv_keywords:
+            arg_dict_arxiv = {
+                "key_words": arxiv_keywords,
+                "max_papers": max_papers,
+                "lit_files": lit_files_given,
+            }
 
             scrape_arxiv(arg_dict_arxiv)
 
-        elif not arxiv_keywords and not lit_files:
+        if not arxiv_keywords and not lit_files:
             st.markdown(
                 f"""### Literature is not provided to make an informed explanation. Based on XAI analysis, the following explanation can be given:
                 \n{explanation}"""
@@ -226,4 +239,4 @@ def run_autofill():
         with open("./data/figs.zip", "rb") as f:
             st.download_button(
                 "Download the outputs!", f, file_name="XpertAI_output.zip"
-            )
+            )
diff --git a/requirements.txt b/requirements.txt
@@ -3,14 +3,14 @@ ipython
 python-dotenv
 openai==0.28.1
 langchain==0.0.330
+lime
 matplotlib
-scikit-learn
+scikit-learn==1.4.0
 xgboost
 shap
 pandas
-lime
 arxiv
 chromadb==0.4.7
 pysqlite3-binary 
-tiktoken
+tiktoken==0.7.0
 pypdf
diff --git a/xpertai/prompts.py b/xpertai/prompts.py
@@ -9,27 +9,62 @@
 feature list: {features}
 observation: {observation}
 
-Your task is to go through {documents} and explain the relationship between the features in {features} and the {observation}
+Documents: \n
+{documents}
+
+- First, list all features identified by the XAI analysis {features} affecting the {observation}. 
+format: 
+### Features Identified by XAI Analysis
+- feature 1
+- feature 2
+...
+
+You are an expert scientist. Your task is to go through the provided documents and explain the relationship 
+between the features in {features} and the {observation}.
 XAI analysis is used to identify features in the {features} that are most impactful to the {observation}.
 Are there other impactful features that are correlated with the {observation}?
 
-You can follow the provided draft to answer:
-
-- First, list all features identified by the XAI analysis {features} affecting the {observation}. List additional features that may be correlated with the {observation}.
-\n Eg: {features} found from the XAI analysis.
+-Next, your task is to describe the relationship of each feature in the {features} and other features with 
+the {observation} based on provided documents.\n
+ Do the provided documents explicitly explain how each feature in the {features} affects the 
+ {observation}? If yes, provide the explanation.
+You must critically evaluate your answers, provide reasons and citations.\n 
+Each claim must be supported by scientific evidence. 
+In-line citations are required. \n eg: <claim (smith et al., 2020)> \n
 
-- Next, describe the relationship of each feature in the {features} and other features with the {observation}. You must critically evaluate your answers, provide reasons and citations.
-  Eg:  solubility of a molecule is affected by the number of hydroxyl groups in the molecule. This is because hydroxyl groups are polar and can form hydrogen bonds with water molecules. (Smith et al., 2019)
+Important: If the provided documents do not explicitly state the relationship between the feature 
+and the observation, you must say "an explicit relationship was not found in the given documents".\n 
+Instead, do the documents discuss synonymous features that are not identified by the XAI analysis? \n 
+You can critically evaluate the relationship between such correlated features and the {observation} and generate a 
+hypothesis based on the information provided in the documents. \n 
+Important: Each claim/hypothesis must be supported with citations. \n
 
-- Next, explain how each feature in the  {features} list affects the {observation} and how the {observation} be altered by changing the features.
-  Eg: The solubility of a molecule can be increased by adding more hydroxyl groups to the molecule.
+Format:
+#### <feature>: 
+**Explanation**: <relationship of feature to the observation>
+**Scientific Evidence**: <provide scientific evidence/citations from the documents> 
+**Hypothesis**: <your hypothesis>
 
-- Then, provide a summary of everything you described previously to describe the relationship between these features and the {observation}. You must sound like a scientist.
+- Then, provide a summary of everything you described previously to describe the relationship between these 
+features and the {observation}. You must sound like a scientist.
   Give scientific evidence for these answers and provide citations.
 
-- Finally, provide the list of references only used to answer. DO NOT make up references. Use APA style for referencing. \n
+- Finally, provide the list of only those references used to answer. DO NOT make up references. 
+Use APA style for referencing. \n
 Eg: References: \n
     1. reference 1 \n
     2. reference 2 \n
     ...
+
 """
+
+SUMMARIZE_PROMPT = """You are an expert scientist in chemistry. You are provided with an unformatted text excerpt from a journal article.\n
+Your task is to provide a clear and concise summary of the given text in a scientific manner. \n
+IMPORTANT: You must not hallucinate extra information. You must only summarize the text provided. \n
+text: {text}"""
+
+SUMMARIZE_PROMPT_WITH_QUESTION = """You are an expert scientist in chemistry. You are provided with an unformatted text excerpt from a journal article.\n
+Your task is to provide a clear and concise summary of the given text in a scientific manner. \n
+The excerpt is related to the following question: {question}
+IMPORTANT: You must not hallucinate extra information. You must only summarize the text provided. \n
+text: {text}"""
diff --git a/xpertai/tools/explain_model.py b/xpertai/tools/explain_model.py
@@ -27,19 +27,18 @@ def get_modelsummary(arg_dict):
     global clean
     clean = True
 
-    if not os.path.exists(save_dir):
-        os.makedirs(save_dir)
-    # arg_dict = json.loads(json_request)
     for k, val in arg_dict.items():
         globals()[k] = val
 
+    if not os.path.exists(save_dir):
+        os.makedirs(save_dir)
     # Step 1: train model
 
     if model_type == "Classifier":
-        train_xgbclassifier(df_init)
+        train_xgbclassifier(df_init, savedir=save_dir)
 
     elif model_type == "Regressor":
-        train_xgbregressor(df_init)
+        train_xgbregressor(df_init, savedir=save_dir)
 
     model_path = f"{save_dir}/xgbmodel.json"
 
@@ -51,12 +50,15 @@ def get_modelsummary(arg_dict):
             classifier = False
 
         top_shap_fts, shap_summary = explain_shap(
-            df_init=df_init, model_path=model_path, top_k=top_k, classifier=classifier
+            df_init=df_init,
+            model_path=model_path,
+            top_k=top_k,
+            classifier=classifier,
+            savedir=save_dir,
         )
         np.save(f"{save_dir}/top_shap_features.npy", top_shap_fts)
     else:
         shap_summary = ""
-    # np.save(f'{save_dir}/top_shap_features.npy',top_fts)
 
     # Step 3: Run Lime
     if XAI_tool == "LIME" or XAI_tool == "Both":
@@ -71,7 +73,7 @@ def get_modelsummary(arg_dict):
     f = open(f"{save_dir}/XAI_summary.txt", "w+")
     f.write(shap_summary + lime_summary)
     f.close()
-    metadata = {"Authors": "XpertAI", "Year": "2023", "Title": "XAI Summary"}
+    metadata = {"Authors": "XpertAI", "Year": "2024", "Title": "XAI Summary"}
 
     vector_db(
         persist_directory=persist_directory,

diff --git a/xpertai/tools/generate_nle.py b/xpertai/tools/generate_nle.py
@@ -1,9 +1,10 @@
 from langchain.chat_models import ChatOpenAI
 from langchain.memory import ConversationBufferMemory, ReadOnlySharedMemory
 from langchain import LLMChain, PromptTemplate
-from xpertai.prompts import REFINE_PROMPT, FORMAT_LABLES
+from xpertai.prompts import REFINE_PROMPT, FORMAT_LABLES, SUMMARIZE_PROMPT
 from .utils import *
 from langchain.embeddings.openai import OpenAIEmbeddings
+import pandas as pd
 
 embedding = OpenAIEmbeddings()
 
@@ -25,7 +26,7 @@ def gen_nle(arg_dict):
     save_dir = "./data"
     global persist_directory
     persist_directory = "./data/chroma/"
-    # arg_dict = json.loads(json_request)
+
     for k, val in arg_dict.items():
         globals()[k] = val
 
@@ -47,7 +48,7 @@ def gen_nle(arg_dict):
     # ****************
     # get human interpretable feature labels
     # #initiate retriever, chain
-    llm = ChatOpenAI(temperature=0.0, model_name="gpt-4", request_timeout=1000)
+    llm = ChatOpenAI(temperature=0.0, model_name="gpt-4o", request_timeout=1000)
 
     prompt_fts = PromptTemplate(template=FORMAT_LABLES, input_variables=["label"])
     memory = ConversationBufferMemory(memory_key="chat_history")
@@ -63,28 +64,52 @@ def gen_nle(arg_dict):
     features = ",".join(new_labels)
     db = Chroma(persist_directory=persist_directory, embedding_function=embedding)
     docs = []
+    rows = []
     # first collect docs for each feature
+    documents = ""
     for feature in new_labels:
-        initial_question = f"""How does the {feature} impact the {observation}?"""
+        initial_question = f"""How is {feature} related to {observation}?"""
         # Get relevant docs
-        fetched = db.max_marginal_relevance_search(initial_question, k=3)
-        docs.append(fetched)
-
-    # flatten list of docs
-    docs = [item for sublist in docs for item in sublist]
-
-    # add citations from metadata
-    documents = ""
-    for i in range(len(docs)):
-        doc = docs[i].page_content
-        try:
-            authors = docs[i].metadata["authors"]
-            year = docs[i].metadata["year"]
-            title = docs[i].metadata["source"]
-            documents += f"{doc} REFERENCE:({authors},{year},{title}) \n\n"
-
-        except BaseException:
-            documents += f"{doc} \n\n"
+        fetched = db.max_marginal_relevance_search(initial_question, k=5)
+        for document in fetched:
+            doc = document.page_content
+            summarize_prompt = PromptTemplate(
+                template=SUMMARIZE_PROMPT, input_variables=["text"]
+            )
+            summarize_chain = LLMChain(prompt=summarize_prompt, llm=llm)
+            summary = summarize_chain.run({"text": doc})
+
+            try:
+                authors = document.metadata["authors"]
+                year = document.metadata["year"]
+                title = document.metadata["source"]
+                reference = f"REFERENCE:({authors},{year},{title})"
+                documents += f"{summary} ({reference}) \n\n"
+                rows.append(
+                    {
+                        "feature": feature,
+                        "original": doc,
+                        "summary": summary,
+                        "reference": reference,
+                    }
+                )
+
+            except BaseException:
+                documents += f"{summary} \n\n"
+                rows.append(
+                    {
+                        "feature": feature,
+                        "original": doc,
+                        "summary": summary,
+                        "reference": "No reference found",
+                    }
+                )
+
+    # write to csv
+    # df = pd.DataFrame(rows)
+    # df.to_csv(f"{supporting_csv}", index=False)
+
+    # docs.append(fetched)
 
     prompt = PromptTemplate(
         template=REFINE_PROMPT, input_variables=["documents", "features", "observation"]