Skip to content

Commit

Permalink
Merge pull request #22 from geemi725/version2
Browse files Browse the repository at this point in the history
Version2
  • Loading branch information
geemi725 authored Jan 20, 2025
2 parents 8b9305f + 7833506 commit 34e7c84
Show file tree
Hide file tree
Showing 8 changed files with 178 additions and 75 deletions.
3 changes: 3 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -11,9 +11,12 @@ ipython_config.py
*.png
*.txt
*.pyc
*.DS_Store

# pyenv
.python-version
.env
.venv

# Installer logs
pip-log.txt
Expand Down
29 changes: 21 additions & 8 deletions app.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@
import os
import streamlit as st
import sys

__import__("pysqlite3")
sys.modules["sqlite3"] = sys.modules.pop("pysqlite3")
load_dotenv()
Expand All @@ -23,11 +24,13 @@
unsafe_allow_html=True,
)


def on_api_key_change():
    """Propagate the OpenAI API key to the process environment and client.

    The ``"api_key"`` entry of ``ss`` takes precedence; the
    ``OPENAI_API_KEY`` environment variable is used as a fallback. When
    neither source provides a non-empty key, nothing is modified.
    """
    api_key = ss.get("api_key") or os.getenv("OPENAI_API_KEY")
    if not api_key:
        # os.environ only accepts str values; assigning None would raise
        # TypeError, so leave the environment and client untouched.
        return
    os.environ["OPENAI_API_KEY"] = api_key
    openai.api_key = api_key


def save_uploadfile(uploadedfile):
dirpath = os.path.join("data", "lit_dir")
if os.path.exists(dirpath):
Expand All @@ -36,19 +39,22 @@ def save_uploadfile(uploadedfile):
with open(os.path.join(dirpath, uploadedfile.name), "wb") as f:
f.write(uploadedfile.getbuffer())


st.write(
"## Xpert AI: Extract human interpretable structure-property relationships from raw data"
)
st.write(
"""XpertAI trains a surrogate model to your dataset and extracts impactful features from your dataset using XAI tools.
Currently, GPT-4 model is used to generate natural language explanations."""
Currently, GPT-4o model is used to generate natural language explanations."""
)


def run_autofill():
    """Pre-fill the session state with the bundled sample inputs.

    Stores a sample target description under ``auto_target`` and the path
    of the sample CSV under ``auto_df``, then asks Streamlit to rerun the
    script.

    NOTE(review): this function is wired as a button ``on_click`` callback;
    Streamlit documents rerun requests made inside callbacks as no-ops, so
    the final call may have no effect -- confirm before relying on it.
    """
    state = st.session_state
    state["auto_target"] = "toxicity of small molecules"
    state["auto_df"] = "tests/toxicity_sample_data.csv"
    st.experimental_rerun()


auto_target = st.session_state.get("auto_target", None)
auto_arxiv = st.session_state.get("auto_arxiv", None)

Expand Down Expand Up @@ -93,7 +99,8 @@ def run_autofill():
"### Provide literature to generate scientific explanations! \nIf you don't provide literature, you will receive an explanation based on XAI tools."
)
lit_files = st.file_uploader(
"Upload your literature library here (Suggested):", accept_multiple_files=True
"Upload your literature here. Files must be in `pdf` format (Suggested):",
accept_multiple_files=True,
)
arxiv_keywords = st.text_input(
"If you want to scrape arxiv, provide keywords for arxiv scraping:",
Expand All @@ -112,7 +119,7 @@ def run_autofill():
st.markdown(
"""**Make sure to add your OpenAPI key**.
You can download the input dataset after the explanation is generated.
Literature parsing is not used here."""
Literature is not scraped in this case."""
)

auto_button = st.button("Test Run", on_click=run_autofill)
Expand Down Expand Up @@ -174,25 +181,31 @@ def run_autofill():

with st.spinner("Please wait...:computer: :speech_balloon:"):
# read literature
lit_files_given = False
if lit_files:
lit_files_given = True
for file in lit_files:
save_uploadfile(file)
try:
vector_db(
lit_file=os.path.join("./data/lit_dir", file.name),
try_meta_data=True,
clean=True,
)
except BaseException:
st.write("vectordb failed!!")

# scrape arxiv.org

elif arxiv_keywords:
arg_dict_arxiv = {"key_words": arxiv_keywords, "max_papers": max_papers}
if arxiv_keywords:
arg_dict_arxiv = {
"key_words": arxiv_keywords,
"max_papers": max_papers,
"lit_files": lit_files_given,
}

scrape_arxiv(arg_dict_arxiv)

elif not arxiv_keywords and not lit_files:
if not arxiv_keywords and not lit_files:
st.markdown(
f"""### Literature is not provided to make an informed explanation. Based on XAI analysis, the following explanation can be given:
\n{explanation}"""
Expand Down Expand Up @@ -226,4 +239,4 @@ def run_autofill():
with open("./data/figs.zip", "rb") as f:
st.download_button(
"Download the outputs!", f, file_name="XpertAI_output.zip"
)
)
6 changes: 3 additions & 3 deletions requirements.txt
Original file line number Diff line number Diff line change
Expand Up @@ -3,14 +3,14 @@ ipython
python-dotenv
openai==0.28.1
langchain==0.0.330
lime
matplotlib
scikit-learn
scikit-learn==1.4.0
xgboost
shap
pandas
lime
arxiv
chromadb==0.4.7
pysqlite3-binary
tiktoken
tiktoken==0.7.0
pypdf
57 changes: 46 additions & 11 deletions xpertai/prompts.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,27 +9,62 @@
feature list: {features}
observation: {observation}
Your task is to go through {documents} and explain the relationship between the features in {features} and the {observation}
Documents: \n
{documents}
- First, list all features identified by the XAI analysis {features} affecting the {observation}.
format:
### Features Identified by XAI Analysis
- feature 1
- feature 2
...
You are an expert scientist. Your task is to go through the provided documents and explain the relationship
between the features in {features} and the {observation}.
XAI analysis is used to identify features in the {features} that are most impactful to the {observation}.
Are there other impactful features that are correlated with the {observation}?
You can follow the provided draft to answer:
- First, list all features identified by the XAI analysis {features} affecting the {observation}. List additional features that may be correlated with the {observation}.
\n Eg: {features} found from the XAI analysis.
-Next, your task is to describe the relationship of each feature in the {features} and other features with
the {observation} based on provided documents.\n
Do the provided documents explicitly explain how each feature in the {features} explicitly affects the
{observation}? If yes, provide the explanation.
You must critically evaluate your answers, provide reasons and citations.\n
Each claim must be supported by scientific evidence.
In line citations are required. \n eg: <claim (smith et al., 2020)> \n
- Next, describe the relationship of each feature in the {features} and other features with the {observation}. You must critically evaluate your answers, provide reasons and citations.
Eg: solubility of a molecule is affected by the number of hydroxyl groups in the molecule. This is because hydroxyl groups are polar and can form hydrogen bonds with water molecules. (Smith et al., 2019)
Important: If the provided documents do not explicitly state the relationship between the feature
and the observation, you must say "an explicit relationship was not found in the given documents".\n
Instead, do the documents discuss synonymous features that are not identified by the XAI analysis? \n
You can critically evaluate the relationship of such correlated features with the {observation} and generate a
hypothesis based on the information provided in the documents. \n
Important: Each claim/hypothesis must be supported with citations. \n
- Next, explain how each feature in the {features} list affects the {observation} and how the {observation} be altered by changing the features.
Eg: The solubility of a molecule can be increased by adding more hydroxyl groups to the molecule.
Format:
#### <feature>:
**Explanation**: <relationship of feature to the observation>
**Scientific Evidence** <provide scientific evidence/citations from the documents>
**Hypothesis**: <your hypothesis>
- Then, provide a summary of everything you described previously to describe the relationship between these features and the {observation}. You must sound like a scientist.
- Then, provide a summary of everything you described previously to describe the relationship between these
features and the {observation}. You must sound like a scientist.
Give scientific evidence for these answers and provide citations.
- Finally, provide the list of references only used to answer. DO NOT make up references. Use APA style for referencing. \n
- Finally, provide the list of references only used to answer. DO NOT make up references.
Use APA style for referencing. \n
Eg: References: \n
1. reference 1 \n
2. reference 2 \n
...
"""

SUMMARIZE_PROMPT = """You are an expert scientist in chemistry. You are provided with an unformatted text excerpt from a journal article.\n
Your task is to provide a clear and concise summary of the given text in a scientific manner. \n
IMPORTANT: You must not hallucinate extra information. You must only summarize the text provided. \n
text: {text}"""

SUMMARIZE_PROMPT_WITH_QUESTION = """You are an expert scientist in chemistry. You are provided with an unformatted text excerpt from a journal article.\n
Your task is to provide a clear and concise summary of the given text in a scientific manner. \n
The excerpt is related to the following question: {question}
IMPORTANT: You must not hallucinate extra information. You must only summarize the text provided. \n
text: {text}"""
18 changes: 10 additions & 8 deletions xpertai/tools/explain_model.py
Original file line number Diff line number Diff line change
Expand Up @@ -27,19 +27,18 @@ def get_modelsummary(arg_dict):
global clean
clean = True

if not os.path.exists(save_dir):
os.makedirs(save_dir)
# arg_dict = json.loads(json_request)
for k, val in arg_dict.items():
globals()[k] = val

if not os.path.exists(save_dir):
os.makedirs(save_dir)
# Step 1: train model

if model_type == "Classifier":
train_xgbclassifier(df_init)
train_xgbclassifier(df_init, savedir=save_dir)

elif model_type == "Regressor":
train_xgbregressor(df_init)
train_xgbregressor(df_init, savedir=save_dir)

model_path = f"{save_dir}/xgbmodel.json"

Expand All @@ -51,12 +50,15 @@ def get_modelsummary(arg_dict):
classifier = False

top_shap_fts, shap_summary = explain_shap(
df_init=df_init, model_path=model_path, top_k=top_k, classifier=classifier
df_init=df_init,
model_path=model_path,
top_k=top_k,
classifier=classifier,
savedir=save_dir,
)
np.save(f"{save_dir}/top_shap_features.npy", top_shap_fts)
else:
shap_summary = ""
# np.save(f'{save_dir}/top_shap_features.npy',top_fts)

# Step 3: Run Lime
if XAI_tool == "LIME" or XAI_tool == "Both":
Expand All @@ -71,7 +73,7 @@ def get_modelsummary(arg_dict):
f = open(f"{save_dir}/XAI_summary.txt", "w+")
f.write(shap_summary + lime_summary)
f.close()
metadata = {"Authors": "XpertAI", "Year": "2023", "Title": "XAI Summary"}
metadata = {"Authors": "XpertAI", "Year": "2024", "Title": "XAI Summary"}

vector_db(
persist_directory=persist_directory,
Expand Down
69 changes: 47 additions & 22 deletions xpertai/tools/generate_nle.py
Original file line number Diff line number Diff line change
@@ -1,9 +1,10 @@
from langchain.chat_models import ChatOpenAI
from langchain.memory import ConversationBufferMemory, ReadOnlySharedMemory
from langchain import LLMChain, PromptTemplate
from xpertai.prompts import REFINE_PROMPT, FORMAT_LABLES
from xpertai.prompts import REFINE_PROMPT, FORMAT_LABLES, SUMMARIZE_PROMPT
from .utils import *
from langchain.embeddings.openai import OpenAIEmbeddings
import pandas as pd

embedding = OpenAIEmbeddings()

Expand All @@ -25,7 +26,7 @@ def gen_nle(arg_dict):
save_dir = "./data"
global persist_directory
persist_directory = "./data/chroma/"
# arg_dict = json.loads(json_request)

for k, val in arg_dict.items():
globals()[k] = val

Expand All @@ -47,7 +48,7 @@ def gen_nle(arg_dict):
# ****************
# get human interpretable feature labels
# #initiate retriever, chain
llm = ChatOpenAI(temperature=0.0, model_name="gpt-4", request_timeout=1000)
llm = ChatOpenAI(temperature=0.0, model_name="gpt-4o", request_timeout=1000)

prompt_fts = PromptTemplate(template=FORMAT_LABLES, input_variables=["label"])
memory = ConversationBufferMemory(memory_key="chat_history")
Expand All @@ -63,28 +64,52 @@ def gen_nle(arg_dict):
features = ",".join(new_labels)
db = Chroma(persist_directory=persist_directory, embedding_function=embedding)
docs = []
rows = []
# first collect docs for each feature
documents = ""
for feature in new_labels:
initial_question = f"""How does the {feature} impact the {observation}?"""
initial_question = f"""How is {feature} related to {observation}?"""
# Get relevant docs
fetched = db.max_marginal_relevance_search(initial_question, k=3)
docs.append(fetched)

# flatten list of docs
docs = [item for sublist in docs for item in sublist]

# add citations from metadata
documents = ""
for i in range(len(docs)):
doc = docs[i].page_content
try:
authors = docs[i].metadata["authors"]
year = docs[i].metadata["year"]
title = docs[i].metadata["source"]
documents += f"{doc} REFERENCE:({authors},{year},{title}) \n\n"

except BaseException:
documents += f"{doc} \n\n"
fetched = db.max_marginal_relevance_search(initial_question, k=5)
for document in fetched:
doc = document.page_content
summarize_prompt = PromptTemplate(
template=SUMMARIZE_PROMPT, input_variables=["text"]
)
summarize_chain = LLMChain(prompt=summarize_prompt, llm=llm)
summary = summarize_chain.run({"text": doc})

try:
authors = document.metadata["authors"]
year = document.metadata["year"]
title = document.metadata["source"]
reference = f"REFERENCE:({authors},{year},{title})"
documents += f"{summary} ({reference}) \n\n"
rows.append(
{
"feature": feature,
"original": doc,
"summary": summary,
"reference": reference,
}
)

except BaseException:
documents += f"{summary} \n\n"
rows.append(
{
"feature": feature,
"original": doc,
"summary": summary,
"reference": "No reference found",
}
)

# write to csv
# df = pd.DataFrame(rows)
# df.to_csv(f"{supporting_csv}", index=False)

# docs.append(fetched)

prompt = PromptTemplate(
template=REFINE_PROMPT, input_variables=["documents", "features", "observation"]
Expand Down
Loading

0 comments on commit 34e7c84

Please sign in to comment.