Fixing Ninja checking and adding qa to test_hipporag demo #23

Merged: 8 commits, Jun 20, 2024
22 changes: 16 additions & 6 deletions README.md
@@ -15,9 +15,19 @@ conda create -n hipporag python=3.9
conda activate hipporag
pip install -r requirements.txt

GPU_DEVICES=0,1,2,3 #Replace with your own free GPU Devices
GPUS=0,1,2,3 #Replace with your own free GPU Devices
```

Add the conda env to your PATH as follows, where `/path/HippoRAG` is the root of HippoRAG and `/path/HippoRAG/hipporag` is the path to the conda env. Consider adding this line to your `~/.bashrc`.
```shell
export PATH=$PATH:/path/HippoRAG/hipporag/bin
```

Set up the LLM API keys: `TOGETHER_API_KEY` is optional; set it only when you want to use TogetherAI's open-source models (e.g., Llama-3).

```shell
export OPENAI_API_KEY='Add your own OpenAI API key here.'
export TOGETHER_API_KEY='Add your own TogetherAI API key here.' # If you need to use TogetherAI models such as Llama-3 API
export TOGETHER_API_KEY='Add your own TogetherAI API key here.'
```

To use ColBERTv2, download the pre-trained [checkpoint](https://downloads.cs.stanford.edu/nlp/data/colbert/colbertv2/colbertv2.0.tar.gz) and put it under `exp/colbertv2.0`.
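For convenience, here is a minimal, hypothetical sketch (not part of the repo) for scripting that download, assuming the URL above stays valid and the archive unpacks to a `colbertv2.0/` folder:

```python
import os
import tarfile
import urllib.request

# Hypothetical helper, not part of HippoRAG: fetch and unpack the ColBERTv2 checkpoint.
CKPT_URL = "https://downloads.cs.stanford.edu/nlp/data/colbert/colbertv2/colbertv2.0.tar.gz"
TARGET_DIR = "exp"  # the README expects the checkpoint at exp/colbertv2.0

os.makedirs(TARGET_DIR, exist_ok=True)
archive_path = os.path.join(TARGET_DIR, "colbertv2.0.tar.gz")

if not os.path.isdir(os.path.join(TARGET_DIR, "colbertv2.0")):
    urllib.request.urlretrieve(CKPT_URL, archive_path)  # download the tarball
    with tarfile.open(archive_path, "r:gz") as tar:
        tar.extractall(TARGET_DIR)  # should create exp/colbertv2.0/
```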
@@ -211,7 +221,7 @@ Using our HippoRAG framework requires a two-step process, indexing and retrieval
To run indexing for both our main experiments and our ablations, run the following bash scripts. Retrieval will fail if this step does not succeed.

```shell
bash src/setup_hipporag_main_exps.sh $GPU_DEVICES
bash src/setup_hipporag_main_exps.sh $GPUS
```

#### HippoRAG Retrieval
@@ -234,7 +244,7 @@ bash src/run_hipporag_ircot_main_exps.sh
To run all our ablations, run the following bash scripts:

```shell
bash src/setup_hipporag_ablations.sh $GPU_DEVICES
bash src/setup_hipporag_ablations.sh $GPUS
bash src/run_hipporag_ablations.sh
```

@@ -243,7 +253,7 @@ bash src/run_hipporag_ablations.sh
To reproduce our hyperparameter tuning, we must first run indexing on the MuSiQue training subset by running the following script:

```shell
bash src/setup_hipporag_hyperparameter_tune.sh $GPU_DEVICES
bash src/setup_hipporag_hyperparameter_tune.sh $GPUS
```

After indexing is completed, run the following script and note the performance of each hyperparameter combination tested.
@@ -263,7 +273,7 @@ To run the case study examples shown in our paper, which we also include in our
#### Indexing

```shell
bash src/setup_hipporag_case_study.sh $GPU_DEVICES
bash src/setup_hipporag_case_study.sh $GPUS
```

#### Retrieval
13 changes: 2 additions & 11 deletions src/RetrievalModule.py
@@ -3,13 +3,13 @@
from glob import glob
import os.path

import ipdb
import pandas as pd

import pickle
import numpy as np
import os
from tqdm import tqdm
import torch

import faiss
import gc
@@ -21,12 +21,6 @@
VECTOR_DIR = 'data/lm_vectors'


def mean_pooling(token_embeddings, mask):
token_embeddings = token_embeddings.masked_fill(~mask[..., None].bool(), 0.)
sentence_embeddings = token_embeddings.sum(dim=1) / mask.sum(dim=1)[..., None]
return sentence_embeddings


class RetrievalModule:
"""
Class designed to retrieve potential synonymy candidates for a set of UMLS terms from a set of entities.
@@ -121,7 +115,6 @@ def create_sorted_df(self, strings):
return lengths_df.sort_values(0)

def save_vecs(self, strings, vectors, direc_name, bin_size=50000):

with open(direc_name + '/encoded_strings.txt', 'w') as f:
for string in strings:
f.write(string + '\n')
@@ -379,11 +372,9 @@ def retrieve_knn(self, queries, knowledge_base, k=2047):
return sorted_candidate_dictionary


import sys

if __name__ == '__main__':
parser = argparse.ArgumentParser()
parser.add_argument('--retriever_name', type=str)
parser.add_argument('--retriever_name', type=str, help='retrieval model name, e.g., "facebook/contriever"')
parser.add_argument('--string_filename', type=str)
parser.add_argument('--pool_method', type=str, default='mean')

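The new help text points at a Hugging Face model id such as `facebook/contriever`. As context, a minimal sketch of loading that retriever and mean-pooling its outputs (illustrative only, not necessarily how `RetrievalModule` wires it up; the example terms are made up):

```python
import torch
from transformers import AutoModel, AutoTokenizer

# 'facebook/contriever' is the model id suggested by the new --retriever_name help text.
tokenizer = AutoTokenizer.from_pretrained('facebook/contriever')
model = AutoModel.from_pretrained('facebook/contriever')

terms = ['knee pain', 'gonalgia']  # toy synonymy candidates
inputs = tokenizer(terms, padding=True, truncation=True, return_tensors='pt')
with torch.no_grad():
    outputs = model(**inputs)

# pool_method='mean': average token embeddings, ignoring padded positions.
mask = inputs['attention_mask']
emb = outputs[0].masked_fill(~mask[..., None].bool(), 0.)
emb = emb.sum(dim=1) / mask.sum(dim=1)[..., None]
print(emb.shape)  # (2, hidden_size)
```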
32 changes: 0 additions & 32 deletions src/baselines/__init__.py
@@ -1,32 +0,0 @@
import torch


def mean_pooling(token_embeddings, mask):
token_embeddings = token_embeddings.masked_fill(~mask[..., None].bool(), 0.)
sentence_embeddings = token_embeddings.sum(dim=1) / mask.sum(dim=1)[..., None]
return sentence_embeddings


def get_file_name(path):
return path.split('/')[-1].replace('.jsonl', '').replace('.json', '')


def mean_pooling_embedding(input_str: str, tokenizer, model, device='cuda'):
inputs = tokenizer(input_str, padding=True, truncation=True, return_tensors='pt').to(device)
outputs = model(**inputs)

embedding = mean_pooling(outputs[0], inputs['attention_mask']).to('cpu').detach().numpy()
return embedding


def mean_pooling_embedding_with_normalization(input_str, tokenizer, model, device='cuda'):
encoding = tokenizer(input_str, return_tensors='pt', padding=True, truncation=True)
input_ids = encoding['input_ids']
attention_mask = encoding['attention_mask']
input_ids = input_ids.to(device)
attention_mask = attention_mask.to(device)
outputs = model(input_ids, attention_mask=attention_mask)
embeddings = mean_pooling(outputs[0], attention_mask)
embeddings = embeddings.T.divide(torch.linalg.norm(embeddings, dim=1)).T

return embeddings
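These helpers are deleted here because the diffs below now import them from `src.processing` instead of `src.baselines`. As a quick reminder of what the masked mean pooling computes, a self-contained sketch with toy tensors (no model needed):

```python
import torch

def mean_pooling(token_embeddings, mask):
    # Zero out padded positions, then average the remaining token vectors per sequence.
    token_embeddings = token_embeddings.masked_fill(~mask[..., None].bool(), 0.)
    return token_embeddings.sum(dim=1) / mask.sum(dim=1)[..., None]

# Toy batch: 2 sequences, 3 tokens each, 4-dim embeddings; the second sequence has one pad token.
emb = torch.arange(24, dtype=torch.float32).reshape(2, 3, 4)
mask = torch.tensor([[1, 1, 1], [1, 1, 0]])

pooled = mean_pooling(emb, mask)
print(pooled.shape)  # torch.Size([2, 4]); the pad position does not contribute to the average
```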
22 changes: 13 additions & 9 deletions src/baselines/create_colbertv2_index.py
@@ -1,10 +1,21 @@
import argparse
import json
import os.path

from colbert import Indexer
from colbert.infra import Run, RunConfig, ColBERTConfig


def run_colbertv2_index(dataset_name: str, index_name: str, corpus_tsv_path: str, checkpoint_path='exp/colbertv2.0', overwrite=False):
with Run().context(RunConfig(nranks=1, experiment="colbert", root=f"exp/{dataset_name}/")):
config = ColBERTConfig(
nbits=2,
root=f"exp/{dataset_name}/colbert",
)
indexer = Indexer(checkpoint=checkpoint_path, config=config)
indexer.index(name=index_name, collection=corpus_tsv_path, overwrite=overwrite)
print(f'Indexing done for dataset {dataset_name}, index {index_name}')


if __name__ == '__main__':
parser = argparse.ArgumentParser()
parser.add_argument('--corpus', type=str)
@@ -45,11 +56,4 @@
f.write(f"{pid}\t\"{p}\"" + '\n')
print(f'Corpus tsv saved: {corpus_tsv_path}', len(corpus_contents))

with Run().context(RunConfig(nranks=1, experiment="colbert", root=f"exp/{args.dataset}/")):
config = ColBERTConfig(
nbits=2,
root=f"exp/{args.dataset}/colbert",
)
indexer = Indexer(checkpoint=checkpoint_path, config=config)
indexer.index(name=f"{args.corpus}_nbits_2", collection=corpus_tsv_path, overwrite=True)
print(f'Indexing done for {args.corpus}_nbits_2')
run_colbertv2_index(args.dataset, args.corpus + '_nbits_2', corpus_tsv_path, 'exp/colbertv2.0', overwrite=True)
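With the indexing logic factored into `run_colbertv2_index`, it can also be called from other code. A hypothetical call (the dataset name and TSV path are illustrative, and it assumes the repo root is on `PYTHONPATH`):

```python
from src.baselines.create_colbertv2_index import run_colbertv2_index

# Illustrative names only; the corpus TSV must already exist in pid\t"passage" format.
run_colbertv2_index(
    dataset_name='musique',
    index_name='musique_corpus_nbits_2',
    corpus_tsv_path='data/lm_vectors/colbert/musique_corpus.tsv',
    checkpoint_path='exp/colbertv2.0',
    overwrite=True,
)
```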
2 changes: 1 addition & 1 deletion src/baselines/ircot.py
Expand Up @@ -6,7 +6,7 @@
from langchain_core.prompts import ChatPromptTemplate

from src.langchain_util import init_langchain_model, num_tokens_by_tiktoken
from src.baselines import mean_pooling_embedding_with_normalization
from src.processing import mean_pooling_embedding_with_normalization
from src.elastic_search_tool import search_with_score
import numpy as np
from sentence_transformers import SentenceTransformer
3 changes: 2 additions & 1 deletion src/baselines/mean_pooling_ip_faiss.py
@@ -1,6 +1,8 @@
import os
import sys

from src.processing import mean_pooling, mean_pooling_embedding_with_normalization

sys.path.append('.')

import argparse
@@ -11,7 +13,6 @@
import torch
from tqdm import tqdm
from transformers import AutoTokenizer, AutoModel
from src.baselines import mean_pooling_embedding_with_normalization, mean_pooling

if __name__ == '__main__':
parser = argparse.ArgumentParser()
55 changes: 27 additions & 28 deletions src/colbertv2_indexing.py
@@ -2,11 +2,35 @@
import json
import pickle

import ipdb
import numpy as np
from colbert import Indexer
from colbert.infra import Run, RunConfig, ColBERTConfig


def colbertv2_index(corpus: list, dataset_name: str, exp_name: str, index_name='nbits_2', checkpoint_path='exp/colbertv2.0', overwrite='reuse'):
"""
Indexing corpus and phrases using colbertv2
@param corpus:
@return:
"""
corpus_processed = [x.replace('\n', '\t') for x in corpus]

corpus_tsv_file_path = f'data/lm_vectors/colbert/{dataset_name}_{exp_name}_{len(corpus_processed)}.tsv'
with open(corpus_tsv_file_path, 'w') as f: # save to tsv
for pid, p in enumerate(corpus_processed):
f.write(f"{pid}\t\"{p}\"" + '\n')
root_path = f'data/lm_vectors/colbert/{dataset_name}'

# indexing corpus
with Run().context(RunConfig(nranks=1, experiment=exp_name, root=root_path)):
config = ColBERTConfig(
nbits=2,
root=root_path,
)
indexer = Indexer(checkpoint=checkpoint_path, config=config)
indexer.index(name=index_name, collection=corpus_tsv_file_path, overwrite=overwrite)


if __name__ == '__main__':
parser = argparse.ArgumentParser()
parser.add_argument('--dataset', type=str)
@@ -23,34 +47,9 @@
else:
corpus_contents = [x['title'] + ' ' + x['text'].replace('\n', ' ') for x in corpus]

corpus_tsv_file_path = f'data/lm_vectors/colbert/{args.dataset}_corpus_{len(corpus_contents)}.tsv'
with open(corpus_tsv_file_path, 'w') as f: # save to tsv
for pid, p in enumerate(corpus_contents):
f.write(f"{pid}\t\"{p}\"" + '\n')

root_path = f'data/lm_vectors/colbert/{args.dataset}'
# indexing corpus
with Run().context(RunConfig(nranks=1, experiment='corpus', root=root_path)):
config = ColBERTConfig(
nbits=2,
root=root_path,
)
indexer = Indexer(checkpoint=checkpoint_path, config=config)
indexer.index(name=f"nbits_2", collection=corpus_tsv_file_path, overwrite=True)
colbertv2_index(corpus_contents, args.dataset, 'corpus', checkpoint_path=checkpoint_path, overwrite=True)  # keyword arg: the fourth positional parameter is index_name, not checkpoint_path

kb_phrase_dict = pickle.load(open(args.phrase, 'rb'))
phrases = np.array(list(kb_phrase_dict.keys()))[np.argsort(list(kb_phrase_dict.values()))]
phrases = phrases.tolist()
# get phrases tsv
phrases_tsv_file_path = f'data/lm_vectors/colbert/{args.dataset}_phrases_{len(phrases)}.tsv'
with open(phrases_tsv_file_path, 'w') as f: # save to tsv
for pid, p in enumerate(phrases):
f.write(f"{pid}\t\"{p}\"" + '\n')
# indexing phrases
with Run().context(RunConfig(nranks=1, experiment='phrase', root=root_path)):
config = ColBERTConfig(
nbits=2,
root=root_path,
)
indexer = Indexer(checkpoint=checkpoint_path, config=config)
indexer.index(name=f"nbits_2", collection=phrases, overwrite=True)
colbertv2_index(phrases, args.dataset, 'phrase', checkpoint_path=checkpoint_path, overwrite=True)  # keyword arg for the same reason as above
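Similarly, the new `colbertv2_index` helper in this file can be reused directly. A minimal sketch with a toy corpus (the dataset name is illustrative; it assumes the ColBERTv2 checkpoint is at `exp/colbertv2.0`, that `data/lm_vectors/colbert/` exists, and that the repo root is on `PYTHONPATH`):

```python
from src.colbertv2_indexing import colbertv2_index

# Toy two-passage corpus; the real scripts load this from the dataset's corpus JSON.
docs = [
    "George Washington was the first president of the United States.",
    "HippoRAG builds a knowledge graph over phrases extracted from the corpus.",
]

colbertv2_index(
    docs,
    dataset_name='demo',        # illustrative dataset name
    exp_name='corpus',
    index_name='nbits_2',
    checkpoint_path='exp/colbertv2.0',
    overwrite=True,
)
```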