Fixing Ninja checking and adding qa to test_hipporag demo #23

Merged: 8 commits, Jun 20, 2024
22 changes: 16 additions & 6 deletions README.md
@@ -15,9 +15,19 @@ conda create -n hipporag python=3.9
conda activate hipporag
pip install -r requirements.txt

GPU_DEVICES=0,1,2,3 #Replace with your own free GPU Devices
GPUS=0,1,2,3 #Replace with your own free GPU Devices
```

Add the conda env to your PATH as follows, where `/path/HippoRAG` is the root of HippoRAG and `/path/HippoRAG/hipporag` is the path to the conda env. Consider adding this line to your `~/.bashrc`.
```shell
export PATH=$PATH:/path/HippoRAG/hipporag/bin
```

Set up the LLM API keys: `TOGETHER_API_KEY` is optional; set it only when you want to use TogetherAI's open-source models (e.g., Llama-3).

```shell
export OPENAI_API_KEY='Add your own OpenAI API key here.'
export TOGETHER_API_KEY='Add your own TogetherAI API key here.' # If you need to use TogetherAI models such as Llama-3 API
export TOGETHER_API_KEY='Add your own TogetherAI API key here.'
```

To use ColBERTv2, download the pre-trained [checkpoint](https://downloads.cs.stanford.edu/nlp/data/colbert/colbertv2/colbertv2.0.tar.gz) and put it under `exp/colbertv2.0`.
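For convenience, here is a minimal, hypothetical sketch (not part of the repo) for scripting that download, assuming the URL above stays valid and the archive unpacks to a `colbertv2.0/` folder:

```python
import os
import tarfile
import urllib.request

# Hypothetical helper, not part of HippoRAG: fetch and unpack the ColBERTv2 checkpoint.
CKPT_URL = "https://downloads.cs.stanford.edu/nlp/data/colbert/colbertv2/colbertv2.0.tar.gz"
TARGET_DIR = "exp"  # the README expects the checkpoint at exp/colbertv2.0

os.makedirs(TARGET_DIR, exist_ok=True)
archive_path = os.path.join(TARGET_DIR, "colbertv2.0.tar.gz")

if not os.path.isdir(os.path.join(TARGET_DIR, "colbertv2.0")):
    urllib.request.urlretrieve(CKPT_URL, archive_path)  # download the tarball
    with tarfile.open(archive_path, "r:gz") as tar:
        tar.extractall(TARGET_DIR)  # should create exp/colbertv2.0/
```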
@@ -211,7 +221,7 @@ Using our HippoRAG framework requires a two-step process, indexing and retrieval
To run indexing for both our main experiments and our ablations, run the following bash scripts. Retrieval will fail if this step does not succeed.

```shell
bash src/setup_hipporag_main_exps.sh $GPU_DEVICES
bash src/setup_hipporag_main_exps.sh $GPUS
```

#### HippoRAG Retrieval
@@ -234,7 +244,7 @@ bash src/run_hipporag_ircot_main_exps.sh
To run all our ablations, run the following bash scripts:

```shell
bash src/setup_hipporag_ablations.sh $GPU_DEVICES
bash src/setup_hipporag_ablations.sh $GPUS
bash src/run_hipporag_ablations.sh
```

@@ -243,7 +253,7 @@ bash src/run_hipporag_ablations.sh
To reproduce our hyperparameter tuning, we must first run indexing on the MuSiQue training subset by running the following script:

```shell
bash src/setup_hipporag_hyperparameter_tune.sh $GPU_DEVICES
bash src/setup_hipporag_hyperparameter_tune.sh $GPUS
```

After indexing is completed, run the following script and note the performance of each hyperparameter combination tested.
@@ -263,7 +273,7 @@ To run the case study examples shown in our paper, which we also include in our
#### Indexing

```shell
bash src/setup_hipporag_case_study.sh $GPU_DEVICES
bash src/setup_hipporag_case_study.sh $GPUS
```

#### Retrieval
13 changes: 2 additions & 11 deletions src/RetrievalModule.py
@@ -3,13 +3,13 @@
from glob import glob
import os.path

import ipdb
import pandas as pd

import pickle
import numpy as np
import os
from tqdm import tqdm
import torch

import faiss
import gc
@@ -21,12 +21,6 @@
VECTOR_DIR = 'data/lm_vectors'


def mean_pooling(token_embeddings, mask):
token_embeddings = token_embeddings.masked_fill(~mask[..., None].bool(), 0.)
sentence_embeddings = token_embeddings.sum(dim=1) / mask.sum(dim=1)[..., None]
return sentence_embeddings


class RetrievalModule:
"""
Class designed to retrieve potential synonymy candidates for a set of UMLS terms from a set of entities.
@@ -121,7 +115,6 @@ def create_sorted_df(self, strings):
return lengths_df.sort_values(0)

def save_vecs(self, strings, vectors, direc_name, bin_size=50000):

with open(direc_name + '/encoded_strings.txt', 'w') as f:
for string in strings:
f.write(string + '\n')
@@ -379,11 +372,9 @@ def retrieve_knn(self, queries, knowledge_base, k=2047):
return sorted_candidate_dictionary


import sys

if __name__ == '__main__':
parser = argparse.ArgumentParser()
parser.add_argument('--retriever_name', type=str)
parser.add_argument('--retriever_name', type=str, help='retrieval model name, e.g., "facebook/contriever"')
parser.add_argument('--string_filename', type=str)
parser.add_argument('--pool_method', type=str, default='mean')

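The new help text points at a Hugging Face model id such as `facebook/contriever`. As context, a minimal sketch of loading that retriever and mean-pooling its outputs (illustrative only, not necessarily how `RetrievalModule` wires it up; the example terms are made up):

```python
import torch
from transformers import AutoModel, AutoTokenizer

# 'facebook/contriever' is the model id suggested by the new --retriever_name help text.
tokenizer = AutoTokenizer.from_pretrained('facebook/contriever')
model = AutoModel.from_pretrained('facebook/contriever')

terms = ['knee pain', 'gonalgia']  # toy synonymy candidates
inputs = tokenizer(terms, padding=True, truncation=True, return_tensors='pt')
with torch.no_grad():
    outputs = model(**inputs)

# pool_method='mean': average token embeddings, ignoring padded positions.
mask = inputs['attention_mask']
emb = outputs[0].masked_fill(~mask[..., None].bool(), 0.)
emb = emb.sum(dim=1) / mask.sum(dim=1)[..., None]
print(emb.shape)  # (2, hidden_size)
```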
32 changes: 0 additions & 32 deletions src/baselines/__init__.py
@@ -1,32 +0,0 @@
import torch


def mean_pooling(token_embeddings, mask):
token_embeddings = token_embeddings.masked_fill(~mask[..., None].bool(), 0.)
sentence_embeddings = token_embeddings.sum(dim=1) / mask.sum(dim=1)[..., None]
return sentence_embeddings


def get_file_name(path):
return path.split('/')[-1].replace('.jsonl', '').replace('.json', '')


def mean_pooling_embedding(input_str: str, tokenizer, model, device='cuda'):
inputs = tokenizer(input_str, padding=True, truncation=True, return_tensors='pt').to(device)
outputs = model(**inputs)

embedding = mean_pooling(outputs[0], inputs['attention_mask']).to('cpu').detach().numpy()
return embedding


def mean_pooling_embedding_with_normalization(input_str, tokenizer, model, device='cuda'):
encoding = tokenizer(input_str, return_tensors='pt', padding=True, truncation=True)
input_ids = encoding['input_ids']
attention_mask = encoding['attention_mask']
input_ids = input_ids.to(device)
attention_mask = attention_mask.to(device)
outputs = model(input_ids, attention_mask=attention_mask)
embeddings = mean_pooling(outputs[0], attention_mask)
embeddings = embeddings.T.divide(torch.linalg.norm(embeddings, dim=1)).T

return embeddings
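These helpers are deleted here because the diffs below now import them from `src.processing` instead of `src.baselines`. As a quick reminder of what the masked mean pooling computes, a self-contained sketch with toy tensors (no model needed):

```python
import torch

def mean_pooling(token_embeddings, mask):
    # Zero out padded positions, then average the remaining token vectors per sequence.
    token_embeddings = token_embeddings.masked_fill(~mask[..., None].bool(), 0.)
    return token_embeddings.sum(dim=1) / mask.sum(dim=1)[..., None]

# Toy batch: 2 sequences, 3 tokens each, 4-dim embeddings; the second sequence has one pad token.
emb = torch.arange(24, dtype=torch.float32).reshape(2, 3, 4)
mask = torch.tensor([[1, 1, 1], [1, 1, 0]])

pooled = mean_pooling(emb, mask)
print(pooled.shape)  # torch.Size([2, 4]); the pad position does not contribute to the average
```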
22 changes: 13 additions & 9 deletions src/baselines/create_colbertv2_index.py
@@ -1,10 +1,21 @@
import argparse
import json
import os.path

from colbert import Indexer
from colbert.infra import Run, RunConfig, ColBERTConfig


def run_colbertv2_index(dataset_name: str, index_name: str, corpus_tsv_path: str, checkpoint_path='exp/colbertv2.0', overwrite=False):
with Run().context(RunConfig(nranks=1, experiment="colbert", root=f"exp/{dataset_name}/")):
config = ColBERTConfig(
nbits=2,
root=f"exp/{dataset_name}/colbert",
)
indexer = Indexer(checkpoint=checkpoint_path, config=config)
indexer.index(name=index_name, collection=corpus_tsv_path, overwrite=overwrite)
print(f'Indexing done for dataset {dataset_name}, index {index_name}')


if __name__ == '__main__':
parser = argparse.ArgumentParser()
parser.add_argument('--corpus', type=str)
@@ -45,11 +56,4 @@
f.write(f"{pid}\t\"{p}\"" + '\n')
print(f'Corpus tsv saved: {corpus_tsv_path}', len(corpus_contents))

with Run().context(RunConfig(nranks=1, experiment="colbert", root=f"exp/{args.dataset}/")):
config = ColBERTConfig(
nbits=2,
root=f"exp/{args.dataset}/colbert",
)
indexer = Indexer(checkpoint=checkpoint_path, config=config)
indexer.index(name=f"{args.corpus}_nbits_2", collection=corpus_tsv_path, overwrite=True)
print(f'Indexing done for {args.corpus}_nbits_2')
run_colbertv2_index(args.dataset, args.corpus + '_nbits_2', corpus_tsv_path, 'exp/colbertv2.0', overwrite=True)
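With the indexing logic factored into `run_colbertv2_index`, it can also be called from other code. A hypothetical call (the dataset name and TSV path are illustrative, and it assumes the repo root is on `PYTHONPATH`):

```python
from src.baselines.create_colbertv2_index import run_colbertv2_index

# Illustrative names only; the corpus TSV must already exist in pid\t"passage" format.
run_colbertv2_index(
    dataset_name='musique',
    index_name='musique_corpus_nbits_2',
    corpus_tsv_path='data/lm_vectors/colbert/musique_corpus.tsv',
    checkpoint_path='exp/colbertv2.0',
    overwrite=True,
)
```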
2 changes: 1 addition & 1 deletion src/baselines/ircot.py
Expand Up @@ -6,7 +6,7 @@
from langchain_core.prompts import ChatPromptTemplate

from src.langchain_util import init_langchain_model, num_tokens_by_tiktoken
from src.baselines import mean_pooling_embedding_with_normalization
from src.processing import mean_pooling_embedding_with_normalization
from src.elastic_search_tool import search_with_score
import numpy as np
from sentence_transformers import SentenceTransformer
3 changes: 2 additions & 1 deletion src/baselines/mean_pooling_ip_faiss.py
@@ -1,6 +1,8 @@
import os
import sys

from src.processing import mean_pooling, mean_pooling_embedding_with_normalization

sys.path.append('.')

import argparse
@@ -11,7 +13,6 @@
import torch
from tqdm import tqdm
from transformers import AutoTokenizer, AutoModel
from src.baselines import mean_pooling_embedding_with_normalization, mean_pooling

if __name__ == '__main__':
parser = argparse.ArgumentParser()
55 changes: 27 additions & 28 deletions src/colbertv2_indexing.py
@@ -2,11 +2,35 @@
import json
import pickle

import ipdb
import numpy as np
from colbert import Indexer
from colbert.infra import Run, RunConfig, ColBERTConfig


def colbertv2_index(corpus: list, dataset_name: str, exp_name: str, index_name='nbits_2', checkpoint_path='exp/colbertv2.0', overwrite='reuse'):
"""
Indexing corpus and phrases using colbertv2
@param corpus:
@return:
"""
corpus_processed = [x.replace('\n', '\t') for x in corpus]

corpus_tsv_file_path = f'data/lm_vectors/colbert/{dataset_name}_{exp_name}_{len(corpus_processed)}.tsv'
with open(corpus_tsv_file_path, 'w') as f: # save to tsv
for pid, p in enumerate(corpus_processed):
f.write(f"{pid}\t\"{p}\"" + '\n')
root_path = f'data/lm_vectors/colbert/{dataset_name}'

# indexing corpus
with Run().context(RunConfig(nranks=1, experiment=exp_name, root=root_path)):
config = ColBERTConfig(
nbits=2,
root=root_path,
)
indexer = Indexer(checkpoint=checkpoint_path, config=config)
indexer.index(name=index_name, collection=corpus_tsv_file_path, overwrite=overwrite)


if __name__ == '__main__':
parser = argparse.ArgumentParser()
parser.add_argument('--dataset', type=str)
@@ -23,34 +47,9 @@
else:
corpus_contents = [x['title'] + ' ' + x['text'].replace('\n', ' ') for x in corpus]

corpus_tsv_file_path = f'data/lm_vectors/colbert/{args.dataset}_corpus_{len(corpus_contents)}.tsv'
with open(corpus_tsv_file_path, 'w') as f: # save to tsv
for pid, p in enumerate(corpus_contents):
f.write(f"{pid}\t\"{p}\"" + '\n')

root_path = f'data/lm_vectors/colbert/{args.dataset}'
# indexing corpus
with Run().context(RunConfig(nranks=1, experiment='corpus', root=root_path)):
config = ColBERTConfig(
nbits=2,
root=root_path,
)
indexer = Indexer(checkpoint=checkpoint_path, config=config)
indexer.index(name=f"nbits_2", collection=corpus_tsv_file_path, overwrite=True)
colbertv2_index(corpus_contents, args.dataset, 'corpus', checkpoint_path=checkpoint_path, overwrite=True)  # keyword arg: the fourth positional parameter is index_name, not checkpoint_path

kb_phrase_dict = pickle.load(open(args.phrase, 'rb'))
phrases = np.array(list(kb_phrase_dict.keys()))[np.argsort(list(kb_phrase_dict.values()))]
phrases = phrases.tolist()
# get phrases tsv
phrases_tsv_file_path = f'data/lm_vectors/colbert/{args.dataset}_phrases_{len(phrases)}.tsv'
with open(phrases_tsv_file_path, 'w') as f: # save to tsv
for pid, p in enumerate(phrases):
f.write(f"{pid}\t\"{p}\"" + '\n')
# indexing phrases
with Run().context(RunConfig(nranks=1, experiment='phrase', root=root_path)):
config = ColBERTConfig(
nbits=2,
root=root_path,
)
indexer = Indexer(checkpoint=checkpoint_path, config=config)
indexer.index(name=f"nbits_2", collection=phrases, overwrite=True)
colbertv2_index(phrases, args.dataset, 'phrase', checkpoint_path=checkpoint_path, overwrite=True)  # keyword arg for the same reason as above
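Similarly, the new `colbertv2_index` helper in this file can be reused directly. A minimal sketch with a toy corpus (the dataset name is illustrative; it assumes the ColBERTv2 checkpoint is at `exp/colbertv2.0`, that `data/lm_vectors/colbert/` exists, and that the repo root is on `PYTHONPATH`):

```python
from src.colbertv2_indexing import colbertv2_index

# Toy two-passage corpus; the real scripts load this from the dataset's corpus JSON.
docs = [
    "George Washington was the first president of the United States.",
    "HippoRAG builds a knowledge graph over phrases extracted from the corpus.",
]

colbertv2_index(
    docs,
    dataset_name='demo',        # illustrative dataset name
    exp_name='corpus',
    index_name='nbits_2',
    checkpoint_path='exp/colbertv2.0',
    overwrite=True,
)
```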