Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add remove_from_index method #66

Open
wants to merge 1 commit into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
11 changes: 11 additions & 0 deletions byaldi/RAGModel.py
Original file line number Diff line number Diff line change
Expand Up @@ -155,6 +155,17 @@ def add_to_index(
input_item, store_collection_with_index, doc_id, metadata=metadata
)

def remove_from_index(self, doc_id: int):
"""Remove an item to an existing index.

Parameters:
doc_id (int): The document ID for the item being removed.

Returns:
None
"""
return self.model.remove_from_index(doc_id)

def search(
self,
query: Union[str, List[str]],
Expand Down
58 changes: 56 additions & 2 deletions byaldi/colpali.py
Original file line number Diff line number Diff line change
Expand Up @@ -589,8 +589,62 @@ def _add_to_index(
if self.verbose > 0:
print(f"Added page {page_id} of document {doc_id} to index.")

def remove_from_index(self):
raise NotImplementedError("This method is not implemented yet.")
def remove_from_index(self, doc_id: int) -> Dict[int, str]:
if self.index_name is None:
raise ValueError(
"No index loaded. Use index() to create or load an index first."
)

if not hasattr(self, "doc_ids_to_file_names"):
raise ValueError("No documents in the index.")

if doc_id not in self.doc_ids_to_file_names:
raise ValueError(f"Document ID {doc_id} does not exist in the index.")

# Remove the document from doc_ids_to_file_names
del self.doc_ids_to_file_names[doc_id]

# Remove associated embeddings
embed_ids_to_remove = [
embed_id
for embed_id, doc_info in self.embed_id_to_doc_id.items()
if doc_info["doc_id"] == doc_id
]
self.indexed_embeddings = [
embedding
for i, embedding in enumerate(self.indexed_embeddings)
if i not in embed_ids_to_remove
]

# Create a new dictionary for embed_id_to_doc_id excluding entries in embed_ids_to_remove
remaining_embed_id_to_doc_id = {
embed_id: self.embed_id_to_doc_id[embed_id]
for embed_id in self.embed_id_to_doc_id
if embed_id not in embed_ids_to_remove
}

# Re-index the remaining items in embed_id_to_doc_id
self.embed_id_to_doc_id = {
new_id: doc_info
for new_id, doc_info in enumerate(remaining_embed_id_to_doc_id.values())
}

# Remove associated entries from the collection if they exist
if hasattr(self, "collection"):
for embed_id in embed_ids_to_remove:
if embed_id in self.collection:
del self.collection[embed_id]

# Remove metadata if it exists
if hasattr(self, "doc_id_to_metadata") and doc_id in self.doc_id_to_metadata:
del self.doc_id_to_metadata[doc_id]

# Update the highest_doc_id if necessary
if doc_id == self.highest_doc_id:
self.highest_doc_id = max(self.doc_ids_to_file_names.keys(), default=-1)

self._export_index()
return self.doc_ids_to_file_names

def filter_embeddings(self,filter_metadata:Dict[str,str]):
req_doc_ids = []
Expand Down
Loading