Skip to content

Commit

Permalink
Fix GitHub reindex (#167)
Browse files Browse the repository at this point in the history
* Ignore non-existent doc_ids while deleting GitHub files and log them as errors

---------

Co-authored-by: aralyekta <[email protected]>
  • Loading branch information
kursataktas and aralyekta authored Mar 5, 2025
1 parent 822302d commit acf92da
Show file tree
Hide file tree
Showing 3 changed files with 313 additions and 11 deletions.
15 changes: 12 additions & 3 deletions src/gurubase-backend/backend/core/models.py
Original file line number Diff line number Diff line change
Expand Up @@ -1553,9 +1553,18 @@ def delete_from_milvus(self):
delete_vectors(collection_name, self.doc_ids)

data_source = self.data_source
for doc_id in self.doc_ids:
data_source.doc_ids.remove(doc_id)
data_source.save()

# Check for invalid doc_ids
invalid_doc_ids = [doc_id for doc_id in self.doc_ids if doc_id not in data_source.doc_ids]
valid_doc_ids = [doc_id for doc_id in self.doc_ids if doc_id in data_source.doc_ids]

if invalid_doc_ids:
logger.error(f"Found doc_ids of github file {self.path} that don't exist in data_source: {invalid_doc_ids}. guru_type: {self.data_source.guru_type.slug}. Github link: {self.link}")

if valid_doc_ids:
for doc_id in valid_doc_ids:
data_source.doc_ids.remove(doc_id)
data_source.save()

self.in_milvus = False
self.doc_ids = []
Expand Down
10 changes: 2 additions & 8 deletions src/gurubase-backend/backend/core/tasks.py
Original file line number Diff line number Diff line change
Expand Up @@ -1442,14 +1442,7 @@ def process_guru_type(guru_type):
# Bulk process the changes in a transaction
if files_to_delete or files_to_create:
with transaction.atomic():
# First remove from Milvus
for file in files_to_delete:
try:
file.delete_from_milvus()
except Exception as e:
logger.error(f"Error deleting file {file.path} from Milvus: {str(e)}")

# Then delete from DB
# Delete from DB (no need to delete from Milvus as it is handled by signals)
if files_to_delete:
deleted_count = GithubFile.objects.filter(
id__in=[f.id for f in files_to_delete]
Expand All @@ -1462,6 +1455,7 @@ def process_guru_type(guru_type):
logger.info(f"Created {len(created_files)} files for data source {str(data_source)}")

# Update data source timestamp
data_source.doc_ids = DataSource.objects.get(id=data_source.id).doc_ids # Reflect the latest doc_ids updated by the signals
data_source.save() # This will update date_updated

data_source.in_milvus = False
Expand Down
299 changes: 299 additions & 0 deletions src/gurubase-backend/backend/core/tests/test_milvus_operations.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,299 @@
from django.test import TestCase
from unittest.mock import patch, MagicMock, call
from django.contrib.auth import get_user_model
from core.models import GuruType, DataSource, GithubFile
from django.conf import settings
import os

# Resolve the user model through Django instead of importing a concrete class,
# so the tests use whichever user model the project has configured.
User = get_user_model()

class MilvusOperationsTests(TestCase):
    """Check the Milvus bookkeeping kept on DataSource and GithubFile.

    Each test patches the Milvus/embedding helpers it expects to be called and
    then verifies two things: the arguments those helpers received, and the
    ``doc_ids`` / ``in_milvus`` state persisted on the Django models.
    """

    def setUp(self):
        """Create a guru type, a website source, a GitHub source and one GitHub file."""
        # Create a test user
        # NOTE(review): this address looks like a redacted placeholder - confirm the intended value.
        self.user = User.objects.create(email='[email protected]')

        # Create a test guru type
        self.guru_type = GuruType.objects.create(
            name='Test Guru',
            slug='test-guru',
            domain_knowledge='Test domain knowledge',
            milvus_collection_name='test_guru_collection'
        )

        # Create a test data source
        self.data_source = DataSource.objects.create(
            type=DataSource.Type.WEBSITE,
            title='Test Website',
            guru_type=self.guru_type,
            content='Test content for Website',
            url='https://example.com/test'
        )

        # Create a test GitHub data source
        self.github_data_source = DataSource.objects.create(
            type=DataSource.Type.GITHUB_REPO,
            title='Test GitHub Repo',
            guru_type=self.guru_type,
            url='https://github.com/test/repo',
            default_branch='main'
        )

        # Create a test GitHub file
        self.github_file = GithubFile.objects.create(
            data_source=self.github_data_source,
            path='test/file.py',
            link='https://github.com/test/repo/blob/main/test/file.py',
            content='def test_function():\n return "Hello, World!"',
            size=100
        )

    @staticmethod
    def _persist_doc_ids(instance, doc_ids, in_milvus=None):
        """Store a fresh copy of ``doc_ids`` on ``instance`` and save it.

        ``in_milvus`` is only written when it is given, so callers can leave
        the flag at whatever value the instance already carries.
        """
        # Copy the list so two model instances never share one list object;
        # the code under test removes ids from the data source's list in place.
        instance.doc_ids = list(doc_ids)
        if in_milvus is not None:
            instance.in_milvus = in_milvus
        instance.save()

    @patch('core.utils.embed_texts')
    @patch('core.milvus_utils.insert_vectors')
    def test_datasource_write_to_milvus(self, mock_insert_vectors, mock_embed_texts):
        """Test that DataSource.write_to_milvus correctly updates doc_ids and in_milvus flag"""
        # Mock the embedding and vector insertion
        mock_embed_texts.return_value = [[0.1] * settings.MILVUS_CONTEXT_COLLECTION_DIMENSION]
        mock_insert_vectors.return_value = ['doc_id_1', 'doc_id_2']

        # Call the method
        self.data_source.write_to_milvus()

        # Check that the mocks were called correctly
        mock_embed_texts.assert_called_once()
        mock_insert_vectors.assert_called_once()

        # Refresh from database
        self.data_source.refresh_from_db()

        # Check that the model was updated correctly
        self.assertTrue(self.data_source.in_milvus)
        self.assertEqual(self.data_source.doc_ids, ['doc_id_1', 'doc_id_2'])
        self.assertEqual(self.data_source.status, DataSource.Status.SUCCESS)
        self.assertIsNotNone(self.data_source.last_successful_index_date)

    @patch('core.milvus_utils.delete_vectors')
    def test_datasource_delete_from_milvus(self, mock_delete_vectors):
        """Test that DataSource.delete_from_milvus correctly clears doc_ids and in_milvus flag"""
        # Set up the data source with mock doc_ids
        self._persist_doc_ids(self.data_source, ['doc_id_1', 'doc_id_2'], in_milvus=True)

        # Call the method
        self.data_source.delete_from_milvus()

        # Check that the mock was called correctly
        mock_delete_vectors.assert_called_once_with(
            self.guru_type.milvus_collection_name,
            ['doc_id_1', 'doc_id_2']
        )

        # Refresh from database
        self.data_source.refresh_from_db()

        # Check that the model was updated correctly
        self.assertFalse(self.data_source.in_milvus)
        self.assertEqual(self.data_source.doc_ids, [])

    @patch('core.utils.embed_texts')
    @patch('core.milvus_utils.insert_vectors')
    def test_github_file_write_to_milvus(self, mock_insert_vectors, mock_embed_texts):
        """Test that GithubFile.write_to_milvus correctly updates doc_ids and in_milvus flag"""
        # Mock the embedding and vector insertion
        mock_embed_texts.return_value = [[0.1] * settings.MILVUS_CONTEXT_COLLECTION_DIMENSION]
        mock_insert_vectors.return_value = ['doc_id_1', 'doc_id_2']

        # Call the method
        self.github_file.write_to_milvus()

        # Check that the mocks were called correctly
        mock_embed_texts.assert_called_once()
        mock_insert_vectors.assert_called_once()

        # Refresh from database
        self.github_file.refresh_from_db()

        # Check that the model was updated correctly
        self.assertTrue(self.github_file.in_milvus)
        self.assertEqual(self.github_file.doc_ids, ['doc_id_1', 'doc_id_2'])

    @patch('core.milvus_utils.delete_vectors')
    def test_github_file_delete_from_milvus(self, mock_delete_vectors):
        """Test that GithubFile.delete_from_milvus correctly clears doc_ids and in_milvus flag"""
        # Set up the GitHub file, then the data source, with the same doc_ids
        self._persist_doc_ids(self.github_file, ['doc_id_1', 'doc_id_2'], in_milvus=True)
        self._persist_doc_ids(self.github_data_source, ['doc_id_1', 'doc_id_2'])

        # Call the method
        self.github_file.delete_from_milvus()

        # Check that the mock was called correctly
        mock_delete_vectors.assert_called_once_with(
            settings.GITHUB_REPO_CODE_COLLECTION_NAME,
            ['doc_id_1', 'doc_id_2']
        )

        # Refresh from database
        self.github_file.refresh_from_db()
        self.github_data_source.refresh_from_db()

        # Check that the models were updated correctly
        self.assertFalse(self.github_file.in_milvus)
        self.assertEqual(self.github_file.doc_ids, [])
        self.assertEqual(self.github_data_source.doc_ids, [])

    @patch('core.models.logger')
    @patch('core.milvus_utils.delete_vectors')
    def test_github_file_delete_from_milvus_ignores_unknown_doc_ids(self, mock_delete_vectors, mock_logger):
        """Test that doc_ids missing from the data source are logged and skipped instead of raising"""
        # The file references one id the data source knows and one it does not;
        # the data source additionally holds an id that belongs to another file.
        self._persist_doc_ids(self.github_file, ['doc_id_1', 'doc_id_missing'], in_milvus=True)
        self._persist_doc_ids(self.github_data_source, ['doc_id_1', 'doc_id_3'])

        # Call the method (must not raise on the unknown id)
        self.github_file.delete_from_milvus()

        # All of the file's doc_ids are still sent to Milvus for deletion
        mock_delete_vectors.assert_called_once_with(
            settings.GITHUB_REPO_CODE_COLLECTION_NAME,
            ['doc_id_1', 'doc_id_missing']
        )

        # The unknown id is reported through the models logger
        mock_logger.error.assert_called()

        # Refresh from database
        self.github_file.refresh_from_db()
        self.github_data_source.refresh_from_db()

        # Only the id owned by this file was removed from the data source
        self.assertFalse(self.github_file.in_milvus)
        self.assertEqual(self.github_file.doc_ids, [])
        self.assertEqual(self.github_data_source.doc_ids, ['doc_id_3'])

    @patch('core.utils.embed_texts')
    @patch('core.milvus_utils.insert_vectors')
    def test_github_datasource_write_to_milvus(self, mock_insert_vectors, mock_embed_texts):
        """Test that GitHub DataSource.write_to_milvus correctly updates doc_ids and in_milvus flag for all files"""
        # Mock the embedding and vector insertion
        mock_embed_texts.return_value = [[0.1] * settings.MILVUS_CONTEXT_COLLECTION_DIMENSION] * 2
        mock_insert_vectors.return_value = ['doc_id_1', 'doc_id_2']

        # Create a second GitHub file
        github_file2 = GithubFile.objects.create(
            data_source=self.github_data_source,
            path='test/file2.py',
            link='https://github.com/test/repo/blob/main/test/file2.py',
            content='def another_function():\n return "Hello again!"',
            size=120
        )

        # Call the method
        self.github_data_source.in_milvus = False
        self.github_data_source.write_to_milvus()

        # Check that the mocks were called correctly
        self.assertEqual(mock_embed_texts.call_count, 1)
        self.assertEqual(mock_insert_vectors.call_count, 1)

        # Refresh from database
        self.github_data_source.refresh_from_db()
        self.github_file.refresh_from_db()
        github_file2.refresh_from_db()

        # Check that the models were updated correctly
        self.assertTrue(self.github_data_source.in_milvus)
        self.assertTrue(self.github_file.in_milvus)
        self.assertTrue(github_file2.in_milvus)
        self.assertEqual(len(self.github_data_source.doc_ids), 2)  # one doc_id per file, two files
        self.assertEqual(self.github_file.doc_ids, ['doc_id_1'])
        self.assertEqual(github_file2.doc_ids, ['doc_id_2'])

    @patch('core.milvus_utils.delete_vectors')
    def test_github_datasource_delete_from_milvus(self, mock_delete_vectors):
        """Test that GitHub DataSource.delete_from_milvus correctly deletes all files"""
        # Set up the GitHub file, then the data source, with the same doc_ids
        self._persist_doc_ids(self.github_file, ['doc_id_1', 'doc_id_2'], in_milvus=True)
        self._persist_doc_ids(self.github_data_source, ['doc_id_1', 'doc_id_2'], in_milvus=True)

        # Call the method
        self.github_data_source.delete_from_milvus()

        # Check that the mock was called correctly: once for the guru type's
        # collection and once for the shared GitHub code collection, in that order
        mock_delete_vectors.assert_has_calls([
            call(
                self.guru_type.milvus_collection_name,
                ['doc_id_1', 'doc_id_2']
            ),
            call(
                settings.GITHUB_REPO_CODE_COLLECTION_NAME,
                ['doc_id_1', 'doc_id_2']
            )
        ])

        # Refresh from database
        self.github_data_source.refresh_from_db()

        # Check that the model was updated correctly
        self.assertFalse(self.github_data_source.in_milvus)
        self.assertEqual(self.github_data_source.doc_ids, [])

        # Check that all GitHub files were deleted
        self.assertEqual(GithubFile.objects.filter(data_source=self.github_data_source).count(), 0)

    @patch('core.milvus_utils.delete_vectors')
    def test_clear_github_file_signal(self, mock_delete_vectors):
        """Test that the clear_github_file signal correctly calls delete_from_milvus"""
        # Set up the GitHub file, then the data source, with the same doc_ids
        self._persist_doc_ids(self.github_file, ['doc_id_1', 'doc_id_2'], in_milvus=True)
        self._persist_doc_ids(self.github_data_source, ['doc_id_1', 'doc_id_2'])

        # Delete the GitHub file (should trigger the signal)
        self.github_file.delete()

        # Check that the mock was called correctly
        mock_delete_vectors.assert_called_once_with(
            settings.GITHUB_REPO_CODE_COLLECTION_NAME,
            ['doc_id_1', 'doc_id_2']
        )

        # Refresh from database
        self.github_data_source.refresh_from_db()

        # Check that the data source was updated correctly
        self.assertEqual(self.github_data_source.doc_ids, [])

    @patch('core.milvus_utils.delete_vectors')
    def test_clear_data_source_signal(self, mock_delete_vectors):
        """Test that the clear_data_source signal correctly calls delete_from_milvus"""
        # Set up the data source with mock doc_ids
        self._persist_doc_ids(self.data_source, ['doc_id_1', 'doc_id_2'], in_milvus=True)

        # Delete the data source (should trigger the signal)
        self.data_source.delete()

        # Check that the mock was called correctly
        mock_delete_vectors.assert_called_once_with(
            self.guru_type.milvus_collection_name,
            ['doc_id_1', 'doc_id_2']
        )

    @patch('core.milvus_utils.delete_vectors')
    @patch('core.milvus_utils.insert_vectors')
    @patch('core.milvus_utils.fetch_vectors')
    def test_update_data_source_in_milvus_signal(self, mock_fetch_vectors, mock_insert_vectors, mock_delete_vectors):
        """Test that a title change deletes the stored vectors and inserts them again with the new title"""
        # Set up the data source with mock doc_ids
        self._persist_doc_ids(self.data_source, ['doc_id_1', 'doc_id_2'], in_milvus=True)

        # Vectors "currently stored" in Milvus, still carrying the old title
        mock_fetch_vectors.return_value = [
            {'id': 'doc_id_1', 'metadata': {'title': 'Old Title'}},
            {'id': 'doc_id_2', 'metadata': {'title': 'Old Title'}}
        ]

        # Change the title and save (should trigger the signal)
        self.data_source.title = 'Updated Title'
        self.data_source.save()

        # Check that the mocks were called correctly: the old vectors are
        # deleted and the inserted payload carries the new title without ids
        mock_delete_vectors.assert_called_once_with(
            self.guru_type.milvus_collection_name,
            ['doc_id_1', 'doc_id_2']
        )
        mock_insert_vectors.assert_called_once_with(
            self.guru_type.milvus_collection_name,
            [{'metadata': {'title': 'Updated Title'}}, {'metadata': {'title': 'Updated Title'}}]
        )


0 comments on commit acf92da

Please sign in to comment.