Add AstraDBGraphVectorStore testing #75

Merged · 19 commits · Sep 19, 2024
Changes from 16 commits
36 changes: 28 additions & 8 deletions libs/astradb/langchain_astradb/graph_vectorstores.py
@@ -39,7 +39,7 @@ class _Edge:


 # NOTE: Conversion to string is necessary
-# becasue AstraDB doesn't support matching on arrays of tuples
+# because AstraDB doesn't support matching on arrays of tuples
 def _tag_to_str(kind: str, tag: str) -> str:
     return f"{kind}:{tag}"
@@ -139,7 +139,17 @@ def from_documents(
     ) -> AstraDBGraphVectorStore:
         """Return GraphVectorStore initialized from documents and embeddings."""
         store = cls(embedding, **kwargs)
-        store.add_documents(documents, ids=ids)
+        # `store.add_documents` ends up calling store.add_nodes, which
+        # discards the kwargs including ids. This is the place to normalize
+        # the documents' .id and the separate ids into one and the same:
+        _documents: Iterable[Document]
+        if ids is not None:
+            _documents = [document.copy() for document in documents]
+            for _doc_id, _document in zip(ids, _documents):
(Review thread on the `zip(ids, _documents)` line)

Collaborator:
There should be some check that ids and documents have the same size.

@hemidactylus (Collaborator, Author) on Sep 19, 2024:
Closing since ids is going away from this method.

Side note for your amusement. This check would not be entirely trivial if one does not want to list(...) the iterables. I had started to implement something like this before removing it all.

But the real caveat is that zip(ite1, ite2) treats the two iterables differently (i.e. it's implemented straightforwardly, pulling from the first argument first). It can consume one item too many from ite1, and that item goes nowhere. In the first snippet below, next(r2) returns 4; in the second, it raises StopIteration:

r1 = (i for i in range(4))
r2 = (i for i in range(5))

for p in zip(r1, r2):
    print(p)

next(r2)  # -> 4: zip stopped on r1, leaving one item in r2

r1 = (i for i in range(4))
r2 = (i for i in range(5))

for p in zip(r2, r1):
    print(p)

next(r2)  # -> StopIteration: zip silently consumed r2's last item

(almost obvious in hindsight, but still)
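
For reference, a minimal sketch of such a check that avoids materializing the iterables (on Python 3.10+, zip(ids, documents, strict=True) does this out of the box; the helper below is illustrative only, not part of the PR):

from itertools import zip_longest

_MISSING = object()

def zip_same_length(ids, documents):
    # Pair ids with documents, raising if the two iterables differ in length.
    for doc_id, document in zip_longest(ids, documents, fillvalue=_MISSING):
        if doc_id is _MISSING or document is _MISSING:
            raise ValueError("ids and documents must have the same length")
        yield doc_id, document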

+                _document.id = _doc_id
+        else:
+            _documents = documents
+        store.add_documents(_documents)
         return store

     @override
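
As a usage sketch of the normalization above, as of this revision (my_embeddings is a placeholder for any Embeddings implementation; other kwargs omitted):

from langchain_core.documents import Document

docs = [Document(page_content="graphs"), Document(page_content="vectors")]
store = AstraDBGraphVectorStore.from_documents(
    docs,
    embedding=my_embeddings,
    ids=["doc-1", "doc-2"],   # copied onto each document's .id before insertion
)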
@@ -248,11 +258,21 @@ def visit_targets(d: int, targets: Sequence[Document]) -> None:

         return visited_docs

-    def _filter_to_metadata(self, filter_dict: dict[str, Any] | None) -> dict[str, Any]:
-        if filter_dict is None:
-            return {}
-
-        return self.vectorstore.document_codec.encode_filter(filter_dict)
+    def filter_to_query(self, filter_dict: dict[str, Any] | None) -> dict[str, Any]:
+        """Prepare a query for use on DB based on metadata filter.
+
+        Encode an "abstract" filter clause on metadata into a query filter
+        condition aware of the collection schema choice.
+
+        Args:
+            filter_dict: a metadata condition in the form {"field": "value"}
+                or related.
+
+        Returns:
+            the corresponding mapping ready for use in queries,
+            aware of the details of the schema used to encode the document on DB.
+        """
+        return self.vectorstore.filter_to_query(filter_dict)

     def _get_outgoing_tags(
         self,
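
With the method now public, a hedged usage sketch (the encoded shape depends on the collection's document codec, so the output shown is indicative only):

query_filter = graph_store.filter_to_query({"genre": "sci-fi"})
# e.g. {"metadata.genre": "sci-fi"} under a nested-metadata encoding scheme
hits = graph_store.astra_env.collection.find(filter=query_filter, limit=10)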
@@ -318,7 +338,7 @@ def get_adjacent(tags: set[str]) -> Iterable[_Edge]:
             for tag in tags:
                 m_filter = (metadata_filter or {}).copy()
                 m_filter[self.link_from_metadata_key] = tag
-                metadata_parameter = self._filter_to_metadata(m_filter)
+                metadata_parameter = self.filter_to_query(m_filter)

                 hits = list(
                     self.astra_env.collection.find(
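
To make the renamed call concrete, roughly the filter this loop builds for a single tag (values illustrative; _tag_to_str is the helper from the first hunk):

tag = _tag_to_str("kw", "astra")                  # -> "kw:astra"
m_filter = {"topic": "databases"}                 # example caller-supplied filter
m_filter[store.link_from_metadata_key] = tag      # match docs linked from this tag
metadata_parameter = store.filter_to_query(m_filter)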
@@ -382,7 +402,7 @@ def fetch_neighborhood(neighborhood: Sequence[str]) -> None:
                 helper.add_candidates(new_candidates)

         def fetch_initial_candidates() -> None:
-            metadata_parameter = self._filter_to_metadata(metadata_filter).copy()
+            metadata_parameter = self.filter_to_query(metadata_filter).copy()
             hits = list(
                 self.astra_env.collection.find(
                     filter=metadata_parameter,
2 changes: 1 addition & 1 deletion libs/astradb/langchain_astradb/utils/mmr.py
@@ -50,7 +50,7 @@ def cosine_similarity(x: Matrix, y: Matrix) -> np.ndarray:
     else:
         x = np.array(x, dtype=np.float32)
         y = np.array(y, dtype=np.float32)
-        z = 1 - simd.cdist(x, y, metric="cosine")
+        z = 1 - np.array(simd.cdist(x, y, metric="cosine"))
     if isinstance(z, float):
         return np.array([z])
     return z
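
The np.array(...) wrapping presumably coerces simsimd's return value (not a plain ndarray) so that z supports the subsequent checks. As a reference for the intended semantics, cosine similarity in plain numpy (a cross-check, not the PR's simd path):

import numpy as np

def cosine_similarity_reference(x: np.ndarray, y: np.ndarray) -> np.ndarray:
    # Rows of x against rows of y: normalize each row, then one matrix product.
    x_n = x / np.linalg.norm(x, axis=1, keepdims=True)
    y_n = y / np.linalg.norm(y, axis=1, keepdims=True)
    return x_n @ y_n.T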
41 changes: 34 additions & 7 deletions libs/astradb/langchain_astradb/vectorstores.py
@@ -300,7 +300,20 @@ class AstraDBVectorStore(VectorStore):

     """  # noqa: E501

-    def _filter_to_metadata(self, filter_dict: dict[str, Any] | None) -> dict[str, Any]:
+    def filter_to_query(self, filter_dict: dict[str, Any] | None) -> dict[str, Any]:
+        """Prepare a query for use on DB based on metadata filter.
+
+        Encode an "abstract" filter clause on metadata into a query filter
+        condition aware of the collection schema choice.
+
+        Args:
+            filter_dict: a metadata condition in the form {"field": "value"}
+                or related.
+
+        Returns:
+            the corresponding mapping ready for use in queries,
+            aware of the details of the schema used to encode the document on DB.
+        """
         if filter_dict is None:
             return {}
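
Per the body above, a None filter encodes to the match-all query {}. A small sketch (store is an AstraDBVectorStore; encoded key shapes are codec-dependent):

assert store.filter_to_query(None) == {}
flt = store.filter_to_query({"genre": "sci-fi"})  # shape depends on the metadata encoding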

@@ -1319,7 +1332,7 @@ def _similarity_search_with_score_id_by_sort(
     ) -> list[tuple[Document, float, str]]:
         """Run ANN search with a provided sort clause."""
         self.astra_env.ensure_db_setup()
-        metadata_parameter = self._filter_to_metadata(filter)
+        metadata_parameter = self.filter_to_query(filter)
         hits_ite = self.astra_env.collection.find(
             filter=metadata_parameter,
             projection=self.document_codec.base_projection,
@@ -1515,7 +1528,7 @@ async def _asimilarity_search_with_score_id_by_sort(
     ) -> list[tuple[Document, float, str]]:
         """Run ANN search with a provided sort clause."""
         await self.astra_env.aensure_db_setup()
-        metadata_parameter = self._filter_to_metadata(filter)
+        metadata_parameter = self.filter_to_query(filter)
         return [
             (doc, sim, did)
             async for (doc, sim, did) in (
@@ -1638,7 +1651,7 @@ def max_marginal_relevance_search_by_vector(
             The list of Documents selected by maximal marginal relevance.
         """
         self.astra_env.ensure_db_setup()
-        metadata_parameter = self._filter_to_metadata(filter)
+        metadata_parameter = self.filter_to_query(filter)

         return self._run_mmr_query_by_sort(
             sort={"$vector": embedding},
@@ -1677,7 +1690,7 @@ async def amax_marginal_relevance_search_by_vector(
             The list of Documents selected by maximal marginal relevance.
         """
         await self.astra_env.aensure_db_setup()
-        metadata_parameter = self._filter_to_metadata(filter)
+        metadata_parameter = self.filter_to_query(filter)

         return await self._arun_mmr_query_by_sort(
             sort={"$vector": embedding},
@@ -1719,7 +1732,7 @@ def max_marginal_relevance_search(
             # this case goes directly to the "_by_sort" method
             # (and does its own filter normalization, as it cannot
             # use the path for the with-embedding mmr querying)
-            metadata_parameter = self._filter_to_metadata(filter)
+            metadata_parameter = self.filter_to_query(filter)
             return self._run_mmr_query_by_sort(
                 sort={"$vectorize": query},
                 k=k,
@@ -1770,7 +1783,7 @@ async def amax_marginal_relevance_search(
             # this case goes directly to the "_by_sort" method
             # (and does its own filter normalization, as it cannot
             # use the path for the with-embedding mmr querying)
-            metadata_parameter = self._filter_to_metadata(filter)
+            metadata_parameter = self.filter_to_query(filter)
             return await self._arun_mmr_query_by_sort(
                 sort={"$vectorize": query},
                 k=k,
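
Downstream of these renames, a user-facing MMR call with a metadata filter (values illustrative) flows through filter_to_query before hitting the collection:

docs = store.max_marginal_relevance_search(
    "query about graphs",
    k=4,
    fetch_k=20,
    filter={"genre": "sci-fi"},
)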
@@ -1930,6 +1943,13 @@ def from_documents(
         """
         texts = [d.page_content for d in documents]
         metadatas = [d.metadata for d in documents]
+        if "ids" not in kwargs:
+            ids = [doc.id for doc in documents]
+
+            # If there's at least one valid ID, we'll assume that IDs
+            # should be used.
+            if any(ids):
+                kwargs["ids"] = ids
         return cls.from_texts(
             texts,
             embedding=embedding,
@@ -1956,6 +1976,13 @@ async def afrom_documents(
         """
         texts = [d.page_content for d in documents]
         metadatas = [d.metadata for d in documents]
+        if "ids" not in kwargs:
+            ids = [doc.id for doc in documents]
+
+            # If there's at least one valid ID, we'll assume that IDs
+            # should be used.
+            if any(ids):
+                kwargs["ids"] = ids
         return await cls.afrom_texts(
             texts,
             embedding=embedding,
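
A sketch of the new pickup behavior in both methods (my_embeddings is a placeholder for any Embeddings instance): .id values found on the documents are forwarded as ids unless the caller already passed ids in kwargs:

from langchain_core.documents import Document

docs = [
    Document(page_content="alpha", id="id-alpha"),
    Document(page_content="beta", id="id-beta"),
]
# no ids= in kwargs -> kwargs["ids"] becomes ["id-alpha", "id-beta"]
store = AstraDBVectorStore.from_documents(docs, embedding=my_embeddings)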