Merge branch 'main' into fix/structured_output_wrapper

jzaldi · Jan 13, 2025 · 864ed61
2 parents f0eb2da + 49d3844
commit 864ed61
Show file tree

Hide file tree

Showing 42 changed files with 5,181 additions and 2,732 deletions.
diff --git a/libs/community/Makefile b/libs/community/Makefile
@@ -34,14 +34,14 @@ lint_tests: MYPY_CACHE=.mypy_cache_test
 
 lint lint_diff lint_package lint_tests:
 	./scripts/lint_imports.sh
-	poetry run ruff .
+	poetry run ruff check .
 	poetry run ruff format $(PYTHON_FILES) --diff
-	poetry run ruff --select I $(PYTHON_FILES)
+	poetry run ruff check --select I $(PYTHON_FILES)
 	mkdir $(MYPY_CACHE); poetry run mypy $(PYTHON_FILES) --cache-dir $(MYPY_CACHE)
 
 format format_diff:
 	poetry run ruff format $(PYTHON_FILES)
-	poetry run ruff --select I --fix $(PYTHON_FILES)
+	poetry run ruff check --select I --fix $(PYTHON_FILES)
 
 spell_check:
 	poetry run codespell --toml pyproject.toml

diff --git a/libs/community/langchain_google_community/__init__.py b/libs/community/langchain_google_community/__init__.py
@@ -11,6 +11,10 @@
 from langchain_google_community.drive import GoogleDriveLoader
 from langchain_google_community.gcs_directory import GCSDirectoryLoader
 from langchain_google_community.gcs_file import GCSFileLoader
+from langchain_google_community.geocoding import (
+    GoogleGeocodingAPIWrapper,
+    GoogleGeocodingTool,
+)
 from langchain_google_community.gmail.loader import GMailLoader
 from langchain_google_community.gmail.toolkit import GmailToolkit
 from langchain_google_community.google_speech_to_text import SpeechToTextLoader
@@ -50,6 +54,8 @@
     "GMailLoader",
     "GmailToolkit",
     "GoogleDriveLoader",
+    "GoogleGeocodingAPIWrapper",
+    "GoogleGeocodingTool",
     "GooglePlacesAPIWrapper",
     "GooglePlacesTool",
     "GoogleSearchAPIWrapper",

diff --git a/libs/community/langchain_google_community/bq_storage_vectorstores/bigquery.py b/libs/community/langchain_google_community/bq_storage_vectorstores/bigquery.py
@@ -301,7 +301,7 @@ def _create_search_query(
         if table_to_query is not None:
             embeddings_query = f"""
             with embeddings as (
-            SELECT {self.embedding_field}, ROW_NUMBER() OVER() as row_num
+            SELECT {self.embedding_field}, row_num
             from `{table_to_query}`
             )"""
 
@@ -390,14 +390,16 @@ def _create_temp_bq_table(
         df = pd.DataFrame([])
 
         df[self.embedding_field] = embeddings
+        df["row_num"] = list(range(len(df)))
         table_id = (
             f"{self.project_id}."
             f"{self.temp_dataset_name}."
             f"{self.table_name}_{uuid.uuid4().hex}"
         )
 
         schema = [
-            bigquery.SchemaField(self.embedding_field, "FLOAT64", mode="REPEATED")
+            bigquery.SchemaField(self.embedding_field, "FLOAT64", mode="REPEATED"),
+            bigquery.SchemaField("row_num", "INT64"),
         ]
         table_ref = bigquery.Table(table_id, schema=schema)
         table = self._bq_client.create_table(table_ref)
@@ -483,7 +485,7 @@ def batch_search(
             )
 
         if queries is not None:
-            embeddings = self.embedding.embed_documents(queries)
+            embeddings = [self.embedding.embed_query(query) for query in queries]
 
         if embeddings is None:
             raise ValueError("Could not obtain embeddings - value is None.")

diff --git a/libs/community/langchain_google_community/bq_storage_vectorstores/featurestore.py b/libs/community/langchain_google_community/bq_storage_vectorstores/featurestore.py
@@ -406,6 +406,7 @@ def _parse_proto_output(
             documents.append(
                 [
                     Document(
+                        id=result.entity_id,
                         page_content=content,
                         metadata=metadata,
                     ),
@@ -421,6 +422,7 @@ def _search_embedding(
         entity_id: Optional[str] = None,
         k: int = 5,
         string_filters: Optional[List[dict]] = None,
+        numeric_filters: Optional[List[dict]] = None,
         per_crowding_attribute_neighbor_count: Optional[int] = None,
         approximate_neighbor_candidates: Optional[int] = None,
         leaf_nodes_search_fraction: Optional[float] = None,
@@ -438,6 +440,7 @@ def _search_embedding(
             embedding=embedding,
             neighbor_count=k,
             string_filters=string_filters,
+            numeric_filters=numeric_filters,
             per_crowding_attribute_neighbor_count=per_crowding_attribute_neighbor_count,
             parameters={
                 "approximate_neighbor_candidates": approximate_neighbor_candidates,

diff --git a/libs/community/langchain_google_community/drive.py b/libs/community/langchain_google_community/drive.py
@@ -9,18 +9,26 @@
 
 import os
 from pathlib import Path
-from typing import Any, Dict, List, Optional, Sequence, Union
+from typing import Any, ClassVar, Dict, List, Optional, Sequence, Tuple, Union
 
 from langchain_core.document_loaders import BaseLoader
 from langchain_core.documents import Document
 from pydantic import BaseModel, field_validator, model_validator
 
-SCOPES = ["https://www.googleapis.com/auth/drive.file"]
-
 
 class GoogleDriveLoader(BaseLoader, BaseModel):
     """Load Google Docs from `Google Drive`."""
 
+    # Generated from https://developers.google.com/drive/api/guides/api-specific-auth
+    # limiting to the scopes that are required to read the files
+    VALID_SCOPES: ClassVar[Tuple[str, ...]] = (
+        "https://www.googleapis.com/auth/drive.file",
+        "https://www.googleapis.com/auth/drive.readonly",
+        "https://www.googleapis.com/auth/drive.meet.readonly",
+        "https://www.googleapis.com/auth/drive.metadata.readonly",
+        "https://www.googleapis.com/auth/drive.metadata",
+    )
+
     service_account_key: Path = Path.home() / ".credentials" / "keys.json"
     """Path to the service account key file."""
     credentials_path: Path = Path.home() / ".credentials" / "credentials.json"
@@ -51,6 +59,9 @@ class GoogleDriveLoader(BaseLoader, BaseModel):
     """Whether to load authorization identities."""
     load_extended_metadata: bool = False
     """Whether to load extended metadata."""
+    scopes: List[str] = ["https://www.googleapis.com/auth/drive.file"]
+    """The credential scopes to use for Google Drive API access. Default is
+    drive.file scope."""
 
     def _get_file_size_from_id(self, id: str) -> str:
         """Fetch the size of the file."""
@@ -252,6 +263,22 @@ def validate_credentials_path(cls, v: Any, **kwargs: Any) -> Any:
             raise ValueError(f"credentials_path {v} does not exist")
         return v
 
+    @field_validator("scopes")
+    def validate_scopes(cls, v: List[str]) -> List[str]:
+        """Return ``v`` unchanged if it is non-empty and every scope is listed
+        in ``VALID_SCOPES``; raise ``ValueError`` otherwise."""
+        if not v:
+            raise ValueError("At least one scope must be provided")
+
+        invalid_scopes = [scope for scope in v if scope not in cls.VALID_SCOPES]
+        if invalid_scopes:
+            raise ValueError(
+                f"Invalid Google Drive API scope(s): {', '.join(invalid_scopes)}. "
+                f"Valid scopes are: {', '.join(cls.VALID_SCOPES)}"
+            )
+
+        return v
+
     def _load_credentials(self) -> Any:
         """Load credentials."""
         # Adapted from https://developers.google.com/drive/api/v3/quickstart/python
@@ -273,11 +300,13 @@ def _load_credentials(self) -> Any:
         creds = None
         if self.service_account_key.exists():
             return service_account.Credentials.from_service_account_file(
-                str(self.service_account_key), scopes=SCOPES
+                str(self.service_account_key), scopes=self.scopes
             )
 
         if self.token_path.exists():
-            creds = Credentials.from_authorized_user_file(str(self.token_path), SCOPES)
+            creds = Credentials.from_authorized_user_file(
+                str(self.token_path), self.scopes
+            )
 
         if self.credentials:
             # use whatever was passed to us
@@ -289,13 +318,13 @@ def _load_credentials(self) -> Any:
                 creds.refresh(Request())
             elif "GOOGLE_APPLICATION_CREDENTIALS" not in os.environ:
                 creds, project = default()
-                creds = creds.with_scopes(SCOPES)
+                creds = creds.with_scopes(self.scopes)
                 # no need to write to file
                 if creds:
                     return creds
             else:
                 flow = InstalledAppFlow.from_client_secrets_file(
-                    str(self.credentials_path), SCOPES
+                    str(self.credentials_path), self.scopes
                 )
                 creds = flow.run_local_server(port=0)
             with open(self.token_path, "w") as token: