Skip to content

Commit

Permalink
Merge branch 'main' into fix/structured_output_wrapper
Browse files Browse the repository at this point in the history
  • Loading branch information
jzaldi authored Jan 13, 2025
2 parents f0eb2da + 49d3844 commit 864ed61
Show file tree
Hide file tree
Showing 42 changed files with 5,181 additions and 2,732 deletions.
6 changes: 3 additions & 3 deletions libs/community/Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -34,14 +34,14 @@ lint_tests: MYPY_CACHE=.mypy_cache_test

lint lint_diff lint_package lint_tests:
./scripts/lint_imports.sh
poetry run ruff .
poetry run ruff check .
poetry run ruff format $(PYTHON_FILES) --diff
poetry run ruff --select I $(PYTHON_FILES)
poetry run ruff check --select I $(PYTHON_FILES)
mkdir $(MYPY_CACHE); poetry run mypy $(PYTHON_FILES) --cache-dir $(MYPY_CACHE)

format format_diff:
poetry run ruff format $(PYTHON_FILES)
poetry run ruff --select I --fix $(PYTHON_FILES)
poetry run ruff check --select I --fix $(PYTHON_FILES)

spell_check:
poetry run codespell --toml pyproject.toml
Expand Down
6 changes: 6 additions & 0 deletions libs/community/langchain_google_community/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,10 @@
from langchain_google_community.drive import GoogleDriveLoader
from langchain_google_community.gcs_directory import GCSDirectoryLoader
from langchain_google_community.gcs_file import GCSFileLoader
from langchain_google_community.geocoding import (
GoogleGeocodingAPIWrapper,
GoogleGeocodingTool,
)
from langchain_google_community.gmail.loader import GMailLoader
from langchain_google_community.gmail.toolkit import GmailToolkit
from langchain_google_community.google_speech_to_text import SpeechToTextLoader
Expand Down Expand Up @@ -50,6 +54,8 @@
"GMailLoader",
"GmailToolkit",
"GoogleDriveLoader",
"GoogleGeocodingAPIWrapper",
"GoogleGeocodingTool",
"GooglePlacesAPIWrapper",
"GooglePlacesTool",
"GoogleSearchAPIWrapper",
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -301,7 +301,7 @@ def _create_search_query(
if table_to_query is not None:
embeddings_query = f"""
with embeddings as (
SELECT {self.embedding_field}, ROW_NUMBER() OVER() as row_num
SELECT {self.embedding_field}, row_num
from `{table_to_query}`
)"""

Expand Down Expand Up @@ -390,14 +390,16 @@ def _create_temp_bq_table(
df = pd.DataFrame([])

df[self.embedding_field] = embeddings
df["row_num"] = list(range(len(df)))
table_id = (
f"{self.project_id}."
f"{self.temp_dataset_name}."
f"{self.table_name}_{uuid.uuid4().hex}"
)

schema = [
bigquery.SchemaField(self.embedding_field, "FLOAT64", mode="REPEATED")
bigquery.SchemaField(self.embedding_field, "FLOAT64", mode="REPEATED"),
bigquery.SchemaField("row_num", "INT64"),
]
table_ref = bigquery.Table(table_id, schema=schema)
table = self._bq_client.create_table(table_ref)
Expand Down Expand Up @@ -483,7 +485,7 @@ def batch_search(
)

if queries is not None:
embeddings = self.embedding.embed_documents(queries)
embeddings = [self.embedding.embed_query(query) for query in queries]

if embeddings is None:
raise ValueError("Could not obtain embeddings - value is None.")
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -406,6 +406,7 @@ def _parse_proto_output(
documents.append(
[
Document(
id=result.entity_id,
page_content=content,
metadata=metadata,
),
Expand All @@ -421,6 +422,7 @@ def _search_embedding(
entity_id: Optional[str] = None,
k: int = 5,
string_filters: Optional[List[dict]] = None,
numeric_filters: Optional[List[dict]] = None,
per_crowding_attribute_neighbor_count: Optional[int] = None,
approximate_neighbor_candidates: Optional[int] = None,
leaf_nodes_search_fraction: Optional[float] = None,
Expand All @@ -438,6 +440,7 @@ def _search_embedding(
embedding=embedding,
neighbor_count=k,
string_filters=string_filters,
numeric_filters=numeric_filters,
per_crowding_attribute_neighbor_count=per_crowding_attribute_neighbor_count,
parameters={
"approximate_neighbor_candidates": approximate_neighbor_candidates,
Expand Down
43 changes: 36 additions & 7 deletions libs/community/langchain_google_community/drive.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,18 +9,26 @@

import os
from pathlib import Path
from typing import Any, Dict, List, Optional, Sequence, Union
from typing import Any, ClassVar, Dict, List, Optional, Sequence, Tuple, Union

from langchain_core.document_loaders import BaseLoader
from langchain_core.documents import Document
from pydantic import BaseModel, field_validator, model_validator

SCOPES = ["https://www.googleapis.com/auth/drive.file"]


class GoogleDriveLoader(BaseLoader, BaseModel):
"""Load Google Docs from `Google Drive`."""

# Taken from https://developers.google.com/drive/api/guides/api-specific-auth,
# limited to the scopes that are required to read files.
VALID_SCOPES: ClassVar[Tuple[str, ...]] = (
"https://www.googleapis.com/auth/drive.file",
"https://www.googleapis.com/auth/drive.readonly",
"https://www.googleapis.com/auth/drive.meet.readonly",
"https://www.googleapis.com/auth/drive.metadata.readonly",
"https://www.googleapis.com/auth/drive.metadata",
)

service_account_key: Path = Path.home() / ".credentials" / "keys.json"
"""Path to the service account key file."""
credentials_path: Path = Path.home() / ".credentials" / "credentials.json"
Expand Down Expand Up @@ -51,6 +59,9 @@ class GoogleDriveLoader(BaseLoader, BaseModel):
"""Whether to load authorization identities."""
load_extended_metadata: bool = False
"""Whether to load extended metadata."""
scopes: List[str] = ["https://www.googleapis.com/auth/drive.file"]
"""The credential scopes to use for Google Drive API access. Default is
drive.file scope."""

def _get_file_size_from_id(self, id: str) -> str:
"""Fetch the size of the file."""
Expand Down Expand Up @@ -252,6 +263,22 @@ def validate_credentials_path(cls, v: Any, **kwargs: Any) -> Any:
raise ValueError(f"credentials_path {v} does not exist")
return v

@field_validator("scopes")
@classmethod
def validate_scopes(cls, v: List[str]) -> List[str]:
    """Validate the scopes configured for Google Drive API access.

    Declared as an explicit classmethod, which is the form pydantic
    documents for field validators and which lets static type checkers
    resolve ``cls`` correctly.

    Args:
        v: The value supplied for the ``scopes`` field.

    Returns:
        The same list, unchanged, once every entry has been checked.

    Raises:
        ValueError: If ``v`` is empty, or if any entry is not listed in
            ``VALID_SCOPES``.
    """
    if not v:
        raise ValueError("At least one scope must be provided")

    invalid_scopes = [scope for scope in v if scope not in cls.VALID_SCOPES]
    if invalid_scopes:
        # Report every rejected scope at once, together with the accepted
        # values, so the caller can correct the configuration in one pass.
        raise ValueError(
            f"Invalid Google Drive API scope(s): {', '.join(invalid_scopes)}. "
            f"Valid scopes are: {', '.join(cls.VALID_SCOPES)}"
        )

    return v

def _load_credentials(self) -> Any:
"""Load credentials."""
# Adapted from https://developers.google.com/drive/api/v3/quickstart/python
Expand All @@ -273,11 +300,13 @@ def _load_credentials(self) -> Any:
creds = None
if self.service_account_key.exists():
return service_account.Credentials.from_service_account_file(
str(self.service_account_key), scopes=SCOPES
str(self.service_account_key), scopes=self.scopes
)

if self.token_path.exists():
creds = Credentials.from_authorized_user_file(str(self.token_path), SCOPES)
creds = Credentials.from_authorized_user_file(
str(self.token_path), self.scopes
)

if self.credentials:
# use whatever was passed to us
Expand All @@ -289,13 +318,13 @@ def _load_credentials(self) -> Any:
creds.refresh(Request())
elif "GOOGLE_APPLICATION_CREDENTIALS" not in os.environ:
creds, project = default()
creds = creds.with_scopes(SCOPES)
creds = creds.with_scopes(self.scopes)
# no need to write to file
if creds:
return creds
else:
flow = InstalledAppFlow.from_client_secrets_file(
str(self.credentials_path), SCOPES
str(self.credentials_path), self.scopes
)
creds = flow.run_local_server(port=0)
with open(self.token_path, "w") as token:
Expand Down
Loading

0 comments on commit 864ed61

Please sign in to comment.