Skip to content

Commit

Permalink
fix: lower file type comparison
Browse files: browse the repository at this point in the history
  • Loading branch information
ArzelaAscoIi committed Apr 30, 2024
1 parent 4b993db commit 635ba6b
Show file tree
Hide file tree
Showing 2 changed files with 17 additions and 17 deletions.
25 changes: 12 additions & 13 deletions deepset_cloud_sdk/_service/files_service.py
Original file line number | Diff line number | Diff line change
@@ -1,4 +1,5 @@
"""Module for all file-related operations."""

from __future__ import annotations

import asyncio
Expand Down Expand Up @@ -35,6 +36,7 @@
logger = structlog.get_logger(__name__)

SUPPORTED_TYPE_SUFFIXES = [".csv", ".docx", ".html", ".json", ".md", ".txt", ".pdf", ".pptx", ".xlsx", ".xml"]
META_SUFFIX = ".meta.json"
DIRECT_UPLOAD_THRESHOLD = 20


Expand Down Expand Up @@ -197,11 +199,11 @@ async def upload_file_paths(
_raw_files = [
path
for path in file_paths
if path.suffix.lower() in SUPPORTED_TYPE_SUFFIXES and not path.name.endswith(".meta.json")
if path.suffix in SUPPORTED_TYPE_SUFFIXES and not path.name.endswith(META_SUFFIX)
]
for file_path in _raw_files:
meta: Dict[str, Any] = {}
meta_path = Path(str(file_path) + ".meta.json")
meta_path = Path(str(file_path) + META_SUFFIX)
if meta_path in file_paths:
with meta_path.open("r") as meta_file:
meta = json.loads(meta_file.read())
Expand Down Expand Up @@ -240,7 +242,7 @@ async def upload_file_paths(

# wait for ingestion to finish
if blocking:
total_files = len(list(filter(lambda x: not os.path.basename(x).endswith(".meta.json"), file_paths)))
total_files = len(list(filter(lambda x: not os.path.basename(x).endswith(META_SUFFIX), file_paths)))
await self._wait_for_finished(
workspace_name=workspace_name,
session_id=upload_session.session_id,
Expand Down Expand Up @@ -282,24 +284,24 @@ def _validate_file_paths(file_paths: List[Path]) -> None:
"""
logger.info("Validating file paths and metadata.")
for file_path in file_paths:
if file_path.suffix.lower() not in SUPPORTED_TYPE_SUFFIXES:
if file_path.suffix not in SUPPORTED_TYPE_SUFFIXES:
raise ValueError(
f"Invalid file extension: {file_path.suffix}. Refer to the list of supported file types in `SUPPORTED_TYPE_SUFFIXES`. "
"Metadata files should have the `.meta.json` extension."
)
meta_file_names = list(
map(
lambda fp: os.path.basename(fp),
[file_path for file_path in file_paths if str(file_path).lower().endswith(".meta.json")],
[file_path for file_path in file_paths if str(file_path).endswith(META_SUFFIX)],
)
)
file_names = list(map(lambda fp: os.path.basename(fp), file_paths))
file_name_set = set(filter(lambda fn: not fn.lower().endswith(".meta.json"), file_names))
file_name_set = set(filter(lambda fn: not fn.endswith(META_SUFFIX), file_names))

not_mapped_meta_files = [
meta_file_name
for meta_file_name in meta_file_names
if meta_file_name.lower().split(".meta.json")[0] not in file_name_set
if meta_file_name.split(META_SUFFIX)[0] not in file_name_set
]

if len(not_mapped_meta_files) > 0:
Expand Down Expand Up @@ -341,7 +343,7 @@ def _get_allowed_file_types(desired_file_types: Optional[List[Any]]) -> List[str
return SUPPORTED_TYPE_SUFFIXES

desired_types_processed: Set[str] = {
str(file_type).lower() if str(file_type).startswith(".") else f".{str(file_type).lower()}"
str(file_type) if str(file_type).startswith(".") else f".{str(file_type)}"
for file_type in desired_file_types
}
allowed_types: Set[str] = {
Expand All @@ -362,14 +364,11 @@ def _preprocess_paths(
allowed_file_types: List[str] = FilesService._get_allowed_file_types(desired_file_types)
allowed_meta_types: Tuple = tuple(f"{file_type}.meta.json" for file_type in allowed_file_types)

meta_file_path = [
path for path in all_files if path.is_file() and str(path).lower().endswith(allowed_meta_types)
]
meta_file_path = [path for path in all_files if path.is_file() and str(path).endswith(allowed_meta_types)]
file_paths = [
path
for path in all_files
if path.is_file()
and (path.suffix.lower() in allowed_file_types and not str(path).lower().endswith(".meta.json"))
if path.is_file() and (path.suffix in allowed_file_types and not str(path).endswith(META_SUFFIX))
]
combined_paths = meta_file_path + file_paths

Expand Down
9 changes: 5 additions & 4 deletions tests/integration/service/test_integration_files_service.py
Original file line number | Diff line number | Diff line change
Expand Up @@ -10,6 +10,7 @@
from deepset_cloud_sdk._api.files import File
from deepset_cloud_sdk._api.upload_sessions import WriteMode
from deepset_cloud_sdk._service.files_service import (
META_SUFFIX,
SUPPORTED_TYPE_SUFFIXES,
DeepsetCloudFile,
FilesService,
Expand Down Expand Up @@ -37,7 +38,7 @@ async def test_direct_upload_path(self, integration_config: CommonConfig, worksp
names_of_uploaded_files = [
file.name
for file in Path("./tests/test_data/msmarco.10").glob("*.txt")
if not file.name.endswith(".meta.json")
if not file.name.endswith(META_SUFFIX)
]
# Check the metadata was uploaded correctly
files: List[File] = []
Expand Down Expand Up @@ -76,7 +77,7 @@ async def test_direct_upload_path_multiple_file_types(
local_file_names: List[str] = [
file.name
for file in Path("./tests/test_data/multiple_file_types").glob("*")
if not file.name.endswith(".meta.json")
if not file.name.endswith(META_SUFFIX)
]

uploaded_files: List[File] = []
Expand Down Expand Up @@ -119,7 +120,7 @@ async def test_async_upload(
local_file_names: List[str] = [
file.name
for file in Path("./tests/test_data/msmarco.10").glob("*.txt")
if not file.name.endswith(".meta.json")
if not file.name.endswith(META_SUFFIX)
]
# Check the metadata was uploaded correctly
uploaded_files: List[File] = []
Expand Down Expand Up @@ -164,7 +165,7 @@ async def test_async_upload_multiple_file_types(
local_file_names: List[str] = [
file.name
for file in Path("./tests/test_data/multiple_file_types").glob("*")
if not file.name.endswith(".meta.json")
if not file.name.endswith(META_SUFFIX)
]

uploaded_files: List[File] = []
Expand Down

0 comments on commit 635ba6b

Please sign in to comment.