Skip to content

Commit

Permalink
fix: lower file type comparison (#185)
Browse files Browse the repository at this point in the history
* fix: lower file type comparison

* fix: unit tests

* tests: integration tests for matching with capital letters

* fix:

* fix

* fix

* fix

* fix

* fix: tests

* fix
  • Loading branch information
ArzelaAscoIi authored May 3, 2024
1 parent 4b993db commit 0f55cbf
Show file tree
Hide file tree
Showing 7 changed files with 52 additions and 26 deletions.
25 changes: 12 additions & 13 deletions deepset_cloud_sdk/_service/files_service.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
"""Module for all file-related operations."""

from __future__ import annotations

import asyncio
Expand Down Expand Up @@ -35,6 +36,7 @@
logger = structlog.get_logger(__name__)

SUPPORTED_TYPE_SUFFIXES = [".csv", ".docx", ".html", ".json", ".md", ".txt", ".pdf", ".pptx", ".xlsx", ".xml"]
META_SUFFIX = ".meta.json"
DIRECT_UPLOAD_THRESHOLD = 20


Expand Down Expand Up @@ -197,11 +199,11 @@ async def upload_file_paths(
_raw_files = [
path
for path in file_paths
if path.suffix.lower() in SUPPORTED_TYPE_SUFFIXES and not path.name.endswith(".meta.json")
if path.suffix in SUPPORTED_TYPE_SUFFIXES and not path.name.endswith(META_SUFFIX)
]
for file_path in _raw_files:
meta: Dict[str, Any] = {}
meta_path = Path(str(file_path) + ".meta.json")
meta_path = Path(str(file_path) + META_SUFFIX)
if meta_path in file_paths:
with meta_path.open("r") as meta_file:
meta = json.loads(meta_file.read())
Expand Down Expand Up @@ -240,7 +242,7 @@ async def upload_file_paths(

# wait for ingestion to finish
if blocking:
total_files = len(list(filter(lambda x: not os.path.basename(x).endswith(".meta.json"), file_paths)))
total_files = len(list(filter(lambda x: not os.path.basename(x).endswith(META_SUFFIX), file_paths)))
await self._wait_for_finished(
workspace_name=workspace_name,
session_id=upload_session.session_id,
Expand Down Expand Up @@ -282,24 +284,24 @@ def _validate_file_paths(file_paths: List[Path]) -> None:
"""
logger.info("Validating file paths and metadata.")
for file_path in file_paths:
if file_path.suffix.lower() not in SUPPORTED_TYPE_SUFFIXES:
if file_path.suffix not in SUPPORTED_TYPE_SUFFIXES:
raise ValueError(
f"Invalid file extension: {file_path.suffix}. Refer to the list of supported file types in `SUPPORTED_TYPE_SUFFIXES`. "
"Metadata files should have the `.meta.json` extension."
)
meta_file_names = list(
map(
lambda fp: os.path.basename(fp),
[file_path for file_path in file_paths if str(file_path).lower().endswith(".meta.json")],
[file_path for file_path in file_paths if str(file_path).endswith(META_SUFFIX)],
)
)
file_names = list(map(lambda fp: os.path.basename(fp), file_paths))
file_name_set = set(filter(lambda fn: not fn.lower().endswith(".meta.json"), file_names))
file_name_set = set(filter(lambda fn: not fn.endswith(META_SUFFIX), file_names))

not_mapped_meta_files = [
meta_file_name
for meta_file_name in meta_file_names
if meta_file_name.lower().split(".meta.json")[0] not in file_name_set
if meta_file_name.split(META_SUFFIX)[0] not in file_name_set
]

if len(not_mapped_meta_files) > 0:
Expand Down Expand Up @@ -341,7 +343,7 @@ def _get_allowed_file_types(desired_file_types: Optional[List[Any]]) -> List[str
return SUPPORTED_TYPE_SUFFIXES

desired_types_processed: Set[str] = {
str(file_type).lower() if str(file_type).startswith(".") else f".{str(file_type).lower()}"
str(file_type) if str(file_type).startswith(".") else f".{str(file_type)}"
for file_type in desired_file_types
}
allowed_types: Set[str] = {
Expand All @@ -362,14 +364,11 @@ def _preprocess_paths(
allowed_file_types: List[str] = FilesService._get_allowed_file_types(desired_file_types)
allowed_meta_types: Tuple = tuple(f"{file_type}.meta.json" for file_type in allowed_file_types)

meta_file_path = [
path for path in all_files if path.is_file() and str(path).lower().endswith(allowed_meta_types)
]
meta_file_path = [path for path in all_files if path.is_file() and str(path).endswith(allowed_meta_types)]
file_paths = [
path
for path in all_files
if path.is_file()
and (path.suffix.lower() in allowed_file_types and not str(path).lower().endswith(".meta.json"))
if path.is_file() and (path.suffix in allowed_file_types and not str(path).endswith(META_SUFFIX))
]
combined_paths = meta_file_path + file_paths

Expand Down
41 changes: 29 additions & 12 deletions tests/integration/service/test_integration_files_service.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,7 @@
from deepset_cloud_sdk._api.files import File
from deepset_cloud_sdk._api.upload_sessions import WriteMode
from deepset_cloud_sdk._service.files_service import (
META_SUFFIX,
SUPPORTED_TYPE_SUFFIXES,
DeepsetCloudFile,
FilesService,
Expand Down Expand Up @@ -37,7 +38,7 @@ async def test_direct_upload_path(self, integration_config: CommonConfig, worksp
names_of_uploaded_files = [
file.name
for file in Path("./tests/test_data/msmarco.10").glob("*.txt")
if not file.name.endswith(".meta.json")
if not file.name.endswith(META_SUFFIX)
]
# Check the metadata was uploaded correctly
files: List[File] = []
Expand Down Expand Up @@ -68,15 +69,15 @@ async def test_direct_upload_path_multiple_file_types(
timeout_s=timeout,
desired_file_types=SUPPORTED_TYPE_SUFFIXES,
)
assert result.total_files == 9
assert result.successful_upload_count == 9
assert result.total_files == 10
assert result.successful_upload_count == 10
assert result.failed_upload_count == 0
assert len(result.failed) == 0

local_file_names: List[str] = [
file.name
for file in Path("./tests/test_data/multiple_file_types").glob("*")
if not file.name.endswith(".meta.json")
if not file.name.endswith(META_SUFFIX)
]

uploaded_files: List[File] = []
Expand Down Expand Up @@ -119,7 +120,7 @@ async def test_async_upload(
local_file_names: List[str] = [
file.name
for file in Path("./tests/test_data/msmarco.10").glob("*.txt")
if not file.name.endswith(".meta.json")
if not file.name.endswith(META_SUFFIX)
]
# Check the metadata was uploaded correctly
uploaded_files: List[File] = []
Expand Down Expand Up @@ -150,21 +151,25 @@ async def test_async_upload_multiple_file_types(

result = await file_service.upload(
workspace_name=workspace_name,
paths=[Path("./tests/test_data/multiple_file_types")],
paths=[
Path("./tests/test_data/multiple_file_types"),
Path("./tests/test_data/multiple_file_types_caps"), # same file, but with uppercase name
],
blocking=True,
write_mode=WriteMode.KEEP,
timeout_s=timeout,
desired_file_types=SUPPORTED_TYPE_SUFFIXES,
)
assert result.total_files == 18
assert result.successful_upload_count == 18
assert result.total_files == 22
assert result.successful_upload_count == 22
assert result.failed_upload_count == 0
assert len(result.failed) == 0

local_file_names: List[str] = [
file.name
for file in Path("./tests/test_data/multiple_file_types").glob("*")
if not file.name.endswith(".meta.json")
for file in list(Path("./tests/test_data/multiple_file_types").glob("*"))
+ list(Path("./tests/test_data/multiple_file_types_caps").glob("*"))
if not file.name.endswith(META_SUFFIX)
]

uploaded_files: List[File] = []
Expand All @@ -175,8 +180,13 @@ async def test_async_upload_multiple_file_types(
):
uploaded_files += file_batch

# We already uploaded the same set of files in a previous test, so we expect files to exist twice
for local_file_name in local_file_names:
# We already uploaded the same set of files in a previous test, so we expect files to exist twice, except for the
# files in the caps folder. Therefore we just use the "multiple_file_types" folder to check for duplicates
for local_file_name in [
file.name
for file in list(Path("./tests/test_data/multiple_file_types").glob("*"))
if not file.name.endswith(META_SUFFIX)
]:
count = sum(1 for uploaded_file in uploaded_files if uploaded_file.name == local_file_name)
assert count >= 2, f"File '{local_file_name}' does not exist twice in uploaded files"

Expand All @@ -186,6 +196,13 @@ async def test_async_upload_multiple_file_types(
file.meta.get("source") == "multiple file types"
), f"Metadata was not uploaded correctly for file '{file.name}': {file.meta}"

# Make sure that the metadata for File00.txt and file00.txt are mapped correctly
File00_metadata = next((file.meta for file in uploaded_files if file.name == "File00.txt"), None)
assert File00_metadata == {"file_name_duplicate_check": "File00.txt", "source": "multiple file types"}

file00_metadata = next((file.meta for file in uploaded_files if file.name == "file00.txt"), None)
assert file00_metadata == {"file_name_duplicate_check": "file00.txt", "source": "multiple file types"}

async def test_upload_texts(self, integration_config: CommonConfig, workspace_name: str) -> None:
async with FilesService.factory(integration_config) as file_service:
files = [
Expand Down
1 change: 1 addition & 0 deletions tests/test_data/multiple_file_types/file00.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
Some text as a Textfile of file file00.txt
4 changes: 4 additions & 0 deletions tests/test_data/multiple_file_types/file00.txt.meta.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
{
"file_name_duplicate_check": "file00.txt",
"source": "multiple file types"
}
1 change: 1 addition & 0 deletions tests/test_data/multiple_file_types_caps/File00.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
Some text as a Textfile of file File00.txt with capital letters and some small letters.
4 changes: 4 additions & 0 deletions tests/test_data/multiple_file_types_caps/File00.txt.meta.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
{
"file_name_duplicate_check": "File00.txt",
"source": "multiple file types"
}
2 changes: 1 addition & 1 deletion tests/unit/service/test_files_service.py
Original file line number Diff line number Diff line change
Expand Up @@ -1079,7 +1079,7 @@ def test_get_allowed_file_types_unsupported_types(self) -> None:
assert file_types == [".pdf"]

def test_get_allowed_file_types_manages_formatting(self) -> None:
desired = [".pdf", "txt", "XML", "PDF"]
desired = [".pdf", "txt", "xml", "XML", "PDF"]
file_types = sorted(FilesService._get_allowed_file_types(desired))
assert file_types == [".pdf", ".txt", ".xml"]

Expand Down

0 comments on commit 0f55cbf

Please sign in to comment.