fix: lower file type comparison #185

Merged: 10 commits, May 3, 2024
Changes from 6 commits
25 changes: 12 additions & 13 deletions deepset_cloud_sdk/_service/files_service.py
@@ -1,4 +1,5 @@
"""Module for all file-related operations."""

+ from __future__ import annotations

import asyncio
@@ -35,6 +36,7 @@
logger = structlog.get_logger(__name__)

SUPPORTED_TYPE_SUFFIXES = [".csv", ".docx", ".html", ".json", ".md", ".txt", ".pdf", ".pptx", ".xlsx", ".xml"]
+ META_SUFFIX = ".meta.json"
DIRECT_UPLOAD_THRESHOLD = 20


@@ -197,11 +199,11 @@ async def upload_file_paths(
_raw_files = [
path
for path in file_paths
- if path.suffix.lower() in SUPPORTED_TYPE_SUFFIXES and not path.name.endswith(".meta.json")
+ if path.suffix in SUPPORTED_TYPE_SUFFIXES and not path.name.endswith(META_SUFFIX)
]
for file_path in _raw_files:
meta: Dict[str, Any] = {}
- meta_path = Path(str(file_path) + ".meta.json")
+ meta_path = Path(str(file_path) + META_SUFFIX)
if meta_path in file_paths:
with meta_path.open("r") as meta_file:
meta = json.loads(meta_file.read())
@@ -240,7 +242,7 @@ async def upload_file_paths(

# wait for ingestion to finish
if blocking:
- total_files = len(list(filter(lambda x: not os.path.basename(x).endswith(".meta.json"), file_paths)))
+ total_files = len(list(filter(lambda x: not os.path.basename(x).endswith(META_SUFFIX), file_paths)))
await self._wait_for_finished(
workspace_name=workspace_name,
session_id=upload_session.session_id,
@@ -282,24 +284,24 @@ def _validate_file_paths(file_paths: List[Path]) -> None:
"""
logger.info("Validating file paths and metadata.")
for file_path in file_paths:
- if file_path.suffix.lower() not in SUPPORTED_TYPE_SUFFIXES:
+ if file_path.suffix not in SUPPORTED_TYPE_SUFFIXES:
raise ValueError(
f"Invalid file extension: {file_path.suffix}. Refer to the list of supported file types in `SUPPORTED_TYPE_SUFFIXES`. "
"Metadata files should have the `.meta.json` extension."
)
meta_file_names = list(
map(
lambda fp: os.path.basename(fp),
- [file_path for file_path in file_paths if str(file_path).lower().endswith(".meta.json")],
+ [file_path for file_path in file_paths if str(file_path).endswith(META_SUFFIX)],
)
)
file_names = list(map(lambda fp: os.path.basename(fp), file_paths))
- file_name_set = set(filter(lambda fn: not fn.lower().endswith(".meta.json"), file_names))
+ file_name_set = set(filter(lambda fn: not fn.endswith(META_SUFFIX), file_names))

not_mapped_meta_files = [
meta_file_name
for meta_file_name in meta_file_names
- if meta_file_name.lower().split(".meta.json")[0] not in file_name_set
+ if meta_file_name.split(META_SUFFIX)[0] not in file_name_set
]

if len(not_mapped_meta_files) > 0:
@@ -341,7 +343,7 @@ def _get_allowed_file_types(desired_file_types: Optional[List[Any]]) -> List[str
return SUPPORTED_TYPE_SUFFIXES

desired_types_processed: Set[str] = {
- str(file_type).lower() if str(file_type).startswith(".") else f".{str(file_type).lower()}"
+ str(file_type) if str(file_type).startswith(".") else f".{str(file_type)}"
for file_type in desired_file_types
}
allowed_types: Set[str] = {
@@ -362,14 +364,11 @@ def _preprocess_paths(
allowed_file_types: List[str] = FilesService._get_allowed_file_types(desired_file_types)
allowed_meta_types: Tuple = tuple(f"{file_type}.meta.json" for file_type in allowed_file_types)

- meta_file_path = [
- path for path in all_files if path.is_file() and str(path).lower().endswith(allowed_meta_types)
- ]
+ meta_file_path = [path for path in all_files if path.is_file() and str(path).endswith(allowed_meta_types)]
file_paths = [
path
for path in all_files
- if path.is_file()
- and (path.suffix.lower() in allowed_file_types and not str(path).lower().endswith(".meta.json"))
+ if path.is_file() and (path.suffix in allowed_file_types and not str(path).endswith(META_SUFFIX))
]
combined_paths = meta_file_path + file_paths

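Taken together, the hunks above swap the literal ".meta.json" for the new META_SUFFIX constant and drop the .lower() calls from the suffix and file-name comparisons, so data files whose names differ only in case keep their own metadata. Below is a minimal standalone sketch of the mismatch the new caps test data exercises; it mirrors the comparison visible in the hunks but is not the SDK's code, and the variable names are illustrative.

```python
from pathlib import Path

META_SUFFIX = ".meta.json"

# A pair that is valid on disk, as in tests/test_data/multiple_file_types_caps.
paths = [Path("File00.txt"), Path("File00.txt.meta.json")]

file_names = {p.name for p in paths if not p.name.endswith(META_SUFFIX)}
meta_names = [p.name for p in paths if p.name.endswith(META_SUFFIX)]

# Old comparison: only the metadata name is lowercased, so the valid pair looks unmatched.
old_unmatched = [m for m in meta_names if m.lower().split(META_SUFFIX)[0] not in file_names]
print(old_unmatched)  # ['File00.txt.meta.json']

# New comparison: case is preserved on both sides, so the pair matches.
new_unmatched = [m for m in meta_names if m.split(META_SUFFIX)[0] not in file_names]
print(new_unmatched)  # []
```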
43 changes: 31 additions & 12 deletions tests/integration/service/test_integration_files_service.py
@@ -10,6 +10,7 @@
from deepset_cloud_sdk._api.files import File
from deepset_cloud_sdk._api.upload_sessions import WriteMode
from deepset_cloud_sdk._service.files_service import (
+ META_SUFFIX,
SUPPORTED_TYPE_SUFFIXES,
DeepsetCloudFile,
FilesService,
@@ -37,7 +38,7 @@ async def test_direct_upload_path(self, integration_config: CommonConfig, worksp
names_of_uploaded_files = [
file.name
for file in Path("./tests/test_data/msmarco.10").glob("*.txt")
if not file.name.endswith(".meta.json")
if not file.name.endswith(META_SUFFIX)
]
# Check the metadata was uploaded correctly
files: List[File] = []
@@ -68,15 +69,15 @@ async def test_direct_upload_path_multiple_file_types(
timeout_s=timeout,
desired_file_types=SUPPORTED_TYPE_SUFFIXES,
)
- assert result.total_files == 9
- assert result.successful_upload_count == 9
+ assert result.total_files == 10
+ assert result.successful_upload_count == 10
assert result.failed_upload_count == 0
assert len(result.failed) == 0

local_file_names: List[str] = [
file.name
for file in Path("./tests/test_data/multiple_file_types").glob("*")
if not file.name.endswith(".meta.json")
if not file.name.endswith(META_SUFFIX)
]

uploaded_files: List[File] = []
@@ -119,7 +120,7 @@ async def test_async_upload(
local_file_names: List[str] = [
file.name
for file in Path("./tests/test_data/msmarco.10").glob("*.txt")
if not file.name.endswith(".meta.json")
if not file.name.endswith(META_SUFFIX)
]
# Check the metadata was uploaded correctly
uploaded_files: List[File] = []
@@ -150,21 +151,25 @@ async def test_async_upload_multiple_file_types(

result = await file_service.upload(
workspace_name=workspace_name,
paths=[Path("./tests/test_data/multiple_file_types")],
paths=[
Path("./tests/test_data/multiple_file_types"),
Path("./tests/test_data/multiple_file_types_caps"), # same file, but with uppercase name
],
blocking=True,
write_mode=WriteMode.KEEP,
timeout_s=timeout,
desired_file_types=SUPPORTED_TYPE_SUFFIXES,
)
- assert result.total_files == 18
- assert result.successful_upload_count == 18
+ assert result.total_files == 22
+ assert result.successful_upload_count == 22
assert result.failed_upload_count == 0
assert len(result.failed) == 0

local_file_names: List[str] = [
file.name
for file in Path("./tests/test_data/multiple_file_types").glob("*")
if not file.name.endswith(".meta.json")
for file in list(Path("./tests/test_data/multiple_file_types").glob("*"))
+ list(Path("./tests/test_data/multiple_file_types_caps").glob("*"))
if not file.name.endswith(META_SUFFIX)
]

uploaded_files: List[File] = []
@@ -175,8 +180,13 @@
):
uploaded_files += file_batch

- # We already uploaded the same set of files in a previous test, so we expect files to exist twice
- for local_file_name in local_file_names:
+ # We already uploaded the same set of files in a previous test, so we expect those files to exist twice, except for the files from the caps folder, which are uploaded here for the first time.
+ # Therefore we only use the "multiple_file_types" folder to check for duplicates.
+ for local_file_name in [
+ file.name
+ for file in list(Path("./tests/test_data/multiple_file_types").glob("*"))
+ if not file.name.endswith(META_SUFFIX)
+ ]:
count = sum(1 for uploaded_file in uploaded_files if uploaded_file.name == local_file_name)
assert count >= 2, f"File '{local_file_name}' does not exist twice in uploaded files"

@@ -186,6 +196,15 @@ async def test_async_upload_multiple_file_types(
file.meta.get("source") == "multiple file types"
), f"Metadata was not uploaded correctly for file '{file.name}': {file.meta}"

+ # Make sure that the metadata for File00.txt and file00.txt is mapped correctly
+ File00_metadata = [file.meta for file in uploaded_files if file.name == "File00.txt"]
+ assert len(File00_metadata) == 1
+ assert File00_metadata[0].get("file_name_duplicate_check") == "File00.txt"
+
+ file00_metadata = [file.meta for file in uploaded_files if file.name == "file00.txt"]
+ assert len(file00_metadata) == 1
+ assert file00_metadata[0].get("file_name_duplicate_check") == "file00.txt"

async def test_upload_texts(self, integration_config: CommonConfig, workspace_name: str) -> None:
async with FilesService.factory(integration_config) as file_service:
files = [
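For reference, the integration tests above drive the upload through the public service API. A minimal usage sketch under the same assumptions as the tests (an existing workspace and a valid config object; the timeout value is a placeholder, not taken from the diff):

```python
from pathlib import Path

from deepset_cloud_sdk._api.upload_sessions import WriteMode
from deepset_cloud_sdk._service.files_service import SUPPORTED_TYPE_SUFFIXES, FilesService


async def upload_folder(config, workspace_name: str) -> None:
    # Mirrors the call in test_async_upload_multiple_file_types: upload every supported
    # file type from a folder and block until ingestion has finished.
    async with FilesService.factory(config) as file_service:
        result = await file_service.upload(
            workspace_name=workspace_name,
            paths=[Path("./tests/test_data/multiple_file_types")],
            blocking=True,
            write_mode=WriteMode.KEEP,
            timeout_s=300,  # placeholder timeout, not from the diff
            desired_file_types=SUPPORTED_TYPE_SUFFIXES,
        )
        print(result.total_files, result.successful_upload_count, result.failed_upload_count)
```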
1 change: 1 addition & 0 deletions tests/test_data/multiple_file_types/file00.txt
@@ -0,0 +1 @@
Some text as a Textfile of file file00.txt
4 changes: 4 additions & 0 deletions tests/test_data/multiple_file_types/file00.txt.meta.json
@@ -0,0 +1,4 @@
{
"file_name_duplicate_check": "file00.txt",
"source": "multiple file types"
}
1 change: 1 addition & 0 deletions tests/test_data/multiple_file_types_caps/File00.txt
@@ -0,0 +1 @@
Some text as a Textfile of file File00.txt with capital letters and some small letters.
3 changes: 3 additions & 0 deletions tests/test_data/multiple_file_types_caps/File00.txt.meta.json
@@ -0,0 +1,3 @@
{
"file_name_duplicate_check": "File00.txt"
}
2 changes: 1 addition & 1 deletion tests/unit/service/test_files_service.py
@@ -1079,7 +1079,7 @@ def test_get_allowed_file_types_unsupported_types(self) -> None:
assert file_types == [".pdf"]

def test_get_allowed_file_types_manages_formatting(self) -> None:
desired = [".pdf", "txt", "XML", "PDF"]
desired = [".pdf", "txt", "xml", "XML", "PDF"]
file_types = sorted(FilesService._get_allowed_file_types(desired))
assert file_types == [".pdf", ".txt", ".xml"]

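The comprehension that builds allowed_types is collapsed in this diff view, so the following is only a plausible reconstruction that satisfies the updated unit test: desired types are dot-prefixed without changing case (as in the visible hunk) and then checked against SUPPORTED_TYPE_SUFFIXES. The real implementation may instead match case-insensitively; this is a sketch, not the SDK's code.

```python
from typing import Any, List, Optional, Set

SUPPORTED_TYPE_SUFFIXES = [".csv", ".docx", ".html", ".json", ".md", ".txt", ".pdf", ".pptx", ".xlsx", ".xml"]


def get_allowed_file_types(desired_file_types: Optional[List[Any]]) -> List[str]:
    if desired_file_types is None or len(desired_file_types) == 0:
        return SUPPORTED_TYPE_SUFFIXES
    # Normalize to ".suffix" form without changing case, as in the visible hunk.
    processed: Set[str] = {
        str(t) if str(t).startswith(".") else f".{str(t)}" for t in desired_file_types
    }
    # Keep only suffixes that appear in the supported list; entries such as ".XML" or ".PDF"
    # would be dropped under this (assumed) case-sensitive check.
    return sorted(t for t in processed if t in SUPPORTED_TYPE_SUFFIXES)


assert get_allowed_file_types([".pdf", "txt", "xml", "XML", "PDF"]) == [".pdf", ".txt", ".xml"]
```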