Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Feature/upload multiple files at once to bytes #3476

Merged
merged 22 commits into main from feature/upload-multiple-files-at-once-to-bytes
Sep 10, 2024
Merged
Show file tree
Hide file tree
Changes from 13 commits
Commits
Show all changes
22 commits
Select commit Hold shift + click to select a range
80edfcd
WIP multipart upload
Donnype Aug 28, 2024
ad59d77
WIP multipart upload
Donnype Aug 28, 2024
8a18e39
Merge branch 'feature/upload-multiple-files-at-once-to-bytes' of gith…
Donnype Aug 29, 2024
5758530
WIP multipart upload
Donnype Aug 28, 2024
5d72fc0
Merge branch 'feature/upload-multiple-files-at-once-to-bytes' of gith…
Donnype Aug 29, 2024
edff901
Fix the test (client) and return raw ids based on the content type
Donnype Aug 29, 2024
e7fcf3a
Update other clients as well
Donnype Aug 29, 2024
ff65977
WIP: better safeguards for mime types
Donnype Aug 29, 2024
56730ae
Use boefjes API schema for the multiple raw file upload functionality
Donnype Sep 5, 2024
a785b56
Add comment about file_name usage
Donnype Sep 5, 2024
281b4bf
Fix integration test: order of mime-types change randomly
Donnype Sep 5, 2024
0e1cd58
Merge branch 'main' into feature/upload-multiple-files-at-once-to-bytes
Donnype Sep 5, 2024
2056796
Fix integration tests by making the mime-types field a set in Bytes a…
Donnype Sep 5, 2024
16f2117
Fix integration tests part 2
Donnype Sep 5, 2024
f6060af
Merge branch 'main' into feature/upload-multiple-files-at-once-to-bytes
Donnype Sep 6, 2024
4c53b4c
Merge branch 'main' into feature/upload-multiple-files-at-once-to-bytes
Donnype Sep 9, 2024
698053e
A set is not JSON serializable
Donnype Sep 9, 2024
7e250df
Merge branch 'main' into feature/upload-multiple-files-at-once-to-bytes
ammar92 Sep 10, 2024
e6bc372
Use httpx codes and the route signature to define the JSON return val…
Donnype Sep 10, 2024
bcd2de1
Add integration test for uploading multiple raw files
Donnype Sep 10, 2024
7ad0fdb
Fix integration test inconsistency with the order of the mime-types
Donnype Sep 10, 2024
d182714
Merge branch 'main' into feature/upload-multiple-files-at-once-to-bytes
Donnype Sep 10, 2024
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
23 changes: 16 additions & 7 deletions boefjes/boefjes/clients/bytes_client.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
import typing
import uuid
from base64 import b64encode
from collections.abc import Callable, Set
from functools import wraps
from typing import Any
Expand Down Expand Up @@ -99,17 +100,25 @@ def get_normalizer_meta(self, normalizer_meta_id: uuid.UUID) -> NormalizerMeta:

@retry_with_login
def save_raw(self, boefje_meta_id: str, raw: str | bytes, mime_types: Set[str] = frozenset()) -> UUID:
headers = {"content-type": "application/octet-stream"}
headers.update(self.headers)
file_name = "raw" # The name provides a key for all ids returned, so this is arbitrary as we only upload 1 file
Donnype marked this conversation as resolved.
Show resolved Hide resolved

response = self._session.post(
"/bytes/raw",
content=raw,
headers=headers,
params={"mime_types": list(mime_types), "boefje_meta_id": boefje_meta_id},
json={
"files": [
{
"name": file_name,
"content": b64encode(raw if isinstance(raw, bytes) else raw.encode()).decode(),
"tags": mime_types,
}
]
},
headers=self.headers,
params={"boefje_meta_id": str(boefje_meta_id)},
)

self._verify_response(response)
return UUID(response.json()["id"])

return UUID(response.json()[file_name])

@retry_with_login
def get_raw(self, raw_data_id: str) -> bytes:
Expand Down
14 changes: 12 additions & 2 deletions bytes/bytes/api/models.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,17 @@
from pydantic import BaseModel
from pydantic import BaseModel, Field


class RawResponse(BaseModel):
status: str
message: str
id: str | None = None
ids: list[str] | None = None


class File(BaseModel):
name: str
content: str = Field(..., contentEncoding="base64")
tags: list[str] = Field(default_factory=list)


class BoefjeOutput(BaseModel):
files: list[File] = Field(default_factory=list)
89 changes: 53 additions & 36 deletions bytes/bytes/api/router.py
Original file line number Diff line number Diff line change
@@ -1,13 +1,13 @@
from base64 import b64decode
from uuid import UUID

import structlog
from asgiref.sync import async_to_sync
from cachetools import TTLCache, cached
from fastapi import APIRouter, Depends, HTTPException, Query, Request
from fastapi import APIRouter, Depends, HTTPException, Query
from fastapi.responses import Response
from starlette.responses import JSONResponse

from bytes.api.models import RawResponse
from bytes.api.models import BoefjeOutput
from bytes.auth import authenticate_token
from bytes.config import get_settings
from bytes.database.sql_meta_repository import MetaIntegrityError, ObjectNotFoundException, create_meta_data_repository
Expand Down Expand Up @@ -148,42 +148,59 @@ def get_normalizer_meta(

@router.post("/raw", tags=[RAW_TAG])
def create_raw(
request: Request,
boefje_meta_id: UUID,
mime_types: list[str] | None = Query(None),
boefje_output: BoefjeOutput,
meta_repository: MetaDataRepository = Depends(create_meta_data_repository),
event_manager: EventManager = Depends(create_event_manager),
) -> RawResponse:
parsed_mime_types = [] if mime_types is None else [MimeType(value=mime_type) for mime_type in mime_types]

try:
meta = meta_repository.get_boefje_meta_by_id(boefje_meta_id)

if meta_repository.has_raw(meta, parsed_mime_types):
return RawResponse(status="success", message="Raw data already present")

# FastAPI/starlette only has async versions of the Request methods, but
# all our code is sync, so we wrap it in async_to_sync.
data = async_to_sync(request.body)()

raw_data = RawData(value=data, boefje_meta=meta, mime_types=parsed_mime_types)
with meta_repository:
raw_id = meta_repository.save_raw(raw_data)

event = RawFileReceived(
organization=meta.organization,
raw_data=RawDataMeta(
id=raw_id,
boefje_meta=raw_data.boefje_meta,
mime_types=raw_data.mime_types,
),
)
event_manager.publish(event)
except Exception as error:
logger.exception("Error saving raw data")
raise HTTPException(status_code=500, detail="Could not save raw data") from error

return RawResponse(status="success", message="Raw data saved", id=raw_id)
) -> JSONResponse:
Donnype marked this conversation as resolved.
Show resolved Hide resolved
"""Parse all the raw files from the request and return their ids. The files are processed in the order in which they
appear in the request data; because the `name` field is assumed to be unique, the ids are returned as a mapping of
file name to id."""

raw_ids = {}
mime_types_by_id = {
raw.id: set(raw.mime_types) for raw in meta_repository.get_raw(RawDataFilter(boefje_meta_id=boefje_meta_id))
}
all_parsed_mime_types = list(mime_types_by_id.values())

for raw in boefje_output.files:
parsed_mime_types = {MimeType(value=x) for x in raw.tags}

if parsed_mime_types in mime_types_by_id.values():
# Set the id for this file using the precomputed dict that maps existing primary keys to the mime-type set.
raw_ids[raw.name] = str(
list(mime_types_by_id.keys())[list(mime_types_by_id.values()).index(parsed_mime_types)]
)
continue

if parsed_mime_types in all_parsed_mime_types:
raise HTTPException(status_code=400, detail="Content types do not define unique sets of mime types.")
Donnype marked this conversation as resolved.
Show resolved Hide resolved

try:
meta = meta_repository.get_boefje_meta_by_id(boefje_meta_id)
raw_data = RawData(value=b64decode(raw.content.encode()), boefje_meta=meta, mime_types=parsed_mime_types)

with meta_repository:
raw_id = meta_repository.save_raw(raw_data)
raw_ids[raw.name] = str(raw_id)

all_parsed_mime_types.append(parsed_mime_types)

event = RawFileReceived(
organization=meta.organization,
raw_data=RawDataMeta(
id=raw_id,
boefje_meta=raw_data.boefje_meta,
mime_types=raw_data.mime_types,
),
)
event_manager.publish(event)
except Exception as error:
logger.exception("Error saving raw data")
raise HTTPException(status_code=500, detail="Could not save raw data") from error
Donnype marked this conversation as resolved.
Show resolved Hide resolved

all_parsed_mime_types.append(parsed_mime_types)

return JSONResponse(raw_ids)
Donnype marked this conversation as resolved.
Show resolved Hide resolved


@router.get("/raw/{raw_id}", tags=[RAW_TAG])
Expand Down
7 changes: 5 additions & 2 deletions bytes/bytes/models.py
Original file line number Diff line number Diff line change
Expand Up @@ -38,6 +38,9 @@ def _validate_timezone_aware_datetime(value: datetime) -> datetime:
class MimeType(BaseModel):
value: str

def __hash__(self):
return hash(self.value)
Donnype marked this conversation as resolved.
Show resolved Hide resolved


class Job(BaseModel):
id: UUID
Expand Down Expand Up @@ -69,7 +72,7 @@ class RawDataMeta(BaseModel):

id: UUID
boefje_meta: BoefjeMeta
mime_types: list[MimeType] = Field(default_factory=list)
mime_types: set[MimeType] = Field(default_factory=set)

# These are set once the raw is saved
secure_hash: SecureHash | None = None
Expand All @@ -80,7 +83,7 @@ class RawDataMeta(BaseModel):
class RawData(BaseModel):
value: bytes
boefje_meta: BoefjeMeta
mime_types: list[MimeType] = Field(default_factory=list)
mime_types: set[MimeType] = Field(default_factory=set)

# These are set once the raw is saved
secure_hash: SecureHash | None = None
Expand Down
21 changes: 13 additions & 8 deletions bytes/tests/client.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
import typing
from base64 import b64encode
from collections.abc import Callable
from functools import wraps
from typing import Any
Expand Down Expand Up @@ -126,19 +127,23 @@ def save_raw(self, boefje_meta_id: UUID, raw: bytes, mime_types: list[str] | Non
if not mime_types:
mime_types = []

headers = {"content-type": "application/octet-stream"}

file_name = "raw" # The name provides a key for all ids returned, so this is arbitrary as we only upload 1 file
response = self.client.post(
"/bytes/raw",
content=raw,
headers=headers,
params={"mime_types": mime_types, "boefje_meta_id": str(boefje_meta_id)},
json={
"files": [
{
"name": file_name,
"content": b64encode(raw).decode(),
"tags": mime_types,
}
],
},
params={"boefje_meta_id": str(boefje_meta_id)},
)

self._verify_response(response)
raw_id = response.json()["id"]

return str(raw_id)
return response.json()[file_name]

@retry_with_login
def get_raw(self, raw_id: UUID) -> bytes:
Expand Down
26 changes: 18 additions & 8 deletions bytes/tests/integration/test_bytes_api.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
import uuid
from base64 import b64encode

import httpx
import pytest
Expand Down Expand Up @@ -147,7 +148,7 @@ def test_normalizer_meta(bytes_api_client: BytesAPIClient, event_manager: Rabbit
normalizer_meta.raw_data.hash_retrieval_link = retrieved_normalizer_meta.raw_data.hash_retrieval_link
normalizer_meta.raw_data.signing_provider_url = retrieved_normalizer_meta.raw_data.signing_provider_url

assert normalizer_meta.dict() == retrieved_normalizer_meta.dict()
assert normalizer_meta.model_dump_json() == retrieved_normalizer_meta.model_dump_json()


def test_filtered_normalizer_meta(bytes_api_client: BytesAPIClient) -> None:
Expand Down Expand Up @@ -255,21 +256,30 @@ def test_save_raw_no_mime_types(bytes_api_client: BytesAPIClient) -> None:
boefje_meta = get_boefje_meta(meta_id=uuid.uuid4())
bytes_api_client.save_boefje_meta(boefje_meta)

headers = {"content-type": "application/octet-stream"}
bytes_api_client.login()
headers.update(bytes_api_client.client.headers)

raw_url = f"{bytes_api_client.client.base_url}/bytes/raw"

raw = b"second test 123456"
file_name = "raw"
response = httpx.post(
raw_url, content=raw, headers=headers, params={"boefje_meta_id": str(boefje_meta.id)}, timeout=30
raw_url,
json={
"files": [
{
"name": file_name,
"content": b64encode(raw).decode(),
"tags": [],
}
]
},
headers=bytes_api_client.client.headers,
params={"boefje_meta_id": str(boefje_meta.id)},
)

assert response.status_code == 200

get_raw_without_mime_type_response = httpx.get(
f"{raw_url}/{response.json().get('id')}", headers=bytes_api_client.client.headers, timeout=30
f"{raw_url}/{response.json()[file_name]}", headers=bytes_api_client.client.headers, timeout=30
)

assert get_raw_without_mime_type_response.status_code == 200
Expand All @@ -293,13 +303,13 @@ def test_raw_mimes(bytes_api_client: BytesAPIClient) -> None:
)
)
assert len(retrieved_raws) == 1
assert retrieved_raws[0]["mime_types"] == [{"value": value} for value in mime_types]
assert {x["value"] for x in retrieved_raws[0]["mime_types"]} == set(mime_types)

retrieved_raws = bytes_api_client.get_raws(
RawDataFilter(boefje_meta_id=boefje_meta.id, normalized=False, mime_types=[MimeType(value="text/html")])
)
assert len(retrieved_raws) == 1
assert retrieved_raws[0]["mime_types"] == [{"value": value} for value in mime_types]
assert {x["value"] for x in retrieved_raws[0]["mime_types"]} == set(mime_types)

retrieved_raws = bytes_api_client.get_raws(
RawDataFilter(boefje_meta_id=boefje_meta.id, normalized=False, mime_types=[MimeType(value="bad/mime")])
Expand Down
6 changes: 3 additions & 3 deletions bytes/tests/integration/test_migrations.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,15 +13,15 @@ def test_clean_mime_types(meta_repository: SQLMetaDataRepository) -> None:
meta_repository.save_boefje_meta(boefje_meta)

raw = get_raw_data()
raw.mime_types.append(MimeType(value=raw.boefje_meta.boefje.id))
raw.mime_types.add(MimeType(value=raw.boefje_meta.boefje.id))
raw_id_1 = meta_repository.save_raw(raw)

raw.mime_types.append(
raw.mime_types.add(
MimeType(value=f"boefje/{raw.boefje_meta.boefje.id}-ce293f79fd3c809a300a2837bb1da4f7115fc034a1f78")
)
raw_id_2 = meta_repository.save_raw(raw)

raw.mime_types.append(
raw.mime_types.add(
MimeType(value=f"boefje/{raw.boefje_meta.boefje.id}-ba293f79fd3c809a300a2837bb1da4f7115fc034a1f78")
)
raw_id_3 = meta_repository.save_raw(raw)
Expand Down
19 changes: 15 additions & 4 deletions rocky/rocky/bytes_client.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
import uuid
from base64 import b64encode
from collections.abc import Set
from datetime import datetime, timezone

Expand Down Expand Up @@ -113,15 +114,25 @@ def _save_normalizer_meta(self, normalizer_meta: NormalizerMeta) -> None:
response.raise_for_status()

def _save_raw(self, boefje_meta_id: uuid.UUID, raw: bytes, mime_types: Set[str] = frozenset()) -> str:
file_name = "raw" # The name provides a key for all ids returned, so this is arbitrary as we only upload 1 file

response = self.session.post(
"/bytes/raw",
content=raw,
headers={"content-type": "application/octet-stream"},
params={"mime_types": list(mime_types), "boefje_meta_id": str(boefje_meta_id)},
json={
"files": [
{
"name": file_name,
"content": b64encode(raw).decode(),
"tags": mime_types,
}
]
},
params={"boefje_meta_id": str(boefje_meta_id)},
)

response.raise_for_status()
return response.json()["id"]

return response.json()[file_name]

def get_raw(self, raw_id: str) -> bytes:
# Note: we assume organization permissions are handled before requesting raw data.
Expand Down
Loading