Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

CWE-20 - Code Security #632

Merged
merged 2 commits into from
Apr 9, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 2 additions & 1 deletion code/backend/batch/utilities/common/SourceDocument.py
Original file line number Diff line number Diff line change
Expand Up @@ -61,7 +61,8 @@ def from_metadata(
hash_key = f"doc_{hash_key}"
sas_placeholder = (
"_SAS_TOKEN_PLACEHOLDER_"
if "blob.core.windows.net" in parsed_url.netloc
if parsed_url.netloc
and parsed_url.netloc.endswith(".blob.core.windows.net")
ross-p-smith marked this conversation as resolved.
Show resolved Hide resolved
else ""
)
return cls(
Expand Down
66 changes: 58 additions & 8 deletions code/tests/common/test_SourceDocument.py
Original file line number Diff line number Diff line change
Expand Up @@ -75,18 +75,72 @@ def test_get_markdown_url(azure_blob_service_mock):
assert markdown_url == "[A title](http://example.com/path/to/file.txt_12345)"


def test_from_metadata_returns_empty_sas_placeholder():
    """A look-alike blob host must not get the SAS token placeholder appended.

    The URL below contains ``blob.core.windows.net`` only as a leading part of
    an ``example.com`` host name, so the resulting source is expected to be
    the document URL unchanged.
    """
    # Given
    # The storage domain must be the real domain suffix of the host, not a
    # spoofed label in front of another domain (CWE-20).
    chunk_index = 0
    body = "Some content"
    spoofed_url = "http://blob.core.windows.net.example.com/path/to/file.txt"
    expected_title = "/path/to/file.txt"

    # When
    document = SourceDocument.from_metadata(body, {}, spoofed_url, chunk_index)

    # Then
    # The expected id is "doc_" followed by the SHA-1 hex digest of
    # "<scheme>://<netloc><path>_<chunk index>".
    parts = urlparse(spoofed_url)
    base_url = f"{parts.scheme}://{parts.netloc}{parts.path}"
    digest = hashlib.sha1(f"{base_url}_{chunk_index}".encode("utf-8")).hexdigest()
    expected_id = f"doc_{digest}"

    assert document.id == expected_id
    assert document.content == body
    assert document.source == spoofed_url
    assert document.title == expected_title
    assert document.chunk == chunk_index
    assert document.offset is None
    assert document.page_number is None


def test_from_metadata_returns_sas_placeholder():
    """A genuine ``*.blob.core.windows.net`` host gets the SAS placeholder.

    With empty metadata, the resulting source is expected to be the document
    URL (scheme, host and path) followed by ``_SAS_TOKEN_PLACEHOLDER_``.
    """
    # Given
    chunk_index = 0
    body = "Some content"
    blob_url = "http://example.blob.core.windows.net/path/to/file.txt"
    expected_title = "/path/to/file.txt"
    placeholder = "_SAS_TOKEN_PLACEHOLDER_"

    # When
    document = SourceDocument.from_metadata(body, {}, blob_url, chunk_index)

    # Then
    # The expected id is "doc_" followed by the SHA-1 hex digest of
    # "<scheme>://<netloc><path>_<chunk index>".
    parts = urlparse(blob_url)
    base_url = f"{parts.scheme}://{parts.netloc}{parts.path}"
    digest = hashlib.sha1(f"{base_url}_{chunk_index}".encode("utf-8")).hexdigest()
    expected_id = f"doc_{digest}"

    assert document.id == expected_id
    assert document.content == body
    assert document.source == f"{base_url}{placeholder}"
    assert document.title == expected_title
    assert document.chunk == chunk_index
    assert document.offset is None
    assert document.page_number is None


def test_from_metadata():
# Given
content = "Some content"
metadata = {
"id": "1",
"source": "http://example.com/path/to/file.txt_SAS_TOKEN_PLACEHOLDER_",
"source": "http://example.com/path/to/file.txt",
"title": "A title",
"chunk": "A chunk",
"offset": "An offset",
"page_number": "1",
}
document_url = "http://example.com/path/to/file.txt_SAS_TOKEN_PLACEHOLDER_"
document_url = "http://example.com/path/to/file.txt"
idx = 0

# When
Expand All @@ -98,15 +152,11 @@ def test_from_metadata():
filename = parsed_url.path
hash_key = hashlib.sha1(f"{file_url}_{idx}".encode("utf-8")).hexdigest()
hash_key = f"doc_{hash_key}"
sas_placeholder = (
"_SAS_TOKEN_PLACEHOLDER_"
if "blob.core.windows.net" in parsed_url.netloc
else ""
)

expected_source_document = SourceDocument(
id=metadata.get("id", hash_key),
content=content,
source=metadata.get("source", f"{file_url}{sas_placeholder}"),
source=metadata.get("source", document_url),
title=metadata.get("title", filename),
chunk=metadata.get("chunk", idx),
offset=metadata.get("offset"),
Expand Down
Loading