Skip to content

Commit

Permalink
Merge branch 'main' into feat/doughnut-http-endpoint-1
Browse files Browse the repository at this point in the history
  • Loading branch information
edknv authored Feb 7, 2025
2 parents db68afb + ca16e39 commit e9fa05e
Show file tree
Hide file tree
Showing 8 changed files with 2,169 additions and 2 deletions.
2 changes: 1 addition & 1 deletion .pre-commit-config.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@ repos:
rev: v5.0.0
hooks:
- id: trailing-whitespace
exclude: '^(docs)'
exclude: '^(docs|data)'
- id: end-of-file-fixer
- id: check-added-large-files
args: [-- maxkb=1500]
Expand Down
2 changes: 1 addition & 1 deletion client/src/nv_ingest_client/client/client.py
Original file line number Diff line number Diff line change
Expand Up @@ -418,7 +418,7 @@ def fetch_with_retries(job_id: str):
try:
result, _ = handle_future_result(future, timeout=timeout)
# Append a tuple of (result data, job_id). (Using result.get("data") if result is valid.)
results.append((result.get("data") if result else None, job_id))
results.append(result.get("data"))
# Run the callback if provided and the result is valid
if completion_callback and result:
completion_callback(result, job_id)
Expand Down
23 changes: 23 additions & 0 deletions client/src/nv_ingest_client/util/milvus.py
Original file line number Diff line number Diff line change
Expand Up @@ -370,6 +370,28 @@ def _pull_text(element, enable_text: bool, enable_charts: bool, enable_tables: b
return text


def _insert_location_into_content_metadata(element, enable_charts: bool, enable_tables: bool, enable_images: bool):
location = max_dimensions = None
if element["document_type"] == "structured":
location = element["metadata"]["table_metadata"]["table_location"]
max_dimensions = element["metadata"]["table_metadata"]["table_location_max_dimensions"]
if element["metadata"]["content_metadata"]["subtype"] == "chart" and not enable_charts:
location = max_dimensions = None
elif element["metadata"]["content_metadata"]["subtype"] == "table" and not enable_tables:
location = max_dimensions = None
elif element["document_type"] == "image" and enable_images:
location = element["metadata"]["image_metadata"]["image_location"]
max_dimensions = element["metadata"]["image_metadata"]["image_location_max_dimensions"]
if (not location) and (element["document_type"] != "text"):
source_name = element["metadata"]["source_metadata"]["source_name"]
pg_num = element["metadata"]["content_metadata"]["page_number"]
doc_type = element["document_type"]
logger.error(f"failed to find location for entity: {source_name} page: {pg_num} type: {doc_type}")
location = max_dimensions = None
element["metadata"]["content_metadata"]["location"] = location
element["metadata"]["content_metadata"]["max_dimensions"] = max_dimensions


def write_records_minio(
records,
writer: RemoteBulkWriter,
Expand Down Expand Up @@ -415,6 +437,7 @@ def write_records_minio(
for result in records:
for element in result:
text = _pull_text(element, enable_text, enable_charts, enable_tables, enable_images)
_insert_location_into_content_metadata(element, enable_charts, enable_tables, enable_images)
if text:
if sparse_model is not None:
writer.append_row(record_func(text, element, sparse_model.encode_documents([text])))
Expand Down
269 changes: 269 additions & 0 deletions data/charts_with_page_num_fixed.csv

Large diffs are not rendered by default.

236 changes: 236 additions & 0 deletions data/table_queries_cleaned_235.csv

Large diffs are not rendered by default.

490 changes: 490 additions & 0 deletions data/text_query_answer_gt_page.csv

Large diffs are not rendered by default.

Loading

0 comments on commit e9fa05e

Please sign in to comment.