From fd8619a74cc4f82ed519adf4a78be8651bbddb9b Mon Sep 17 00:00:00 2001 From: Himanshi-Mirosoft Date: Wed, 24 Jul 2024 19:39:17 +0530 Subject: [PATCH] fix: Type HTM upload issue fix on explore page under admin (#1172) Co-authored-by: Himanshi Agrawal Co-authored-by: Roopan P M --- .../batch/utilities/helpers/config/config_helper.py | 1 + .../batch/utilities/helpers/config/default.json | 11 +++++++++++ code/tests/utilities/helpers/test_config_helper.py | 9 +++++++-- 3 files changed, 19 insertions(+), 2 deletions(-) diff --git a/code/backend/batch/utilities/helpers/config/config_helper.py b/code/backend/batch/utilities/helpers/config/config_helper.py index f6ae638f7..7ad35bbe2 100644 --- a/code/backend/batch/utilities/helpers/config/config_helper.py +++ b/code/backend/batch/utilities/helpers/config/config_helper.py @@ -63,6 +63,7 @@ def get_available_document_types(self) -> list[str]: "pdf", "url", "html", + "htm", "md", "jpeg", "jpg", diff --git a/code/backend/batch/utilities/helpers/config/default.json b/code/backend/batch/utilities/helpers/config/default.json index c3100dff8..dfdbd2201 100644 --- a/code/backend/batch/utilities/helpers/config/default.json +++ b/code/backend/batch/utilities/helpers/config/default.json @@ -74,6 +74,17 @@ "strategy": "web" } }, + { + "document_type": "htm", + "chunking": { + "strategy": "layout", + "size": 500, + "overlap": 100 + }, + "loading": { + "strategy": "web" + } + }, { "document_type": "docx", "chunking": { diff --git a/code/tests/utilities/helpers/test_config_helper.py b/code/tests/utilities/helpers/test_config_helper.py index c5363382e..8ddd6bb03 100644 --- a/code/tests/utilities/helpers/test_config_helper.py +++ b/code/tests/utilities/helpers/test_config_helper.py @@ -211,6 +211,11 @@ def test_default_config_when_use_advanced_image_processing(env_helper_mock): "chunking": expected_chunking, "loading": {"strategy": "web"}, }, + { + "document_type": "htm", + "chunking": expected_chunking, + "loading": {"strategy": "web"}, + }, { "document_type": "docx", "chunking": expected_chunking, @@ -409,7 +414,7 @@ def test_get_available_document_types(config: Config): # then assert sorted(document_types) == sorted( - ["txt", "pdf", "url", "html", "md", "jpeg", "jpg", "png", "docx"] + ["txt", "pdf", "url", "html", "htm", "md", "jpeg", "jpg", "png", "docx"] ) @@ -424,7 +429,7 @@ def test_get_available_document_types_when_advanced_image_processing_enabled( # then assert sorted(document_types) == sorted( - ["txt", "pdf", "url", "html", "md", "jpeg", "jpg", "png", "docx", "tiff", "bmp"] + ["txt", "pdf", "url", "html", "htm", "md", "jpeg", "jpg", "png", "docx", "tiff", "bmp"] )