Cleanup dependencies (#8212)

GitOrigin-RevId: 68dadb185be7b52368f19432dfde44ebde9ac3d8
pathwaycom · Feb 13, 2025 · 75a5d18 · 75a5d18
1 parent d802df8
commit 75a5d18
Show file tree

Hide file tree

Showing 2 changed files with 18 additions and 7 deletions.
diff --git a/pyproject.toml b/pyproject.toml
@@ -55,23 +55,23 @@ xpack-llm = [
     "litellm ~= 1.44.28",
     "cohere ~= 5.1.0",
     "tiktoken >= 0.5",
-    "langchain == 0.2.0",
-    "langchain_community == 0.2.0",
+    "langchain ~= 0.2.0",
+    "langchain_community ~= 0.2.0",
     "llama-index-core ~= 0.10.0",
     "llama-index-readers-pathway ~= 0.1.0",
     "llama-index-retrievers-pathway ~= 0.1.3",
     "tenacity != 8.4.0", # langchain dependency, 8.4.0 wheel is broken
     "instructor == 1.2.6",
     "google-generativeai ~= 0.8.4",
     "google-api-core ~= 2.24.1",
-    "docling >= 2.15, <3.0",
 ]
 xpack-llm-local = [  # requirements that allow local ML inference
     "unstructured[all-docs] >= 0.16, < 0.16.15",
     "sentence_transformers", 
     "transformers >= 4.42.0",
 ]
 xpack-llm-docs = [
+    "docling >= 2.15, <3.0",
     "python-docx >= 1.1.2",
     "unstructured >= 0.16, < 0.16.12",
     "pdf2image",

diff --git a/python/pathway/xpacks/llm/parsers.py b/python/pathway/xpacks/llm/parsers.py
@@ -354,7 +354,12 @@ def __init__(
     ):
         with optional_imports("xpack-llm-docs"):
             from docling.datamodel.pipeline_options import PdfPipelineOptions
-            from docling.document_converter import DocumentConverter, PdfFormatOption
+            from docling.document_converter import (
+                DocumentConverter,
+                InputFormat,
+                PdfFormatOption,
+            )
+            from docling_core.types.doc import ImageRefMode
 
         self.multimodal_llm: llms.OpenAIChat | llms.LiteLLMChat | None
         self.parse_images = parse_images
@@ -371,11 +376,15 @@ def __init__(
                     retry_strategy=udfs.ExponentialBackoffRetryStrategy(max_retries=4),
                     verbose=True,
                 )
-            self.image_mode = "embedded"  # will make docling export document to markdown with base64-embedded images
+            self.image_mode = (
+                ImageRefMode.EMBEDDED
+            )  # will make docling export document to markdown with base64-embedded images
             self.multimodal_llm = multimodal_llm
         else:
             self.multimodal_llm = None
-            self.image_mode = "placeholder"  # will make docling export document to markdown with image placeholders
+            self.image_mode = (
+                ImageRefMode.PLACEHOLDER
+            )  # will make docling export document to markdown with image placeholders
 
         default_pipeline_options = {
             "do_table_structure": True,
@@ -392,7 +401,9 @@ def __init__(
 
         # actual docling converter
         self.converter: DocumentConverter = DocumentConverter(
-            format_options={"pdf": PdfFormatOption(pipeline_options=pipeline_options)},
+            format_options={
+                InputFormat.PDF: PdfFormatOption(pipeline_options=pipeline_options)
+            },
             # TODO: Add more file types
         )
         super().__init__(cache_strategy=cache_strategy)