Skip to content

Commit

Permalink
Cleanup dependencies (#8212)
Browse files (browse the repository at this point in the history)
GitOrigin-RevId: 68dadb185be7b52368f19432dfde44ebde9ac3d8
  • Loading branch information
voodoo11 authored and Manul from Pathway committed Feb 13, 2025
1 parent d802df8 commit 75a5d18
Show file tree
Hide file tree
Showing 2 changed files with 18 additions and 7 deletions.
6 changes: 3 additions & 3 deletions pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -55,23 +55,23 @@ xpack-llm = [
"litellm ~= 1.44.28",
"cohere ~= 5.1.0",
"tiktoken >= 0.5",
- "langchain == 0.2.0",
- "langchain_community == 0.2.0",
+ "langchain ~= 0.2.0",
+ "langchain_community ~= 0.2.0",
"llama-index-core ~= 0.10.0",
"llama-index-readers-pathway ~= 0.1.0",
"llama-index-retrievers-pathway ~= 0.1.3",
"tenacity != 8.4.0", # langchain dependency, 8.4.0 wheel is broken
"instructor == 1.2.6",
"google-generativeai ~= 0.8.4",
"google-api-core ~= 2.24.1",
- "docling >= 2.15, <3.0",
]
xpack-llm-local = [ # requirements that allow local ML inference
"unstructured[all-docs] >= 0.16, < 0.16.15",
"sentence_transformers",
"transformers >= 4.42.0",
]
xpack-llm-docs = [
+ "docling >= 2.15, <3.0",
"python-docx >= 1.1.2",
"unstructured >= 0.16, < 0.16.12",
"pdf2image",
Expand Down
19 changes: 15 additions & 4 deletions python/pathway/xpacks/llm/parsers.py
Original file line number Diff line number Diff line change
Expand Up @@ -354,7 +354,12 @@ def __init__(
):
with optional_imports("xpack-llm-docs"):
from docling.datamodel.pipeline_options import PdfPipelineOptions
- from docling.document_converter import DocumentConverter, PdfFormatOption
+ from docling.document_converter import (
+ DocumentConverter,
+ InputFormat,
+ PdfFormatOption,
+ )
+ from docling_core.types.doc import ImageRefMode

self.multimodal_llm: llms.OpenAIChat | llms.LiteLLMChat | None
self.parse_images = parse_images
Expand All @@ -371,11 +376,15 @@ def __init__(
retry_strategy=udfs.ExponentialBackoffRetryStrategy(max_retries=4),
verbose=True,
)
- self.image_mode = "embedded" # will make docling export document to markdown with base64-embedded images
+ self.image_mode = (
+ ImageRefMode.EMBEDDED
+ ) # will make docling export document to markdown with base64-embedded images
self.multimodal_llm = multimodal_llm
else:
self.multimodal_llm = None
- self.image_mode = "placeholder" # will make docling export document to markdown with image placeholders
+ self.image_mode = (
+ ImageRefMode.PLACEHOLDER
+ ) # will make docling export document to markdown with image placeholders

default_pipeline_options = {
"do_table_structure": True,
Expand All @@ -392,7 +401,9 @@ def __init__(

# actual docling converter
self.converter: DocumentConverter = DocumentConverter(
- format_options={"pdf": PdfFormatOption(pipeline_options=pipeline_options)},
+ format_options={
+ InputFormat.PDF: PdfFormatOption(pipeline_options=pipeline_options)
+ },
# TODO: Add more file types
)
super().__init__(cache_strategy=cache_strategy)
Expand Down

0 comments on commit 75a5d18

Please sign in to comment.