diff --git a/extract_thinker/document_loader/document_loader.py b/extract_thinker/document_loader/document_loader.py index 3eb877c..118c492 100644 --- a/extract_thinker/document_loader/document_loader.py +++ b/extract_thinker/document_loader/document_loader.py @@ -3,25 +3,36 @@ from io import BytesIO from PIL import Image import pypdfium2 as pdfium -from typing import Any, Dict, Union +from typing import Any, Dict, Union, List from cachetools import TTLCache import os import magic from extract_thinker.utils import get_file_extension, check_mime_type +from playwright.sync_api import sync_playwright +from urllib.parse import urlparse +import base64 +import math class DocumentLoader(ABC): - def __init__(self, content: Any = None, cache_ttl: int = 300): + # SUPPORTED_FORMATS = [ + # "pdf", "jpg", "jpeg", "png", "tiff", "bmp" + # ] + + def __init__(self, content: Any = None, cache_ttl: int = 300, screenshot_timeout: int = 1000): """Initialize loader. Args: content: Initial content cache_ttl: Cache time-to-live in seconds + screenshot_timeout: Timeout in milliseconds to wait for page content load when capturing a screenshot. """ self.content = content self.file_path = None self.cache = TTLCache(maxsize=100, ttl=cache_ttl) self.vision_mode = False self.max_image_size = None # Changed to None by default + self.is_url = False # Indicates if the source is a URL + self.screenshot_timeout = screenshot_timeout def set_max_image_size(self, size: int) -> None: """Set the maximum image size.""" @@ -31,6 +42,10 @@ def set_vision_mode(self, enabled: bool = True) -> None: """Enable or disable vision mode processing.""" self.vision_mode = enabled + def set_screenshot_timeout(self, timeout: int) -> None: + """Set the screenshot timeout in milliseconds for capturing a screenshot from a URL.""" + self.screenshot_timeout = timeout + def can_handle(self, source: Union[str, BytesIO]) -> bool: """ Checks if the loader can handle the given source. @@ -60,7 +75,6 @@ def _can_handle_file_path(self, file_path: str) -> bool: def _can_handle_stream(self, stream: BytesIO) -> bool: """Checks if the loader can handle the given BytesIO stream.""" try: - # Read the first few bytes to determine file type mime = magic.from_buffer(stream.getvalue(), mime=True) stream.seek(0) # Reset stream position return check_mime_type(mime, self.SUPPORTED_FORMATS) @@ -85,7 +99,26 @@ def convert_to_images(self, file: Union[str, io.BytesIO, io.BufferedReader], sca raise TypeError("file must be a file path (str) or a file-like stream") def _convert_file_to_images(self, file_path: str, scale: float) -> Dict[int, bytes]: - # Check if the file is already an image + """Convert file to images, handling both URLs and local files.""" + # Check if it's a URL + if self._is_url(file_path): + self.is_url = True # Set the instance variable if the source is a URL + try: + screenshot = self._capture_screenshot_from_url(file_path) + # Convert screenshot to PIL Image for potential resizing + img = Image.open(BytesIO(screenshot)) + img = self._resize_if_needed(img) + + # Split into vertical chunks + chunks = self._split_image_vertically(img) + + # Return dictionary with chunks as list + return {0: chunks} # All chunks from URL are considered "page 0" + + except Exception as e: + raise ValueError(f"Failed to capture screenshot from URL: {str(e)}") + + # Existing code for local files... 
try: Image.open(file_path) is_image = True @@ -93,11 +126,9 @@ def _convert_file_to_images(self, file_path: str, scale: float) -> Dict[int, byt is_image = False if is_image: - # If it is, return it as is with open(file_path, "rb") as f: return {0: f.read()} - # If it's not an image, proceed with the conversion return self._convert_pdf_to_images(pdfium.PdfDocument(file_path), scale) def _convert_stream_to_images(self, file_stream: io.BytesIO, scale: float) -> Dict[int, bytes]: @@ -163,13 +194,15 @@ def can_handle_vision(self, source: Union[str, BytesIO]) -> bool: Checks if the loader can handle the source in vision mode. Args: - source: Either a file path (str) or a BytesIO stream + source: Either a file path (str), URL, or a BytesIO stream Returns: bool: True if the loader can handle the source in vision mode """ try: if isinstance(source, str): + if self._is_url(source): + return True # URLs are always supported in vision mode ext = get_file_extension(source).lower() return ext in ['pdf', 'jpg', 'jpeg', 'png', 'tiff', 'bmp'] elif isinstance(source, BytesIO): @@ -210,4 +243,99 @@ def can_handle_paginate(self, source: Union[str, BytesIO]) -> bool: # List of extensions that support pagination return ext in ['pdf'] except Exception: - return False \ No newline at end of file + return False + + @staticmethod + def _check_playwright_dependencies(): + """ + Check if the playwright dependency is installed. + Raises: + ImportError: If playwright is not installed. + """ + try: + from playwright.sync_api import sync_playwright + except ImportError: + raise ImportError( + "You are using vision with a URL. You need to install playwright: " + "run `pip install playwright` and then `playwright install`." + ) + + def _capture_screenshot_from_url(self, url: str) -> bytes: + """ + Captures a full-page screenshot of a URL using Playwright. + + Args: + url: The URL to capture + + Returns: + bytes: The screenshot image data + """ + # Optional: Check if playwright is installed before attempting to use it. + self._check_playwright_dependencies() + + from playwright.sync_api import sync_playwright # Import after the dependency check + + with sync_playwright() as p: + browser = p.chromium.launch(headless=True) + page = browser.new_page() + + try: + # Navigate to URL + page.goto(url, wait_until='networkidle') + + # Optional: Handle cookie consent popups (customize selectors as needed) + try: + page.click('button:has-text("Accept")', timeout=10000) + except Exception: + pass # Ignore if no cookie banner is found + + # Wait for content to load with the configurable timeout + page.wait_for_timeout(self.screenshot_timeout) + + # Capture full page screenshot + screenshot = page.screenshot(full_page=True) + + return screenshot + + finally: + browser.close() + + def _split_image_vertically(self, img: Image.Image, chunk_height: int = 1000) -> List[bytes]: + """ + Splits a tall PIL Image into vertical chunks of `chunk_height`. + Returns a list of bytes in PNG format, in top-to-bottom order. 
+ + Args: + img: PIL Image to split + chunk_height: Height of each chunk in pixels + + Returns: + List of PNG-encoded bytes for each chunk + """ + width, height = img.size + num_chunks = math.ceil(height / chunk_height) + + chunks_bytes = [] + for i in range(num_chunks): + top = i * chunk_height + bottom = min((i + 1) * chunk_height, height) + crop_box = (0, top, width, bottom) + + # Crop the chunk + chunk_img = img.crop(crop_box) + + # Convert chunk to bytes + chunk_bytes = io.BytesIO() + chunk_img.save(chunk_bytes, format="PNG", optimize=True) + chunk_bytes.seek(0) + chunks_bytes.append(chunk_bytes.read()) + + return chunks_bytes + + def _is_url(self, source: str) -> bool: + """Check if the source string is a URL.""" + try: + result = urlparse(source) + return bool(result.scheme and result.netloc) + except: + return False \ No newline at end of file diff --git a/extract_thinker/document_loader/document_loader_beautiful_soup.py b/extract_thinker/document_loader/document_loader_beautiful_soup.py index 8d1d337..5784464 100644 --- a/extract_thinker/document_loader/document_loader_beautiful_soup.py +++ b/extract_thinker/document_loader/document_loader_beautiful_soup.py @@ -52,7 +52,9 @@ def __post_init__(self): class DocumentLoaderBeautifulSoup(CachedDocumentLoader): """Loader that uses BeautifulSoup4 to load HTML content.""" - SUPPORTED_FORMATS = ['html', 'htm'] + SUPPORTED_FORMATS = [ + 'html', 'htm', 'url' # Add URL support + ] def __init__( self, @@ -257,9 +259,7 @@ def load(self, source: Union[str, BytesIO]) -> List[Dict[str, Any]]: raise ValueError(f"Error loading HTML content: {str(e)}") def can_handle(self, source: Union[str, BytesIO]) -> bool: - """Check if the loader can handle this source.""" - if isinstance(source, BytesIO): + """Override to add URL support.""" + if isinstance(source, str) and self._is_url(source): return True - if self._is_url(source): - return True - return get_file_extension(source) in self.SUPPORTED_FORMATS \ No newline at end of file + return super().can_handle(source) \ No newline at end of file diff --git a/extract_thinker/document_loader/document_loader_docling.py b/extract_thinker/document_loader/document_loader_docling.py index 1ff2a1c..d47783a 100644 --- a/extract_thinker/document_loader/document_loader_docling.py +++ b/extract_thinker/document_loader/document_loader_docling.py @@ -1,6 +1,7 @@ from io import BytesIO from typing import Any, Dict, List, Union, Optional from dataclasses import dataclass, field +from urllib.parse import urlparse from cachetools import cachedmethod from cachetools.keys import hashkey @@ -120,7 +121,9 @@ class DocumentLoaderDocling(CachedDocumentLoader): # XML (including PubMed .nxml) "xml", "nxml", # Plain text - "txt" + "txt", + # URL support + "url" ] def __init__( @@ -212,6 +215,7 @@ def can_handle(self, source: Union[str, BytesIO]) -> bool: self.vision_mode )) def load(self, source: Union[str, BytesIO]) -> List[Dict[str, Any]]: + from docling.document_converter import ConversionResult """ Load and parse the document using Docling. 
@@ -219,30 +223,35 @@ def load(self, source: Union[str, BytesIO]) -> List[Dict[str, Any]]: A list of dictionaries, each representing a "page" with: - "content": text from that page - "image": optional image bytes if vision_mode is True - - "markdown": Markdown string of that page """ if not self.can_handle(source): raise ValueError(f"Cannot handle source: {source}") # Convert the source to a docling "ConversionResult" - conv_result = self._docling_convert(source) - - test = conv_result.document.export_to_markdown() - print(test) + conv_result: ConversionResult = self._docling_convert(source) - # Build the output list of page data + # If the source is a URL, return a single page with all the content. + if isinstance(source, str) and self._is_url(source): + content = conv_result.document.export_to_markdown() + print(content) # Log the exported markdown, if needed + page_output = {"content": content, "image": None} + # Handle image extraction if vision_mode is enabled + if self.vision_mode: + images_dict = self.convert_to_images(source) + page_output["images"] = images_dict.get(0) + return [page_output] + + # Build the output list of page data for non-URL sources pages_output = [] for p in conv_result.pages: page_dict = { "content": conv_result.document.export_to_markdown(page_no=p.page_no+1), "image": None } - # Handle image extraction if vision_mode is enabled if self.vision_mode: images_dict = self.convert_to_images(source) page_dict["image"] = images_dict.get(p.page_no) - pages_output.append(page_dict) # Fallback for documents without explicit pages diff --git a/extract_thinker/document_loader/document_loader_markitdown.py b/extract_thinker/document_loader/document_loader_markitdown.py index 3e401d7..4b8c157 100644 --- a/extract_thinker/document_loader/document_loader_markitdown.py +++ b/extract_thinker/document_loader/document_loader_markitdown.py @@ -53,13 +53,18 @@ class DocumentLoaderMarkItDown(CachedDocumentLoader): Supports text extraction and optional image/page rendering in vision mode. Produces a list of pages, each with: - "content": text from that page - - "image": optional page/image bytes if vision_mode is True + - "image": optional page/image bytes if vision_mode is True (for non-URL sources) + - For URL sources, returns a single page with: + - "content": extracted text + - "image": always None + - "images": rendered image bytes if vision_mode is enabled """ SUPPORTED_FORMATS = [ - "pdf", "doc", "docx", "ppt", "pptx", "xls", "xlsx", + "pdf", "doc", "docx", "ppt", "pptx", "xls", "xlsx", "csv", "tsv", "txt", "html", "xml", "json", "zip", - "jpg", "jpeg", "png", "bmp", "gif", "wav", "mp3", "m4a" + "jpg", "jpeg", "png", "bmp", "gif", "wav", "mp3", "m4a", + "url" ] def __init__( @@ -131,13 +136,30 @@ def _process_text(self, text: str) -> str: """Apply any additional text processing (e.g., strip whitespace).""" return text if self.config.preserve_whitespace else text.strip() - def _is_url(self, source: str) -> bool: + def _is_url(self, potential_url: str) -> bool: """Check if the source is a URL.""" + return potential_url.startswith("http://") or potential_url.startswith("https://") + + def can_handle(self, source: Union[str, BytesIO]) -> bool: + """ + Checks if the loader can handle the given source. 
+ + Args: + source: Either a file path (str), a BytesIO stream, or a URL + + Returns: + bool: True if the loader can handle the source, False otherwise + """ try: - from urllib.parse import urlparse - result = urlparse(source) - return all([result.scheme, result.netloc]) - except: + if isinstance(source, str): + if self._is_url(source): + return True + extension = source.split('.')[-1].lower() + return extension in self.SUPPORTED_FORMATS + elif isinstance(source, BytesIO): + return True + return False + except Exception: return False @cachedmethod(cache=attrgetter('cache'), @@ -154,9 +176,27 @@ def load(self, source: Union[str, BytesIO]) -> List[Dict[str, Any]]: Returns: A list of dictionaries where each dict is one "page" of text. - - "content": The text content (str) - - "image": Optional bytes if vision mode is enabled (key only present if vision_mode is True) + For non-URL sources: + - "content": The text content (str) + - "image": Optional bytes if vision mode is enabled (key only present if vision_mode is True) + For URL sources: + - "content": The text content (str) + - "image": Always None + - "images": Optional rendered image bytes if vision mode is enabled """ + # Handle URL sources separately + if isinstance(source, str) and self._is_url(source): + try: + result = self.markitdown.convert(source) + text_content = result.text_content or "" + page_output = {"content": text_content, "image": None} + if self.vision_mode: + images_dict = self.convert_to_images(source) + page_output["images"] = images_dict.get(0) + return [page_output] + except Exception as e: + raise ValueError(f"Error processing document with MarkItDown URL handling: {str(e)}") + if not self.can_handle(source): raise ValueError(f"Cannot handle source: {source}") @@ -170,7 +210,7 @@ def load(self, source: Union[str, BytesIO]) -> List[Dict[str, Any]]: # File path result = self.markitdown.convert(source) else: - # BytesIO + # BytesIO stream source.seek(0) if self.config.mime_type_detection: mime = magic.from_buffer(source.getvalue(), mime=True) @@ -189,13 +229,9 @@ def load(self, source: Union[str, BytesIO]) -> List[Dict[str, Any]]: source.seek(0) # Full text from MarkItDown - text_content = result.text_content - if not text_content: - text_content = "" - + text_content = result.text_content or "" # Split text content into pages (based on config.page_separator) raw_pages = text_content.split(self.config.page_separator) - pages = [] for page_text in raw_pages: processed = self._process_text(page_text) @@ -215,25 +251,4 @@ def load(self, source: Union[str, BytesIO]) -> List[Dict[str, Any]]: return pages except Exception as e: - raise ValueError(f"Error processing document with MarkItDown: {str(e)}") - - def can_handle(self, source: Union[str, BytesIO]) -> bool: - """ - Checks if the loader can handle the given source. 
- - Args: - source: Either a file path (str), a BytesIO stream, or a URL - - Returns: - bool: True if the loader can handle the source, False otherwise - """ - try: - if isinstance(source, str): - if self._is_url(source): - return True - return self._can_handle_file_path(source) - elif isinstance(source, BytesIO): - return self._can_handle_stream(source) - return False - except Exception: - return False \ No newline at end of file + raise ValueError(f"Error processing document with MarkItDown: {str(e)}") \ No newline at end of file diff --git a/extract_thinker/pagination_handler.py b/extract_thinker/pagination_handler.py index eb5b035..fbd2c81 100644 --- a/extract_thinker/pagination_handler.py +++ b/extract_thinker/pagination_handler.py @@ -164,7 +164,7 @@ def _merge_list_field(self, field_name: str, values: List[Any], field_type: Any) break if unique_key: - # Merge by unique key + # Merge by unique key using case-insensitive comparison merged_by_key = {} for item in flattened: if hasattr(item, 'model_dump'): @@ -173,13 +173,14 @@ def _merge_list_field(self, field_name: str, values: List[Any], field_type: Any) item_dict = item key_val = item_dict.get(unique_key) if key_val is not None: - if key_val in merged_by_key: - merged_by_key[key_val] = self._merge_two_models( - merged_by_key[key_val], + normalized_key = str(key_val).lower() + if normalized_key in merged_by_key: + merged_by_key[normalized_key] = self._merge_two_models( + merged_by_key[normalized_key], item_dict ) else: - merged_by_key[key_val] = item_dict + merged_by_key[normalized_key] = item_dict else: # If no unique key found for this item, just store it uniquely merged_by_key[f"no_key_{len(merged_by_key)}"] = item_dict diff --git a/poetry.lock b/poetry.lock index cd6f78f..8f68b2b 100644 --- a/poetry.lock +++ b/poetry.lock @@ -802,6 +802,93 @@ test-downstream = ["aiobotocore (>=2.5.4,<3.0.0)", "dask-expr", "dask[dataframe, test-full = ["adlfs", "aiohttp (!=4.0.0a0,!=4.0.0a1)", "cloudpickle", "dask", "distributed", "dropbox", "dropboxdrivefs", "fastparquet", "fusepy", "gcsfs", "jinja2", "kerchunk", "libarchive-c", "lz4", "notebook", "numpy", "ocifs", "pandas", "panel", "paramiko", "pyarrow", "pyarrow (>=1)", "pyftpdlib", "pygit2", "pytest", "pytest-asyncio (!=0.22.0)", "pytest-benchmark", "pytest-cov", "pytest-mock", "pytest-recording", "pytest-rerunfailures", "python-snappy", "requests", "smbprotocol", "tqdm", "urllib3", "zarr", "zstandard"] tqdm = ["tqdm"] +[[package]] +name = "greenlet" +version = "3.1.1" +description = "Lightweight in-process concurrent programming" +optional = false +python-versions = ">=3.7" +groups = ["main"] +files = [ + {file = "greenlet-3.1.1-cp310-cp310-macosx_11_0_universal2.whl", hash = "sha256:0bbae94a29c9e5c7e4a2b7f0aae5c17e8e90acbfd3bf6270eeba60c39fce3563"}, + {file = "greenlet-3.1.1-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:0fde093fb93f35ca72a556cf72c92ea3ebfda3d79fc35bb19fbe685853869a83"}, + {file = "greenlet-3.1.1-cp310-cp310-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:36b89d13c49216cadb828db8dfa6ce86bbbc476a82d3a6c397f0efae0525bdd0"}, + {file = "greenlet-3.1.1-cp310-cp310-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:94b6150a85e1b33b40b1464a3f9988dcc5251d6ed06842abff82e42632fac120"}, + {file = "greenlet-3.1.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:93147c513fac16385d1036b7e5b102c7fbbdb163d556b791f0f11eada7ba65dc"}, + {file = 
"greenlet-3.1.1-cp310-cp310-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:da7a9bff22ce038e19bf62c4dd1ec8391062878710ded0a845bcf47cc0200617"}, + {file = "greenlet-3.1.1-cp310-cp310-musllinux_1_1_aarch64.whl", hash = "sha256:b2795058c23988728eec1f36a4e5e4ebad22f8320c85f3587b539b9ac84128d7"}, + {file = "greenlet-3.1.1-cp310-cp310-musllinux_1_1_x86_64.whl", hash = "sha256:ed10eac5830befbdd0c32f83e8aa6288361597550ba669b04c48f0f9a2c843c6"}, + {file = "greenlet-3.1.1-cp310-cp310-win_amd64.whl", hash = "sha256:77c386de38a60d1dfb8e55b8c1101d68c79dfdd25c7095d51fec2dd800892b80"}, + {file = "greenlet-3.1.1-cp311-cp311-macosx_11_0_universal2.whl", hash = "sha256:e4d333e558953648ca09d64f13e6d8f0523fa705f51cae3f03b5983489958c70"}, + {file = "greenlet-3.1.1-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:09fc016b73c94e98e29af67ab7b9a879c307c6731a2c9da0db5a7d9b7edd1159"}, + {file = "greenlet-3.1.1-cp311-cp311-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:d5e975ca70269d66d17dd995dafc06f1b06e8cb1ec1e9ed54c1d1e4a7c4cf26e"}, + {file = "greenlet-3.1.1-cp311-cp311-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:3b2813dc3de8c1ee3f924e4d4227999285fd335d1bcc0d2be6dc3f1f6a318ec1"}, + {file = "greenlet-3.1.1-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:e347b3bfcf985a05e8c0b7d462ba6f15b1ee1c909e2dcad795e49e91b152c383"}, + {file = "greenlet-3.1.1-cp311-cp311-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:9e8f8c9cb53cdac7ba9793c276acd90168f416b9ce36799b9b885790f8ad6c0a"}, + {file = "greenlet-3.1.1-cp311-cp311-musllinux_1_1_aarch64.whl", hash = "sha256:62ee94988d6b4722ce0028644418d93a52429e977d742ca2ccbe1c4f4a792511"}, + {file = "greenlet-3.1.1-cp311-cp311-musllinux_1_1_x86_64.whl", hash = "sha256:1776fd7f989fc6b8d8c8cb8da1f6b82c5814957264d1f6cf818d475ec2bf6395"}, + {file = "greenlet-3.1.1-cp311-cp311-win_amd64.whl", hash = "sha256:48ca08c771c268a768087b408658e216133aecd835c0ded47ce955381105ba39"}, + {file = "greenlet-3.1.1-cp312-cp312-macosx_11_0_universal2.whl", hash = "sha256:4afe7ea89de619adc868e087b4d2359282058479d7cfb94970adf4b55284574d"}, + {file = "greenlet-3.1.1-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:f406b22b7c9a9b4f8aa9d2ab13d6ae0ac3e85c9a809bd590ad53fed2bf70dc79"}, + {file = "greenlet-3.1.1-cp312-cp312-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:c3a701fe5a9695b238503ce5bbe8218e03c3bcccf7e204e455e7462d770268aa"}, + {file = "greenlet-3.1.1-cp312-cp312-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:2846930c65b47d70b9d178e89c7e1a69c95c1f68ea5aa0a58646b7a96df12441"}, + {file = "greenlet-3.1.1-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:99cfaa2110534e2cf3ba31a7abcac9d328d1d9f1b95beede58294a60348fba36"}, + {file = "greenlet-3.1.1-cp312-cp312-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:1443279c19fca463fc33e65ef2a935a5b09bb90f978beab37729e1c3c6c25fe9"}, + {file = "greenlet-3.1.1-cp312-cp312-musllinux_1_1_aarch64.whl", hash = "sha256:b7cede291382a78f7bb5f04a529cb18e068dd29e0fb27376074b6d0317bf4dd0"}, + {file = "greenlet-3.1.1-cp312-cp312-musllinux_1_1_x86_64.whl", hash = "sha256:23f20bb60ae298d7d8656c6ec6db134bca379ecefadb0b19ce6f19d1f232a942"}, + {file = "greenlet-3.1.1-cp312-cp312-win_amd64.whl", hash = "sha256:7124e16b4c55d417577c2077be379514321916d5790fa287c9ed6f23bd2ffd01"}, + {file = "greenlet-3.1.1-cp313-cp313-macosx_11_0_universal2.whl", hash = 
"sha256:05175c27cb459dcfc05d026c4232f9de8913ed006d42713cb8a5137bd49375f1"}, + {file = "greenlet-3.1.1-cp313-cp313-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:935e943ec47c4afab8965954bf49bfa639c05d4ccf9ef6e924188f762145c0ff"}, + {file = "greenlet-3.1.1-cp313-cp313-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:667a9706c970cb552ede35aee17339a18e8f2a87a51fba2ed39ceeeb1004798a"}, + {file = "greenlet-3.1.1-cp313-cp313-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:b8a678974d1f3aa55f6cc34dc480169d58f2e6d8958895d68845fa4ab566509e"}, + {file = "greenlet-3.1.1-cp313-cp313-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:efc0f674aa41b92da8c49e0346318c6075d734994c3c4e4430b1c3f853e498e4"}, + {file = "greenlet-3.1.1-cp313-cp313-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:0153404a4bb921f0ff1abeb5ce8a5131da56b953eda6e14b88dc6bbc04d2049e"}, + {file = "greenlet-3.1.1-cp313-cp313-musllinux_1_1_aarch64.whl", hash = "sha256:275f72decf9932639c1c6dd1013a1bc266438eb32710016a1c742df5da6e60a1"}, + {file = "greenlet-3.1.1-cp313-cp313-musllinux_1_1_x86_64.whl", hash = "sha256:c4aab7f6381f38a4b42f269057aee279ab0fc7bf2e929e3d4abfae97b682a12c"}, + {file = "greenlet-3.1.1-cp313-cp313-win_amd64.whl", hash = "sha256:b42703b1cf69f2aa1df7d1030b9d77d3e584a70755674d60e710f0af570f3761"}, + {file = "greenlet-3.1.1-cp313-cp313t-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:f1695e76146579f8c06c1509c7ce4dfe0706f49c6831a817ac04eebb2fd02011"}, + {file = "greenlet-3.1.1-cp313-cp313t-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:7876452af029456b3f3549b696bb36a06db7c90747740c5302f74a9e9fa14b13"}, + {file = "greenlet-3.1.1-cp313-cp313t-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:4ead44c85f8ab905852d3de8d86f6f8baf77109f9da589cb4fa142bd3b57b475"}, + {file = "greenlet-3.1.1-cp313-cp313t-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:8320f64b777d00dd7ccdade271eaf0cad6636343293a25074cc5566160e4de7b"}, + {file = "greenlet-3.1.1-cp313-cp313t-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:6510bf84a6b643dabba74d3049ead221257603a253d0a9873f55f6a59a65f822"}, + {file = "greenlet-3.1.1-cp313-cp313t-musllinux_1_1_aarch64.whl", hash = "sha256:04b013dc07c96f83134b1e99888e7a79979f1a247e2a9f59697fa14b5862ed01"}, + {file = "greenlet-3.1.1-cp313-cp313t-musllinux_1_1_x86_64.whl", hash = "sha256:411f015496fec93c1c8cd4e5238da364e1da7a124bcb293f085bf2860c32c6f6"}, + {file = "greenlet-3.1.1-cp37-cp37m-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:47da355d8687fd65240c364c90a31569a133b7b60de111c255ef5b606f2ae291"}, + {file = "greenlet-3.1.1-cp37-cp37m-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:98884ecf2ffb7d7fe6bd517e8eb99d31ff7855a840fa6d0d63cd07c037f6a981"}, + {file = "greenlet-3.1.1-cp37-cp37m-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:f1d4aeb8891338e60d1ab6127af1fe45def5259def8094b9c7e34690c8858803"}, + {file = "greenlet-3.1.1-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:db32b5348615a04b82240cc67983cb315309e88d444a288934ee6ceaebcad6cc"}, + {file = "greenlet-3.1.1-cp37-cp37m-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:dcc62f31eae24de7f8dce72134c8651c58000d3b1868e01392baea7c32c247de"}, + {file = "greenlet-3.1.1-cp37-cp37m-musllinux_1_1_aarch64.whl", hash = "sha256:1d3755bcb2e02de341c55b4fca7a745a24a9e7212ac953f6b3a48d117d7257aa"}, + {file = 
"greenlet-3.1.1-cp37-cp37m-musllinux_1_1_x86_64.whl", hash = "sha256:b8da394b34370874b4572676f36acabac172602abf054cbc4ac910219f3340af"}, + {file = "greenlet-3.1.1-cp37-cp37m-win32.whl", hash = "sha256:a0dfc6c143b519113354e780a50381508139b07d2177cb6ad6a08278ec655798"}, + {file = "greenlet-3.1.1-cp37-cp37m-win_amd64.whl", hash = "sha256:54558ea205654b50c438029505def3834e80f0869a70fb15b871c29b4575ddef"}, + {file = "greenlet-3.1.1-cp38-cp38-macosx_11_0_universal2.whl", hash = "sha256:346bed03fe47414091be4ad44786d1bd8bef0c3fcad6ed3dee074a032ab408a9"}, + {file = "greenlet-3.1.1-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:dfc59d69fc48664bc693842bd57acfdd490acafda1ab52c7836e3fc75c90a111"}, + {file = "greenlet-3.1.1-cp38-cp38-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:d21e10da6ec19b457b82636209cbe2331ff4306b54d06fa04b7c138ba18c8a81"}, + {file = "greenlet-3.1.1-cp38-cp38-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:37b9de5a96111fc15418819ab4c4432e4f3c2ede61e660b1e33971eba26ef9ba"}, + {file = "greenlet-3.1.1-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:6ef9ea3f137e5711f0dbe5f9263e8c009b7069d8a1acea822bd5e9dae0ae49c8"}, + {file = "greenlet-3.1.1-cp38-cp38-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:85f3ff71e2e60bd4b4932a043fbbe0f499e263c628390b285cb599154a3b03b1"}, + {file = "greenlet-3.1.1-cp38-cp38-musllinux_1_1_aarch64.whl", hash = "sha256:95ffcf719966dd7c453f908e208e14cde192e09fde6c7186c8f1896ef778d8cd"}, + {file = "greenlet-3.1.1-cp38-cp38-musllinux_1_1_x86_64.whl", hash = "sha256:03a088b9de532cbfe2ba2034b2b85e82df37874681e8c470d6fb2f8c04d7e4b7"}, + {file = "greenlet-3.1.1-cp38-cp38-win32.whl", hash = "sha256:8b8b36671f10ba80e159378df9c4f15c14098c4fd73a36b9ad715f057272fbef"}, + {file = "greenlet-3.1.1-cp38-cp38-win_amd64.whl", hash = "sha256:7017b2be767b9d43cc31416aba48aab0d2309ee31b4dbf10a1d38fb7972bdf9d"}, + {file = "greenlet-3.1.1-cp39-cp39-macosx_11_0_universal2.whl", hash = "sha256:396979749bd95f018296af156201d6211240e7a23090f50a8d5d18c370084dc3"}, + {file = "greenlet-3.1.1-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:ca9d0ff5ad43e785350894d97e13633a66e2b50000e8a183a50a88d834752d42"}, + {file = "greenlet-3.1.1-cp39-cp39-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:f6ff3b14f2df4c41660a7dec01045a045653998784bf8cfcb5a525bdffffbc8f"}, + {file = "greenlet-3.1.1-cp39-cp39-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:94ebba31df2aa506d7b14866fed00ac141a867e63143fe5bca82a8e503b36437"}, + {file = "greenlet-3.1.1-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:73aaad12ac0ff500f62cebed98d8789198ea0e6f233421059fa68a5aa7220145"}, + {file = "greenlet-3.1.1-cp39-cp39-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:63e4844797b975b9af3a3fb8f7866ff08775f5426925e1e0bbcfe7932059a12c"}, + {file = "greenlet-3.1.1-cp39-cp39-musllinux_1_1_aarch64.whl", hash = "sha256:7939aa3ca7d2a1593596e7ac6d59391ff30281ef280d8632fa03d81f7c5f955e"}, + {file = "greenlet-3.1.1-cp39-cp39-musllinux_1_1_x86_64.whl", hash = "sha256:d0028e725ee18175c6e422797c407874da24381ce0690d6b9396c204c7f7276e"}, + {file = "greenlet-3.1.1-cp39-cp39-win32.whl", hash = "sha256:5e06afd14cbaf9e00899fae69b24a32f2196c19de08fcb9f4779dd4f004e5e7c"}, + {file = "greenlet-3.1.1-cp39-cp39-win_amd64.whl", hash = "sha256:3319aa75e0e0639bc15ff54ca327e8dc7a6fe404003496e3c6925cd3142e0e22"}, + {file = "greenlet-3.1.1.tar.gz", hash = 
"sha256:4ce3ac6cdb6adf7946475d7ef31777c26d94bccc377e070a7986bd2d5c515467"}, +] + +[package.extras] +docs = ["Sphinx", "furo"] +test = ["objgraph", "psutil"] + [[package]] name = "h11" version = "0.14.0" @@ -1744,6 +1831,27 @@ docs = ["furo (>=2024.8.6)", "proselint (>=0.14)", "sphinx (>=8.0.2)", "sphinx-a test = ["appdirs (==1.4.4)", "covdefaults (>=2.3)", "pytest (>=8.3.2)", "pytest-cov (>=5)", "pytest-mock (>=3.14)"] type = ["mypy (>=1.11.2)"] +[[package]] +name = "playwright" +version = "1.50.0" +description = "A high-level API to automate web browsers" +optional = false +python-versions = ">=3.9" +groups = ["main"] +files = [ + {file = "playwright-1.50.0-py3-none-macosx_10_13_x86_64.whl", hash = "sha256:f36d754a6c5bd9bf7f14e8f57a2aea6fd08f39ca4c8476481b9c83e299531148"}, + {file = "playwright-1.50.0-py3-none-macosx_11_0_arm64.whl", hash = "sha256:40f274384591dfd27f2b014596250b2250c843ed1f7f4ef5d2960ecb91b4961e"}, + {file = "playwright-1.50.0-py3-none-macosx_11_0_universal2.whl", hash = "sha256:9922ef9bcd316995f01e220acffd2d37a463b4ad10fd73e388add03841dfa230"}, + {file = "playwright-1.50.0-py3-none-manylinux1_x86_64.whl", hash = "sha256:8fc628c492d12b13d1f347137b2ac6c04f98197ff0985ef0403a9a9ee0d39131"}, + {file = "playwright-1.50.0-py3-none-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:ffcff35f72db2689a79007aee78f1b0621a22e6e3d6c1f58aaa9ac805bf4497c"}, + {file = "playwright-1.50.0-py3-none-win32.whl", hash = "sha256:3b906f4d351260016a8c5cc1e003bb341651ae682f62213b50168ed581c7558a"}, + {file = "playwright-1.50.0-py3-none-win_amd64.whl", hash = "sha256:1859423da82de631704d5e3d88602d755462b0906824c1debe140979397d2e8d"}, +] + +[package.dependencies] +greenlet = ">=3.1.1,<4.0.0" +pyee = ">=12,<13" + [[package]] name = "pluggy" version = "1.5.0" @@ -2085,6 +2193,24 @@ files = [ [package.dependencies] typing-extensions = ">=4.6.0,<4.7.0 || >4.7.0" +[[package]] +name = "pyee" +version = "12.1.1" +description = "A rough port of Node.js's EventEmitter to Python with a few tricks of its own" +optional = false +python-versions = ">=3.8" +groups = ["main"] +files = [ + {file = "pyee-12.1.1-py3-none-any.whl", hash = "sha256:18a19c650556bb6b32b406d7f017c8f513aceed1ef7ca618fb65de7bd2d347ef"}, + {file = "pyee-12.1.1.tar.gz", hash = "sha256:bbc33c09e2ff827f74191e3e5bbc6be7da02f627b7ec30d86f5ce1a6fb2424a3"}, +] + +[package.dependencies] +typing-extensions = "*" + +[package.extras] +dev = ["black", "build", "flake8", "flake8-black", "isort", "jupyter-console", "mkdocs", "mkdocs-include-markdown-plugin", "mkdocstrings[python]", "pytest", "pytest-asyncio", "pytest-trio", "sphinx", "toml", "tox", "trio", "trio", "trio-typing", "twine", "twisted", "validate-pyproject[all]"] + [[package]] name = "pyflakes" version = "3.2.0" @@ -3126,4 +3252,4 @@ type = ["pytest-mypy"] [metadata] lock-version = "2.1" python-versions = ">=3.9,<3.14" -content-hash = "3b843ac3af622ab2801fa34a523dedba241688a13fe9321f8a084eef235b7c71" +content-hash = "46f4b5e4c32ffe2d06dd23bfbcf450b7c4edadd3538749df59cd311eb51c5ff7" diff --git a/pyproject.toml b/pyproject.toml index d57df0e..7f2fd6c 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -17,6 +17,7 @@ cachetools = "^5.3.3" pyyaml = "^6.0.1" tiktoken = {version = "^0.8.0", python = ">=3.9,<3.13"} python-magic = "^0.4.27" +playwright = "^1.50.0" [tool.poetry.dev-dependencies] flake8 = "^7.1.1" diff --git a/tests/models/gdp_contract.py b/tests/models/gdp_contract.py index c1c2228..dbd044d 100644 --- a/tests/models/gdp_contract.py +++ b/tests/models/gdp_contract.py @@ 
-21,11 +21,21 @@ class RegionData(Contract): provinces: List[ProvinceData] = Field(default_factory=list) class CountryData(Contract): - country: str - total_gdp_million: Optional[float] = Field(None, description="Total GDP (€ million)") - regions: List[RegionData] = Field(default_factory=list, description="Make sure to ignore Extra-regio*/Extra-region") + country: str = Field( + ..., + description="Country name as it appears in the PDF. IMPORTANT: Extract this value from every page and aggregate unique entries, not just the first occurrence." + ) + total_gdp_million: Optional[float] = Field( + None, + description="Total GDP (€ million) for the country, using the value from any page in the document." + ) + regions: List[RegionData] = Field( + default_factory=list, + description="List of regions for the country. Aggregate all regions from every page and ignore any formatting variations like 'Extra-regio*/Extra-region'." + ) class EUData(Contract): + thinking: str = Field(None, description="Think step by step. You have 2 pages; don't forget to add them.") eu_total_gdp_million_27: float = Field(None, description="EU27 Total GDP (€ million)") eu_total_gdp_million_28: float = Field(None, description="EU28 Total GDP (€ million)") - countries: List[CountryData] \ No newline at end of file + countries: List[CountryData] = Field(None, description="List of countries. Make sure you add all countries from every page, not just the first one.") \ No newline at end of file diff --git a/tests/models/handbook_contract.py b/tests/models/handbook_contract.py new file mode 100644 index 0000000..f8f8732 --- /dev/null +++ b/tests/models/handbook_contract.py @@ -0,0 +1,4 @@ +from pydantic import BaseModel + +class HandbookContract(BaseModel): + title: str \ No newline at end of file diff --git a/tests/test_document_loader_docling.py b/tests/test_document_loader_docling.py index 2a0e4ce..c626ea3 100644 --- a/tests/test_document_loader_docling.py +++ b/tests/test_document_loader_docling.py @@ -234,13 +234,17 @@ def test_title_extraction(self): def test_url_loading(self, loader): """Test loading from a URL for Docling loader.""" - url = "https://www.handbook.fca.org.uk/handbook/BCOBS/2/?view=chapter" + loader = DocumentLoaderDocling() + url = "https://www.handbook.fca.org.uk/handbook/BCOBS/2A/?view=chapter" # Ensure the loader recognizes and can handle a URL + loader.set_vision_mode(True) assert loader.can_handle(url) is True - + pages = loader.load(url) assert isinstance(pages, list) assert len(pages) > 0 for page in pages: assert "content" in page + assert "images" in page + assert len(page["images"]) == 3 assert isinstance(page["content"], str) \ No newline at end of file diff --git a/tests/test_document_loader_markitdown.py b/tests/test_document_loader_markitdown.py index 1d3944e..db95dc3 100644 --- a/tests/test_document_loader_markitdown.py +++ b/tests/test_document_loader_markitdown.py @@ -159,23 +159,22 @@ def test_page_separator_splitting(self): loader = DocumentLoaderMarkItDown(config) pages = loader.load(bulk_pdf_path) - # Verify we get exactly 3 pages + # Verify we get exactly 2 pages (as per current expectations) assert len(pages) == 2, f"Expected 2 pages, got {len(pages)}" def test_url_loading(self, loader): """Test loading from a URL for MarkItDown loader.""" - url = "https://www.handbook.fca.org.uk/handbook/BCOBS/2/?view=chapter" + url = "https://www.handbook.fca.org.uk/handbook/BCOBS/2A/?view=chapter" # Verify that the loader accepts the URL as a valid source. 
+ loader.set_vision_mode(True) assert loader.can_handle(url) is True pages = loader.load(url) assert isinstance(pages, list) assert len(pages) > 0 for page in pages: + # This test expects the URL branch to return a page with "images" if vision mode is enabled. assert "content" in page - assert isinstance(page["content"], str) - - def test_can_handle_url(self, loader): - """Test that MarkItDown loader correctly identifies URL sources.""" - url = "https://www.handbook.fca.org.uk/handbook/BCOBS/2/?view=chapter" - assert loader.can_handle(url) is True \ No newline at end of file + assert "images" in page + assert len(page["images"]) == 3 + assert isinstance(page["content"], str) \ No newline at end of file diff --git a/tests/test_extractor.py b/tests/test_extractor.py index 8976a19..7444ecd 100644 --- a/tests/test_extractor.py +++ b/tests/test_extractor.py @@ -16,6 +16,9 @@ import pytest import numpy as np from litellm import embedding +from extract_thinker.document_loader.document_loader_docling import DocumentLoaderDocling +from tests.models.handbook_contract import HandbookContract + load_dotenv() cwd = os.getcwd() @@ -190,7 +193,7 @@ def test_pagination_handler(): test_file_path = os.path.join(os.getcwd(), "tests", "files", "Regional_GDP_per_capita_2018_2.pdf") extractor = Extractor() - extractor.load_document_loader(DocumentLoaderPdfPlumber()) + extractor.load_document_loader(DocumentLoaderDocling()) extractor.load_llm("gpt-4o") # Create and run both extractions in parallel @@ -405,3 +408,22 @@ def test_extract_with_pydanticai_backend(): except ImportError: pytest.skip("pydantic-ai not installed") + +def test_extract_from_url_docling_and_gpt4o_mini(): + """ + Test extraction from a URL using the Docling document loader and gpt-4o-mini LLM. + The test asserts that the extracted title is as expected. + """ + url = "https://www.handbook.fca.org.uk/handbook/BCOBS/2A/?view=chapter" + + # Initialize the extractor, load the Docling loader and the gpt-4o-mini LLM + extractor = Extractor() + extractor.load_document_loader(DocumentLoaderDocling()) + extractor.load_llm("gpt-4o-mini") + + # Act: Extract the document using the specified URL and the HandbookContract + result = extractor.extract(url, HandbookContract) + + # Assert: Verify that the extracted title matches the expected value. + expected_title = "BCOBS 2A.1 Restriction on marketing or providing an optional product for which a fee is payable" + assert result.title == expected_title \ No newline at end of file
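Usage sketch for the URL support introduced in this diff. It is illustrative only: the contract model and URL are taken from the tests above, the top-level `Extractor` import path is assumed from the package root, and vision mode requires Playwright (`pip install playwright` followed by `playwright install`).

```python
from pydantic import BaseModel

from extract_thinker import Extractor
from extract_thinker.document_loader.document_loader_docling import DocumentLoaderDocling


class HandbookContract(BaseModel):
    title: str


# URL sources are detected via _is_url(); with vision mode enabled, the base
# DocumentLoader captures a full-page Playwright screenshot, splits it into
# vertical chunks, and exposes them under the "images" key of page 0.
loader = DocumentLoaderDocling()
loader.set_vision_mode(True)
loader.set_screenshot_timeout(2000)  # wait 2000 ms after page load before capturing

extractor = Extractor()
extractor.load_document_loader(loader)
extractor.load_llm("gpt-4o-mini")

url = "https://www.handbook.fca.org.uk/handbook/BCOBS/2A/?view=chapter"
result = extractor.extract(url, HandbookContract)
print(result.title)
```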