Skip to content

Commit

Permalink
Move unstructured tests to be an integration test (#8244)
Browse files Browse the repository at this point in the history
GitOrigin-RevId: 212d083180f24db95a61ba4aa2ba87525b55d3a5
  • Loading branch information
szymondudycz authored and Manul from Pathway committed Feb 18, 2025
1 parent 4a88c9e commit aa94353
Show file tree
Hide file tree
Showing 3 changed files with 54 additions and 49 deletions.
52 changes: 52 additions & 0 deletions integration_tests/xpack/test_parsers.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,52 @@
import os

import pandas as pd
import pytest

import pathway as pw
from pathway.tests.utils import assert_table_equality
from pathway.xpacks.llm.parsers import UnstructuredParser


@pytest.mark.environment_changes
def test_parse_unstructured(monkeypatch):
    """Check that UnstructuredParser hands back UTF-8 text unchanged.

    A one-row table holding the UTF-8 bytes of a sentence with Polish
    diacritics and an emoji is passed through the parser; the ``[0][0]``
    element of the parser output must equal the original string.
    """

    class schema(pw.Schema):
        raw: bytes

    expected_text = "Pójdź, kińże tę chmurność w głąb flaszy 🍾."
    source_rows = pd.DataFrame({"raw": [expected_text.encode("utf8")]})
    expected_rows = pd.DataFrame({"ret": [expected_text]})

    unstructured = UnstructuredParser()
    source = pw.debug.table_from_pandas(source_rows, schema=schema)

    # Keep only the [0][0] element of whatever the parser emits for the row.
    parsed = unstructured(pw.this.raw)
    actual = source.select(ret=parsed[0][0])

    expected = pw.debug.table_from_pandas(expected_rows)
    assert_table_equality(actual, expected)


@pytest.mark.environment_changes
def test_parse_unstructured_unk_exception(monkeypatch):
    """Check the error raised when UnstructuredParser gets unrecognisable bytes.

    The input is a made-up marker followed by random bytes. Running the
    parser over it must raise, and the message must contain both the libmagic
    hint and the ``FileType.UNK`` marker.

    This is a plain synchronous test, so it carries no ``asyncio`` mark.
    """
    parser = UnstructuredParser()

    # Random payload behind a made-up marker; the assertions below expect it
    # to be reported as an unknown file type.
    binary_data = b"NONEXISTING_FMT" + os.urandom(2048)

    input_df = pd.DataFrame([dict(raw=binary_data)])

    class schema(pw.Schema):
        raw: bytes

    input_table = pw.debug.table_from_pandas(input_df, schema=schema)

    # The concrete exception type is not visible from this file, so the test
    # catches Exception and validates the message instead.
    with pytest.raises(Exception) as excinfo:
        result = input_table.select(ret=parser(pw.this.raw)[0][0])
        pw.debug.compute_and_print(result)

    exception_msg = str(excinfo.value)

    assert (
        "This error may indicate libmagic (magic) dependency is missing."
        in exception_msg
    )
    assert "FileType.UNK" in exception_msg
48 changes: 1 addition & 47 deletions python/pathway/xpacks/llm/tests/test_parsers.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,17 +2,15 @@

from __future__ import annotations

import os
from pathlib import Path

import nltk
import pandas as pd
import pytest
from fpdf import FPDF

import pathway as pw
from pathway.tests.utils import assert_table_equality
from pathway.xpacks.llm.parsers import PypdfParser, UnstructuredParser, Utf8Parser
from pathway.xpacks.llm.parsers import PypdfParser, Utf8Parser

for _ in range(10):
try:
Expand Down Expand Up @@ -44,50 +42,6 @@ class schema(pw.Schema):
)


@pytest.mark.environment_changes
def test_parse_unstructured(monkeypatch):
    """Check that UnstructuredParser hands back UTF-8 text unchanged.

    A one-row table holding the UTF-8 bytes of a sentence with Polish
    diacritics and an emoji is passed through the parser; the ``[0][0]``
    element of the parser output must equal the original string.
    """

    class schema(pw.Schema):
        raw: bytes

    expected_text = "Pójdź, kińże tę chmurność w głąb flaszy 🍾."
    source_rows = pd.DataFrame({"raw": [expected_text.encode("utf8")]})
    expected_rows = pd.DataFrame({"ret": [expected_text]})

    unstructured = UnstructuredParser()
    source = pw.debug.table_from_pandas(source_rows, schema=schema)

    # Keep only the [0][0] element of whatever the parser emits for the row.
    parsed = unstructured(pw.this.raw)
    actual = source.select(ret=parsed[0][0])

    expected = pw.debug.table_from_pandas(expected_rows)
    assert_table_equality(actual, expected)


@pytest.mark.environment_changes
def test_parse_unstructured_unk_exception(monkeypatch):
    """Check the error raised when UnstructuredParser gets unrecognisable bytes.

    The input is a made-up marker followed by random bytes. Running the
    parser over it must raise, and the message must contain both the libmagic
    hint and the ``FileType.UNK`` marker.

    This is a plain synchronous test, so it carries no ``asyncio`` mark.
    """
    parser = UnstructuredParser()

    # Random payload behind a made-up marker; the assertions below expect it
    # to be reported as an unknown file type.
    binary_data = b"NONEXISTING_FMT" + os.urandom(2048)

    input_df = pd.DataFrame([dict(raw=binary_data)])

    class schema(pw.Schema):
        raw: bytes

    input_table = pw.debug.table_from_pandas(input_df, schema=schema)

    # The concrete exception type is not visible from this file, so the test
    # catches Exception and validates the message instead.
    with pytest.raises(Exception) as excinfo:
        result = input_table.select(ret=parser(pw.this.raw)[0][0])
        pw.debug.compute_and_print(result)

    exception_msg = str(excinfo.value)

    assert (
        "This error may indicate libmagic (magic) dependency is missing."
        in exception_msg
    )
    assert "FileType.UNK" in exception_msg


def _create_temp_pdf_with_text(text: str, path: Path) -> Path:
class PDF(FPDF):
def header(self):
Expand Down
3 changes: 1 addition & 2 deletions python/pathway/xpacks/llm/tests/test_vector_store.py
Original file line number Diff line number Diff line change
Expand Up @@ -181,8 +181,7 @@ async def fake_embeddings_model(x: str) -> list[float]:
assert call_count == 4 # dimension x 2 (no cache used), doc, query


@pytest.mark.environment_changes # unstructured parser adds env vars after first use
@pytest.mark.parametrize("parser_cls", [parsers.UnstructuredParser])
@pytest.mark.parametrize("parser_cls", [parsers.Utf8Parser])
def test_vs_parsing(parser_cls):
def fake_embeddings_model(x: str) -> list[float]:
return [1.0, 1.0, 0.0]
Expand Down

0 comments on commit aa94353

Please sign in to comment.