Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Adding CSV Parser #1996

Merged
merged 14 commits into from
Oct 2, 2024
2 changes: 2 additions & 0 deletions app/backend/prepdocs.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,7 @@

from load_azd_env import load_azd_env
from prepdocslib.blobmanager import BlobManager
from prepdocslib.csvparser import CsvParser
from prepdocslib.embeddings import (
AzureOpenAIEmbeddingService,
ImageEmbeddings,
Expand Down Expand Up @@ -190,6 +191,7 @@ def setup_file_processors(
".json": FileProcessor(JsonParser(), SimpleTextSplitter()),
".md": FileProcessor(TextParser(), sentence_text_splitter),
".txt": FileProcessor(TextParser(), sentence_text_splitter),
".csv": FileProcessor(CsvParser(), sentence_text_splitter),
}
# These require either a Python package or Document Intelligence
if pdf_parser is not None:
Expand Down
31 changes: 31 additions & 0 deletions app/backend/prepdocslib/csvparser.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,31 @@
import csv
from typing import IO, AsyncGenerator

from .page import Page
from .parser import Parser


class CsvParser(Parser):
    """
    Concrete parser that can parse CSV into Page objects. The header row is
    skipped; each subsequent data row becomes one Page whose text is the
    comma-joined cell values.
    """

    async def parse(self, content: IO) -> AsyncGenerator[Page, None]:
        """Yield one Page per CSV data row.

        Args:
            content: Raw ``bytes``/``bytearray``, a file-like object opened in
                binary or text mode, or a plain ``str`` containing UTF-8 CSV data.
        """
        # Normalize the input to a single text string regardless of how it
        # was supplied.
        content_str: str
        if isinstance(content, (bytes, bytearray)):
            content_str = content.decode("utf-8")
        elif hasattr(content, "read"):  # File-like object (e.g. BufferedReader, StringIO)
            data = content.read()
            # A binary stream yields bytes that still need decoding;
            # a text-mode stream already yields str.
            content_str = data.decode("utf-8") if isinstance(data, (bytes, bytearray)) else data
        else:
            # Plain str input: previously this branch was missing, so the
            # first use of content_str raised UnboundLocalError.
            content_str = str(content)

        # csv.reader handles quoting/escaping that a naive split would not.
        reader = csv.reader(content_str.splitlines())
        offset = 0

        # Skip the header row; only data rows become pages.
        next(reader, None)

        for i, row in enumerate(reader):
            page_text = ",".join(row)
            yield Page(i, offset, page_text)
            offset += len(page_text) + 1  # Account for the newline between rows
8 changes: 4 additions & 4 deletions tests/test_app_config.py
Original file line number Diff line number Diff line change
Expand Up @@ -63,7 +63,7 @@ async def test_app_user_upload_processors(monkeypatch, minimal_env):
async with quart_app.test_app():
ingester = quart_app.config[app.CONFIG_INGESTER]
assert ingester is not None
assert len(ingester.file_processors.keys()) == 5
assert len(ingester.file_processors.keys()) == 6


@pytest.mark.asyncio
Expand All @@ -77,7 +77,7 @@ async def test_app_user_upload_processors_docint(monkeypatch, minimal_env):
async with quart_app.test_app():
ingester = quart_app.config[app.CONFIG_INGESTER]
assert ingester is not None
assert len(ingester.file_processors.keys()) == 14
assert len(ingester.file_processors.keys()) == 15


@pytest.mark.asyncio
Expand All @@ -92,7 +92,7 @@ async def test_app_user_upload_processors_docint_localpdf(monkeypatch, minimal_e
async with quart_app.test_app():
ingester = quart_app.config[app.CONFIG_INGESTER]
assert ingester is not None
assert len(ingester.file_processors.keys()) == 14
assert len(ingester.file_processors.keys()) == 15
assert ingester.file_processors[".pdf"] is not ingester.file_processors[".pptx"]


Expand All @@ -108,7 +108,7 @@ async def test_app_user_upload_processors_docint_localhtml(monkeypatch, minimal_
async with quart_app.test_app():
ingester = quart_app.config[app.CONFIG_INGESTER]
assert ingester is not None
assert len(ingester.file_processors.keys()) == 14
assert len(ingester.file_processors.keys()) == 15
assert ingester.file_processors[".html"] is not ingester.file_processors[".pptx"]


Expand Down
57 changes: 57 additions & 0 deletions tests/test_csvparser.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,57 @@
import io

import pytest

from prepdocslib.csvparser import CsvParser # Adjust import to the correct module


@pytest.mark.asyncio
async def test_csvparser_single_row():
    """A single data row after the header yields exactly one Page."""
    source = io.BytesIO(b"col1,col2,col3\nvalue1,value2,value3")
    source.name = "test.csv"
    parser = CsvParser()

    # Collect every page the parser produces.
    results = []
    async for page in parser.parse(source):
        results.append(page)

    # Only the data row (not the header) should become a page.
    assert len(results) == 1
    first = results[0]
    assert first.page_num == 0
    assert first.offset == 0
    assert first.text == "value1,value2,value3"


@pytest.mark.asyncio
async def test_csvparser_multiple_rows():
    """Each data row becomes its own Page with a cumulative offset."""
    source = io.BytesIO(b"col1,col2,col3\nvalue1,value2,value3\nvalue4,value5,value6")
    source.name = "test.csv"
    parser = CsvParser()

    results = []
    async for page in parser.parse(source):
        results.append(page)

    # The header row is skipped, leaving two data pages.
    assert len(results) == 2

    first, second = results
    assert first.page_num == 0
    assert first.offset == 0
    assert first.text == "value1,value2,value3"

    # The second page starts right after the first row plus its newline.
    assert second.page_num == 1
    assert second.offset == len(first.text) + 1
    assert second.text == "value4,value5,value6"


@pytest.mark.asyncio
async def test_csvparser_empty_file():
    """An empty input produces no pages at all."""
    source = io.BytesIO(b"")
    source.name = "test.csv"
    parser = CsvParser()

    results = []
    async for page in parser.parse(source):
        results.append(page)

    # With no header and no rows, nothing should be yielded.
    assert results == []