Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[IDWA-OCR-96]Create segmentation template and labels file using PDF metadata #99

Merged
merged 32 commits into from
Jun 24, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
32 commits
Select commit Hold shift + click to select a range
8f0b288
initial
arinkulshi Apr 17, 2024
04ac8bb
form_asset
arinkulshi Apr 17, 2024
e645e4b
edited script
arinkulshi Apr 18, 2024
a7b212c
initial
arinkulshi Apr 24, 2024
18d2591
Merge branch 'main' into pdf_metadata_auto_segmentation
arinkulshi Apr 24, 2024
0416355
removed redundant files
arinkulshi Apr 25, 2024
e59bbae
edit to file path config
arinkulshi Apr 25, 2024
02518d2
linting
arinkulshi Apr 25, 2024
7ff60fb
file name change
arinkulshi Apr 25, 2024
336c131
added new packages removed previous image extract logic
arinkulshi Apr 25, 2024
0390c36
edited package from pypdf2 to pypdf
arinkulshi Apr 25, 2024
3271cf3
added pdf file for context
arinkulshi Apr 25, 2024
e01186a
edited imports on test
arinkulshi Apr 25, 2024
0dbb0cc
edited comments
arinkulshi Apr 25, 2024
25e39b2
linting
arinkulshi Apr 25, 2024
4dc00c7
added new functions
arinkulshi Apr 30, 2024
55215aa
edits to formatting
arinkulshi Apr 30, 2024
836cdff
added new approach to segment fields
arinkulshi May 2, 2024
5cc895b
added tests
arinkulshi May 2, 2024
7e6fac0
edited tests
arinkulshi May 2, 2024
771e221
linting
arinkulshi May 2, 2024
31e41d1
edited string formatting
arinkulshi May 15, 2024
b1abb2d
linting
arinkulshi May 15, 2024
318b040
Update pdf_field_extractor_main.py
arinkulshi-skylight May 16, 2024
cd8527c
added new end to end test
arinkulshi May 16, 2024
5fa6712
added new end to end test to verify colors
arinkulshi May 16, 2024
da6bb98
edited end to end test
arinkulshi May 17, 2024
3fe261b
edited end to end test to only include color check
arinkulshi May 17, 2024
9c4b7c8
move color matches to a list and tested against a list
arinkulshi May 20, 2024
2a9825f
linting
arinkulshi May 20, 2024
77797e4
merge
arinkulshi Jun 21, 2024
6ff4e5e
poetry file update
arinkulshi Jun 21, 2024
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
10 changes: 10 additions & 0 deletions OCR/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -43,4 +43,14 @@ poetry lock
To view installed packages in the virtual env
```shell
poetry show
```

To lint your files using ruff
```shell
ruff check --fix
```

To format your files using ruff
```shell
ruff format
```
10 changes: 10 additions & 0 deletions OCR/ocr/pdf_field_extractor_main.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,10 @@
from ocr.services.pdf_field_extractor import PDFFieldExtractor
import os

# Example driver: mark every form field of the sample PDF with a uniquely
# coloured rectangle, write the colour -> field-name labels file, and render
# the marked PDF to PNG images (one per page).

# Resolve the sample asset relative to this script so the run does not depend
# on the current working directory.
current_script_dir = os.path.dirname(os.path.abspath(__file__))
file_relative_path = "../tests/assets/per_example.pdf"
file_absolute_path = os.path.join(current_script_dir, file_relative_path)

extractor = PDFFieldExtractor(file_absolute_path)
extractor.initialize_reader()
try:
    # `output` is the path of the marked PDF, `labels` the path of the JSON
    # file mapping each rectangle colour to its field name.
    output, labels = extractor.mark_rectangles_on_pdf()
    extractor.pdf_to_images(output)
finally:
    # Release the reader even when marking or rendering fails.
    extractor.close_reader()
224 changes: 224 additions & 0 deletions OCR/ocr/services/pdf_field_extractor.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,224 @@
import json
import os
import random
from typing import Dict, List, Optional, Tuple

import pypdf
from pdf2image import convert_from_path


class PDFFieldExtractor:
    """
    Build a segmentation template from the annotations of a PDF.

    Every annotation that has a rectangle is replaced by a filled square of a
    unique random colour; the colour -> field-name mapping is written next to
    the marked PDF as a JSON labels file.
    """

    def __init__(self, file_path: str):
        """
        Initialize the PDF Field Extractor with a specific file path.

        Parameters:
            file_path (str): The path to the PDF file to be processed.
        """
        self.file_path = file_path
        self.reader = None
        self.form_fields = []
        # (colour written to the labels file, colour read back from the PDF
        # annotation) pairs, appended by mark_rectangles_on_pdf().
        self.color_matches = []

    def initialize_reader(self, base_path: Optional[str] = None) -> None:
        """
        Initialize the PDF reader for the specified file.

        Parameters:
            base_path (str, optional): The base path where the PDF file is located.
                If None, it defaults to the directory of this script. When
                ``file_path`` is absolute, ``os.path.join`` ignores the base path.
        """
        if base_path is None:
            base_path = os.path.dirname(__file__)
        full_path = os.path.join(base_path, self.file_path)
        self.reader = pypdf.PdfReader(full_path)

    def close_reader(self) -> None:
        """
        Close the PDF reader to release resources.

        Does nothing when no reader is open.
        """
        if self.reader is not None:
            self.reader.stream.close()
            self.reader = None

    @staticmethod
    def _resolve(obj):
        """Return the object an indirect reference points to; pass anything else through."""
        if isinstance(obj, pypdf.generic.IndirectObject):
            return obj.get_object()
        return obj

    @staticmethod
    def _components_to_rgb_string(components) -> str:
        """
        Convert colour components in the 0..1 range to an "r,g,b" style string.

        ``round`` is used instead of ``int`` because ``(k / 255.0) * 255`` is not
        guaranteed to be exactly ``k`` in floating point; truncation could then
        yield ``k - 1``.
        """
        return ",".join(str(round(component * 255)) for component in components)

    def _pick_unused_color(self, taken) -> str:
        """Draw random colours until one is found that is not already in ``taken``."""
        while True:
            candidate = self.generate_random_color()
            if candidate not in taken:
                return candidate

    def list_annotations(self):
        """
        Print the annotations found on each page of the PDF (debugging aid).

        For every annotation the subtype, the field name ("/T") and the
        rectangle coordinates ("/Rect") are printed.

        Raises:
            ValueError: If the PDF reader is not initialized.

        Returns:
            None: The information is printed, nothing is returned.
        """
        if self.reader is None:
            raise ValueError("PDF reader is not initialized. Call initialize_reader() first.")

        for page_number, page in enumerate(self.reader.pages, start=1):
            print(f"Page {page_number}:")
            # The annotation array itself may be stored as an indirect reference.
            annotations = self._resolve(page.get("/Annots"))
            if annotations is None:
                print(" No annotations on this page.")
                continue
            for annot in annotations:
                annot = self._resolve(annot)
                field_name = annot.get("/T")
                rect = annot.get("/Rect")
                subtype = annot.get("/Subtype")
                print(f"Annotation - Type: {subtype}, Field Name: {field_name}, Coordinates: {rect}")

    def generate_random_color(self) -> str:
        """
        Generate a random RGB color.

        Returns:
            str: A string representing the RGB color in the format "r,g,b".
        """
        r, g, b = [random.randint(0, 255) for _ in range(3)]
        return f"{r},{g},{b}"

    def create_rectangle_annotation(self, rect: List[float], color_str: str):
        """
        Create a rectangle annotation dictionary

        Parameters:
            rect (list): The rectangle coordinates for the annotation. The value
                is stored as-is, so it should be something pypdf can write
                (typically the "/Rect" array of an existing annotation).
            color_str (str): A string representing the RGB color in the format "r,g,b".

        Returns:
            pypdf.generic.DictionaryObject: The annotation dictionary object.
        """
        # PDF colour components are expressed in the 0..1 range.
        components = [int(part) / 255.0 for part in color_str.split(",")]

        def color_array():
            # A fresh array per key so "/C" and "/IC" do not share one object.
            return pypdf.generic.ArrayObject([pypdf.generic.FloatObject(value) for value in components])

        return pypdf.generic.DictionaryObject(
            {
                pypdf.generic.NameObject("/Type"): pypdf.generic.NameObject("/Annot"),
                pypdf.generic.NameObject("/Subtype"): pypdf.generic.NameObject("/Square"),
                pypdf.generic.NameObject("/Rect"): rect,
                # Border colour and interior (fill) colour are identical so the
                # whole rectangle is one flat colour.
                pypdf.generic.NameObject("/C"): color_array(),
                pypdf.generic.NameObject("/IC"): color_array(),
                # Annotation flags; 4 is the "Print" bit in the PDF specification.
                pypdf.generic.NameObject("/F"): pypdf.generic.NumberObject(4),
                # Border style with zero width.
                pypdf.generic.NameObject("/BS"): pypdf.generic.DictionaryObject(
                    {pypdf.generic.NameObject("/W"): pypdf.generic.FloatObject(0)}
                ),
            }
        )

    def update_annotations_and_save(
        self, output_path: str, pages, color_label_map: Dict[str, str]
    ) -> Tuple[str, str]:
        """
        Write modified pages to a new PDF and save the color-to-field mappings to a JSON file.

        The JSON file is written next to the PDF, using the PDF path with its
        extension replaced by "_labels.json".

        Parameters:
            output_path (str): The path to save the modified PDF.
            pages (list): A list of page objects that have been modified.
            color_label_map (dict): A dictionary mapping RGB strings to field names.

        Returns:
            tuple: A tuple containing the paths to the saved PDF and JSON files.
        """
        writer = pypdf.PdfWriter()
        for page in pages:
            writer.add_page(page)
        with open(output_path, "wb") as output_stream:
            writer.write(output_stream)

        # Save the color mappings
        labels_path = os.path.splitext(output_path)[0] + "_labels.json"
        with open(labels_path, "w", encoding="utf-8") as json_file:
            json.dump(color_label_map, json_file, indent=4)
        return output_path, labels_path

    def mark_rectangles_on_pdf(self):
        """
        Process the PDF to add rectangle annotations and save the document along with a JSON mapping file.

        Every annotation with a "/Rect" entry is replaced by a filled square in
        a colour that is unique within this run; annotations without a
        rectangle are kept unchanged. Annotations without a field name ("/T")
        get a generated "invalid_string_<random>" label. The marked PDF is
        written to the input path with "_marked.pdf" in place of the extension.

        Raises:
            ValueError: If the PDF reader is not initialized.

        Returns:
            tuple: Paths to the modified PDF file and the JSON labels file.
        """
        if self.reader is None:
            raise ValueError("PDF reader is not initialized. Call initialize_reader() first.")

        color_label_map: Dict[str, str] = {}
        printed = 0  # only the first few colour pairs are echoed to stdout
        for page in self.reader.pages:
            # The annotation array itself may be stored as an indirect reference.
            annotations = self._resolve(page.get("/Annots", pypdf.generic.ArrayObject()))

            new_annotations = pypdf.generic.ArrayObject()
            for annot in annotations:
                annot = self._resolve(annot)
                rect = annot.get("/Rect")
                if not rect:
                    # Nothing to draw without a rectangle: keep the annotation as it is.
                    new_annotations.append(annot)
                    continue

                field = annot.get("/T") or f"invalid_string_{random.random()}"

                # A repeated colour would overwrite an earlier label, so draw
                # until the colour is unused.
                color_str = self._pick_unused_color(color_label_map)
                color_label_map[color_str] = field

                new_annot = self.create_rectangle_annotation(rect, color_str)
                new_annotations.append(new_annot)

                # Read the colour back from the annotation so callers can verify
                # it matches what goes into the labels file.
                pdf_color_str = self._components_to_rgb_string(new_annot["/C"])
                self.color_matches.append((color_str, pdf_color_str))
                if printed < 5:
                    print(f"Color in labels file: {color_str}")
                    print(f"Color in PDF annotation: {pdf_color_str}")
                    printed += 1

            # set the annotations array back to the page
            page[pypdf.generic.NameObject("/Annots")] = new_annotations

        output_path = os.path.splitext(self.file_path)[0] + "_marked.pdf"
        return self.update_annotations_and_save(output_path, self.reader.pages, color_label_map)

    def pdf_to_images(self, path) -> List[str]:
        """
        Converts each page of the PDF to a PNG image file.

        Images are rendered at 300 dpi and saved as "page_<n>.png" in the
        directory of ``self.file_path`` (the current directory when the path has
        no directory component).

        Parameters:
            path (str): Path of the PDF to render, e.g. the marked PDF returned
                by mark_rectangles_on_pdf().

        Raises:
            ValueError: If the PDF reader is not initialized.

        Returns:
            List[str]: A list containing the paths to the saved image files for each page of the PDF.
        """
        if self.reader is None:
            raise ValueError("PDF reader is not initialized. Call initialize_reader() first.")

        # os.makedirs("") raises, so fall back to the current directory for a
        # bare file name.
        output_folder = os.path.dirname(os.path.splitext(self.file_path)[0]) or "."
        os.makedirs(output_folder, exist_ok=True)

        image_paths = []
        # Save each image to the defined path
        for page_number, image in enumerate(convert_from_path(path, dpi=300), start=1):
            image_path = os.path.join(output_folder, f"page_{page_number}.png")
            image.save(image_path, "PNG")
            image_paths.append(image_path)

        return image_paths

    def get_color_matches(self) -> List[Tuple[str, str]]:
        """
        Get the color matches for testing purposes

        Returns:
            List[Tuple[str, str]]: A list of tuples containing the colors in labels and the colors in the PDF annotations.
        """
        return self.color_matches
Loading
Loading