Skip to content

Commit

Permalink
edit to file path config
Browse files Browse the repository at this point in the history
  • Loading branch information
arinkulshi committed Apr 25, 2024
1 parent 0416355 commit e59bbae
Show file tree
Hide file tree
Showing 2 changed files with 27 additions and 14 deletions.
10 changes: 10 additions & 0 deletions OCR/ocr/pdf_segmentor_main.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,10 @@
from OCR.ocr.services.pdf_field_extractor import PDFFieldExtractor
import os

# Location of the sample form, written relative to this script's directory.
file_relative_path = "../tests/assets/form_filled_example.pdf"
# Anchor the path on the script's own location so the run does not depend on
# the current working directory.
current_script_dir = os.path.dirname(os.path.abspath(__file__))
file_absolute_path = os.path.join(current_script_dir, file_relative_path)

# Pipeline: create the reader, collect the named form fields, then save an
# image for each collected field.
extractor = PDFFieldExtractor(file_absolute_path)
extractor.initialize_reader()
extractor.segment_fields(["Region", "ParentGuardian"])
extractor.extract_images()
31 changes: 17 additions & 14 deletions OCR/ocr/services/pdf_field_extractor.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,20 +6,26 @@
class PDFFieldExtractor:
def __init__(self, file_path):
self.file_path = file_path
self.reader = None
self.form_fields = []


def initialize_reader(self):
path = os.path.dirname(__file__)
full_path = os.path.join(path, self.file_path)
with open(full_path, 'rb') as file:
self.reader = PyPDF2.PdfReader(file)
def initialize_reader(self, base_path=None):
    """Create the PyPDF2 reader for this extractor's PDF.

    ``self.file_path`` is joined onto ``base_path`` and the result is handed
    to ``PyPDF2.PdfReader``; the reader is stored on ``self.reader``.

    Args:
        base_path: Directory to resolve ``self.file_path`` against. When
            omitted, the directory containing this module is used.
    """
    anchor_dir = os.path.dirname(__file__) if base_path is None else base_path
    # os.path.join discards anchor_dir when file_path is already absolute.
    self.reader = PyPDF2.PdfReader(os.path.join(anchor_dir, self.file_path))

def close_reader(self):
if self.reader is not None:
self.reader.stream.close() # Close the stream explicitly
self.reader = None

def segment_fields(self, field_names):
with open(self.file_path, 'rb') as file:
reader = PyPDF2.PdfReader(file)
# Iterate through each page in the PDF
for page in reader.pages:
if self.reader is None:
raise ValueError("PDF reader is not initialized. Call initialize_reader() first.")
for page in self.reader.pages:
# Check if there are annotations (textboxes are considered annotations)
if '/Annots' in page:
annotations = page['/Annots']
Expand All @@ -34,6 +40,8 @@ def segment_fields(self, field_names):
self.form_fields.append((field_str, rect))

def extract_images(self):
if self.reader is None:
raise ValueError("PDF reader is not initialized. Call initialize_reader() first.")
doc = fitz.open(self.file_path)
page = doc[0]
page_rect = page.mediabox
Expand All @@ -46,10 +54,5 @@ def extract_images(self):
pix = page.get_pixmap(clip=fitz.Rect(left, top, right, bottom), dpi=300)
img = Image.frombytes("RGB", [pix.width, pix.height], pix.samples)
img.save(f'extracted_{field_name.replace("/", "_")}.png')
break

self.close_reader()

file_relative_path = "../tests/assets/form_filled_example.pdf"
extractor = PDFFieldExtractor(file_relative_path)
extractor.segment_fields(["Region", "Address"])
extractor.extract_images()

0 comments on commit e59bbae

Please sign in to comment.