From 02518d2307449499a7f0efd6352fc6eeef0dfa16 Mon Sep 17 00:00:00 2001 From: Arindam Kulshi Date: Thu, 25 Apr 2024 11:14:03 -0700 Subject: [PATCH] linting --- OCR/ocr/pdf_segmentor_main.py | 2 +- OCR/ocr/services/pdf_field_extractor.py | 39 ++++++++++++------------- 2 files changed, 20 insertions(+), 21 deletions(-) diff --git a/OCR/ocr/pdf_segmentor_main.py b/OCR/ocr/pdf_segmentor_main.py index 1afaf58b..a70e5378 100644 --- a/OCR/ocr/pdf_segmentor_main.py +++ b/OCR/ocr/pdf_segmentor_main.py @@ -7,4 +7,4 @@ extractor = PDFFieldExtractor(file_absolute_path) extractor.initialize_reader() extractor.segment_fields(["Region", "ParentGuardian"]) -extractor.extract_images() \ No newline at end of file +extractor.extract_images() diff --git a/OCR/ocr/services/pdf_field_extractor.py b/OCR/ocr/services/pdf_field_extractor.py index 9a280fb0..68520be8 100644 --- a/OCR/ocr/services/pdf_field_extractor.py +++ b/OCR/ocr/services/pdf_field_extractor.py @@ -3,41 +3,41 @@ from PIL import Image import os + class PDFFieldExtractor: def __init__(self, file_path): self.file_path = file_path - self.reader = None + self.reader = None self.form_fields = [] - def initialize_reader(self, base_path=None): if base_path is None: base_path = os.path.dirname(__file__) full_path = os.path.join(base_path, self.file_path) self.reader = PyPDF2.PdfReader(full_path) - + def close_reader(self): if self.reader is not None: self.reader.stream.close() # Close the stream explicitly self.reader = None def segment_fields(self, field_names): - # Iterate through each page in the PDF - if self.reader is None: - raise ValueError("PDF reader is not initialized. Call initialize_reader() first.") - for page in self.reader.pages: - # Check if there are annotations (textboxes are considered annotations) - if '/Annots' in page: - annotations = page['/Annots'] - for annot in annotations: - if isinstance(annot, PyPDF2.generic.IndirectObject): - annot = annot.get_object() - field = annot.get('/T') - rect = annot.get('/Rect') - if field and rect: - field_str = str(field) - if field_str in field_names: - self.form_fields.append((field_str, rect)) + # Iterate through each page in the PDF + if self.reader is None: + raise ValueError("PDF reader is not initialized. Call initialize_reader() first.") + for page in self.reader.pages: + # Check if there are annotations (textboxes are considered annotations) + if "/Annots" in page: + annotations = page["/Annots"] + for annot in annotations: + if isinstance(annot, PyPDF2.generic.IndirectObject): + annot = annot.get_object() + field = annot.get("/T") + rect = annot.get("/Rect") + if field and rect: + field_str = str(field) + if field_str in field_names: + self.form_fields.append((field_str, rect)) def extract_images(self): if self.reader is None: @@ -55,4 +55,3 @@ def extract_images(self): img = Image.frombytes("RGB", [pix.width, pix.height], pix.samples) img.save(f'extracted_{field_name.replace("/", "_")}.png') self.close_reader() -