From bb6fdbff6d4c77c8c4426ea427b5ade241d72b7c Mon Sep 17 00:00:00 2001 From: Rongxin Date: Sat, 23 Nov 2024 16:48:19 +0800 Subject: [PATCH] feat (main): pdf/a auto converter --- pdf2zh/pdf2zh.py | 43 +++++++++++++++++++++++++++++++++++++++++-- 1 file changed, 41 insertions(+), 2 deletions(-) diff --git a/pdf2zh/pdf2zh.py b/pdf2zh/pdf2zh.py index 7153232c..2cfbcd9e 100644 --- a/pdf2zh/pdf2zh.py +++ b/pdf2zh/pdf2zh.py @@ -8,12 +8,13 @@ import argparse import logging import os +import subprocess import sys +from pathlib import Path from typing import TYPE_CHECKING, Any, Container, Iterable, List, Optional import pymupdf from huggingface_hub import hf_hub_download -from pathlib import Path from pdf2zh import __version__ from pdf2zh.pdfexceptions import PDFValueError @@ -98,7 +99,45 @@ def extract_text( for file in files: filename = os.path.splitext(os.path.basename(file))[0] - doc_en = pymupdf.open(file) + def convert_to_pdfa(input_pdf_path, output_pdfa_path): + """ + Converts a PDF to PDF/A format using Ghostscript. + Args: + input_pdf_path (str): Path to the input PDF file. + output_pdfa_path (str): Path where the PDF/A file will be saved. 
+ """ + try: + # Ghostscript command for conversion + command = [ + "gs", + "-dPDFA", + "-dBATCH", + "-dNOPAUSE", + "-dNOOUTERSAVE", + "-sDEVICE=pdfwrite", + "-sOutputFile=" + output_pdfa_path, + "-dPDFACompatibilityPolicy=1", + input_pdf_path, + ] + + # Run the command + subprocess.run(command, check=True) + print( + f"Successfully converted {input_pdf_path} to PDF/A at {output_pdfa_path}" + ) + except subprocess.CalledProcessError as e: + print(f"Error during conversion: {e}") + except FileNotFoundError: + print("Ghostscript is not installed or not found in the PATH.") + + try: + file_pdfa = f"{str(file)}-pdfa.pdf" + convert_to_pdfa(file, file_pdfa) + doc_en = pymupdf.open(file_pdfa) + except Exception as e: + print(f"Error converting PDF: {e}") + doc_en = pymupdf.open(file) + page_count = doc_en.page_count font_list = ["china-ss", "tiro"] font_id = {}