-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathextract_text_from_pdf_exception_sample.py
112 lines (91 loc) · 4.87 KB
/
extract_text_from_pdf_exception_sample.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
"""
Copyright 2024 Adobe
All Rights Reserved.
NOTICE: Adobe permits you to use, modify, and distribute this file in
accordance with the terms of the Adobe license agreement accompanying it.
"""
import logging
import os
import sys
from datetime import datetime
from adobe.pdfservices.operation.auth.service_principal_credentials import ServicePrincipalCredentials
from adobe.pdfservices.operation.exception.exceptions import ServiceApiException, ServiceUsageException, SdkException
from adobe.pdfservices.operation.io.cloud_asset import CloudAsset
from adobe.pdfservices.operation.io.stream_asset import StreamAsset
from adobe.pdfservices.operation.pdf_services import PDFServices
from adobe.pdfservices.operation.pdf_services_media_type import PDFServicesMediaType
from adobe.pdfservices.operation.pdfjobs.jobs.extract_pdf_job import ExtractPDFJob
from adobe.pdfservices.operation.pdfjobs.params.extract_pdf.extract_element_type import ExtractElementType
from adobe.pdfservices.operation.pdfjobs.params.extract_pdf.extract_pdf_params import ExtractPDFParams
from adobe.pdfservices.operation.pdfjobs.result.extract_pdf_result import ExtractPDFResult
# Initialize the logger
logging.basicConfig(level=logging.INFO)
#
# This sample illustrates how to extract text information from a PDF and how to
# handle the exceptions the PDF Services SDK can raise while doing so.
#
# Refer to README.md for instructions on how to run the samples & understand output zip file.
#
class ExtractTextInfoFromPDF:
    """Extract the text elements of a PDF and save the result archive.

    Instantiating the class runs the whole sample: the PDF named by the first
    command-line argument is read from the ``inputs/`` directory, uploaded,
    processed by an ``ExtractPDFJob`` restricted to text elements, and the
    resulting asset is written to a timestamped ``.zip`` file under
    ``output/ExtractTextInfoFromPDF``.

    ``ServiceApiException``, ``ServiceUsageException`` and ``SdkException``
    are caught and logged; any other exception (for example a missing input
    file) propagates to the caller.
    """

    def __init__(self):
        """Run the extraction end to end.

        Raises:
            IndexError: if no input file name was given on the command line.
            OSError: if the input file cannot be read or the output file
                cannot be written.
        """
        try:
            input_stream = self._read_input_file()

            # Initial setup, create credentials instance. The credentials are
            # taken from the environment; os.getenv yields None when unset.
            credentials = ServicePrincipalCredentials(
                client_id=os.getenv('PDF_SERVICES_CLIENT_ID'),
                client_secret=os.getenv('PDF_SERVICES_CLIENT_SECRET')
            )

            # Creates a PDF Services instance
            pdf_services = PDFServices(credentials=credentials)

            # Creates an asset(s) from source file(s) and upload
            input_asset = pdf_services.upload(input_stream=input_stream, mime_type=PDFServicesMediaType.PDF)

            # Create parameters for the job
            extract_pdf_params = ExtractPDFParams(
                elements_to_extract=[ExtractElementType.TEXT],
            )

            # Creates a new job instance
            extract_pdf_job = ExtractPDFJob(input_asset=input_asset, extract_pdf_params=extract_pdf_params)

            # Submit the job and gets the job result
            location = pdf_services.submit(extract_pdf_job)
            pdf_services_response = pdf_services.get_job_result(location, ExtractPDFResult)

            # Get content from the resulting asset(s)
            result_asset: CloudAsset = pdf_services_response.get_result().get_resource()
            stream_asset: StreamAsset = pdf_services.get_content(result_asset)

            # Creates an output stream and copy stream asset's content to it
            output_file_path = self.create_output_file_path()
            with open(output_file_path, "wb") as output_file:
                output_file.write(stream_asset.get_input_stream())

        except ServiceApiException as service_api_exception:
            # ServiceApiException is thrown when an underlying service API call results in an error.
            self.handle_exception("ServiceApiException",
                                  service_api_exception.message,
                                  service_api_exception.status_code)

        except ServiceUsageException as service_usage_exception:
            # ServiceUsageException is thrown when either service usage limit has been reached or credentials quota
            # has been exhausted.
            self.handle_exception("ServiceUsageException",
                                  service_usage_exception.message,
                                  service_usage_exception.status_code)

        except SdkException as sdk_exception:
            # SdkException is typically thrown for client-side or network errors.
            # No status code is passed for this exception type.
            self.handle_exception("SdkException",
                                  sdk_exception.message,
                                  None)

    # Reads the PDF named on the command line from the inputs/ directory
    @staticmethod
    def _read_input_file() -> bytes:
        """Return the raw bytes of ``inputs/<first command-line argument>``.

        Raises:
            IndexError: if the script was started without an input file name.
            OSError: if the file cannot be opened or read.
        """
        if len(sys.argv) < 2:
            # Same exception type as a bare sys.argv[1] lookup, but with a
            # message that tells the user what is missing.
            raise IndexError(
                "missing input file name: pass the name of a PDF located in "
                "the 'inputs/' directory as the first argument"
            )
        # The context manager closes the handle even if read() fails.
        with open('inputs/' + sys.argv[1], 'rb') as input_file:
            return input_file.read()

    # Generates a string containing a directory structure and file name for the output file
    @staticmethod
    def create_output_file_path() -> str:
        """Create the output directory if needed and return a new file path.

        Returns:
            A path of the form
            ``output/ExtractTextInfoFromPDF/extract<timestamp>.zip`` where the
            timestamp is the current local time with one-second resolution.
        """
        now = datetime.now()
        time_stamp = now.strftime("%Y-%m-%dT%H-%M-%S")
        os.makedirs("output/ExtractTextInfoFromPDF", exist_ok=True)
        return f"output/ExtractTextInfoFromPDF/extract{time_stamp}.zip"

    # Helper function to log type of exception, exception code and exception message
    @staticmethod
    def handle_exception(exception_type, exception_message, status_code) -> None:
        """Log an SDK failure as separate ERROR records.

        Args:
            exception_type: name of the exception class that was caught.
            exception_message: message carried by the exception.
            status_code: status code carried by the exception, or ``None``
                when there is none to report (the record is then skipped).
        """
        # Failures are reported at ERROR level so they stay visible under the
        # default WARNING threshold and are distinguishable from progress logs.
        logging.error(exception_type)
        if status_code is not None:
            logging.error(status_code)
        logging.error(exception_message)
# Script entry point: constructing the class runs the whole extraction sample
# (all of the work happens in ExtractTextInfoFromPDF.__init__).
if __name__ == "__main__":
    ExtractTextInfoFromPDF()