forked from mayooear/gpt4-pdf-chatbot-langchain
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathcombine.py
38 lines (31 loc) · 1.37 KB
/
combine.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
import pdfplumber
import os
from fpdf import FPDF
# Combine the text of every PDF in a directory into one text-only PDF.
#
# Each source page that yields extractable text becomes one new page in the
# output document; pages with no extractable text are skipped.

# Define the directory containing the PDF files
pdf_directory = r'C:\Users\eagle\OneDrive\Documents\Dev\gpt4-pdf-chatbot-langchain\docs'

# The output lives in the same directory as the inputs, so its path is
# computed up front: it must be excluded from the scan below, otherwise a
# second run would fold the previous output back into the new one.
output_filename = os.path.join(pdf_directory, 'combined_text.pdf')
output_basename = os.path.normcase(os.path.basename(output_filename))

# Create a PDF object for the output file
pdf_output = FPDF()
pdf_output.set_auto_page_break(auto=True, margin=15)

# Add the "DejaVu Sans" font to the PDF object.
# NOTE(review): "DejaVuSans.ttf" is a bare file name, so it presumably has to
# be discoverable from the working directory the script is run in - confirm.
pdf_output.add_font("DejaVu", style="", fname="DejaVuSans.ttf")

# Set the font to "DejaVu Sans"
pdf_output.set_font("DejaVu", size=12)

# Iterate over all the files in the directory. os.listdir() returns entries
# in arbitrary order, so sort them to make the combined page order stable.
for filename in sorted(os.listdir(pdf_directory)):
    # Check if the file is a PDF (case-insensitively, so "X.PDF" also counts)
    if not filename.lower().endswith('.pdf'):
        continue

    # Never read back our own output from a previous run
    if os.path.normcase(filename) == output_basename:
        continue

    # Open the PDF file
    with pdfplumber.open(os.path.join(pdf_directory, filename)) as pdf:
        # Iterate over all the pages in the PDF file
        for page in pdf.pages:
            # Extract the text from the page
            text = page.extract_text()
            if text:
                # Add a new page to the output PDF
                pdf_output.add_page()
                # Write the extracted text to the output PDF
                pdf_output.multi_cell(0, 10, text)

# Create the output file
pdf_output.output(output_filename)
print('Text from PDF files has been successfully extracted and combined into', output_filename)