-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathpdf2text.py
81 lines (62 loc) · 2.26 KB
/
pdf2text.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
import os.path
import os
import sys
import pdftotext
import yaml
import re
import uuid
# get base path.
base_path = os.path.dirname(os.path.abspath(__file__))
# directory structure
input_dir = "/inputs/"
output_dir = "/outputs/"
# file to convert
filename = sys.argv[1]
file_path = base_path + input_dir + filename
filename = filename.split(".")
output_filename = filename[0] + ".yaml"
text = ''
# instead of open both files I decided to dump the whole text so that I can avoid looping every page
# and dumping each one which caused the inclussion of very anoying extra -''
# to be noted: there is a risk of memory errors if the file is huge
# Load your PDF
with open(file_path, "rb") as i:
pdf = pdftotext.PDF(i)
text = ''.join(pdf)
# write another text file
with open(base_path + output_dir + output_filename, "w", encoding="utf-8") as o:
# eliminar numeradcion de paginas
# por el momento no las elimino para facilitar la identifiación de la página
#text = re.sub(r"\n\d+\n", "", text)
#text = re.sub(r"\n\d+\n", "- page: ", text)
# elimino los fin de página
text = re.sub(r"\f", "", text)
text = text.replace("•","")
# eliminar espacios antes y despues de saltos de linea
text = re.sub(r" \n", "\n", text)
text = re.sub(r"\n ", "\n", text)
# eliminar multiples saltos de linea
text = re.sub(r"\n+", r"\n", text)
# sustituyo \n por espacio en caso de frases sin terminar (saltos de linea entre palabras con letras minusculas)
for match in re.finditer(r"[a-z|,]\n[a-z|ú]", text):
index = match.start()
text = text[:index + 1] + " " + text[index + 2:]
# transformación final
text = text.split("\n")
res = {}
res['id'] = str(uuid.uuid4())
res['version'] = 2
res['type'] = ""
res['parties'] = ""
res['election_type'] = ""
res['election_date'] = ""
res['publication_date'] = ""
res['url'] = ""
res['web_history'] = ""
res['title'] = ""
res['description'] = ""
res['content'] = []
for string in text:
dict = {'declaration': {'id': str(uuid.uuid4()), 'page': "", 'text': string}}
res['content'].append(dict)
yaml.safe_dump(res, o, allow_unicode=True, width=float("inf"), sort_keys=False)