# pdf_filter.py
"""aggressive-ish pdf cleaning script for language models.
Based off: https://gist.github.com/leogao2/8d4662dfb8e58e8c58ef94df5d46413d by Leo Gao"""
import os
import re
import fix_unicode

lone_accent_dict = {"a´": "á", "e´": "é", "i´": "í", "o´": "ó", "u´": "ú",
                    "a¨": "ä", "e¨": "ë", "i¨": "ï", "o¨": "ö", "u¨": "ü",
                    "a^": "â", "e^": "ê", "i^": "î", "o^": "ô", "u^": "û",
                    "a`": "à", "e`": "è", "i`": "ì", "o`": "ò", "u`": "ù",
                    "a~": "ã", "o~": "õ", "n~": "ñ"}
lone_accent_dict.update({k.upper(): v.upper() for k, v in lone_accent_dict.items()})


def ditch_combining_diacritics(text):
    """replace letter-then-accent artifacts from bad decoding (e.g. "a´" -> "á")
    and strip any stray combining diacritics / spacing modifier letters."""
    for orig, repl in lone_accent_dict.items():
        text = text.replace(orig, repl)
    # strip combining diacritical marks (U+0300-U+036F)
    text = re.sub(r'[\u0300-\u036F]', '', text)
    # strip the diaeresis sign and spacing modifier letters (U+02C0-U+02DF)
    return re.sub(r'(?:\xa8|[\u02C0-\u02DF])', '', text)
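
# Illustrative sanity checks (assumed inputs, not from the original repo):
#   ditch_combining_diacritics("re´sume´")  -> "résumé"
#   ditch_combining_diacritics("man~ana")   -> "mañana"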


def listdir(x):
    # list full paths of all files in directory x
    return [x + '/' + fn for fn in os.listdir(x)]


def id(x):
    # identity function (shadows the builtin id; used as the predicate in nonzero)
    return x


def average_word_length(text):
    """
    get average word length of a given text
    :param text: string
    :return: float of avg word length
    """
    n_words = len(text.split())
    n_chars = len(text)
    # +1 guards against division by zero on empty text
    avgw = n_chars / (n_words + 1)
    return avgw


def mean(x):
    x = list(x)
    if not x:
        return 0
    return sum(x) / len(x)


def nonzero(x):
    # drop falsy items (e.g. empty strings, zero lengths)
    return filter(id, x)


def is_letter(x):
    return x in "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz"


def is_date(x):
    # treat any four-digit number in the range 1000-3999 as a plausible year
    return re.match(r'.*([1-3][0-9]{3})', x) is not None


def header_footer_filter(para):
    """if para is short & begins with ©, r {date}, or copyright {date}, remove it"""
    try:
        if len(para) < 50:
            if para.strip()[0] == "©":
                return ""
            elif para.strip()[0] == "r":
                # "r" followed by a year is likely a mis-decoded ©/® symbol
                para_words = para.strip().split(" ")
                if len(para_words) >= 2:
                    second_word = para_words[1]
                    if is_date(second_word):
                        return ""
            elif para.strip().split(" ")[0] == "copyright":
                second_word = para.strip().split(" ")[1]
                if is_date(second_word):
                    return ""
    except IndexError:
        # empty / whitespace-only paras have no first character or second word
        return para
    return para
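
# Illustrative behaviour (assumed inputs, not from the original repo):
#   header_footer_filter("© 2004 Elsevier B.V.")        -> ""
#   header_footer_filter("copyright 1998 ACM Press")     -> ""
#   header_footer_filter("An ordinary sentence stays.")  -> unchanged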


def all_equal(x):
    return all(n == x[0] for n in x)


def replace_hyphenated(text):
    # rejoin words hyphenated across line breaks, moving the break after the word
    text = re.sub(r'-[?\s]\n{1,2}(\w+ *)', r'\1\n', text)
    # rejoin words split by a hyphen plus one or two spaces
    return re.sub(r'-\s{1,2}(\w+ *)', r'\1', text)
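
# Illustrative (assumed inputs): "encyclo- \npedia entry" -> "encyclopedia \nentry"
# and "exam- ple" -> "example".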


def remove_leading_and_trailing_nums(text):
    # remove leading and trailing numbers (i.e. page nums)
    text = text.strip()
    text = re.sub(r'^(\d+)', '', text)
    text = re.sub(r'(\d+)$', '', text)
    return text.strip()


def cid_percentage(text):
    """
    detects the amount of cid numbers (an artefact from missing custom fonts) found in a converted pdf.
    Example:
    "which maintained contacts not least in the South East Asian extreme right. To some extent during the
    (cid:38)(cid:82)(cid:79)(cid:71)(cid:3) (cid:58)(cid:68)(cid:85)(cid:15)"
    :param text: string
    :return: float between 0 and 1 representing density of cid nos in string
    """
    n_matches = len(re.findall(r'\(cid:[0-9]+\)', text))
    if text:
        # each "(cid:NN)" token is roughly 8 characters long
        return (n_matches * 8) / len(text)
    else:
        return 0.


def remove_cid(text):
    return re.sub(r'\(cid:[0-9]+\)', '', text)
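
# Illustrative (assumed input):
#   remove_cid("during the (cid:38)(cid:82)(cid:79)(cid:71) war") -> "during the  war"
# (the leftover double space is later collapsed by filter_double_whitespace)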


def filter_double_whitespace(text):
    return re.sub(r"\s\s+", " ", text)


def filter_newlines(text):
    return re.sub("\n", " ", text)


def pdf_filter(text, fn):
    cid_perc = cid_percentage(text)
    # if cid_perc is larger than threshold, it's probably a latex / alt font heavy document. delete the whole thing.
    if cid_perc > .03:
        print('ERROR: too many font errors - skipping {}.'.format(fn))
        return ""
    # if mean line len is too short, it's probably garbled, not useful, or overly latex-y
    whole_doc_mean_line_len = mean(nonzero(map(len, text.split('\n'))))
    if whole_doc_mean_line_len < 15:
        print('ERROR: mean line length too short - skipping {}.'.format(fn))
        return ""
    word_length = average_word_length(text)
    # if average word length is too big or small, the document is not worth keeping
    if word_length > 45:
        print('ERROR: avg word length too large - skipping {}.'.format(fn))
        return ""
    elif word_length < 2:
        print('ERROR: avg word length too short - skipping {}.'.format(fn))
        return ""
    # rejoin words hyphenated across line and paragraph breaks
    text = replace_hyphenated(text)
    paras = text.split('\n\n')
    out = []
    for para in paras:
        # filter out newlines in the middle of paragraphs,
        # remove headers / footers,
        # and collapse double whitespace
        para = filter_newlines(para)
        para = header_footer_filter(para)
        para = filter_double_whitespace(para)
        # if mean line len is too short, it's probably garbled or not useful
        mean_line_len = mean(nonzero(map(len, para.split('\n'))))
        if mean_line_len < 2.:
            continue
        # if cid percentage is higher than 10%, it's likely a latex-heavy para that
        # won't make sense without the missing glyphs - drop the whole para
        if cid_percentage(para) > .1:
            continue
        # not enough letters (i.e. math, tables, etc.)
        letterness = mean(map(is_letter, para))
        if letterness < 0.40:
            continue
        # final cleaning steps:
        # remove leading and trailing numbers (usually page nos)
        # remove any remaining cid strings
        # fix any unicode / ligature related errors
        # combine letter -> accent strings from bad decoding into accented letters, e.g. "a´" -> "á"
        para = ditch_combining_diacritics(fix_unicode.fix_unicode(remove_cid(remove_leading_and_trailing_nums(para))))
        if para != '':
            # only append if not empty
            out.append(para)
    # drop any empty strings that slipped through
    out = [p for p in out if p]
    return '\n\n'.join(out)
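

# Minimal usage sketch (an assumption about how the script is driven - the original
# repo may call pdf_filter differently). "extracted" and "filtered" are hypothetical
# directory names used only for illustration.
if __name__ == "__main__":
    in_dir = "extracted"   # hypothetical: dir of .txt files extracted from pdfs
    out_dir = "filtered"   # hypothetical: dir to write cleaned files into
    os.makedirs(out_dir, exist_ok=True)
    for path in listdir(in_dir):
        with open(path, "r", encoding="utf-8", errors="ignore") as f:
            raw = f.read()
        cleaned = pdf_filter(raw, path)
        if cleaned:
            with open(os.path.join(out_dir, os.path.basename(path)), "w", encoding="utf-8") as f:
                f.write(cleaned)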