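"""pdf_word_extraction.py: count word frequencies across a directory of PDFs.

Pipeline: collect the .pdf files, extract their text with pypdf, repair the
text with ftfy, tokenize and filter it with spaCy, then aggregate the counts,
apply manual removals/replacements, and report the most frequent words.
"""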

import os
import re
from collections import defaultdict

import ftfy
import spacy
from pypdf import PdfReader


def get_pdf_files(directory):
    """Return the paths of all .pdf files directly inside `directory`."""
    return [
        os.path.join(directory, f) for f in os.listdir(directory) if f.endswith(".pdf")
    ]
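
# Equivalent directory scan with pathlib (sketch of an alternative, not used
# by the pipeline below):
#
#     from pathlib import Path
#     pdf_files = [str(p) for p in Path(directory).glob("*.pdf")]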


def read_pdf(file_path):
    """Extract and concatenate the text of every page in a PDF."""
    with open(file_path, "rb") as file:
        pdf = PdfReader(file)
        # `or ""` guards against pages with no extractable text
        return "".join(page.extract_text() or "" for page in pdf.pages)
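
# Some PDFs are encrypted, which makes page extraction raise. A hedged
# variant (sketch only; the name `read_pdf_tolerant` is hypothetical, and
# AES-encrypted files additionally require the `cryptography` package):
def read_pdf_tolerant(file_path):
    with open(file_path, "rb") as file:
        pdf = PdfReader(file)
        if pdf.is_encrypted:
            # Many nominally encrypted PDFs use an empty user password
            pdf.decrypt("")
        return "".join(page.extract_text() or "" for page in pdf.pages)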


# Load the spaCy model once at import time; reloading it inside every call
# would be slow when many PDFs are processed.
nlp = spacy.load("en_core_web_sm")


def process_text(text):
    """Tokenize `text` and return a word -> frequency mapping, skipping noise."""
    # Fix unicode issues, including ligatures
    text = ftfy.fix_text(text)
    doc = nlp(text)
    word_freq = defaultdict(int)
    skip_next = False
    for i, token in enumerate(doc):
        # Skip the 'al' that follows an 'et' flagged on the previous iteration
        if skip_next:
            skip_next = False
            continue
        # Skip stop words, punctuation, whitespace, and single characters
        if token.is_stop or token.is_punct or token.is_space or len(token.text) == 1:
            continue
        # Skip integers, floats, and tokens that look like author initials ("J.")
        if re.match(r"^\d+\.?\d*$", token.text) or (
            token.text.endswith(".") and token.text[:-1].isupper()
        ):
            continue
        # Regularize the case but preserve proper nouns
        word = token.text if token.tag_ == "NNP" else token.text.lower()
        # Detect 'et al.' and skip both the 'et' and the following 'al'
        if word == "et" and i < len(doc) - 1 and doc[i + 1].text.lower() == "al":
            skip_next = True
            continue
        word_freq[word] += 1
    return word_freq
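
# Note: spaCy raises a ValueError for inputs longer than nlp.max_length
# (1,000,000 characters by default). If a very large PDF trips this limit,
# one option, assuming enough memory, is to raise it before calling nlp():
#
#     nlp.max_length = 2_000_000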


def process_pdfs(directory):
    """Aggregate word frequencies across every PDF in `directory`."""
    pdf_files = get_pdf_files(directory)
    total_word_freq = defaultdict(int)
    for pdf_file in pdf_files:
        text = read_pdf(pdf_file)
        word_freq = process_text(text)
        for word, freq in word_freq.items():
            total_word_freq[word] += freq
    return total_word_freq
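
# The merge loop above can also be written with collections.Counter, which
# adds counts on update(). A sketch of an alternative (the name
# `process_pdfs_counter` is hypothetical; nothing below calls it):
from collections import Counter

def process_pdfs_counter(directory):
    total = Counter()
    for pdf_file in get_pdf_files(directory):
        # Counter.update() with a mapping adds counts instead of replacing them
        total.update(process_text(read_pdf(pdf_file)))
    return total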


# Tools for removing or replacing words manually
def remove_specific_words(word_freq, words_to_remove):
    """Remove specific words from the word frequency count."""
    for word in words_to_remove:
        # The second argument to pop() is a default, so a missing word is a no-op
        word_freq.pop(word, None)
    return word_freq


def replace_specific_words(word_freq, replacements):
    """Replace specific words in the word frequency count.

    `replacements` should be a dict mapping old words to new words.
    """
    for old_word, new_word in replacements.items():
        if old_word in word_freq:
            # If the new word is already counted, merge in the old word's count
            if new_word in word_freq:
                word_freq[new_word] += word_freq[old_word]
            else:
                word_freq[new_word] = word_freq[old_word]
            # Remove the old word from the dictionary
            del word_freq[old_word]
    return word_freq


# Directory containing the PDF files
directory = "pdf/"
word_freq = process_pdfs(directory)
# Customize the words to remove
words_to_remove = [
"placeholder_word_to_remove1",
"placeholder_word_to_remove2",
"placeholder_word_to_remove3"
]
word_freq = remove_specific_words(word_freq, words_to_remove)
# Customize the words to replace
replacements = {
"placeholder_word1": "replacement_word1",
"placeholder_word2": "replacement_word2",
}
word_freq = replace_specific_words(word_freq, replacements)

# Print the most frequent words
top_words = sorted(word_freq.items(), key=lambda x: x[1], reverse=True)[:250]
for word, freq in top_words:
    print(f"{word}: {freq}")


def write_to_file(word_freq, filename, top_n=250):
    """Write the top N words to a file, each repeated once per occurrence.

    The repeated-word format is convenient as raw-text input for word-cloud
    generators.
    """
    with open(filename, "w") as f:
        sorted_words = sorted(word_freq.items(), key=lambda x: x[1], reverse=True)
        for word, freq in sorted_words[:top_n]:
            f.write((word + " ") * freq)
            f.write("\n")  # one word group per line


# Write to txt after processing and customization
write_to_file(word_freq, "top_words.txt")
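
# Downstream idea (sketch only; assumes the third-party `wordcloud` package,
# which this script does not depend on):
#
#     from wordcloud import WordCloud
#     text = open("top_words.txt").read()
#     WordCloud(width=800, height=400).generate(text).to_file("word_cloud.png")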