-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathpreprocess_texts.py
65 lines (52 loc) · 2.14 KB
/
preprocess_texts.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
# Preprocess poems and concatenate all collections to a single file
import re
# Load and preprocess texts
# See regex expressions, e.g., here: https://www.rexegg.com/regex-quickstart.html
def load_text(path, encoding=None):
    """Read the text file at *path* and return its cleaned-up contents.

    The cleaning steps run in a fixed order: source-specific residue is
    removed first, then the text is lowercased, digits and tabs are
    dropped, and finally blank lines, spaces and underscores are
    normalised.

    Args:
        path: Path of the text file to read.
        encoding: Optional text encoding passed to ``open``. The default
            ``None`` keeps the platform default encoding, which is what
            this function used before the parameter existed.

    Returns:
        The preprocessed text as a single string.
    """
    # Use a context manager so the file handle is closed deterministically,
    # even if reading fails part-way through.
    with open(path, "r", encoding=encoding) as source:
        text = source.read()

    # Remove spurious words (from Cervantes Virtual). "ArribaAbajo" has to go
    # first; otherwise the shorter markers could split it.
    for marker in ("ArribaAbajo", "Abajo ", "Arriba "):
        text = text.replace(marker, "")

    # Regex patterns are raw strings so backslashes reach the regex engine
    # untouched and do not trigger invalid-escape warnings.
    text = re.sub(r"\[\d+\]", "", text)    # Remove notes such as "[12]" (from Project Gutenberg)
    text = re.sub(r"\^\{.*\}", "", text)   # Remove "^{...}" groups; greedy, but "." stops at a newline
    text = re.sub(r"\w\^\w", "", text)     # Remove "x^y" triples, including both neighbouring characters

    text = text.lower()                    # Set to lowercase

    # Digits preceded by a space collapse to that single space; any digits
    # left over are removed outright.
    text = re.sub(r" \d+", " ", text)
    text = re.sub(r"\d+", "", text)

    text = text.replace("\t", "")          # Remove tabs
    text = re.sub(r"- [a-zA-Z]* -", "", text)  # Remove numeration of poems
    text = re.sub(r"\n\n+", "\n\n", text)  # Collapse runs of blank lines to a single blank line

    # NOTE(review): without re.MULTILINE, "^" only matches at the very start
    # of the string, so this pattern can never match after two whitespace
    # characters. It is kept unchanged to preserve behaviour - confirm the
    # intended pattern.
    text = re.sub(r"\s\s^\n+", "", text)

    # NOTE(review): as written this strips every space character, which makes
    # the following "\n " replacement a no-op. Confirm whether only runs of
    # multiple spaces were meant to be removed.
    text = text.replace(" ", "")
    text = text.replace("\n ", "\n")       # Remove a space at the start of a line

    text = text.replace("_", "")           # Remove underscores
    return text
# Compile all texts by an author and preprocess them
def preprocess(author):
    """Preprocess every collection of *author* and write them to one file.

    Each collection is read from ``data/<author>/<name>.txt`` through
    ``load_text``. The cleaned texts are concatenated without a separator
    and written to ``data/<author>/processed.txt``.

    Args:
        author: One of "Lope", "Verdaguer" or "Donne".

    Returns:
        None. For an author that is not in the table, "Author not included."
        is printed and nothing is read or written.
    """
    # Collections available for each supported author (file names without
    # the ".txt" extension).
    collections_by_author = {
        "Lope": [
            "Rimas",
            "Otros_Sonetos",
            "Rimas_Humanas_y_divinas",
            "Rimas_Sacras",
            "Sonetos_en_comedias_autos_y_entremeses",
            "Sonetos_en_libros",
        ],
        "Verdaguer": [
            "L'Atlantida",
            "Canigo",
            "Cansons_de_Montserrat",
            "Oda_a_Barcelona",
        ],
        "Donne": [
            "Poems_Donne_1",
            "No_man_is_an_island",
        ],
    }

    if author not in collections_by_author:
        # Stop here: previously execution continued past this message and
        # crashed because no file list had been assigned.
        print("Author not included.")
        return

    filepath = "data/" + author + "/"

    # Load and clean every collection, keeping the order of the table.
    texts = [
        load_text(filepath + filename + ".txt")
        for filename in collections_by_author[author]
    ]
    totaltext = "".join(texts)

    # The context manager closes the output file even if the write fails.
    with open(filepath + "processed.txt", "w") as newfile:
        newfile.write(totaltext)
# Run the preprocessing for each author, one after another
for author in ("Lope", "Verdaguer", "Donne"):
    preprocess(author)