-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathutilities.py
154 lines (107 loc) · 5.63 KB
/
utilities.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
from glob import glob
from pathlib import Path
from re import sub
from typing import List, Union
import rich
import typer
from rich.progress import Progress, SpinnerColumn
from bs4 import BeautifulSoup
from bs4.element import Tag
from file_writing import write_to_file
def get_texts(directory: str = "../reuters21578", article_count: Union[str, int] = 'all') -> List[Tag]:
"""
Get any number of articles in the corpus and return a list of them
:param directory: Optionally specify where the reuters corpus is
:param article_count: How many articles to retrieve. "all" or a number >= 1
:return: A List of articles as `bs4.element.Tag` objects
"""
# Create a list of file names in the required Reuters corpus
CORPUS_FILES: List[Path] = [Path(p) for p in glob(f"{directory}/*.sgm")]
rich.print("\nFound files:\n", [f"{f}" for f in CORPUS_FILES])
# This will contain all the newspaper articles in the Reuters corpus
all_articles: List[Tag] = []
print()
# Create a spinner to show that the app isn't frozen
spinner = Progress(SpinnerColumn(), '[progress.description]{task.description}', SpinnerColumn(), transient=True)
with spinner:
_ = spinner.add_task("Creating list of articles...")
# Loop through each file
for file in CORPUS_FILES:
# Read the file contents
with open(file, 'r') as f:
contents = BeautifulSoup(f, features="html.parser")
# Get all the articles for the file
articles = contents('text')
# Add each article to the list
all_articles.extend(articles)
# Stop adding to the list of articles once the desired number of articles is reached
if article_count.isnumeric() and len(all_articles) >= int(article_count):
break
# If there is a definite article count provided, ensure article list only contains that many
if article_count.isnumeric() and len(all_articles) > int(article_count):
all_articles = all_articles[: int(article_count)]
# Print a message if the user requested more articles than exist
if article_count.isnumeric() and int(article_count) > len(all_articles):
rich.print(f"You asked for [green bold]{article_count}[/] articles,"
f" but only [green bold]{len(all_articles)}[/] found\n")
# In all other cases, just print the number of articles
else:
rich.print(f"Number of articles found: [bold green]{len(all_articles)}[/]\n")
return all_articles
def textualize(article: Tag, article_num: int) -> str:
"""
Take the given article (of type bs4.element.Tag) and turn it into normal text, as usable throughout the pipeline
:param article: The article as a bs4.element.Tag
:param article_num: Which article this is
:return: The 'stringified' version of the article
"""
# Bring all children together into 1 string
text = '\n'.join(child.text for child in article.children)
# Make sure all newline characters have a space after, to prevent future tokenization errors
# as found in experiment
text = text.replace('\n', '\n ')
# Remove all instances of multiple - in a row
text = sub(r'-{2,}', ' ', text)
# Remove - characters without letters surrounding them. Keeps "once-in-a-lifetime", but not " - "
text = sub(r'(?![A-Za-z])-(?![A-Za-z])', ' ', text)
# Remove certain unicode control characters, as found in experiment
text = sub(r'\x03|\x02', '', text)
# Remove all punctuation except periods and commas. Handle those separately for money values
text = sub(r"[()<>!:;?\"]+", ' ', text)
# Remove all instances of multiple periods in a row
text = sub(r"\.{2,}", ' ', text)
# Simplify acronyms to their constituent letters. i.e. changes "U.S." to "US"
text = sub(r"(?<!\w)([A-Za-z])\.", r'\1', text)
# Remove all periods that aren't surrounded by a number. i.e. keeps "1.1" and "1."
text = sub(r"(?!\d)\.(?!\d)", ' ', text)
# Remove all commas that aren't surrounded by a number. i.e. keeps "1,000"
text = sub(r"(?!\d),(?!\d)", ' ', text)
# Remove all apostrophes surrounded by letters. In other words, replace all "it's" with "its", etc.
text = sub(r"(?<=[A-Za-z])'(?=[A-Za-z])", '', text)
# Remove all apostrophes remaining. (Needed to do this separately, because we needed to replace contraction
# apostrophes with nothing. We will replace all other apostrophes with a space
text = sub(r"'", ' ', text)
# Remove all slashes, but not with numbers. For instance, removes / in "March/April", but not "1998/99".
# I will treat numbers with slashes in them as 1 concept
text = sub(r"(?!\d)/(?!\d)", ' ', text)
file_print = Path(f'output/article{article_num}/1. Initial-text.txt')
# Don't print for any articles beyond 5
if article_num <= 5:
rich.print(f"\twriting to file \"{file_print}\"")
# Write this semi-cleaned text to file
write_to_file(Path(f"article{article_num}/1. Initial-text.txt"), text)
return text
def count_callback(count: str) -> str:
"""
A callback function for the Pipeline Typer app, to validate user input.
The value must either be "all" or a number >= 1.
:param count: The value the user entered for the number of articles they want printed
:return: The value, if it was valid
"""
if not count.isnumeric() and count.lower() != 'all':
rich.print("\n[red bold]Only \"all\" or a number are permitted[/]\n")
raise typer.Exit(1)
if count.isnumeric() and int(count) < 1:
rich.print("\n[red bold]Only integers >= 1 are permitted[/]\n")
raise typer.Exit(1)
return count