-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathread_nyt.py
110 lines (84 loc) · 3.23 KB
/
read_nyt.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
import os
import numpy as np
from nltk.tokenize import word_tokenize
from tqdm import tqdm
import os
import tarfile
import zipfile
def extract_file(path, to_directory='.'):
    """Extract the archive at *path* into *to_directory*.

    The archive type is chosen from the file name: ``.zip`` is opened with
    ``zipfile.ZipFile``; ``.tar.gz``/``.tgz`` and ``.tar.bz2``/``.tbz`` are
    opened with ``tarfile.open``. A file with any other name is reported on
    stdout and skipped.

    Args:
        path: Path of the archive file. A relative path is resolved against
            the current working directory.
        to_directory: Directory that receives the extracted members.

    Returns:
        None.
    """
    if path.endswith('.zip'):
        opener, mode = zipfile.ZipFile, 'r'
    elif path.endswith(('.tar.gz', '.tgz')):
        opener, mode = tarfile.open, 'r:gz'
    elif path.endswith(('.tar.bz2', '.tbz')):
        opener, mode = tarfile.open, 'r:bz2'
    else:
        print("Could not extract, as no appropriate extractor is found: " + path )
        # Nothing can open this file: stop here instead of falling through
        # to the extraction code with `opener` unbound.
        return

    # Hand the destination to extractall() rather than chdir()-ing into it:
    # the process-wide working directory stays untouched and a relative
    # `path` keeps pointing at the archive.
    # NOTE(review): members are extracted without any path validation here;
    # only use this on archives from a trusted source.
    with opener(path, mode) as archive:
        archive.extractall(to_directory)
from nltk.tokenize import word_tokenize
from xml.dom import minidom
def tokenlize(text):
    """Split *text* into tokens with NLTK's ``word_tokenize``.

    The text is passed through unchanged; in particular no lowercasing is
    applied before tokenizing.

    Args:
        text: The string to tokenize.

    Returns:
        Whatever ``word_tokenize`` returns for *text* (the sequence of tokens).
    """
    tokens = word_tokenize(text)
    return tokens
def read_xml(filename="1815777.xml"):
    """Return the tokenized paragraph text of the last ``<block>`` in an XML file.

    The file is parsed with ``xml.dom.minidom``. For every ``<p>`` element
    inside the last ``<block>`` element, the data of its first child node is
    collected; the pieces are joined with spaces, line breaks are replaced by
    spaces, and the result is tokenized with ``tokenlize``.

    Args:
        filename: Path of the XML document to read.

    Returns:
        The tokens joined by single spaces, or ``None`` when anything after
        parsing fails (for example no ``<block>`` element, or a ``<p>``
        without a text first child). The exception is printed in that case.
        Errors raised by ``minidom.parse`` itself are not caught here.
    """
    blocks = minidom.parse(filename).getElementsByTagName("block")
    try:
        paragraphs = blocks[-1].getElementsByTagName("p")
        pieces = []
        for paragraph in paragraphs:
            pieces.append(paragraph.firstChild.data)
        flat = " ".join(pieces).replace("\n", " ").replace("\r", " ")
        tokens = tokenlize(flat)
        return " ".join(tokens)
    except Exception as err:
        # Best effort: report the problem and let the caller skip this file.
        print(err)
        return None
#. /nfsd/quartz/benyou/codes/wordwave
from pathlib import Path
# Root directory listed by unzip(); each entry is handled as one year's folder of archives.
demo_path = "/nfsd/quartz/datasets/nyt/data/"
# Directory unzip() extracts into and convert_raw() reads from (one sub-directory per year).
new_path = "nyt"
def unzip():
    """Extract every archive under ``demo_path`` into a mirror tree under ``new_path``.

    Each sub-directory of ``demo_path`` is treated as one year. A directory
    of the same name is created under ``new_path`` and every regular file
    found in the year directory is handed to ``extract_file`` with that
    directory as the destination. Progress is printed to stdout.

    Returns:
        None.
    """
    # makedirs(exist_ok=True) replaces the exists()/mkdir() pair: no race
    # between the check and the creation, and missing parents are created.
    os.makedirs(new_path, exist_ok=True)
    for year in os.listdir(demo_path):
        path = os.path.join(demo_path, year)
        if not os.path.isdir(path):
            # A stray file next to the year directories cannot be listed;
            # skip it instead of crashing in os.listdir() below.
            continue
        print("processing year: "+year)
        new_path_year = os.path.join(new_path, year)
        os.makedirs(new_path_year, exist_ok=True)
        for filename in os.listdir(path):
            zipped_file = os.path.join(path, filename)
            print("unzip {} to {} ".format(zipped_file,new_path_year))
            if os.path.isfile(zipped_file):
                extract_file(zipped_file, new_path_year)
def convert_raw():
    """Write the tokenized text of every extracted article to ``nyt.txt``.

    For each year from 1987 through 2007, every directory directly under
    ``new_path/<year>`` is searched recursively for ``*.xml`` files. Each
    file is converted with ``read_xml``; when that yields text, one line of
    the form ``"<year> <text>"`` is appended to the output. Files for which
    ``read_xml`` returns ``None`` are skipped. The output file is created
    (or overwritten) in the current working directory, UTF-8 encoded.

    Returns:
        None.
    """
    with open("nyt.txt", "w", encoding="utf-8") as out:
        for year_number in range(1987, 2008):
            year = str(year_number)
            year_dir = os.path.join(new_path, year)
            for entry in os.listdir(year_dir):
                unzipped = os.path.join(year_dir, entry)
                print("read unzipped path: " + unzipped)
                if not os.path.isdir(unzipped):
                    continue
                # Snapshot the matches before reading any of them.
                xml_files = list(Path(unzipped).glob("**/*.xml"))
                for xml_file in xml_files:
                    text = read_xml(xml_file.as_posix())
                    if text is None:
                        continue
                    out.write("{} {}\n".format(year, text))
if __name__ == "__main__":
    # Only the conversion step runs by default. The extraction step
    # (unzip()) and the single-file check (read_xml()) are left disabled;
    # convert_raw() reads from the tree that unzip() produces.
    convert_raw()