-
Notifications
You must be signed in to change notification settings - Fork 0
/
sweetparser.py
205 lines (157 loc) · 7.11 KB
/
sweetparser.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
import json
from utility import get_all_files_in_directory
from xml.etree.ElementTree import parse, fromstring
import argparse
import yaoner
import os
from tika import parser
import csv
__author__ = 'Frank'
def parse_owl_file(filename):
    """Parse one OWL (RDF/XML) file into a concept -> parents mapping.

    Scans each direct child of the RDF root whose tag ends with 'Class',
    reads its 'about' attribute (expected to look like '#ConceptName'),
    and collects the fragment names of its 'subClassOf' parents.

    :param filename: path to an .owl file
    :return: dict mapping concept name to a list of parent concept names
    """
    root = parse(filename).getroot()
    concept_dictionary = dict()
    for child in root:
        # Tags are namespace-qualified (e.g. '{...owl#}Class'); match by suffix.
        if not child.tag.endswith('Class'):
            continue
        concept = ''
        for attr, value in child.attrib.items():
            if attr.endswith('about') and value.startswith('#'):
                concept = value[1:]
        if not concept:
            # BUG FIX: anonymous classes (no '#...' about attribute) previously
            # all landed on the empty-string key, clobbering one another.
            continue
        subclass_of = []
        for sub_child in child:
            if not sub_child.tag.endswith('subClassOf'):
                continue
            for attr in sub_child.attrib:
                if attr.endswith('resource'):
                    # The resource looks like 'uri#Parent'; keep the fragment.
                    # (str.split always yields a non-empty list, so the old
                    # None/length guard was redundant.)
                    subclass_of.append(sub_child.attrib[attr].split('#')[-1])
        concept_dictionary[concept] = subclass_of
    return concept_dictionary
def parse_owl_directory(path):
    """Parse every .owl file under *path* and write two JSON summaries.

    Writes 'sweet_concepts.json' (concept -> list of parents) and
    'sweet_concept_categories.json' (parent -> {child: 1}) to the
    current working directory.

    :param path: directory to scan recursively for .owl files
    """
    ontologies = dict()
    for owl_file in get_all_files_in_directory(path, suffix='.owl'):
        print(owl_file)  # progress trace, one line per ontology file
        ontologies.update(parse_owl_file(owl_file))
    dump(ontologies, 'sweet_concepts.json')
    dump(transform_to_categories(ontologies), 'sweet_concept_categories.json')
    return
def transform_to_categories(dictionary):
    """Invert a child -> [parents] mapping into parent -> {child: 1}.

    :param dictionary: dict mapping each concept to a list of parent names
    :return: dict mapping each parent to a dict of its children (value 1)
    """
    categories = dict()
    for child, parents in dictionary.items():
        for parent in parents:
            categories.setdefault(parent, dict())[child] = 1
    return categories
def dump(dictionary, output):
    """Serialize *dictionary* as JSON to the file *output*.

    Creates the parent directory first if it does not exist yet.

    :param dictionary: JSON-serializable mapping to persist
    :param output: destination file path
    """
    parent_dir = os.path.abspath(os.path.join(output, os.pardir))
    if not os.path.exists(parent_dir):
        os.makedirs(parent_dir)
    with open(output, 'w') as output_file:
        output_file.write(json.dumps(dictionary))
    return
def intersect(json_filename, output_name, index_file, start_index=0, end_index=yaoner.MAX_INT_VALUE):
    """Scan raw dump files for occurrences of SWEET concept names.

    Loads concept names from *json_filename*, lowercases them, then runs
    Tika over every file listed in *index_file* between the two indices,
    recording which files mention which concepts. The result is dumped to
    '<output_name>from<start>to<end>.json'.

    :param json_filename: JSON file whose keys are concept names
    :param output_name: prefix for the output JSON file
    :param index_file: list of relative file paths (falls back to a
        hard-coded default when None)
    :param start_index: first index to process (inclusive)
    :param end_index: last index to process
    """
    base_directory = '/Users/Frank/Desktop/fulldump/raw-dataset/'
    if index_file is None:
        index_file = '/Users/Frank/PycharmProjects/599assignment1/geo-topic-parser-folder/geo-topic-all-files.txt'
    with open(json_filename) as json_file:
        json_data = json.load(json_file)
    # Lowercased concept names; each maps to the set of files mentioning it.
    concept_dictionary = {concept.lower(): {} for concept in json_data.keys()}
    file_list = yaoner.read_index_file(index_file, base_directory, start_index, end_index)
    for idx, relative_path in enumerate(file_list):
        print(start_index + idx)  # progress trace: absolute index being processed
        parsed = parser.from_file(''.join([base_directory, relative_path]))
        if 'content' not in parsed or parsed['content'] is None:
            continue
        for token in parsed['content'].split():
            lowered = token.lower()
            if lowered in concept_dictionary:
                concept_dictionary[lowered][os.path.basename(relative_path)] = 1
    dump(concept_dictionary, output_name + 'from' + str(start_index) + 'to' + str(end_index) + '.json')
    return
def reverse(dir_list, output_name):
    """Invert concept -> {filename: 1} JSON maps into filename -> {concept: 1}.

    Reads every JSON file found under the directories in *dir_list*, merges
    them into one concept map, inverts it, and writes the result as JSON
    to *output_name*.

    :param dir_list: iterable of directory paths containing JSON parts
    :param output_name: path of the merged, inverted JSON output
    """
    with open(output_name, 'w') as output_file:
        files = []
        for entry in dir_list:
            files.extend(get_all_files_in_directory(entry))
        json_data = {}
        for entry in files:
            with open(entry) as input_json:
                json_data.update(json.load(input_json))
        print(len(json_data))  # number of distinct concepts merged
        filename_sweet_dictionary = {}
        for concept in json_data:
            for filename in json_data[concept]:
                filename_sweet_dictionary.setdefault(filename, {})[concept] = 1
        print(len(filename_sweet_dictionary))  # number of distinct files
        # BUG FIX: this write was commented out, so the function opened the
        # output for 'w' (truncating it) and then wrote nothing at all.
        output_file.write(json.dumps(filename_sweet_dictionary))
    return
def transform_json_to_tsv(json_name, output_file_name):
    """Convert a filename -> {concept: 1} JSON map into a 3-column TSV.

    Columns: filename, sweetNumber (count of concepts for that file),
    sweetList (JSON array of the concept names).

    :param json_name: path to the input JSON file
    :param output_file_name: path of the TSV file to write
    """
    # BUG FIX: the csv module requires newline='' on the output file;
    # without it every row is followed by a blank line on Windows.
    with open(json_name) as json_file, open(output_file_name, 'w', newline='') as output_file:
        csv_writer = csv.writer(output_file, delimiter='\t')
        csv_writer.writerow(['filename', 'sweetNumber', 'sweetList'])
        json_data = json.load(json_file)
        rows = []
        for filename, concepts in json_data.items():
            sweet_list = [str(concept) for concept in concepts]
            rows.append([filename, len(sweet_list), json.dumps(sweet_list)])
        csv_writer.writerows(rows)
    return
# transform_json_to_tsv('/Users/Frank/working-directory/filename-sweet/filename-sweet.json', '/Users/Frank/working-directory/filename-sweet/sweet.tsv')
def count_concepts_in_json(json_name, output_name):
    """Tally how many files each concept appears in.

    Reads a filename -> {concept: 1} JSON map and writes a
    concept -> count JSON object (indented) to *output_name*.

    :param json_name: path to the input JSON file
    :param output_name: path of the JSON tally file to write
    """
    with open(json_name) as json_file, open(output_name, 'w') as output_json_file:
        json_data = json.load(json_file)
        tallies = dict()
        for filename in json_data:
            for concept in json_data[filename]:
                tallies[concept] = tallies.get(concept, 0) + 1
        output_json_file.write(json.dumps(tallies, indent=4))
    return
# BUG FIX: this debugging call ran on every import and crashed on any machine
# where the hard-coded path does not exist; disabled like the similar
# commented-out call for transform_json_to_tsv above.
# count_concepts_in_json('/Users/Frank/working-directory/filename-sweet/filename-sweet.json', '/Users/Frank/working-directory/filename-sweet/1.json')
def main():
    """Command-line entry point: dispatch on --mode (parse_owl, intersect, reverse)."""
    arg_parser = argparse.ArgumentParser('Yao EXIF tool')
    arg_parser.add_argument('--mode', required=True, help='parse_owl, intersect, reverse')
    arg_parser.add_argument('--index', type=str, help='the index file containing all the paths to the file')
    arg_parser.add_argument('--input', nargs='+', required=False, type=str, help='paths to directory containing files')
    arg_parser.add_argument('--output', required=True, help='output file name')
    arg_parser.add_argument('--json', type=str, help='index file')
    arg_parser.add_argument('--start', type=str, help='index start at')
    arg_parser.add_argument('--end', type=str, help='index end at')
    args = arg_parser.parse_args()
    if args.mode == 'parse_owl' and args.input:
        # BUG FIX: --input is a list (nargs='+'), but parse_owl_directory
        # expects a single directory path; iterate instead of passing the list.
        for directory in args.input:
            parse_owl_directory(directory)
    if args.mode == 'intersect' and args.json and args.output:
        if args.start and args.end:
            intersect(args.json, args.output, args.index, int(args.start), int(args.end))
        else:
            intersect(args.json, args.output, args.index)
    if args.mode == 'reverse' and args.input and args.output:
        reverse(args.input, args.output)
    return


if __name__ == '__main__':
    main()