-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathmake_toponym_json.py
90 lines (77 loc) · 4.1 KB
/
make_toponym_json.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
#Make Annotation Files
import os
import json
# Absolute input directories; edit these to point at the local data copies.
voltext_directory = "/Users/grant/devel/GeoAnnotate/volume_text"
docgeo_directory = "/Users/grant/devel/GeoAnnotate/docgeo_spans_dloaded_103115"
toponym_directory = "/Users/grant/devel/GeoAnnotate/toponym_annotated_103115"

# Map volume id -> full raw contents of that volume's text file.
# The id is the integer value of the file name up to the first '.', rendered
# back as a string, so a name such as "061.txt" is stored under '61'.
# NOTE(review): the file is read in binary mode while the rest of the script
# searches and slices this value with str arguments -- presumably written
# for Python 2; confirm the target interpreter before porting.
voltext_dict = {}
for f in os.listdir(voltext_directory):
    try:
        vol = str(int(f.split('.')[0]))
    except ValueError:
        # os.listdir returns every directory entry, including ones such as
        # ".DS_Store" whose stem is not a volume number; skip those instead
        # of aborting the whole run.
        continue
    fp = os.path.join(voltext_directory, f)
    with open(fp, 'rb') as r:
        voltext_dict[vol] = r.read()
# Map volume id -> {"<start>-<end>": document-span record}.
# File names are split as "<annotator>-<vol>.<ext>". Each file holds
# '|'-separated records whose '$'-separated fields are read as:
#   [1] span start offset, [2] span end offset, [3] value stored under 'geo'.
# The offsets index into the volume text held in voltext_dict.
docgeo_dict = {}
for f in os.listdir(docgeo_directory):
    if '-' not in f:
        # Not an "<annotator>-<vol>" file (e.g. ".DS_Store"); nothing to parse.
        continue
    fp = os.path.join(docgeo_directory, f)
    name_parts = f.split('-')
    annotator = name_parts[0]
    # Take the volume from the bare file name, not from the joined path: a
    # '-' anywhere in the directory path would otherwise shift the index.
    vol = name_parts[1].split('.')[0]
    if vol not in docgeo_dict:
        docgeo_dict[vol] = {}
    with open(fp, 'rb') as r:
        records = r.read().split('|')
    # Document ids are 1-based and restart for every input file.
    i = 1
    for doc in records:
        if not doc.strip():
            # Empty file or trailing '|' separator: no fields to read.
            continue
        row = doc.split('$')
        char_start = row[1]
        char_end = row[2]
        geo = row[3]
        span_start = int(char_start)
        span_end = int(char_end)
        # Keyed by the raw "<start>-<end>" strings; check_in_doc parses the
        # bounds back out of this key.
        docgeo_dict[vol][char_start + '-' + char_end] = {
            'docid': i,
            'doc_charstart': span_start,
            'doc_charend': span_end,
            'vol': vol,
            'geo': geo,
            'annotator': annotator,
            'text': voltext_dict[vol][span_start:span_end],
        }
        i += 1
# Per-volume cut-off markers. For a volume listed here, the toponym loop
# keeps only toponyms whose start offset lies before the first occurrence of
# the marker text in that volume.
vol_stop_strings = {
    '61': "The detachment from Army of the Tennessee re-embarks for Vicksburg, Miss.",
    '75': "for I have a very high personal esteem for General Hovey, and believe he is unquestionably a most gallant soldier",
    '76': "done they will be arrested in the same manner and banished from the United States as these men",
    '77': "On taking command, by the request of my superior officer, Colonel F. Campbell, by direction of Colonel McMillen",
    '78': "I suggest that rations be sent to Colonel Wolfe's brigade, and that they",
    '79': "Two prisoners brought in on train, captured near Midway.",
}

# vol -> {document span key -> output record}; filled by the toponym loop.
topo_dict = {}
def check_in_doc(docgeo_dict, vol, start_char, end_char):
    """Return the key of the document span that contains a toponym span.

    Args:
        docgeo_dict: mapping vol -> {"<start>-<end>": record}. Only the keys
            of the inner mapping are inspected.
        vol: volume id to look up.
        start_char: toponym start offset (int or numeric string).
        end_char: toponym end offset (int or numeric string).

    Returns:
        The first "<start>-<end>" key, in iteration order, for which
        doc_start <= start_char <= doc_end and end_char <= doc_end.
        False when no span matches or the volume has no spans at all.
    """
    start = int(start_char)
    end = int(end_char)
    # A volume without any document spans has no match; report that with the
    # usual False result rather than raising KeyError.
    for doc in docgeo_dict.get(vol, ()):
        bounds = doc.split('-')
        doc_start = int(bounds[0])
        doc_end = int(bounds[1])
        if doc_start <= start <= doc_end and end <= doc_end:
            return doc
    return False
# Attach every annotated toponym to the document span that contains it.
# File names are split as "<annotator>-<vol>.<ext>". Each file holds
# '|'-separated records whose '$'-separated fields are read as:
#   [0] entity type, [1] start offset, [2] end offset, [3] value stored
#   under 'geo'.
# Input offsets index into the volume text; the output stores them relative
# to the start of the enclosing document span.
for f in os.listdir(toponym_directory):
    if '-' not in f:
        # Not an "<annotator>-<vol>" file (e.g. ".DS_Store"); nothing to parse.
        continue
    fp = os.path.join(toponym_directory, f)
    vol = f.split('-')[1].split('.')[0]
    topo_dict[vol] = {}

    # Locate the cut-off marker once per file instead of once per record.
    # None means no cut-off is configured for this volume. find() yields -1
    # when the marker is absent, in which case no record passes the check.
    if vol in vol_stop_strings:
        stop_offset = voltext_dict[vol].find(vol_stop_strings[vol])
    else:
        stop_offset = None

    with open(fp, 'rb') as r:
        records = r.read().split('|')

    for line in records:
        if not line.strip():
            # Empty file or trailing '|' separator: no fields to read.
            continue
        row = line.split('$')
        ne_type = row[0]
        char_start = int(row[1])
        char_end = int(row[2])
        geo = row[3]

        # Drop toponyms that start at or after the cut-off marker.
        if stop_offset is not None and stop_offset <= char_start:
            continue

        cd = check_in_doc(docgeo_dict, vol, char_start, char_end)
        if cd is False:
            # Toponym is not inside any annotated document span.
            continue

        doc = docgeo_dict[vol][cd]
        doc_start = doc['doc_charstart']
        toponym = {
            'geo': geo,
            'char_start': char_start - doc_start,
            'char_end': char_end - doc_start,
            'entity_string': voltext_dict[vol][char_start:char_end],
            'entity_type': ne_type,
        }
        if cd not in topo_dict[vol]:
            # Every record carries the same keys, including 'vol', whether
            # or not the volume has a cut-off marker.
            topo_dict[vol][cd] = {
                'vol': vol,
                'docid': doc['docid'],
                'text': doc['text'],
                'toponyms': [],
            }
        topo_dict[vol][cd]['toponyms'].append(toponym)
# Dump one JSON file per (volume, document span) into the current working
# directory; the name is "vol" + volume id + "_" + span key + ".json".
for volume_id, docs_by_span in topo_dict.items():
    for span_key, record in docs_by_span.items():
        out_name = ''.join(['vol', volume_id, '_', span_key, '.json'])
        with open(out_name, 'wb') as out:
            json.dump(record, out, indent=5)