-
Notifications
You must be signed in to change notification settings - Fork 11
/
Copy pathgetIdTable.py
109 lines (94 loc) · 3.87 KB
/
getIdTable.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
"""
Extract the identifiers (origin file name, DOI, PMID, PMCID) associated with each
article of a TEI corpus file and write them to a CSV file
"""
import os
import argparse
import ntpath
import csv
import xml
from xml.sax import make_parser, handler
class TEICorpusHandler(xml.sax.ContentHandler):
    """
    TEI XML SAX handler for reading the TEI corpus file corresponding to the Softcite corpus.

    One CSV row (``id``, ``origin``, ``DOI``, ``PMID``, ``PMCID``) is written to
    ``output_path`` each time a ``</body>`` element is closed, using the
    ``<idno>`` values and the ``xml:id`` of ``<fileDesc>`` collected since the
    enclosing ``<TEI>`` element was opened.

    The output file is (re)created when ``<teiCorpus>`` is opened, or lazily on
    the first row when the input has no ``<teiCorpus>`` root. It is closed on
    ``</teiCorpus>`` and at the end of the document; if parsing aborts with an
    exception the caller should call :meth:`close`.
    """

    # header row of the produced CSV file
    CSV_HEADER = ["id", "origin", "DOI", "PMID", "PMCID"]

    # working variables (class-level defaults, overridden per instance while parsing)
    accumulated = ''
    is_origin_file = False
    origin_file = None
    nb_file = 0
    doc_id = None
    current_identifier_type = None
    output_file = None
    writer = None

    def __init__(self, output_path):
        """
        :param output_path: path of the CSV file to write
        """
        xml.sax.ContentHandler.__init__(self)
        self.output_path = output_path
        # identifiers of the document being read, keyed by <idno> type
        self.current_identifier = {}

    def _open_output(self):
        """(Re)create the CSV file and write the header row."""
        self.close()
        self.output_file = open(self.output_path, "w", newline='', encoding="utf-8")
        self.writer = csv.writer(self.output_file)
        self.writer.writerow(self.CSV_HEADER)

    def _write_row(self):
        """Write the CSV row of the current document; missing identifiers are left empty."""
        if self.writer is None:
            # no <teiCorpus> root was seen: create the file on first use
            self._open_output()
        identifiers = self.current_identifier
        self.writer.writerow([
            self.doc_id,
            self.origin_file,
            identifiers.get("DOI", ""),
            identifiers.get("PMID", ""),
            identifiers.get("PMC", ""),
        ])

    def close(self):
        """Close the CSV file if it is still open; safe to call several times."""
        if self.output_file is not None and not self.output_file.closed:
            self.output_file.close()
        self.writer = None

    def startElement(self, name, attrs):
        self.accumulated = ''
        if name == 'idno':
            # always overwrite the type, so that an <idno> without a "type"
            # attribute cannot be stored under the type of the previous one
            self.current_identifier_type = attrs.get("type")
            # the "origin" identifier is where we get the PDF file name
            self.is_origin_file = self.current_identifier_type == 'origin'
        elif name == 'TEI' or name == 'tei':
            # reinit working variables, including the origin file name so that
            # a document without one does not inherit it from the previous document
            self.is_origin_file = False
            self.nb_file += 1
            self.doc_id = None
            self.origin_file = None
            self.current_identifier = {}
        elif name == "fileDesc":
            if "xml:id" in attrs:
                self.doc_id = attrs.getValue("xml:id")
                self.current_identifier["id"] = self.doc_id
        elif name == 'teiCorpus':
            self._open_output()

    def endElement(self, name):
        if name == 'idno':
            value = self.accumulated.strip()
            if self.is_origin_file:
                self.is_origin_file = False
                self.origin_file = value
            if self.current_identifier_type is not None:
                self.current_identifier[self.current_identifier_type] = value
            self.current_identifier_type = None
        elif name == "body":
            # write entry
            self._write_row()
        elif name == 'teiCorpus':
            self.close()
        self.accumulated = ''

    def endDocument(self):
        # covers inputs whose root element is not <teiCorpus>
        self.close()

    def characters(self, content):
        self.accumulated += content

    def clear(self):  # clear the accumulator for re-use
        self.accumulated = ""
def inject_corpus_annotations(tei_corpus_path, output_path):
    """
    Parse the TEI corpus file with a TEICorpusHandler, which writes the
    identifier table to a CSV file.

    :param tei_corpus_path: path to the TEI corpus file to read
    :param output_path: path of the CSV file to write
    """
    parser = make_parser()
    # named corpus_handler to avoid shadowing the imported xml.sax `handler` module
    corpus_handler = TEICorpusHandler(output_path)
    parser.setContentHandler(corpus_handler)
    try:
        parser.parse(tei_corpus_path)
    finally:
        # the handler keeps its CSV file in `output_file`; make sure it does
        # not stay open when parsing aborts before the end of the corpus
        output_file = getattr(corpus_handler, "output_file", None)
        if output_file is not None and not output_file.closed:
            output_file.close()
if __name__ == "__main__":
    # command line entry point: read a TEI corpus file, write the identifier table as CSV
    parser = argparse.ArgumentParser(
        description = "Extract all the identifiers associated to articles and output them in a csv file")
    parser.add_argument("--tei-corpus", type=str,
        help="path to the Softcite TEI corpus file corresponding to the curated annotated corpus to inject")
    # the value is opened as a file for writing, not used as a directory
    parser.add_argument("--output", type=str,
        help="path to the CSV file to write")

    args = parser.parse_args()
    tei_corpus_path = args.tei_corpus
    output_path = args.output

    # check paths and stop (usage message, exit status 2) instead of
    # going on to parse with arguments that are known to be invalid
    if tei_corpus_path is None or not os.path.isfile(tei_corpus_path):
        parser.error("the path to the TEI corpus file is not valid: " + str(tei_corpus_path))
    if not output_path:
        parser.error("the path to the output CSV file is missing (--output)")

    inject_corpus_annotations(tei_corpus_path, output_path)