-
Notifications
You must be signed in to change notification settings - Fork 0
/
ExtractedTextSet.py
executable file
·222 lines (192 loc) · 8.27 KB
/
ExtractedTextSet.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
"""
Name: ExtractedTextSet.py
Purpose:
This module provides utilities for recovering the extracted text for
references (bib_refs records) in the database.
Extracted text is stored in the bib_workflow_data table in the database,
but it is stored split into sections (body, references, supplemental, ...),
and it is not so easy to recover the full text concatenated back together.
The ExtractedTextSet class defined here does this for you.
Convenience functions for building an ExtractedTextSet for a set of
_refs_keys are also provided.
If run as a script, takes a _refs_key as a command line argument and writes
the (full) extracted text for the reference to stdout.
See ExtractedTextSet.py -h
"""
import argparse
def getExtractedTextSet(db,             # an initialized db module
                        refKeyList,     # list of _refs_keys
                        ):
    """
    Return an ExtractedTextSet for the references with the specified keys.

    Assumes refKeyList is small enough to format into a select statement.

    db         - an initialized db module; only db.sql(queryList, 'auto')
                 is called here.
    refKeyList - iterable of _refs_keys (ints, or strings holding ints).

    Returns an ExtractedTextSet built from the last result set returned by
    db.sql(). If refKeyList is empty, no query is run and an empty
    ExtractedTextSet is returned.

    Raises ValueError/TypeError if a key cannot be converted to an int.

    Example:
        import ExtractedTextSet
        import db
        db.set_sqlServer("bhmgidevdb01")
        db.set_sqlDatabase("prod")
        db.set_sqlUser("mgd_public")
        db.set_sqlPassword("mgdpub")
        refKeys = [390554, 390545]
        ets = ExtractedTextSet.getExtractedTextSet(db, refKeys)
        for r in refKeys:
            text = ets.getExtText(r)
            ...
    """
    # The keys are formatted directly into the SQL text, so force each one
    # through int() first: anything that is not a plain integer key is
    # rejected here instead of becoming part of the statement.
    keys = [str(int(r)) for r in refKeyList]

    if not keys:
        # an empty "in ( )" list is a SQL syntax error; there is nothing
        # to look up anyway
        return ExtractedTextSet([])

    query = '''
select bd._refs_key, t.term "text_type", bd.extractedtext "text_part"
from bib_workflow_data bd join voc_term t on
(bd._extractedtext_key = t._term_key)
where bd._refs_key in ( %s )
''' % ','.join(keys)

    results = db.sql([query], 'auto')
    # db.sql() is given a list of queries; the rows for ours are the last
    # entry in what it returns
    return ExtractedTextSet(results[-1])
#-----------------------------------
def getExtractedTextSetForTable(db,             # an initialized db module
                                tmpTableName,   # (string) name of tmp table
                                ):
    """
    Build an ExtractedTextSet covering every reference listed in a tmp table
    in the database.

    The tmp table only needs a _refs_key column (an index on that column
    is advisable for efficiency). It is joined to bib_workflow_data and
    voc_term to pull each text section along with its section type.

    Returns an ExtractedTextSet built from the last result set returned by
    db.sql().
    """
    sqlTemplate = '''
select r._refs_key, t.term "text_type", bd.extractedtext "text_part"
from %s r join bib_workflow_data bd on (r._refs_key = bd._refs_key)
join voc_term t on (bd._extractedtext_key = t._term_key)
'''
    # one query goes in; its rows are the last entry of what comes back
    rows = db.sql([sqlTemplate % tmpTableName], 'auto')[-1]
    return ExtractedTextSet(rows)
#-----------------------------------
class ExtractedTextSet (object):
    """
    IS   a collection of extracted text records (from multiple references)

    HAS  each extracted text record is a dict with fields
            {'_refs_key' : int, 'text_type': (e.g., 'body', 'reference'),
             'text_part': text}
         The records may have other fields too that are not used here.
         The field names '_refs_key', 'text_type', 'text_part' are
         specifiable.

    DOES (1) collects and concatenates all the text sections for a given
             _refs_key into a single text field, in the order given by
             validTextTypes - thus recapitulating the full extracted text.
         (2) getExtText(refKey) - get the extracted text for a given
             _refs_key
         (3) join a set of basic reference records to their extracted text
    """
    # from Vocab_key = 142 (Lit Triage Extracted Text Section vocab)
    # These are the expected values for the 'text_type' field.
    # The order of this list is also the order in which the sections are
    # concatenated by getExtText().
    validTextTypes = [ 'body', 'reference',
                       'author manuscript fig legends',
                       'star methods',
                       'supplemental', ]
    #-----------------------------------

    def __init__(self,
                 extTextRcds,            # list of rcds as above
                 keyLabel='_refs_key',   # name of the reference key field
                 typeLabel='text_type',  # name of the text type field
                 textLabel='text_part',  # name of the text field
                 ):
        """
        Store the records and field names, then index the text sections by
        reference key.
        Raises ValueError if a record has a text type that is not in
        validTextTypes.
        """
        self.keyLabel = keyLabel
        self.typeLabel = typeLabel
        self.textLabel = textLabel
        self.extTextRcds = extTextRcds
        self._gatherExtText()
    #-----------------------------------

    def hasExtText(self, refKey ):
        """ Return True if this ExtractedTextSet has a text record for
            refKey (refKey may be an int or a str)
        """
        return str(refKey) in self.key2TextParts
    #-----------------------------------

    def getExtText(self, refKey ):
        """ Return the full text for refKey (or '' if there is no text).
            Sections are concatenated in validTextTypes order; sections
            that are missing contribute nothing.
        """
        extTextDict = self.key2TextParts.get(str(refKey), {})
        return ''.join(extTextDict.get(textType, '')
                       for textType in self.validTextTypes)
    #-----------------------------------

    def joinRefs2ExtText(self,
                         refRcds,
                         refKeyLabel='_refs_key',
                         extTextLabel='ext_text',
                         allowNoText=True,
                         ):
        """
        Assume refRcds is a list of records { refKeyLabel : xxx, ...}
        For each record in the list, add a field:  extTextLabel: text
        so that the extracted text becomes part of the record.
        The records are modified in place and the same list is returned.
        If allowNoText is False, then a ValueError is raised if a refRcd is
        found with no extracted text record.
        """
        for r in refRcds:
            refKey = str(r[refKeyLabel])
            if not allowNoText and refKey not in self.key2TextParts:
                raise ValueError("No extracted text found for '%s'\n" % \
                                                                str(refKey))
            r[extTextLabel] = self.getExtText(refKey)
        return refRcds
    #-----------------------------------

    def _gatherExtText(self, ):
        """
        Gather the extracted text sections for each _refs_key
        Set self.key2TextParts and return it:
            dict { _refs_key: { extractedTextType : text } }
        E.g., { '12345' : { 'body'        : 'body section text',
                            'reference'   : 'ref section text',
                            'star methods': '...text...',
                          } }
        (we force all _refs_keys to strings so user can use either int or
        str)
        If a reference has more than one record of the same text type, the
        last one seen wins.
        Raises ValueError for a text type that is not in validTextTypes.
        """
        resultDict = {}
        for r in self.extTextRcds:
            refKey = str(r[self.keyLabel])
            textType = r[self.typeLabel]
            textPart = r[self.textLabel]

            if textType not in self.validTextTypes:
                raise ValueError("Invalid extracted text type: '%s'\n" % \
                                                                textType)

            # A missing (None) text section must contribute nothing;
            # str(None) would splice the literal word 'None' into the text.
            text = '' if textPart is None else str(textPart)

            resultDict.setdefault(refKey, {})[textType] = text

        self.key2TextParts = resultDict
        return self.key2TextParts
#-----------------------------------
# end class ExtractedTextSet -----------------------------------
#-----------------------------------
# if run as a script, write extracted text for a reference to stdout
#-----------------------------------
def getArgs(argv=None):
    """
    Parse the command line for the script entry point.

    argv - optional list of argument strings to parse instead of the
           process command line (None means use sys.argv[1:], which is
           argparse's own default).

    Returns the argparse namespace with:
        ref_key - the reference key given on the command line (a string)
        server  - 'adhoc', 'prod', or 'dev' (default 'dev')
        host    - db server host name for the selected server
        db      - database name for the selected server

    An unrecognized --server value is rejected by argparse (usage message
    and exit) rather than leaving host/db unset.
    """
    # server nickname -> (db host, database name)
    serverInfo = {
        'adhoc': ('mgi-adhoc.jax.org', 'mgd'),
        'prod':  ('bhmgidb01', 'prod'),
        'dev':   ('bhmgidevdb01', 'prod'),
    }

    parser = argparse.ArgumentParser(
        description='get extracted text for a reference and write it to stdout')

    parser.add_argument('ref_key',
        help="reference key to get extracted text for")

    parser.add_argument('-s', '--server', dest='server', action='store',
        required=False, default='dev', choices=sorted(serverInfo),
        help='db server: adhoc, prod, or dev (default)')

    args = parser.parse_args(argv)
    args.host, args.db = serverInfo[args.server]
    return args
#-----------------------------------
if __name__ == "__main__":
    import sys
    import db as dbModule

    # ref_key plus the db host/database chosen on the command line
    args = getArgs()

    # configure the db module: server, database, then the login
    for configure, value in (
            (dbModule.set_sqlServer, args.host),
            (dbModule.set_sqlDatabase, args.db),
            (dbModule.set_sqlUser, "mgd_public"),
            (dbModule.set_sqlPassword, "mgdpub"),
            ):
        configure(value)

    # fetch the text sections for the one reference and write the
    # concatenated text to stdout
    ets = getExtractedTextSet(dbModule, [args.ref_key])
    print(ets.getExtText(args.ref_key))