#!/usr/bin/env python2
# general tool to extract from, rewrite or add to text collection files
# that are split into "chunks". This is our primary text storage format
# aka "pubStore format".
# first load the standard libraries from python
# we require at least python 2.7
#from sys import *
from __future__ import print_function
import sys
if sys.version_info[0]==2 and not sys.version_info[1]>=7:
    print("Sorry, this program requires at least python 2.7")
    exit(1)
# load default python packages
import logging, optparse, os, glob, zipfile, types, gzip, shutil
from os.path import *
# add <scriptDir>/lib/ to package search path
progFile = os.path.abspath(sys.argv[0])
progDir = os.path.dirname(progFile)
pubToolsLibDir = os.path.join(progDir, "lib")
sys.path.insert(0, pubToolsLibDir)
# now load our own libraries
import pubChange
import pubGeneric, maxRun, pubStore, pubConf, maxCommon, pubXml, pubPubmed
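# (roughly, as used in this script: pubChange provides the filter/rechunk/addPmids
# operations, pubStore the reader/writer for the chunked text format, pubConf the
# resolution of dataset names to text directories, and pubGeneric/maxCommon small
# helpers for command-line options, logging and directory handling)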
# === CONSTANTS ===================================
# === COMMAND LINE INTERFACE, OPTIONS AND HELP ===
parser = optparse.OptionParser("""usage: %prog [options] command options

command is one of filterText, filterPmid, filterIssn, addPmids, rechunk, index, show

command "filterText":
    create a subset of dataset(s) with text that contains one of the given keywords
    %prog filterText <datasetList> <keywordListCommaSep> <datasetOut>
    example:
    %prog filterText pmc,elsevier,crawler ebola,filovirus ebola

command "filterPmid":
    create a subset of dataset(s) restricted to a list of PMIDs and write it to a new dataset
    %prog filterPmid <datasetList> <pmidListFile> <datasetOut>
    example:
    %prog filterPmid pmc,elsevier,crawler uniprotPmids.txt uniProtText

command "filterIssn":
    create a subset of dataset(s) restricted to a list of ISSNs
    %prog filterIssn <datasetList> issn1,issn2,issn3,... <outFname>
    example:
    %prog filterIssn pmc 1367-4803 bioinformatics
    It may be easier to use sqlite3 + filterPmid for ISSN filtering.

command "addPmids":
    read PMIDs from datasetDir/medline.ids.tab, rewrite all articles.gz files
    and add the new PMIDs. medline.ids.tab can be created with pubFingerprint.
    example:
    %prog addPmids elsevier

command "rechunk":
    %prog rechunk <inDir> <outDir>
    read all chunked text tables from inDir and write them to outDir, merging
    smaller chunks into bigger ones.

command "index":
    %prog index <inDir>
    load missing article files into the sqlite index.

command "show":
    %prog show <inDirList> <docSelector>
    pull a single document out of the text directories specified.
    Output goes to stdout, attributes are prefixed with |
    example:
    %prog show pmc 'pmid=1234567'
""")
#parser.add_option("-d", "--debug", dest="debug", action="store_true", help="show debug messages")
#parser.add_option("", "--minId", dest="minId", action="store", help="numerical IDs written to the pubStore start at this number times one billion to prevent overlaps of numerical IDs between publishers, default %default", default=pubConf.identifierStart["medline"])
#parser.add_option("", "--parse", dest="parse", action="store", help="for debugging, just parse one single xml file", default=None)
pubGeneric.addGeneralOptions(parser)
(options, args) = parser.parse_args()
# ==== FUNCTIONs =====
def main(args, options):
    cmd = args[0]
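
    # "filterIssn": scan all articles in the input dataset(s) and copy those whose
    # electronic or print ISSN matches one of the given ISSNs into a new store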
    if cmd=="filterIssn":
        inSpec, issnListStr, outFname = args[1:]
        store = pubStore.PubWriterFile(outFname)
        issns = issnListStr.split(",")
        inDirs = pubConf.resolveTextDirs(inSpec)
        for inDir in inDirs:
            for reader in pubStore.iterPubReaders(inDir):
                for article, files in reader.iterArticlesFileList(None):
                    if article.eIssn in issns or article.printIssn in issns:
                        fileDicts = [f._asdict() for f in files]
                        store.writeDocs(article._asdict(), fileDicts)
        store.close()

    elif cmd=="filterPmid":
        inSpec, pmidFname, outDir = args[1:]
        pmidFname = os.path.abspath(pmidFname)
        assert(isfile(pmidFname))
        pubChange.filterCmd(inSpec, pmidFname, outDir, options)
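
    # "filterText": run the keyword filter into a temporary "parts" directory, then
    # rechunk the parts into the final output directory and remove the intermediates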
    elif cmd=="filterText":
        inSpec, searchSpec, outDir = args[1:]
        partsDir = join(outDir, "parts")
        maxCommon.mustBeEmptyDir(partsDir, makeDir=True)
        outFnames = pubChange.filterCmd(inSpec, searchSpec, partsDir, options)
        pubChange.rechunk(partsDir, outDir)
        # cleanup
        for fname in outFnames:
            os.remove(fname)
        os.removedirs(partsDir)
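
    # "filterJob" is not listed in the usage message; it filters a single chunk
    # (pubChange.filterOneChunk), presumably as one job of the commands above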
    elif cmd=="filterJob":
        inSpec, pmidFname, outSpec = args[1:]
        pubChange.filterOneChunk(inSpec, pmidFname, outSpec)

    elif cmd=="rechunk":
        inDir, outDir = args[1:]
        pubChange.rechunk(inDir, outDir)

    elif cmd=="index":
        inDir = args[1]
        inDir = pubConf.resolveTextDir(inDir)
        pubStore.updateSqlite(inDir)
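
    # "show": look up documents matching the selector expression and print the
    # article record followed by each of its file records to stdout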
    elif cmd=="show":
        inDirStr, whereExpr = args[1:]
        inDirs = inDirStr.split(",")
        for artDict, fileDicts in pubStore.lookupFullDocs(inDirs, whereExpr):
            print(pubStore.dictToMarkLines(artDict))
            print()
            for fileDict in fileDicts:
                print(pubStore.dictToMarkLines(fileDict))

    elif cmd=="addPmids":
        pubChange.addPmids(args[1])

    else:
        raise Exception("Unknown command %s" % cmd)
# ----------- MAIN --------------
if args==[]:
    parser.print_help()
    exit(1)
# normal operation
pubGeneric.setupLogging(progFile, options)
main(args, options)