-
Notifications
You must be signed in to change notification settings - Fork 3
/
Copy pathmongo_import_pmc_oas_local.py
48 lines (40 loc) · 1.31 KB
/
mongo_import_pmc_oas_local.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
#!/usr/bin/env python
"""
If run as a script, crawls the directory tree provided as an argument, and imports all .nxml files in that directory into a Mongo collection.
If imported, provides functions for building a Mongo document from a .nxml file and for sampling articles from a Mongo collection.
"""
import os
import pymongo
def mongo_document_from_nxml(file_path):
    """Build a Mongo document from a single .nxml file.

    Args:
        file_path: Path to the .nxml file to read.

    Returns:
        A dict with ``'_id'`` set to the file's base name without its final
        extension, and ``'nxml'`` set to the complete text of the file.

    Raises:
        OSError: If the file cannot be opened or read.
        UnicodeDecodeError: If the file is not valid UTF-8.
    """
    base = os.path.basename(file_path)
    name = os.path.splitext(base)[0]
    # Decode explicitly as UTF-8 (the XML default) instead of relying on the
    # platform locale, which would make the import result machine-dependent.
    with open(file_path, 'r', encoding='utf-8') as f:
        nxml = f.read()
    return {
        '_id': name,
        'nxml': nxml,
    }
def sample_articles_from_collection(n, articles, out_collection="articlesubset"):
    """Write a random sample of documents from a collection to another one.

    Runs an aggregation pipeline on ``articles`` consisting of a ``$sample``
    stage followed by an ``$out`` stage.

    Args:
        n: Sample size passed to the ``$sample`` stage.
        articles: Collection object exposing ``aggregate(pipeline)``.
        out_collection: Name of the collection given to ``$out``. Defaults to
            ``"articlesubset"``, the name that was previously hard-coded.
            NOTE(review): per MongoDB's ``$out`` semantics an existing
            collection with this name is replaced - confirm this is intended.

    Returns:
        None. The sampled documents are written server-side by ``$out``.
    """
    pipeline = [
        {"$sample": {"size": n}},
        {"$out": out_collection},
    ]
    articles.aggregate(pipeline)
if __name__ == '__main__':
    # Script entry point: walk --pmc_path and load every .nxml file found
    # into the `articles` collection of the `pmc` database on the default
    # MongoClient() connection.
    import argparse

    parser = argparse.ArgumentParser()
    parser.add_argument(
        '--pmc_path', default='/Volumes/Transcend/pmc'
    )
    args = parser.parse_args()

    articles = pymongo.MongoClient().pmc.articles
    # Destructive: the collection is emptied so each run is a fresh import.
    articles.drop()
    for (dirpath, dirnames, filenames) in os.walk(args.pmc_path):
        for name in filenames:
            print(name)
            # Match the real extension; a bare 'nxml' suffix would also
            # accept names that merely end in those letters.
            if name.endswith('.nxml'):
                filepath = os.path.join(dirpath, name)
                article = mongo_document_from_nxml(filepath)
                # insert_one() replaces Collection.insert(), which was
                # deprecated and then removed in PyMongo 4.
                articles.insert_one(article)
    # count_documents({}) replaces Collection.count(), also removed in
    # PyMongo 4.
    print(articles.count_documents({}))