This repository has been archived by the owner on Mar 3, 2020. It is now read-only.
-
Notifications
You must be signed in to change notification settings - Fork 11
/
Copy pathembargo_sync.py
98 lines (85 loc) · 3.25 KB
/
embargo_sync.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
#!/usr/bin/env python3
from config import Config
from blackfynn import Blackfynn
from pymongo import MongoClient
from collections import Counter
bf = None
embargoed = None
def connect_blackfynn():
    """Create and return an authenticated Blackfynn API client.

    Credentials and host come from the ``Config`` object; ``env_override``
    is disabled so environment variables cannot shadow the configured keys.
    """
    print('Connecting to Blackfynn ...', end=' ')
    client = Blackfynn(
        api_token=Config.BLACKFYNN_API_TOKEN,
        api_secret=Config.BLACKFYNN_API_SECRET,
        env_override=False,
        host=Config.BLACKFYNN_API_HOST,
    )
    print('done')
    return client
def connect_mongo():
    """Connect to MongoDB and return the target collection.

    The URI, database name, and collection name are all read from ``Config``.
    """
    print('Connecting to MongoDB ...', end=' ')
    database = MongoClient(Config.MONGODB_URI)[Config.MONGODB_NAME]
    target = database[Config.MONGODB_COLLECTION]
    print('done')
    return target
def transform(ds, bf_api=None, bf_org=None):
    """Convert a dataset JSON blob from the Blackfynn API into a MongoDB document.

    Parameters
    ----------
    ds : dict
        Raw dataset JSON as returned by ``GET /datasets`` (must contain
        ``content``, ``owner``, and ``storage`` keys).
    bf_api : optional
        Blackfynn low-level API client exposing ``_get``/``_uri``.  Defaults
        to the module-level ``api`` set up in ``__main__`` (backward compatible).
    bf_org : optional
        Organization JSON (``GET /organizations/{id}``).  Defaults to the
        module-level ``org`` set up in ``__main__`` (backward compatible).

    Returns
    -------
    dict
        Document keyed by the dataset's integer id, ready for insertion.

    Notes
    -----
    The original version set ``'organization': ds['organization']`` and then
    immediately overwrote it with the org *name* via a duplicate dict key;
    the dead first entry has been removed.  Model/file/record counts were
    commented out upstream and remain unimplemented (TODO).
    """
    client = bf_api if bf_api is not None else api
    organization = bf_org if bf_org is not None else org
    content = ds['content']
    doc = {
        '_id': content['intId'],
        'name': content['name'],
        # description may be absent on some datasets, hence .get()
        'description': content.get('description'),
        'createdAt': content['createdAt'],
        'updatedAt': content['updatedAt'],
        'tags': content['tags'],
        'contributors': content['contributors'],
        'userId': ds['owner'],
        'size': ds['storage'],
        'organization': organization['organization']['name'],
        'organizationId': organization['organization']['intId'],
        # One extra API round-trip per dataset to resolve the banner image URL.
        'banner': client._get(
            client._uri('/datasets/{dsid}/banner', dsid=content['id'])
        ).get('banner', None),
    }
    return doc
if __name__ == '__main__':
    print('Starting embargoed data sync')
    bf = connect_blackfynn()
    api = bf._api.core
    embargoed = connect_mongo()
    # Current organization context (assumed to be the SPARC Consortium org).
    org = api._get(api._uri('/organizations/{orgid}', orgid=bf.context.id))
    all_datasets = api._get('/datasets')
    # Set for O(1) membership tests instead of scanning a list per dataset.
    published_ids = {x['sourceDatasetId'] for x in api._get('/datasets/published')}
    print(embargoed)
    # Rebuild the collection from scratch on every sync run.
    embargoed.drop()
    entries = []
    for ds in all_datasets:
        print(ds)
        # Fetch the team list ONCE per dataset (the original hit the API twice:
        # once for the debug print and again inside the condition).
        teams = api._get(
            api._uri('/datasets/{dsid}/collaborators/teams', dsid=ds['content']['id'])
        )
        print(teams)
        # Skip datasets that are already published or not shared with the
        # Embargoed Data team.
        if ds['content']['intId'] in published_ids or not any(
            t['id'] == Config.BLACKFYNN_EMBARGO_TEAM_ID for t in teams
        ):
            print('skipping')
            continue
        entry = transform(ds)
        entries.append(entry)
        print('Found:', entry['_id'], entry['name'], sep='\t')
    # insert_many raises on an empty list, so guard it.
    if entries:
        embargoed.insert_many(entries)
    print('Sync finished.')