forked from openstates/openstates-scrapers
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathpurge_old_committee_ids.py
149 lines (133 loc) · 5.77 KB
/
purge_old_committee_ids.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
from billy.core import db, feeds_db
from billy.core import settings
from billy.core import logging
def main():
import sys
abbrs = sys.argv[1:] or [x['abbreviation'] for x in db.metadata.find()]
logger = logging.getLogger('purge_committee_ids')
logger.setLevel(logging.DEBUG)
for abbr in abbrs:
spec = {settings.LEVEL_FIELD: abbr}
committee_ids = [c['_id'] for c in db.committees.find(spec, fields=['_id'])]
# Events with committee participants.
spec = {
settings.LEVEL_FIELD: abbr,
'participants.committee_id': {'$nin': committee_ids}
}
for event in db.events.find(spec):
old_ids = set()
count = 0
found = False
for participant in event['participants']:
for id_key in 'committee_id', 'id':
_id = participant.get(id_key, None)
type_ = participant.get('participant_type')
if id_key == 'id' and type_ != 'committee':
continue
if _id and (_id not in committee_ids):
found = True
msg = 'Removing participant %r from event %r'
logger.info(msg % (participant[id_key], event['_id']))
# Leave the participant in but set their id to none.
# Text will still be displayed without a hyperlink.
participant[id_key] = None
if found:
msg = 'Removed %d old committee %r ids from %r'
logger.info(msg % (count, old_ids, event['_id']))
db.events.save(event, safe=True)
# Related committees in bill actions.
spec = {
settings.LEVEL_FIELD: abbr,
'actions.related_entities.type': 'committee'
}
for bill in db.bills.find(spec):
old_ids = set()
count = 0
found = False
for action in bill['actions']:
for entity in action['related_entities']:
if entity['type'] == 'committee':
if entity['id'] and (entity['id'] not in committee_ids):
found = True
count += 1
old_ids.add(entity['id'])
msg = 'Removing entity %r from action in %r'
logger.debug(msg % (entity['id'], bill['bill_id']))
# Completely remove the related entity. Without an
# id it has no other purpose.
action['related_entities'].remove(entity)
if found:
msg = 'Removed %d old committee %r ids from %r'
logger.info(msg % (count, old_ids, bill['_id']))
db.bills.save(bill, safe=True)
# Legislator old roles.
spec = {
settings.LEVEL_FIELD: abbr,
'old_roles': {'$exists': True}
}
for leg in db.legislators.find(spec):
old_ids = set()
count = 0
found = False
for role in leg['old_roles']:
if 'committee_id' in role:
_id = role['committee_id']
if _id and (_id not in committee_ids):
found = True
count += 1
old_ids.add(_id)
msg = 'Removing id %r from old_role in %r'
logger.info(msg % (role['committee_id'], leg['full_name']))
# Set the id to None.
role['committee_id'] = None
if found:
msg = 'Removed %d old committee %r ids from %r'
logger.info(msg % (count, old_ids, leg['_id']))
db.legislators.save(leg, safe=True)
# Related entities in feeds.
spec = {
settings.LEVEL_FIELD: abbr,
'entity_ids': {'$ne': None}
}
for entry in feeds_db.entries.find(spec):
old_ids = set()
count = 0
found = False
for entity_id in entry['entity_ids']:
if entity_id[2] == 'C':
if entity_id not in committee_ids:
found = True
count += 1
msg = 'Removing id %r from feed %r'
logger.info(msg % (entity_id, entry['_id']))
# Delete the entity from the feed.
old_ids.add(entity_id)
index = entry['entity_ids'].index(entity_id)
del entry['entity_ids'][index]
del entry['entity_strings'][index]
if found:
msg = 'Removed %d old committee ids %r from %r'
logger.info(msg % (count, old_ids, entry['_id']))
feeds_db.entries.save(entry, safe=True)
# Nuke any committee sponsors of bills.
spec = {
settings.LEVEL_FIELD: abbr,
'sponsors.committee_id': {'$nin': committee_ids}
}
for bill in db.bills.find(spec):
count = 0
found = False
old_ids = set()
for sponsor in bill.get('sponsors', []):
if 'committee_id' in sponsor:
_id = sponsor['committee_id']
old_ids.add(_id)
found = True
count += 1
del sponsor['committee_id']
if found:
msg = 'Removed %d old committee ids %r from %r'
logger.info(msg % (count, old_ids, bill['_id']))
db.bills.save(bill)
if __name__ == '__main__':
main()