-
Notifications
You must be signed in to change notification settings - Fork 2
/
parse_elections.py
214 lines (165 loc) · 6.33 KB
/
parse_elections.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
# -*- coding: utf-8 -*-
import os
from models import *
from model_helpers import parse_name, get_or_create_person, get_party
from model_helpers import get_or_create_source_doc
import re
import io
import inspect
f_path = 'source_docs/SoS/election_results/txts/'
# select the race_types into a list of matching purposes
race_types = []
for race_type in Race_Type.select():
race_types.append(race_type)
# election date pattern
date_pattern = re.compile("^\s*(\w+,\s\w+\s\d{2},\s\d{4})$")
# candidate match pattern
cand_pattern = re.compile("^\s+(\w[\w\.\s,\xad'\(\)]+)\s{2,}([A-Za-z\d]{2,3})\s+([\d,]+)\s+([\d\.%]+)\s*$")
elections = []
# loop over the election results .txt files...
for i in os.listdir(f_path):
# skipping any file without the .txt extension
if '.txt' in i:
print 'Getting data from {}'.format(i)
# set up an election for each file
election = Election(
date = None
, races = []
)
# this particular election was not available in pdf
# had to copy the text from the 2001 Blue Book
if i == 'AllRacesSpecialMarch2000SD5.txt':
election.source_doc = get_or_create_source_doc(
name = 'All Races Special March 2000 SD5'
, file_name = f_path + i
, url = 'http://s1.sos.mo.gov/cmsimages/bluebook/2001-2002/0711-0717.pdf#p715'
, parent = None
)
else:
source_doc_file = f_path + i
election.source_doc = Source_Doc.get(Source_Doc.file_name == source_doc_file.replace('txt', 'pdf'))
# determine which type of election it is (based on file name)
for elec_type in Election_Type.select():
if elec_type.name in i:
election.election_type = elec_type
# open the file
with io.open(f_path + i, mode = 'r', encoding='UTF-8') as f:
# declare a line reader so that we can reference line numbers (i.e., index position)
reader = f.readlines()
# for each line in the file
for idx, line in enumerate(reader):
# ignore lines with only one non-whitespace character
if len(line.strip()) > 1:
date_match = re.match(date_pattern, line)
cand_match = re.match(cand_pattern, line)
# if the line matches the election date pattern...
if date_match != None:
# set this election attribute
election.election_date = date_match.group(1)
# if it's a general election...
if election.election_type.name == 'General':
# assume it's for the assembly starting next year
election.assembly = Assembly.get(start_year = int(re.search('\d{4}', election.election_date).group()) + 1)
# if it's a special election...
elif election.election_type.name == 'Special':
try:
# first, try getting an assembly that started the same year...
election.assembly = Assembly.get(start_year = int(re.search('\d{4}', election.election_date).group()))
except Assembly.DoesNotExist:
# then try getting an assembly that ended the same year...
election.assembly = Assembly.get(end_year = int(re.search('\d{4}', election.election_date).group()))
elif 'State of Missouri' in line:
election.name = line.replace('State of Missouri', '').strip()
# see if any of the race types names are in the line
elif any(race_type.name in line for race_type in race_types):
# if so, then set up a new race
race = Race(
candidates = []
, total_votes = None
)
# set which type of race it is
for race_type in race_types:
if race_type.name in line:
# then set this attribute
race.race_type = race_type.id
# if the candidate line also contains the word 'District'...
if 'District' in line:
# try finding the district number and setting this attribute
try:
race.district = re.search('\d+', line).group()
except AttributeError:
print 'No district number'
print repr(line)
# if the line matches the candidate pattern
elif cand_match != None:
# append a candidate to the race's list
race.candidates.append(
Race_Candidate(
raw_name = cand_match.group(1).replace(u'\xad', '-').strip()
, party = get_party(cand_match.group(2).strip())
, votes = cand_match.group(3).strip().replace(',', '')
, pct_votes = cand_match.group(4).replace('%', '').strip()
)
)
# if the phrase 'Total Votes' is in the line
elif 'Total Votes' in line:
# search the current line for the total votes number
try:
race.total_votes = re.search('[\d|,]+', line).group().replace(',', '')
# if not founnd, try the next line
except AttributeError:
race.total_votes = re.search('[\d|,]+', reader[idx + 1]).group().replace(',', '')
# after getting the total votes, append the race to the election's list
election.races.append(race)
elections.append(election)
print '=============='
for election in elections:
try:
with db.atomic():
election.save()
except Exception as e:
if 'duplicate' in e.message:
pass
else:
print 'Error on line #{0}: {1}'.format(inspect.currentframe().f_lineno, e)
for race in election.races:
# for now, ignore judicial races, propositions and constitutional amendments
if race.race_type.name not in [
'Constitutional Amendment'
, 'Proposition'
, 'Court of Appeals'
, 'Missouri Supreme Court'
, 'Circuit Judge'
, 'Assoc. Circuit Judge'
, 'U.S. President And Vice President'
]:
race.election = election.id
print ' {}, District # {}'.format(election.election_date, race.district)
try:
with db.atomic():
race.save()
except Exception as e:
if 'duplicate' in e.message:
pass
else:
print 'Error on line #{0}: {1}'.format(inspect.currentframe().f_lineno, e)
for candidate in race.candidates:
candidate.race = race.id
raw_name = candidate.raw_name
parsed_name = parse_name(candidate.raw_name.replace(',,', ','))['name_dict']
# set the person attribute
candidate.person = get_or_create_person(parsed_name)['person']
for k, v in candidate._data.iteritems():
print ' {0}: {1}'.format(k, repr(v))
# now save
try:
with db.atomic():
candidate.save()
except Exception as e:
if 'duplicate' in e.message:
pass
else:
print 'Error on line #{0}: {1}'.format(inspect.currentframe().f_lineno, e)
print '------'
print '==================='
print 'fin.'