-
Notifications
You must be signed in to change notification settings - Fork 0
/
voters_2016.py
135 lines (113 loc) · 3.95 KB
/
voters_2016.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
import argparse
import pyspark
import re
# module-level SparkContext; main() reads the 2016 input files through it
sc = pyspark.SparkContext()
def getNameCount(line):
    """Split one raw 2016 record into (name, count of medicine for 2016).

    The line is cut at every single non-alphanumeric character, so runs of
    separators produce empty fields. The name is read from field 2 and the
    count from field 5; both are returned as strings.
    """
    pieces = re.split(r'[^a-zA-Z0-9]', line)
    name, count = pieces[2], pieces[5]
    return name, count
def filterReps(line):
    """Return True when the record's name has a 2015 Republican entry.

    ``line`` is a ``(name, count_2016)`` pair. Only the name is examined;
    the 2015 count itself is not inspected here. The lookup goes against
    the module-level ``file_reps_2015`` mapping.
    """
    return line[0] in file_reps_2015
def filterDems(line):
    """Return True when the record's name has a 2015 Democrat entry.

    ``line`` is a ``(name, count_2016)`` pair. Only the name is examined;
    the 2015 count itself is not inspected here. The lookup goes against
    the module-level ``file_dems_2015`` mapping.
    """
    return line[0] in file_dems_2015
def getRepublicans(line):
    """Attach the 2015 Republican count to a 2016 ``(name, count)`` record.

    Returns ``(name, count_2016, count_2015)`` with both counts converted
    to int; a count that cannot be converted becomes 0. Returns ``None``
    when the name has no entry in the module-level ``file_reps_2015``
    mapping (not expected once ``filterReps`` has been applied).
    """
    name, count = line[0], line[1]
    # the filter step already guarantees membership; this is an extra check
    if name not in file_reps_2015:
        return None
    # number of medicine for 2016 should be an int
    try:
        count = int(count)
    except (TypeError, ValueError, OverflowError):
        count = 0
    # number of medicine for 2015 should be an int
    try:
        rep_2015 = int(file_reps_2015[name])
    except (TypeError, ValueError, OverflowError):
        rep_2015 = 0
    return name, count, rep_2015
def getDemocrats(line):
    """Attach the 2015 Democrat count to a 2016 ``(name, count)`` record.

    Returns ``(name, count_2016, count_2015)`` with both counts converted
    to int; a count that cannot be converted becomes 0. Returns ``None``
    when the name has no entry in the module-level ``file_dems_2015``
    mapping (not expected once ``filterDems`` has been applied).
    """
    name, count = line[0], line[1]
    # the filter step already guarantees membership; this is an extra check
    if name not in file_dems_2015:
        return None
    # number of medicine for 2016 should be an int
    try:
        count = int(count)
    except (TypeError, ValueError, OverflowError):
        count = 0
    # number of medicine for 2015 should be an int
    try:
        dem_2015 = int(file_dems_2015[name])
    except (TypeError, ValueError, OverflowError):
        dem_2015 = 0
    return name, count, dem_2015
def main(state):
    """Join one state's 2016 records with the 2015 per-party lookups.

    Reads ``<state>_2016_folder/part*`` through the module-level
    SparkContext ``sc``, reformats every record to a ``(name, count)``
    tuple, and writes one text output per party containing
    ``(name, count_2016, count_2015)`` tuples.

    Args:
        state: prefix used for the input folder and both output paths.
    """
    # get the raw data for 2016 ->
    # reformat all records to (name, count) tuples
    records_2016 = sc.textFile(state + "_2016_folder/part*").map(getNameCount)

    # filter only democrats ->
    # get tuples name, count_2016, count_2015 ->
    # save to the Democrats output (was written to "_REPUBLICANS" by mistake)
    records_2016.filter(filterDems) \
        .map(getDemocrats) \
        .saveAsTextFile(state + "_DEMOCRATS")

    # filter only republicans ->
    # get tuples name, count_2016, count_2015 ->
    # save to the Republicans output (was written to "_DEMOCRATS" by mistake)
    records_2016.filter(filterReps) \
        .map(getRepublicans) \
        .saveAsTextFile(state + "_REPUBLICANS")
def get_file_dems_2015(file_dems_2015, state):
    """Load the 2015 Democrats file into a ``{name: count}`` dictionary.

    Each non-blank line is cut at every single non-alphanumeric character;
    field 2 is used as the name and field 6 as the count (kept as a string).

    Args:
        file_dems_2015: path of the text file to read.
        state: unused; kept so existing callers keep working.

    Returns:
        dict mapping name to count string.

    Raises:
        IndexError: a non-blank line has fewer than seven fields.
    """
    separator = re.compile(r'[^a-zA-Z0-9]')
    dems_2015 = {}
    # `with` closes the file even when a malformed line raises
    with open(file_dems_2015, "rt") as records:
        for line in records:
            # blank lines carry no record; skip them instead of crashing
            if not line.strip():
                continue
            fields = separator.split(line)
            # name: count
            dems_2015[fields[2]] = fields[6]
    return dems_2015
def get_file_reps_2015(file_reps_2015, state):
    """Load the 2015 Republicans file into a ``{name: count}`` dictionary.

    Each non-blank line is cut at every single non-alphanumeric character;
    field 2 is used as the name and field 6 as the count (kept as a string).

    Args:
        file_reps_2015: path of the text file to read.
        state: unused; kept so existing callers keep working.

    Returns:
        dict mapping name to count string.

    Raises:
        IndexError: a non-blank line has fewer than seven fields.
    """
    separator = re.compile(r'[^a-zA-Z0-9]')
    reps_2015 = {}
    # `with` closes the file even when a malformed line raises
    with open(file_reps_2015, "rt") as records:
        for line in records:
            # blank lines carry no record; skip them instead of crashing
            if not line.strip():
                continue
            fields = separator.split(line)
            # name: count
            reps_2015[fields[2]] = fields[6]
    return reps_2015
def parse_args():
    """Parse the command line and return the resulting namespace.

    The only option defined is ``--state``; it is optional, so the
    ``state`` attribute is ``None`` when the flag is not given.
    """
    cli = argparse.ArgumentParser()
    cli.add_argument('--state')
    namespace = cli.parse_args()
    return namespace
# parse the --state argument; only "OH" and "OK" are accepted
args = parse_args()
if args.state not in ("OH", "OK"):
    # without this guard `state` would stay unbound and the first use below
    # would fail with a NameError
    raise SystemExit("--state must be one of: OH, OK")
state = args.state
# the files to get the 2015 data from
dems_2015_path = state + "_Democrats"
reps_2015_path = state + "_Republicans"
# turn these data files into dictionaries; the filter*/get* functions look
# them up under these module-level names
file_dems_2015 = get_file_dems_2015(dems_2015_path, state)
file_reps_2015 = get_file_reps_2015(reps_2015_path, state)
# this calls the main process
main(state)