-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathgender_from_schedule.py
175 lines (139 loc) · 6.03 KB
/
gender_from_schedule.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
#!/usr/bin/env python
# -----------------------------------------------------------------------------
# GENDER_FROM_SCHEDULE
# Gets session/talk numbers and chair/speaker genders from schedule and
# writes output to file.
# -----------------------------------------------------------------------------
import re
import astropy.io.ascii as ascii
from numpy import *
from numpy.core.records import fromarrays as recinit
# Load the first-name lookup tables with astropy's ASCII reader.
# Column names are supplied explicitly for the three "*_uniq.csv" lists and
# lines starting with '#' are treated as comments.  "nm" / "nf" are compared
# numerically in get_gender() (presumably male / female occurrence counts --
# confirm against the CSV files).
male = ascii.read(
    "male_uniq.csv",
    names=["name", "nm"],
    comment='#',
)
female = ascii.read(
    "female_uniq.csv",
    names=["name", "nf"],
    comment='#',
)
# NOTE: `unisex` is loaded here but not referenced again in this script.
unisex = ascii.read(
    "unisex_uniq.csv",
    names=["name", "nf", "nm"],
    comment='#',
)
# extra_names.csv is read with the reader's defaults; get_gender() uses its
# "name" and "gender" columns.
extras = ascii.read("extra_names.csv")
def get_gender(name, male_names=None, female_names=None, extra_names=None):
    """Classify a first name using the male / female / extras name tables.

    Parameters
    ----------
    name : str
        First name to look up (compared for exact equality).
    male_names, female_names, extra_names : table-like, optional
        Lookup tables supporting ``table["column"]`` access that yields an
        array-like column.  They need the columns ("name", "nm"),
        ("name", "nf") and ("name", "gender") respectively.  When omitted,
        the module-level ``male``, ``female`` and ``extras`` tables are used.

    Returns
    -------
    str
        "m" or "f" when the name is only in one list, or in both lists with
        a strictly larger "nm" / "nf" value on that side;
        "x" when the name is in neither list and the extras table does not
        give "m" or "f" for it;
        "e" when the name is in both lists with equal "nm" and "nf"
        (in that case "OH NO!" is also printed so it can be checked by hand).

    Notes
    -----
    If a name occurs on several rows of a table, the first row is used.
    """
    # fall back to the tables loaded at module level
    if male_names is None:
        male_names = male
    if female_names is None:
        female_names = female
    if extra_names is None:
        extra_names = extras

    # row indices at which the name occurs in each list
    wm = where(male_names["name"] == name)[0]
    wf = where(female_names["name"] == name)[0]

    # Test the *number* of matches.  The truth value of the index array
    # itself is wrong here: a single match on row 0 is array([0]), which is
    # falsy, so the name would be treated as missing.
    in_male = len(wm) > 0
    in_female = len(wf) > 0

    if not in_male and not in_female:
        # name not found: consult the hand-maintained extras table
        gender = "x"
        wx = where(extra_names["name"] == name)[0]
        if len(wx) > 0:
            listed = str(extra_names["gender"][wx[0]])
            if listed in ("m", "f"):
                gender = listed
    elif not in_female:
        gender = "m"  # only in male
    elif not in_male:
        gender = "f"  # only in female
    else:
        # in both lists: the larger value wins
        n_male = male_names["nm"][wm[0]]
        n_female = female_names["nf"][wf[0]]
        if n_male > n_female:
            gender = "m"  # both, more male
        elif n_male < n_female:
            gender = "f"  # both, more female
        else:
            gender = "e"  # error: tie

    # want to highlight if an error happens so we can check it out
    if gender == "e":
        print("OH NO!")
    return gender
# Pre-allocated, fixed-capacity result tables.  Every slot starts with a zero
# id and an empty 25-byte string; the schedule loop fills them in order.

# sessions: up to 200 rows of (integer id, chair gender code)
sess = recinit(
    [zeros(200, dtype=int), zeros(200, dtype="S25")],
    names="id, chair",
)

# talks: up to 2000 rows of (float id, speaker gender code)
talk = recinit(
    [zeros(2000), zeros(2000, dtype="S25")],
    names="id, speaker",
)

# running state for the schedule loop:
#   scount / tcount - next free row in sess / talk
#   nw              - number of withdrawn presentations counted so far
#   withdrawn       - set when a withdrawal notice line has been seen
scount = tcount = nw = 0
withdrawn = False
# -----------------------------------------------------------------------------
# Scan the schedule: every line that looks like a talk or session header is
# paired with the line after it (which carries the speaker / chair name), the
# name is classified with get_gender(), and the result is stored in the
# `talk` / `sess` tables.
# -----------------------------------------------------------------------------

# Hand-maintained overrides, keyed by the exact name string in the schedule.
# By default the first whitespace-separated token of the name is looked up;
# for these speakers the given name below is looked up instead.
FIRST_NAME_OVERRIDES = {
    "W. P. Maksym": "Peter",
    "G. B. Berriman": "Bruce",
    "J. Pocahontas Olson": "Pocohontas",
    "S. Thomas Megeath": "Thomas",
    "D. Anish Roshi": "Anish",
    "N. J. Kasdin": "Jeremy",
    "H. P. Stahl": "Philip",
    "A. Smirnov": "Alexander",
    "S. Likhachev": "Sergey",
    "B. S. Gaudi": "Scott",
    "K.E. S. Ford": "Saavik",
    "J. T. Armstrong": "John",
    "G. F. Benedict": "George",
    "Hyunsung David Jun": "David",
    "W. N. Brandt": "Niel",
    "Myungkook J. Jee": "James",
    "F. Richard Stephenson": "Richard",
    "P. T. de Zeeuw": "Tim",
    "E. C. Krupp": "Edwin",
}

# For these speakers the result of get_gender() is replaced outright.
GENDER_OVERRIDES = {
    "Remi Soummer": "m",
    "Sasha Hinkley": "m",
    "Korey Haynes": "f",
    "Kaisey Mandel": "m",
    "Yuan Li": "f",
    "Taehyun Kim": "f",
    "Li Zeng": "m",
    "Keivan Stassun": "f",  # subs speaker f
    "Yu Lu": "m",
    "Sanlyn Buxner": "m",
}

# Header patterns, compiled once instead of once per line.
# this is what a five minute talk looks like: "NNN. NN. "
re_reg = re.compile(r'[0-9][0-9][0-9][.][ ][0-9][0-9][.][ ]')
# this is what a dissertation talk looks like: "NNN. NND. "
re_dis = re.compile(r'[0-9][0-9][0-9][.][ ][0-9][0-9][D][.][ ]')
# this is what a session looks like: "NNN. "
re_ses = re.compile(r'[0-9][0-9][0-9][.][ ]')

# read from schedule file; the context manager closes it even if reading fails
fname = "data/schedule.txt"
with open(fname, "r") as f:
    lines = f.readlines()

for i, line in enumerate(lines):

    # check for withdrawal - nothing is done with this yet beyond counting:
    # the flag is consumed (and nw incremented) at the next talk header
    if line.strip() == "This presentation has been withdrawn. Withdrawn":
        withdrawn = True

    # check if line is a session or a talk; talk patterns are tested first
    # because every talk header also matches the shorter session pattern
    if re_reg.search(line) is not None or re_dis.search(line) is not None:
        stat = "talk"
    elif re_ses.search(line) is not None:
        stat = "session"
    else:
        stat = ""

    if not stat:
        continue

    # the name is read from the following line; a header on the very last
    # line has none, so report it and move on instead of raising IndexError
    if i + 1 >= len(lines):
        print("%s: no name line follows; skipped" % line.strip())
        continue
    next_line = lines[i + 1]

    # for a talk
    if stat == "talk":

        # talk id: first two space-separated tokens joined, cut to 6
        # characters (e.g. "101. 01. ..." gives "101.01"; this assumes the
        # header starts at the beginning of the line)
        talkid = "".join(line.split(" ")[:2])[:6]

        # speaker name: text before the first ";" on the next line
        fullname = next_line.split(";")[0].strip()
        speaker = FIRST_NAME_OVERRIDES.get(fullname, fullname.split()[0])

        # get_gender() is always called (it reports ties itself); the
        # manual override, if any, then replaces its answer
        speaker_gender = get_gender(speaker)
        speaker_gender = GENDER_OVERRIDES.get(fullname, speaker_gender)

        # print when gender unknown
        if speaker_gender == "x":
            print("%s %s" % (talkid, fullname))

        if withdrawn:
            withdrawn = False
            nw += 1

        # record talk info
        talk[tcount].id = talkid
        talk[tcount].speaker = speaker_gender
        tcount += 1

    # for a session
    if stat == "session":

        # session id: first three characters of the header line
        sessid = line[:3]

        # chair name: on the next line, the text after the first ":" and
        # before any "(", of which the first word is looked up
        chair = next_line.split(":")[1].split("(")[0].strip().split()[0]
        chair_gender = get_gender(chair)

        # print when gender unknown
        if chair_gender == "x":
            print("%s %s" % (sessid, chair))

        # record session info
        sess[scount].id = sessid
        sess[scount].chair = chair_gender
        scount += 1
# Keep only the rows whose id is non-zero; the tables are allocated with
# zero ids, so this discards the slots the schedule loop never filled.
sess = sess[sess.id.nonzero()]
talk = talk[talk.id.nonzero()]

# Write each table to its own file with astropy's ASCII writer.
for table, path in ((sess, "data/sessions.dat"), (talk, "data/talks.dat")):
    ascii.write(table, output=path)