-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathgender_ratio_position.py
156 lines (112 loc) · 4.97 KB
/
gender_ratio_position.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
import numpy as np
import matplotlib.pyplot as plt
import astropy
import astropy.io.ascii as ascii
import collections
import scipy
import scipy.misc
from scipy import stats
def count_mf_questions(data_in):
    """Count the questions asked by women and by men over a set of talks.

    Parameters
    ----------
    data_in : table-like
        Object for which ``data_in['questions']`` yields one string per talk.
        Every 'F' character in a string is counted as one question asked by a
        woman and every 'M' character as one question asked by a man; any
        other characters are ignored.

    Returns
    -------
    tuple
        ``(N_f, N_m)`` -- the female count comes first.
    """
    # Look the column up once and iterate over it directly instead of
    # re-indexing the table on every loop iteration.
    questions = data_in['questions']
    N_f = sum(q.count('F') for q in questions)
    N_m = sum(q.count('M') for q in questions)
    return (N_f, N_m)
# NOTE(review): `data` is not defined in this part of the file; it must
# already exist here. It is indexed by column name ('speaker', 'questions')
# and by boolean row masks below.
# For example, print the speaker genders:
print(data['speaker'])


def _report(label, value):
    """Print ``label`` and ``value`` separated by one space.

    A single-argument print call produces the same output as the Python 2
    statement ``print label, value`` and is also valid Python 3.
    """
    print("%s %s" % (label, value))


def _ratio_f_to_m(n_f, n_m):
    """Return ``n_f / n_m`` as a float, or NaN when ``n_m`` is zero."""
    if n_m == 0:
        return float('nan')
    # Builtin float replaces np.float, a plain alias that newer NumPy removed.
    return float(n_f) / n_m


# Plot number of female and male speakers in sample
#plt.plot(np.array([0,1]), np.array([len(data[data['speaker']=='M']), len(data[data['speaker']=='F'])]), 'o')
#plt.axis([-1, 2, 0, 300])

# Boolean row masks selecting talks by speaker gender.
f_talks = data['speaker'] == 'F'
m_talks = data['speaker'] == 'M'

n_talks_f = len(data[f_talks])
n_talks_m = len(data[m_talks])
_report("Number of reported talks given by men: ", n_talks_m)
_report("Number of reported talks given by women: ", n_talks_f)
_report("Talk ratio f/m: ", _ratio_f_to_m(n_talks_f, n_talks_m))

# count female/male questions:
(N_f, N_m) = count_mf_questions(data)
_report("Number of questions asked by men: ", N_m)
_report("Number of questions asked by women: ", N_f)
_report("Questions ratio f/m: ", _ratio_f_to_m(N_f, N_m))

# questions ratio for talks given by women/men individually
(N_f_tf, N_m_tf) = count_mf_questions(data[f_talks])
(N_f_tm, N_m_tm) = count_mf_questions(data[m_talks])

print("In Talks given by women:")
_report("Number of questions asked by men: ", N_m_tf)
_report("Number of questions asked by women: ", N_f_tf)
_report("Questions ratio f/m: ", _ratio_f_to_m(N_f_tf, N_m_tf))

print("In Talks given by men:")
_report("Number of questions asked by men: ", N_m_tm)
_report("Number of questions asked by women: ", N_f_tm)
_report("Questions ratio f/m: ", _ratio_f_to_m(N_f_tm, N_m_tm))

# Histogram of the number of questions per talk, split by speaker gender.
l_f = [len(q) for q in data[f_talks]['questions']]
l_m = [len(q) for q in data[m_talks]['questions']]

plt.clf()
plt.hist(l_m, alpha=0.5)
plt.hist(l_f, alpha=0.5, color='r')
plt.legend(['male speaker', 'female speaker'])
plt.title('Questions asked per talk')
plt.savefig('questions_asked_per_talk.pdf')
# find out at which position in the talk queue women and men typically ask their question
def position_ratios(data):
    """Tally question askers by their position in each talk's question queue.

    Parameters
    ----------
    data : table-like
        Object for which ``data['questions']`` yields one string per talk;
        character ``i`` of a string is the gender ('F' or 'M') of the person
        asking the (i+1)-th question of that talk.

    Returns
    -------
    tuple
        ``(pos_numbers, N_max)``. ``N_max`` is the length of the longest
        question string (0 for an empty table). ``pos_numbers`` is a float
        array of shape ``(N_max, 5)``, one row per queue position:

        * column 0 -- number of 'F' questions at that position
        * column 1 -- number of 'M' questions at that position
        * column 2 -- f / (f + m), NaN where f + m is zero
        * columns 3, 4 -- left at zero here
    """
    questions = data['questions']
    lengths = [len(q) for q in questions]
    # max() of an empty sequence raises, so treat "no talks" explicitly.
    N_max = max(lengths) if lengths else 0

    pos_numbers = np.zeros([N_max, 5])
    # Walk every question of every talk. The previous loops stopped at
    # N_max-1 and therefore never tallied the final queue position.
    for talk_questions in questions:
        for position, asker in enumerate(talk_questions):
            if asker == 'F':
                pos_numbers[position, 0] += 1
            elif asker == 'M':
                pos_numbers[position, 1] += 1

    totals = pos_numbers[:, 0] + pos_numbers[:, 1]
    has_counts = totals > 0
    # Positions with neither 'F' nor 'M' entries have an undefined ratio.
    pos_numbers[:, 2] = np.nan
    pos_numbers[has_counts, 2] = pos_numbers[has_counts, 0] / totals[has_counts]
    return (pos_numbers, N_max)
# Build the per-position tallies for the full table of talks.
print("testing position ratios")
pos_numbers, N_max = position_ratios(data)
#print pos_numbers
# binomial errors:
def get_errors_on_ratio(pos_numbers, conf_level=0.68):
    """Fill in normal-approximation binomial bounds on the female fraction.

    For every row the half-width ``z * sqrt(p * (1 - p) / n)`` is computed
    from the counts in columns 0 (female) and 1 (male), with ``p = f / n``
    and ``n = f + m``. Column 3 is set to ``column 2 - half-width`` and
    column 4 to ``column 2 + half-width``.

    Parameters
    ----------
    pos_numbers : numpy.ndarray
        2-D float array with at least 5 columns. It is modified in place.
    conf_level : float, optional
        Two-sided confidence level of the interval. Defaults to 0.68.

    Returns
    -------
    numpy.ndarray
        The same ``pos_numbers`` array that was passed in.
    """
    # Two-sided interval: z is the standard-normal quantile at
    # 1 - (1 - conf_level) / 2, about 0.994 for 68%. The previous value,
    # 1 - 0.5 * 0.68 = 0.66, was not a normal quantile for that level.
    z = stats.norm.ppf(0.5 * (1.0 + conf_level))

    n_female = pos_numbers[:, 0]
    n_total = pos_numbers[:, 0] + pos_numbers[:, 1]

    # Rows without any counts have no defined error; they stay NaN.
    half_width = np.full(len(pos_numbers), np.nan)
    counted = n_total > 0
    p_f_est = n_female[counted] / n_total[counted]
    half_width[counted] = z * np.sqrt(p_f_est * (1.0 - p_f_est) / n_total[counted])

    pos_numbers[:, 3] = pos_numbers[:, 2] - half_width
    pos_numbers[:, 4] = pos_numbers[:, 2] + half_width
    return pos_numbers
def _likelihood_interval(n_f, n_m, max_delta_lnL):
    """Return (low, high) bounds on the female fraction from a likelihood scan.

    The binomial likelihood of ``n_f`` women among ``n_f + n_m`` askers is
    evaluated on a grid p = 0.01, 0.02, ..., 0.98. The bounds are the
    smallest and largest grid values whose log-likelihood lies within
    ``max_delta_lnL`` of the maximum on the grid.
    """
    # scipy.misc.comb is no longer available in newer SciPy releases;
    # scipy.special.comb is the same function.
    from scipy.special import comb

    n_total = n_f + n_m
    p_grid = np.arange(0.01, 0.99, 0.01)
    log_like = np.log(comb(n_total, n_f) * p_grid**n_f * (1 - p_grid)**(n_total - n_f))
    near_max = np.abs(log_like - np.max(log_like)) < max_delta_lnL
    return (np.min(p_grid[near_max]), np.max(p_grid[near_max]))


# Normal-approximation bounds first (modifies pos_numbers in place); the
# likelihood-based bounds below then overwrite columns 3 and 4.
get_errors_on_ratio(pos_numbers)

# The original author dropped the last row because it came out empty from
# position_ratios; the plot below covers queue positions 1 .. N_max-1 only.
pos_numbers = pos_numbers[0:-1]

# Get binomial errors where the normal approximation does not work
# (i.e. where p approaches 0 or 1).
# NOTE(review): row 7 is hard-coded as the bin with zero women for the
# dataset this was written for -- confirm if the input data changes.
zero_women_row = 7
_, high = _likelihood_interval(pos_numbers[zero_women_row, 0],
                               pos_numbers[zero_women_row, 1], 1.)
# With no women in the bin the lower bound is pinned to zero and only the
# upper bound is taken from the scan (using the wider threshold of 1).
pos_numbers[zero_women_row, 3] = 0.
pos_numbers[zero_women_row, 4] = high

# The rest of the bins get a two-sided interval with a threshold of 0.5.
for row in range(len(pos_numbers)):
    if row == zero_women_row:
        continue
    low, high = _likelihood_interval(pos_numbers[row, 0], pos_numbers[row, 1], 0.5)
    pos_numbers[row, 3] = low
    pos_numbers[row, 4] = high

np.set_printoptions(precision=3)
np.set_printoptions(suppress=True)
print(pos_numbers)

# Plot the female fraction per queue position with asymmetric error bars
# (distance from the ratio down to the lower bound, and up to the upper).
ratio = pos_numbers[:, 2]
plt.clf()
plt.errorbar(np.arange(1, N_max), ratio,
             yerr=(ratio - pos_numbers[:, 3], pos_numbers[:, 4] - ratio), lw=2)
plt.xlabel('Position in queue of questions', fontsize=14)
plt.ylabel('Gender ratio f/(m+f)', fontsize=14)
plt.axis([0, 11, -0.25, 1])
plt.savefig('gender_ratio_by_position_in_question_queue.pdf')