-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathgenderstats.py
122 lines (102 loc) · 4.27 KB
/
genderstats.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
# Run from within IPython.
# Assumes you've already run read_speaker_data_and_stuff.py (this script uses
# `data` from the current namespace without defining it).
# Run using execfile("genderstats.py")
import re

import numpy as np
from scipy.stats import ks_2samp
# Talk counts: total number of recorded talks and the split by speaker code.
# `data` is not defined in this file; it must already exist in the namespace.
numtalks = len(data)
mtalks = len(data[data["speaker"] == "M"])
ftalks = len(data[data["speaker"] == "F"])
# Single-argument print(...) produces the same text under Python 2 and 3.
# Percentages fall back to 0.0 when there are no talks, instead of raising
# ZeroDivisionError.
print("Total Number of Talks Recorded: %d" % numtalks)
print("Talks Given by Men: %d (%.1f%%)"
      % (mtalks, float(mtalks) / numtalks * 100 if numtalks else 0.0))
print("Talks Given by Women: %d (%.1f%%)"
      % (ftalks, float(ftalks) / numtalks * 100 if numtalks else 0.0))
print("")
# Question totals: concatenate every talk's "questions" string and count the
# asker codes. str.join builds the string in one pass instead of re-copying it
# on every loop iteration.
qstring = ''.join(data["questions"])
# numqs counts every character, so any code other than "M"/"F" still adds to
# the total even though it is in neither per-gender count.
numqs = len(qstring)
mqs = qstring.count("M")
fqs = qstring.count("F")
# Percentages fall back to 0.0 when no questions were recorded, instead of
# raising ZeroDivisionError. The double space after "Total Questions:" matches
# the original output.
print("Total Questions:  %d" % numqs)
print("Questions by Males: %d (%.1f%%)"
      % (mqs, float(mqs) / numqs * 100 if numqs else 0.0))
print("Questions by Females: %d (%.1f%%)"
      % (fqs, float(fqs) / numqs * 100 if numqs else 0.0))
print("")
#first question
# Tally who asked the first question of each talk.
mfirstq = 0
ffirstq = 0
for questions in data["questions"]:
    if not questions:
        # A talk with no recorded questions has no first asker; indexing [0]
        # on it would raise IndexError.
        continue
    if questions[0] == "M":
        mfirstq += 1
    else:
        # As before, any first code other than "M" is tallied as a woman.
        ffirstq += 1
# Fractions are taken over the talks that actually had a first question. This
# equals the total number of talks whenever every talk has at least one
# question, and falls back to 0.0 when there are none.
firstq_talks = mfirstq + ffirstq
mfraction = float(mfirstq) / firstq_talks if firstq_talks else 0.0
ffraction = float(ffirstq) / firstq_talks if firstq_talks else 0.0
# mfraction and ffraction are reused by the n-gram sections further down.
print("First Question:")
print("Men: %d (%.1f%%)" % (mfirstq, mfraction * 100))
print("Women: %d (%.1f%%)" % (ffirstq, ffraction * 100))
print("")
#two point function time
# Count every adjacent pair of asker codes within each talk and compare the
# observed counts with the counts expected from mfraction/ffraction.
# NOTE(review): mfraction/ffraction come from the first-question tally, not
# from the overall question shares (mqs/numqs) -- confirm that is the intended
# baseline for the expected counts.
two_grams = ("MM", "MF", "FM", "FF")
# Zero-width lookaheads match at every start position, so overlapping
# occurrences (e.g. "MM" twice in "MMM") are all counted. Compiled once here
# rather than looked up on every talk.
two_gram_regexes = [(gram, re.compile("(?=" + gram + ")")) for gram in two_grams]
two_gram_counts = dict.fromkeys(two_grams, 0)
for questions in data["questions"]:
    for gram, regex in two_gram_regexes:
        two_gram_counts[gram] += len(regex.findall(questions))
# Keep the individual counters available under their original names.
tpmm = two_gram_counts["MM"]
tpmf = two_gram_counts["MF"]
tpfm = two_gram_counts["FM"]
tpff = two_gram_counts["FF"]
tptotal = tpmm + tpmf + tpfm + tpff
print("Two-Gram Total: %d" % tptotal)
for gram in two_grams:
    observed = two_gram_counts[gram]
    # Share falls back to 0.0 when no talk has two or more questions, instead
    # of raising ZeroDivisionError.
    share = float(observed) / tptotal * 100 if tptotal else 0.0
    # Expected count: total * mfraction^(#M in gram) * ffraction^(#F in gram).
    expected = int(round(
        tptotal * mfraction ** gram.count("M") * ffraction ** gram.count("F")))
    print("Two-Gram %s: %d (%.1f%%); Expected: %d"
          % (gram, observed, share, expected))
print("")
#three-gram time
# Count every run of three consecutive asker codes within each talk and
# compare the observed counts with those expected from mfraction/ffraction.
# NOTE(review): mfraction/ffraction come from the first-question tally, not
# from the overall question shares (mqs/numqs) -- confirm that is the intended
# baseline for the expected counts.
# The tuple order is the order in which the results are printed.
three_grams = ("MMM", "MMF", "MFM", "FMM", "MFF", "FMF", "FFM", "FFF")
# Zero-width lookaheads match at every start position, so overlapping
# occurrences are all counted. Compiled once here rather than per talk.
three_gram_regexes = [(gram, re.compile("(?=" + gram + ")"))
                      for gram in three_grams]
three_gram_counts = dict.fromkeys(three_grams, 0)
for questions in data["questions"]:
    for gram, regex in three_gram_regexes:
        three_gram_counts[gram] += len(regex.findall(questions))
# Keep the individual counters available under their original names.
tgmmm = three_gram_counts["MMM"]
tgmmf = three_gram_counts["MMF"]
tgmfm = three_gram_counts["MFM"]
tgfmm = three_gram_counts["FMM"]
tgmff = three_gram_counts["MFF"]
tgfmf = three_gram_counts["FMF"]
tgffm = three_gram_counts["FFM"]
tgfff = three_gram_counts["FFF"]
tgtotal = tgmmm + tgmmf + tgmfm + tgfmm + tgmff + tgfmf + tgffm + tgfff
print("Three-Gram Total: %d" % tgtotal)
for gram in three_grams:
    observed = three_gram_counts[gram]
    # Share falls back to 0.0 when no talk has three or more questions,
    # instead of raising ZeroDivisionError.
    share = float(observed) / tgtotal * 100 if tgtotal else 0.0
    # Expected count: total * mfraction^(#M in gram) * ffraction^(#F in gram).
    expected = int(round(
        tgtotal * mfraction ** gram.count("M") * ffraction ** gram.count("F")))
    print("Three-Gram %s: %d (%.1f%%); Expected: %d"
          % (gram, observed, share, expected))
print("")
#talks with only one question
# Collect the talks whose "questions" string has exactly one character and
# tally the code of that single question.
single_question_talks = [q for q in data["questions"] if len(q) == 1]
singleq = ''.join(single_question_talks)
sqnum = len(single_question_talks)
msq = singleq.count("M")
fsq = singleq.count("F")
# Fractions fall back to 0.0 when no talk had exactly one question, instead of
# raising ZeroDivisionError.
msqfrac = float(msq) / sqnum if sqnum else 0.0
fsqfrac = float(fsq) / sqnum if sqnum else 0.0
print("Talks with Only One Question: %d" % sqnum)
print("Male: %d (%.1f%%)" % (msq, msqfrac * 100))
print("Female: %d (%.1f%%)" % (fsq, fsqfrac * 100))
print("")
#Do men or women get asked more questions?
# Number of questions per talk, split by speaker code, compared with a
# two-sample Kolmogorov-Smirnov test.
# `np` must be in scope for the two lines below (numpy imported at the top of
# the file, or supplied by the interactive namespace).
fsqnums = np.array([len(q) for q in data[data["speaker"] == "F"]["questions"]])
msqnums = np.array([len(q) for q in data[data["speaker"] == "M"]["questions"]])
print("Do women and men get asked different numbers of questions?")
# str() rather than %-formatting: the KS result is a tuple, which "%s" % result
# would try to unpack as separate format arguments.
print("KS Test Results (KS-stat,p-value): " + str(ks_2samp(fsqnums, msqnums)))