-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathdata_joins.py
165 lines (139 loc) · 5.81 KB
/
data_joins.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
import os
from collections import defaultdict

import numpy as np
import pandas as pd
from fuzzywuzzy import process
def grab_pair_assignment_data():
    '''
    Load the pair assignment sheet.

    OUTPUT: Pandas dataframe read from '../repo-name.csv', where each row is
            a pair programming pair
    '''
    roster_path = '../repo-name.csv'
    roster = pd.read_csv(roster_path)
    return roster
def grab_length_data(directory):
    '''
    Load the scraped Github file lengths.

    INPUT: directory (string) - accepted by the signature, but the body does
           not read it; the CSV path below is fixed
    OUTPUT: Pandas Dataframe of scraped Github files and respective lengths,
            read from '../scraped_github_data.csv'
    '''
    scraped_path = '../scraped_github_data.csv'
    scraped = pd.read_csv(scraped_path)
    return scraped
def grab_final_data():
    '''
    Load the per-student spreadsheet.

    OUTPUT: Pandas Dataframe of assessment and experience data for each
            student, read from '../../galvanize_data.xlsx'
    '''
    workbook_path = '../../galvanize_data.xlsx'
    students = pd.read_excel(workbook_path)
    return students
def _pair_checker(filename):
    '''
    find_relevant_rows helper function
    INPUT: filename (string)
    OUTPUT: 1 if 'pair' appears anywhere in the filename (compared
            case-insensitively), 0 if not
    '''
    lowered = filename.lower()
    return 1 if 'pair' in lowered else 0
def find_relevant_rows(df):
    '''
    Keep only the rows whose filename tags them as pair assignments.

    INPUT: Dataframe with a 'filename' column. The frame is modified in
           place: 'filename' is converted to strings and an 'is_pair'
           column of 1/0 flags is added.
    OUTPUT: Dataframe, only with rows that we have tagged as pair assignments
            (the filename contains 'pair', compared case-insensitively)
    '''
    column_name = 'filename'
    # Stringify first so non-string cells (e.g. NaN) can be lower-cased.
    filenames = df[column_name].apply(str)
    df[column_name] = filenames
    df['is_pair'] = [int('pair' in name.lower()) for name in filenames]
    # .loc replaces the .ix indexer, which current pandas no longer provides.
    return df.loc[df['is_pair'] == 1, :]
def _get_name_dict(df):
    '''
    Build a lookup from real names to github usernames.

    INPUT: Dataframe with columns of names ('Name') and github usernames
           ('github')
    OUTPUT: dictionary (defaultdict(str)) with keys of real names and values
            of github usernames; unknown keys therefore map to ''
    '''
    name_dict = defaultdict(str)
    for name, username in zip(df['Name'], df['github']):
        # NaN is truthy, so a plain truthiness test would let missing names
        # through as keys; check for missing values explicitly as well.
        if pd.notnull(name) and name:
            name_dict[name] = username
    return name_dict
def change_name_to_ghu(df1, df2):
    '''
    Replace the real names in df2 with the closest-matching github usernames.

    INPUT: Dataframe (df1) that has columns of names and github usernames,
           Dataframe (df2) of name data that needs to be converted; its
           'Name1', 'Name2' and 'Name3' columns are overwritten in place
    OUTPUT: Modified df2 with names changed to github usernames (the same
            object that was passed in)
    '''
    name_dict = _get_name_dict(df1)
    # Materialise the keys once so the matcher always gets a plain list.
    name_list = list(name_dict)

    def to_username(raw_name):
        # extractOne returns a (best_choice, score) pair; keep the choice.
        best_match = process.extractOne(raw_name, name_list)[0]
        return name_dict[best_match]

    for idx, row in df2.iterrows():
        df2.at[idx, 'Name1'] = to_username(row['Name1'])
        df2.at[idx, 'Name2'] = to_username(row['Name2'])
        name3 = row['Name3']
        # A missing third partner is NaN, which is truthy: without the
        # explicit null check the text 'nan' would be fuzzy-matched to some
        # student and every pair would gain a bogus third member.
        if pd.notnull(name3) and name3:
            df2.at[idx, 'Name3'] = to_username(str(name3))
    return df2
def _matching_lengths(length_df, username, repo):
    '''
    Return the rows of length_df scraped for `username` in `repo`.
    '''
    is_match = (length_df.gh_username == username) & (
        length_df.repo_name == repo)
    return length_df.loc[is_match]


def filler(final_df, repo, name1, name2, name3=None, length_df=None):
    '''
    Record the pair assignment length for one student and one repo.

    The scraped lengths are searched for a submission by name1, then name2,
    then name3; the first student with any rows wins. The mean file_length
    of those rows is written into the `repo` column of final_df for the row
    whose `github` equals name1 (the value is always assigned to name1,
    even when it was found under a partner).

    INPUT: final_df: dataframe with students as rows and a 'github' column,
           modified in place
           repo: repo name, used both as lookup value and as column name
           name1: github username receiving the value
           name2, name3: partner usernames used as fallbacks (name3 optional)
           length_df: dataframe with 'gh_username', 'repo_name' and
           'file_length' columns; when omitted, the module-level
           `pair_length_df` is used
    OUTPUT: None; prints a warning when the submission spans several files
            and an error line when no partner has a submission
    '''
    if length_df is None:
        # Backward compatibility: callers that do not pass a frame rely on
        # the module-level global created by the script entry point.
        length_df = pair_length_df

    for candidate in (name1, name2, name3):
        if candidate is None:
            continue
        submitted = _matching_lengths(length_df, candidate, repo)
        if submitted.empty:
            continue
        if repo not in final_df.columns:
            # Create the column up front so the masked assignment below
            # never has to enlarge the frame.
            final_df[repo] = float('nan')
        final_df.loc[final_df.github == name1, repo] = \
            submitted.file_length.mean()
        if submitted.shape[0] != 1:
            # Set off warning if pair programming submission consists of
            # multiple files
            print("Warning: {}: {} consists of multiple files".format(
                candidate, repo))
        return

    print("Error: No pair programming values found for {}, repo: {}".format(
        name1, repo))


def _isNotNan(obj):
    '''
    Helper function that checks for non-missing values

    NaN is the only value that does not compare equal to itself, so the
    self-comparison is False exactly for NaN.
    '''
    return obj == obj


def fill_lengths(pairdf, pair_lengthdf, final_df):
    '''
    INPUT: pairdf: dataframe with pair assignments for a given repo and cohort
           (columns 'repo', 'Name1', 'Name2', 'Name3'),
           pair_lengthdf: dataframe with scraped data from Github featuring the length
           of pair assignments, finaldf: dataframe with students as rows
    OUTPUT: modified finaldf with pair assignment lengths filled in (the
            frame is updated in place and also returned)
    '''
    for _, pairing in pairdf.iterrows():
        repo = pairing['repo']
        name1 = pairing['Name1']
        name2 = pairing['Name2']
        name3 = pairing['Name3']
        # Each tuple is (student receiving the value, first fallback partner,
        # second fallback partner).
        if _isNotNan(name3):
            lookups = [
                (name1, name2, name3),
                (name2, name1, name3),
                (name3, name2, name1),
            ]
        else:
            lookups = [
                (name1, name2, None),
                (name2, name1, None),
            ]
        for student, partner, other in lookups:
            filler(final_df, repo, student, partner, other,
                   length_df=pair_lengthdf)
    return final_df
def write_to_csv(df):
    '''
    Save the finished sheet to '../final_sheet.csv'.

    INPUT: Dataframe to be written to file
    '''
    output_path = '../final_sheet.csv'
    df.to_csv(output_path)
if __name__ == "__main__":
    # Load the pair roster, the scraped file lengths (pair files only) and
    # the per-student sheet.
    pair_df = grab_pair_assignment_data()
    # NOTE: keep this name at module level -- filler() looks up
    # `pair_length_df` as a global.
    pair_length_df = find_relevant_rows(grab_length_data(os.getcwd()))
    final_df = grab_final_data()

    # Translate the roster's real names into github usernames.
    pair_df = change_name_to_ghu(final_df, pair_df)
    # Single-argument print(...) behaves the same on Python 2 and Python 3.
    print("*** Github names changed! ***")

    # Fill one column per repo in final_df, then save the result.
    fill_lengths(pair_df, pair_length_df, final_df)
    print("*** Lengths inputted! ***")
    write_to_csv(final_df)