-
Notifications
You must be signed in to change notification settings - Fork 1
/
Exploration.R
147 lines (108 loc) · 4.54 KB
/
Exploration.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
library(ggplot2)
library(dplyr)
library(RSQLite)
setwd("C:/Kaggle-collegescorecard/")
my_db <- src_sqlite("database.sqlite")
src_tbls(my_db)
tbl <- tbl(my_db, "Scorecard")
#building a smaller working set from the database for exploration
smaller <- sample(1:124699, 5000)
small_tbl <- tbl %>% filter(rowid %in% smaller)
#start data wrangling
#first step is to make the column names more legible using the data dictionary provided
data_dictionary <- read.csv("CollegeScorecardDataDictionary-09-12-2015.csv")
dim(data_dictionary)
#has more rows than there are variables in the scorecard table
#not sure what causes this, need to write a more robust method to rename
#let's look for matches in the current names cross referenced with the data dictionary
#replace those we have a match for
#in every row where a variable name occurs in the data_dictionary
#with a corresponding developer.friendly.name we can take that
current_colnames <- colnames(small_tbl)
for(i in 1:length(current_colnames)) {
location <- which(data_dictionary$VARIABLE.NAME == current_colnames[i])
if(length(location) == 0) {next}
new_name <- as.character(data_dictionary[location, 4])
print(i)
if(new_name != "") {current_colnames[i] <- new_name}
}
head(current_colnames)
#insofar as I can tell, this usual assignment doesn't work
#perhaps has to do with the fact that it's a database
#however rename works
dimnames(small_tbl) <- current_colnames
dimnames(small_tbl)
updated_colnames <- current_colnames
current_colnames <- colnames(small_tbl)
new_name_list <- ""
new_test <- sapply(updated_colnames, sub, pattern = "-", replacement="to")
new_test <- as.character(new_test)
test_rename <- small_tbl
for(i in 1:length(current_colnames)) {
if(i < length(current_colnames) && i %% 150 != 0) {
new_name_list <- paste(new_name_list, new_test[i], "=", current_colnames[i], ", ")
}
else {
new_name_list <- paste(new_name_list, new_test[i], "=", current_colnames[i])
}
print(i)
if(i %% 150 == 0) {
Encoding(new_name_list) <- "UTF-8"
eval(parse(text =
paste("test_rename <- rename(small_tbl,", new_name_list, ") ")
))
new_name_list <- ""
}
}
for(i in 1:length(current_colnames)) {
print(i)
eval(parse(text =
paste("test_rename <- rename(small_tbl,", updated_colnames[i], "=", current_colnames[i], ") ")
))
}
eval(parse(text =
paste("test_rename <- rename(small_tbl,",
substr(new_name_list, 1, 43), ") ")
))
#([^,]+,){1,10}
#eval
#assign
#start paring down variables, too many to deal with for now
#some initial exploratory graphs
typeof(tbl)
test <- collect(select(small_tbl, state = STABBR))
ggplot(aes(x = STABBR), data = test) +
geom_histogram()
small_tbl collect(small_tbl)
colnames(test) <- current_colnames
earnings <- collect(select(small_tbl, q50earnings = md_earn_wne_p10,
q10earnings = pct10_earn_wne_p10,
q25earnings = pct25_earn_wne_p10,
q75earnings = pct75_earn_wne_p10,
q90earnings = pct90_earn_wne_p10))
year_2011 <- filter(tbl, Year == 2011)
year_2011 <- collect(year_2011)
year_2013 <- filter(tbl, Year == 2013)
year_2013 <- collect(year_2013)
earnings_2011 <- select(year_2011, q50earnings = md_earn_wne_p10,
q10earnings = pct10_earn_wne_p10,
q25earnings = pct25_earn_wne_p10,
q75earnings = pct75_earn_wne_p10,
q90earnings = pct90_earn_wne_p10)
states_2011 <- select(year_2011, states = STABBR)
schools_2011 <- select(year_2011, school_name = INSTNM)
carnegie_class_2013 <- select(year_2013, carnegie_class = CCBASIC)
#filter out all special vocation schools
filter(carnegie_class_2013, !grepl("special", ignore.case = TRUE, carnegie_class))
cbind(schools_2011, states_2011)
schools_earnings_2011 <- cbind(schools_2011, earnings_2011)
schools_earnings_2011 <- schools_earnings_2011 %>% filter(!is.na(q50earnings), q50earnings != 0)
s_e_11 <- schools_earnings_2011
s_e_11 <- s_e_11 %>% filter(q10earnings != 0)
s_e_11 <- s_e_11 %>% arrange(desc(q50earnings))
ggplot(s_e_11[1:20,], aes(x = school_name, ymin = q10earnings, lower = q25earnings, middle = q50earnings, upper= q75earnings, ymax = q90earnings)) +
geom_boxplot(stat="identity") +
coord_flip()
#ymin=e10, lower=e25, middle=e50, upper=e75, ymax=e90
#Some things to explore:
#correlation between ACT/SAT test scores and earnings