-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathdata_cleaning.R
342 lines (281 loc) · 13.8 KB
/
data_cleaning.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
library(data.table)
library(tidyverse)
library(datasets) # for state abbreviations
# Load in the batting and pitching data
batting <- fread('Data/batting_stats.csv')
pitching <- fread('Data/pitching_stats.csv')
# Change the variable name for WAR so that it matches in both data frames
## Our data give us FanGraphs WAR commonly denoted fWAR
batting <-
batting %>%
rename(fWAR = `L-WAR`,
fg_playerID = IDfg)
pitching <-
pitching %>%
rename(fWAR = WAR,
fg_playerID = IDfg)
# View the structure of the batting data
# glimpse(batting)
# View the structure of the pitching data
# glimpse(pitching)
# We want to subset the columns which have less than 50% of values as NA values
bat_columns <-
which(batting[, lapply(.SD, function(x) {sum(is.na(x)) / length(x)})] <= 0.5)
# Keep only columns of interest for all players in the data
batting <-
batting %>%
select(all_of(bat_columns), -Events, -Dol, -starts_with("GB"),
-starts_with("LD"), -starts_with("FB"), -c(Pitches:BUH),
-c(`IFFB%`:`BUH%`), -Age, -Team, -IFFB)
# We want to subset the columns which have less than 50% of values as NA values
pitch_columns <-
which(pitching[, lapply(.SD, function(x) {sum(is.na(x)) / length(x)})] <= 0.5)
# Keep only columns of interest for all players in the data
pitching <-
pitching %>%
select(all_of(pitch_columns), -Team, -Age, -c(GB:BUH), -c(`GB/FB`:xFIP),
-c(`-WPA`:`WPA/LI`), -c(`FB% 2`:`SwStr%`), -SD, -MD, -SIERA, -`RS/9`,
-c(`O-Swing% (sc)`:`FDP-Wins`), -c(`Pull%`:`Hard%`),
-c(`O-Swing% (pi)`:`Pace (pi)`), -c(`LD%+`:`CSW%`), -`xFIP-`) %>%
replace_na(list(BS = 0, HLD = 0))
rm(bat_columns, pitch_columns)
# Load the MLB Amateur Draft information
draft_info <- fread('Data/draft_data.csv')
# View the structure of the draft information
## 67,903 rows by 84 columns
# glimpse(draft_info)
## 67,903 rows by 51 columns
draft_info <-
draft_info %>%
# Add year columns to match up with Chadwick
mutate(mlb_played_first = year(person_mlb_debut_date),
high_school = ifelse(str_detect(school_name, "\\sHS") == TRUE, "Yes",
"No"),
person_height = str_replace_all(person_height, "\"$", "")) %>%
replace_na(list(home_state = "None", mlb_played_first = 0)) %>%
mutate(mlb_played_last = ifelse(mlb_played_first == 0, 0,
year(person_last_played_date))) %>%
replace_na(list(mlb_played_last = 2023)) %>%
# Remove variables of no interest
select(-person_link, -c(person_use_name:person_gender),
-c(person_name_slug:person_init_last_name),
-c(person_name_matrilineal:home_city), -c(home_country, school_state),
-c(person_xref_ids:person_death_country), -person_name_title, -team_id,
-person_name_suffix, -c(team_link:team_spring_league_abbreviation),
-headshot_link, home_state)
# Load the Chadwick baseball player info
## 482,259 rows by 40 columns
chadwick <- fread('Data/chadwick_data.csv')
# Update Chadwick for players who were drafted after
## 109,609 rows by 21 columns
chadwick <-
chadwick %>%
select(-c(pro_managed_first:mlb_umpired_last), -name_suffix,
-name_matrilineal, -key_person, -key_uuid, -key_retro,
-key_bbref_minors, -c(key_npb:key_wikidata), -key_bbref) %>%
# Add variables to match up with Draft info
mutate(person_birth_date =
str_c(birth_year, birth_month, birth_day,
sep="-"),
person_full_fml_name =
str_c(name_given, name_last, sep = " "),
person_full_name =
str_c(name_first, name_last, sep = " ")) %>%
rename(person_id = key_mlbam,
fg_playerID = key_fangraphs) %>%
arrange(mlb_played_first) %>%
drop_na(person_id) # Remove players we don't have an identifier for
# missing height is formatted as 0' 0" (0 feet 0 inches tall)
missing_height <- draft_info$person_height[57]
## 47,162 rows by 55 columns
draft_info <-
draft_info %>%
# add the chadwick player info, particularly the `fg_playerID`
inner_join(chadwick %>%
select(-person_full_name, -person_birth_date, -person_full_fml_name,
-mlb_played_first, -mlb_played_last),
by = "person_id") %>%
# Remove unnecessary variables
select(-person_first_name, -person_last_name, -person_primary_number,
-person_current_age, -c(person_name_first_last:person_strike_zone_bottom),
person_primary_position_code, person_primary_position_type,
person_bat_side_description, person_pitch_hand_description, -name_last,
-name_first) %>%
filter(person_height != missing_height)
# glimpse(draft_info)
# glimpse(chadwick)
rm(chadwick)
# Get the indices for the elements which contain a state abbreviation
# this applies to High School players who have states listed in format of either
# (CA) or (, CA) (e.g. CA = California)
draft_index <- which(str_detect(draft_info$school_name, pattern = "\\((.*?)\\)"))
length(draft_index) # 13,579 elements
# List with all the match results for the pattern "(letters)"
draft_list <- str_match_all(draft_info$school_name, pattern = "\\((.*?)\\)")
# Extracts the text within the "()", and stores it as a vector
draft_states <- unlist(lapply(draft_list, function(x) {x[, 2]}))
rm(draft_list) # Remove the list form of the home states
# removes any results where there was an NA result generated
draft_states <- draft_states[!is.na(draft_states)]
length(draft_states) # 13,579 elements, same as home_index
# Get the index for the draft_stats which contain the pattern ", "
comma_index <- which(str_detect(draft_states, ",\\s"))
length(comma_index) # 6,275 elements
# Extract the states with the pattern ", " from `draft_states`
comma_states <- draft_states[comma_index]
length(comma_states) # 6,275 elements
# Determine if there are any incorrect patterns
wrong <- comma_states[which(!str_detect(comma_states, ", ([:alpha:]+)"))]
# Replace the pattern within the `comma_states` manually
comma_states[which(!str_detect(comma_states, ", ([:alpha:]+)"))] <- ", FL"
# Extract the letters which follow the ", " pattern into a vector
comma_states <-
str_match_all(comma_states, ", ([:alpha:]+)") %>%
lapply(function(x) {x[, 2]})
# There was an error, so it was detecting Louis and MO
comma_states[[2525]] <- comma_states[[2525]][2]
# Unlist it into a single vector
comma_states <- comma_states %>% unlist()
# Replace the comma states with the correct abbreviations
draft_states[comma_index] <- comma_states
# Create a new variable home_state
draft_info <- draft_info %>% mutate(home_state = person_birth_state_province)
# Store all the draft_states to the new `draft_state` variable within `draft_info`
draft_info$home_state[draft_index] <- draft_states
rm(draft_index, draft_states, wrong)
# wrong abbreviations indices
wrong_index <- which(nchar(draft_info$home_state) > 2)
# Find the wrong abbreviations
wrong_abbrev <- draft_info$home_state[wrong_index]
# Data frame for state_data
state_data <- data.frame(state_name = state.name, state_abb = state.abb)
state_names_wrong <-
data.frame(state_name = wrong_abbrev[which(tolower(wrong_abbrev) %in% tolower(state.name))]) %>%
inner_join(state_data, by = "state_name")
# Fix the wrong abbreviations
wrong_abbrev[which(tolower(wrong_abbrev) %in% tolower(state.name))] <- state_names_wrong$state_abb
# For those players who didn't go to high school
no_hs <- draft_info %>%
slice(wrong_index) %>%
filter(high_school == "No")
# Improperly formatted states
wrong_states <-
wrong_abbrev[which(!(tolower(wrong_abbrev) %in% tolower(no_hs$home_state)))]
# Find the unique improperly formatted states
wrong <- unique(wrong_states) %>% sort()
correct_states <- c("AL", "ALB", "AZ", "AZ", "AR", "AR", "GA", "MD", "MI", "AL",
"CA", "CA", "CA", "Canada", "Canada", "IL",
"Christian Brothers", "OH", "Coahuila", "CO", "CO", "CT", "CA",
"DE", "CO", "FL", "FL", "FL", "FL", "Germany", "HI", "Holland",
"HI", "TX", "IA", "IL", "IL", "IL", "IL", "IN", "IN", "IA",
"KS", "NV", "CA", "IA", "MAN", "MA", "FL", "FL", "MI", "MI",
"MI", "MN", "MS", "CA", "NC", "NE", "NJ", "NJ", "VA", "NY",
"NY", "OH", "OK", "OK", "OK", "OR", "PA", "PA", "PA", "PA",
"AZ", "CA", "PA", "PA", "TX", "CA", "CA", "CA", "SASK",
"St Thomas", "TN", "TX", "Trabuco Hills", "Trinity International",
"TX", "TX", "US Virgin Islands", "Ven", "WA", "WI", "OH", "WV",
"Zacatecas")
# Corrected abbreviations
corrections <- data.frame(wrong = wrong, correct = correct_states)
# Correct the wrong_states abbreviations with the correct ones, unless not in
# U.S.A.
wrong_states <- ifelse(wrong_states %in% corrections$wrong, corrections$correct, wrong_states)
# Update wrong_abbrev object with corrected abbreviations
wrong_abbrev[which(!(tolower(wrong_abbrev) %in% tolower(no_hs$home_state)))] <- wrong_states
# Correct leftover wrong_abbrev
correct_abbrev <- ifelse(wrong_abbrev %in% corrections$wrong, corrections$correct, wrong_abbrev)
# Add the correct_abbrev to the draft_info
draft_info$home_state[wrong_index] <- correct_abbrev
draft_info$home_state <- tolower(draft_info$home_state) # For matching up
# Create a list with all drafted player height
height_list <- str_match_all(draft_info$person_height, "\\d+")
# Convert the first element from feet to inches
height_ft <- as.integer(unlist(lapply(height_list, function(x) {x[1]}))) * 12
height_in <- as.integer(unlist(lapply(height_list, function(x) {x[2]}))) + as.integer(height_ft)
rm(height_list, height_ft) # Remove the height_list
# Convert the person_height variable to inches
draft_info$person_height <- height_in
rm(comma_index, comma_states, correct_abbrev, correct_states, height_in,
missing_height, wrong, wrong_abbrev, wrong_index, wrong_states, state_data,
state_names_wrong, no_hs, corrections)
# The draft info for the batters
## Separate from the stats because some players are drafted into the MLB
## multiple times. (e.g. Barry Bonds in 2nd round of 1982, and first round 1985)
bat_draft <-
batting %>%
inner_join(draft_info, by = "fg_playerID") %>%
filter(person_primary_position_name != "Pitcher")
rm(batting) # remove the batting data
# Only the distinct instances of batter's draft info, so only one instance for
# each fg_playerID
bat_stats <-
bat_draft %>%
# For picks whose rounds are none numerical we will designate them with a 1000
# to ensure it's clear that it was a supplemental draft round
mutate(pick_round = ifelse(is.na(as.numeric(pick_round)), 1000,
as.numeric(pick_round))) %>%
group_by(person_id) %>%
slice(which.min(pick_round)) %>% # choose the lower round (the better pick)
ungroup() %>% # ungroup
distinct(person_id, .keep_all = T) %>% # distinct occurrences
arrange(desc(fWAR))
# Removes players who do not have a person_birth_state_province
# e.g. Albert Pujols is from Santo Domingo, Dominican Republic
bat_stats <-
bat_stats %>%
drop_na(home_state)
# Select only those with abbreviations with 2 letters, all those with more
# are international players so they do not have a `home_state`
bat_stats <-
bat_stats %>%
filter(home_state %in% tolower(state.abb))
# Full Draft information for pitcher's
pitch_draft <-
pitching %>%
inner_join(draft_info, by = "fg_playerID") %>%
filter(person_primary_position_name == "Pitcher")
rm(pitching) # Remove pitching data
# Career Stats for each person from the draft information
pitch_stats <-
pitch_draft %>%
# For picks whose rounds are none numerical we will designate them with a 1000
# to ensure it's clear that it was a supplemental draft round
mutate(pick_round = ifelse(is.na(as.numeric(pick_round)), 1000,
as.numeric(pick_round))) %>%
group_by(person_id) %>%
slice(which.min(pick_round)) %>% # choose the lower round (the better pick)
ungroup() %>% # ungroup
distinct(person_id, .keep_all = T) %>% # distinct occurrences
arrange(desc(fWAR))
# Removes players who do not have a person_birth_state_province
# e.g. Albert Pujols is from Santo Domingo, Dominican Republic
pitch_stats <-
pitch_stats %>%
drop_na(home_state)
# Select only those with abbreviations with 2 letters, all those with more
# are international players so they do not have a `home_state`
pitch_stats <-
pitch_stats %>%
# Remove abbreviations for those such as the Canadians with province such as
# Ontario (ON)
filter(home_state %in% tolower(state.abb))
# Create csv files for draft information
# write_csv(draft_info, 'Data/clean_draft.csv')
# write_csv(bat_draft, 'Data/batters/batters_full_draft.csv')
# write_csv(bat_stats, 'Data/batters/batters_draft_stats.csv')
# write_csv(pitch_draft, 'Data/pitchers/pitchers_full_draft.csv')
# write_csv(pitch_stats, 'Data/pitchers/pitchers_draft_stats.csv')
# write_csv(bat_stats %>% select(fg_playerID, person_id, person_full_name, fWAR, pick_round,
# pick_number, year, person_birth_state_province,
# person_height, person_weight, home_state,
# person_primary_position_abbreviation,
# person_bat_side_code, person_pitch_hand_code,
# mlb_played_first, mlb_played_last, high_school
# ), 'Data/bat_subset.csv')
# write_csv(pitch_stats %>% select(fg_playerID, person_id, person_full_name, fWAR, pick_round,
# pick_number, year, person_birth_state_province,
# person_height, person_weight, home_state,
# person_primary_position_abbreviation,
# person_bat_side_code, person_pitch_hand_code,
# mlb_played_first, mlb_played_last, high_school
# ), 'Data/pitch_subset.csv')