-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathdata_cleaning.Rmd
278 lines (202 loc) · 9.68 KB
/
data_cleaning.Rmd
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
---
title: "Data Cleaning and Analysis"
output: html_document
---
```{r setup, include=FALSE}
knitr::opts_chunk$set(echo = TRUE)
```
```{r load_packages}
# # Ensure that pacman is installed for package management and loading.
# if (!require("pacman")) install.packages("pacman")
# # for data reading wrangling and visualization
pacman::p_load(tidyverse)
# for working directories
pacman::p_load(here)
# # for cross tabulation and data cleaning
# pacman::p_load(janitor)
# for working with strings
pacman::p_load(glue)
# For randomized inference, also loads randomizr and estimatr
pacman::p_load(ri2)
# for marginal effects from lineal regressions
pacman::p_load(margins)
# Tests for linear regression models
pacman::p_load(lmtest)
pacman::p_load(car)
# Tables
pacman::p_load(kableExtra)
# for updated ggplot2 theme
pacman::p_load(hrbrthemes)
# for updated ggplot2 colorblind-friendly scheme
pacman::p_load(ggthemes)
# theme_set(hrbrthemes::theme_ipsum())
pacman::p_load(reshape2)
# for plotting of covariate balance
pacman::p_load(cobalt)
# for matching only
# pacman::p_load(MatchIt)
library('fastDummies')
```
```{r read_data}
#download the data from GitHub
data <-'https://raw.githubusercontent.com/gsbDBI/ALP301-spr21-project3/main/RLA_0507.csv'
df <- read.csv(data)
rm(data) #remove data csv file
```
```{r glimpse_data}
tibble::glimpse(df) # overview of variables in the data
```
```{r data cleaning 1}
# Raw Data Cleaning
# First, let's view the data
#view(df)
# Let's check to make sure the survey only reported participants that were 100% complete
df$Progress = as.numeric(df$Progress)
summary(df$Progress)
# The summary statistics show that all of the values (minus one) for Progress are 100, so let's delete that too
# We can also get rid of the Response ID and Lat/Long and the Recorded Date
df <- df %>%
subset(
select = -c(StartDate, EndDate, Status, IPAddress, Progress, ResponseId, LocationLatitude, LocationLongitude, RecordedDate)
)
# In our conversation with Emma, we decided only those columns that read Page.Submit were useful for understanding the timing
# of treatment groups. Let's get rid of all the other columns such as First.Click and Last.Click.
# We'll start with the bipartisan treatment
df <- df %>%
subset(select = -c(bipart_intro_time_First.Click, bipart_intro_time_Last.Click, bipart_intro_time_Click.Count,
bipartisan_time_First.Click, bipartisan_time_Last.Click, bipartisan_time_Click.Count))
# Next, the RLA as a Percentage treatment
df <- df %>%
subset(select = -c(RL_perc_intro_time_First.Click, RL_perc_intro_time_Last.Click, RL_perc_intro_time_Click.Count,
RL_perc_time_First.Click, RL_perc_time_Last.Click, RL_perc_time_Click.Count))
# Next, the Handcount treatment
df <- df %>%
subset(select = -c(Handcount_intro_time_First.Click, Handcount_intro_time_Last.Click, Handcount_intro_time_Click.Count,
Handcount_time_First.Click, Handcount_time_Last.Click, Handcount_time_Click.Count))
# Next, the Bad Loser treatment
df <- df %>%
subset(select = -c(Loser_intro_time_First.Click, Loser_intro_time_Last.Click, Loser_intro_time_Click.Count,
Loser_time_First.Click, Loser_time_Last.Click, Loser_time_Click.Count))
# Next, the Soup treatment
df <- df %>%
subset(select = -c(Soup_intro_time_First.Click, Soup_intro_time_Last.Click, Soup_intro_time_Click.Count,
Soup_time_First.Click, Soup_time_Last.Click, Soup_time_Click.Count))
# Next, the Local Officials treatment
df <- df %>%
subset(select = -c(Local_intro_time_First.Click, Local_intro_time_Last.Click, Local_intro_time_Click.Count,
Local_time_First.Click, Local_time_Last.Click, Local_time_Click.Count))
# Finally, the Control group
df <- df %>%
subset(select = -c(control_time_First.Click, control_time_Last.Click, control_time_Click.Count))
# Though the comments are fun, let's get rid of those. Let's also get rid of all of the Lucid profile data except the political
# party (Calli's idea but can't remember exactly why). We can also get rid of the column called rid
df <- df %>%
subset(select = -c(comments, rid, age, gender, hhi, ethnicity, hispanic, education, region, zip))
```
```{r data cleaning 2}
# The first row is also repetitive from the column names. Let's remove that.
df = df[-1,]
# Check to see if we can get rid of UserLanguage. If they are all EN, then let's remove and just annotate that all speakers were
# English in nature. Further, let's do the same thing with consent.
if(nrow(df) == sum(df$UserLanguage == "EN")) {
df = subset(df, select = -c(UserLanguage))
}
if(nrow(df) == sum(df$Consent == "YES")) {
df = subset(df, select = -Consent)
}
# ***NOTE: Since the following code has an if statement, we need to check the df to see if UserLanguage and Consent were deleted
#before we run the next line of code. If they were deleted, we can run the code below with no issues.
```
```{r column names}
names(df) <- # to remove white space and put all in lower case
names(df) %>%
stringr::str_replace_all("\\s","_") %>% tolower
#clean up column names
names(df) <- gsub("_1", "", names(df))
names(df) <- gsub("dem_", "", names(df))
names(df) <- gsub("_page.submit", "", names(df))
names(df) <- gsub("cov_", "", names(df))
# to condense message time spent to a new column
df <- df %>%
unite("msg_time", c(bipartisan_time, rl_perc_time, handcount_time, loser_time, soup_time, local_time, control_time), sep= "")
# rearrange columns for easier condensing
df <- df %>% relocate("msg_time", .before = bipart_intro_time)
# to condense intro time spent to a new column
df <- df %>%
unite("intro_time", bipart_intro_time:local_intro_time, sep = "")
#new name = old name
df <- df %>%
rename(
duration_sec = duration..in.seconds.,
dv_pre_state_conf = pre_state_conf,
dv_pre_national_conf = pre_national_conf,
accuracy_2016 = `2016_accu`,
accuracy_2020 = `2020_accu`,
gender_text = gender_6_text,
vote_who_2020_text = vote_who_2020_5_text,
party_text = party_4_text,
race_text = race_6_text,
parent = child,
dv_post_state_conf = post_state_conf,
dv_post_national_conf = post_national_conf
)
# create dummy variable for control vs. treated
df$dummy_treat <- ifelse(df$treatment_group == "Control", 0, 1)
# create dummy variable for failed attention check or no
df$attention_pass <- ifelse(df$attentioncheck == "Red,Green", 1, 0)
```
```{r fix covariate variables}
#create binary variables for covariates
df$gender_female <- ifelse(df$gender == "Female" | df$gender == "Transgender Female", 1, 0)
df$parent_yes <- ifelse(df$parent == "Yes", 1, 0)
df$race_hispanic <- ifelse(grepl("Latino or Hispanic", df$race), 1, 0)
df$edu_4college <- ifelse(df$edu == "edu4-year college degree", 1, 0)
df$edu_hs <- ifelse(df$edu == "High school graduate (high school diploma or equivalent including GED)", 1, 0)
df$edu_4college <- ifelse(df$edu == "4-year college degree" | df$edu == "Postgraduate degree (MA, MBA, JD, PhD, etc.)", 1, 0)
df$race_white = ifelse(grepl("White", df$race), 1, 0)
df$race_black= if_else(grepl("Black or African American", df$race), 1, 0)
df$race_asian= if_else(grepl("Asian", df$race), 1,0)
#fix age and income variables
df$birthyear <- as.numeric(as.character(df$birthyear))
#change birthyear to age (approx)
df <- df %>%
mutate(
age = 2021 - birthyear
)
#change income variable
df$income2 <- as.character(df$income)
#
df$income2 <- ifelse(df$income2 == "Less than $10,000", "$0-$10,000", df$income2)
df$income2 <- ifelse(df$income2 == "Prefer not to say", NA, df$income2)
df$income2 <- ifelse(df$income2 == "$150,000 or more", "$150,000", df$income2)
df$income2 <- ifelse(df$income2 == "", NA, df$income2) #make ppl who didnt provide income == NA
df$income3 <- factor(df$income2, levels = c("$0-$10,000","$10,000-$19,999","$20,000-$29,999","$30,000-$39,999","$40,000-$49,999","$50,000-$74,999","$75,000-$99,999","$100,000-$149,999","$150,000"))
df$income_num <- as.numeric(df$income3)
df <- df %>%
rename(
income_clean = income3
) %>%
select(-c("income2"))
```
```{r column types}
# All columns are currently structured as characters. Not super useful, so let's change that, allowing us to run descriptive statistics and perform mathematical operations, as needed.
# Start with the columns that need to be numeric
df[, c(1,3,4,5,14:17)] <- sapply(df[, c(1,3,4,5,14:17)], as.numeric)
# Now the columns that make sense to be structured as factors
df[c(6:10,12,19:21,23,27,29,30,32,35:42,44)] <- lapply(df[c(6:10,12,19:21,23,27,29,30,32,35:42,44)], factor)
df$treatment_group_num <- as.numeric(as.factor(df$treatment_group))
# Before we go any further, let's decide how to work with missing data and where it exists
summary(df)
# At a minimum, I suggest we need to remove those values where the DV is not measured, both pre and post test
df <- df %>% drop_na("dv_pre_state_conf", "dv_post_state_conf", "dv_pre_national_conf", "dv_post_national_conf")
# Finally, let's create a column that measures the treatment difference for the state and national DVs and check the summary stats
df$dv_state_treatment_diff = df$dv_post_state_conf - df$dv_pre_state_conf
df$dv_national_treatment_diff = df$dv_post_national_conf - df$dv_pre_national_conf
summary(df$dv_state_treatment_diff)
summary(df$dv_national_treatment_diff)
# create variable for those who always trust or never trust election outcomes
df$always_believer <- to_factor(ifelse(df$accuracy_2016 == "Yes", ifelse(df$accuracy_2020 == "Yes", 1, 0), 0))
df$never_believer <- to_factor(ifelse(df$accuracy_2016 == "No", ifelse(df$accuracy_2020 == "No", 1, 0), 0))
# Save new CSV file as "clean"
#write.csv(df, "rla_clean_5_12.csv", row.names = FALSE)
```