---
title: "Data Cleaning and Analysis"
output: html_document
---
```{r setup, include=FALSE}
knitr::opts_chunk$set(echo = TRUE)
```
```{r load_packages}
# Ensure that pacman is installed for package management and loading.
if (!require("pacman")) install.packages("pacman")
# for data reading, wrangling, and visualization
pacman::p_load(tidyverse)
# for working directories
pacman::p_load(here)
# for cross tabulation and data cleaning (not loaded for now)
# pacman::p_load(janitor)
# for working with strings
pacman::p_load(glue)
# for randomization inference; also loads randomizr and estimatr
pacman::p_load(ri2)
# for marginal effects from linear regressions
pacman::p_load(margins)
# Tests for linear regression models
pacman::p_load(lmtest)
pacman::p_load(car)
# Tables
pacman::p_load(kableExtra)
# for updated ggplot2 theme
pacman::p_load(hrbrthemes)
# for updated ggplot2 colorblind-friendly scheme
pacman::p_load(ggthemes)
# theme_set(hrbrthemes::theme_ipsum())
# for data reshaping (e.g., melt)
pacman::p_load(reshape2)
# for plotting of covariate balance
pacman::p_load(cobalt)
# for matching only
# pacman::p_load(MatchIt)
```
```{r read_data}
# download the data from GitHub
data_url <- 'https://raw.githubusercontent.com/gsbDBI/ALP301-spr21-project3/main/RLA_0507.csv'
df <- read.csv(data_url)
rm(data_url) # remove the URL string from memory
```
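Since the tidyverse load above includes readr, `read_csv()` is a drop-in alternative that returns a tibble and prints the column types it inferred; a minimal sketch:

```{r read_data_readr, eval=FALSE}
# Alternative reader (sketch): readr::read_csv returns a tibble and
# reports the column specification it guessed from the file.
df <- readr::read_csv("https://raw.githubusercontent.com/gsbDBI/ALP301-spr21-project3/main/RLA_0507.csv")
```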
```{r glimpse_data}
tibble::glimpse(df) # overview of variables in the data
```
```{r data cleaning 1}
# Raw Data Cleaning
# First, let's view the data
view(df)
# Let's check that the survey only recorded participants who were 100% complete
df$Progress = as.numeric(df$Progress)
summary(df$Progress)
# The summary statistics show that all but one of the Progress values are 100, so let's drop that column too
# We can also get rid of identifying metadata: the response ID, IP address, latitude/longitude, survey status, and the start/end/recorded dates
df <- df %>%
subset(
select = -c(StartDate, EndDate, Status, IPAddress, Progress, ResponseId, LocationLatitude, LocationLongitude, RecordedDate)
)
# In our conversation with Emma, we decided that only the columns ending in Page.Submit were useful for understanding how long
# participants spent on each treatment. Let's get rid of all the other timing columns, such as First.Click and Last.Click
# (a more concise alternative is sketched after this chunk).
# We'll start with the bipartisan treatment
df <- df %>%
subset(select = -c(bipart_intro_time_First.Click, bipart_intro_time_Last.Click, bipart_intro_time_Click.Count,
bipartisan_time_First.Click, bipartisan_time_Last.Click, bipartisan_time_Click.Count))
# Next, the RLA as a Percentage treatment
df <- df %>%
subset(select = -c(RL_perc_intro_time_First.Click, RL_perc_intro_time_Last.Click, RL_perc_intro_time_Click.Count,
RL_perc_time_First.Click, RL_perc_time_Last.Click, RL_perc_time_Click.Count))
# Next, the Handcount treatment
df <- df %>%
subset(select = -c(Handcount_intro_time_First.Click, Handcount_intro_time_Last.Click, Handcount_intro_time_Click.Count,
Handcount_time_First.Click, Handcount_time_Last.Click, Handcount_time_Click.Count))
# Next, the Bad Loser treatment
df <- df %>%
subset(select = -c(Loser_intro_time_First.Click, Loser_intro_time_Last.Click, Loser_intro_time_Click.Count,
Loser_time_First.Click, Loser_time_Last.Click, Loser_time_Click.Count))
# Next, the Soup treatment
df <- df %>%
subset(select = -c(Soup_intro_time_First.Click, Soup_intro_time_Last.Click, Soup_intro_time_Click.Count,
Soup_time_First.Click, Soup_time_Last.Click, Soup_time_Click.Count))
# Next, the Local Officials treatment
df <- df %>%
subset(select = -c(Local_intro_time_First.Click, Local_intro_time_Last.Click, Local_intro_time_Click.Count,
Local_time_First.Click, Local_time_Last.Click, Local_time_Click.Count))
# Finally, the Control group
df <- df %>%
subset(select = -c(control_time_First.Click, control_time_Last.Click, control_time_Click.Count))
# Though the comments are fun, let's get rid of those. Let's also get rid of all of the Lucid profile data except the political
# party (Calli's idea but can't remember exactly why). We can also get rid of the column called rid
df <- df %>%
subset(select = -c(comments, rid, age, gender, hhi, ethnicity, hispanic, education, region, zip))
```
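The block above drops the click-timing columns treatment by treatment. A more concise equivalent (a sketch; it assumes every unwanted column name ends in exactly `First.Click`, `Last.Click`, or `Click.Count`) uses `select()` with `matches()`:

```{r drop_click_cols, eval=FALSE}
# One-step drop of all click-timing columns via a regular expression
# anchored to the end of the column name.
df <- df %>%
  select(-matches("(First\\.Click|Last\\.Click|Click\\.Count)$"))
```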
```{r data cleaning 2}
# The first row just repeats the question text behind the column names. Let's remove it.
df = df[-1,]
# Check to see if we can get rid of UserLanguage. If all the values are EN, remove the column and just note that every
# respondent took the survey in English. Then do the same thing with Consent.
if(nrow(df) == sum(df$UserLanguage == "EN")) {
df = subset(df, select = -c(UserLanguage))
}
if(nrow(df) == sum(df$Consent == "YES")) {
df = subset(df, select = -Consent)
}
# ***NOTE: Since the code above is conditional, check the df to confirm that UserLanguage and Consent were actually deleted
# before running the next chunk (a programmatic check is sketched below). If they were deleted, the code below runs with no issues.
```
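Rather than eyeballing the data frame, a quick programmatic check (a sketch) fails loudly if either column survived the conditional drops:

```{r check_conditional_drops, eval=FALSE}
# stopifnot() errors unless every element of the condition is TRUE.
stopifnot(!c("UserLanguage", "Consent") %in% names(df))
```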
```{r column names}
# replace white space with underscores and put everything in lower case
names(df) <- names(df) %>%
  stringr::str_replace_all("\\s", "_") %>%
  tolower()
#clean up column names
names(df) <- gsub("_1", "", names(df))
names(df) <- gsub("dem_", "", names(df))
names(df) <- gsub("_page.submit", "", names(df))
names(df) <- gsub("cov_", "", names(df))
# to condense message time spent to a new column
df <- df %>%
unite("msg_time", c(bipartisan_time, rl_perc_time, handcount_time, loser_time, soup_time, local_time, control_time), sep= "")
# rearrange columns for easier condensing
df <- df %>% relocate("msg_time", .before = bipart_intro_time)
# to condense intro time spent to a new column
df <- df %>%
unite("intro_time", bipart_intro_time:local_intro_time, sep = "")
#("intro_time", bipartisan_time:local_time, sep= "")
# rename columns to clearer names
# new name = old name
df <- df %>%
rename(
duration_sec = duration..in.seconds.,
dv_pre_stateconf = pre_state_conf,
dv_pre_nationalconf = pre_national_conf,
federal_trust = cov_trust_federal,
state_trust = cov_trust_state,
accuracy_2016 = cov_2016_accu,
accuracy_2020 = cov_2020_accu,
gender_text = gender_6_text,
parenthood = child
)
# Rename the columns
#colnames(df) = c("duration", "state", "dv_pre_stateconf", "dv_pre_nationalconf", "federal_trust", "state_trust", "accuracy_2016", "accuracy_2020", "birthyear", "gender", "gender_text", "parenthood", "bipart_intro_time", "bipart_time", "rla_intro_time", "rla_time", "handcount_intro_time", "handcount_time", "badloser_intro_time", "badloser_time", "soup_intro_time", "soup_time", "local_intro_time", "local_time", "control_time", "dv_post_stateconf", "dv_post_nationalconf", "dv_secondary", "attention_check", "voting_importance", "vote_2020", "candidate_2020", "candidate_text_2020", "education", "political_party", "political_party_text", "political_ideology", "race", "race_text", "income", "lucid_political_party", "treatment")
```
```{r column types}
# All columns are currently structured as characters. Not super useful, so let's change that, allowing us to run descriptive statistics and perform mathematical operations, as needed.
# Start with the columns that need to be numeric
i = c("dv_pre_stateconf", "dv_pre_nationalconf", "bipart_intro_time", "bipart_time", "rla_intro_time", "rla_time", "handcount_intro_time", "handcount_time", "badloser_intro_time", "badloser_time", "soup_intro_time", "soup_time", "local_intro_time", "local_time", "control_time", "dv_post_stateconf", "dv_post_nationalconf")
df[ , i] = apply(df[ , i], 2, function(x) as.numeric(x))
# Now the columns that make sense to be structured as factors
i = c("state", "federal_trust", "state_trust", "accuracy_2016", "accuracy_2020", "birthyear", "gender", "parenthood", "dv_secondary", "voting_importance", "vote_2020", "candidate_2020", "education", "political_party", "political_ideology", "race", "income", "treatment")
df[ , i] = lapply(df[ , i], factor)
# Before we go any further, let's decide how to work with missing data and where it exists
summary(df)
# At a minimum, we need to remove the rows where the DV is not measured, both pre- and post-test
df <- df %>% drop_na("dv_pre_stateconf", "dv_post_stateconf", "dv_pre_nationalconf", "dv_post_nationalconf")
# Finally, let's create columns that measure the pre/post treatment difference for the state and national DVs and check the summary stats
df$dv_state_treatment_diff = df$dv_post_stateconf - df$dv_pre_stateconf
df$dv_national_treatment_diff = df$dv_post_nationalconf - df$dv_pre_nationalconf
summary(df$dv_state_treatment_diff)
summary(df$dv_national_treatment_diff)
# Assign a treatment number to each treatment group, corresponding with their number as noted in Notion - (Calli: I don't actually think we need this, it will probably just add confusion.)
# df$treatment_number = ifelse(df$treatment == "Control", 0, ifelse(df$treatment == "Bipartisan", 1,
# ifelse(df$treatment == "RL_percentage", 2, ifelse(df$treatment == "Handcount", 3,
# ifelse(df$treatment == "Loser", 5, ifelse(df$treatment == "Soup", 6,
# ifelse(df$treatment == "Local", 7, "error")))))))
#
# Create binary variables for "always believers" and "never believers" from the 2016 & 2020 accuracy questions
# (an equivalent logical construction is sketched after this chunk)
df$always_believer = ifelse(df$accuracy_2016 == "Yes", ifelse(df$accuracy_2020 == "Yes", 1, 0), 0)
df$never_believer = ifelse(df$accuracy_2016 == "No", ifelse(df$accuracy_2020 == "No", 1, 0), 0)
```
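An equivalent construction of the same dummies (up to NA handling) replaces each nested `ifelse()` with a single logical conjunction; a sketch:

```{r believer_dummies_alt, eval=FALSE}
# TRUE & TRUE -> 1; anything else -> 0 (NAs propagate slightly differently
# than in the nested ifelse() version above).
df$always_believer <- as.integer(df$accuracy_2016 == "Yes" & df$accuracy_2020 == "Yes")
df$never_believer  <- as.integer(df$accuracy_2016 == "No" & df$accuracy_2020 == "No")
```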
# Save new CSV file as "clean"
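No code under this heading writes the file yet; a minimal sketch using `here()` (already loaded above), where the output file name is an assumption:

```{r save_clean, eval=FALSE}
# Write the cleaned data to disk; the file name "RLA_clean.csv" is illustrative.
write.csv(df, here::here("RLA_clean.csv"), row.names = FALSE)
```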
```{r treatment_table}
# table() creates a contingency table of counts of observations in each treatment condition
with(df, table(treatment, useNA = 'ifany')) %>% # "ifany" includes the NA values in the table
  knitr::kable() %>% # kable(x, format) generates tables
  # add a header spanning both columns (level and count) of the one-way table
  add_header_above(c('treatment group' = 2)) %>% # add_header_above(x, col_name = col_span)
  kableExtra::kable_styling(bootstrap_options = "striped") # additional styling options
```
```{r attention check}
# keep only the people who passed the attention check
# (a dummy-variable alternative that keeps every row is sketched after this chunk)
df2 <- df %>%
  filter(attention_check == "Red,Green")
# see who is in the fastest 2.5% (least amount of time spent on the survey)
df$duration_sec <- as.numeric(df$duration_sec)
quantile(df$duration_sec, c(0.025, 0.95, .975))
# the 2.5th percentile of duration is 88 seconds, i.e., the fastest 2.5% of respondents finished in 88 seconds or less
# make a density plot of the distribution of total time spent on the survey, between 88 and 1050 seconds
df %>%
  filter(duration_sec < 1050, duration_sec > 88) %>%
  ggplot(aes(x = duration_sec)) +
  geom_density(fill = "#69b3a2", color = "#e9ecef", alpha = 0.8)
```
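Per the "dummy-variable alternative" note above, a minimal sketch that keeps every row and simply flags who passed the attention check:

```{r attention_dummy, eval=FALSE}
# 1 = passed the attention check, 0 = failed; NAs stay visible in the table.
df$passed_attention <- as.integer(df$attention_check == "Red,Green")
table(df$passed_attention, useNA = "ifany")
```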
```{r select_vars}
# select covariates, using the cleaned column names documented above
covariate_names <- c("federal_trust", "state_trust", "accuracy_2016", "accuracy_2020", "birthyear", "gender", "gender_text", "parenthood", "state", "voting_importance", "vote_2020", "candidate_2020", "candidate_text_2020", "education", "political_party", "political_party_text", "political_ideology", "race", "race_text", "income")
# treatment
treatment_name <- "treatment"
# outcomes of interest
outcome_variable1 <- "rla_state"
outcome_variable2 <- "rla_fed"
outcome_variable3 <- "rla_info"
# create a new dataset containing the covariates, treatment and outcomes
election_df <- df %>%
  # select all the variables of interest
  select(all_of(c(covariate_names, treatment_name, outcome_variable1, outcome_variable2, outcome_variable3))) # all_of() is for strict selection: if any of the variables in the character vector is missing, an error is thrown.
# keep only observations with a non-missing treatment assignment
election_full_df <- election_df %>%
  filter(!is.na(treatment))
# Remove the full data from memory
rm(df)
```
```{r balance}
N <- 1000 # experiment size
K <- 7 # number of experimental conditions including the control
mean_vals <- rep(0, K) # true means; rep(x, k) repeats the value x k times
# outcome matrix of dimension N by K
ymat <- mapply(rnorm, n = N, mean = mean_vals) # mapply(FUN, ...) applies FUN over the vector arguments; rnorm(n, mean, sd) generates normal random variables
# complete random assignment; arm sizes across the K conditions are as equal as possible
w <- complete_ra(N, num_arms = K)
levels(w) <- 0:6 # relabel the treatment levels as 0 (control) through 6
table(w)
# observe outcomes based on treatment assignment
yobs <- ymat[cbind(1:N, as.numeric(w))] #[row, col] indexes the matrix elements
```
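With roughly $N/K$ units in each arm, the standard error of a difference in means against control is $\hat{\sigma}\sqrt{K/N + K/N} = \hat{\sigma}\sqrt{2K/N}$, so the statistic computed below is

$$Z_k = \frac{\hat{\tau}_k}{\hat{\sigma}\sqrt{2K/N}}, \qquad k = 1, \dots, K - 1.$$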
```{r Zstats}
# means under each condition
ybars <- aggregate(yobs, by = list(w), mean)$x # aggregate(x, by = list, FUN) applies FUN to x by group; here, the mean by treatment arm
sigma <- sqrt(sum((yobs - ybars[w])^2) / (N - K)) # pooled standard deviation of the residuals
# difference-in-means estimates
taus <- ybars[-1] - ybars[1] # subtract the control mean from each of the six treatment means
# Z-stat
Z_stat <- taus / (sigma * sqrt(2 * K / N)) # calculation of the Z stat according to the formula above
Z_stat
```
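Since `ri2` is loaded above, the same simulated design could also be tested with randomization inference rather than the normal approximation. A sketch (names `dat`, `Y`, `Z` are illustrative), using `ri2`'s documented `model_1`/`model_2` comparison for an omnibus test across multiple arms:

```{r ri_sketch, eval=FALSE}
# Randomization inference on the simulated multi-arm experiment (sketch).
dat <- data.frame(Y = yobs, Z = w)
# Declare the same complete random assignment used to generate w.
declaration <- randomizr::declare_ra(N = N, conditions = 0:6)
# Omnibus test: compare a null model against one with treatment indicators.
ri_out <- conduct_ri(
  model_1 = Y ~ 1,
  model_2 = Y ~ Z,
  declaration = declaration,
  assignment = "Z",
  data = dat,
  sims = 1000
)
summary(ri_out)
```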