---
title: "Data Cleaning and Analysis"
output: html_document
---
```{r setup, include=FALSE}
knitr::opts_chunk$set(echo = TRUE)
```
```{r load_packages}
# Ensure that pacman is installed for package management and loading.
if (!require("pacman")) install.packages("pacman")
# for data reading, wrangling, and visualization
pacman::p_load(tidyverse)
# for working directories
pacman::p_load(here)
# for cross tabulation and data cleaning (not loaded for now)
# pacman::p_load(janitor)
# for working with strings
pacman::p_load(glue)
# for randomization inference; also loads randomizr and estimatr
pacman::p_load(ri2)
# for marginal effects from linear regressions
pacman::p_load(margins)
# Tests for linear regression models
pacman::p_load(lmtest)
pacman::p_load(car)
# Tables
pacman::p_load(kableExtra)
# for updated ggplot2 theme
pacman::p_load(hrbrthemes)
# for updated ggplot2 colorblind-friendly scheme
pacman::p_load(ggthemes)
# theme_set(hrbrthemes::theme_ipsum())
# for data reshaping (e.g., melt)
pacman::p_load(reshape2)
# for plotting of covariate balance
pacman::p_load(cobalt)
# for matching only
# pacman::p_load(MatchIt)
```
```{r read_data}
# download the data from GitHub
data_url <- 'https://raw.githubusercontent.com/gsbDBI/ALP301-spr21-project3/main/RLA_0507.csv'
df <- read.csv(data_url)
rm(data_url) # remove the URL string from memory
```
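Since the tidyverse load above includes readr, `read_csv()` is a drop-in alternative that returns a tibble and prints the column types it inferred; a minimal sketch:

```{r read_data_readr, eval=FALSE}
# Alternative reader (sketch): readr::read_csv returns a tibble and
# reports the column specification it guessed from the file.
df <- readr::read_csv("https://raw.githubusercontent.com/gsbDBI/ALP301-spr21-project3/main/RLA_0507.csv")
```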
```{r glimpse_data}
tibble::glimpse(df) # overview of variables in the data
```
```{r data cleaning 1}
# Raw Data Cleaning
# First, let's view the data
view(df)
# Let's check that the survey only recorded participants who were 100% complete
df$Progress = as.numeric(df$Progress)
summary(df$Progress)
# The summary statistics show that all but one of the Progress values are 100, so let's drop that column too
# We can also get rid of identifying metadata: the response ID, IP address, latitude/longitude, survey status, and the start/end/recorded dates
df <- df %>%
subset(
select = -c(StartDate, EndDate, Status, IPAddress, Progress, ResponseId, LocationLatitude, LocationLongitude, RecordedDate)
)
# In our conversation with Emma, we decided that only the columns ending in Page.Submit were useful for understanding how long
# participants spent on each treatment. Let's get rid of all the other timing columns, such as First.Click and Last.Click
# (a more concise alternative is sketched after this chunk).
# We'll start with the bipartisan treatment
df <- df %>%
subset(select = -c(bipart_intro_time_First.Click, bipart_intro_time_Last.Click, bipart_intro_time_Click.Count,
bipartisan_time_First.Click, bipartisan_time_Last.Click, bipartisan_time_Click.Count))
# Next, the RLA as a Percentage treatment
df <- df %>%
subset(select = -c(RL_perc_intro_time_First.Click, RL_perc_intro_time_Last.Click, RL_perc_intro_time_Click.Count,
RL_perc_time_First.Click, RL_perc_time_Last.Click, RL_perc_time_Click.Count))
# Next, the Handcount treatment
df <- df %>%
subset(select = -c(Handcount_intro_time_First.Click, Handcount_intro_time_Last.Click, Handcount_intro_time_Click.Count,
Handcount_time_First.Click, Handcount_time_Last.Click, Handcount_time_Click.Count))
# Next, the Bad Loser treatment
df <- df %>%
subset(select = -c(Loser_intro_time_First.Click, Loser_intro_time_Last.Click, Loser_intro_time_Click.Count,
Loser_time_First.Click, Loser_time_Last.Click, Loser_time_Click.Count))
# Next, the Soup treatment
df <- df %>%
subset(select = -c(Soup_intro_time_First.Click, Soup_intro_time_Last.Click, Soup_intro_time_Click.Count,
Soup_time_First.Click, Soup_time_Last.Click, Soup_time_Click.Count))
# Next, the Local Officials treatment
df <- df %>%
subset(select = -c(Local_intro_time_First.Click, Local_intro_time_Last.Click, Local_intro_time_Click.Count,
Local_time_First.Click, Local_time_Last.Click, Local_time_Click.Count))
# Finally, the Control group
df <- df %>%
subset(select = -c(control_time_First.Click, control_time_Last.Click, control_time_Click.Count))
# Though the comments are fun, let's get rid of those. Let's also get rid of all of the Lucid profile data except the political
# party (Calli's idea but can't remember exactly why). We can also get rid of the column called rid
df <- df %>%
subset(select = -c(comments, rid, age, gender, hhi, ethnicity, hispanic, education, region, zip))
```
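The block above drops the click-timing columns treatment by treatment. A more concise equivalent (a sketch; it assumes every unwanted column name ends in exactly `First.Click`, `Last.Click`, or `Click.Count`) uses `select()` with `matches()`:

```{r drop_click_cols, eval=FALSE}
# One-step drop of all click-timing columns via a regular expression
# anchored to the end of the column name.
df <- df %>%
  select(-matches("(First\\.Click|Last\\.Click|Click\\.Count)$"))
```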
```{r data cleaning 2}
# The first row just repeats the question text behind the column names. Let's remove it.
df = df[-1,]
# Check to see if we can get rid of UserLanguage. If all the values are EN, remove the column and just note that every
# respondent took the survey in English. Then do the same thing with Consent.
if(nrow(df) == sum(df$UserLanguage == "EN")) {
df = subset(df, select = -c(UserLanguage))
}
if(nrow(df) == sum(df$Consent == "YES")) {
df = subset(df, select = -Consent)
}
# ***NOTE: Since the code above is conditional, check the df to confirm that UserLanguage and Consent were actually deleted
# before running the next chunk (a programmatic check is sketched below). If they were deleted, the code below runs with no issues.
```
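Rather than eyeballing the data frame, a quick programmatic check (a sketch) fails loudly if either column survived the conditional drops:

```{r check_conditional_drops, eval=FALSE}
# stopifnot() errors unless every element of the condition is TRUE.
stopifnot(!c("UserLanguage", "Consent") %in% names(df))
```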
```{r column names}
# replace white space with underscores and put everything in lower case
names(df) <- names(df) %>%
  stringr::str_replace_all("\\s", "_") %>%
  tolower()
#clean up column names
names(df) <- gsub("_1", "", names(df))
names(df) <- gsub("dem_", "", names(df))
names(df) <- gsub("_page.submit", "", names(df))
names(df) <- gsub("cov_", "", names(df))
# to condense message time spent to a new column
df <- df %>%
unite("msg_time", c(bipartisan_time, rl_perc_time, handcount_time, loser_time, soup_time, local_time, control_time), sep= "")
# rearrange columns for easier condensing
df <- df %>% relocate("msg_time", .before = bipart_intro_time)
# to condense intro time spent to a new column
df <- df %>%
unite("intro_time", bipart_intro_time:local_intro_time, sep = "")
#("intro_time", bipartisan_time:local_time, sep= "")
# rename columns to clearer names
# new name = old name
df <- df %>%
rename(
duration_sec = duration..in.seconds.,
dv_pre_stateconf = pre_state_conf,
dv_pre_nationalconf = pre_national_conf,
federal_trust = cov_trust_federal,
state_trust = cov_trust_state,
accuracy_2016 = cov_2016_accu,
accuracy_2020 = cov_2020_accu,
gender_text = gender_6_text,
parenthood = child
)
# Rename the columns
#colnames(df) = c("duration", "state", "dv_pre_stateconf", "dv_pre_nationalconf", "federal_trust", "state_trust", "accuracy_2016", "accuracy_2020", "birthyear", "gender", "gender_text", "parenthood", "bipart_intro_time", "bipart_time", "rla_intro_time", "rla_time", "handcount_intro_time", "handcount_time", "badloser_intro_time", "badloser_time", "soup_intro_time", "soup_time", "local_intro_time", "local_time", "control_time", "dv_post_stateconf", "dv_post_nationalconf", "dv_secondary", "attention_check", "voting_importance", "vote_2020", "candidate_2020", "candidate_text_2020", "education", "political_party", "political_party_text", "political_ideology", "race", "race_text", "income", "lucid_political_party", "treatment")
```
```{r column types}
# All columns are currently structured as characters. Not super useful, so let's change that, allowing us to run descriptive statistics and perform mathematical operations, as needed.
# Start with the columns that need to be numeric
i = c("dv_pre_stateconf", "dv_pre_nationalconf", "bipart_intro_time", "bipart_time", "rla_intro_time", "rla_time", "handcount_intro_time", "handcount_time", "badloser_intro_time", "badloser_time", "soup_intro_time", "soup_time", "local_intro_time", "local_time", "control_time", "dv_post_stateconf", "dv_post_nationalconf")
df[ , i] = apply(df[ , i], 2, function(x) as.numeric(x))
# Now the columns that make sense to be structured as factors
i = c("state", "federal_trust", "state_trust", "accuracy_2016", "accuracy_2020", "birthyear", "gender", "parenthood", "dv_secondary", "voting_importance", "vote_2020", "candidate_2020", "education", "political_party", "political_ideology", "race", "income", "treatment")
df[ , i] = lapply(df[ , i], factor)
# Before we go any further, let's decide how to work with missing data and where it exists
summary(df)
# At a minimum, we need to remove the rows where the DV is not measured, both pre- and post-test
df <- df %>% drop_na("dv_pre_stateconf", "dv_post_stateconf", "dv_pre_nationalconf", "dv_post_nationalconf")
# Finally, let's create columns that measure the pre/post treatment difference for the state and national DVs and check the summary stats
df$dv_state_treatment_diff = df$dv_post_stateconf - df$dv_pre_stateconf
df$dv_national_treatment_diff = df$dv_post_nationalconf - df$dv_pre_nationalconf
summary(df$dv_state_treatment_diff)
summary(df$dv_national_treatment_diff)
# Assign a treatment number to each treatment group, corresponding with their number as noted in Notion - (Calli: I don't actually think we need this, it will probably just add confusion.)
# df$treatment_number = ifelse(df$treatment == "Control", 0, ifelse(df$treatment == "Bipartisan", 1,
# ifelse(df$treatment == "RL_percentage", 2, ifelse(df$treatment == "Handcount", 3,
# ifelse(df$treatment == "Loser", 5, ifelse(df$treatment == "Soup", 6,
# ifelse(df$treatment == "Local", 7, "error")))))))
#
# Create binary variables for "always believers" and "never believers" from the 2016 & 2020 accuracy questions
# (an equivalent logical construction is sketched after this chunk)
df$always_believer = ifelse(df$accuracy_2016 == "Yes", ifelse(df$accuracy_2020 == "Yes", 1, 0), 0)
df$never_believer = ifelse(df$accuracy_2016 == "No", ifelse(df$accuracy_2020 == "No", 1, 0), 0)
```
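An equivalent construction of the same dummies (up to NA handling) replaces each nested `ifelse()` with a single logical conjunction; a sketch:

```{r believer_dummies_alt, eval=FALSE}
# TRUE & TRUE -> 1; anything else -> 0 (NAs propagate slightly differently
# than in the nested ifelse() version above).
df$always_believer <- as.integer(df$accuracy_2016 == "Yes" & df$accuracy_2020 == "Yes")
df$never_believer  <- as.integer(df$accuracy_2016 == "No" & df$accuracy_2020 == "No")
```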
# Save new CSV file as "clean"
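No code under this heading writes the file yet; a minimal sketch using `here()` (already loaded above), where the output file name is an assumption:

```{r save_clean, eval=FALSE}
# Write the cleaned data to disk; the file name "RLA_clean.csv" is illustrative.
write.csv(df, here::here("RLA_clean.csv"), row.names = FALSE)
```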
```{r treatment_table}
# table() creates a contingency table of counts of observations in each treatment condition
with(df, table(treatment, useNA = 'ifany')) %>% # "ifany" includes the NA values in the table
  knitr::kable() %>% # kable(x, format) generates tables
  # add a header spanning both columns (level and count) of the one-way table
  add_header_above(c('treatment group' = 2)) %>% # add_header_above(x, col_name = col_span)
  kableExtra::kable_styling(bootstrap_options = "striped") # additional styling options
```
```{r attention check}
# keep only the people who passed the attention check
# (a dummy-variable alternative that keeps every row is sketched after this chunk)
df2 <- df %>%
  filter(attention_check == "Red,Green")
# see who is in the fastest 2.5% (least amount of time spent on the survey)
df$duration_sec <- as.numeric(df$duration_sec)
quantile(df$duration_sec, c(0.025, 0.95, .975))
# the 2.5th percentile of duration is 88 seconds, i.e., the fastest 2.5% of respondents finished in 88 seconds or less
# make a density plot of the distribution of total time spent on the survey, between 88 and 1050 seconds
df %>%
  filter(duration_sec < 1050, duration_sec > 88) %>%
  ggplot(aes(x = duration_sec)) +
  geom_density(fill = "#69b3a2", color = "#e9ecef", alpha = 0.8)
```
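Per the "dummy-variable alternative" note above, a minimal sketch that keeps every row and simply flags who passed the attention check:

```{r attention_dummy, eval=FALSE}
# 1 = passed the attention check, 0 = failed; NAs stay visible in the table.
df$passed_attention <- as.integer(df$attention_check == "Red,Green")
table(df$passed_attention, useNA = "ifany")
```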
```{r select_vars}
# select covariates, using the cleaned column names documented above
covariate_names <- c("federal_trust", "state_trust", "accuracy_2016", "accuracy_2020", "birthyear", "gender", "gender_text", "parenthood", "state", "voting_importance", "vote_2020", "candidate_2020", "candidate_text_2020", "education", "political_party", "political_party_text", "political_ideology", "race", "race_text", "income")
# treatment
treatment_name <- "treatment"
# outcomes of interest
outcome_variable1 <- "rla_state"
outcome_variable2 <- "rla_fed"
outcome_variable3 <- "rla_info"
# create a new dataset containing the covariates, treatment and outcomes
election_df <- df %>%
  # select all the variables of interest
  select(all_of(c(covariate_names, treatment_name, outcome_variable1, outcome_variable2, outcome_variable3))) # all_of() is for strict selection: if any of the variables in the character vector is missing, an error is thrown.
# keep only observations with a non-missing treatment assignment
election_full_df <- election_df %>%
  filter(!is.na(treatment))
# Remove the full data from memory
rm(df)
```
```{r balance}
N <- 1000 # experiment size
K <- 7 # number of experimental conditions including the control
mean_vals <- rep(0, K) # true means; rep(x, k) repeats the value x k times
# outcome matrix of dimension N by K
ymat <- mapply(rnorm, n = N, mean = mean_vals) # mapply(FUN, ...) applies FUN over the vector arguments; rnorm(n, mean, sd) generates normal random variables
# complete random assignment; arm sizes across the K conditions are as equal as possible
w <- complete_ra(N, num_arms = K)
levels(w) <- 0:6 # relabel the treatment levels as 0 (control) through 6
table(w)
# observe outcomes based on treatment assignment
yobs <- ymat[cbind(1:N, as.numeric(w))] #[row, col] indexes the matrix elements
```
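With roughly $N/K$ units in each arm, the standard error of a difference in means against control is $\hat{\sigma}\sqrt{K/N + K/N} = \hat{\sigma}\sqrt{2K/N}$, so the statistic computed below is

$$Z_k = \frac{\hat{\tau}_k}{\hat{\sigma}\sqrt{2K/N}}, \qquad k = 1, \dots, K - 1.$$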
```{r Zstats}
# means under each condition
ybars <- aggregate(yobs, by = list(w), mean)$x # aggregate(x, by = list, FUN) applies FUN to x by group; here, the mean by treatment arm
sigma <- sqrt(sum((yobs - ybars[w])^2) / (N - K)) # pooled standard deviation of the residuals
# difference-in-means estimates
taus <- ybars[-1] - ybars[1] # subtract the control mean from each of the six treatment means
# Z-stat
Z_stat <- taus / (sigma * sqrt(2 * K / N)) # calculation of the Z stat according to the formula above
Z_stat
```
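Since `ri2` is loaded above, the same simulated design could also be tested with randomization inference rather than the normal approximation. A sketch (names `dat`, `Y`, `Z` are illustrative), using `ri2`'s documented `model_1`/`model_2` comparison for an omnibus test across multiple arms:

```{r ri_sketch, eval=FALSE}
# Randomization inference on the simulated multi-arm experiment (sketch).
dat <- data.frame(Y = yobs, Z = w)
# Declare the same complete random assignment used to generate w.
declaration <- randomizr::declare_ra(N = N, conditions = 0:6)
# Omnibus test: compare a null model against one with treatment indicators.
ri_out <- conduct_ri(
  model_1 = Y ~ 1,
  model_2 = Y ~ Z,
  declaration = declaration,
  assignment = "Z",
  data = dat,
  sims = 1000
)
summary(ri_out)
```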