Merge pull request #6 from dominicwhite/dashboard

Data processing and dashboard prototype
DataKind-DC · Nov 14, 2019 · 419fd56 · 419fd56
2 parents f0a302e + 123efa2
commit 419fd56
Show file tree

Hide file tree

Showing 17 changed files with 1,116 additions and 0 deletions.
diff --git a/dashboard/dashboard.Rmd b/dashboard/dashboard.Rmd
@@ -0,0 +1,90 @@
+---
+title: "Census 2020 Cultural Messaging"
+runtime: shiny
+output: 
+  flexdashboard::flex_dashboard:
+    orientation: columns
+---
+
+```{r setup, include=FALSE}
+library(flexdashboard)
+library(here)
+```
+
+```{r global, include=FALSE}
+# load data in 'global' chunk so it can be shared by all users of the dashboard
+library(sf)
+pop <- read_sf("foreign_country_of_birth2.csv")
+pop$TotalForeignBornE <- as.numeric(pop$TotalForeignBornE)
+pop$EuropeE <- as.numeric(pop$EuropeE)
+pop$Central.AmericaE <- as.numeric(pop$Central.AmericaE)
+pop$AsiaE <- as.numeric(pop$AsiaE)
+```
+
+Column {.sidebar data-width=200}
+-----------------------------------------------------------------------
+
+```{r}
+ selectInput("voi", label = h4("Show on map"), 
+    choices = list(
+      "Foreign-born" = "TotalForeignBornE",
+      "European-born" = "EuropeE",
+      "Cent. Am.-born" = "Central.AmericaE",
+      "Asian-born" = "AsiaE"
+      ), 
+    selected = "TotalForeignBornE")
+```
+
+#### About
+
+Built by DataKind DC, Motifv, and Hofstede Insights.
+
+
+Column {data-width=450}
+-----------------------------------------------------------------------
+
+### Map
+
+```{r, echo=FALSE}
+library(leaflet)
+library(leaflet.providers)
+
+bins <- reactive({c(as.integer(seq(from=0, to=max(pop[[input$voi]]), length.out=10)), Inf)})
+
+
+m <- renderLeaflet({
+  pal <- colorNumeric("YlOrRd", domain = pop[[input$voi]])
+  leaflet(pop) %>%
+    setView(lng = -77.045992, lat = 38.9145, zoom = 12) %>%
+    addProviderTiles(providers$CartoDB.Positron) %>%
+    addPolygons(
+      fillColor = ~pal(get(input$voi)),
+      weight = 2,
+      opacity = 1,
+      color = "white",
+      dashArray = "3",
+      fillOpacity = 0.7
+    )
+  })
+m
+```
+
+Column {data-width=350}
+-----------------------------------------------------------------------
+
+### Chart B
+
+More detailed visualization to go here...
+
+```{r}
+
+```
+
+### Chart C
+
+More detailed visualization to go here...
+
+```{r}
+
+```
+
diff --git a/dashboard/foreign_country_of_birth2.csv b/dashboard/foreign_country_of_birth2.csv
diff --git a/data/README.md b/data/README.md
@@ -0,0 +1,21 @@
+# Data
+
+This directory contains data files, along with scripts to generate those data files.
+
+Descriptions of the main subdirectories:
+
+## Static data
+
+Some data is static (i.e. not generated from online sources and APIs via scripts).
+
+An example is the Hofstede Index data for different countries.
+
+## Raw data
+
+Files of raw data downloaded from various sources without any processing/cleaning. Scripts for generating these files should reside in this level.
+
+## Transformed
+
+Data files ready for use in analyses and visualizations, created from data sources in the `static` and `raw` directories.
+
+Scripts for running transformations should reside in this level.
diff --git a/data/get_data.R b/data/get_data.R
@@ -0,0 +1,211 @@
+library(tidyverse)
+library(tidycensus)
+library(sf)
+library(here)
+
+# sourceDir <- getSrcDirectory(function(dummy) {dummy})
+# dummy <- function() {0}
+# here()
+
+# uncomment to avoid cache
+options(tigris_use_cache = TRUE)
+
+# Load API key for Census Bureau. Register here: https://api.census.gov/data/key_signup.html
+api_key <- Sys.getenv(c("CENSUS_API_KEY"))
+census_api_key(api_key)
+
+# get all acs variables
+v17 <- load_variables(2017, "acs5", cache = TRUE)
+v15 <- load_variables(2015, "acs5", cache = TRUE)
+v09 <- load_variables(2009, "acs5", cache = TRUE)
+
+# select acs variables for country of birth
+regional_splits <- v17 %>%
+  filter(str_detect(name, "^B05006_")) %>%
+  select(name, label) %>%
+  separate(
+    label, 
+    into=c("Estimate", "Total", "Continent", "Subcontinent", "Country", "Subcountry"), 
+    sep="!!"
+    ) %>%
+  mutate(
+    pop_label = case_when(
+      is.na(Continent) ~ "TotalForeignBorn",
+      is.na(Subcontinent) ~ Continent,
+      is.na(Country) ~ Subcontinent,
+      is.na(Subcountry) ~ Country,
+      TRUE ~ Subcountry
+    ),
+  ) %>%
+  select(name, pop_label)
+acs_vars <- regional_splits$name
+names(acs_vars) <- regional_splits$pop_label
+
+# select acs variables for language
+language_splits <- v15 %>%
+  filter(str_detect(name, "^B16001_")) %>%
+  select(name, label) %>%
+  separate(
+    label, 
+    into=c("Estimate", "Total", "MainLanguage", "EnglishAbility"), 
+    sep="!!"
+  ) %>%
+  mutate(
+    lang_label = case_when(
+      is.na(MainLanguage) ~ "TotalSpeakersOver5",
+      is.na(EnglishAbility) ~ str_c("HouseholdLanguage:",MainLanguage),
+      TRUE ~ str_c("HouseholdLanguage:",MainLanguage,":",EnglishAbility)
+    )
+  )
+lang_vars <- language_splits$name
+names(lang_vars) <- language_splits$lang_label
+
+# select acs variables for birth x nativity
+status_splits <- v17 %>%
+  filter(str_detect(name, "^B05002_")) %>%
+  select(name, label) 
+status_vars <- status_splits$name
+names(status_vars) <- status_splits$label
+
+# select acs variables for non-response reason
+house_nonresponse_splits <- v17 %>%
+  filter(str_detect(name, "^B98021_")) %>%
+  select(name, label) %>%
+  separate(
+    label, 
+    into=c("Estimate", "Rate", "Reason"), 
+    sep="!!"
+  ) %>%
+  mutate(
+    response_label = case_when(
+      is.na(Reason) & Rate == "Response Rate" ~ "HouseResponseRate_Total",
+      is.na(Reason) ~ "HouseNonResponseRate_Total",
+      TRUE ~ str_c("HouseNonResponseRate_",Reason)
+    )
+  )
+house_nonresponse_vars <- house_nonresponse_splits$name
+names(house_nonresponse_vars) <- house_nonresponse_splits$response_label
+# select acs variables for non-response reason
+house_nonresponse_splits <- v09 %>%
+  filter(str_detect(name, "^B98021_")) %>%
+  select(name, label) %>%
+  separate(
+    label, 
+    into=c("Estimate", "Rate", "Reason"), 
+    sep="!!"
+  ) %>%
+  mutate(
+    response_label = case_when(
+      is.na(Reason) & Rate == "Response Rate" ~ "HouseResponseRate_Total",
+      is.na(Reason) ~ "HouseNonResponseRate_Total",
+      TRUE ~ str_c("HouseNonResponseRate_",Reason)
+    )
+  )
+house_nonresponse_vars <- house_nonresponse_splits$name
+names(house_nonresponse_vars) <- house_nonresponse_splits$response_label
+
+groupquarters_nonresponse_splits <- v09 %>%
+  filter(str_detect(name, "^B98022_")) %>%
+  select(name, label) %>%
+  separate(
+    label, 
+    into=c("Estimate", "Rate", "Reason"), 
+    sep="!!"
+  ) %>%
+  mutate(
+    response_label = case_when(
+      is.na(Reason) & Rate == "Response Rate" ~ "GroupQuartersResponseRate_Total",
+      is.na(Reason) ~ "GroupQuartersNonResponseRate_Total",
+      TRUE ~ str_c("GroupQuartersNonResponseRate_",Reason)
+    )
+  )
+groupquarters_nonresponse_vars <- groupquarters_nonresponse_splits$name
+names(groupquarter_nonresponse_vars) <- groupquarter_nonresponse_splits$response_label
+
+
+# combine all variables of interest
+# acs_vars <- c("TotalPop" = "B01003_001", acs_vars, lang_vars)
+
+# get block group total population data from api
+total_pop_bg <- get_acs('block group',
+                     variables = c("TotalPop" = "B01003_001"), 
+                     state = "DC",
+                     county = "District of Columbia", 
+                     geometry = TRUE,
+                     output="wide")
+st_write(total_pop_bg, 
+         file.path(here(),"data","raw", "bg_level_population.gpkg"),
+         delete_dsn=TRUE)
+
+# get tract total population data from api
+total_pop_tract <- get_acs('tract',
+                     variables = c("TotalPop" = "B01003_001"), 
+                     state = "DC",
+                     county = "District of Columbia", 
+                     geometry = TRUE,
+                     output="wide")
+st_write(total_pop_tract, 
+         file.path(here(),"data","raw", "tract_level_population.gpkg"),
+         delete_dsn=TRUE)
+
+# get tract-level nativity data from api
+nativity_data <- get_acs('tract',
+                     variables = acs_vars, 
+                     state = "DC",
+                     county = "District of Columbia", 
+                     geometry = TRUE,
+                     output="wide")
+st_write(nativity_data, 
+         file.path(here(),"data","raw", "tract_level_nativity.gpkg"),
+         delete_dsn=TRUE)
+
+# get tract-level language data from api (note: from 2015)
+language_data <- get_acs('tract',
+                         variables = lang_vars, 
+                         state = "DC",
+                         county = "District of Columbia", 
+                         geometry = TRUE,
+                         year = 2015,
+                         output="wide")
+st_write(language_data, 
+         file.path(here(),"data","raw", "tract_level_language_2015.gpkg"),
+         delete_dsn=TRUE)
+
+
+# get tract-level citizenship status data from api
+status_data <- get_acs('tract',
+                         variables = status_vars, 
+                         state = "DC",
+                         county = "District of Columbia", 
+                         geometry = TRUE,
+                         output="wide")
+st_write(status_data,
+         file.path(here(),"data","raw", "tract_level_citizen_status.gpkg"),
+         delete_dsn=TRUE)
+
+
+# get tract-level household nonresponse data from api
+house_response_data <- get_acs('tract',
+                               variables = house_nonresponse_vars, 
+                               state = "DC",
+                               county = "District of Columbia", 
+                               geometry = TRUE,
+                               output="wide",
+                               year=2009)
+st_write(house_response_data,
+         file.path(here(),"data","raw", "tract_level_house_reponse_2009.gpkg"),
+         delete_dsn=TRUE)
+
+
+# get tract-level group quarters nonresponse data from api
+gq_response_data <- get_acs('tract',
+                               variables = groupquarters_nonresponse_vars, 
+                               state = "DC",
+                               county = "District of Columbia", 
+                               geometry = TRUE,
+                               output="wide",
+                               year=2009)
+st_write(gq_response_data,
+         file.path(here(),"data","raw", "tract_level_group_quarters_reponse_2009.gpkg"),
+         delete_dsn=TRUE)
+
diff --git a/...ets/dc_block_groups_polygons_2019.geojson → ...old/dc_block_groups_polygons_2019.geojson b/...ets/dc_block_groups_polygons_2019.geojson → ...old/dc_block_groups_polygons_2019.geojson
diff --git a/..._planning_database_language_variables.csv → ..._planning_database_language_variables.csv b/..._planning_database_language_variables.csv → ..._planning_database_language_variables.csv
diff --git a/data/raw/bg_level_population.gpkg b/data/raw/bg_level_population.gpkg
diff --git a/data/raw/tract_level_citizen_status.gpkg b/data/raw/tract_level_citizen_status.gpkg
diff --git a/data/raw/tract_level_group_quarters_reponse_2009.gpkg b/data/raw/tract_level_group_quarters_reponse_2009.gpkg
diff --git a/data/raw/tract_level_language_2015.gpkg b/data/raw/tract_level_language_2015.gpkg
diff --git a/data/raw/tract_level_nativity.gpkg b/data/raw/tract_level_nativity.gpkg
diff --git a/data/raw/tract_level_population.gpkg b/data/raw/tract_level_population.gpkg
diff --git a/data/static/README.md b/data/static/README.md
@@ -0,0 +1,19 @@
+# Static Data files
+
+### dc_planning_database_subset.csv
+
+The subset of planning database variables was generated from the Block Group Planning Database available on the Census Bureau website: https://www.census.gov/topics/research/guidance/planning-databases.html
+
+The data was first filtered to just rows for just the District of Columbia (FIPS code 11) and then filtered for columns of interest. 
+
+For example, the current version was generated with:
+
+```bash
+awk -vFPAT='[^,]*|"[^"]*"' '{ if ($2 == 11) { print } }' pdb2019bgv3_us.csv >> dc_blockgroups.csv
+
+awk -vFPAT='[^,]*|"[^"]*"' '{ print $1 "," $2 "," $3 "," $4 "," $5 "," $6 "," $7 "," $8 "," $9 "," $10 "," $11 "," $12 "," $13 "," $14 "," $15 "," $32 "," $35 "," $38 "," $41 "," $47 "," $50 "," $53 "," $56 "," $59 "," $67 "," $69 "," $91 "," $93 "," $95 "," $97 "," $99 "," $187 "," $188 "," $205 "," $208 "," $211 "," $214 "," $219 "," $220 "," $222 "," $223 "," $225 "," $226 "," $228 "," $229 "," $231 "," $232 "," $234 "," $235 "," $237 "," $238 "," $264 "," $266 "," $268 "," $270 "," $272 }' dc_blockgroups.csv > dc_planning_database_subset.csv
+```
+
+### hofstede-6-dimensions-0-100.csv
+
+Contains measurements of Hofstede's 6 Cultural Dimensions for a range of countries.