From db3c17db22f4a592f2a833022909d85dab3f3b24 Mon Sep 17 00:00:00 2001 From: mle2718 Date: Wed, 30 Oct 2024 14:50:14 -0400 Subject: [PATCH] initial commit of A10 data prep file. --- Amendment10/writing/Amendment10_data_prep.Rmd | 159 ++++++++++++++++++ 1 file changed, 159 insertions(+) create mode 100644 Amendment10/writing/Amendment10_data_prep.Rmd diff --git a/Amendment10/writing/Amendment10_data_prep.Rmd b/Amendment10/writing/Amendment10_data_prep.Rmd new file mode 100644 index 0000000..c67e4ea --- /dev/null +++ b/Amendment10/writing/Amendment10_data_prep.Rmd @@ -0,0 +1,159 @@ +--- +title: "Am10_dataprep" +author: "Min-Yang Lee" +date: "2024-10-28" +output: html_document +--- + + +1. You must be able to see this folder ``nefscfile\BLAST\READ-SSB-Lee-MRIP-BLAST\data_folder\raw``. It contains the raw MRIP data that has been imported into Rds format. These were constructed with ``https://github.com/NEFSC/READ-SSB-MRIP-BLAST/R_code/data_extracting_processing/extraction/pull_in_MRIP.R``. + + + +```{r global_options, include=FALSE} +library("here") +library("data.table") +library("tidyverse") +library("knitr") +library("lubridate") +library("scales") +library("fredr") +library("kableExtra") +library("censusapi") +library("tigris") +library("mapview") +library("haven") +library("survey") +library("srvyr") + +#turn off scientific notation +options(scipen=999) + +#Handle single PSUs +options(survey.lonely.psu = "certainty") +options(tigris_use_cache = TRUE) + +here::i_am("writing/Amendment10_data_prep.Rmd") + +############################################################################# +#knitr options + +knitr::opts_chunk$set(echo=FALSE, warning = FALSE, error = FALSE, message = FALSE, comment = FALSE, cache = FALSE, progress = TRUE, verbose = FALSE, + dpi = 600) +options(tinytex.verbose = TRUE) +# options(knitr.table.format = "latex") +############################################################################# + + +# RFA data +RFA_filepath<-file.path("//nefscdata","RFA_EO12866_Guidelines" ,"Ownership Data", "current data and metadata","affiliates_2024_06_01.Rdata") +BLAST_raw_filepath<-file.path("//nefscfile","BLAST" ,"READ-SSB-Lee-MRIP-BLAST", "data_folder","raw") +local_BLAST_folder<-file.path("V:","READ-SSB-Lee-MRIP-BLAST") +network_BLAST_folder<-file.path("//nefscfile","BLAST" ,"READ-SSB-Lee-MRIP-BLAST") + + + + + +############################################################################# +# The census data that I'm using reports out 2022 Inflation adjusted dollars. Therefore, it makes sense to adjust everything to base year 2022. + +deflate_by <- "year" +base_year = 2022 # base year for GDP deflator - Maximum = 2020, Minimum = 1947 - max(GDPDEF_quarterly$Year) + + +vintage_string<-Sys.Date() +vintage_string<-gsub("-","_",vintage_string) + +``` + + +```{r get_census, eval=TRUE} +# Pull in and tidy up some Census data on household income +income <- getCensus( + name = "acs/acs5/subject", + vintage = 2022, + vars = c("S1901_C01_001E", "S1901_C01_010E","S1901_C01_011E","S1901_C01_012E","S1901_C01_013E","S0101_C01_001E", "S0101_C02_028E","S0601_C01_001E","S0601_C01_022E"), + region = "zip code tabulation area:*", + show_call=FALSE) + +income<-income %>% + rename ( households=S1901_C01_001E, + household_inc_150_200pct =S1901_C01_010E, + household_inc_200pct=S1901_C01_011E, + household_median_inc=S1901_C01_012E, + household_mean_inc=S1901_C01_013E, + total_population=S0101_C01_001E, + population_pct_over60=S0101_C02_028E, + total_population_s0601=S0601_C01_001E, + pct_white_alone =S0601_C01_022E + ) %>% + mutate(obscured_median=case_when(household_median_inc<0 ~ 1, .default=0), + obscured_mean=case_when(household_mean_inc<0 ~ 1, .default=0) + ) + + +# Read in the state-zcta correspondence file. +zcta_state<-read_delim("https://www2.census.gov/geo/docs/maps-data/data/rel2020/zcta520/tab20_zcta520_county20_natl.txt", delim="|", guess_max=2000) + +# Contract down to the fraction in each area. +zcta_state <- zcta_state %>% + dplyr::filter(is.na(OID_ZCTA5_20)==FALSE) %>% + group_by(OID_ZCTA5_20) %>% + mutate(total_area=sum(AREALAND_PART)) %>% + mutate(fraction=AREALAND_PART/total_area) %>% + select(c(GEOID_ZCTA5_20, OID_ZCTA5_20,OID_COUNTY_20, GEOID_COUNTY_20, fraction)) + + + +# Pull in and tidy up some Census data on household income at the state level +county_income <- getCensus( + name = "acs/acs5/subject", + vintage = 2022, + vars = c("S1901_C01_012E","S1901_C01_013E"), + region = "county:*", + show_call=FALSE) + +county_income<-county_income %>% + rename (county_household_median_inc=S1901_C01_012E, + county_household_mean_inc=S1901_C01_013E + ) %>% + mutate(st_co=paste0(state,county)) + + +# use county income to create an in imputed income based on county income +income_fill<-zcta_state %>% + left_join(county_income, by=join_by(GEOID_COUNTY_20==st_co), relationship="many-to-one") %>% + mutate(wmedian=fraction*county_household_median_inc, + wmean=fraction*county_household_mean_inc) %>% + group_by(GEOID_ZCTA5_20, OID_ZCTA5_20) %>% + summarise(imputed_household_median_income=sum(wmedian), + imputed_household_mean_income=sum(wmean)) %>% + ungroup() + +income<-income %>% + left_join(income_fill, by=join_by(zip_code_tabulation_area==GEOID_ZCTA5_20), relationship="one-to-one")%>% + mutate(household_median_inc=case_when(obscured_median==1 ~ imputed_household_median_income, .default=household_median_inc), + household_mean_inc=case_when(obscured_mean<0 ~ imputed_household_mean_income, .default=household_mean_inc) + ) %>% + select(-c(imputed_household_mean_income,imputed_household_median_income)) + +summary(income) + + +saveRDS(income,file=here("data_folder", "main", paste0("income_",vintage_string,".Rds"))) + +saveRDS(zcta_state,file=here("data_folder", "main", paste0("zcta_state",vintage_string,".Rds"))) + +``` + + + +```{r get_MRIP, eval=TRUE} + + + + + +``` +