Merge pull request #315 from PSLmodels/PR-prepare-state-basefile

PR prepare state basefile - a prelude to preparing state targets
PSLmodels · Nov 28, 2024 · 1e4bff0 · 1e4bff0
2 parents 6e78974 + 22697d1
commit 1e4bff0
Show file tree

Hide file tree

Showing 10 changed files with 455 additions and 325 deletions.
diff --git a/tmd/areas/targets/prepare/prepare_states/_quarto.yml b/tmd/areas/targets/prepare/prepare_states/_quarto.yml
@@ -6,9 +6,7 @@ project:
 
 
 # site info:
-  # OLD id: 4d646266-9d1f-4d69-acb4-b9a17b63a5ff
-  # Unique deploy URL:  https://671e13320a7e7cfb68b1ba7d--tmd-areas-prepare-targets.netlify.app
-  # url: https://tmd-areas-targets-prepare.netlify.app
+  # OLD Unique deploy URL:  https://671e13320a7e7cfb68b1ba7d--tmd-areas-prepare-targets.netlify.app
   # url: https://tmd-areas-prepare-state-targets.netlify.app
 
 # publishing with netlify cli:
@@ -20,7 +18,7 @@ project:
 
 # or step by step
 #  netlify deploy # to test it, give _examine as publish directory
-#  netlify deploy --prod   # to deploy, give _docs as publish directory
+#  netlify deploy --prod   # to deploy, give _web as publish directory
 
 execute:
   eval: true
@@ -39,16 +37,19 @@ book:
     - part: "Usage"
       chapters:
         - usage.qmd
-    - part: "IRS SOI State data"
+    - part: "Get and clean data"
       chapters:
-        # - cd_download_and_clean_census_population_data.qmd
+        - download_clean_save_census_state_population_data.qmd
         - download_soi_data.qmd
         - construct_soi_documentation.qmd
         - construct_long_soi_data_file.qmd
-    - part: "SALT analysis"
+    - part: "Analysis of SALT variables and other issues"
       chapters:
         - developing_SALT_targets.qmd
-        # - explore_soi_data.qmd
+    - part: "Create basefile for state targets"
+      chapters: 
+        - create_state_targets_basefile.qmd
+        # old files - may be useful as a base for new work
         # - cd_create_variable_mapping.qmd
         # - cd_compare_us_totals_tmd_vs_irs_published.qmd
         # - cd_enhance_basefile_with_special_targets.qmd
@@ -65,6 +66,7 @@ editor_options:
   chunk_output_type: console
 
 # 5 states to do:  NJ, NM, VA, AK, MN  
+# also, SC for Jason
 
 # rendering commands
 #   quarto render

diff --git a/tmd/areas/targets/prepare/prepare_states/cd_download_and_clean_census_population_data.qmd b/tmd/areas/targets/prepare/prepare_states/cd_download_and_clean_census_population_data.qmd
diff --git a/tmd/areas/targets/prepare/prepare_states/construct_long_soi_data_file.qmd b/tmd/areas/targets/prepare/prepare_states/construct_long_soi_data_file.qmd
@@ -23,7 +23,7 @@ source(here::here("R", "functions.R"))
 
 agilabels <- read_delim(
 "agistub; agilo; agihi; agilabel
-0; -9E+99; 0; Total
+0; -9E+99; 9e99; Total
 1; -9E+99; 1; Under $1
 2; 1; 10000; $1 under $10,000
 3; 10000; 25000; $10,000 under $25,000
@@ -42,15 +42,14 @@ write_csv(agilabels, fs::path(DINTERMEDIATE, "agilabels.csv"))
 ```
 
 
+## Read and save raw SOI data
+
 ```{r}
 #| label: read-data
 #| output: false
 
-# variable_descriptions <- read_csv(fs::path(DINTERMEDIATE, "soi_documentation.csv"))
-variable_descriptions <- read_csv(fs::path(DINTERMEDIATE, "soi_documentation_by_year.csv"))
-agilabels <- read_csv(fs::path(DINTERMEDIATE, "agilabels.csv"))
-
-csvfiles <- dir_ls(DRAW, glob="*.csv")
+soi_csvfiles <- dir_ls(DRAW, glob="*.csv") |> 
+  str_subset("statepop", negate = TRUE)
 
 get_csvdata <- function(csvfile){
   year <- paste0("20", str_sub(fs::path_file(csvfile), 1, 2))
@@ -59,7 +58,7 @@ get_csvdata <- function(csvfile){
     mutate(year=year)
 }
 
-csvdata <- purrr::map(csvfiles, get_csvdata) |> 
+csvdata <- purrr::map(soi_csvfiles, get_csvdata) |> 
   list_rbind()
 count(csvdata, year)
 
@@ -69,93 +68,94 @@ csvdata2 <- csvdata |>
   mutate(year=as.integer(year)) |> 
   pivot_longer(-c(stabbr, year, agistub),
                names_to = "vname") |> 
-  filter(!is.na(value)) |> 
-  left_join(variable_descriptions,
-            by = join_by(vname, year)) |> 
-  left_join(agilabels, by = join_by(agistub)) |> 
-  mutate(value=ifelse(vtype=="amount", value * 1000, value)) |> 
-  select(stabbr, vname, basevname, vtype, agistub, agilo, agihi, agilabel, year, value, udescription, description) |> 
-  arrange(stabbr, vname, basevname, vtype, agistub, year)
+  filter(!is.na(value))
+saveRDS(csvdata2, fs::path(DINTERMEDIATE, "soilong_raw.rds"))
 
-skim(csvdata2)
-glimpse(csvdata2)
-count(csvdata2, vtype)
-csvdata2 |> filter(value==max(value))
-csvdata2 |> filter(value==min(value))
+```
+
+## Enhance the raw data with derived variables
+
+
+```{r}
+#| label: add-18400
+#| output: false
 
-saveRDS(csvdata2, fs::path(DINTERMEDIATE, "soilong.rds"))
+soilong_raw <- readRDS(fs::path(DINTERMEDIATE, "soilong_raw.rds"))
+glimpse(soilong_raw)
 
+# sanity check: pull the rows whose variable code (vname minus its first character) is 18425 or 18450, for inspection
+check <- soilong_raw |> 
+  filter(str_sub(vname, 2, -1) %in% c("18425", "18450"))
+
+#.. 18400 State and local income or sales tax (estimated): derived here as the sum of the 18425 and 18450 rows
+est18400 <- soilong_raw |> 
+  filter(str_sub(vname, 2, -1) %in% c("18425", "18450")) |> # same filter as `check` above
+  mutate(vname=paste0(str_sub(vname, 1, 1), "18400")) |> # keep the one-character prefix, relabel the code as 18400
+  summarise(value=sum(value), # sum() has no na.rm; NOTE(review): relies on NA values having been dropped before soilong_raw.rds was saved -- confirm
+            .by=c(stabbr, agistub, year, vname)) # one row per state, AGI stub, year and derived vname
+glimpse(est18400)
+skim(est18400)
+
+glimpse(soilong_raw)
+
+soilong1 <- bind_rows(soilong_raw,
+                      est18400)
 
 ```
 
+
+## Add labels and write final long file
+
 ```{r}
-#| label: info-for-targets
-#| eval: false
+#| label: soilong-final
+#| output: false
 
-# to be used later
+# variable_descriptions <- read_csv(fs::path(DINTERMEDIATE, "soi_documentation.csv"))
+variable_descriptions <- read_csv(fs::path(DINTERMEDIATE, "soi_documentation_by_year.csv"))
+agilabels <- read_csv(fs::path(DINTERMEDIATE, "agilabels.csv"))
 
-# varname: any Tax-Calculator input variable name plus any Tax-Calculator calculated variable in the list of cached variables in the tmd/storage/__init__.py file
-# count: integer in [0,4] range:
-# count==0 implies dollar total of varname is tabulated
-# count==1 implies number of tax units with any value of varname is tabulated
-# count==2 implies number of tax units with a nonzero value of varname is tabulated
-# count==3 implies number of tax units with a positive value of varname is tabulated
-# count==4 implies number of tax units with a negative value of varname is tabulated
-# scope: integer in [0,2] range:
-# scope==0 implies all tax units are tabulated
-# scope==1 implies only PUF-derived filing units are tabulated
-# scope==2 implies only CPS-derived filing units are tabulated
-# agilo: float representing lower bound of the AGI range (which is included in the range) that is tabulated.
-# agihi: float representing upper bound of the AGI range (which is excluded from the range) that is tabulated.
-# fstatus: integer in [0,5] range:
-# fstatus=0 implies all filing statuses are tabulated
-# other fstatus values imply just the tax units with the Tax-Calculator MARS variable equal to fstatus are included in the tabulation
-# target: target amount:
-# dollars if count==0
-# number of tax units if count>0
+soilong <- soilong1 |> 
+  left_join(variable_descriptions,
+            by = join_by(vname, year)) |> # attach documentation columns by variable name and year
+  left_join(agilabels, by = join_by(agistub)) |> # attach AGI range bounds and label for each agistub
+  mutate(value=ifelse(vtype=="amount", value * 1000, value)) |> # scale amounts by 1,000 (presumably reported in thousands -- confirm); NOTE(review): a row with no match in variable_descriptions has NA vtype, so ifelse() sets its value to NA -- confirm the derived 18400 names are documented
+  select(stabbr, vname, basevname, vtype, agistub, agilo, agihi, agilabel, year, value, udescription, description) |> 
+  arrange(stabbr, vname, basevname, vtype, agistub, year)
+
+skim(soilong)
+check <- count(soilong, basevname, vtype, vname, udescription)
+
+saveRDS(soilong, fs::path(DINTERMEDIATE, "soilong.rds"))
 
 ```
 
 
-Explore the data
+## Explore the data
 
 ```{r}
 #| label: explore
 #| output: false
 #| eval: false
 
 soilong <- readRDS(fs::path(DINTERMEDIATE, "soilong.rds"))
+glimpse(soilong)
+count(soilong, stabbr) # 54: 50 states, DC, PR, OA, US
 
 # data checks
 
 soilong |>
   filter(is.na(vtype)) |>
-  count(vname)
+  count(vname) # should be zero recs
 
 soilong |>
   filter(is.na(vtype)) |> 
-  skim()
+  skim() 
 
 # n17000 was entirely missing in some years;
 # rows with missing values have since been dropped
 variable_descriptions |> 
   filter(vname=="n17000") # Number of returns with Total medical and dental expense deduction
 
-ns(csvdata)
-csvdata |> 
-  filter(STATE=="NY", AGI_STUB==0) |> 
-  select(STATE, AGI_STUB, year, N1, N17000, A17000)
-
-#   STATE AGI_STUB year        N1 N17000  A17000
-#   <chr>    <dbl> <chr>    <dbl>  <dbl>   <dbl>
-# 1 NY           0 2015   9614610     NA      NA
-# 2 NY           0 2016   9589410 458840 4776918
-# 3 NY           0 2017   9694910 494520 5124475
-# 4 NY           0 2018   9742580 235020 4029879
-# 5 NY           0 2019   9760870 221060 3953827
-# 6 NY           0 2020  10159910 200990 3753634
-# 7 NY           0 2021   9813320 187220 3702585
-
 soilong |> 
   filter(stabbr=="NY", vname %in% c("n17000", "a17000"), agistub==0) |> 
   select(stabbr, agistub, vname, vtype, year, value, udescription) |> 
@@ -166,3 +166,31 @@ soilong |>
 ```
 
 
+
+```{r}
+#| label: info-for-targets
+#| eval: false
+
+# to be used later
+
+# varname: any Tax-Calculator input variable name plus any Tax-Calculator calculated variable in the list of cached variables in the tmd/storage/__init__.py file
+# count: integer in [0,4] range:
+# count==0 implies dollar total of varname is tabulated
+# count==1 implies number of tax units with any value of varname is tabulated
+# count==2 implies number of tax units with a nonzero value of varname is tabulated
+# count==3 implies number of tax units with a positive value of varname is tabulated
+# count==4 implies number of tax units with a negative value of varname is tabulated
+# scope: integer in [0,2] range:
+# scope==0 implies all tax units are tabulated
+# scope==1 implies only PUF-derived filing units are tabulated
+# scope==2 implies only CPS-derived filing units are tabulated
+# agilo: float representing lower bound of the AGI range (which is included in the range) that is tabulated.
+# agihi: float representing upper bound of the AGI range (which is excluded from the range) that is tabulated.
+# fstatus: integer in [0,5] range:
+# fstatus=0 implies all filing statuses are tabulated
+# other fstatus values imply just the tax units with the Tax-Calculator MARS variable equal to fstatus are included in the tabulation
+# target: target amount:
+# dollars if count==0
+# number of tax units if count>0
+
+```