Skip to content

Commit

Permalink
Merge pull request #315 from PSLmodels/PR-prepare-state-basefile
Browse files Browse the repository at this point in the history
PR prepare state basefile - a prelude to preparing state targets
  • Loading branch information
donboyd5 authored Nov 28, 2024
2 parents 6e78974 + 22697d1 commit 1e4bff0
Show file tree
Hide file tree
Showing 10 changed files with 455 additions and 325 deletions.
18 changes: 10 additions & 8 deletions tmd/areas/targets/prepare/prepare_states/_quarto.yml
Original file line number Diff line number Diff line change
Expand Up @@ -6,9 +6,7 @@ project:


# site info:
# OLD id: 4d646266-9d1f-4d69-acb4-b9a17b63a5ff
# Unique deploy URL: https://671e13320a7e7cfb68b1ba7d--tmd-areas-prepare-targets.netlify.app
# url: https://tmd-areas-targets-prepare.netlify.app
# OLD Unique deploy URL: https://671e13320a7e7cfb68b1ba7d--tmd-areas-prepare-targets.netlify.app
# url: https://tmd-areas-prepare-state-targets.netlify.app

# publishing with netlify cli:
Expand All @@ -20,7 +18,7 @@ project:

# or step by step
# netlify deploy # to test it, give _examine as publish directory
# netlify deploy --prod # to deploy, give _docs as publish directory
# netlify deploy --prod # to deploy, give _web as publish directory

execute:
eval: true
Expand All @@ -39,16 +37,19 @@ book:
- part: "Usage"
chapters:
- usage.qmd
- part: "IRS SOI State data"
- part: "Get and clean data"
chapters:
# - cd_download_and_clean_census_population_data.qmd
- download_clean_save_census_state_population_data.qmd
- download_soi_data.qmd
- construct_soi_documentation.qmd
- construct_long_soi_data_file.qmd
- part: "SALT analysis"
- part: "Analysis of SALT variables and other issues"
chapters:
- developing_SALT_targets.qmd
# - explore_soi_data.qmd
- part: "Create basefile for state targets"
chapters:
- create_state_targets_basefile.qmd
# old files; may be useful as a base for new work
# - cd_create_variable_mapping.qmd
# - cd_compare_us_totals_tmd_vs_irs_published.qmd
# - cd_enhance_basefile_with_special_targets.qmd
Expand All @@ -65,6 +66,7 @@ editor_options:
chunk_output_type: console

# 5 states to do: NJ, NM, VA, AK, MN
# also, SC for Jason

# rendering commands
# quarto render
Expand Down

This file was deleted.

Original file line number Diff line number Diff line change
Expand Up @@ -23,7 +23,7 @@ source(here::here("R", "functions.R"))
agilabels <- read_delim(
"agistub; agilo; agihi; agilabel
0; -9E+99; 0; Total
0; -9E+99; 9e99; Total
1; -9E+99; 1; Under $1
2; 1; 10000; $1 under $10,000
3; 10000; 25000; $10,000 under $25,000
Expand All @@ -42,15 +42,14 @@ write_csv(agilabels, fs::path(DINTERMEDIATE, "agilabels.csv"))
```


## Read and save raw SOI data

```{r}
#| label: read-data
#| output: false
# variable_descriptions <- read_csv(fs::path(DINTERMEDIATE, "soi_documentation.csv"))
variable_descriptions <- read_csv(fs::path(DINTERMEDIATE, "soi_documentation_by_year.csv"))
agilabels <- read_csv(fs::path(DINTERMEDIATE, "agilabels.csv"))
csvfiles <- dir_ls(DRAW, glob="*.csv")
# List the csv files in DRAW, then drop any whose path matches "statepop"
# (negate = TRUE keeps only the non-matching paths).
soi_csvfiles <- dir_ls(DRAW, glob="*.csv") |>
str_subset("statepop", negate = TRUE)
get_csvdata <- function(csvfile){
year <- paste0("20", str_sub(fs::path_file(csvfile), 1, 2))
Expand All @@ -59,7 +58,7 @@ get_csvdata <- function(csvfile){
mutate(year=year)
}
csvdata <- purrr::map(csvfiles, get_csvdata) |>
csvdata <- purrr::map(soi_csvfiles, get_csvdata) |>
list_rbind()
count(csvdata, year)
Expand All @@ -69,93 +68,94 @@ csvdata2 <- csvdata |>
mutate(year=as.integer(year)) |>
pivot_longer(-c(stabbr, year, agistub),
names_to = "vname") |>
filter(!is.na(value)) |>
left_join(variable_descriptions,
by = join_by(vname, year)) |>
left_join(agilabels, by = join_by(agistub)) |>
mutate(value=ifelse(vtype=="amount", value * 1000, value)) |>
select(stabbr, vname, basevname, vtype, agistub, agilo, agihi, agilabel, year, value, udescription, description) |>
arrange(stabbr, vname, basevname, vtype, agistub, year)
filter(!is.na(value))
saveRDS(csvdata2, fs::path(DINTERMEDIATE, "soilong_raw.rds"))
skim(csvdata2)
glimpse(csvdata2)
count(csvdata2, vtype)
csvdata2 |> filter(value==max(value))
csvdata2 |> filter(value==min(value))
```

## Enhance the raw data with derived variables


```{r}
#| label: add-18400
#| output: false
saveRDS(csvdata2, fs::path(DINTERMEDIATE, "soilong.rds"))
soilong_raw <- readRDS(fs::path(DINTERMEDIATE, "soilong_raw.rds"))
glimpse(soilong_raw)
# investigate the data to make sure it is correct
check <- soilong_raw |>
filter(str_sub(vname, 2, -1) %in% c("18425", "18450"))
#.. 18400 State and local income or sales tax (estimated)
# Build an estimated 18400 series by adding together the 18425 and 18450
# variables within each state, AGI stub, and year.
est18400 <- soilong_raw |>
# keep rows whose code (vname without its first character) is 18425 or 18450
filter(str_sub(vname, 2, -1) %in% c("18425", "18450")) |>
# relabel both codes as 18400, keeping the original first character of vname
mutate(vname=paste0(str_sub(vname, 1, 1), "18400")) |>
# sum the components for each relabeled vname
# NOTE(review): sum() is called without na.rm, so an NA value would make the
# group total NA -- presumably NAs were already dropped upstream; confirm
summarise(value=sum(value),
.by=c(stabbr, agistub, year, vname))
# interactive checks of the derived rows and of the raw input
glimpse(est18400)
skim(est18400)
glimpse(soilong_raw)
# append the derived 18400 rows to the raw long data
soilong1 <- bind_rows(soilong_raw,
est18400)
```


## Add labels and write final long file

```{r}
#| label: info-for-targets
#| eval: false
#| label: soilong-final
#| output: false
# to be used later
# variable_descriptions <- read_csv(fs::path(DINTERMEDIATE, "soi_documentation.csv"))
variable_descriptions <- read_csv(fs::path(DINTERMEDIATE, "soi_documentation_by_year.csv"))
agilabels <- read_csv(fs::path(DINTERMEDIATE, "agilabels.csv"))
# varname: any Tax-Calculator input variable name plus any Tax-Calculator calculated variable in the list of cached variables in the tmd/storage/__init__.py file
# count: integer in [0,4] range:
# count==0 implies dollar total of varname is tabulated
# count==1 implies number of tax units with any value of varname is tabulated
# count==2 implies number of tax units with a nonzero value of varname is tabulated
# count==3 implies number of tax units with a positive value of varname is tabulated
# count==4 implies number of tax units with a negative value of varname is tabulated
# scope: integer in [0,2] range:
# scope==0 implies all tax units are tabulated
# scope==1 implies only PUF-derived filing units are tabulated
# scope==2 implies only CPS-derived filing units are tabulated
# agilo: float representing lower bound of the AGI range (which is included in the range) that is tabulated.
# agihi: float representing upper bound of the AGI range (which is excluded from the range) that is tabulated.
# fstatus: integer in [0,5] range:
# fstatus=0 implies all filing statuses are tabulated
# other fstatus values imply just the tax units with the Tax-Calculator MARS variable equal to fstatus are included in the tabulation
# target: target amount:
# dollars if count==0
# number of tax units if count>0
# Attach variable documentation and AGI-range labels to the long data,
# rescale amounts, and save the result as soilong.rds.
soilong <- soilong1 |>
# documentation is matched on both variable name and year
left_join(variable_descriptions,
by = join_by(vname, year)) |>
# AGI range bounds and label are matched on agistub
left_join(agilabels, by = join_by(agistub)) |>
# multiply by 1000 where vtype is "amount"; other vtypes are left unchanged
# (presumably raw amounts are in thousands of dollars -- TODO confirm)
# NOTE(review): a vname/year with no match in variable_descriptions has
# vtype NA, and ifelse() then returns NA for value -- confirm that every
# variable, including any derived upstream, has a documentation row
mutate(value=ifelse(vtype=="amount", value * 1000, value)) |>
select(stabbr, vname, basevname, vtype, agistub, agilo, agihi, agilabel, year, value, udescription, description) |>
arrange(stabbr, vname, basevname, vtype, agistub, year)
# interactive summary of the final data
skim(soilong)
# one row per distinct variable/description combination, for manual review
check <- count(soilong, basevname, vtype, vname, udescription)
saveRDS(soilong, fs::path(DINTERMEDIATE, "soilong.rds"))
```


Explore the data
## Explore the data

```{r}
#| label: explore
#| output: false
#| eval: false
soilong <- readRDS(fs::path(DINTERMEDIATE, "soilong.rds"))
glimpse(soilong)
count(soilong, stabbr) # 54: 50 states, DC, PR, OA, US
# data checks
soilong |>
filter(is.na(vtype)) |>
count(vname)
count(vname) # should be zero recs
soilong |>
filter(is.na(vtype)) |>
skim()
skim()
# n17000 had been one of the variables whose values were all missing in some years
# we have since dropped all missing values
variable_descriptions |>
filter(vname=="n17000") # Number of returns with Total medical and dental expense deduction
ns(csvdata)
csvdata |>
filter(STATE=="NY", AGI_STUB==0) |>
select(STATE, AGI_STUB, year, N1, N17000, A17000)
# STATE AGI_STUB year N1 N17000 A17000
# <chr> <dbl> <chr> <dbl> <dbl> <dbl>
# 1 NY 0 2015 9614610 NA NA
# 2 NY 0 2016 9589410 458840 4776918
# 3 NY 0 2017 9694910 494520 5124475
# 4 NY 0 2018 9742580 235020 4029879
# 5 NY 0 2019 9760870 221060 3953827
# 6 NY 0 2020 10159910 200990 3753634
# 7 NY 0 2021 9813320 187220 3702585
soilong |>
filter(stabbr=="NY", vname %in% c("n17000", "a17000"), agistub==0) |>
select(stabbr, agistub, vname, vtype, year, value, udescription) |>
Expand All @@ -166,3 +166,31 @@ soilong |>
```



```{r}
#| label: info-for-targets
#| eval: false
# to be used later
# varname: any Tax-Calculator input variable name plus any Tax-Calculator calculated variable in the list of cached variables in the tmd/storage/__init__.py file
# count: integer in [0,4] range:
# count==0 implies dollar total of varname is tabulated
# count==1 implies number of tax units with any value of varname is tabulated
# count==2 implies number of tax units with a nonzero value of varname is tabulated
# count==3 implies number of tax units with a positive value of varname is tabulated
# count==4 implies number of tax units with a negative value of varname is tabulated
# scope: integer in [0,2] range:
# scope==0 implies all tax units are tabulated
# scope==1 implies only PUF-derived filing units are tabulated
# scope==2 implies only CPS-derived filing units are tabulated
# agilo: float representing lower bound of the AGI range (which is included in the range) that is tabulated.
# agihi: float representing upper bound of the AGI range (which is excluded from the range) that is tabulated.
# fstatus: integer in [0,5] range:
# fstatus=0 implies all filing statuses are tabulated
# other fstatus values imply just the tax units with the Tax-Calculator MARS variable equal to fstatus are included in the tabulation
# target: target amount:
# dollars if count==0
# number of tax units if count>0
```
Loading

0 comments on commit 1e4bff0

Please sign in to comment.