-
Notifications
You must be signed in to change notification settings - Fork 6
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Merge pull request #270 from PSLmodels/examine-initial
Initial pr for examine results for areas
- Loading branch information
Showing
11 changed files
with
1,273 additions
and
0 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,18 @@ | ||
# examine | ||
|
||
# folders to ignore | ||
.Rproj.user/ | ||
.quarto/ | ||
_examine/ | ||
_freeze/ | ||
site_libs/ | ||
|
||
# files to ignore | ||
.Rhistory | ||
*.html | ||
|
||
# Local Netlify folder | ||
.netlify | ||
|
||
|
||
/.quarto/ |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,26 @@ | ||
|
||
# \\wsl.localhost\Ubuntu\home\donboyd5\Documents\python_projects\tax-microdata-benchmarking\tmd\storage\output | ||
TMDDIR <- here::here("..", "..", "..", "storage", "output") | ||
# list.files(TMDDIR) | ||
|
||
TARGETSDIR <- here::here("..", "..", "targets") | ||
WEIGHTSDIR <- here::here("..") | ||
# list.files(TARGETSDIR) | ||
# list.files(WEIGHTSDIR) | ||
|
||
# CDZIPURL <- "https://www.irs.gov/pub/irs-soi/congressional2021.zip" | ||
# CDDOCURL <- "https://www.irs.gov/pub/irs-soi/21incddocguide.docx" | ||
|
||
# \\wsl.localhost\Ubuntu\home\donboyd5\Documents\python_projects\tax-microdata-benchmarking\tmd\areas\weights\examine | ||
# \\wsl.localhost\Ubuntu\home\donboyd5\Documents\python_projects\tax-microdata-benchmarking\tmd\areas\targets\prepare | ||
# TARGETSPREPDIR <- here::here("..", "..", "targets", "prepare") | ||
# print(TARGETSPREPDIR) # Should print the absolute path to the folder | ||
# list.files(TARGETSPREPDIR) | ||
|
||
# CDDIR <- here::here("cds") | ||
# CDDIR <- fs::path(TARGETSPREPDIR, "cds") | ||
# CDRAW <- fs::path(CDDIR, "raw_data") | ||
# CDINTERMEDIATE <- fs::path(CDDIR, "intermediate") | ||
# CDFINAL <- fs::path(CDDIR, "final") | ||
# list.files(CDFINAL) | ||
# CDDOCEXTRACT <- "cd_documentation_extracted_from_21incddocguide.docx.xlsx" |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,79 @@ | ||
# libraries --------------------------------------------------------------- | ||
|
||
library(DT) | ||
library(fs) | ||
library(gt) | ||
library(knitr) | ||
library(readxl) | ||
library(skimr) | ||
library(stringr) | ||
library(tidyverse) | ||
# includes: dplyr, forcats, ggplot2, lubridate, purrr, stringr, tibble, tidyr | ||
|
||
tprint <- 75 # default tibble print | ||
options(tibble.print_max = tprint, tibble.print_min = tprint) # show up to tprint rows | ||
|
||
library(janitor) | ||
# census_api_key("YOUR_CENSUS_API_KEY", install = TRUE) # run once; the key is stored in .Renviron and must never be committed | ||
# libraries needed for census population | ||
library(sf) | ||
library(tidycensus) | ||
library(tigris) | ||
options(tigris_use_cache = TRUE) | ||
library(vroom) | ||
|
||
|
||
# possible libraries ------------------------------------------------------ | ||
|
||
# library(rlang) | ||
# library(tidyverse) | ||
# tprint <- 75 # default tibble print | ||
# options(tibble.print_max = tprint, tibble.print_min = tprint) # show up to tprint rows | ||
# | ||
# library(fs) | ||
|
||
# tools | ||
# library(vroom) | ||
# library(readxl) | ||
# library(openxlsx) # for writing xlsx files | ||
# library(lubridate) | ||
# library(RColorBrewer) | ||
# library(RcppRoll) | ||
# library(fredr) | ||
# library(tidycensus) | ||
# library(googledrive) | ||
# library(arrow) | ||
# | ||
# library(jsonlite) | ||
# library(tidyjson) | ||
# | ||
# | ||
# # boyd libraries | ||
# # library(btools) | ||
# # library(bdata) | ||
# # library(bggtools) | ||
# # library(bmaps) | ||
# | ||
# # graphics | ||
# library(scales) | ||
# library(ggbeeswarm) | ||
# library(patchwork) | ||
# library(gridExtra) | ||
# library(ggrepel) | ||
# library(ggbreak) | ||
# | ||
# # tables | ||
# library(knitr) | ||
# library(kableExtra) | ||
# library(DT) | ||
# library(gt) | ||
# library(gtExtras) | ||
# library(janitor) | ||
# library(skimr) | ||
# library(vtable) | ||
# | ||
# # maps | ||
# library(maps) | ||
# # https://cran.r-project.org/web/packages/usmap/vignettes/mapping.html | ||
# library(usmap) | ||
|
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,63 @@ | ||
project: | ||
type: book | ||
output-dir: _examine | ||
|
||
# https://prerelease.quarto.org/ # quarto documentation at this link | ||
|
||
# publishing with netlify cli: | ||
# open terminal in examine | ||
# quarto render && netlify deploy --prod --dir=_examine | ||
|
||
# quarto render # inspect to be sure it is as desired | ||
# netlify deploy --prod --dir=_examine | ||
|
||
# or step by step | ||
# netlify deploy # to test it, give _examine as publish directory | ||
# netlify deploy --prod # to deploy, give _examine as publish directory | ||
|
||
execute: | ||
eval: true | ||
echo: true | ||
output: true | ||
freeze: auto # auto: during global project renders, re-render only when source changes | ||
|
||
book: | ||
title: "Examine area weights creation results" | ||
subtitle: "Create csv file" | ||
# author: "Don Boyd" | ||
date: today | ||
date-format: long | ||
chapters: | ||
- index.qmd | ||
- part: "IRS Congressional District data" | ||
chapters: | ||
# - cd_overall_documentation.qmd | ||
- cd_prepare_data.qmd | ||
- cd_simple_tables.qmd | ||
- cd_results_vs_targets_tables.qmd | ||
|
||
format: | ||
html: | ||
theme: cosmo | ||
code-fold: true | ||
|
||
editor_options: | ||
chunk_output_type: console | ||
|
||
# R packages using old 209 libxml | ||
# gt, | ||
|
||
|
||
# rendering commands | ||
# quarto render | ||
# quarto publish netlify --no-prompt --no-render --no-browser | ||
|
||
# possibly use this at start of each doc | ||
# --- | ||
# output: html_document | ||
# editor_options: | ||
# chunk_output_type: console | ||
# --- | ||
|
||
|
||
|
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,180 @@ | ||
--- | ||
output: html_document | ||
editor_options: | ||
chunk_output_type: console | ||
--- | ||
|
||
# Read tmd 2021, area targets, area weights and prepare data | ||
|
||
|
||
## Setup | ||
|
||
```{r} | ||
#| label: setup | ||
#| output: false | ||
source(here::here("R", "libraries.R")) | ||
source(here::here("R", "constants.R")) | ||
phase4_statecds <- c("AK00", "DE00", "ID01", "ID02", "ME02", "MT00", "ND00", "PA08", "SD00", "WY00") | ||
``` | ||
|
||
|
||
```{r} | ||
#| label: functions | ||
#| output: false | ||
# Return the names of `obj` in sorted order (handy for scanning the
# columns of a wide data frame).
ns <- function(obj) {
  obj_names <- names(obj)
  sort(obj_names)
}
``` | ||
|
||
## Download files from google drive | ||
|
||
Only do this when target files and results have changed. Otherwise, necessary data should be in the temp_data folder. | ||
|
||
```{r} | ||
#| label: hookup-googledrive | ||
#| eval: false | ||
library(googledrive) | ||
drive_auth() # authenticate | ||
``` | ||
|
||
|
||
```{r} | ||
#| label: download-files | ||
#| eval: false | ||
# /AFPI_2024/Phase 4
# folder_id <- "1pEdofaxeQgEeDLM8NOpo0vOGL1jT8Qa1" # AFPI folder
folder_id <- "1Z7ZWYTbldfuQCFbpqKi4Z8FYxkbwmhnu" # Phase 4 folder
files <- drive_ls(as_id(folder_id))
files

# Download one Google Drive file into the project's temp_data folder.
#   gdfname: Drive file name; also used as the local file name.
#   file:    what to hand to drive_download(). Defaults to the name (the
#            original behavior), but a name lookup searches the whole Drive
#            and errors if several files share that name, so callers should
#            pass as_id(<file id>) when the id is known.
f <- function(gdfname, file = gdfname) {
  fpath <- here::here("temp_data", gdfname)
  print(fpath)
  drive_download(file, path = fpath, overwrite = TRUE)
}
# f(files$name[[1]])

# Download every file listed in the folder by its id, so exactly the files
# returned by drive_ls() are fetched.
walk2(
  files$name,
  files$id,
  \(gdfname, id) f(gdfname, as_id(id))
)
``` | ||
|
||
## Prepare target files | ||
|
||
Get all targets prepared | ||
|
||
```{r} | ||
#| label: targets-all | ||
#| eval: false | ||
#| output: false | ||
# ~/Documents/python_projects/tax-microdata-benchmarking/tmd/areas/weights/examine # project dir | ||
# ~/Documents/python_projects/tax-microdata-benchmarking/tmd/areas/targets/prepare/cds/intermediate # cdbasefile | ||
HERE <- here::here() | ||
CDTARGETSDIR <- fs::path(HERE, "..", "..", "targets", "prepare", "cds", "intermediate") | ||
# list.files(CDTARGETSDIR) | ||
targets_data <- read_csv(fs::path(CDTARGETSDIR, "cdbasefile.csv")) | ||
glimpse(targets_data) | ||
saveRDS(targets_data, here::here("temp_data", "targets_data.rds")) | ||
``` | ||
|
||
|
||
Get targets used in the optimization | ||
|
||
```{r} | ||
#| label: targets-used | ||
#| eval: false | ||
#| output: false | ||
targetfiles <- dir_ls(here::here("temp_data")) |> str_subset("targets.csv") | ||
targets_used <- vroom(targetfiles, id="src") |> | ||
mutate(src=path_file(src) |> str_sub(1, 4)) |> | ||
mutate(active=!(str_sub(varname, 1, 1) == "#"), | ||
varname = ifelse(!active, | ||
varname |> str_remove("#") |> str_trim(), | ||
varname)) | ||
saveRDS(targets_used, here::here("temp_data", "targets_used.rds")) | ||
glimpse(targets_used) | ||
count(targets_used, src) | ||
count(targets_used, active) | ||
count(targets_used, varname) | ||
count(targets_used, varname, active) | ||
targets_used |> filter(src == "ak00") | ||
targets_used |> filter(src == "de00") | ||
``` | ||
|
||
|
||
## Get and prepare tmd data and area weights | ||
|
||
```{r} | ||
#| label: get-tmd-2021 | ||
#| eval: false | ||
#| output: false | ||
# fpath <- fs::path(TMDDIR, "tmd_2021.csv") # NO - it is out of sync with tmd.csv | ||
fpath <- here::here("temp_data", "djbout.csv") | ||
tmd2021 <- read_csv(fpath) | ||
ns(tmd2021) | ||
# djbout <- read_csv(here::here("temp_data", "djbout.csv")) # this is tax calc output vdf from create_area_weights.py | ||
saveRDS(tmd2021, here::here("temp_data", "tmd2021.rds")) | ||
sum(tmd2021$s006) # 184,024,657 with djbout.csv, s006 units are numbers of units, not hundreds of units | ||
# con <- unz(zpath, "21incd.csv") | ||
# data <- read_csv(con) | ||
us_weights <- read_csv(fs::path(TMDDIR, "tmd_weights.csv.gz")) | ||
sum(us_weights$WT2021) # 184,024,656.95 # must divide by 100 | ||
saveRDS(us_weights, here::here("temp_data", "us_weights.rds")) | ||
tmd_base <- read_csv(fs::path(TMDDIR, "tmd.csv.gz")) # for comparison to tmd2021 | ||
ns(tmd_base) | ||
saveRDS(tmd_base, here::here("temp_data", "tmd_base.rds")) | ||
``` | ||
|
||
|
||
```{r} | ||
#| label: prep-weights | ||
#| eval: false | ||
#| output: false | ||
# weightfiles <- dir_ls(here::here("temp_data")) |> str_subset("weights.csv.gz") | ||
wtfiles <- dir_ls(WEIGHTSDIR, glob="*.gz") # |> path_file() | ||
df <- read_csv(wtfiles[1]) | ||
sum(df$WT2021) | ||
area_weights <- vroom(wtfiles, id="src") |> | ||
mutate(src = str_sub(path_file(src), 1, 4), | ||
across(-src, \(x) x / 100.)) | ||
glimpse(area_weights) | ||
count(area_weights, src) | ||
area_weights |> | ||
select(src, WT2021) |> | ||
summarise(wtdn=sum(WT2021), .by=src) | ||
saveRDS(area_weights, here::here("temp_data", "area_weights.rds")) | ||
``` | ||
|
Oops, something went wrong.