Skip to content

Commit

Permalink
Merge pull request #270 from PSLmodels/examine-initial
Browse files Browse the repository at this point in the history
Initial pr for examine results for areas
  • Loading branch information
donboyd5 authored Oct 29, 2024
2 parents af01a85 + 05520bf commit 178493b
Show file tree
Hide file tree
Showing 11 changed files with 1,273 additions and 0 deletions.
18 changes: 18 additions & 0 deletions tmd/areas/weights/examine/.gitignore
Original file line number Diff line number Diff line change
@@ -0,0 +1,18 @@
# examine

# folders to ignore
.Rproj.user/
.quarto/
_examine/
_freeze/
site_libs/

# files to ignore
.Rhistory
*.html

# Local Netlify folder
.netlify


/.quarto/
26 changes: 26 additions & 0 deletions tmd/areas/weights/examine/R/constants.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,26 @@

# Directory constants for the examine project. All paths are resolved
# relative to the project root (tmd/areas/weights/examine) via here::here(),
# so they work on any machine. The \\wsl.localhost\... paths below are one
# author's local absolute locations, kept for reference.
# \\wsl.localhost\Ubuntu\home\donboyd5\Documents\python_projects\tax-microdata-benchmarking\tmd\storage\output
TMDDIR <- here::here("..", "..", "..", "storage", "output") # tmd/storage/output: national tmd files
# list.files(TMDDIR)

TARGETSDIR <- here::here("..", "..", "targets") # tmd/areas/targets: per-area target files
WEIGHTSDIR <- here::here("..") # tmd/areas/weights: per-area weight files (*.gz)
# list.files(TARGETSDIR)
# list.files(WEIGHTSDIR)

# IRS Congressional District source data (not currently downloaded here).
# CDZIPURL <- "https://www.irs.gov/pub/irs-soi/congressional2021.zip"
# CDDOCURL <- "https://www.irs.gov/pub/irs-soi/21incddocguide.docx"

# \\wsl.localhost\Ubuntu\home\donboyd5\Documents\python_projects\tax-microdata-benchmarking\tmd\areas\weights\examine
# \\wsl.localhost\Ubuntu\home\donboyd5\Documents\python_projects\tax-microdata-benchmarking\tmd\areas\targets\prepare
# TARGETSPREPDIR <- here::here("..", "..", "targets", "prepare")
# print(TARGETSPREPDIR) # Should print the absolute path to the folder
# list.files(TARGETSPREPDIR)

# Commented-out constants retained from the targets-preparation project;
# enable if this project needs direct access to the cds raw/intermediate data.
# CDDIR <- here::here("cds")
# CDDIR <- fs::path(TARGETSPREPDIR, "cds")
# CDRAW <- fs::path(CDDIR, "raw_data")
# CDINTERMEDIATE <- fs::path(CDDIR, "intermediate")
# CDFINAL <- fs::path(CDDIR, "final")
# list.files(CDFINAL)
# CDDOCEXTRACT <- "cd_documentation_extracted_from_21incddocguide.docx.xlsx"
79 changes: 79 additions & 0 deletions tmd/areas/weights/examine/R/libraries.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,79 @@
# libraries ---------------------------------------------------------------
# Packages attached for every chapter of the examine book. Attach order
# matters in R (later packages mask earlier ones), so do not reorder these
# calls without checking for masking conflicts.

library(DT)      # interactive HTML tables
library(fs)      # file-system helpers: path(), dir_ls(), path_file()
library(gt)      # presentation-quality tables
library(knitr)   # report rendering
library(readxl)  # read Excel files
library(skimr)   # quick data summaries
library(stringr) # string helpers (also re-attached by tidyverse below)
library(tidyverse)
# includes: dplyr, forcats, ggplot2, lubridate, purrr, stringr, tibble, tidyr

tprint <- 75 # default tibble print
options(tibble.print_max = tprint, tibble.print_min = tprint) # show up to tprint rows

library(janitor)
# NOTE(review): the commented-out line below embeds a Census API key in
# source control; keys should live only in .Renviron and never be committed.
# census_api_key("b27cb41e46ffe3488af186dd80c64dce66bd5e87", install = TRUE) # stored in .Renviron
# libraries needed for census population
library(sf)
library(tidycensus)
library(tigris)
options(tigris_use_cache = TRUE) # cache downloaded shapefiles across sessions
library(vroom) # fast csv reading; stacks many files with an id column


# possible libraries ------------------------------------------------------
# Commented-out menu of packages that may be enabled later; kept for reference.

# library(rlang)
# library(tidyverse)
# tprint <- 75 # default tibble print
# options(tibble.print_max = tprint, tibble.print_min = tprint) # show up to tprint rows
#
# library(fs)

# tools
# library(vroom)
# library(readxl)
# library(openxlsx) # for writing xlsx files
# library(lubridate)
# library(RColorBrewer)
# library(RcppRoll)
# library(fredr)
# library(tidycensus)
# library(googledrive)
# library(arrow)
#
# library(jsonlite)
# library(tidyjson)
#
#
# # boyd libraries
# # library(btools)
# # library(bdata)
# # library(bggtools)
# # library(bmaps)
#
# # graphics
# library(scales)
# library(ggbeeswarm)
# library(patchwork)
# library(gridExtra)
# library(ggrepel)
# library(ggbreak)
#
# # tables
# library(knitr)
# library(kableExtra)
# library(DT)
# library(gt)
# library(gtExtras)
# library(janitor)
# library(skimr)
# library(vtable)
#
# # maps
# library(maps)
# # https://cran.r-project.org/web/packages/usmap/vignettes/mapping.html
# library(usmap)

63 changes: 63 additions & 0 deletions tmd/areas/weights/examine/_quarto.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,63 @@
project:
type: book
output-dir: _examine

# https://prerelease.quarto.org/ # quarto documentation at this link

# publishing with the netlify cli:
# open terminal in examine
# quarto render && netlify deploy --prod --dir=_examine

# quarto render # inspect to be sure it is as desired
# netlify deploy --prod --dir=_examine

# or step by step
# netlify deploy # to test it, give _examine as publish directory
# netlify deploy --prod # to deploy, give _examine as publish directory

execute:
eval: true
echo: true
output: true
freeze: auto # auto: during global project renders, re-render only when source changes

book:
title: "Examine area weights creation results"
subtitle: "Create csv file"
# author: "Don Boyd"
date: today
date-format: long
chapters:
- index.qmd
- part: "IRS Congressional District data"
chapters:
# - cd_overall_documentation.qmd
- cd_prepare_data.qmd
- cd_simple_tables.qmd
- cd_results_vs_targets_tables.qmd

format:
html:
theme: cosmo
code-fold: true

editor_options:
chunk_output_type: console

# R packages using old 209 libxml
# gt,


# rendering commands
# quarto render
# quarto publish netlify --no-prompt --no-render --no-browser

# possibly use this at start of each doc
# ---
# output: html_document
# editor_options:
# chunk_output_type: console
# ---



180 changes: 180 additions & 0 deletions tmd/areas/weights/examine/cd_prepare_data.qmd
Original file line number Diff line number Diff line change
@@ -0,0 +1,180 @@
---
output: html_document
editor_options:
chunk_output_type: console
---

# Read tmd 2021, area targets, area weights and prepare data


## Setup

```{r}
#| label: setup
#| output: false
# Load the shared packages and path constants used by every chapter.
source(here::here("R", "libraries.R"))
source(here::here("R", "constants.R"))
# The ten Phase 4 state/congressional-district area codes examined here.
# NOTE(review): area codes appear lowercase (e.g., "ak00") in file-derived
# src columns elsewhere in this document — confirm case when matching.
phase4_statecds <- c("AK00", "DE00", "ID01", "ID02", "ME02", "MT00", "ND00", "PA08", "SD00", "WY00")
```


```{r}
#| label: functions
#| output: false
# Return the names of `obj`, sorted alphabetically.
# Convenient for scanning the columns of wide data frames.
ns <- function(obj) {
  obj |>
    names() |>
    sort()
}
```

## Download files from google drive

Only do this when target files and results have changed. Otherwise, necessary data should be in the temp_data folder.

```{r}
#| label: hookup-googledrive
#| eval: false
# Manual chunk (eval: false): run interactively when files on Google Drive
# have changed. drive_auth() launches an interactive OAuth flow.
library(googledrive)
drive_auth() # authenticate
```


```{r}
#| label: download-files
#| eval: false
# Manual chunk: download every file in the project's Google Drive folder
# into temp_data. Run after the hookup-googledrive chunk has authenticated.
# /AFPI_2024/Phase 4
# folder_id <- "1pEdofaxeQgEeDLM8NOpo0vOGL1jT8Qa1" # AFPI folder
folder_id <- "1Z7ZWYTbldfuQCFbpqKi4Z8FYxkbwmhnu" # Phase 4 folder
files <- drive_ls(as_id(folder_id)) # one row per file in the Drive folder
files
# Download one file by name into temp_data, replacing any stale local copy.
f <- function(gdfname){
  fpath <- here::here("temp_data", gdfname)
  print(fpath)
  drive_download(gdfname, path = fpath, overwrite = TRUE)
}
# f(files$name[[1]])
# walk() is used purely for its side effects (the downloads).
files |>
  pull(name) |>
  walk(\(gdfname) f(gdfname))
```

## Prepare target files

Get all targets prepared

```{r}
#| label: targets-all
#| eval: false
#| output: false
# Read the compiled targets basefile (all prepared targets, not just those
# used in optimization) and cache it as rds in temp_data.
# ~/Documents/python_projects/tax-microdata-benchmarking/tmd/areas/weights/examine # project dir
# ~/Documents/python_projects/tax-microdata-benchmarking/tmd/areas/targets/prepare/cds/intermediate # cdbasefile
HERE <- here::here()
CDTARGETSDIR <- fs::path(HERE, "..", "..", "targets", "prepare", "cds", "intermediate")
# list.files(CDTARGETSDIR)
targets_data <- read_csv(fs::path(CDTARGETSDIR, "cdbasefile.csv"))
glimpse(targets_data)
saveRDS(targets_data, here::here("temp_data", "targets_data.rds"))
```


Get targets used in the optimization

```{r}
#| label: targets-used
#| eval: false
#| output: false
# Read the per-area target files used in the optimization (e.g.,
# ak00_targets.csv in temp_data), stack them, and flag which targets were
# active. FIX: the filename pattern is now escaped and anchored — the prior
# pattern "targets.csv" was an unanchored regex in which "." matched any
# character, so it could match unintended filenames.
targetfiles <- dir_ls(here::here("temp_data")) |> str_subset("targets\\.csv$")

# vroom() stacks all files; id = "src" records each row's source path, which
# is reduced to the 4-character area code (e.g., "ak00").
targets_used <- vroom(targetfiles, id = "src") |>
  mutate(src = path_file(src) |> str_sub(1, 4)) |>
  # A leading "#" in varname marks a target that was commented out of the
  # optimization (inactive). Strip the marker (str_remove drops only the
  # first "#") so active and inactive targets share variable names.
  mutate(active = !(str_sub(varname, 1, 1) == "#"),
         varname = ifelse(!active,
                          varname |> str_remove("#") |> str_trim(),
                          varname))
saveRDS(targets_used, here::here("temp_data", "targets_used.rds"))

# Quick interactive checks of the stacked data.
glimpse(targets_used)
count(targets_used, src)
count(targets_used, active)
count(targets_used, varname)
count(targets_used, varname, active)
targets_used |> filter(src == "ak00")
targets_used |> filter(src == "de00")
```


## Get and prepare tmd data and area weights

```{r}
#| label: get-tmd-2021
#| eval: false
#| output: false
# Read the 2021 Tax-Calculator output (djbout.csv, produced by
# create_area_weights.py), the national weights, and the base tmd file,
# caching each as rds in temp_data for later chapters.
# fpath <- fs::path(TMDDIR, "tmd_2021.csv") # NO - it is out of sync with tmd.csv
fpath <- here::here("temp_data", "djbout.csv")
tmd2021 <- read_csv(fpath)
ns(tmd2021)
# djbout <- read_csv(here::here("temp_data", "djbout.csv")) # this is tax calc output vdf from create_area_weights.py
saveRDS(tmd2021, here::here("temp_data", "tmd2021.rds"))
sum(tmd2021$s006) # 184,024,657 with djbout.csv, s006 units are numbers of units, not hundreds of units
# con <- unz(zpath, "21incd.csv")
# data <- read_csv(con)
# National weights file stores weights in hundredths (see check below).
us_weights <- read_csv(fs::path(TMDDIR, "tmd_weights.csv.gz"))
sum(us_weights$WT2021) # 184,024,656.95 # must divide by 100
saveRDS(us_weights, here::here("temp_data", "us_weights.rds"))
tmd_base <- read_csv(fs::path(TMDDIR, "tmd.csv.gz")) # for comparison to tmd2021
ns(tmd_base)
saveRDS(tmd_base, here::here("temp_data", "tmd_base.rds"))
```


```{r}
#| label: prep-weights
#| eval: false
#| output: false
# Read the per-area weight files (*.gz in WEIGHTSDIR), stack them with a
# source-id column, and divide every weight column by 100 — weight files
# store hundredths, consistent with the us_weights check in get-tmd-2021.
# weightfiles <- dir_ls(here::here("temp_data")) |> str_subset("weights.csv.gz")
wtfiles <- dir_ls(WEIGHTSDIR, glob="*.gz") # |> path_file()
df <- read_csv(wtfiles[1])
sum(df$WT2021) # sanity check on one file before stacking
area_weights <- vroom(wtfiles, id="src") |>
  mutate(src = str_sub(path_file(src), 1, 4),
         across(-src, \(x) x / 100.))
glimpse(area_weights)
count(area_weights, src)
# weighted count of records (sum of 2021 weights) by area
area_weights |>
  select(src, WT2021) |>
  summarise(wtdn=sum(WT2021), .by=src)
saveRDS(area_weights, here::here("temp_data", "area_weights.rds"))
```

Loading

0 comments on commit 178493b

Please sign in to comment.