Skip to content

Commit

Permalink
Merge pull request #270 from PSLmodels/examine-initial
Browse files Browse the repository at this point in the history
Initial pr for examine results for areas
  • Loading branch information
donboyd5 authored Oct 29, 2024
2 parents af01a85 + 05520bf commit 178493b
Show file tree
Hide file tree
Showing 11 changed files with 1,273 additions and 0 deletions.
18 changes: 18 additions & 0 deletions tmd/areas/weights/examine/.gitignore
Original file line number Diff line number Diff line change
@@ -0,0 +1,18 @@
# examine

# folders to ignore
.Rproj.user/
.quarto/
_examine/
_freeze/
site_libs/

# files to ignore
.Rhistory
*.html

# Local Netlify folder
.netlify


/.quarto/
26 changes: 26 additions & 0 deletions tmd/areas/weights/examine/R/constants.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,26 @@

# Directory constants for the examine project. All paths are resolved
# relative to the project root (tmd/areas/weights/examine) via here::here(),
# so they work on any machine. The \\wsl.localhost\... paths below are one
# author's local absolute locations, kept for reference.
# \\wsl.localhost\Ubuntu\home\donboyd5\Documents\python_projects\tax-microdata-benchmarking\tmd\storage\output
TMDDIR <- here::here("..", "..", "..", "storage", "output") # tmd/storage/output: national tmd files
# list.files(TMDDIR)

TARGETSDIR <- here::here("..", "..", "targets") # tmd/areas/targets: per-area target files
WEIGHTSDIR <- here::here("..") # tmd/areas/weights: per-area weight files (*.gz)
# list.files(TARGETSDIR)
# list.files(WEIGHTSDIR)

# IRS Congressional District source data (not currently downloaded here).
# CDZIPURL <- "https://www.irs.gov/pub/irs-soi/congressional2021.zip"
# CDDOCURL <- "https://www.irs.gov/pub/irs-soi/21incddocguide.docx"

# \\wsl.localhost\Ubuntu\home\donboyd5\Documents\python_projects\tax-microdata-benchmarking\tmd\areas\weights\examine
# \\wsl.localhost\Ubuntu\home\donboyd5\Documents\python_projects\tax-microdata-benchmarking\tmd\areas\targets\prepare
# TARGETSPREPDIR <- here::here("..", "..", "targets", "prepare")
# print(TARGETSPREPDIR) # Should print the absolute path to the folder
# list.files(TARGETSPREPDIR)

# Commented-out constants retained from the targets-preparation project;
# enable if this project needs direct access to the cds raw/intermediate data.
# CDDIR <- here::here("cds")
# CDDIR <- fs::path(TARGETSPREPDIR, "cds")
# CDRAW <- fs::path(CDDIR, "raw_data")
# CDINTERMEDIATE <- fs::path(CDDIR, "intermediate")
# CDFINAL <- fs::path(CDDIR, "final")
# list.files(CDFINAL)
# CDDOCEXTRACT <- "cd_documentation_extracted_from_21incddocguide.docx.xlsx"
79 changes: 79 additions & 0 deletions tmd/areas/weights/examine/R/libraries.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,79 @@
# libraries ---------------------------------------------------------------
# Packages attached for every chapter of the examine book. Attach order
# matters in R (later packages mask earlier ones), so do not reorder these
# calls without checking for masking conflicts.

library(DT)      # interactive HTML tables
library(fs)      # file-system helpers: path(), dir_ls(), path_file()
library(gt)      # presentation-quality tables
library(knitr)   # report rendering
library(readxl)  # read Excel files
library(skimr)   # quick data summaries
library(stringr) # string helpers (also re-attached by tidyverse below)
library(tidyverse)
# includes: dplyr, forcats, ggplot2, lubridate, purrr, stringr, tibble, tidyr

tprint <- 75 # default tibble print
options(tibble.print_max = tprint, tibble.print_min = tprint) # show up to tprint rows

library(janitor)
# NOTE(review): the commented-out line below embeds a Census API key in
# source control; keys should live only in .Renviron and never be committed.
# census_api_key("b27cb41e46ffe3488af186dd80c64dce66bd5e87", install = TRUE) # stored in .Renviron
# libraries needed for census population
library(sf)
library(tidycensus)
library(tigris)
options(tigris_use_cache = TRUE) # cache downloaded shapefiles across sessions
library(vroom) # fast csv reading; stacks many files with an id column


# possible libraries ------------------------------------------------------
# Commented-out menu of packages that may be enabled later; kept for reference.

# library(rlang)
# library(tidyverse)
# tprint <- 75 # default tibble print
# options(tibble.print_max = tprint, tibble.print_min = tprint) # show up to tprint rows
#
# library(fs)

# tools
# library(vroom)
# library(readxl)
# library(openxlsx) # for writing xlsx files
# library(lubridate)
# library(RColorBrewer)
# library(RcppRoll)
# library(fredr)
# library(tidycensus)
# library(googledrive)
# library(arrow)
#
# library(jsonlite)
# library(tidyjson)
#
#
# # boyd libraries
# # library(btools)
# # library(bdata)
# # library(bggtools)
# # library(bmaps)
#
# # graphics
# library(scales)
# library(ggbeeswarm)
# library(patchwork)
# library(gridExtra)
# library(ggrepel)
# library(ggbreak)
#
# # tables
# library(knitr)
# library(kableExtra)
# library(DT)
# library(gt)
# library(gtExtras)
# library(janitor)
# library(skimr)
# library(vtable)
#
# # maps
# library(maps)
# # https://cran.r-project.org/web/packages/usmap/vignettes/mapping.html
# library(usmap)

63 changes: 63 additions & 0 deletions tmd/areas/weights/examine/_quarto.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,63 @@
project:
type: book
output-dir: _examine

# https://prerelease.quarto.org/ # quarto documentation at this link

# publishing with the netlify cli:
# open terminal in examine
# quarto render && netlify deploy --prod --dir=_examine

# quarto render # inspect to be sure it is as desired
# netlify deploy --prod --dir=_examine

# or step by step
# netlify deploy # to test it, give _examine as publish directory
# netlify deploy --prod # to deploy, give _examine as publish directory

execute:
eval: true
echo: true
output: true
freeze: auto # auto: during global project renders, re-render only when source changes

book:
title: "Examine area weights creation results"
subtitle: "Create csv file"
# author: "Don Boyd"
date: today
date-format: long
chapters:
- index.qmd
- part: "IRS Congressional District data"
chapters:
# - cd_overall_documentation.qmd
- cd_prepare_data.qmd
- cd_simple_tables.qmd
- cd_results_vs_targets_tables.qmd

format:
html:
theme: cosmo
code-fold: true

editor_options:
chunk_output_type: console

# R packages using old 209 libxml
# gt,


# rendering commands
# quarto render
# quarto publish netlify --no-prompt --no-render --no-browser

# possibly use this at start of each doc
# ---
# output: html_document
# editor_options:
# chunk_output_type: console
# ---



180 changes: 180 additions & 0 deletions tmd/areas/weights/examine/cd_prepare_data.qmd
Original file line number Diff line number Diff line change
@@ -0,0 +1,180 @@
---
output: html_document
editor_options:
chunk_output_type: console
---

# Read tmd 2021, area targets, area weights and prepare data


## Setup

```{r}
#| label: setup
#| output: false
# Load the shared packages and path constants used by every chapter.
source(here::here("R", "libraries.R"))
source(here::here("R", "constants.R"))
# The ten Phase 4 state/congressional-district area codes examined here.
# NOTE(review): area codes appear lowercase (e.g., "ak00") in file-derived
# src columns elsewhere in this document — confirm case when matching.
phase4_statecds <- c("AK00", "DE00", "ID01", "ID02", "ME02", "MT00", "ND00", "PA08", "SD00", "WY00")
```


```{r}
#| label: functions
#| output: false
# Return the names of `obj`, sorted alphabetically.
# Convenient for scanning the columns of wide data frames.
ns <- function(obj) {
  obj |>
    names() |>
    sort()
}
```

## Download files from google drive

Only do this when target files and results have changed. Otherwise, necessary data should be in the temp_data folder.

```{r}
#| label: hookup-googledrive
#| eval: false
# Manual chunk (eval: false): run interactively when files on Google Drive
# have changed. drive_auth() launches an interactive OAuth flow.
library(googledrive)
drive_auth() # authenticate
```


```{r}
#| label: download-files
#| eval: false
# Manual chunk: download every file in the project's Google Drive folder
# into temp_data. Run after the hookup-googledrive chunk has authenticated.
# /AFPI_2024/Phase 4
# folder_id <- "1pEdofaxeQgEeDLM8NOpo0vOGL1jT8Qa1" # AFPI folder
folder_id <- "1Z7ZWYTbldfuQCFbpqKi4Z8FYxkbwmhnu" # Phase 4 folder
files <- drive_ls(as_id(folder_id)) # one row per file in the Drive folder
files
# Download one file by name into temp_data, replacing any stale local copy.
f <- function(gdfname){
  fpath <- here::here("temp_data", gdfname)
  print(fpath)
  drive_download(gdfname, path = fpath, overwrite = TRUE)
}
# f(files$name[[1]])
# walk() is used purely for its side effects (the downloads).
files |>
  pull(name) |>
  walk(\(gdfname) f(gdfname))
```

## Prepare target files

Get all targets prepared

```{r}
#| label: targets-all
#| eval: false
#| output: false
# Read the compiled targets basefile (all prepared targets, not just those
# used in optimization) and cache it as rds in temp_data.
# ~/Documents/python_projects/tax-microdata-benchmarking/tmd/areas/weights/examine # project dir
# ~/Documents/python_projects/tax-microdata-benchmarking/tmd/areas/targets/prepare/cds/intermediate # cdbasefile
HERE <- here::here()
CDTARGETSDIR <- fs::path(HERE, "..", "..", "targets", "prepare", "cds", "intermediate")
# list.files(CDTARGETSDIR)
targets_data <- read_csv(fs::path(CDTARGETSDIR, "cdbasefile.csv"))
glimpse(targets_data)
saveRDS(targets_data, here::here("temp_data", "targets_data.rds"))
```


Get targets used in the optimization

```{r}
#| label: targets-used
#| eval: false
#| output: false
# Read the per-area target files used in the optimization (e.g.,
# ak00_targets.csv in temp_data), stack them, and flag which targets were
# active. FIX: the filename pattern is now escaped and anchored — the prior
# pattern "targets.csv" was an unanchored regex in which "." matched any
# character, so it could match unintended filenames.
targetfiles <- dir_ls(here::here("temp_data")) |> str_subset("targets\\.csv$")

# vroom() stacks all files; id = "src" records each row's source path, which
# is reduced to the 4-character area code (e.g., "ak00").
targets_used <- vroom(targetfiles, id = "src") |>
  mutate(src = path_file(src) |> str_sub(1, 4)) |>
  # A leading "#" in varname marks a target that was commented out of the
  # optimization (inactive). Strip the marker (str_remove drops only the
  # first "#") so active and inactive targets share variable names.
  mutate(active = !(str_sub(varname, 1, 1) == "#"),
         varname = ifelse(!active,
                          varname |> str_remove("#") |> str_trim(),
                          varname))
saveRDS(targets_used, here::here("temp_data", "targets_used.rds"))

# Quick interactive checks of the stacked data.
glimpse(targets_used)
count(targets_used, src)
count(targets_used, active)
count(targets_used, varname)
count(targets_used, varname, active)
targets_used |> filter(src == "ak00")
targets_used |> filter(src == "de00")
```


## Get and prepare tmd data and area weights

```{r}
#| label: get-tmd-2021
#| eval: false
#| output: false
# Read the 2021 Tax-Calculator output (djbout.csv, produced by
# create_area_weights.py), the national weights, and the base tmd file,
# caching each as rds in temp_data for later chapters.
# fpath <- fs::path(TMDDIR, "tmd_2021.csv") # NO - it is out of sync with tmd.csv
fpath <- here::here("temp_data", "djbout.csv")
tmd2021 <- read_csv(fpath)
ns(tmd2021)
# djbout <- read_csv(here::here("temp_data", "djbout.csv")) # this is tax calc output vdf from create_area_weights.py
saveRDS(tmd2021, here::here("temp_data", "tmd2021.rds"))
sum(tmd2021$s006) # 184,024,657 with djbout.csv, s006 units are numbers of units, not hundreds of units
# con <- unz(zpath, "21incd.csv")
# data <- read_csv(con)
# National weights file stores weights in hundredths (see check below).
us_weights <- read_csv(fs::path(TMDDIR, "tmd_weights.csv.gz"))
sum(us_weights$WT2021) # 184,024,656.95 # must divide by 100
saveRDS(us_weights, here::here("temp_data", "us_weights.rds"))
tmd_base <- read_csv(fs::path(TMDDIR, "tmd.csv.gz")) # for comparison to tmd2021
ns(tmd_base)
saveRDS(tmd_base, here::here("temp_data", "tmd_base.rds"))
```


```{r}
#| label: prep-weights
#| eval: false
#| output: false
# Read the per-area weight files (*.gz in WEIGHTSDIR), stack them with a
# source-id column, and divide every weight column by 100 — weight files
# store hundredths, consistent with the us_weights check in get-tmd-2021.
# weightfiles <- dir_ls(here::here("temp_data")) |> str_subset("weights.csv.gz")
wtfiles <- dir_ls(WEIGHTSDIR, glob="*.gz") # |> path_file()
df <- read_csv(wtfiles[1])
sum(df$WT2021) # sanity check on one file before stacking
area_weights <- vroom(wtfiles, id="src") |>
  mutate(src = str_sub(path_file(src), 1, 4),
         across(-src, \(x) x / 100.))
glimpse(area_weights)
count(area_weights, src)
# weighted count of records (sum of 2021 weights) by area
area_weights |>
  select(src, WT2021) |>
  summarise(wtdn=sum(WT2021), .by=src)
saveRDS(area_weights, here::here("temp_data", "area_weights.rds"))
```

Loading

0 comments on commit 178493b

Please sign in to comment.