From f6a32a22996e8db9f6a17ff5c1fb92780a09cb45 Mon Sep 17 00:00:00 2001
From: "upx3 (CFA)" <127630341+AFg6K7h4fhy2@users.noreply.github.com>
Date: Wed, 5 Feb 2025 14:36:48 -0500
Subject: [PATCH] Add State Populations To Location Table (#58)

* add lead's suggestion as text file to start pull

* edit csvs

* update location table; add united states data; update descriptions in readme

* remove suggestion

* Update .pre-commit-config.yaml

Co-authored-by: Dylan H. Morris <dylanhmorris@users.noreply.github.com>

* reduction in united states code

* update US population to use all non-null locations

* update readme

* add united states again

* Update forecasttools/__init__.py

Co-authored-by: Dylan H. Morris <dylanhmorris@users.noreply.github.com>

* Update forecasttools/__init__.py

Co-authored-by: Dylan H. Morris <dylanhmorris@users.noreply.github.com>

* Update forecasttools/__init__.py

Co-authored-by: Dylan H. Morris <dylanhmorris@users.noreply.github.com>

---------

Co-authored-by: Dylan H. Morris <dylanhmorris@users.noreply.github.com>
---
 .gitignore                           |   1 +
 .pre-commit-config.yaml              |   6 --
 README.md                            |  58 ++++++++++------
 README.qmd                           |  13 +++-
 forecasttools/__init__.py            |   4 +-
 forecasttools/data.py                |  99 ++++++++++++++++++++++++++-
 forecasttools/location_table.parquet | Bin 2073 -> 3032 bytes
 7 files changed, 148 insertions(+), 33 deletions(-)

diff --git a/.gitignore b/.gitignore
index 3ea5f21..be1eb42 100644
--- a/.gitignore
+++ b/.gitignore
@@ -181,3 +181,4 @@ DS_Store
 _book/
 _book
 render_test_idata_general_time_representation_files
+*.csv
diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml
index 9c75e09..fd585c3 100644
--- a/.pre-commit-config.yaml
+++ b/.pre-commit-config.yaml
@@ -69,12 +69,6 @@ repos:
 ################################################################################
 # PYTHON
 ################################################################################
--   repo: https://github.com/psf/black-pre-commit-mirror
-    rev: 24.10.0
-    hooks:
-    -   id: black
-        args: ["--line-length", "79"]
-        language_version: python3
 -   repo: https://github.com/PyCQA/isort
     rev: 5.13.2
     hooks:
diff --git a/README.md b/README.md
index 455ac42..0c0fea0 100644
--- a/README.md
+++ b/README.md
@@ -117,9 +117,11 @@ See below for more information on the datasets.
 
 ## Location Table
 
-The location table contains abbreviations, codes, and extended names for
-the US jurisdictions for which the FluSight and COVID forecasting hubs
-require users to generate forecasts.
+The location table contains abbreviations, codes, extended names, and
+populations for the jurisdictions of the United States for which the
+FluSight and COVID forecasting hubs require users to generate forecasts.
+The US population value is the sum of all available states and
+territories (some territories have `null` population values).
 
 The location table is stored in `forecasttools-py` as a `polars`
 dataframe and is accessed via:
@@ -129,24 +131,24 @@ loc_table = forecasttools.location_table
 print(loc_table)
 ```
 
-    shape: (58, 3)
-    ┌───────────────┬────────────┬─────────────────────────────┐
-    │ location_code ┆ short_name ┆ long_name                   │
-    │ ---           ┆ ---        ┆ ---                         │
-    │ str           ┆ str        ┆ str                         │
-    ╞═══════════════╪════════════╪═════════════════════════════╡
-    │ US            ┆ US         ┆ United States               │
-    │ 01            ┆ AL         ┆ Alabama                     │
-    │ 02            ┆ AK         ┆ Alaska                      │
-    │ 04            ┆ AZ         ┆ Arizona                     │
-    │ 05            ┆ AR         ┆ Arkansas                    │
-    │ …             ┆ …          ┆ …                           │
-    │ 66            ┆ GU         ┆ Guam                        │
-    │ 69            ┆ MP         ┆ Northern Mariana Islands    │
-    │ 72            ┆ PR         ┆ Puerto Rico                 │
-    │ 74            ┆ UM         ┆ U.S. Minor Outlying Islands │
-    │ 78            ┆ VI         ┆ U.S. Virgin Islands         │
-    └───────────────┴────────────┴─────────────────────────────┘
+    shape: (58, 5)
+    ┌───────────────┬────────────┬─────────────────────────────┬────────────┬──────────┐
+    │ location_code ┆ short_name ┆ long_name                   ┆ population ┆ is_state │
+    │ ---           ┆ ---        ┆ ---                         ┆ ---        ┆ ---      │
+    │ str           ┆ str        ┆ str                         ┆ i64        ┆ bool     │
+    ╞═══════════════╪════════════╪═════════════════════════════╪════════════╪══════════╡
+    │ US            ┆ US         ┆ United States               ┆ 334735155  ┆ false    │
+    │ 01            ┆ AL         ┆ Alabama                     ┆ 5024279    ┆ true     │
+    │ 02            ┆ AK         ┆ Alaska                      ┆ 733391     ┆ true     │
+    │ 04            ┆ AZ         ┆ Arizona                     ┆ 7151502    ┆ true     │
+    │ 05            ┆ AR         ┆ Arkansas                    ┆ 3011524    ┆ true     │
+    │ …             ┆ …          ┆ …                           ┆ …          ┆ …        │
+    │ 66            ┆ GU         ┆ Guam                        ┆ null       ┆ false    │
+    │ 69            ┆ MP         ┆ Northern Mariana Islands    ┆ null       ┆ false    │
+    │ 72            ┆ PR         ┆ Puerto Rico                 ┆ 3285874    ┆ false    │
+    │ 74            ┆ UM         ┆ U.S. Minor Outlying Islands ┆ null       ┆ false    │
+    │ 78            ┆ VI         ┆ U.S. Virgin Islands         ┆ null       ┆ false    │
+    └───────────────┴────────────┴─────────────────────────────┴────────────┴──────────┘
 
 Using `./forecasttools/data.py`, the location table was created by
 running the following:
@@ -160,6 +162,20 @@ make_census_dataset(
 )
 ```
 
+## United States
+
+`forecasttools.united_states` is a Python list that contains the names
+of the 50 US states (`United States` itself is not included). While
+quite simple, it’s convenient to have this available in fewer steps
+than through calling and selecting values from `location_table`.
+
+``` python
+united_states = forecasttools.united_states
+print(united_states)
+```
+
+    ['Alabama', 'Alaska', 'Arizona', 'Arkansas', 'California', 'Colorado', 'Connecticut', 'Delaware', 'Florida', 'Georgia', 'Hawaii', 'Idaho', 'Illinois', 'Indiana', 'Iowa', 'Kansas', 'Kentucky', 'Louisiana', 'Maine', 'Maryland', 'Massachusetts', 'Michigan', 'Minnesota', 'Mississippi', 'Missouri', 'Montana', 'Nebraska', 'Nevada', 'New Hampshire', 'New Jersey', 'New Mexico', 'New York', 'North Carolina', 'North Dakota', 'Ohio', 'Oklahoma', 'Oregon', 'Pennsylvania', 'Rhode Island', 'South Carolina', 'South Dakota', 'Tennessee', 'Texas', 'Utah', 'Vermont', 'Virginia', 'Washington', 'West Virginia', 'Wisconsin', 'Wyoming']
+
 ## Example FluSight Hub Submission
 
 The example FluSight submission comes from the [following 2023-24
diff --git a/README.qmd b/README.qmd
index 62971a5..1567f3f 100644
--- a/README.qmd
+++ b/README.qmd
@@ -109,8 +109,7 @@ See below for more information on the datasets.
 
 ## Location Table
 
-The location table contains abbreviations, codes, and extended names for the US jurisdictions for which the FluSight and COVID forecasting hubs require users to generate forecasts.
-
+The location table contains abbreviations, codes, extended names, and populations for the jurisdictions of the United States for which the FluSight and COVID forecasting hubs require users to generate forecasts. The US population value is the sum of all available states and territories (some territories have `null` population values).
 
 The location table is stored in `forecasttools-py` as a `polars` dataframe and is accessed via:
 
@@ -130,6 +129,16 @@ make_census_dataset(
 )
 ```
 
+## United States
+
+`forecasttools.united_states` is a Python list that contains the names of the 50 US states (`United States` itself is not included). While quite simple, it's convenient to have this available in fewer steps than through calling and selecting values from `location_table`.
+
+```{python}
+united_states = forecasttools.united_states
+print(united_states)
+```
+
+
 ## Example FluSight Hub Submission
 
 The example FluSight submission comes from the [following 2023-24 submission](https://raw.githubusercontent.com/cdcepi/FluSight-forecast-hub/main/model-output/cfa-flumech/2023-10-14-cfa-flumech.csv).
diff --git a/forecasttools/__init__.py b/forecasttools/__init__.py
index 3593075..023c3f3 100644
--- a/forecasttools/__init__.py
+++ b/forecasttools/__init__.py
@@ -27,11 +27,12 @@
     validate_iter_has_expected_types,
 )
 
-# location table (from Census data)
+# location table (from Census data; contains territory data)
 location_table_path = importlib.resources.files(__package__).joinpath(
     "location_table.parquet"
 )
 location_table = pl.read_parquet(location_table_path)
+united_states = location_table.filter(pl.col("is_state")).get_column("long_name").to_list()
 
 # load example flusight submission
 example_flusight_submission_path = importlib.resources.files(
@@ -71,6 +72,7 @@
 
 __all__ = [
     "location_table",
+    "united_states",
     "example_flusight_submission",
     "nhsn_hosp_COVID",
     "nhsn_hosp_flu",
diff --git a/forecasttools/data.py b/forecasttools/data.py
index 008d24c..b15bf23 100644
--- a/forecasttools/data.py
+++ b/forecasttools/data.py
@@ -7,7 +7,10 @@
 an example FluSight submission.
 """
 
+
+
 import os
+import pathlib
 from urllib import error, request
 
 import polars as pl
@@ -61,14 +64,99 @@ def check_file_save_path(
         raise FileExistsError(f"File already exists at: {file_save_path}")
 
 
+
+def merge_pop_data_and_loc_data(
+    file_save_path: str,
+    population_file_path: str,
+    locations_file_path: str,
+    overwrite: bool = False,
+) -> None:
+    """
+    Left-joins populations onto the location table by
+    "long_name", sets the "United States" population to the
+    joined column's sum, and adds a boolean "is_state" column.
+
+    Parameters
+    ----------
+    file_save_path : str
+        Where to save the outputted parquet file.
+    population_file_path : str
+        Populations parquet ("STNAME", "POPULATION" columns).
+    locations_file_path : str
+        Locations parquet (must have a "long_name" column).
+    overwrite : bool
+        If False (default), skip writing when the file exists.
+
+    Returns
+    -------
+    None
+
+    Raises
+    ------
+    FileNotFoundError
+        If either input file does not exist.
+    """
+    population_path = pathlib.Path(population_file_path)
+    locations_path = pathlib.Path(locations_file_path)
+    save_path = pathlib.Path(file_save_path)
+    if not population_path.exists():
+        raise FileNotFoundError(
+            f"Population file not found: {population_path}"
+        )
+    if not locations_path.exists():
+        raise FileNotFoundError(f"Locations file not found: {locations_path}")
+    if save_path.exists() and not overwrite:
+        print(f"File already exists at {save_path}. Skipping writing.")
+        return
+    us_states = [
+        "Alabama", "Alaska", "Arizona", "Arkansas", "California", "Colorado",
+        "Connecticut", "Delaware", "Florida", "Georgia", "Hawaii", "Idaho",
+        "Illinois", "Indiana", "Iowa", "Kansas", "Kentucky", "Louisiana",
+        "Maine", "Maryland", "Massachusetts", "Michigan", "Minnesota",
+        "Mississippi", "Missouri", "Montana", "Nebraska", "Nevada",
+        "New Hampshire", "New Jersey", "New Mexico", "New York",
+        "North Carolina", "North Dakota", "Ohio", "Oklahoma", "Oregon",
+        "Pennsylvania", "Rhode Island", "South Carolina", "South Dakota",
+        "Tennessee", "Texas", "Utah", "Vermont", "Virginia", "Washington",
+        "West Virginia", "Wisconsin", "Wyoming"
+    ]
+    pop_df = pl.read_parquet(population_path).select(
+        [
+            pl.col("STNAME").alias("long_name"),
+            pl.col("POPULATION").alias("population"),
+        ]
+    )
+    loc_df = pl.read_parquet(locations_path)  # should have "long_name"
+    merged_df = loc_df.join(pop_df, on="long_name", how="left")
+    # US total is not included by default; get US total from
+    # non-null territories & states
+    us_population = merged_df["population"].sum()
+    merged_df = merged_df.with_columns(
+        pl.when(pl.col("long_name") == "United States")
+        .then(us_population)
+        .otherwise(pl.col("population"))
+        .alias("population")
+    )
+    merged_df = merged_df.with_columns(
+        pl.col("long_name").is_in(us_states).alias("is_state")
+    )
+    merged_df.write_parquet(save_path)
+    print(f"File successfully written to {save_path}")
+
+
+
 def make_census_dataset(
     file_save_path: str,
 ) -> None:
     """
     Retrieves US 2020 Census data in a
     three column Polars dataframe, then
-    saves the dataset as a csv in a given
+    saves the dataset as a parquet in a given
     directory, if it does not already exist.
+    Note: As of 2025-01-05, the Census link
+    below is not available, so the existing
+    parquet file in forecasttools must instead
+    be relied upon.
 
     Parameters
     ----------
@@ -147,7 +235,8 @@ def make_nshn_fitting_dataset(
         ]
         if not set(required_cols).issubset(set(df_cols)):
             raise ValueError(
-                f"NHSN dataset missing required columns: {set(required_cols) - set(df_cols)}"
+                f"NHSN dataset missing required columns:"
+                f" {set(required_cols) - set(df_cols)}"
             )
         # fully load and save NHSN dataframe
         df = pl.read_csv(nhsn_dataset_path)
@@ -199,7 +288,11 @@ def get_and_save_flusight_submission(
     # check if the save file exists
     check_file_save_path(file_save_path)
     # check if the FluSight example url is still valid
-    url = "https://raw.githubusercontent.com/cdcepi/FluSight-forecast-hub/main/model-output/cfa-flumech/2023-10-14-cfa-flumech.csv"
+    url = (
+        "https://raw.githubusercontent.com/cdcepi/"
+        "FluSight-forecast-hub/main/model-output/"
+        "cfa-flumech/2023-10-14-cfa-flumech.csv"
+    )
     check_url(url)
     # read csv from URL, convert to polars
     submission_df = pl.read_csv(url, infer_schema_length=7500)
diff --git a/forecasttools/location_table.parquet b/forecasttools/location_table.parquet
index b9526748daae25074b1bdf610f29197de69a473b..15a44decd90397c396d7aabebf332da31988969f 100644
GIT binary patch
delta 1098
zcmbO!a6^2<PnLR77SRiAqJ1nrqFJI03=A4u_5UWUV0g>U(86uJ{If7nh(Uan55su_
zZiYVjT!tf`bs6;iqM0U1MlkGPHD~xE#m~^$%Ei#V)Ppglcrn9r6EP+MnL7=Ke{`lN
zt}~FhqVwi}ORSLl_LCwf9OLG?WKTXa^Pf<7y}s_kSFw)W0`&^-WYgmHf3k=w?d_f9
zub^me!g$APK10}J?it)$EVFD*EO$7Qer)kRft!+k-R-;;#hD9t?qNLe+K_?u0%ziu
z=7W<;x5P<@y^9P!%Tc6k1adEfvBHO|MtNHlAG`IdcAwa>kyq0E)`Y^v$0xa_+_}-#
zBb|F<$3-wzu{Z=O=dN?}m8y45XA1jtoyeq<Qj<g(L^DLIbVN%;1w`3I*<{SPG&o=Z
zBEbQ)3~UpS0EL)>$wEd379b~sk(q;yfAW7ujr!DBQ5H#41~vgn39f?tg3_GClFa-(
zQ68}pu@xL*KUmd13ESW@6ll4Os2<3228iVh8jOq*j36t)*6|fGFfai*C5-?6{s#hn
z22ln{RR%Um369L-_~Mepl2j3}O+jKQYWJjU%%J)we`j&i6l4M_gLweRg!u|+gcuit
zBuJW(0f-qVo3UysOUj7NlMp?_C<e3+$i62fngU{qGD+rmuoUIzmxyvr&SCXX6J=3l
zLiQ-oML+?#a~Q-*BxNLHChuZ3<@+WpDj_yOMpTVK?H-5BWPK(*4zWHtpx8E<$;NE<
z^~*r)J2IkH3~H|=WylXjv3($erpb!hF{mAqmca-jY$3$JAZBz;PE<_nima#-gII&A
zScw{k*c=HFknbgU9D{=V!>x*wGg5OCC6=)0gMg{0)8r*=cCw+4K#=Yj=;&w+VtJ(l
z2`5hw)5+0s@_cqlCa3htyVx~!!4d_@hL%++?rsKAKn;$LF2x|D{6S1-5aAdJCIdYu
z^Rvt9Sq1<l4ZtEmv%pHc9Dz0gm3o8t$?lfrDd80vDegICN$Hcl*qwzyrn*;@CFQ21
zCFQ!60Br#}!C5FDDC|^~=x$k=;$~SI8E%*}`3sw~jUULABzMcK$cUhVBqP(bNF&Q^
fpdMQW3ATXzoW!DH$ru?124ut_1x)Y(jzNY1H8Cqo

delta 248
zcmca1K2u=BPnO9X>@JgQ*p+!inIwHYSc>xVOGMcwA7b}l6DyIFk+hj?z+uY%Lqt?U
z?33{191b(~38Fyu43WtzIqZ#AfY|#)M6DRa+GNB^)HuWr2#PQ;NXkg?I0gmzhg%gV
zXQbvPN<3iF2Lltw$ul|agnS(x9i2ghV=#zt40N0v$?Yuc1!5<;Tb4!{T9!tJ8|FAV
zPF~ODER+ru_VCM3GWIfvFfz?Z3J;yk!tHEs02FpAPBQY#NO8|8OUkj#N-_!p>an$D
ckYEeQ&q*vQmb8&!V1OWojZ6#-0gge208`vS8~^|S