Add State Populations To Location Table (#58)

* add lead's suggestion as text file to start pull * edit csvs * update location table; add united states data; update descriptions in readme * remove suggestion * Update .pre-commit-config.yaml Co-authored-by: Dylan H. Morris <[email protected]> * reduction in united states code * update US population to use all non-null locations * update readme * add united states again * Update forecasttools/__init__.py Co-authored-by: Dylan H. Morris <[email protected]> * Update forecasttools/__init__.py Co-authored-by: Dylan H. Morris <[email protected]> * Update forecasttools/__init__.py Co-authored-by: Dylan H. Morris <[email protected]> --------- Co-authored-by: Dylan H. Morris <[email protected]>
CDCgov · Feb 5, 2025 · f6a32a2 · f6a32a2
1 parent cca8f02
commit f6a32a2
Show file tree

Hide file tree

Showing 7 changed files with 148 additions and 33 deletions.
diff --git a/.gitignore b/.gitignore
@@ -181,3 +181,4 @@ DS_Store
 _book/
 _book
 render_test_idata_general_time_representation_files
+*.csv
diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml
@@ -69,12 +69,6 @@ repos:
 ################################################################################
 # PYTHON
 ################################################################################
--   repo: https://github.com/psf/black-pre-commit-mirror
-    rev: 24.10.0
-    hooks:
-    -   id: black
-        args: ["--line-length", "79"]
-        language_version: python3
 -   repo: https://github.com/PyCQA/isort
     rev: 5.13.2
     hooks:

diff --git a/README.md b/README.md
@@ -117,9 +117,11 @@ See below for more information on the datasets.
 
 ## Location Table
 
-The location table contains abbreviations, codes, and extended names for
-the US jurisdictions for which the FluSight and COVID forecasting hubs
-require users to generate forecasts.
+The location table contains abbreviations, codes, extended names, and
+populations for the US jurisdictions for which the FluSight and COVID
+forecasting hubs require users to generate forecasts. The US population
+value is the sum over all states and territories with available data
+(some territories have `null` population values).
 
 The location table is stored in `forecasttools-py` as a `polars`
 dataframe and is accessed via:
@@ -129,24 +131,24 @@ loc_table = forecasttools.location_table
 print(loc_table)
 ```
 
-    shape: (58, 3)
-    ┌───────────────┬────────────┬─────────────────────────────┐
-    │ location_code ┆ short_name ┆ long_name                   │
-    │ ---           ┆ ---        ┆ ---                         │
-    │ str           ┆ str        ┆ str                         │
-    ╞═══════════════╪════════════╪═════════════════════════════╡
-    │ US            ┆ US         ┆ United States               │
-    │ 01            ┆ AL         ┆ Alabama                     │
-    │ 02            ┆ AK         ┆ Alaska                      │
-    │ 04            ┆ AZ         ┆ Arizona                     │
-    │ 05            ┆ AR         ┆ Arkansas                    │
-    │ …             ┆ …          ┆ …                           │
-    │ 66            ┆ GU         ┆ Guam                        │
-    │ 69            ┆ MP         ┆ Northern Mariana Islands    │
-    │ 72            ┆ PR         ┆ Puerto Rico                 │
-    │ 74            ┆ UM         ┆ U.S. Minor Outlying Islands │
-    │ 78            ┆ VI         ┆ U.S. Virgin Islands         │
-    └───────────────┴────────────┴─────────────────────────────┘
+    shape: (58, 5)
+    ┌───────────────┬────────────┬─────────────────────────────┬────────────┬──────────┐
+    │ location_code ┆ short_name ┆ long_name                   ┆ population ┆ is_state │
+    │ ---           ┆ ---        ┆ ---                         ┆ ---        ┆ ---      │
+    │ str           ┆ str        ┆ str                         ┆ i64        ┆ bool     │
+    ╞═══════════════╪════════════╪═════════════════════════════╪════════════╪══════════╡
+    │ US            ┆ US         ┆ United States               ┆ 334735155  ┆ false    │
+    │ 01            ┆ AL         ┆ Alabama                     ┆ 5024279    ┆ true     │
+    │ 02            ┆ AK         ┆ Alaska                      ┆ 733391     ┆ true     │
+    │ 04            ┆ AZ         ┆ Arizona                     ┆ 7151502    ┆ true     │
+    │ 05            ┆ AR         ┆ Arkansas                    ┆ 3011524    ┆ true     │
+    │ …             ┆ …          ┆ …                           ┆ …          ┆ …        │
+    │ 66            ┆ GU         ┆ Guam                        ┆ null       ┆ false    │
+    │ 69            ┆ MP         ┆ Northern Mariana Islands    ┆ null       ┆ false    │
+    │ 72            ┆ PR         ┆ Puerto Rico                 ┆ 3285874    ┆ false    │
+    │ 74            ┆ UM         ┆ U.S. Minor Outlying Islands ┆ null       ┆ false    │
+    │ 78            ┆ VI         ┆ U.S. Virgin Islands         ┆ null       ┆ false    │
+    └───────────────┴────────────┴─────────────────────────────┴────────────┴──────────┘
 
 Using `./forecasttools/data.py`, the location table was created by
 running the following:
@@ -160,6 +162,20 @@ make_census_dataset(
 )
 ```
 
+## United States
+
+`forecasttools.united_states` is a Python list containing the names of
+the 50 US states (`United States` itself is not included). While quite
+simple, it is useful to have this available in fewer steps than
+filtering and selecting values from `location_table`.
+
+``` python
+united_states = forecasttools.united_states
+print(united_states)
+```
+
+    ['Alabama', 'Alaska', 'Arizona', 'Arkansas', 'California', 'Colorado', 'Connecticut', 'Delaware', 'Florida', 'Georgia', 'Hawaii', 'Idaho', 'Illinois', 'Indiana', 'Iowa', 'Kansas', 'Kentucky', 'Louisiana', 'Maine', 'Maryland', 'Massachusetts', 'Michigan', 'Minnesota', 'Mississippi', 'Missouri', 'Montana', 'Nebraska', 'Nevada', 'New Hampshire', 'New Jersey', 'New Mexico', 'New York', 'North Carolina', 'North Dakota', 'Ohio', 'Oklahoma', 'Oregon', 'Pennsylvania', 'Rhode Island', 'South Carolina', 'South Dakota', 'Tennessee', 'Texas', 'Utah', 'Vermont', 'Virginia', 'Washington', 'West Virginia', 'Wisconsin', 'Wyoming']
+
 ## Example FluSight Hub Submission
 
 The example FluSight submission comes from the [following 2023-24

diff --git a/README.qmd b/README.qmd
@@ -109,8 +109,7 @@ See below for more information on the datasets.
 
 ## Location Table
 
-The location table contains abbreviations, codes, and extended names for the US jurisdictions for which the FluSight and COVID forecasting hubs require users to generate forecasts.
-
+The location table contains abbreviations, codes, extended names, and populations for the US jurisdictions for which the FluSight and COVID forecasting hubs require users to generate forecasts. The US population value is the sum over all states and territories with available data (some territories have `null` population values).
 
 The location table is stored in `forecasttools-py` as a `polars` dataframe and is accessed via:
 
@@ -130,6 +129,16 @@ make_census_dataset(
 )
 ```
 
+## United States
+
+`forecasttools.united_states` is a Python list containing the names of the 50 US states (`United States` itself is not included). While quite simple, it is useful to have this available in fewer steps than filtering and selecting values from `location_table`.
+
+```{python}
+united_states = forecasttools.united_states
+print(united_states)
+```
+
+
 ## Example FluSight Hub Submission
 
 The example FluSight submission comes from the [following 2023-24 submission](https://raw.githubusercontent.com/cdcepi/FluSight-forecast-hub/main/model-output/cfa-flumech/2023-10-14-cfa-flumech.csv).

diff --git a/forecasttools/__init__.py b/forecasttools/__init__.py
@@ -27,11 +27,12 @@
     validate_iter_has_expected_types,
 )
 
-# location table (from Census data)
+# location table (from Census data; contains territory data)
 location_table_path = importlib.resources.files(__package__).joinpath(
     "location_table.parquet"
 )
 location_table = pl.read_parquet(location_table_path)
+united_states = location_table.filter(pl.col("is_state")).get_column("long_name").to_list()
 
 # load example flusight submission
 example_flusight_submission_path = importlib.resources.files(
@@ -71,6 +72,7 @@
 
 __all__ = [
     "location_table",
+    "united_states",
     "example_flusight_submission",
     "nhsn_hosp_COVID",
     "nhsn_hosp_flu",

diff --git a/forecasttools/data.py b/forecasttools/data.py
@@ -7,7 +7,10 @@
 an example FluSight submission.
 """
 
+
+
 import os
+import pathlib
 from urllib import error, request
 
 import polars as pl
@@ -61,14 +64,99 @@ def check_file_save_path(
         raise FileExistsError(f"File already exists at: {file_save_path}")
 
 
+
+def merge_pop_data_and_loc_data(
+    file_save_path: str,
+    population_file_path: str,
+    locations_file_path: str,
+    overwrite: bool = False,
+) -> None:
+    """
+    Takes a location table parquet and a census
+    populations parquet and adds the population
+    values from the populations data to the
+    location table. Also fills in the population
+    of the "United States" row with the sum of
+    the merged population values and adds a
+    boolean "is_state" column marking the 50
+    states.
+
+    Parameters
+    ----------
+    file_save_path : str
+        Where to save the outputted parquet file.
+    population_file_path : str
+        From where to load the populations table.
+        The columns "STNAME" and "POPULATION" are
+        read from it.
+    locations_file_path : str
+        From where to load the locations table.
+        It is joined on its "long_name" column.
+    overwrite : bool
+        Whether or not to overwrite the file at
+        file_save_path, should one already exist.
+        Defaults to False.
+
+    Returns
+    -------
+    None
+        Saves the outputted parquet file at the
+        given file save path. If a file already
+        exists there and overwrite is False, a
+        message is printed and nothing is written.
+
+    Raises
+    ------
+    FileNotFoundError
+        If the population file or the locations
+        file does not exist.
+    """
+    population_path = pathlib.Path(population_file_path)
+    locations_path = pathlib.Path(locations_file_path)
+    save_path = pathlib.Path(file_save_path)
+    # fail early if either input parquet is missing
+    if not population_path.exists():
+        raise FileNotFoundError(
+            f"Population file not found: {population_path}"
+        )
+    if not locations_path.exists():
+        raise FileNotFoundError(f"Locations file not found: {locations_path}")
+    # leave an existing output untouched unless overwrite is requested
+    if save_path.exists() and not overwrite:
+        print(f"File already exists at {save_path}. Skipping writing.")
+        return
+    # names of the 50 states; used below to build the "is_state" flag
+    us_states = [
+        "Alabama", "Alaska", "Arizona", "Arkansas", "California", "Colorado",
+        "Connecticut", "Delaware", "Florida", "Georgia", "Hawaii", "Idaho",
+        "Illinois", "Indiana", "Iowa", "Kansas", "Kentucky", "Louisiana",
+        "Maine", "Maryland", "Massachusetts", "Michigan", "Minnesota",
+        "Mississippi", "Missouri", "Montana", "Nebraska", "Nevada",
+        "New Hampshire", "New Jersey", "New Mexico", "New York",
+        "North Carolina", "North Dakota", "Ohio", "Oklahoma", "Oregon",
+        "Pennsylvania", "Rhode Island", "South Carolina", "South Dakota",
+        "Tennessee", "Texas", "Utah", "Vermont", "Virginia", "Washington",
+        "West Virginia", "Wisconsin", "Wyoming"
+    ]
+    # keep only the name and population columns, renamed so the
+    # name column matches the location table's "long_name"
+    pop_df = pl.read_parquet(population_path).select(
+        [
+            pl.col("STNAME").alias("long_name"),
+            pl.col("POPULATION").alias("population"),
+        ]
+    )
+    loc_df = pl.read_parquet(locations_path)  # should have "long_name"
+    # left join keeps every row of the location table; locations
+    # without a match in the populations table get a null population
+    merged_df = loc_df.join(pop_df, on="long_name", how="left")
+    # US total is not included by default; get US total from
+    # non-null territories & states
+    us_population = merged_df["population"].sum()
+    # write the computed total into the "United States" row only;
+    # all other rows keep their joined population value
+    merged_df = merged_df.with_columns(
+        pl.when(pl.col("long_name") == "United States")
+        .then(us_population)
+        .otherwise(pl.col("population"))
+        .alias("population")
+    )
+    # flag rows whose long_name is one of the 50 states listed above
+    merged_df = merged_df.with_columns(
+        pl.col("long_name").is_in(us_states).alias("is_state")
+    )
+    merged_df.write_parquet(save_path)
+    print(f"File successfully written to {save_path}")
+
+
+
 def make_census_dataset(
     file_save_path: str,
 ) -> None:
     """
     Retrieves US 2020 Census data in a
     three column Polars dataframe, then
-    saves the dataset as a csv in a given
+    saves the dataset as a parquet in a given
     directory, if it does not already exist.
+    Note: As of 2025-01-05, the Census link
+    below is not available, so the existing
+    parquet file in forecasttools must instead
+    be relied upon.
 
     Parameters
     ----------
@@ -147,7 +235,8 @@ def make_nshn_fitting_dataset(
         ]
         if not set(required_cols).issubset(set(df_cols)):
             raise ValueError(
-                f"NHSN dataset missing required columns: {set(required_cols) - set(df_cols)}"
+                f"NHSN dataset missing required columns:"
+                f" {set(required_cols) - set(df_cols)}"
             )
         # fully load and save NHSN dataframe
         df = pl.read_csv(nhsn_dataset_path)
@@ -199,7 +288,11 @@ def get_and_save_flusight_submission(
     # check if the save file exists
     check_file_save_path(file_save_path)
     # check if the FluSight example url is still valid
-    url = "https://raw.githubusercontent.com/cdcepi/FluSight-forecast-hub/main/model-output/cfa-flumech/2023-10-14-cfa-flumech.csv"
+    url = (
+        "https://raw.githubusercontent.com/cdcepi/"
+        "FluSight-forecast-hub/main/model-output/"
+        "cfa-flumech/2023-10-14-cfa-flumech.csv"
+    )
     check_url(url)
     # read csv from URL, convert to polars
     submission_df = pl.read_csv(url, infer_schema_length=7500)

diff --git a/forecasttools/location_table.parquet b/forecasttools/location_table.parquet
-Original file line number
+Diff line change
@@ Expand Up / @@ -181,3 +181,4 @@ DS_Store @@
     _book/
     _book
     render_test_idata_general_time_representation_files
+    *.csv