From d7e02f3cc4d08128f1cd0e896d4553a5b64c2293 Mon Sep 17 00:00:00 2001 From: Thanasi Bakis Date: Thu, 15 Aug 2024 12:07:28 -0700 Subject: [PATCH] Cleaner data filtering --- linmod/data.py | 26 +++++++++++++++++--------- 1 file changed, 17 insertions(+), 9 deletions(-) diff --git a/linmod/data.py b/linmod/data.py index 2692980..e9366e5 100755 --- a/linmod/data.py +++ b/linmod/data.py @@ -15,7 +15,8 @@ `lineage`, `count`. Rows are uniquely identified by `(date, division, lineage)`. `date` and `fd_offset` can be computed from each other, given the forecast date; the `fd_offset` column is the number of days between the forecast date and the `date` -column, such that, for example, 0 is the forecast date, -1 the day before, and 1 the day after. +column, such that, for example, 0 is the forecast date, -1 the day before, and 1 the +day after. Note that observations without a recorded date are removed, and only observations from human hosts are included. @@ -184,6 +185,13 @@ config["data"]["forecast_date"]["day"], ) + horizon_lower_date = forecast_date.dt.offset_by( + f'{config["data"]["horizon"]["lower"]}d' + ) + horizon_upper_date = forecast_date.dt.offset_by( + f'{config["data"]["horizon"]["upper"]}d' + ) + full_df = ( pl.scan_csv(cache_path, separator="\t") .rename({config["data"]["lineage_column_name"]: "lineage"}) @@ -192,16 +200,16 @@ # that are resolved only to the month, not the day .cast({"date": pl.Date, "date_submitted": pl.Date}, strict=False) .filter( + # Drop samples with missing collection or reporting dates pl.col("date").is_not_null(), pl.col("date_submitted").is_not_null(), - forecast_date + config["data"]["horizon"]["lower"] - <= pl.col("date"), - pl.col("date") - <= forecast_date + config["data"]["horizon"]["upper"], - forecast_date + config["data"]["horizon"]["lower"] - <= pl.col("date_submitted"), - pl.col("date_submitted") - <= forecast_date + config["data"]["horizon"]["upper"], + # Drop samples collected outside the horizon + horizon_lower_date <= pl.col("date"), + pl.col("date") <= horizon_upper_date, + # Drop samples reported outside the horizon + horizon_lower_date <= pl.col("date_submitted"), + pl.col("date_submitted") <= horizon_upper_date, + # Drop samples not from humans in the included US divisions pl.col("division").is_in(config["data"]["included_divisions"]), country="USA", host="Homo sapiens",