Skip to content

Commit

Permalink
Cleaner data filtering
Browse files Browse the repository at this point in the history
  • Loading branch information
thanasibakis committed Aug 15, 2024
1 parent a002a2e commit d7e02f3
Showing 1 changed file with 17 additions and 9 deletions.
26 changes: 17 additions & 9 deletions linmod/data.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,8 @@
`lineage`, `count`. Rows are uniquely identified by `(date, division, lineage)`.
`date` and `fd_offset` can be computed from each other, given the forecast date;
the `fd_offset` column is the number of days between the forecast date and the `date`
column, such that, for example, 0 is the forecast date, -1 the day before, and 1 the day after.
column, such that, for example, 0 is the forecast date, -1 the day before, and 1 the
day after.
Note that observations without a recorded collection or submission date are removed,
and only observations from human hosts are included.
Expand Down Expand Up @@ -184,6 +185,13 @@
config["data"]["forecast_date"]["day"],
)

horizon_lower_date = forecast_date.dt.offset_by(
f'{config["data"]["horizon"]["lower"]}d'
)
horizon_upper_date = forecast_date.dt.offset_by(
f'{config["data"]["horizon"]["upper"]}d'
)

full_df = (
pl.scan_csv(cache_path, separator="\t")
.rename({config["data"]["lineage_column_name"]: "lineage"})
Expand All @@ -192,16 +200,16 @@
# that are resolved only to the month, not the day
.cast({"date": pl.Date, "date_submitted": pl.Date}, strict=False)
.filter(
# Drop samples with missing collection or reporting dates
pl.col("date").is_not_null(),
pl.col("date_submitted").is_not_null(),
forecast_date + config["data"]["horizon"]["lower"]
<= pl.col("date"),
pl.col("date")
<= forecast_date + config["data"]["horizon"]["upper"],
forecast_date + config["data"]["horizon"]["lower"]
<= pl.col("date_submitted"),
pl.col("date_submitted")
<= forecast_date + config["data"]["horizon"]["upper"],
# Drop samples collected outside the horizon
horizon_lower_date <= pl.col("date"),
pl.col("date") <= horizon_upper_date,
# Drop samples reported outside the horizon
horizon_lower_date <= pl.col("date_submitted"),
pl.col("date_submitted") <= horizon_upper_date,
# Drop samples not from humans in the included US divisions
pl.col("division").is_in(config["data"]["included_divisions"]),
country="USA",
host="Homo sapiens",
Expand Down

0 comments on commit d7e02f3

Please sign in to comment.