diff --git a/exploration/whole_hist_viz.py b/exploration/whole_hist_viz.py new file mode 100755 index 0000000..5b578b7 --- /dev/null +++ b/exploration/whole_hist_viz.py @@ -0,0 +1,131 @@ +import argparse +import datetime + +import polars as pl +from plotnine import aes, geom_line, ggplot, theme_bw + + +def make_plot(dfp, ignore_under, first_date, last_date): + r""" + Plots all NextStrain clades in the available data, weekly, for the full duration. + """ + + lineage_year = ( + pl.col("lineage") + .replace("recombinant", "0") + .str.extract(r"(\d+)") + .str.to_integer(strict=True) + .add(2000) + ) + + df = ( + pl.scan_csv(dfp, separator="\t") + .rename({"clade_nextstrain": "lineage"}) + .cast({"date": pl.Date, "date_submitted": pl.Date}, strict=False) + .filter( + pl.col("lineage").is_not_null(), + # Drop samples with missing collection or reporting dates + pl.col("date").is_not_null(), + # Drop samples claiming to be reported before being collected + pl.col("date") <= pl.col("date_submitted"), + # Keep only samples in range of specified days + pl.col("date") >= first_date, + pl.col("date") <= last_date, + # Drop impossible lineage assigments + # (lineages which had not yet been named, e.g. a sequence + # in 2020 cannot belong to 23D) + pl.col("date").dt.year() >= lineage_year, + country="USA", + host="Homo sapiens", + ) + ) + + df = ( + df.with_columns(wd=pl.col("date").dt.weekday()) + .with_columns( + offset=pl.col("wd").map_elements( + lambda x: "-" + str(x) + "d" if x != 7 else "0d", + return_dtype=pl.String, + ) + ) + .with_columns(pl.col("date").dt.offset_by(pl.col("offset"))) + .group_by("lineage", "date") + .agg(pl.len().alias("count")) + .with_columns( + frequency=(pl.col("count") / pl.sum("count").over("date")), + ) + .select("date", "lineage", "frequency") + .collect() + ) + + trivial = ( + df.group_by("lineage") + .agg(pl.col("frequency").max()) + .with_columns(ignorable=(pl.col("frequency") < ignore_under)) + ) + + plt = ( + ggplot( + df.join( + trivial.drop("frequency"), on="lineage", how="left" + ).filter(pl.col("ignorable").not_()) + ) + + geom_line( + aes( + x="date", + y="frequency", + color="lineage", + ), + ) + + theme_bw() + ) + + return plt + + +if __name__ == "__main__": + parser = argparse.ArgumentParser( + description="A simple plot of the proportions of all currently-named NextStrain clades over time in the United States, with no per-state breakdown." + ) + parser.add_argument("data_filename") + parser.add_argument("output_filepath") + parser.add_argument( + "-f", + "--first_date", + default="1900-01-01", + help="Plot will only include data for this day or later. Default corresponds to all days.", + ) + parser.add_argument( + "-l", + "--last_date", + default="2100-01-01", + help="Plot will only include data for up to this day or earlier. Default corresponds to all days.", + ) + parser.add_argument( + "-c", + "--frequency_cutoff", + default=0.5, + help="Plot will only include lineages which exceed this frequency in at least one week", + ) + parser.add_argument( + "-w", "--width", default=6, help="Plot width, in inches." + ) + parser.add_argument( + "-t", "--height", default=4, help="Plot height, in inches." + ) + + args = parser.parse_args() + + plt = make_plot( + args.data_filename, + ignore_under=float(args.frequency_cutoff), + first_date=datetime.datetime.strptime(args.first_date, "%Y-%m-%d"), + last_date=datetime.datetime.strptime(args.last_date, "%Y-%m-%d"), + ) + + plt.save( + args.output_filepath, + width=float(args.width), + height=float(args.height), + limitsize=False, + ) diff --git a/linmod/data.py b/linmod/data.py index 368cadb..cb6e139 100755 --- a/linmod/data.py +++ b/linmod/data.py @@ -132,6 +132,7 @@ `data` key. """ + if __name__ == "__main__": # Load configuration, if given @@ -192,6 +193,14 @@ f'{config["data"]["horizon"]["upper"]}d' ) + lineage_year = ( + pl.col("lineage") + .replace("recombinant", "0") + .str.extract(r"(\d+)") + .str.to_integer(strict=True) + .add(2000) + ) + full_df = ( pl.scan_csv(cache_path, separator="\t") .rename({config["data"]["lineage_column_name"]: "lineage"}) @@ -200,6 +209,8 @@ # that are resolved only to the month, not the day .cast({"date": pl.Date, "date_submitted": pl.Date}, strict=False) .filter( + # Drop samples with missing lineage + pl.col("lineage").is_not_null(), # Drop samples with missing collection or reporting dates pl.col("date").is_not_null(), pl.col("date_submitted").is_not_null(), @@ -208,6 +219,10 @@ pl.col("date") <= horizon_upper_date, # Drop samples claiming to be reported before being collected pl.col("date") <= pl.col("date_submitted"), + # Drop impossible lineage assigments + # (lineages which had not yet been named, e.g. a sequence + # in 2020 cannot belong to 23D) + pl.col("date").dt.year() >= lineage_year, # Drop samples not from humans in the included US divisions pl.col("division").is_in(config["data"]["included_divisions"]), country="USA",