Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Simple history visualization #32

Merged
merged 5 commits into from
Aug 20, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
131 changes: 131 additions & 0 deletions exploration/whole_hist_viz.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,131 @@
import argparse
import datetime

import polars as pl
from plotnine import aes, geom_line, ggplot, theme_bw


def make_plot(dfp, ignore_under, first_date, last_date):
r"""
Plots all NextStrain clades in the available data, weekly, for the full duration.
"""

lineage_year = (
pl.col("lineage")
.replace("recombinant", "0")
.str.extract(r"(\d+)")
.str.to_integer(strict=True)
.add(2000)
)

df = (
pl.scan_csv(dfp, separator="\t")
.rename({"clade_nextstrain": "lineage"})
.cast({"date": pl.Date, "date_submitted": pl.Date}, strict=False)
.filter(
pl.col("lineage").is_not_null(),
# Drop samples with missing collection or reporting dates
pl.col("date").is_not_null(),
# Drop samples claiming to be reported before being collected
pl.col("date") <= pl.col("date_submitted"),
# Keep only samples in range of specified days
pl.col("date") >= first_date,
pl.col("date") <= last_date,
# Drop impossible lineage assigments
# (lineages which had not yet been named, e.g. a sequence
# in 2020 cannot belong to 23D)
pl.col("date").dt.year() >= lineage_year,
country="USA",
host="Homo sapiens",
)
)

df = (
df.with_columns(wd=pl.col("date").dt.weekday())
.with_columns(
offset=pl.col("wd").map_elements(
lambda x: "-" + str(x) + "d" if x != 7 else "0d",
return_dtype=pl.String,
)
)
.with_columns(pl.col("date").dt.offset_by(pl.col("offset")))
.group_by("lineage", "date")
.agg(pl.len().alias("count"))
.with_columns(
frequency=(pl.col("count") / pl.sum("count").over("date")),
)
.select("date", "lineage", "frequency")
.collect()
)

trivial = (
df.group_by("lineage")
.agg(pl.col("frequency").max())
.with_columns(ignorable=(pl.col("frequency") < ignore_under))
)

plt = (
ggplot(
df.join(
trivial.drop("frequency"), on="lineage", how="left"
).filter(pl.col("ignorable").not_())
)
+ geom_line(
aes(
x="date",
y="frequency",
color="lineage",
),
)
+ theme_bw()
)

return plt


if __name__ == "__main__":
parser = argparse.ArgumentParser(
description="A simple plot of the proportions of all currently-named NextStrain clades over time in the United States, with no per-state breakdown."
)
parser.add_argument("data_filename")
parser.add_argument("output_filepath")
parser.add_argument(
"-f",
"--first_date",
default="1900-01-01",
help="Plot will only include data for this day or later. Default corresponds to all days.",
)
parser.add_argument(
"-l",
"--last_date",
default="2100-01-01",
help="Plot will only include data for up to this day or earlier. Default corresponds to all days.",
)
parser.add_argument(
"-c",
"--frequency_cutoff",
default=0.5,
help="Plot will only include lineages which exceed this frequency in at least one week",
)
parser.add_argument(
"-w", "--width", default=6, help="Plot width, in inches."
)
parser.add_argument(
"-t", "--height", default=4, help="Plot height, in inches."
)

args = parser.parse_args()

plt = make_plot(
args.data_filename,
ignore_under=float(args.frequency_cutoff),
first_date=datetime.datetime.strptime(args.first_date, "%Y-%m-%d"),
last_date=datetime.datetime.strptime(args.last_date, "%Y-%m-%d"),
)

plt.save(
args.output_filepath,
width=float(args.width),
height=float(args.height),
limitsize=False,
)
15 changes: 15 additions & 0 deletions linmod/data.py
Original file line number Diff line number Diff line change
Expand Up @@ -132,6 +132,7 @@
`data` key.
"""


if __name__ == "__main__":
# Load configuration, if given

Expand Down Expand Up @@ -192,6 +193,14 @@
f'{config["data"]["horizon"]["upper"]}d'
)

lineage_year = (
pl.col("lineage")
.replace("recombinant", "0")
.str.extract(r"(\d+)")
.str.to_integer(strict=True)
.add(2000)
)

full_df = (
pl.scan_csv(cache_path, separator="\t")
.rename({config["data"]["lineage_column_name"]: "lineage"})
Expand All @@ -200,6 +209,8 @@
# that are resolved only to the month, not the day
.cast({"date": pl.Date, "date_submitted": pl.Date}, strict=False)
.filter(
# Drop samples with missing lineage
pl.col("lineage").is_not_null(),
# Drop samples with missing collection or reporting dates
pl.col("date").is_not_null(),
pl.col("date_submitted").is_not_null(),
Expand All @@ -208,6 +219,10 @@
pl.col("date") <= horizon_upper_date,
# Drop samples claiming to be reported before being collected
pl.col("date") <= pl.col("date_submitted"),
# Drop impossible lineage assigments
# (lineages which had not yet been named, e.g. a sequence
# in 2020 cannot belong to 23D)
pl.col("date").dt.year() >= lineage_year,
# Drop samples not from humans in the included US divisions
pl.col("division").is_in(config["data"]["included_divisions"]),
country="USA",
Expand Down
Loading