diff --git a/Cladetime.ipynb b/Cladetime.ipynb
new file mode 100644
index 0000000..a675e21
--- /dev/null
+++ b/Cladetime.ipynb
@@ -0,0 +1,248 @@
+{
+ "cells": [
+ {
+ "cell_type": "code",
+ "execution_count": 6,
+ "id": "88fb8744-e06c-4bec-92bb-ab741b9ef32e",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "from pathlib import Path\n",
+ "import polars as pl\n",
+ "from cladetime import CladeTime, Tree"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 19,
+ "id": "96397e19-9397-47f1-a081-80f82ce74508",
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "
\n",
+ "
shape: (20, 6)country | division | date | host | clade_nextstrain | pango_lineage |
---|
str | str | date | str | str | str |
"USA" | "Alabama" | 2022-07-07 | "Homo sapiens" | "22A" | "BA.4.1" |
"USA" | "Arizona" | 2022-07-02 | "Homo sapiens" | "22B" | "BE.1" |
"USA" | "Arizona" | 2022-07-19 | "Homo sapiens" | "22B" | "BF.10" |
"USA" | "Arizona" | 2022-07-15 | "Homo sapiens" | "22B" | "BA.5.2.1" |
"USA" | "Arizona" | 2022-07-20 | "Homo sapiens" | "22B" | "BF.10" |
… | … | … | … | … | … |
"USA" | "California" | 2022-07-19 | "Homo sapiens" | "22B" | "BA.5.2" |
"USA" | "California" | 2022-07-12 | "Homo sapiens" | "22B" | "BA.5.5" |
"USA" | "California" | 2022-07-08 | "Homo sapiens" | "22B" | "BA.5.2" |
"USA" | "California" | 2022-07-15 | "Homo sapiens" | "22B" | "BA.5.2.1" |
"USA" | "California" | 2022-07-27 | "Homo sapiens" | "22B" | "BF.10" |
"
+ ],
+ "text/plain": [
+ "shape: (20, 6)\n",
+ "┌─────────┬────────────┬────────────┬──────────────┬──────────────────┬───────────────┐\n",
+ "│ country ┆ division ┆ date ┆ host ┆ clade_nextstrain ┆ pango_lineage │\n",
+ "│ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- │\n",
+ "│ str ┆ str ┆ date ┆ str ┆ str ┆ str │\n",
+ "╞═════════╪════════════╪════════════╪══════════════╪══════════════════╪═══════════════╡\n",
+ "│ USA ┆ Alabama ┆ 2022-07-07 ┆ Homo sapiens ┆ 22A ┆ BA.4.1 │\n",
+ "│ USA ┆ Arizona ┆ 2022-07-02 ┆ Homo sapiens ┆ 22B ┆ BE.1 │\n",
+ "│ USA ┆ Arizona ┆ 2022-07-19 ┆ Homo sapiens ┆ 22B ┆ BF.10 │\n",
+ "│ USA ┆ Arizona ┆ 2022-07-15 ┆ Homo sapiens ┆ 22B ┆ BA.5.2.1 │\n",
+ "│ USA ┆ Arizona ┆ 2022-07-20 ┆ Homo sapiens ┆ 22B ┆ BF.10 │\n",
+ "│ … ┆ … ┆ … ┆ … ┆ … ┆ … │\n",
+ "│ USA ┆ California ┆ 2022-07-19 ┆ Homo sapiens ┆ 22B ┆ BA.5.2 │\n",
+ "│ USA ┆ California ┆ 2022-07-12 ┆ Homo sapiens ┆ 22B ┆ BA.5.5 │\n",
+ "│ USA ┆ California ┆ 2022-07-08 ┆ Homo sapiens ┆ 22B ┆ BA.5.2 │\n",
+ "│ USA ┆ California ┆ 2022-07-15 ┆ Homo sapiens ┆ 22B ┆ BA.5.2.1 │\n",
+ "│ USA ┆ California ┆ 2022-07-27 ┆ Homo sapiens ┆ 22B ┆ BF.10 │\n",
+ "└─────────┴────────────┴────────────┴──────────────┴──────────────────┴───────────────┘"
+ ]
+ },
+ "execution_count": 19,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "import polars as pl\n",
+ "from cladetime import CladeTime\n",
+ "\n",
+ "ct = CladeTime()\n",
+ "filtered_sequence_metadata = (\n",
+ " ct.sequence_metadata.select([\"country\", \"division\", \"date\", \"host\", \"clade_nextstrain\", \"pango_lineage\", \"Nextclade_pango\",])\n",
+ " .filter(\n",
+ " pl.col(\"country\") == \"USA\",\n",
+ " pl.col(\"date\").is_not_null(),\n",
+ " pl.col(\"host\") == \"Homo sapiens\",\n",
+ " )\n",
+ " .cast({\"date\": pl.Date}, strict=False)\n",
+ ")\n",
+ "\n",
+ "filtered_sequence_metadata.head(20).collect()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 9,
+ "id": "25c4e1a0-45a2-4d5d-9a0d-1aa6bc3c2092",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "ct = CladeTime(sequence_as_of=\"2024-10-01\")"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 10,
+ "id": "327ec474-08f2-4813-8ab0-ae2bc4e25b8d",
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "'https://nextstrain-data.s3.amazonaws.com/files/ncov/open/metadata.tsv.zst?versionId=JhEz3CcrWNaeUs9wSjIazSgM5mvbaZCO'"
+ ]
+ },
+ "execution_count": 10,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "ct.url_sequence_metadata"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 13,
+ "id": "12aec086-e163-4bd1-ad79-355c138613b1",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "ct = CladeTime(tree_as_of=\"2024-10-01\")\n",
+ "tree = Tree(ct)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 15,
+ "id": "32047579-1923-4a6c-8acb-4b4e163a1563",
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "PosixPath('https:/data.clades.nextstrain.org/v3/nextstrain/sars-cov-2/wuhan-hu-1/orfs/2024-09-25--21-50-30Z/tree.json')"
+ ]
+ },
+ "execution_count": 15,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "tree.url\n"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 18,
+ "id": "22ae68d3-cba3-4275-82a6-e57b00192fbc",
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "['strain',\n",
+ " 'virus',\n",
+ " 'gisaid_epi_isl',\n",
+ " 'genbank_accession',\n",
+ " 'genbank_accession_rev',\n",
+ " 'sra_accession',\n",
+ " 'date',\n",
+ " 'region',\n",
+ " 'country',\n",
+ " 'division',\n",
+ " 'location',\n",
+ " 'region_exposure',\n",
+ " 'country_exposure',\n",
+ " 'division_exposure',\n",
+ " 'segment',\n",
+ " 'length',\n",
+ " 'host',\n",
+ " 'age',\n",
+ " 'sex',\n",
+ " 'Nextstrain_clade',\n",
+ " 'pango_lineage',\n",
+ " 'GISAID_clade',\n",
+ " 'originating_lab',\n",
+ " 'submitting_lab',\n",
+ " 'authors',\n",
+ " 'url',\n",
+ " 'title',\n",
+ " 'paper_url',\n",
+ " 'date_submitted',\n",
+ " 'date_updated',\n",
+ " 'sampling_strategy',\n",
+ " 'database',\n",
+ " 'clade_nextstrain',\n",
+ " 'clade_who',\n",
+ " 'Nextclade_pango',\n",
+ " 'immune_escape',\n",
+ " 'ace2_binding',\n",
+ " 'missing_data',\n",
+ " 'divergence',\n",
+ " 'nonACGTN',\n",
+ " 'coverage',\n",
+ " 'rare_mutations',\n",
+ " 'reversion_mutations',\n",
+ " 'potential_contaminants',\n",
+ " 'QC_missing_data',\n",
+ " 'QC_mixed_sites',\n",
+ " 'QC_rare_mutations',\n",
+ " 'QC_snp_clusters',\n",
+ " 'QC_frame_shifts',\n",
+ " 'QC_stop_codons',\n",
+ " 'QC_overall_score',\n",
+ " 'QC_overall_status',\n",
+ " 'frame_shifts',\n",
+ " 'deletions',\n",
+ " 'insertions',\n",
+ " 'substitutions',\n",
+ " 'aaSubstitutions',\n",
+ " 'clock_deviation']"
+ ]
+ },
+ "execution_count": 18,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "ct.sequence_metadata.collect_schema().names()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "856601d5-93e5-4ba8-be12-e460fefd1ebb",
+ "metadata": {},
+ "outputs": [],
+ "source": []
+ }
+ ],
+ "metadata": {
+ "kernelspec": {
+ "display_name": "Python 3 (ipykernel)",
+ "language": "python",
+ "name": "python3"
+ },
+ "language_info": {
+ "codemirror_mode": {
+ "name": "ipython",
+ "version": 3
+ },
+ "file_extension": ".py",
+ "mimetype": "text/x-python",
+ "name": "python",
+ "nbconvert_exporter": "python",
+ "pygments_lexer": "ipython3",
+ "version": "3.11.7"
+ }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 5
+}
diff --git a/filtered_metadata.parquet b/filtered_metadata.parquet
new file mode 100644
index 0000000..24a0add
Binary files /dev/null and b/filtered_metadata.parquet differ
diff --git a/requirements/docs.txt b/requirements/docs.txt
new file mode 100644
index 0000000..933c8f8
--- /dev/null
+++ b/requirements/docs.txt
@@ -0,0 +1,248 @@
+# This file was autogenerated by uv via the following command:
+# uv pip compile pyproject.toml --extra docs -o requirements/docs.txt
+alabaster==0.7.16
+ # via sphinx
+apeye==1.4.1
+ # via sphinx-toolbox
+apeye-core==1.1.5
+ # via apeye
+autodocsumm==0.2.13
+ # via sphinx-toolbox
+awscli==1.35.12
+ # via cladetime (pyproject.toml)
+babel==2.16.0
+ # via sphinx
+beautifulsoup4==4.12.3
+ # via
+ # furo
+ # sphinx-toolbox
+boto3==1.35.46
+ # via cladetime (pyproject.toml)
+botocore==1.35.46
+ # via
+ # awscli
+ # boto3
+ # s3transfer
+cachecontrol==0.14.0
+ # via sphinx-toolbox
+certifi==2024.8.30
+ # via requests
+charset-normalizer==3.4.0
+ # via requests
+click==8.1.7
+ # via
+ # cladetime (pyproject.toml)
+ # rich-click
+cloudpathlib==0.20.0
+ # via cladetime (pyproject.toml)
+colorama==0.4.6
+ # via awscli
+contourpy==1.3.0
+ # via matplotlib
+cssutils==2.11.1
+ # via dict2css
+cycler==0.12.1
+ # via matplotlib
+dict2css==0.3.0.post1
+ # via sphinx-toolbox
+docutils==0.16
+ # via
+ # awscli
+ # myst-parser
+ # sphinx
+ # sphinx-tabs
+ # sphinx-toolbox
+domdf-python-tools==3.9.0
+ # via
+ # apeye
+ # apeye-core
+ # dict2css
+ # sphinx-toolbox
+filelock==3.16.1
+ # via
+ # cachecontrol
+ # sphinx-toolbox
+fonttools==4.54.1
+ # via matplotlib
+furo==2023.3.27
+ # via cladetime (pyproject.toml)
+html5lib==1.1
+ # via sphinx-toolbox
+idna==3.10
+ # via
+ # apeye-core
+ # requests
+imagesize==1.4.1
+ # via sphinx
+jellyfish==1.1.0
+ # via us
+jinja2==3.1.4
+ # via
+ # myst-parser
+ # sphinx
+ # sphinx-jinja2-compat
+jmespath==1.0.1
+ # via
+ # boto3
+ # botocore
+kiwisolver==1.4.7
+ # via matplotlib
+markdown-it-py==2.2.0
+ # via
+ # mdit-py-plugins
+ # myst-parser
+ # rich
+markupsafe==3.0.2
+ # via
+ # jinja2
+ # sphinx-jinja2-compat
+matplotlib==3.9.2
+ # via cladetime (pyproject.toml)
+mdit-py-plugins==0.3.5
+ # via myst-parser
+mdurl==0.1.2
+ # via markdown-it-py
+more-itertools==10.5.0
+ # via cssutils
+msgpack==1.1.0
+ # via cachecontrol
+myst-parser==1.0.0
+ # via cladetime (pyproject.toml)
+natsort==8.4.0
+ # via domdf-python-tools
+numpy==2.1.2
+ # via
+ # contourpy
+ # matplotlib
+ # pandas
+ # pyarrow
+packaging==24.1
+ # via
+ # matplotlib
+ # sphinx
+pandas==2.2.3
+ # via cladetime (pyproject.toml)
+pillow==11.0.0
+ # via matplotlib
+platformdirs==4.3.6
+ # via apeye
+polars==1.10.0
+ # via cladetime (pyproject.toml)
+pyarrow==17.0.0
+ # via cladetime (pyproject.toml)
+pyasn1==0.6.1
+ # via rsa
+pygments==2.18.0
+ # via
+ # furo
+ # rich
+ # sphinx
+ # sphinx-prompt
+ # sphinx-tabs
+pyparsing==3.2.0
+ # via matplotlib
+python-dateutil==2.9.0.post0
+ # via
+ # botocore
+ # matplotlib
+ # pandas
+pytz==2024.2
+ # via pandas
+pyyaml==6.0.2
+ # via
+ # awscli
+ # myst-parser
+requests==2.32.3
+ # via
+ # cladetime (pyproject.toml)
+ # apeye
+ # cachecontrol
+ # sphinx
+rich==13.9.3
+ # via
+ # cladetime (pyproject.toml)
+ # rich-click
+rich-click==1.8.3
+ # via cladetime (pyproject.toml)
+rsa==4.7.2
+ # via awscli
+ruamel-yaml==0.18.6
+ # via sphinx-toolbox
+ruamel-yaml-clib==0.2.12
+ # via ruamel-yaml
+s3transfer==0.10.3
+ # via
+ # awscli
+ # boto3
+six==1.16.0
+ # via
+ # html5lib
+ # python-dateutil
+snowballstemmer==2.2.0
+ # via sphinx
+soupsieve==2.6
+ # via beautifulsoup4
+sphinx==5.3.0
+ # via
+ # cladetime (pyproject.toml)
+ # autodocsumm
+ # furo
+ # myst-parser
+ # sphinx-autodoc-typehints
+ # sphinx-basic-ng
+ # sphinx-copybutton
+ # sphinx-github-style
+ # sphinx-prompt
+ # sphinx-tabs
+ # sphinx-toolbox
+ # sphinxext-opengraph
+sphinx-autodoc-typehints==1.23.0
+ # via sphinx-toolbox
+sphinx-basic-ng==1.0.0b2
+ # via furo
+sphinx-copybutton==0.5.2
+ # via cladetime (pyproject.toml)
+sphinx-github-style==1.2.2
+ # via cladetime (pyproject.toml)
+sphinx-jinja2-compat==0.3.0
+ # via sphinx-toolbox
+sphinx-prompt==1.5.0
+ # via sphinx-toolbox
+sphinx-tabs==3.4.5
+ # via sphinx-toolbox
+sphinx-toolbox==3.8.1
+ # via cladetime (pyproject.toml)
+sphinxcontrib-applehelp==2.0.0
+ # via sphinx
+sphinxcontrib-devhelp==2.0.0
+ # via sphinx
+sphinxcontrib-htmlhelp==2.1.0
+ # via sphinx
+sphinxcontrib-jsmath==1.0.1
+ # via sphinx
+sphinxcontrib-qthelp==2.0.0
+ # via sphinx
+sphinxcontrib-serializinghtml==2.0.0
+ # via sphinx
+sphinxext-opengraph==0.9.1
+ # via cladetime (pyproject.toml)
+structlog==24.4.0
+ # via cladetime (pyproject.toml)
+tabulate==0.9.0
+ # via sphinx-toolbox
+typing-extensions==4.12.2
+ # via
+ # domdf-python-tools
+ # rich-click
+ # sphinx-toolbox
+tzdata==2024.2
+ # via pandas
+urllib3==2.2.3
+ # via
+ # cladetime (pyproject.toml)
+ # botocore
+ # requests
+us==3.2.0
+ # via cladetime (pyproject.toml)
+webencodings==0.5.1
+ # via html5lib
diff --git a/src/cladetime/util/sequence.py b/src/cladetime/util/sequence.py
index 1057073..f806c5e 100644
--- a/src/cladetime/util/sequence.py
+++ b/src/cladetime/util/sequence.py
@@ -147,7 +147,11 @@ def _get_ncov_metadata(
)
return {}
- return response.json()
+ metadata = response.json()
+ if metadata.get("nextclade_dataset_name", "").lower() == "sars-cov-2":
+ metadata["nextclade_dataset_name_full"] = "nextstrain/sars-cov-2/wuhan-hu-1/orfs"
+
+ return metadata
def filter_covid_genome_metadata(metadata: pl.LazyFrame, cols: list = []) -> pl.LazyFrame:
@@ -175,11 +179,13 @@ def filter_covid_genome_metadata(metadata: pl.LazyFrame, cols: list = []) -> pl.
.filter(
pl.col("country") == "USA",
pl.col("division").is_in(states),
- pl.col("date").is_not_null(),
pl.col("host") == "Homo sapiens",
)
.rename({"clade_nextstrain": "clade", "division": "location"})
.cast({"date": pl.Date}, strict=False)
+ .filter(
+ pl.col("date").is_not_null(),
+ )
)
return filtered_metadata
diff --git a/tests/unit/util/test_sequence.py b/tests/unit/util/test_sequence.py
index b5f21b7..2532c7d 100644
--- a/tests/unit/util/test_sequence.py
+++ b/tests/unit/util/test_sequence.py
@@ -94,15 +94,23 @@ def test_download_covid_genome_metadata_no_history(s3_setup, tmp_path, mock_sess
def test_filter_covid_genome_metadata():
test_genome_metadata = {
- "date": ["2022-01-01", "2022-01-02", "2022-01-03", "2023-12-25", None, "2023-12-27"],
- "host": ["Homo sapiens", "Homo sapiens", "Homo sapiens", "Narwhals", "Homo sapiens", "Homo sapiens"],
- "country": ["USA", "Argentina", "USA", "USA", "USA", "USA"],
- "division": ["Alaska", "Maine", "Guam", "Puerto Rico", "Utah", "Pennsylvania"],
- "clade_nextstrain": ["AAA", "BBB", "CCC", "DDD", "EEE", "FFF"],
- "location": ["Vulcan", "Reisa", "Bajor", "Deep Space 9", "Earth", "Cardassia"],
- "genbank_accession": ["A1", "A2", "B1", "B2", "C1", "C2"],
- "genbank_accession_rev": ["A1.1", "A2.4", "B1.1", "B2.5", "C1.1", "C2.1"],
- "unwanted_column": [1, 2, 3, 4, 5, 6],
+ "date": ["2022-01-01", "2022-01-02", "2022-01-03", "2023-12-25", None, "2023-12-27", "2023-05"],
+ "host": [
+ "Homo sapiens",
+ "Homo sapiens",
+ "Homo sapiens",
+ "Narwhals",
+ "Homo sapiens",
+ "Homo sapiens",
+ "Homo sapiens",
+ ],
+ "country": ["USA", "Argentina", "USA", "USA", "USA", "USA", "USA"],
+ "division": ["Alaska", "Maine", "Guam", "Puerto Rico", "Utah", "Pennsylvania", "Pennsylvania"],
+ "clade_nextstrain": ["AAA", "BBB", "CCC", "DDD", "EEE", "FFF", "FFF"],
+ "location": ["Vulcan", "Reisa", "Bajor", "Deep Space 9", "Earth", "Cardassia", "Cardassia"],
+ "genbank_accession": ["A1", "A2", "B1", "B2", "C1", "C2", "C2"],
+ "genbank_accession_rev": ["A1.1", "A2.4", "B1.1", "B2.5", "C1.1", "C2.1", "C2.1"],
+ "unwanted_column": [1, 2, 3, 4, 5, 6, 7],
}
lf_metadata = pl.LazyFrame(test_genome_metadata)