diff --git a/Cladetime.ipynb b/Cladetime.ipynb new file mode 100644 index 0000000..a675e21 --- /dev/null +++ b/Cladetime.ipynb @@ -0,0 +1,248 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 6, + "id": "88fb8744-e06c-4bec-92bb-ab741b9ef32e", + "metadata": {}, + "outputs": [], + "source": [ + "from pathlib import Path\n", + "import polars as pl\n", + "from cladetime import CladeTime, Tree" + ] + }, + { + "cell_type": "code", + "execution_count": 19, + "id": "96397e19-9397-47f1-a081-80f82ce74508", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "shape: (20, 6)
countrydivisiondatehostclade_nextstrainpango_lineage
strstrdatestrstrstr
"USA""Alabama"2022-07-07"Homo sapiens""22A""BA.4.1"
"USA""Arizona"2022-07-02"Homo sapiens""22B""BE.1"
"USA""Arizona"2022-07-19"Homo sapiens""22B""BF.10"
"USA""Arizona"2022-07-15"Homo sapiens""22B""BA.5.2.1"
"USA""Arizona"2022-07-20"Homo sapiens""22B""BF.10"
"USA""California"2022-07-19"Homo sapiens""22B""BA.5.2"
"USA""California"2022-07-12"Homo sapiens""22B""BA.5.5"
"USA""California"2022-07-08"Homo sapiens""22B""BA.5.2"
"USA""California"2022-07-15"Homo sapiens""22B""BA.5.2.1"
"USA""California"2022-07-27"Homo sapiens""22B""BF.10"
" + ], + "text/plain": [ + "shape: (20, 6)\n", + "┌─────────┬────────────┬────────────┬──────────────┬──────────────────┬───────────────┐\n", + "│ country ┆ division ┆ date ┆ host ┆ clade_nextstrain ┆ pango_lineage │\n", + "│ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- │\n", + "│ str ┆ str ┆ date ┆ str ┆ str ┆ str │\n", + "╞═════════╪════════════╪════════════╪══════════════╪══════════════════╪═══════════════╡\n", + "│ USA ┆ Alabama ┆ 2022-07-07 ┆ Homo sapiens ┆ 22A ┆ BA.4.1 │\n", + "│ USA ┆ Arizona ┆ 2022-07-02 ┆ Homo sapiens ┆ 22B ┆ BE.1 │\n", + "│ USA ┆ Arizona ┆ 2022-07-19 ┆ Homo sapiens ┆ 22B ┆ BF.10 │\n", + "│ USA ┆ Arizona ┆ 2022-07-15 ┆ Homo sapiens ┆ 22B ┆ BA.5.2.1 │\n", + "│ USA ┆ Arizona ┆ 2022-07-20 ┆ Homo sapiens ┆ 22B ┆ BF.10 │\n", + "│ … ┆ … ┆ … ┆ … ┆ … ┆ … │\n", + "│ USA ┆ California ┆ 2022-07-19 ┆ Homo sapiens ┆ 22B ┆ BA.5.2 │\n", + "│ USA ┆ California ┆ 2022-07-12 ┆ Homo sapiens ┆ 22B ┆ BA.5.5 │\n", + "│ USA ┆ California ┆ 2022-07-08 ┆ Homo sapiens ┆ 22B ┆ BA.5.2 │\n", + "│ USA ┆ California ┆ 2022-07-15 ┆ Homo sapiens ┆ 22B ┆ BA.5.2.1 │\n", + "│ USA ┆ California ┆ 2022-07-27 ┆ Homo sapiens ┆ 22B ┆ BF.10 │\n", + "└─────────┴────────────┴────────────┴──────────────┴──────────────────┴───────────────┘" + ] + }, + "execution_count": 19, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "import polars as pl\n", + "from cladetime import CladeTime\n", + "\n", + "ct = CladeTime()\n", + "filtered_sequence_metadata = (\n", + " ct.sequence_metadata.select([\"country\", \"division\", \"date\", \"host\", \"clade_nextstrain\", \"pango_lineage\". \"'Nextclade_pango\",])\n", + " .filter(\n", + " pl.col(\"country\") == \"USA\",\n", + " pl.col(\"date\").is_not_null(),\n", + " pl.col(\"host\") == \"Homo sapiens\",\n", + " )\n", + " .cast({\"date\": pl.Date}, strict=False)\n", + ")\n", + "\n", + "filtered_sequence_metadata.head(20).collect()" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "id": "25c4e1a0-45a2-4d5d-9a0d-1aa6bc3c2092", + "metadata": {}, + "outputs": [], + "source": [ + "ct = CladeTime(sequence_as_of=\"2024-10-01\")" + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "id": "327ec474-08f2-4813-8ab0-ae2bc4e25b8d", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "'https://nextstrain-data.s3.amazonaws.com/files/ncov/open/metadata.tsv.zst?versionId=JhEz3CcrWNaeUs9wSjIazSgM5mvbaZCO'" + ] + }, + "execution_count": 10, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "ct.url_sequence_metadata" + ] + }, + { + "cell_type": "code", + "execution_count": 13, + "id": "12aec086-e163-4bd1-ad79-355c138613b1", + "metadata": {}, + "outputs": [], + "source": [ + "ct = CladeTime(tree_as_of=\"2024-10-01\")\n", + "tree = Tree(ct)" + ] + }, + { + "cell_type": "code", + "execution_count": 15, + "id": "32047579-1923-4a6c-8acb-4b4e163a1563", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "PosixPath('https:/data.clades.nextstrain.org/v3/nextstrain/sars-cov-2/wuhan-hu-1/orfs/2024-09-25--21-50-30Z/tree.json')" + ] + }, + "execution_count": 15, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "tree.url\n" + ] + }, + { + "cell_type": "code", + "execution_count": 18, + "id": "22ae68d3-cba3-4275-82a6-e57b00192fbc", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "['strain',\n", + " 'virus',\n", + " 'gisaid_epi_isl',\n", + " 'genbank_accession',\n", + " 'genbank_accession_rev',\n", + " 'sra_accession',\n", + " 'date',\n", + " 'region',\n", + " 'country',\n", + " 'division',\n", + " 'location',\n", + " 'region_exposure',\n", + " 'country_exposure',\n", + " 'division_exposure',\n", + " 'segment',\n", + " 'length',\n", + " 'host',\n", + " 'age',\n", + " 'sex',\n", + " 'Nextstrain_clade',\n", + " 'pango_lineage',\n", + " 'GISAID_clade',\n", + " 'originating_lab',\n", + " 'submitting_lab',\n", + " 'authors',\n", + " 'url',\n", + " 'title',\n", + " 'paper_url',\n", + " 'date_submitted',\n", + " 'date_updated',\n", + " 'sampling_strategy',\n", + " 'database',\n", + " 'clade_nextstrain',\n", + " 'clade_who',\n", + " 'Nextclade_pango',\n", + " 'immune_escape',\n", + " 'ace2_binding',\n", + " 'missing_data',\n", + " 'divergence',\n", + " 'nonACGTN',\n", + " 'coverage',\n", + " 'rare_mutations',\n", + " 'reversion_mutations',\n", + " 'potential_contaminants',\n", + " 'QC_missing_data',\n", + " 'QC_mixed_sites',\n", + " 'QC_rare_mutations',\n", + " 'QC_snp_clusters',\n", + " 'QC_frame_shifts',\n", + " 'QC_stop_codons',\n", + " 'QC_overall_score',\n", + " 'QC_overall_status',\n", + " 'frame_shifts',\n", + " 'deletions',\n", + " 'insertions',\n", + " 'substitutions',\n", + " 'aaSubstitutions',\n", + " 'clock_deviation']" + ] + }, + "execution_count": 18, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "ct.sequence_metadata.collect_schema().names()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "856601d5-93e5-4ba8-be12-e460fefd1ebb", + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.11.7" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/filtered_metadata.parquet b/filtered_metadata.parquet new file mode 100644 index 0000000..24a0add Binary files /dev/null and b/filtered_metadata.parquet differ diff --git a/requirements/docs.txt b/requirements/docs.txt new file mode 100644 index 0000000..933c8f8 --- /dev/null +++ b/requirements/docs.txt @@ -0,0 +1,248 @@ +# This file was autogenerated by uv via the following command: +# uv pip compile pyproject.toml --extra docs -o requirements/docs.txt +alabaster==0.7.16 + # via sphinx +apeye==1.4.1 + # via sphinx-toolbox +apeye-core==1.1.5 + # via apeye +autodocsumm==0.2.13 + # via sphinx-toolbox +awscli==1.35.12 + # via cladetime (pyproject.toml) +babel==2.16.0 + # via sphinx +beautifulsoup4==4.12.3 + # via + # furo + # sphinx-toolbox +boto3==1.35.46 + # via cladetime (pyproject.toml) +botocore==1.35.46 + # via + # awscli + # boto3 + # s3transfer +cachecontrol==0.14.0 + # via sphinx-toolbox +certifi==2024.8.30 + # via requests +charset-normalizer==3.4.0 + # via requests +click==8.1.7 + # via + # cladetime (pyproject.toml) + # rich-click +cloudpathlib==0.20.0 + # via cladetime (pyproject.toml) +colorama==0.4.6 + # via awscli +contourpy==1.3.0 + # via matplotlib +cssutils==2.11.1 + # via dict2css +cycler==0.12.1 + # via matplotlib +dict2css==0.3.0.post1 + # via sphinx-toolbox +docutils==0.16 + # via + # awscli + # myst-parser + # sphinx + # sphinx-tabs + # sphinx-toolbox +domdf-python-tools==3.9.0 + # via + # apeye + # apeye-core + # dict2css + # sphinx-toolbox +filelock==3.16.1 + # via + # cachecontrol + # sphinx-toolbox +fonttools==4.54.1 + # via matplotlib +furo==2023.3.27 + # via cladetime (pyproject.toml) +html5lib==1.1 + # via sphinx-toolbox +idna==3.10 + # via + # apeye-core + # requests +imagesize==1.4.1 + # via sphinx +jellyfish==1.1.0 + # via us +jinja2==3.1.4 + # via + # myst-parser + # sphinx + # sphinx-jinja2-compat +jmespath==1.0.1 + # via + # boto3 + # botocore +kiwisolver==1.4.7 + # via matplotlib +markdown-it-py==2.2.0 + # via + # mdit-py-plugins + # myst-parser + # rich +markupsafe==3.0.2 + # via + # jinja2 + # sphinx-jinja2-compat +matplotlib==3.9.2 + # via cladetime (pyproject.toml) +mdit-py-plugins==0.3.5 + # via myst-parser +mdurl==0.1.2 + # via markdown-it-py +more-itertools==10.5.0 + # via cssutils +msgpack==1.1.0 + # via cachecontrol +myst-parser==1.0.0 + # via cladetime (pyproject.toml) +natsort==8.4.0 + # via domdf-python-tools +numpy==2.1.2 + # via + # contourpy + # matplotlib + # pandas + # pyarrow +packaging==24.1 + # via + # matplotlib + # sphinx +pandas==2.2.3 + # via cladetime (pyproject.toml) +pillow==11.0.0 + # via matplotlib +platformdirs==4.3.6 + # via apeye +polars==1.10.0 + # via cladetime (pyproject.toml) +pyarrow==17.0.0 + # via cladetime (pyproject.toml) +pyasn1==0.6.1 + # via rsa +pygments==2.18.0 + # via + # furo + # rich + # sphinx + # sphinx-prompt + # sphinx-tabs +pyparsing==3.2.0 + # via matplotlib +python-dateutil==2.9.0.post0 + # via + # botocore + # matplotlib + # pandas +pytz==2024.2 + # via pandas +pyyaml==6.0.2 + # via + # awscli + # myst-parser +requests==2.32.3 + # via + # cladetime (pyproject.toml) + # apeye + # cachecontrol + # sphinx +rich==13.9.3 + # via + # cladetime (pyproject.toml) + # rich-click +rich-click==1.8.3 + # via cladetime (pyproject.toml) +rsa==4.7.2 + # via awscli +ruamel-yaml==0.18.6 + # via sphinx-toolbox +ruamel-yaml-clib==0.2.12 + # via ruamel-yaml +s3transfer==0.10.3 + # via + # awscli + # boto3 +six==1.16.0 + # via + # html5lib + # python-dateutil +snowballstemmer==2.2.0 + # via sphinx +soupsieve==2.6 + # via beautifulsoup4 +sphinx==5.3.0 + # via + # cladetime (pyproject.toml) + # autodocsumm + # furo + # myst-parser + # sphinx-autodoc-typehints + # sphinx-basic-ng + # sphinx-copybutton + # sphinx-github-style + # sphinx-prompt + # sphinx-tabs + # sphinx-toolbox + # sphinxext-opengraph +sphinx-autodoc-typehints==1.23.0 + # via sphinx-toolbox +sphinx-basic-ng==1.0.0b2 + # via furo +sphinx-copybutton==0.5.2 + # via cladetime (pyproject.toml) +sphinx-github-style==1.2.2 + # via cladetime (pyproject.toml) +sphinx-jinja2-compat==0.3.0 + # via sphinx-toolbox +sphinx-prompt==1.5.0 + # via sphinx-toolbox +sphinx-tabs==3.4.5 + # via sphinx-toolbox +sphinx-toolbox==3.8.1 + # via cladetime (pyproject.toml) +sphinxcontrib-applehelp==2.0.0 + # via sphinx +sphinxcontrib-devhelp==2.0.0 + # via sphinx +sphinxcontrib-htmlhelp==2.1.0 + # via sphinx +sphinxcontrib-jsmath==1.0.1 + # via sphinx +sphinxcontrib-qthelp==2.0.0 + # via sphinx +sphinxcontrib-serializinghtml==2.0.0 + # via sphinx +sphinxext-opengraph==0.9.1 + # via cladetime (pyproject.toml) +structlog==24.4.0 + # via cladetime (pyproject.toml) +tabulate==0.9.0 + # via sphinx-toolbox +typing-extensions==4.12.2 + # via + # domdf-python-tools + # rich-click + # sphinx-toolbox +tzdata==2024.2 + # via pandas +urllib3==2.2.3 + # via + # cladetime (pyproject.toml) + # botocore + # requests +us==3.2.0 + # via cladetime (pyproject.toml) +webencodings==0.5.1 + # via html5lib diff --git a/src/cladetime/util/sequence.py b/src/cladetime/util/sequence.py index 1057073..f806c5e 100644 --- a/src/cladetime/util/sequence.py +++ b/src/cladetime/util/sequence.py @@ -147,7 +147,11 @@ def _get_ncov_metadata( ) return {} - return response.json() + metadata = response.json() + if metadata.get("nextclade_dataset_name", "").lower() == "sars-cov-2": + metadata["nextclade_dataset_name_full"] = "nextstrain/sars-cov-2/wuhan-hu-1/orfs" + + return metadata def filter_covid_genome_metadata(metadata: pl.LazyFrame, cols: list = []) -> pl.LazyFrame: @@ -175,11 +179,13 @@ def filter_covid_genome_metadata(metadata: pl.LazyFrame, cols: list = []) -> pl. .filter( pl.col("country") == "USA", pl.col("division").is_in(states), - pl.col("date").is_not_null(), pl.col("host") == "Homo sapiens", ) .rename({"clade_nextstrain": "clade", "division": "location"}) .cast({"date": pl.Date}, strict=False) + .filter( + pl.col("date").is_not_null(), + ) ) return filtered_metadata diff --git a/tests/unit/util/test_sequence.py b/tests/unit/util/test_sequence.py index b5f21b7..2532c7d 100644 --- a/tests/unit/util/test_sequence.py +++ b/tests/unit/util/test_sequence.py @@ -94,15 +94,23 @@ def test_download_covid_genome_metadata_no_history(s3_setup, tmp_path, mock_sess def test_filter_covid_genome_metadata(): test_genome_metadata = { - "date": ["2022-01-01", "2022-01-02", "2022-01-03", "2023-12-25", None, "2023-12-27"], - "host": ["Homo sapiens", "Homo sapiens", "Homo sapiens", "Narwhals", "Homo sapiens", "Homo sapiens"], - "country": ["USA", "Argentina", "USA", "USA", "USA", "USA"], - "division": ["Alaska", "Maine", "Guam", "Puerto Rico", "Utah", "Pennsylvania"], - "clade_nextstrain": ["AAA", "BBB", "CCC", "DDD", "EEE", "FFF"], - "location": ["Vulcan", "Reisa", "Bajor", "Deep Space 9", "Earth", "Cardassia"], - "genbank_accession": ["A1", "A2", "B1", "B2", "C1", "C2"], - "genbank_accession_rev": ["A1.1", "A2.4", "B1.1", "B2.5", "C1.1", "C2.1"], - "unwanted_column": [1, 2, 3, 4, 5, 6], + "date": ["2022-01-01", "2022-01-02", "2022-01-03", "2023-12-25", None, "2023-12-27", "2023-05"], + "host": [ + "Homo sapiens", + "Homo sapiens", + "Homo sapiens", + "Narwhals", + "Homo sapiens", + "Homo sapiens", + "Homo sapiens", + ], + "country": ["USA", "Argentina", "USA", "USA", "USA", "USA", "USA"], + "division": ["Alaska", "Maine", "Guam", "Puerto Rico", "Utah", "Pennsylvania", "Pennsylvania"], + "clade_nextstrain": ["AAA", "BBB", "CCC", "DDD", "EEE", "FFF", "FFF"], + "location": ["Vulcan", "Reisa", "Bajor", "Deep Space 9", "Earth", "Cardassia", "Cardassia"], + "genbank_accession": ["A1", "A2", "B1", "B2", "C1", "C2", "C2"], + "genbank_accession_rev": ["A1.1", "A2.4", "B1.1", "B2.5", "C1.1", "C2.1", "C2.1"], + "unwanted_column": [1, 2, 3, 4, 5, 6, 7], } lf_metadata = pl.LazyFrame(test_genome_metadata)