Skip to content

Commit

Permalink
Remove null collection dates from SARS-CoV-2 sequence metadata
Browse files Browse the repository at this point in the history
Closes #36

This corrects a regression introduced in #4c0728a and adds
a test case for non-empty/bad-format dates in the metadata file.
  • Loading branch information
bsweger committed Oct 24, 2024
1 parent 5847385 commit d074289
Show file tree
Hide file tree
Showing 5 changed files with 521 additions and 11 deletions.
248 changes: 248 additions & 0 deletions Cladetime.ipynb
Original file line number Diff line number Diff line change
@@ -0,0 +1,248 @@
{
"cells": [
{
"cell_type": "code",
"execution_count": 6,
"id": "88fb8744-e06c-4bec-92bb-ab741b9ef32e",
"metadata": {},
"outputs": [],
"source": [
"from pathlib import Path\n",
"import polars as pl\n",
"from cladetime import CladeTime, Tree"
]
},
{
"cell_type": "code",
"execution_count": 19,
"id": "96397e19-9397-47f1-a081-80f82ce74508",
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div><style>\n",
".dataframe > thead > tr,\n",
".dataframe > tbody > tr {\n",
" text-align: right;\n",
" white-space: pre-wrap;\n",
"}\n",
"</style>\n",
"<small>shape: (20, 6)</small><table border=\"1\" class=\"dataframe\"><thead><tr><th>country</th><th>division</th><th>date</th><th>host</th><th>clade_nextstrain</th><th>pango_lineage</th></tr><tr><td>str</td><td>str</td><td>date</td><td>str</td><td>str</td><td>str</td></tr></thead><tbody><tr><td>&quot;USA&quot;</td><td>&quot;Alabama&quot;</td><td>2022-07-07</td><td>&quot;Homo sapiens&quot;</td><td>&quot;22A&quot;</td><td>&quot;BA.4.1&quot;</td></tr><tr><td>&quot;USA&quot;</td><td>&quot;Arizona&quot;</td><td>2022-07-02</td><td>&quot;Homo sapiens&quot;</td><td>&quot;22B&quot;</td><td>&quot;BE.1&quot;</td></tr><tr><td>&quot;USA&quot;</td><td>&quot;Arizona&quot;</td><td>2022-07-19</td><td>&quot;Homo sapiens&quot;</td><td>&quot;22B&quot;</td><td>&quot;BF.10&quot;</td></tr><tr><td>&quot;USA&quot;</td><td>&quot;Arizona&quot;</td><td>2022-07-15</td><td>&quot;Homo sapiens&quot;</td><td>&quot;22B&quot;</td><td>&quot;BA.5.2.1&quot;</td></tr><tr><td>&quot;USA&quot;</td><td>&quot;Arizona&quot;</td><td>2022-07-20</td><td>&quot;Homo sapiens&quot;</td><td>&quot;22B&quot;</td><td>&quot;BF.10&quot;</td></tr><tr><td>&hellip;</td><td>&hellip;</td><td>&hellip;</td><td>&hellip;</td><td>&hellip;</td><td>&hellip;</td></tr><tr><td>&quot;USA&quot;</td><td>&quot;California&quot;</td><td>2022-07-19</td><td>&quot;Homo sapiens&quot;</td><td>&quot;22B&quot;</td><td>&quot;BA.5.2&quot;</td></tr><tr><td>&quot;USA&quot;</td><td>&quot;California&quot;</td><td>2022-07-12</td><td>&quot;Homo sapiens&quot;</td><td>&quot;22B&quot;</td><td>&quot;BA.5.5&quot;</td></tr><tr><td>&quot;USA&quot;</td><td>&quot;California&quot;</td><td>2022-07-08</td><td>&quot;Homo sapiens&quot;</td><td>&quot;22B&quot;</td><td>&quot;BA.5.2&quot;</td></tr><tr><td>&quot;USA&quot;</td><td>&quot;California&quot;</td><td>2022-07-15</td><td>&quot;Homo sapiens&quot;</td><td>&quot;22B&quot;</td><td>&quot;BA.5.2.1&quot;</td></tr><tr><td>&quot;USA&quot;</td><td>&quot;California&quot;</td><td>2022-07-27</td><td>&quot;Homo 
sapiens&quot;</td><td>&quot;22B&quot;</td><td>&quot;BF.10&quot;</td></tr></tbody></table></div>"
],
"text/plain": [
"shape: (20, 6)\n",
"┌─────────┬────────────┬────────────┬──────────────┬──────────────────┬───────────────┐\n",
"│ country ┆ division ┆ date ┆ host ┆ clade_nextstrain ┆ pango_lineage │\n",
"│ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- │\n",
"│ str ┆ str ┆ date ┆ str ┆ str ┆ str │\n",
"╞═════════╪════════════╪════════════╪══════════════╪══════════════════╪═══════════════╡\n",
"│ USA ┆ Alabama ┆ 2022-07-07 ┆ Homo sapiens ┆ 22A ┆ BA.4.1 │\n",
"│ USA ┆ Arizona ┆ 2022-07-02 ┆ Homo sapiens ┆ 22B ┆ BE.1 │\n",
"│ USA ┆ Arizona ┆ 2022-07-19 ┆ Homo sapiens ┆ 22B ┆ BF.10 │\n",
"│ USA ┆ Arizona ┆ 2022-07-15 ┆ Homo sapiens ┆ 22B ┆ BA.5.2.1 │\n",
"│ USA ┆ Arizona ┆ 2022-07-20 ┆ Homo sapiens ┆ 22B ┆ BF.10 │\n",
"│ … ┆ … ┆ … ┆ … ┆ … ┆ … │\n",
"│ USA ┆ California ┆ 2022-07-19 ┆ Homo sapiens ┆ 22B ┆ BA.5.2 │\n",
"│ USA ┆ California ┆ 2022-07-12 ┆ Homo sapiens ┆ 22B ┆ BA.5.5 │\n",
"│ USA ┆ California ┆ 2022-07-08 ┆ Homo sapiens ┆ 22B ┆ BA.5.2 │\n",
"│ USA ┆ California ┆ 2022-07-15 ┆ Homo sapiens ┆ 22B ┆ BA.5.2.1 │\n",
"│ USA ┆ California ┆ 2022-07-27 ┆ Homo sapiens ┆ 22B ┆ BF.10 │\n",
"└─────────┴────────────┴────────────┴──────────────┴──────────────────┴───────────────┘"
]
},
"execution_count": 19,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"import polars as pl\n",
"from cladetime import CladeTime\n",
"\n",
"ct = CladeTime()\n",
"filtered_sequence_metadata = (\n",
" ct.sequence_metadata.select([\"country\", \"division\", \"date\", \"host\", \"clade_nextstrain\", \"pango_lineage\", \"Nextclade_pango\"])\n",
" .filter(\n",
" pl.col(\"country\") == \"USA\",\n",
" pl.col(\"date\").is_not_null(),\n",
" pl.col(\"host\") == \"Homo sapiens\",\n",
" )\n",
" .cast({\"date\": pl.Date}, strict=False)\n",
")\n",
"\n",
"filtered_sequence_metadata.head(20).collect()"
]
},
{
"cell_type": "code",
"execution_count": 9,
"id": "25c4e1a0-45a2-4d5d-9a0d-1aa6bc3c2092",
"metadata": {},
"outputs": [],
"source": [
"ct = CladeTime(sequence_as_of=\"2024-10-01\")"
]
},
{
"cell_type": "code",
"execution_count": 10,
"id": "327ec474-08f2-4813-8ab0-ae2bc4e25b8d",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"'https://nextstrain-data.s3.amazonaws.com/files/ncov/open/metadata.tsv.zst?versionId=JhEz3CcrWNaeUs9wSjIazSgM5mvbaZCO'"
]
},
"execution_count": 10,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"ct.url_sequence_metadata"
]
},
{
"cell_type": "code",
"execution_count": 13,
"id": "12aec086-e163-4bd1-ad79-355c138613b1",
"metadata": {},
"outputs": [],
"source": [
"ct = CladeTime(tree_as_of=\"2024-10-01\")\n",
"tree = Tree(ct)"
]
},
{
"cell_type": "code",
"execution_count": 15,
"id": "32047579-1923-4a6c-8acb-4b4e163a1563",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"PosixPath('https:/data.clades.nextstrain.org/v3/nextstrain/sars-cov-2/wuhan-hu-1/orfs/2024-09-25--21-50-30Z/tree.json')"
]
},
"execution_count": 15,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"tree.url\n"
]
},
{
"cell_type": "code",
"execution_count": 18,
"id": "22ae68d3-cba3-4275-82a6-e57b00192fbc",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"['strain',\n",
" 'virus',\n",
" 'gisaid_epi_isl',\n",
" 'genbank_accession',\n",
" 'genbank_accession_rev',\n",
" 'sra_accession',\n",
" 'date',\n",
" 'region',\n",
" 'country',\n",
" 'division',\n",
" 'location',\n",
" 'region_exposure',\n",
" 'country_exposure',\n",
" 'division_exposure',\n",
" 'segment',\n",
" 'length',\n",
" 'host',\n",
" 'age',\n",
" 'sex',\n",
" 'Nextstrain_clade',\n",
" 'pango_lineage',\n",
" 'GISAID_clade',\n",
" 'originating_lab',\n",
" 'submitting_lab',\n",
" 'authors',\n",
" 'url',\n",
" 'title',\n",
" 'paper_url',\n",
" 'date_submitted',\n",
" 'date_updated',\n",
" 'sampling_strategy',\n",
" 'database',\n",
" 'clade_nextstrain',\n",
" 'clade_who',\n",
" 'Nextclade_pango',\n",
" 'immune_escape',\n",
" 'ace2_binding',\n",
" 'missing_data',\n",
" 'divergence',\n",
" 'nonACGTN',\n",
" 'coverage',\n",
" 'rare_mutations',\n",
" 'reversion_mutations',\n",
" 'potential_contaminants',\n",
" 'QC_missing_data',\n",
" 'QC_mixed_sites',\n",
" 'QC_rare_mutations',\n",
" 'QC_snp_clusters',\n",
" 'QC_frame_shifts',\n",
" 'QC_stop_codons',\n",
" 'QC_overall_score',\n",
" 'QC_overall_status',\n",
" 'frame_shifts',\n",
" 'deletions',\n",
" 'insertions',\n",
" 'substitutions',\n",
" 'aaSubstitutions',\n",
" 'clock_deviation']"
]
},
"execution_count": 18,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"ct.sequence_metadata.collect_schema().names()"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "856601d5-93e5-4ba8-be12-e460fefd1ebb",
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3 (ipykernel)",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.11.7"
}
},
"nbformat": 4,
"nbformat_minor": 5
}
Binary file added filtered_metadata.parquet
Binary file not shown.
Loading

0 comments on commit d074289

Please sign in to comment.