Skip to content

Commit

Permalink
Merge pull request #27 from reichlab/bsweger/sequence-by-state-date/50
Browse files Browse the repository at this point in the history
Add a sequence_metadata attribute to CladeTime
  • Loading branch information
bsweger authored Oct 4, 2024
2 parents 5ace911 + 41ea13b commit a15a7f3
Show file tree
Hide file tree
Showing 9 changed files with 181 additions and 35 deletions.
57 changes: 44 additions & 13 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -37,13 +37,44 @@ In [1]: from virus_clade_utils.cladetime import CladeTime

In [2]: ct = CladeTime()

# URL for the corresponding Nextstrain Sars-Cov-2 sequence metadata
In [3]: ct.url_sequence_metadata
Out[3]: 'https://nextstrain-data.s3.amazonaws.com/files/ncov/open/metadata.tsv.zst?versionId=VJomXHLN2L9aqvS9Ax_LJ4ecr5ZsFFhE'

# Metadata from the pipeline that produced the above file
In [4]: ct.ncov_metadata
Out[4]:
# Return a Polars LazyFrame with the sequence metadata.
In [4]: import polars as pl

In [5]: lf = ct.sequence_metadata

# From there, you can use Polars to manipulate the data as needed
In [6]: filtered_sequence_metadata = (
lf
.select(["country", "division", "date", "host", "clade_nextstrain"])
.rename({"clade_nextstrain": "clade", "division": "location"})
.filter(
pl.col("country") == "USA",
pl.col("host") == "Homo sapiens"
)
).collect()

In [7]: filtered_sequence_metadata.head()
Out[7]:
shape: (5, 5)
┌─────────┬──────────┬────────────┬──────────────┬───────┐
│ country ┆ location ┆ date       ┆ host         ┆ clade │
│ ---     ┆ ---      ┆ ---        ┆ ---          ┆ ---   │
│ str     ┆ str      ┆ str        ┆ str          ┆ str   │
╞═════════╪══════════╪════════════╪══════════════╪═══════╡
│ USA     ┆ Alabama  ┆ 2022-07-07 ┆ Homo sapiens ┆ 22A   │
│ USA     ┆ Arizona  ┆ 2022-07-02 ┆ Homo sapiens ┆ 22B   │
│ USA     ┆ Arizona  ┆ 2022-07-19 ┆ Homo sapiens ┆ 22B   │
│ USA     ┆ Arizona  ┆ 2022-07-15 ┆ Homo sapiens ┆ 22B   │
│ USA     ┆ Arizona  ┆ 2022-07-20 ┆ Homo sapiens ┆ 22B   │
└─────────┴──────────┴────────────┴──────────────┴───────┘

# Pandas users can create a Pandas dataframe with sequence metadata

In [8]: pandas = lf.collect().to_pandas()

# Metadata from the pipeline that produced the above sequence metadata
In [9]: ct.ncov_metadata
Out[9]:
{'schema_version': 'v1',
'nextclade_version': 'nextclade 3.8.2',
'nextclade_dataset_name': 'SARS-CoV-2',
Expand All @@ -55,17 +86,17 @@ Out[4]:
#### Work with point-in-time Nextstrain Sars-Cov-2 sequence metadata and clade assignments

```python
In [5]: from virus_clade_utils.cladetime import CladeTime
In [10]: from virus_clade_utils.cladetime import CladeTime

In [6]: ct = CladeTime(sequence_as_of="2024-08-31", tree_as_of="2024-08-01")
In [11]: ct = CladeTime(sequence_as_of="2024-08-31", tree_as_of="2024-08-01")

# URL for the corresponding Nextstrain Sars-Cov-2 sequence metadata as it existed on 2024-08-31
In [7]: ct.url_sequence_metadata
Out[7]: 'https://nextstrain-data.s3.amazonaws.com/files/ncov/open/metadata.tsv.zst?versionId=1SZMfjWxXjNy530F6L7MfyflUCbue.JD'
In [12]: ct.url_sequence_metadata
Out[12]: 'https://nextstrain-data.s3.amazonaws.com/files/ncov/open/metadata.tsv.zst?versionId=1SZMfjWxXjNy530F6L7MfyflUCbue.JD'

# Metadata for the pipeline run that produced the above file
In [8]: ct.ncov_metadata
Out[8]: {'schema_version': 'v1',
In [13]: ct.ncov_metadata
Out[13]: {'schema_version': 'v1',
'nextclade_version': 'nextclade 3.8.2',
'nextclade_dataset_name': 'SARS-CoV-2',
'nextclade_dataset_version': '2024-07-17--12-57-03Z',
Expand Down
3 changes: 3 additions & 0 deletions pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -59,6 +59,9 @@ filterwarnings = [
"ignore::DeprecationWarning",
'ignore:polars found a filename',
]
testpaths = [
"tests",
]

[tool.ruff]
line-length = 120
Expand Down
10 changes: 10 additions & 0 deletions src/virus_clade_utils/_typing.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,10 @@
"""Type aliases for this package."""

from pathlib import Path
from typing import TypeAlias, Union

from cloudpathlib import AnyPath, CloudPath

# Data types
# Pathlike groups pathlib.Path with cloudpathlib's AnyPath and CloudPath.
# The Union members are written as string forward references.
# Alternative `|` spelling, currently disabled:
# Pathlike: TypeAlias = Path | AnyPath | CloudPath
Pathlike: TypeAlias = Union["Path", "AnyPath", "CloudPath"]
22 changes: 20 additions & 2 deletions src/virus_clade_utils/cladetime.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,12 +2,13 @@

from datetime import datetime, timezone

import polars as pl
import structlog

from virus_clade_utils.exceptions import CladeTimeInvalidDateError
from virus_clade_utils.exceptions import CladeTimeInvalidDateError, CladeTimeInvalidURLError
from virus_clade_utils.util.config import Config
from virus_clade_utils.util.reference import _get_s3_object_url
from virus_clade_utils.util.sequence import _get_ncov_metadata
from virus_clade_utils.util.sequence import _get_ncov_metadata, get_covid_genome_metadata

logger = structlog.get_logger()

Expand All @@ -25,6 +26,8 @@ class CladeTime:
ncov_metadata : dict
Metadata for the Nextstrain ncov pipeline that generated the sequence and
sequence metadata that correspond to the sequence_as_of date.
sequence_metadata : pl.LazyFrame
A Polars LazyFrame that reads the sequence metadata at url_sequence_metadata.
tree_as_of : datetime
Use the NextStrain reference tree that was available as of this
date and time (UTC).
Expand Down Expand Up @@ -58,6 +61,7 @@ def __init__(self, sequence_as_of=None, tree_as_of=None):
self.sequence_as_of = self._validate_as_of_date(sequence_as_of)
self.tree_as_of = self._validate_as_of_date(tree_as_of)
self._ncov_metadata = {}
self._sequence_metadata = pl.LazyFrame()

self.url_sequence = _get_s3_object_url(
self._config.nextstrain_ncov_bucket, self._config.nextstrain_genome_sequence_key, self.sequence_as_of
Expand Down Expand Up @@ -88,6 +92,20 @@ def ncov_metadata(self) -> dict:
metadata = {}
return metadata

@property
def sequence_metadata(self) -> pl.LazyFrame:
    """
    Return a Polars LazyFrame over the sequence metadata at url_sequence_metadata.

    The LazyFrame is built on every access by passing url_sequence_metadata
    to get_covid_genome_metadata.

    Returns
    -------
    pl.LazyFrame
        The value returned by get_covid_genome_metadata for url_sequence_metadata.

    Raises
    ------
    CladeTimeInvalidURLError
        If url_sequence_metadata is unset or empty.
    """
    # Guard clause: without a URL there is nothing to scan.
    if not self.url_sequence_metadata:
        raise CladeTimeInvalidURLError("CladeTime is missing url_sequence_metadata")

    return get_covid_genome_metadata(metadata_url=self.url_sequence_metadata)

def __repr__(self):
return f"CladeTime(sequence_as_of={self.sequence_as_of}, tree_as_of={self.tree_as_of})"

Expand Down
4 changes: 4 additions & 0 deletions src/virus_clade_utils/exceptions.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,3 +7,7 @@ class Error(Exception):

class CladeTimeInvalidDateError(Error):
"""Raised when an invalid date string is passed to CladeTime."""


class CladeTimeInvalidURLError(Error):
    """Raised when CladeTime encounters an invalid URL (for example, a missing sequence metadata URL)."""
44 changes: 35 additions & 9 deletions src/virus_clade_utils/util/sequence.py
Original file line number Diff line number Diff line change
Expand Up @@ -98,15 +98,41 @@ def download_covid_genome_metadata(
return filename


def get_covid_genome_metadata(metadata_path: Path, num_rows: int | None = None) -> pl.LazyFrame:
"""Read GenBank genome metadata into a Polars LazyFrame."""

if (compression_type := metadata_path.suffix) in [".tsv", ".zst"]:
metadata = pl.scan_csv(metadata_path, separator="\t", n_rows=num_rows)
elif compression_type == ".xz":
metadata = pl.read_csv(
lzma.open(metadata_path), separator="\t", n_rows=num_rows, infer_schema_length=100000
).lazy()
def get_covid_genome_metadata(
    metadata_path: Path | None = None, metadata_url: str | None = None, num_rows: int | None = None
) -> pl.LazyFrame:
    """
    Read GenBank genome metadata into a Polars LazyFrame.

    Parameters
    ----------
    metadata_path : Path | None
        Path to location of a NextStrain GenBank genome metadata file.
        Cannot be used with metadata_url.
    metadata_url: str | None
        URL to a NextStrain GenBank genome metadata file.
        Cannot be used with metadata_path.
    num_rows : int | None, default = None
        The number of genome metadata rows to request.
        When not supplied, request all rows.

    Returns
    -------
    pl.LazyFrame
        A LazyFrame over the tab-separated metadata.

    Raises
    ------
    AssertionError
        If neither or both of metadata_path and metadata_url are supplied.
    ValueError
        If metadata_path has a suffix other than .tsv, .zst, or .xz.
    """

    path_flag = metadata_path is not None
    url_flag = metadata_url is not None

    # Exactly one source must be supplied.
    # NOTE(review): this is an assert, so the check is skipped under `python -O`;
    # it is kept so callers still see the same AssertionError as before.
    assert path_flag + url_flag == 1, "Specify metadata_path or metadata_url, but not both."

    if url_flag:
        return pl.scan_csv(metadata_url, separator="\t", n_rows=num_rows)

    compression_type = metadata_path.suffix
    if compression_type in (".tsv", ".zst"):
        return pl.scan_csv(metadata_path, separator="\t", n_rows=num_rows)

    if compression_type == ".xz":
        # read_csv loads the data eagerly, so the lzma handle can be closed
        # as soon as the DataFrame has been built.
        with lzma.open(metadata_path) as xz_file:
            return pl.read_csv(xz_file, separator="\t", n_rows=num_rows, infer_schema_length=100000).lazy()

    # Previously an unknown suffix fell through to an UnboundLocalError.
    raise ValueError(f"Unsupported metadata file suffix: {compression_type!r}")

Expand Down
32 changes: 32 additions & 0 deletions tests/conftest.py
Original file line number Diff line number Diff line change
@@ -1,8 +1,11 @@
from datetime import datetime, timezone

import boto3
import pytest
import requests
from freezegun import freeze_time
from moto import mock_aws
from virus_clade_utils.util.config import Config


@pytest.fixture
Expand Down Expand Up @@ -34,6 +37,19 @@ def s3_setup(s3_object_keys):
s3_client = boto3.client("s3", region_name="us-east-1")
s3_client.create_bucket(Bucket=bucket_name)
s3_client.put_bucket_versioning(Bucket=bucket_name, VersioningConfiguration={"Status": "Enabled"})
s3_client.put_bucket_cors(
Bucket=bucket_name,
CORSConfiguration={
"CORSRules": [
{
"AllowedMethods": ["GET"],
"AllowedOrigins": ["https://*"],
"AllowedHeaders": ["*"],
"MaxAgeSeconds": 3000,
}
]
},
)

for file, object_key in s3_object_keys.items():
# Upload multiple versions of the object
Expand All @@ -55,3 +71,19 @@ def s3_setup(s3_object_keys):
)

yield s3_client, bucket_name, s3_object_keys


@pytest.fixture
def test_config(s3_setup):
    """
    Build a Config for tests that rely on the s3_setup fixture.

    The bucket name is hard-coded to "versioned-bucket"; the object keys
    come from the s3_object_keys mapping yielded by s3_setup.
    """
    _, _, object_keys = s3_setup

    config = Config(datetime.now(), datetime.now())
    config.nextstrain_min_seq_date = datetime(2023, 1, 1, tzinfo=timezone.utc)
    config.nextstrain_ncov_bucket = "versioned-bucket"
    config.nextstrain_genome_metadata_key = object_keys["sequence_metadata"]
    config.nextstrain_genome_sequence_key = object_keys["sequence"]
    config.nextstrain_ncov_metadata_key = object_keys["ncov_metadata"]

    return config
31 changes: 20 additions & 11 deletions tests/unit/test_cladetime.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,8 +6,7 @@
import pytest
from freezegun import freeze_time
from virus_clade_utils.cladetime import CladeTime
from virus_clade_utils.exceptions import CladeTimeInvalidDateError
from virus_clade_utils.util.config import Config
from virus_clade_utils.exceptions import CladeTimeInvalidDateError, CladeTimeInvalidURLError


def test_cladetime_no_args():
Expand Down Expand Up @@ -82,17 +81,9 @@ def test_cladetime_invalid_date(bad_date):
),
],
)
def test_cladetime_urls(s3_setup, sequence_as_of, expected_content):
def test_cladetime_urls(s3_setup, test_config, sequence_as_of, expected_content):
s3_client, bucket_name, s3_object_keys = s3_setup

# FIXME: perhaps the test_config that works with the mock aws setup
# should be a fixture.
test_config = Config(datetime.now(), datetime.now())
test_config.nextstrain_min_seq_date = datetime(2023, 1, 1).replace(tzinfo=timezone.utc)
test_config.nextstrain_ncov_bucket = "versioned-bucket"
test_config.nextstrain_genome_metadata_key = s3_object_keys["sequence_metadata"]
test_config.nextstrain_genome_sequence_key = s3_object_keys["sequence"]
test_config.nextstrain_ncov_metadata_key = s3_object_keys["ncov_metadata"]
mock = MagicMock(return_value=test_config, name="CladeTime._get_config_mock")

with patch("virus_clade_utils.cladetime.CladeTime._get_config", mock):
Expand All @@ -118,3 +109,21 @@ def test_cladetime_ncov_metadata():

ct.url_ncov_metadata = "https://httpstat.us/504"
assert ct.ncov_metadata == {}


@pytest.mark.skip("Need moto fixup to test S3 URLs")
def test_cladetime_sequence_metadata(test_config):
    """Check that CladeTime.sequence_metadata is a Polars LazyFrame (currently skipped)."""
    # Imported locally so the test does not rely on a module-level polars import.
    import polars as pl

    mock = MagicMock(return_value=test_config, name="CladeTime._get_config_mock")
    with patch("virus_clade_utils.cladetime.CladeTime._get_config", mock):
        ct = CladeTime()
        # isinstance needs the expected type; the one-argument call raised TypeError.
        assert isinstance(ct.sequence_metadata, pl.LazyFrame)


def test_cladetime_sequence_metadata_no_url(test_config):
    """Check that reading sequence_metadata without a metadata URL raises CladeTimeInvalidURLError."""
    config_mock = MagicMock(return_value=test_config, name="CladeTime._get_config_mock")
    with patch("virus_clade_utils.cladetime.CladeTime._get_config", config_mock):
        clade_time = CladeTime()
    clade_time.url_sequence_metadata = None

    # Accessing the property is what triggers the error.
    with pytest.raises(CladeTimeInvalidURLError):
        clade_time.sequence_metadata
13 changes: 13 additions & 0 deletions tests/unit/util/test_sequence.py
Original file line number Diff line number Diff line change
Expand Up @@ -54,6 +54,19 @@ def test_get_covid_genome_metadata(test_file_path, metadata_file):
assert expected_cols.issubset(metadata_cols)


@pytest.mark.parametrize("metadata_file", ["metadata.tsv.zst", "metadata.tsv.xz"])
def test_get_covid_genome_metadata_url(s3_setup, test_file_path, metadata_file):
    """
    Check that get_covid_genome_metadata returns a LazyFrame for an S3 URL rather than a local file.

    Needs additional research into moto and S3 url access.
    """
    _, bucket_name, _ = s3_setup

    lazy_metadata = get_covid_genome_metadata(
        metadata_url=f"https://{bucket_name}.s3.amazonaws.com/data/object-key/{metadata_file}"
    )
    assert isinstance(lazy_metadata, pl.LazyFrame)


@pytest.mark.parametrize(
"as_of, filename",
[
Expand Down

0 comments on commit a15a7f3

Please sign in to comment.