Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add a sequence_metadata attribute to CladeTime #27

Merged
merged 2 commits into from
Oct 4, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
57 changes: 44 additions & 13 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -37,13 +37,44 @@ In [1]: from virus_clade_utils.cladetime import CladeTime

In [2]: ct = CladeTime()

# URL for the corresponding Nextstrain Sars-Cov-2 sequence metadata
In [3]: ct.url_sequence_metadata
Out[3]: 'https://nextstrain-data.s3.amazonaws.com/files/ncov/open/metadata.tsv.zst?versionId=VJomXHLN2L9aqvS9Ax_LJ4ecr5ZsFFhE'

# Metadata from the pipeline that produced the above file
In [4]: ct.ncov_metadata
Out[4]:
# Return a Polars LazyFrame with the sequence metadata.
In [4]: import polars as pl

In [5]: lf = ct.sequence_metadata

# From there, you can use Polars to manipulate the data as needed
In [6]: filtered_sequence_metadata = (
lf
.select(["country", "division", "date", "host", "clade_nextstrain"])
.rename({"clade_nextstrain": "clade", "division": "location"})
.filter(
pl.col("country") == "USA",
pl.col("host") == "Homo sapiens"
)
).collect()

In [7]: filtered_sequence_metadata.head()
Out[7]:
shape: (5, 5)
┌─────────┬──────────┬────────────┬──────────────┬───────┐
│ country ┆ location ┆ date ┆ host ┆ clade │
│ --- ┆ --- ┆ --- ┆ --- ┆ --- │
│ str ┆ str ┆ str ┆ str ┆ str │
╞═════════╪══════════╪════════════╪══════════════╪═══════╡
│ USA ┆ Alabama ┆ 2022-07-07 ┆ Homo sapiens ┆ 22A │
│ USA ┆ Arizona ┆ 2022-07-02 ┆ Homo sapiens ┆ 22B │
│ USA ┆ Arizona ┆ 2022-07-19 ┆ Homo sapiens ┆ 22B │
│ USA ┆ Arizona ┆ 2022-07-15 ┆ Homo sapiens ┆ 22B │
│ USA ┆ Arizona ┆ 2022-07-20 ┆ Homo sapiens ┆ 22B │
└─────────┴──────────┴────────────┴──────────────┴───────┘

# Pandas users can create a Pandas dataframe with sequence metadata

In [8]: pandas = lf.collect().to_pandas()

# Metadata from the pipeline that produced the above sequence metadata
In [9]: ct.ncov_metadata
Out[9]:
{'schema_version': 'v1',
'nextclade_version': 'nextclade 3.8.2',
'nextclade_dataset_name': 'SARS-CoV-2',
Expand All @@ -55,17 +86,17 @@ Out[4]:
#### Work with point-in-time Nextstrain Sars-Cov-2 sequence metadata and clade assignments

```python
In [5]: from virus_clade_utils.cladetime import CladeTime
In [10]: from virus_clade_utils.cladetime import CladeTime

In [6]: ct = CladeTime(sequence_as_of="2024-08-31", tree_as_of="2024-08-01")
In [11]: ct = CladeTime(sequence_as_of="2024-08-31", tree_as_of="2024-08-01")

# URL for the corresponding Nextstrain Sars-Cov-2 sequence metadata as it existed on 2024-08-31
In [7]: ct.url_sequence_metadata
Out[7]: 'https://nextstrain-data.s3.amazonaws.com/files/ncov/open/metadata.tsv.zst?versionId=1SZMfjWxXjNy530F6L7MfyflUCbue.JD'
In [12]: ct.url_sequence_metadata
Out[12]: 'https://nextstrain-data.s3.amazonaws.com/files/ncov/open/metadata.tsv.zst?versionId=1SZMfjWxXjNy530F6L7MfyflUCbue.JD'

# Metadata for the pipeline run that produced the above file
In [8]: ct.ncov_metadata
Out[8]: {'schema_version': 'v1',
In [13]: ct.ncov_metadata
Out[13]: {'schema_version': 'v1',
'nextclade_version': 'nextclade 3.8.2',
'nextclade_dataset_name': 'SARS-CoV-2',
'nextclade_dataset_version': '2024-07-17--12-57-03Z',
Expand Down
3 changes: 3 additions & 0 deletions pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -59,6 +59,9 @@ filterwarnings = [
"ignore::DeprecationWarning",
'ignore:polars found a filename',
]
testpaths = [
Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Speed up pytest by telling it where the tests are

"tests",
]

[tool.ruff]
line-length = 120
Expand Down
10 changes: 10 additions & 0 deletions src/virus_clade_utils/_typing.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,10 @@
"""Type aliases for this package."""

from pathlib import Path
from typing import TypeAlias, Union

from cloudpathlib import AnyPath, CloudPath

# Data types
# NOTE(review): the PEP 604 spelling below is left disabled in favor of Union with
# string forward references -- presumably for compatibility; confirm before switching.
# Pathlike: TypeAlias = Path | AnyPath | CloudPath
# Alias covering a local pathlib.Path or a cloudpathlib AnyPath/CloudPath.
Pathlike: TypeAlias = Union["Path", "AnyPath", "CloudPath"]
22 changes: 20 additions & 2 deletions src/virus_clade_utils/cladetime.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,12 +2,13 @@

from datetime import datetime, timezone

import polars as pl
import structlog

from virus_clade_utils.exceptions import CladeTimeInvalidDateError
from virus_clade_utils.exceptions import CladeTimeInvalidDateError, CladeTimeInvalidURLError
from virus_clade_utils.util.config import Config
from virus_clade_utils.util.reference import _get_s3_object_url
from virus_clade_utils.util.sequence import _get_ncov_metadata
from virus_clade_utils.util.sequence import _get_ncov_metadata, get_covid_genome_metadata

logger = structlog.get_logger()

Expand All @@ -25,6 +26,8 @@ class CladeTime:
ncov_metadata : dict
Metadata for the Nextstrain ncov pipeline that generated the sequence and
sequence metadata that correspond to the sequence_as_of date.
sequence_metadata : pl.LazyFrame
A Polars LazyFrame that references url_sequence_metadata.
tree_as_of : datetime
Use the NextStrain reference tree that was available as of this
date and time (UTC).
Expand Down Expand Up @@ -58,6 +61,7 @@ def __init__(self, sequence_as_of=None, tree_as_of=None):
self.sequence_as_of = self._validate_as_of_date(sequence_as_of)
self.tree_as_of = self._validate_as_of_date(tree_as_of)
self._ncov_metadata = {}
self._sequence_metadata = pl.LazyFrame()

self.url_sequence = _get_s3_object_url(
self._config.nextstrain_ncov_bucket, self._config.nextstrain_genome_sequence_key, self.sequence_as_of
Expand Down Expand Up @@ -88,6 +92,20 @@ def ncov_metadata(self) -> dict:
metadata = {}
return metadata

@property
def sequence_metadata(self) -> pl.LazyFrame:
    """
    Return the sequence metadata as a Polars LazyFrame.

    The LazyFrame is created from ``url_sequence_metadata`` each time the
    attribute is accessed; nothing is cached on the instance.

    Returns
    -------
    pl.LazyFrame
        The result of ``get_covid_genome_metadata`` for ``url_sequence_metadata``.

    Raises
    ------
    CladeTimeInvalidURLError
        If ``url_sequence_metadata`` is empty or None.
    """
    # Guard clause: without a URL there is nothing to scan.
    if not self.url_sequence_metadata:
        raise CladeTimeInvalidURLError("CladeTime is missing url_sequence_metadata")

    return get_covid_genome_metadata(metadata_url=self.url_sequence_metadata)

def __repr__(self):
return f"CladeTime(sequence_as_of={self.sequence_as_of}, tree_as_of={self.tree_as_of})"

Expand Down
4 changes: 4 additions & 0 deletions src/virus_clade_utils/exceptions.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,3 +7,7 @@ class Error(Exception):

class CladeTimeInvalidDateError(Error):
"""Raised when an invalid date string is passed to CladeTime."""


class CladeTimeInvalidURLError(Error):
    """Raised when CladeTime encounters an invalid URL.

    For example, CladeTime raises this when its url_sequence_metadata is missing.
    """
44 changes: 35 additions & 9 deletions src/virus_clade_utils/util/sequence.py
Original file line number Diff line number Diff line change
Expand Up @@ -98,15 +98,41 @@ def download_covid_genome_metadata(
return filename


def get_covid_genome_metadata(metadata_path: Path, num_rows: int | None = None) -> pl.LazyFrame:
"""Read GenBank genome metadata into a Polars LazyFrame."""

if (compression_type := metadata_path.suffix) in [".tsv", ".zst"]:
metadata = pl.scan_csv(metadata_path, separator="\t", n_rows=num_rows)
elif compression_type == ".xz":
metadata = pl.read_csv(
lzma.open(metadata_path), separator="\t", n_rows=num_rows, infer_schema_length=100000
).lazy()
def get_covid_genome_metadata(
    metadata_path: Path | None = None, metadata_url: str | None = None, num_rows: int | None = None
) -> pl.LazyFrame:
    """
    Read GenBank genome metadata into a Polars LazyFrame.

    Parameters
    ----------
    metadata_path : Path | None
        Path to location of a NextStrain GenBank genome metadata file.
        Cannot be used with metadata_url.
    metadata_url: str | None
        URL to a NextStrain GenBank genome metadata file.
        Cannot be used with metadata_path.
    num_rows : int | None, default = None
        The number of genome metadata rows to request.
        When not supplied, request all rows.

    Returns
    -------
    pl.LazyFrame
        The tab-separated metadata as a LazyFrame. ``.xz`` files are read
        eagerly through ``lzma`` and then converted to a LazyFrame.

    Raises
    ------
    AssertionError
        If neither or both of metadata_path and metadata_url are supplied.
    ValueError
        If metadata_path has an unsupported file extension, or if the
        supplied source is empty.
    """

    path_flag = metadata_path is not None
    url_flag = metadata_url is not None

    # Raised explicitly (not via an ``assert`` statement) so the check still
    # runs under ``python -O``; AssertionError is kept for backward compatibility.
    if path_flag == url_flag:
        raise AssertionError("Specify metadata_path or metadata_url, but not both.")

    if metadata_url:
        return pl.scan_csv(metadata_url, separator="\t", n_rows=num_rows)

    if metadata_path:
        compression_type = metadata_path.suffix
        if compression_type in (".tsv", ".zst"):
            return pl.scan_csv(metadata_path, separator="\t", n_rows=num_rows)
        if compression_type == ".xz":
            # read_csv is eager, so the decompression stream can be closed
            # as soon as the data has been loaded.
            with lzma.open(metadata_path) as xz_file:
                return pl.read_csv(xz_file, separator="\t", n_rows=num_rows, infer_schema_length=100000).lazy()
        # Previously this case fell through to an unbound local variable.
        raise ValueError(
            f"Unsupported metadata file extension {compression_type!r}: expected .tsv, .zst, or .xz"
        )

    # Reached only when the single supplied source is empty (e.g. metadata_url="").
    raise ValueError("metadata_path or metadata_url must not be empty.")

Expand Down
32 changes: 32 additions & 0 deletions tests/conftest.py
Original file line number Diff line number Diff line change
@@ -1,8 +1,11 @@
from datetime import datetime, timezone

import boto3
import pytest
import requests
from freezegun import freeze_time
from moto import mock_aws
from virus_clade_utils.util.config import Config


@pytest.fixture
Expand Down Expand Up @@ -34,6 +37,19 @@ def s3_setup(s3_object_keys):
s3_client = boto3.client("s3", region_name="us-east-1")
s3_client.create_bucket(Bucket=bucket_name)
s3_client.put_bucket_versioning(Bucket=bucket_name, VersioningConfiguration={"Status": "Enabled"})
s3_client.put_bucket_cors(
Bucket=bucket_name,
CORSConfiguration={
"CORSRules": [
{
"AllowedMethods": ["GET"],
"AllowedOrigins": ["https://*"],
"AllowedHeaders": ["*"],
"MaxAgeSeconds": 3000,
}
]
},
)

for file, object_key in s3_object_keys.items():
# Upload multiple versions of the object
Expand All @@ -55,3 +71,19 @@ def s3_setup(s3_object_keys):
)

yield s3_client, bucket_name, s3_object_keys


@pytest.fixture
def test_config(s3_setup):
    """
    Build a Config whose Nextstrain settings point at the objects created by s3_setup.
    """
    _, _, object_keys = s3_setup

    config = Config(datetime.now(), datetime.now())
    overrides = {
        "nextstrain_min_seq_date": datetime(2023, 1, 1, tzinfo=timezone.utc),
        "nextstrain_ncov_bucket": "versioned-bucket",
        "nextstrain_genome_metadata_key": object_keys["sequence_metadata"],
        "nextstrain_genome_sequence_key": object_keys["sequence"],
        "nextstrain_ncov_metadata_key": object_keys["ncov_metadata"],
    }
    # Apply the overrides in the same order as the original assignments.
    for attribute, value in overrides.items():
        setattr(config, attribute, value)

    return config
31 changes: 20 additions & 11 deletions tests/unit/test_cladetime.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,8 +6,7 @@
import pytest
from freezegun import freeze_time
from virus_clade_utils.cladetime import CladeTime
from virus_clade_utils.exceptions import CladeTimeInvalidDateError
from virus_clade_utils.util.config import Config
from virus_clade_utils.exceptions import CladeTimeInvalidDateError, CladeTimeInvalidURLError


def test_cladetime_no_args():
Expand Down Expand Up @@ -82,17 +81,9 @@ def test_cladetime_invalid_date(bad_date):
),
],
)
def test_cladetime_urls(s3_setup, sequence_as_of, expected_content):
def test_cladetime_urls(s3_setup, test_config, sequence_as_of, expected_content):
s3_client, bucket_name, s3_object_keys = s3_setup

# FIXME: perhaps the test_config that works with the mock aws setup
# should be a fixture.
test_config = Config(datetime.now(), datetime.now())
test_config.nextstrain_min_seq_date = datetime(2023, 1, 1).replace(tzinfo=timezone.utc)
test_config.nextstrain_ncov_bucket = "versioned-bucket"
test_config.nextstrain_genome_metadata_key = s3_object_keys["sequence_metadata"]
test_config.nextstrain_genome_sequence_key = s3_object_keys["sequence"]
test_config.nextstrain_ncov_metadata_key = s3_object_keys["ncov_metadata"]
mock = MagicMock(return_value=test_config, name="CladeTime._get_config_mock")

with patch("virus_clade_utils.cladetime.CladeTime._get_config", mock):
Expand All @@ -118,3 +109,21 @@ def test_cladetime_ncov_metadata():

ct.url_ncov_metadata = "https://httpstat.us/504"
assert ct.ncov_metadata == {}


@pytest.mark.skip("Need moto fixup to test S3 URLs")
def test_cladetime_sequence_metadata(test_config):
    """CladeTime.sequence_metadata should be a Polars LazyFrame."""
    # Local import keeps this test self-contained; polars is what CladeTime returns.
    import polars as pl

    mock = MagicMock(return_value=test_config, name="CladeTime._get_config_mock")
    with patch("virus_clade_utils.cladetime.CladeTime._get_config", mock):
        ct = CladeTime()
        # isinstance needs the expected type; the one-argument call raised TypeError.
        assert isinstance(ct.sequence_metadata, pl.LazyFrame)


def test_cladetime_sequence_metadata_no_url(test_config):
    """Accessing sequence_metadata without a metadata URL raises CladeTimeInvalidURLError."""
    config_patch = patch(
        "virus_clade_utils.cladetime.CladeTime._get_config",
        MagicMock(return_value=test_config, name="CladeTime._get_config_mock"),
    )
    with config_patch:
        clade_time = CladeTime()
        clade_time.url_sequence_metadata = None

    with pytest.raises(CladeTimeInvalidURLError):
        clade_time.sequence_metadata
13 changes: 13 additions & 0 deletions tests/unit/util/test_sequence.py
Original file line number Diff line number Diff line change
Expand Up @@ -54,6 +54,19 @@ def test_get_covid_genome_metadata(test_file_path, metadata_file):
assert expected_cols.issubset(metadata_cols)


@pytest.mark.parametrize("metadata_file", ["metadata.tsv.zst", "metadata.tsv.xz"])
def test_get_covid_genome_metadata_url(s3_setup, test_file_path, metadata_file):
    """
    Check that get_covid_genome_metadata returns a LazyFrame for an S3 URL.

    Only the return type is asserted. Note from the original author: this needs
    additional research into moto and S3 url access.
    """
    _, bucket_name, _ = s3_setup

    lazy_frame = get_covid_genome_metadata(
        metadata_url=f"https://{bucket_name}.s3.amazonaws.com/data/object-key/{metadata_file}"
    )
    assert isinstance(lazy_frame, pl.LazyFrame)


@pytest.mark.parametrize(
"as_of, filename",
[
Expand Down