From 9874b80b3bebbfc02b3f774415e72dde685b9004 Mon Sep 17 00:00:00 2001 From: Becky Sweger Date: Thu, 3 Oct 2024 16:03:47 -0400 Subject: [PATCH 1/2] Add a sequence_metadata attribute to CladeTime The first iteration of CladeTime contained a url_sequence_metadata attribute that points to the S3 link for NextStrain's sequence metadata file. This PR adds a sequence_metadata attribute that supplies users with a Polars LazyFrame to the S3 file. Note: doing the sorting and filtering on the S3 LazyFrame (i.e., without downloading the file first) saves time for those interested in only a subset of the metadata (e.g., US only, homo sapiens). --- README.md | 57 ++++++++++++++++++++------ src/virus_clade_utils/cladetime.py | 20 ++++++++- src/virus_clade_utils/util/sequence.py | 44 ++++++++++++++++---- tests/conftest.py | 32 +++++++++++++++ tests/unit/test_cladetime.py | 28 ++++++++----- tests/unit/util/test_sequence.py | 13 ++++++ 6 files changed, 161 insertions(+), 33 deletions(-) diff --git a/README.md b/README.md index 595d0c2..0844149 100644 --- a/README.md +++ b/README.md @@ -37,13 +37,44 @@ In [1]: from virus_clade_utils.cladetime import CladeTime In [2]: ct = CladeTime() -# URL for the corresponding Nextstrain Sars-Cov-2 sequence metadata -In [3]: ct.url_sequence_metadata -Out[3]: 'https://nextstrain-data.s3.amazonaws.com/files/ncov/open/metadata.tsv.zst?versionId=VJomXHLN2L9aqvS9Ax_LJ4ecr5ZsFFhE' - -# Metadata from the pipeline that produced the above file -In [4]: ct.ncov_metadata -Out[4]: +# Return a Polars LazyFrame with the sequence metadata. 
+In [4]: import polars as pl + +In [5]: lf = ct.sequence_metadata + +# From there, you can use Polars to manipulate the data as needed +In [6]: filtered_sequence_metadata = ( + lf + .select(["country", "division", "date", "host", "clade_nextstrain"]) + .rename({"clade_nextstrain": "clade", "division": "location"}) + .filter( + pl.col("country") == "USA", + pl.col("host") == "Homo sapiens" + ) +).collect() + +In [7]: filtered_sequence_metadata.head() +Out[7]: +shape: (5, 5) +┌─────────┬──────────┬────────────┬──────────────┬───────┐ +│ country ┆ location ┆ date ┆ host ┆ clade │ +│ --- ┆ --- ┆ --- ┆ --- ┆ --- │ +│ str ┆ str ┆ str ┆ str ┆ str │ +╞═════════╪══════════╪════════════╪══════════════╪═══════╡ +│ USA ┆ Alabama ┆ 2022-07-07 ┆ Homo sapiens ┆ 22A │ +│ USA ┆ Arizona ┆ 2022-07-02 ┆ Homo sapiens ┆ 22B │ +│ USA ┆ Arizona ┆ 2022-07-19 ┆ Homo sapiens ┆ 22B │ +│ USA ┆ Arizona ┆ 2022-07-15 ┆ Homo sapiens ┆ 22B │ +│ USA ┆ Arizona ┆ 2022-07-20 ┆ Homo sapiens ┆ 22B │ +└─────────┴──────────┴────────────┴──────────────┴───────┘ + +# Pandas users can create a Pandas dataframe with sequence metadata + +In [8]: pandas = lf.collect().to_pandas() + +# Metadata from the pipeline that produced the above sequence_metadata +In [9]: ct.ncov_metadata +Out[9]: {'schema_version': 'v1', 'nextclade_version': 'nextclade 3.8.2', 'nextclade_dataset_name': 'SARS-CoV-2', @@ -55,17 +86,17 @@ Out[4]: #### Work with point-in-time Nextstrain Sars-Cov-2 sequence metadata and clade assignments ```python -In [5]: from virus_clade_utils.cladetime import CladeTime +In [10]: from virus_clade_utils.cladetime import CladeTime -In [6]: ct = CladeTime(sequence_as_of="2024-08-31", tree_as_of="2024-08-01") +In [11]: ct = CladeTime(sequence_as_of="2024-08-31", tree_as_of="2024-08-01") # URL for the corresponding Nextstrain Sars-Cov-2 sequence metadata as it existing on 2024-08-31 -In [7]: ct.url_sequence_metadata -Out[7]: 
'https://nextstrain-data.s3.amazonaws.com/files/ncov/open/metadata.tsv.zst?versionId=1SZMfjWxXjNy530F6L7MfyflUCbue.JD' +In [12]: ct.url_sequence_metadata +Out[12]: 'https://nextstrain-data.s3.amazonaws.com/files/ncov/open/metadata.tsv.zst?versionId=1SZMfjWxXjNy530F6L7MfyflUCbue.JD' # Metadata for the pipeline run that produced the above file -In [8]: ct.ncov_metadata -Out[8]: {'schema_version': 'v1', +In [13]: ct.ncov_metadata +Out[13]: {'schema_version': 'v1', 'nextclade_version': 'nextclade 3.8.2', 'nextclade_dataset_name': 'SARS-CoV-2', 'nextclade_dataset_version': '2024-07-17--12-57-03Z', diff --git a/src/virus_clade_utils/cladetime.py b/src/virus_clade_utils/cladetime.py index 24dd33d..e547eff 100644 --- a/src/virus_clade_utils/cladetime.py +++ b/src/virus_clade_utils/cladetime.py @@ -2,12 +2,13 @@ from datetime import datetime, timezone +import polars as pl import structlog from virus_clade_utils.exceptions import CladeTimeInvalidDateError from virus_clade_utils.util.config import Config from virus_clade_utils.util.reference import _get_s3_object_url -from virus_clade_utils.util.sequence import _get_ncov_metadata +from virus_clade_utils.util.sequence import _get_ncov_metadata, get_covid_genome_metadata logger = structlog.get_logger() @@ -25,6 +26,8 @@ class CladeTime: ncov_metadata : dict Metadata for the Nextstrain ncov pipeline that generated the sequence and sequence metadata that correspond to the sequence_as_of date. + sequence_metadata : pl.LazyFrame + A Polars LazyFrame reference to url_sequence_metadata. tree_as_of : datetime Use the NextStrain reference tree that was available as of this date and time (UTC). 
@@ -58,6 +61,7 @@ def __init__(self, sequence_as_of=None, tree_as_of=None): self.sequence_as_of = self._validate_as_of_date(sequence_as_of) self.tree_as_of = self._validate_as_of_date(tree_as_of) self._ncov_metadata = {} + self._sequence_metadata = pl.LazyFrame() self.url_sequence = _get_s3_object_url( self._config.nextstrain_ncov_bucket, self._config.nextstrain_genome_sequence_key, self.sequence_as_of @@ -88,6 +92,20 @@ def ncov_metadata(self) -> dict: metadata = {} return metadata + @property + def sequence_metadata(self): + return self._sequence_metadata + + @sequence_metadata.getter + def sequence_metadata(self) -> pl.LazyFrame: + """Set the sequence_metadata attribute.""" + if self.url_sequence_metadata: + sequence_metadata = get_covid_genome_metadata(metadata_url=self.url_sequence_metadata) + return sequence_metadata + else: + sequence_metadata = pl.LazyFrame() + return sequence_metadata + def __repr__(self): return f"CladeTime(sequence_as_of={self.sequence_as_of}, tree_as_of={self.tree_as_of})" diff --git a/src/virus_clade_utils/util/sequence.py b/src/virus_clade_utils/util/sequence.py index dd9e342..83b4ee0 100644 --- a/src/virus_clade_utils/util/sequence.py +++ b/src/virus_clade_utils/util/sequence.py @@ -98,15 +98,41 @@ def download_covid_genome_metadata( return filename -def get_covid_genome_metadata(metadata_path: Path, num_rows: int | None = None) -> pl.LazyFrame: - """Read GenBank genome metadata into a Polars LazyFrame.""" - - if (compression_type := metadata_path.suffix) in [".tsv", ".zst"]: - metadata = pl.scan_csv(metadata_path, separator="\t", n_rows=num_rows) - elif compression_type == ".xz": - metadata = pl.read_csv( - lzma.open(metadata_path), separator="\t", n_rows=num_rows, infer_schema_length=100000 - ).lazy() +def get_covid_genome_metadata( + metadata_path: Path | None = None, metadata_url: str | None = None, num_rows: int | None = None +) -> pl.LazyFrame: + """ + Read GenBank genome metadata into a Polars LazyFrame. 
+ + Parameters + ---------- + metadata_path : Path | None + Path to location of a NextStrain GenBank genome metadata file. + Cannot be used with metadata_url. + metadata_url: str | None + URL to a NextStrain GenBank genome metadata file. + Cannot be used with metadata_path. + num_rows : int | None, default = None + The number of genome metadata rows to request. + When not supplied, request all rows. + """ + + path_flag = metadata_path is not None + url_flag = metadata_url is not None + + assert path_flag + url_flag == 1, "Specify metadata_path or metadata_url, but not both." + + if metadata_url: + metadata = pl.scan_csv(metadata_url, separator="\t", n_rows=num_rows) + return metadata + + if metadata_path: + if (compression_type := metadata_path.suffix) in [".tsv", ".zst"]: + metadata = pl.scan_csv(metadata_path, separator="\t", n_rows=num_rows) + elif compression_type == ".xz": + metadata = pl.read_csv( + lzma.open(metadata_path), separator="\t", n_rows=num_rows, infer_schema_length=100000 + ).lazy() return metadata diff --git a/tests/conftest.py b/tests/conftest.py index 66987dd..9075659 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -1,8 +1,11 @@ +from datetime import datetime, timezone + import boto3 import pytest import requests from freezegun import freeze_time from moto import mock_aws +from virus_clade_utils.util.config import Config @pytest.fixture @@ -34,6 +37,19 @@ def s3_setup(s3_object_keys): s3_client = boto3.client("s3", region_name="us-east-1") s3_client.create_bucket(Bucket=bucket_name) s3_client.put_bucket_versioning(Bucket=bucket_name, VersioningConfiguration={"Status": "Enabled"}) + s3_client.put_bucket_cors( + Bucket=bucket_name, + CORSConfiguration={ + "CORSRules": [ + { + "AllowedMethods": ["GET"], + "AllowedOrigins": ["https://*"], + "AllowedHeaders": ["*"], + "MaxAgeSeconds": 3000, + } + ] + }, + ) for file, object_key in s3_object_keys.items(): # Upload multiple versions of the object @@ -55,3 +71,19 @@ def 
s3_setup(s3_object_keys): ) yield s3_client, bucket_name, s3_object_keys + + +@pytest.fixture +def test_config(s3_setup): + """ + Return a Config object for use with the s3_setup fixture. + """ + s3_client, bucket_name, s3_object_keys = s3_setup + test_config = Config(datetime.now(), datetime.now()) + test_config.nextstrain_min_seq_date = datetime(2023, 1, 1).replace(tzinfo=timezone.utc) + test_config.nextstrain_ncov_bucket = "versioned-bucket" + test_config.nextstrain_genome_metadata_key = s3_object_keys["sequence_metadata"] + test_config.nextstrain_genome_sequence_key = s3_object_keys["sequence"] + test_config.nextstrain_ncov_metadata_key = s3_object_keys["ncov_metadata"] + + return test_config diff --git a/tests/unit/test_cladetime.py b/tests/unit/test_cladetime.py index c601b0e..2d75860 100644 --- a/tests/unit/test_cladetime.py +++ b/tests/unit/test_cladetime.py @@ -7,7 +7,6 @@ from freezegun import freeze_time from virus_clade_utils.cladetime import CladeTime from virus_clade_utils.exceptions import CladeTimeInvalidDateError -from virus_clade_utils.util.config import Config def test_cladetime_no_args(): @@ -82,17 +81,9 @@ def test_cladetime_invalid_date(bad_date): ), ], ) -def test_cladetime_urls(s3_setup, sequence_as_of, expected_content): +def test_cladetime_urls(s3_setup, test_config, sequence_as_of, expected_content): s3_client, bucket_name, s3_object_keys = s3_setup - # FIXME: perhaps the test_config that works with the mock aws setup - # should be a fixture. 
- test_config = Config(datetime.now(), datetime.now()) - test_config.nextstrain_min_seq_date = datetime(2023, 1, 1).replace(tzinfo=timezone.utc) - test_config.nextstrain_ncov_bucket = "versioned-bucket" - test_config.nextstrain_genome_metadata_key = s3_object_keys["sequence_metadata"] - test_config.nextstrain_genome_sequence_key = s3_object_keys["sequence"] - test_config.nextstrain_ncov_metadata_key = s3_object_keys["ncov_metadata"] mock = MagicMock(return_value=test_config, name="CladeTime._get_config_mock") with patch("virus_clade_utils.cladetime.CladeTime._get_config", mock): @@ -118,3 +109,20 @@ def test_cladetime_ncov_metadata(): ct.url_ncov_metadata = "https://httpstat.us/504" assert ct.ncov_metadata == {} + + +@pytest.mark.skip("Need moto fixup to test S3 URLs") +def test_cladetime_sequence_metadata(test_config): + mock = MagicMock(return_value=test_config, name="CladeTime._get_config_mock") + with patch("virus_clade_utils.cladetime.CladeTime._get_config", mock): + ct = CladeTime() + assert isinstance(ct.sequence_metadata) + + +def test_cladetime_sequence_metadata_no_url(test_config): + mock = MagicMock(return_value=test_config, name="CladeTime._get_config_mock") + with patch("virus_clade_utils.cladetime.CladeTime._get_config", mock): + ct = CladeTime() + ct.url_sequence_metadata = None + # if there's no metadata url, sequence metadata should be an empty LazyFrame + assert ct.sequence_metadata.collect().shape == (0, 0) diff --git a/tests/unit/util/test_sequence.py b/tests/unit/util/test_sequence.py index c958716..6bd9382 100644 --- a/tests/unit/util/test_sequence.py +++ b/tests/unit/util/test_sequence.py @@ -54,6 +54,19 @@ def test_get_covid_genome_metadata(test_file_path, metadata_file): assert expected_cols.issubset(metadata_cols) +@pytest.mark.parametrize("metadata_file", ["metadata.tsv.zst", "metadata.tsv.xz"]) +def test_get_covid_genome_metadata_url(s3_setup, test_file_path, metadata_file): + """ + Test get_covid_genome_metadata when used with an S3 URL 
instead of a local file. + Needs additional research into moto and S3 url access. + """ + s3_client, bucket_name, s3_object_keys = s3_setup + + url = f"https://{bucket_name}.s3.amazonaws.com/data/object-key/{metadata_file}" + metadata = get_covid_genome_metadata(metadata_url=url) + assert isinstance(metadata, pl.LazyFrame) + + @pytest.mark.parametrize( "as_of, filename", [ From 41ea13b9795e51776084a7909457325b847f9ad1 Mon Sep 17 00:00:00 2001 From: Becky Sweger Date: Fri, 4 Oct 2024 14:28:26 -0400 Subject: [PATCH 2/2] Throw an error in sequence_metadata getter if there's no URL If CladeTime doesn't have a value for url_sequence_metadata, there's no point in proceeding. --- pyproject.toml | 3 +++ src/virus_clade_utils/_typing.py | 10 ++++++++++ src/virus_clade_utils/cladetime.py | 4 ++-- src/virus_clade_utils/exceptions.py | 4 ++++ tests/unit/test_cladetime.py | 7 ++++--- 5 files changed, 23 insertions(+), 5 deletions(-) create mode 100644 src/virus_clade_utils/_typing.py diff --git a/pyproject.toml b/pyproject.toml index 62605eb..ba79255 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -59,6 +59,9 @@ filterwarnings = [ "ignore::DeprecationWarning", 'ignore:polars found a filename', ] +testpaths = [ + "tests", +] [tool.ruff] line-length = 120 diff --git a/src/virus_clade_utils/_typing.py b/src/virus_clade_utils/_typing.py new file mode 100644 index 0000000..05ebc43 --- /dev/null +++ b/src/virus_clade_utils/_typing.py @@ -0,0 +1,10 @@ +"""Type aliases for this package.""" + +from pathlib import Path +from typing import TypeAlias, Union + +from cloudpathlib import AnyPath, CloudPath + +# Data types +# Pathlike: TypeAlias = Path | AnyPath | CloudPath +Pathlike: TypeAlias = Union["Path", "AnyPath", "CloudPath"] diff --git a/src/virus_clade_utils/cladetime.py b/src/virus_clade_utils/cladetime.py index e547eff..28ca0f5 100644 --- a/src/virus_clade_utils/cladetime.py +++ b/src/virus_clade_utils/cladetime.py @@ -5,7 +5,7 @@ import polars as pl import structlog -from 
virus_clade_utils.exceptions import CladeTimeInvalidDateError +from virus_clade_utils.exceptions import CladeTimeInvalidDateError, CladeTimeInvalidURLError from virus_clade_utils.util.config import Config from virus_clade_utils.util.reference import _get_s3_object_url from virus_clade_utils.util.sequence import _get_ncov_metadata, get_covid_genome_metadata @@ -103,7 +103,7 @@ def sequence_metadata(self) -> pl.LazyFrame: sequence_metadata = get_covid_genome_metadata(metadata_url=self.url_sequence_metadata) return sequence_metadata else: - sequence_metadata = pl.LazyFrame() + raise CladeTimeInvalidURLError("CladeTime is missing url_sequence_metadata") return sequence_metadata def __repr__(self): diff --git a/src/virus_clade_utils/exceptions.py b/src/virus_clade_utils/exceptions.py index c2f1010..f53b5f4 100644 --- a/src/virus_clade_utils/exceptions.py +++ b/src/virus_clade_utils/exceptions.py @@ -7,3 +7,7 @@ class Error(Exception): class CladeTimeInvalidDateError(Error): """Raised when an invalid date string is passed to CladeTime.""" + + +class CladeTimeInvalidURLError(Error): + """Raised when CladeTime encounters an invalid URL.""" diff --git a/tests/unit/test_cladetime.py b/tests/unit/test_cladetime.py index 2d75860..590c1a2 100644 --- a/tests/unit/test_cladetime.py +++ b/tests/unit/test_cladetime.py @@ -6,7 +6,7 @@ import pytest from freezegun import freeze_time from virus_clade_utils.cladetime import CladeTime -from virus_clade_utils.exceptions import CladeTimeInvalidDateError +from virus_clade_utils.exceptions import CladeTimeInvalidDateError, CladeTimeInvalidURLError def test_cladetime_no_args(): @@ -124,5 +124,6 @@ def test_cladetime_sequence_metadata_no_url(test_config): with patch("virus_clade_utils.cladetime.CladeTime._get_config", mock): ct = CladeTime() ct.url_sequence_metadata = None - # if there's no metadata url, sequence metadata should be an empty LazyFrame - assert ct.sequence_metadata.collect().shape == (0, 0) + + with 
pytest.raises(CladeTimeInvalidURLError): + ct.sequence_metadata