
Commit

Merge branch 'dev' into documentation
mashehu authored Dec 3, 2024
2 parents a683b97 + be5f2e2 commit ac5497a
Showing 7 changed files with 192 additions and 11 deletions.
2 changes: 2 additions & 0 deletions CHANGELOG.md
@@ -14,10 +14,12 @@
- Move `includeConfig 'conf/modules.config'` next to `includeConfig 'conf/base.config'` to not overwrite tests profiles configurations ([#3301](https://github.com/nf-core/tools/pull/3301))
- Use `params.monochrome_logs` in the template and update nf-core components ([#3310](https://github.com/nf-core/tools/pull/3310))
- Fix some typos and improve writing in `usage.md` and `CONTRIBUTING.md` ([#3302](https://github.com/nf-core/tools/pull/3302))
- Add `manifest.contributors` to `nextflow.config` ([#3311](https://github.com/nf-core/tools/pull/3311))

### Download

- First steps towards fixing [#3179](https://github.com/nf-core/tools/issues/3179): Modify `prioritize_direct_download()` to retain Seqera Singularity https:// Container URIs and hardcode Seqera Containers into `gather_registries()` ([#3244](https://github.com/nf-core/tools/pull/3244)).
- Further steps towards fixing [#3179](https://github.com/nf-core/tools/issues/3179): Enable limited support for `oras://` container paths (_only absolute URIs, no flexible registries like with Docker_) and prevent unnecessary image downloads for Seqera Container modules with `reconcile_seqera_container_uris()` ([#3293](https://github.com/nf-core/tools/pull/3293)).
- Update dawidd6/action-download-artifact action to v7 ([#3306](https://github.com/nf-core/tools/pull/3306))

### Linting
15 changes: 14 additions & 1 deletion nf_core/pipeline-template/nextflow.config
@@ -274,7 +274,20 @@ dag {

manifest {
name = '{{ name }}'
author = """{{ author }}"""
author = """{{ author }}""" // The author field is deprecated from Nextflow version 24.10.0, use contributors instead
contributors = [
// TODO nf-core: Update the field with the details of the contributors to your pipeline. New with Nextflow version 24.10.0
{%- for author_name in author.split(",") %}
[
name: '{{ author_name }}',
affiliation: '',
email: '',
github: '',
contribution: [], // List of contribution types ('author', 'maintainer' or 'contributor')
orcid: ''
],
{%- endfor %}
]
homePage = 'https://github.com/{{ name }}'
description = """{{ description }}"""
mainScript = 'main.nf'
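For illustration, a standalone sketch (not part of this commit; the author string "Jane Doe,John Smith" is hypothetical) of how the Jinja2 loop above expands a comma-separated author value into one contributor map per name. In the real template, affiliation, email, github and orcid stay empty for the pipeline developer to fill in.

```python
# Standalone sketch: render a reduced version of the contributors loop above.
# "Jane Doe,John Smith" is a hypothetical author value, not taken from this commit.
from jinja2 import Template

snippet = Template(
    "contributors = ["
    '{%- for author_name in author.split(",") %}\n'
    "    [name: '{{ author_name }}', contribution: []],"
    "{%- endfor %}\n"
    "]"
)

print(snippet.render(author="Jane Doe,John Smith"))
# contributors = [
#     [name: 'Jane Doe', contribution: []],
#     [name: 'John Smith', contribution: []],
# ]
```

As in the template itself, the string is split on the literal comma, so any space after a comma is kept in the rendered name.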
71 changes: 65 additions & 6 deletions nf_core/pipelines/download.py
@@ -839,11 +839,12 @@ def rectify_raw_container_matches(self, raw_findings):
url_regex = (
r"https?:\/\/(www\.)?[-a-zA-Z0-9@:%._\+~#=]{1,256}\.[a-zA-Z0-9()]{1,6}\b([-a-zA-Z0-9()@:%_\+.~#?&//=]*)"
)
oras_regex = r"oras:\/\/[-a-zA-Z0-9@:%._\+~#=]{1,256}\.[a-zA-Z0-9()]{1,6}\b([-a-zA-Z0-9()@:%_\+.~#?&//=]*)"
# Thanks Stack Overflow for the regex: https://stackoverflow.com/a/39672069/713980
docker_regex = r"^(?:(?=[^:\/]{1,253})(?!-)[a-zA-Z0-9-]{1,63}(?<!-)(?:\.(?!-)[a-zA-Z0-9-]{1,63}(?<!-))*(?::[0-9]{1,5})?/)?((?![._-])(?:[a-z0-9._-]*)(?<![._-])(?:/(?![._-])[a-z0-9._-]*(?<![._-]))*)(?::(?![.-])[a-zA-Z0-9_.-]{1,128})?$"

# at this point, we don't have to distinguish anymore, because we will later prioritize direct downloads over Docker URIs.
either_url_or_docker = re.compile(f"{url_regex}|{docker_regex}", re.S)
either_url_or_docker = re.compile(f"{url_regex}|{oras_regex}|{docker_regex}", re.S)

for _, container_value, search_space, file_path in raw_findings:
"""
@@ -1000,14 +1001,18 @@ def prioritize_direct_download(self, container_list: List[str]) -> List[str]:
'https://community-cr-prod.seqera.io/docker/registry/v2/blobs/sha256/63/6397750e9730a3fbcc5b4c43f14bd141c64c723fd7dad80e47921a68a7c3cd21/data'
'https://community-cr-prod.seqera.io/docker/registry/v2/blobs/sha256/c2/c262fc09eca59edb5a724080eeceb00fb06396f510aefb229c2d2c6897e63975/data'
Lastly, we want to remove at least a few Docker URIs for those modules that have an oras:// download link.
"""
d: Dict[str, str] = {}
seqera_containers: List[str] = []
seqera_containers_http: List[str] = []
seqera_containers_oras: List[str] = []
all_others: List[str] = []

for c in container_list:
if bool(re.search(r"/data$", c)):
seqera_containers.append(c)
seqera_containers_http.append(c)
elif bool(re.search(r"^oras://", c)):
seqera_containers_oras.append(c)
else:
all_others.append(c)

@@ -1016,8 +1021,47 @@
log.debug(f"{c} matches and will be saved as {k}")
d[k] = c

# combine deduplicated others and Seqera containers
return sorted(list(d.values()) + seqera_containers)
combined_with_oras = self.reconcile_seqera_container_uris(seqera_containers_oras, list(d.values()))

# combine the deduplicated others (oras:// Seqera containers, other http downloads and Docker URIs) with the Seqera https:// containers
return sorted(list(set(combined_with_oras + seqera_containers_http)))
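A standalone sketch (not part of this commit) of the bucketing step above, using container strings that appear elsewhere in this diff:

```python
# Standalone sketch: Seqera direct downloads end in "/data", native Singularity images
# start with "oras://", and everything else is kept for registry-based deduplication.
import re

containers = [
    "https://community-cr-prod.seqera.io/docker/registry/v2/blobs/sha256/63/6397750e9730a3fbcc5b4c43f14bd141c64c723fd7dad80e47921a68a7c3cd21/data",
    "oras://community.wave.seqera.io/library/umi-transfer:1.0.0--e5b0c1a65b8173b6",
    "biocontainers/sylph:0.6.1--b97274cdc1caa649",
]

seqera_containers_http = [c for c in containers if re.search(r"/data$", c)]
seqera_containers_oras = [c for c in containers if re.search(r"^oras://", c)]
all_others = [c for c in containers if c not in seqera_containers_http and c not in seqera_containers_oras]

print(len(seqera_containers_http), len(seqera_containers_oras), len(all_others))  # 1 1 1
```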

@staticmethod
def reconcile_seqera_container_uris(prioritized_container_list: List[str], other_list: List[str]) -> List[str]:
"""
Helper function that takes a list of Seqera container URIs,
extracts the software strings and builds a regex from them to filter out
similar containers from the second container list.
prioritized_container_list = [
... "oras://community.wave.seqera.io/library/multiqc:1.25.1--f0e743d16869c0bf",
... "oras://community.wave.seqera.io/library/multiqc_pip_multiqc-plugins:e1f4877f1515d03c"
... ]
will be cleaned to
['library/multiqc:1.25.1', 'library/multiqc_pip_multiqc-plugins']
Subsequently, build a regex from those stems and filter out matching duplicates in other_list.
"""
if not prioritized_container_list:
return other_list
else:
# trim the URIs to the stem that contains the tool string; assign with the walrus operator to account for non-matching patterns
trimmed_priority_list = [
match.group()
for c in set(prioritized_container_list)
if (match := re.search(r"library/.*?:[\d.]+", c) if "--" in c else re.search(r"library/[^\s:]+", c))
]

# build regex
prioritized_containers = re.compile("|".join(f"{re.escape(c)}" for c in trimmed_priority_list))

# filter out matches in other list
filtered_containers = [c for c in other_list if not re.search(prioritized_containers, c)]

# combine prioritized and regular container lists
return sorted(list(set(prioritized_container_list + filtered_containers)))
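To make the trimming step described in the docstring concrete, a standalone sketch (not part of this commit) that reproduces the "will be cleaned to" example:

```python
# Standalone sketch: reduce the oras:// URIs to their "library/<tool>[:<version>]" stem,
# exactly as the list comprehension above does for the docstring example.
import re

prioritized = [
    "oras://community.wave.seqera.io/library/multiqc:1.25.1--f0e743d16869c0bf",
    "oras://community.wave.seqera.io/library/multiqc_pip_multiqc-plugins:e1f4877f1515d03c",
]

trimmed = [
    match.group()
    for c in set(prioritized)
    if (match := re.search(r"library/.*?:[\d.]+", c) if "--" in c else re.search(r"library/[^\s:]+", c))
]

print(sorted(trimmed))  # ['library/multiqc:1.25.1', 'library/multiqc_pip_multiqc-plugins']
```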

def gather_registries(self, workflow_directory: str) -> None:
"""Fetch the registries from the pipeline config and CLI arguments and store them in a set.
@@ -1419,9 +1463,10 @@ def singularity_pull_image(
# Sometimes, containers still contain an explicit library specification, which
# resulted in attempted pulls e.g. from docker://quay.io/quay.io/qiime2/core:2022.11
# Thus, if an explicit registry is specified, the provided -l value is ignored.
# Additionally, check whether the container to be pulled uses the native Singularity oras:// protocol.
container_parts = container.split("/")
if len(container_parts) > 2:
address = f"docker://{container}"
address = container if container.startswith("oras://") else f"docker://{container}"
absolute_URI = True
else:
address = f"docker://{library}/{container.replace('docker://', '')}"
@@ -1843,6 +1888,9 @@ def __init__(
elif re.search(r"manifest\sunknown", line):
self.error_type = self.InvalidTagError(self)
break
elif re.search(r"ORAS\sSIF\simage\sshould\shave\sa\ssingle\slayer", line):
self.error_type = self.NoSingularityContainerError(self)
break
elif re.search(r"Image\sfile\salready\sexists", line):
self.error_type = self.ImageExistsError(self)
break
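For reference, a standalone sketch (not part of this commit; the log line is an assumed example, not captured output) of the pattern the new branch matches against apptainer/singularity output:

```python
# Standalone sketch: the regex from the new elif branch above, applied to an
# illustrative error line.
import re

line = "FATAL:   While pulling oras image: ORAS SIF image should have a single layer"
print(bool(re.search(r"ORAS\sSIF\simage\sshould\shave\sa\ssingle\slayer", line)))  # True
```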
@@ -1907,6 +1955,17 @@ def __init__(self, error_log):
self.helpmessage = f'Saving image of "{self.error_log.container}" failed, because "{self.error_log.out_path}" exists.\nPlease troubleshoot the command \n"{" ".join(self.error_log.singularity_command)}" manually.\n'
super().__init__(self.message)

class NoSingularityContainerError(RuntimeError):
"""The container image is no native Singularity Image Format."""

def __init__(self, error_log):
self.error_log = error_log
self.message = (
f'[bold red]"{self.error_log.container}" is no valid Singularity Image Format container.[/]\n'
)
self.helpmessage = f"Pulling \"{self.error_log.container}\" failed, because it appears invalid. To convert from Docker's OCI format, prefix the URI with 'docker://' instead of 'oras://'.\n"
super().__init__(self.message)

class OtherError(RuntimeError):
"""Undefined error with the container"""

@@ -0,0 +1,11 @@
process UMI_TRANSFER {
label 'process_single'

conda "${moduleDir}/environment.yml"
container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ?
'oras://community.wave.seqera.io/library/umi-transfer:1.0.0--e5b0c1a65b8173b6' :
'community.wave.seqera.io/library/umi-transfer:1.0.0--d30e8812ea280fa1' }"

// truncated

}
@@ -0,0 +1,11 @@
process UMI_TRANSFER_MULLED {
label 'process_single'

conda "${moduleDir}/environment.yml"
container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ?
'oras://community.wave.seqera.io/library/umi-transfer_umicollapse:796a995ff53da9e3' :
'community.wave.seqera.io/library/umi-transfer_umicollapse:3298d4f1b49e33bd' }"

// truncated

}
93 changes: 89 additions & 4 deletions tests/pipelines/test_download.py
@@ -257,7 +257,20 @@ def test_find_container_images_modules(self, tmp_path, mock_fetch_wf_config):
not in download_obj.containers
)

# mock_seqera_container.nf
# mock_seqera_container_oras.nf
assert "oras://community.wave.seqera.io/library/umi-transfer:1.0.0--e5b0c1a65b8173b6" in download_obj.containers
assert "community.wave.seqera.io/library/umi-transfer:1.0.0--d30e8812ea280fa1" not in download_obj.containers

# mock_seqera_container_oras_mulled.nf
assert (
"oras://community.wave.seqera.io/library/umi-transfer_umicollapse:796a995ff53da9e3"
in download_obj.containers
)
assert (
"community.wave.seqera.io/library/umi-transfer_umicollapse:3298d4f1b49e33bd" not in download_obj.containers
)

# mock_seqera_container_http.nf
assert (
"https://community-cr-prod.seqera.io/docker/registry/v2/blobs/sha256/c2/c262fc09eca59edb5a724080eeceb00fb06396f510aefb229c2d2c6897e63975/data"
in download_obj.containers
@@ -294,6 +307,7 @@ def test_prioritize_direct_download(self, tmp_path):
"https://depot.galaxyproject.org/singularity/sortmerna:4.2.0--h9ee0642_1",
"https://community-cr-prod.seqera.io/docker/registry/v2/blobs/sha256/63/6397750e9730a3fbcc5b4c43f14bd141c64c723fd7dad80e47921a68a7c3cd21/data",
"https://community-cr-prod.seqera.io/docker/registry/v2/blobs/sha256/c2/c262fc09eca59edb5a724080eeceb00fb06396f510aefb229c2d2c6897e63975/data",
"https://community-cr-prod.seqera.io/docker/registry/v2/blobs/sha256/c2/c262fc09eca59edb5a724080eeceb00fb06396f510aefb229c2d2c6897e63975/data",
]

result = download_obj.prioritize_direct_download(test_container)
@@ -316,7 +330,7 @@
assert "https://depot.galaxyproject.org/singularity/sortmerna:4.3.7--hdbdd923_0" in result
assert "https://depot.galaxyproject.org/singularity/sortmerna:4.2.0--h9ee0642_1" in result

# Verify that Seqera containers are not deduplicated
# Verify that Seqera containers are not deduplicated...
assert (
"https://community-cr-prod.seqera.io/docker/registry/v2/blobs/sha256/63/6397750e9730a3fbcc5b4c43f14bd141c64c723fd7dad80e47921a68a7c3cd21/data"
in result
@@ -325,6 +339,58 @@
"https://community-cr-prod.seqera.io/docker/registry/v2/blobs/sha256/c2/c262fc09eca59edb5a724080eeceb00fb06396f510aefb229c2d2c6897e63975/data"
in result
)
# ...but identical ones are.
assert (
result.count(
"https://community-cr-prod.seqera.io/docker/registry/v2/blobs/sha256/c2/c262fc09eca59edb5a724080eeceb00fb06396f510aefb229c2d2c6897e63975/data"
)
== 1
)

#
# Test for 'reconcile_seqera_container_uris'
#
@with_temporary_folder
def test_reconcile_seqera_container_uris(self, tmp_path):
download_obj = DownloadWorkflow(pipeline="dummy", outdir=tmp_path)

prioritized_container = [
"oras://community.wave.seqera.io/library/umi-transfer:1.0.0--e5b0c1a65b8173b6",
"oras://community.wave.seqera.io/library/sylph:0.6.1--b97274cdc1caa649",
]

test_container = [
"https://depot.galaxyproject.org/singularity/ubuntu:22.04",
"nf-core/ubuntu:22.04",
"nf-core/ubuntu:22.04",
"nf-core/ubuntu:22.04",
"community.wave.seqera.io/library/umi-transfer:1.5.0--73c1a6b65e5b0b81",
"community.wave.seqera.io/library/sylph:0.6.1--a21713a57a65a373",
"biocontainers/sylph:0.6.1--b97274cdc1caa649",
]

# test that the test_container list is returned as-is if no prioritized containers are specified
result_empty = download_obj.reconcile_seqera_container_uris([], test_container)
assert result_empty == test_container

result = download_obj.reconcile_seqera_container_uris(prioritized_container, test_container)

# Verify that unrelated images are retained
assert "https://depot.galaxyproject.org/singularity/ubuntu:22.04" in result
assert "nf-core/ubuntu:22.04" in result

# Verify that the priority works for regular Seqera containers (native Singularity over Docker, but only for the Seqera registry)
assert "oras://community.wave.seqera.io/library/sylph:0.6.1--b97274cdc1caa649" in result
assert "community.wave.seqera.io/library/sylph:0.6.1--a21713a57a65a373" not in result
assert "biocontainers/sylph:0.6.1--b97274cdc1caa649" in result

# Verify that version strings are respected: Version 1.0.0 does not replace version 1.5.0
assert "oras://community.wave.seqera.io/library/umi-transfer:1.0.0--e5b0c1a65b8173b6" in result
assert "community.wave.seqera.io/library/umi-transfer:1.5.0--73c1a6b65e5b0b81" in result

# assert that the deduplication works
assert test_container.count("nf-core/ubuntu:22.04") == 3
assert result.count("nf-core/ubuntu:22.04") == 1

#
# Tests for 'singularity_pull_image'
@@ -356,11 +422,30 @@ def test_singularity_pull_image_singularity_installed(self, tmp_dir, mock_rich_p
"docker.io/bschiffthaler/sed", f"{tmp_dir}/sed.sif", None, "docker.io", mock_rich_progress
)

# Test successful pull with absolute oras:// URI
download_obj.singularity_pull_image(
"oras://community.wave.seqera.io/library/umi-transfer:1.0.0--e5b0c1a65b8173b6",
f"{tmp_dir}/umi-transfer-oras.sif",
None,
"docker.io",
mock_rich_progress,
)

# try pulling Docker container image with oras://
with pytest.raises(ContainerError.NoSingularityContainerError):
download_obj.singularity_pull_image(
"oras://ghcr.io/matthiaszepper/umi-transfer:dev",
f"{tmp_dir}/umi-transfer-oras_impostor.sif",
None,
"docker.io",
mock_rich_progress,
)

# try to pull from non-existing registry (Name change hello-world_new.sif is needed, otherwise ImageExistsError is raised before attempting to pull.)
with pytest.raises(ContainerError.RegistryNotFoundError):
download_obj.singularity_pull_image(
"hello-world",
f"{tmp_dir}/hello-world_new.sif",
f"{tmp_dir}/break_the_registry_test.sif",
None,
"register-this-domain-to-break-the-test.io",
mock_rich_progress,
@@ -396,7 +481,7 @@ def test_singularity_pull_image_singularity_installed(self, tmp_dir, mock_rich_p
with pytest.raises(ContainerError.InvalidTagError):
download_obj.singularity_pull_image(
"ewels/multiqc:go-rewrite",
f"{tmp_dir}/umi-transfer.sif",
f"{tmp_dir}/multiqc-go.sif",
None,
"ghcr.io",
mock_rich_progress,
