From 304bc237352861234fb0465f312c3ddc6b3b79a6 Mon Sep 17 00:00:00 2001 From: fgvieira <1151762+fgvieira@users.noreply.github.com> Date: Thu, 2 Jan 2025 17:00:47 +0100 Subject: [PATCH 1/3] Add support for seqkit split2 --- bio/seqkit/environment.linux-64.pin.txt | 31 ++++++++++---- bio/seqkit/environment.yaml | 1 + bio/seqkit/test/Snakefile | 16 +++++++ bio/seqkit/wrapper.py | 55 ++++++++++++++++++++----- test_wrappers.py | 14 +++++++ 5 files changed, 98 insertions(+), 19 deletions(-) diff --git a/bio/seqkit/environment.linux-64.pin.txt b/bio/seqkit/environment.linux-64.pin.txt index 4e6e7297ded..9edecb3b738 100644 --- a/bio/seqkit/environment.linux-64.pin.txt +++ b/bio/seqkit/environment.linux-64.pin.txt @@ -1,30 +1,43 @@ # This file may be used to create an environment using: # $ conda create --name --file # platform: linux-64 -# created-by: conda 24.9.0 +# created-by: conda 24.9.2 @EXPLICIT https://conda.anaconda.org/bioconda/linux-64/seqkit-2.9.0-h9ee0642_0.tar.bz2#14369282dc7a09cb3e117e4db9c9ddc2 https://conda.anaconda.org/conda-forge/linux-64/_libgcc_mutex-0.1-conda_forge.tar.bz2#d7c89558ba9fa0495403155b64376d81 -https://conda.anaconda.org/conda-forge/linux-64/ca-certificates-2024.8.30-hbcca054_0.conda#c27d1c142233b5bc9ca570c6e2e0c244 +https://conda.anaconda.org/conda-forge/linux-64/ca-certificates-2024.12.14-hbcca054_0.conda#720523eb0d6a9b0f6120c16b2aa4e7de +https://conda.anaconda.org/conda-forge/linux-64/ld_impl_linux-64-2.43-h712a8e2_2.conda#048b02e3962f066da18efe3a21b77672 +https://conda.anaconda.org/conda-forge/linux-64/python_abi-3.13-5_cp313.conda#381bbd2a92c863f640a55b6ff3c35161 +https://conda.anaconda.org/conda-forge/noarch/tzdata-2024b-hc8b5060_0.conda#8ac3367aafb1cc0a068483c580af8015 https://conda.anaconda.org/conda-forge/linux-64/libgomp-14.2.0-h77fa898_1.conda#cc3573974587f12dda90d96e3e55a702 https://conda.anaconda.org/conda-forge/linux-64/_openmp_mutex-4.5-2_gnu.tar.bz2#73aaf86a425cc6e73fcf236a5a46396d https://conda.anaconda.org/conda-forge/linux-64/libgcc-14.2.0-h77fa898_1.conda#3cb76c3f10d3bc7f1105b2fc9db984df -https://conda.anaconda.org/conda-forge/linux-64/c-ares-1.34.2-heb4867d_0.conda#2b780c0338fc0ffa678ac82c54af51fd +https://conda.anaconda.org/conda-forge/linux-64/c-ares-1.34.4-hb9d3cd8_0.conda#e2775acf57efd5af15b8e3d1d74d72d3 +https://conda.anaconda.org/conda-forge/linux-64/libdeflate-1.23-h4ddbbb0_0.conda#8dfae1d2e74767e9ce36d5fa0d8605db +https://conda.anaconda.org/conda-forge/linux-64/libexpat-2.6.4-h5888daf_0.conda#db833e03127376d461e1e13e76f09b6c https://conda.anaconda.org/conda-forge/linux-64/libgcc-ng-14.2.0-h69a702a_1.conda#e39480b9ca41323497b05492a63bc35b +https://conda.anaconda.org/conda-forge/linux-64/liblzma-5.6.3-hb9d3cd8_1.conda#2ecf2f1c7e4e21fcfe6423a51a992d84 https://conda.anaconda.org/conda-forge/linux-64/libstdcxx-14.2.0-hc0a3c3a_1.conda#234a5554c53625688d51062645337328 https://conda.anaconda.org/conda-forge/linux-64/libzlib-1.3.1-hb9d3cd8_2.conda#edb0dca6bc32e4f4789199455a1dbeb8 -https://conda.anaconda.org/conda-forge/linux-64/openssl-3.3.2-hb9d3cd8_0.conda#4d638782050ab6faa27275bed57e9b4e +https://conda.anaconda.org/conda-forge/linux-64/openssl-3.4.0-hb9d3cd8_0.conda#23cc74f77eb99315c0360ec3533147a9 https://conda.anaconda.org/conda-forge/linux-64/bzip2-1.0.8-h4bc722e_7.conda#62ee74e96c5ebb0af99386de58cf9553 https://conda.anaconda.org/conda-forge/linux-64/keyutils-1.6.1-h166bdaf_0.tar.bz2#30186d27e2c9fa62b45fb1476b7200e3 -https://conda.anaconda.org/conda-forge/linux-64/libdeflate-1.21-h4bc722e_0.conda#36ce76665bf67f5aac36be7a0d21b7f3 https://conda.anaconda.org/conda-forge/linux-64/libev-4.33-hd590300_2.conda#172bf1cd1ff8629f2b1179945ed45055 -https://conda.anaconda.org/conda-forge/linux-64/libssh2-1.11.0-h0841786_0.conda#1f5a58e686b13bcfde88b93f547d23fe +https://conda.anaconda.org/conda-forge/linux-64/libffi-3.4.2-h7f98852_5.tar.bz2#d645c6d2ac96843a2bfaccd2d62b3ac3 +https://conda.anaconda.org/conda-forge/linux-64/libmpdec-4.0.0-h4bc722e_0.conda#aeb98fdeb2e8f25d43ef71fbacbeec80 +https://conda.anaconda.org/conda-forge/linux-64/libsqlite-3.47.2-hee588c1_0.conda#b58da17db24b6e08bcbf8fed2fb8c915 +https://conda.anaconda.org/conda-forge/linux-64/libssh2-1.11.1-hf672d98_0.conda#be2de152d8073ef1c01b7728475f2fe7 https://conda.anaconda.org/conda-forge/linux-64/libstdcxx-ng-14.2.0-h4852527_1.conda#8371ac6457591af2cf6159439c1fd051 +https://conda.anaconda.org/conda-forge/linux-64/libuuid-2.38.1-h0b41bf4_0.conda#40b61aab5c7ba9ff276c41cfffe6b80b https://conda.anaconda.org/conda-forge/linux-64/ncurses-6.5-he02047a_1.conda#70caf8bb6cf39a0b6b7efc885f51c0fe -https://conda.anaconda.org/conda-forge/linux-64/xz-5.2.6-h166bdaf_0.tar.bz2#2161070d867d1b1204ea749c8eec4ef0 +https://conda.anaconda.org/conda-forge/linux-64/tk-8.6.13-noxft_h4845f30_101.conda#d453b98d9c83e71da0741bb0ff4d76bc https://conda.anaconda.org/conda-forge/linux-64/libedit-3.1.20191231-he28a2e2_2.tar.bz2#4d331e44109e3f0e19b4cb8f9b82f3e1 https://conda.anaconda.org/conda-forge/linux-64/libnghttp2-1.64.0-h161d5f1_0.conda#19e57602824042dfd0446292ef90488b +https://conda.anaconda.org/conda-forge/linux-64/readline-8.2-h8228510_1.conda#47d31b792659ce70f470b5c82fdfb7a4 https://conda.anaconda.org/conda-forge/linux-64/zstd-1.5.6-ha6fb4c9_0.conda#4d056880988120e29d75bfff282e0f45 https://conda.anaconda.org/conda-forge/linux-64/krb5-1.21.3-h659f571_0.conda#3f43953b7d3fb3aaa1d0d0723d91e368 -https://conda.anaconda.org/conda-forge/linux-64/libcurl-8.10.1-hbbe4b11_0.conda#6e801c50a40301f6978c53976917b277 -https://conda.anaconda.org/bioconda/linux-64/htslib-1.21-h5efdd21_0.tar.bz2#06b995dc2244c024b45bbb3e53ae2f27 +https://conda.anaconda.org/conda-forge/linux-64/python-3.13.1-ha99a958_102_cp313.conda#6e7535f1d1faf524e9210d2689b3149b +https://conda.anaconda.org/conda-forge/linux-64/libcurl-8.11.1-h332b0f4_0.conda#2b3e0081006dc21e8bf53a91c83a055c +https://conda.anaconda.org/conda-forge/noarch/pip-24.3.1-pyh145f28c_2.conda#76601b0ccfe1fe13a21a5f8813cb38de +https://conda.anaconda.org/bioconda/noarch/snakemake-wrapper-utils-0.6.2-pyhdfd78af_0.tar.bz2#fd8759bbd04116eace828c4fab906096 +https://conda.anaconda.org/bioconda/linux-64/htslib-1.21-h566b1c6_1.tar.bz2#944598fba531a668e8fafea92ca39bb4 diff --git a/bio/seqkit/environment.yaml b/bio/seqkit/environment.yaml index 132f28f3073..6d440a2fbb1 100644 --- a/bio/seqkit/environment.yaml +++ b/bio/seqkit/environment.yaml @@ -5,3 +5,4 @@ channels: dependencies: - seqkit =2.9.0 - htslib =1.21 + - snakemake-wrapper-utils =0.6.2 diff --git a/bio/seqkit/test/Snakefile b/bio/seqkit/test/Snakefile index bf6dca4d5ed..ac1cdf66fcd 100644 --- a/bio/seqkit/test/Snakefile +++ b/bio/seqkit/test/Snakefile @@ -190,3 +190,19 @@ rule seqkit_concat: threads: 2 wrapper: "master/bio/seqkit" + + +rule seqkit_split2_part: + input: + fasta="data/{sample}.fa", + output: + fasta=["out/split2/part/{sample}.1-of-2.fas", "out/split2/part/{sample}.2-of-2.fas"], + log: + "logs/split/part/{sample}.log", + params: + command="split2", + extra="--by-part 2", + out_bgzip=True, + threads: 2 + wrapper: + "master/bio/seqkit" diff --git a/bio/seqkit/wrapper.py b/bio/seqkit/wrapper.py index d3ef69ae367..3f55df40e04 100644 --- a/bio/seqkit/wrapper.py +++ b/bio/seqkit/wrapper.py @@ -2,19 +2,27 @@ __copyright__ = "Copyright 2023, Filipe G. Vieira" __license__ = "MIT" -from snakemake.shell import shell +import tempfile from pathlib import Path +from snakemake.shell import shell +from snakemake_wrapper_utils.snakemake import is_arg extra = snakemake.params.get("extra", "") log = snakemake.log_fmt_shell(stdout=False, stderr=True) -# subcommands concat and common use multiple input files + +# Subcommands with multiple input files if snakemake.params.command in ["concat", "common", "stats"]: input = " ".join(snakemake.input) +# Subcommands with a single input file (if more than one provided, concat'em all) +elif snakemake.params.command in ["stats", "sum", "rmdup", "split", "split2", "sample", "sort"]: + input = "<(cat " + " ".join(snakemake.input) + ")" else: input = snakemake.input[0] + +# Extra input extra_input = " ".join( [ ( @@ -26,6 +34,8 @@ ][1:] ) + +# Extra output extra_output = " ".join( [ ( @@ -38,15 +48,33 @@ ) -if snakemake.params.get("out_bgzip"): - assert Path(snakemake.output[0]).suffix in [ - ".gz", - ".bgz", - ".bgzip", - ], "invalid output file extension" - input = input + f" | bgzip --threads {snakemake.threads} > {snakemake.output[0]}" +if snakemake.params.command in ["split", "split2"]: + # Check type of splitting + if is_arg("-i", extra) or is_arg("--by-id", extra): + split_by = "id" + elif is_arg("-p", extra) or is_arg("--by-part", extra): + split_by = "part" + elif is_arg("-r", extra) or is_arg("--by-region", extra): + split_by = "region" + elif is_arg("-s", extra) or is_arg("--by-size", extra): + split_by = "size" + elif is_arg("-l", extra) or is_arg("--by-length", extra): + split_by = "length" + + out_dir = Path(snakemake.output[0]).parent + output = f"--out-dir {out_dir} --by-{split_by}-prefix output_part. --extension .fas" else: - input = f"--out-file {snakemake.output[0]} " + input + if snakemake.params.get("out_bgzip"): + assert Path(snakemake.output[0]).suffix in [ + ".gz", + ".bgz", + ".bgzip", + ], "invalid output file extension" + output = f"| bgzip --threads {snakemake.threads} > {snakemake.output[0]}" + else: + output = f"--out-file {snakemake.output[0]}" + + shell( "(seqkit {snakemake.params.command}" @@ -55,5 +83,12 @@ " {extra_output}" " {extra}" " {input}" + " {output}" ") {log}" ) + + +# Rename output files +if snakemake.params.command in ["split", "split2"]: + for idx, output_file in enumerate(snakemake.output, start=1): + shell("mv {out_dir}/output_part.{idx:03d}.fas {output_file}") diff --git a/test_wrappers.py b/test_wrappers.py index 0a7a3e8798e..b35a7a5be99 100644 --- a/test_wrappers.py +++ b/test_wrappers.py @@ -461,6 +461,20 @@ def test_seqkit_concat(run): ], ) +def test_seqkit_split2_part(run): + run( + "bio/seqkit", + [ + "snakemake", + "--cores", + "2", + "--use-conda", + "-F", + "out/split2/part/a.1-of-2.fas", + "out/split2/part/a.2-of-2.fas", + ], + ) + def test_sickle_pe(run): run( From be6041b4c0148d63c30e672e2f7eae6bfb435d76 Mon Sep 17 00:00:00 2001 From: fgvieira <1151762+fgvieira@users.noreply.github.com> Date: Thu, 2 Jan 2025 17:17:19 +0100 Subject: [PATCH 2/3] Code format --- bio/seqkit/test/Snakefile | 5 ++++- bio/seqkit/wrapper.py | 13 ++++++++++--- 2 files changed, 14 insertions(+), 4 deletions(-) diff --git a/bio/seqkit/test/Snakefile b/bio/seqkit/test/Snakefile index ac1cdf66fcd..001aa105538 100644 --- a/bio/seqkit/test/Snakefile +++ b/bio/seqkit/test/Snakefile @@ -196,7 +196,10 @@ rule seqkit_split2_part: input: fasta="data/{sample}.fa", output: - fasta=["out/split2/part/{sample}.1-of-2.fas", "out/split2/part/{sample}.2-of-2.fas"], + fasta=[ + "out/split2/part/{sample}.1-of-2.fas", + "out/split2/part/{sample}.2-of-2.fas", + ], log: "logs/split/part/{sample}.log", params: diff --git a/bio/seqkit/wrapper.py b/bio/seqkit/wrapper.py index 3f55df40e04..cda0a459dd9 100644 --- a/bio/seqkit/wrapper.py +++ b/bio/seqkit/wrapper.py @@ -16,7 +16,15 @@ if snakemake.params.command in ["concat", "common", "stats"]: input = " ".join(snakemake.input) # Subcommands with a single input file (if more than one provided, concat'em all) -elif snakemake.params.command in ["stats", "sum", "rmdup", "split", "split2", "sample", "sort"]: +elif snakemake.params.command in [ + "stats", + "sum", + "rmdup", + "split", + "split2", + "sample", + "sort", +]: input = "<(cat " + " ".join(snakemake.input) + ")" else: input = snakemake.input[0] @@ -69,13 +77,12 @@ ".gz", ".bgz", ".bgzip", - ], "invalid output file extension" + ], "invalid output file extension" output = f"| bgzip --threads {snakemake.threads} > {snakemake.output[0]}" else: output = f"--out-file {snakemake.output[0]}" - shell( "(seqkit {snakemake.params.command}" " --threads {snakemake.threads}" From e6058dc2e8e2e400b257489322de1779d936f9c4 Mon Sep 17 00:00:00 2001 From: fgvieira <1151762+fgvieira@users.noreply.github.com> Date: Thu, 2 Jan 2025 17:27:05 +0100 Subject: [PATCH 3/3] Removed double stats --- bio/seqkit/test/Snakefile | 1 - bio/seqkit/wrapper.py | 1 - 2 files changed, 2 deletions(-) diff --git a/bio/seqkit/test/Snakefile b/bio/seqkit/test/Snakefile index 001aa105538..db3e92d92a4 100644 --- a/bio/seqkit/test/Snakefile +++ b/bio/seqkit/test/Snakefile @@ -205,7 +205,6 @@ rule seqkit_split2_part: params: command="split2", extra="--by-part 2", - out_bgzip=True, threads: 2 wrapper: "master/bio/seqkit" diff --git a/bio/seqkit/wrapper.py b/bio/seqkit/wrapper.py index cda0a459dd9..516b0ff1e96 100644 --- a/bio/seqkit/wrapper.py +++ b/bio/seqkit/wrapper.py @@ -17,7 +17,6 @@ input = " ".join(snakemake.input) # Subcommands with a single input file (if more than one provided, concat'em all) elif snakemake.params.command in [ - "stats", "sum", "rmdup", "split",