Skip to content

Commit

Permalink
Merge branch 'CW-3745-add-pytests' into 'dev'
Browse files Browse the repository at this point in the history
Unit tests [CW-3745]

See merge request epi2melabs/workflows/wf-metagenomics!194
  • Loading branch information
nggvs committed Dec 13, 2024
2 parents c9c81a9 + 117ac3e commit e1bf542
Show file tree
Hide file tree
Showing 13 changed files with 473 additions and 31 deletions.
2 changes: 1 addition & 1 deletion CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@ The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.1.0/),
and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).


## [Unreleased]
## [v2.12.0]
### Added
- `bracken_threshold` parameter to adjust bracken minimum read threshold, default 10.
### Fixed
Expand Down
35 changes: 18 additions & 17 deletions bin/workflow_glue/report.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,7 @@
import workflow_glue.diversity as diversity
import workflow_glue.report_utils.report_utils as report_utils


from .util import get_named_logger, wf_parser # noqa: ABS101

# Setup simple globals
Expand All @@ -34,8 +35,8 @@ def amr_section(amr_data, html_id):
for i, (gene, data) in enumerate(amr_data.items()):
_head = html_tags.h2(id=str(i), style="border: 1px solid rgba(0,0,0,.125);\
border-collapse: collapse;\
padding:0;\
margin-bottom:0")
padding:0;\
margin-bottom:0")
_button = html_tags.button(
html_tags.span(html_tags.b(gene)),
html_tags.span(
Expand Down Expand Up @@ -424,7 +425,7 @@ def main(args):
params = json.load(f)
amr_db = params["amr_db"].capitalize()
p(f"""Detection of acquired AMR genes within sample using Abricate
with the {amr_db} database.
with the {amr_db} database.
Please note that SNP-mediated AMR cannot be detected.
""")
amr_data = report_utils.parse_amr(args.amr)
Expand Down Expand Up @@ -516,38 +517,38 @@ def argparser():
help="sample metadata")
parser.add_argument(
"--read_stats", nargs='+', required=False,
help="Fastcat per-read stats, ordered as per entries in --metadata "
)
help="Fastcat per-read stats, ordered as per entries in --metadata",
type=report_utils.is_not_empty_or_exit)
parser.add_argument(
"--lineages", nargs='+', required=True,
help="Read lineage file.")
help="Read lineage file.",
type=report_utils.is_not_empty_or_exit)
parser.add_argument(
"--align_stats", required=False,
help="Folder containing the mapping and depth statistics in TSV format.")
help="Folder containing the mapping and depth statistics in TSV format.",
type=report_utils.is_not_empty_or_exit)
parser.add_argument(
"--abundance_table", required=True,
help="Read abundance tsv file.")
help="Read abundance tsv file.",
type=report_utils.is_not_empty_or_exit)
parser.add_argument(
'--taxonomic_rank', required=True, choices=["S", "G", "k", "F", "O", "C", "P"],
help="Taxonomic rank.")
parser.add_argument(
"--versions", required=True,
help="directory containing CSVs containing name,version.")
help="directory containing CSVs containing name,version.",
type=report_utils.is_not_empty_or_exit)
parser.add_argument(
"--params", default=None, required=True,
help="A JSON file containing the workflow parameter key/values")
parser.add_argument(
"--revision", default='unknown',
help="git branch/tag of the executed workflow")
parser.add_argument(
"--commit", default='unknown',
help="git commit of the executed workflow")
help="A JSON file containing the workflow parameter key/values",
type=report_utils.is_not_empty_or_exit)
parser.add_argument(
"--pipeline", default='kraken2', choices=["kraken2", "minimap2", "real_time"],
help="kraken2, minimap2 or real_time")
parser.add_argument(
"--amr", default=None,
help="Path to combined AMR results")
help="Path to combined AMR results",
type=report_utils.is_not_empty_or_exit)
parser.add_argument(
"--abundance_threshold", default=1, type=float,
help="Remove those taxa whose abundance is below this cut-off.")
Expand Down
44 changes: 34 additions & 10 deletions bin/workflow_glue/report_utils/report_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -138,7 +138,7 @@ def calculate_diversity_metrics(counts_per_taxa_df):


def filter_by_abundance(
df, column_to_filter, abundance_threshold=0, column_to_group=None):
df, column_to_filter, abundance_threshold=0, columns_to_group=None):
"""Given a df, return a filtered dataframe after applying a threshold of abundances.
:param df (DataFrame): Dataframe with counts.
Expand All @@ -154,18 +154,18 @@ def filter_by_abundance(
abundance_threshold = round(
abundance_threshold * df[column_to_filter].sum())
# Group & filter them
if column_to_group:
if columns_to_group:
# Subset just columns that are going to be used
# E.g. not use lineages columns
# In case the df contains character/factor columns
interesting_cols = [
colname for colname in [
column_to_filter, column_to_group
column_to_filter, columns_to_group
] if colname in df.columns]
mini_df = df[interesting_cols]
mini_df = mini_df.groupby(column_to_group).sum()
mini_df = mini_df.groupby(columns_to_group).sum()
df_filtered = df[
df[column_to_group].isin( # list of species that satisfy the threshold
df[columns_to_group].isin( # list of species that satisfy the threshold
mini_df.loc[mini_df[column_to_filter] > abundance_threshold].index
)]
else:
Expand Down Expand Up @@ -288,7 +288,7 @@ def alignment_metrics(depth, stats):
return reference_stats.reset_index()


def depth2heatmap(depth, reference, min_cov=1):
def depth2heatmap(depth, reference, min_cov=1, windows=100):
"""
Calculate depth by windows for those references with a sequencing depth.
Expand All @@ -309,8 +309,9 @@ def depth2heatmap(depth, reference, min_cov=1):
# keep explicit array of ref_lens to avoid any assumptions on order
# use an np array to support broadcasting division later
ref_lens = np.zeros(n_seqs, dtype=np.uint64)

# heatmap matrix - declared as float type to support div later
ref_heatmap = np.zeros((n_seqs, 100), dtype=np.float64)
ref_heatmap = np.zeros((n_seqs, windows), dtype=np.float64)
for i, ref in enumerate(reference.itertuples()):
ref_ids[ref.Index] = i
ref_lens[i] = ref.endpos # ref_lens are 1 based endpos
Expand All @@ -325,8 +326,8 @@ def depth2heatmap(depth, reference, min_cov=1):
for row in depth.itertuples():
this_ref_id = ref_ids[row.ref]
this_ref_len = ref_lens[this_ref_id]
# convert row.pos to 0 based to ensure no window can be 100
this_window = floor((row.pos - 1) / this_ref_len * 100)
# convert row.pos to 0 based to ensure no window can be (window)
this_window = floor((row.pos - 1) / this_ref_len * windows)
ref_heatmap[this_ref_id, this_window] += row.depth

# calculate the average over all windows for the ref
Expand All @@ -335,7 +336,7 @@ def depth2heatmap(depth, reference, min_cov=1):
ref_mask = ref_mean_cov >= min_cov

# now convert window count cells to averages for plotting
ref_heatmap /= (ref_lens // 100)[:, None]
ref_heatmap /= (ref_lens // windows)[:, None]

# apply the mask to remove refs that do not meet the threshold
ref_heatmap = ref_heatmap[ref_mask]
Expand Down Expand Up @@ -467,3 +468,26 @@ def n_reads_pass(metadata):
df[cols_to_make_pc + ' (%)'] = df[cols_to_make_pc].apply(
lambda x: round(x / df[reference_column] * 100, 2))
return df


def is_not_empty_or_exit(input_file_or_dir):
    """Make sure the files/directories indeed contain something.

    Intended as an ``argparse`` ``type=`` callback: returns the argument
    unchanged when the check passes, otherwise terminates with an error.

    :param input_file_or_dir: path to an input file or directory.
    :return: `input_file_or_dir` unchanged if it is a non-empty file or a
        directory containing at least one entry.
    :raises SystemExit: if the file is empty, the directory is empty, or the
        path exists but is neither a file nor a directory.
    :raises FileNotFoundError: if the path does not exist.
    """
    input_path = Path(input_file_or_dir)
    # guard clauses instead of nested if/else; also drop the redundant
    # `Path(input_path)` re-wrap the original used for the directory case
    if not input_path.exists():
        raise FileNotFoundError(f"File/Dir not found: {input_file_or_dir}")
    if input_path.is_file():
        # NOTE: a compressed file always has non-zero size, so an "empty"
        # compressed input cannot be detected by this check.
        if input_path.stat().st_size > 0:
            return input_file_or_dir
        raise SystemExit(f"Empty input file: {input_file_or_dir}")
    if input_path.is_dir():
        if any(input_path.iterdir()):
            return input_file_or_dir
        raise SystemExit(f"Empty directory: {input_file_or_dir}")
    raise SystemExit(
        f"{input_file_or_dir} appears to be neither a file nor a directory")
11 changes: 11 additions & 0 deletions bin/workflow_glue/tests/conftest.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,11 @@
#!/usr/bin/env python
"""Pytests argument definitions."""


def pytest_addoption(parser):
    """Register the test-data location as a pytest command line option."""
    parser.addoption(
        "--test_data", action="store", default="/host/test_data")
2 changes: 2 additions & 0 deletions bin/workflow_glue/tests/pytest.ini
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
[pytest]
tmp_path_retention_policy=none
70 changes: 70 additions & 0 deletions bin/workflow_glue/tests/test_empty_files.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,70 @@
"""Test report errors when empty files are input."""

import pytest
from workflow_glue import report


# Arguments always passed to the report argparser: the positional output
# file name plus the simple (non file/dir) required flags. The file/dir
# inputs are appended per-case in `test_empty_inputs`.
BASE_PARAMS = [
    "wf-metagenomics-report.html",
    "--workflow_name", "wf-metagenomics-test",
    "--pipeline", "minimap2",
    "--taxonomic_rank", "S",
]

# One entry per file/directory input of the report argparser, as
# `[filename, flag, is_dir, no_test]`:
#   filename - basename used for the temp file/dir created for this input
#   flag     - the CLI flag it is passed under
#   is_dir   - True if the input is a directory (empty-dir case), else a file
#   no_test  - despite the name, True means the empty-input check IS run for
#              this entry; entries with False are skipped because they are
#              allowed to be empty (e.g. `metadata` in real-time)
INPUT_PARAMS = [
    # `[filename, flag, is_dir, no_test]`
    ["versions", "--versions", True, True],
    ["params", "--params", False, True],
    ["metadata", "--metadata", False, False],
    ["stats", "--read_stats", True, False],
    ["lineages", "--lineages", True, True],
    ["abundance_table_species", "--abundance_table", False, True],
    ["alignment_stats", "--align_stats", True, True],
    ["amr", "--amr", True, True],
]


def test_empty_inputs(tmp_path):
    """Test that the report script properly checks that relevant inputs aren't empty.

    For each file/directory input param that is not allowed to be empty, create
    an argument list with current param pointing to an empty file/dir and all
    other params pointing to valid file/directory.
    tmp_path is a pytest fixture to create tmp dir for testing
    """
    for empty_param, flag, is_empty_dir, no_test in INPUT_PARAMS:
        if not no_test:
            # don't need to test this one as it can be empty (e.g. `metadata` in
            # real-time)
            continue
        # Reset args list in each loop, to determine which param
        # will have the empty file/dir
        args = BASE_PARAMS.copy()
        # use an empty file / dir for `empty_param`
        empty_input_dir = tmp_path / f'{empty_param}_empty'
        empty_input_dir.mkdir()
        if is_empty_dir:
            args += [flag, str(empty_input_dir)]
        else:
            # Add empty file
            empty_input_file = empty_input_dir / 'empty.txt'
            empty_input_file.touch()
            args += [flag, str(empty_input_file)]
        # Build args for the remaining non-empty params; use distinct loop
        # names so the outer `flag` / `is_empty_dir` are not shadowed
        for other_param, other_flag, other_is_dir, _ in INPUT_PARAMS:
            if other_param == empty_param:
                continue
            # Again can be a dir with files or just a file
            fname = tmp_path / other_param
            if other_is_dir:
                # The dir should contain non empty files
                fname.mkdir(parents=True, exist_ok=True)
                (fname / f"{other_param}.txt").write_text('blo')
            else:
                fname.write_text('blo')
            # Add the flag and folder/file to args.
            args += [other_flag, str(fname)]
        # Define pattern for the expected message
        pattern = f"Empty {'directory' if is_empty_dir else 'input file'}"
        with pytest.raises(SystemExit, match=pattern):
            report.argparser().parse_args(args)
Loading

0 comments on commit e1bf542

Please sign in to comment.