Skip to content

Commit

Permalink
Merge branch 'CW-3745-add-pytests' into 'dev'
Browse files Browse the repository at this point in the history
Unit tests [CW-3745]

See merge request epi2melabs/workflows/wf-metagenomics!194
  • Loading branch information
nggvs committed Dec 13, 2024
2 parents c9c81a9 + 117ac3e commit e1bf542
Show file tree
Hide file tree
Showing 13 changed files with 473 additions and 31 deletions.
2 changes: 1 addition & 1 deletion CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@ The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.1.0/),
and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).


## [Unreleased]
## [v2.12.0]
### Added
- `bracken_threshold` parameter to adjust bracken minimum read threshold, default 10.
### Fixed
Expand Down
35 changes: 18 additions & 17 deletions bin/workflow_glue/report.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,7 @@
import workflow_glue.diversity as diversity
import workflow_glue.report_utils.report_utils as report_utils


from .util import get_named_logger, wf_parser # noqa: ABS101

# Setup simple globals
Expand All @@ -34,8 +35,8 @@ def amr_section(amr_data, html_id):
for i, (gene, data) in enumerate(amr_data.items()):
_head = html_tags.h2(id=str(i), style="border: 1px solid rgba(0,0,0,.125);\
border-collapse: collapse;\
padding:0;\
margin-bottom:0")
padding:0;\
margin-bottom:0")
_button = html_tags.button(
html_tags.span(html_tags.b(gene)),
html_tags.span(
Expand Down Expand Up @@ -424,7 +425,7 @@ def main(args):
params = json.load(f)
amr_db = params["amr_db"].capitalize()
p(f"""Detection of acquired AMR genes within sample using Abricate
with the {amr_db} database.
with the {amr_db} database.
Please note that SNP-mediated AMR cannot be detected.
""")
amr_data = report_utils.parse_amr(args.amr)
Expand Down Expand Up @@ -516,38 +517,38 @@ def argparser():
help="sample metadata")
parser.add_argument(
"--read_stats", nargs='+', required=False,
help="Fastcat per-read stats, ordered as per entries in --metadata "
)
help="Fastcat per-read stats, ordered as per entries in --metadata",
type=report_utils.is_not_empty_or_exit)
parser.add_argument(
"--lineages", nargs='+', required=True,
help="Read lineage file.")
help="Read lineage file.",
type=report_utils.is_not_empty_or_exit)
parser.add_argument(
"--align_stats", required=False,
help="Folder containing the mapping and depth statistics in TSV format.")
help="Folder containing the mapping and depth statistics in TSV format.",
type=report_utils.is_not_empty_or_exit)
parser.add_argument(
"--abundance_table", required=True,
help="Read abundance tsv file.")
help="Read abundance tsv file.",
type=report_utils.is_not_empty_or_exit)
parser.add_argument(
'--taxonomic_rank', required=True, choices=["S", "G", "k", "F", "O", "C", "P"],
help="Taxonomic rank.")
parser.add_argument(
"--versions", required=True,
help="directory containing CSVs containing name,version.")
help="directory containing CSVs containing name,version.",
type=report_utils.is_not_empty_or_exit)
parser.add_argument(
"--params", default=None, required=True,
help="A JSON file containing the workflow parameter key/values")
parser.add_argument(
"--revision", default='unknown',
help="git branch/tag of the executed workflow")
parser.add_argument(
"--commit", default='unknown',
help="git commit of the executed workflow")
help="A JSON file containing the workflow parameter key/values",
type=report_utils.is_not_empty_or_exit)
parser.add_argument(
"--pipeline", default='kraken2', choices=["kraken2", "minimap2", "real_time"],
help="kraken2, minimap2 or real_time")
parser.add_argument(
"--amr", default=None,
help="Path to combined AMR results")
help="Path to combined AMR results",
type=report_utils.is_not_empty_or_exit)
parser.add_argument(
"--abundance_threshold", default=1, type=float,
help="Remove those taxa whose abundance is below this cut-off.")
Expand Down
44 changes: 34 additions & 10 deletions bin/workflow_glue/report_utils/report_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -138,7 +138,7 @@ def calculate_diversity_metrics(counts_per_taxa_df):


def filter_by_abundance(
df, column_to_filter, abundance_threshold=0, column_to_group=None):
df, column_to_filter, abundance_threshold=0, columns_to_group=None):
"""Given a df, return a filtered dataframe after applying a threshold of abundances.
:param df (DataFrame): Dataframe with counts.
Expand All @@ -154,18 +154,18 @@ def filter_by_abundance(
abundance_threshold = round(
abundance_threshold * df[column_to_filter].sum())
# Group & filter them
if column_to_group:
if columns_to_group:
# Subset just columns that are going to be used
# E.g. not use lineages columns
# In case the df contains character/factor columns
interesting_cols = [
colname for colname in [
column_to_filter, column_to_group
column_to_filter, columns_to_group
] if colname in df.columns]
mini_df = df[interesting_cols]
mini_df = mini_df.groupby(column_to_group).sum()
mini_df = mini_df.groupby(columns_to_group).sum()
df_filtered = df[
df[column_to_group].isin( # list of species that satisfy the threshold
df[columns_to_group].isin( # list of species that satisfy the threshold
mini_df.loc[mini_df[column_to_filter] > abundance_threshold].index
)]
else:
Expand Down Expand Up @@ -288,7 +288,7 @@ def alignment_metrics(depth, stats):
return reference_stats.reset_index()


def depth2heatmap(depth, reference, min_cov=1):
def depth2heatmap(depth, reference, min_cov=1, windows=100):
"""
Calculate depth by windows for those references with a sequencing depth.
Expand All @@ -309,8 +309,9 @@ def depth2heatmap(depth, reference, min_cov=1):
# keep explicit array of ref_lens to avoid any assumptions on order
# use an np array to support broadcasting division later
ref_lens = np.zeros(n_seqs, dtype=np.uint64)

# heatmap matrix - declared as float type to support div later
ref_heatmap = np.zeros((n_seqs, 100), dtype=np.float64)
ref_heatmap = np.zeros((n_seqs, windows), dtype=np.float64)
for i, ref in enumerate(reference.itertuples()):
ref_ids[ref.Index] = i
ref_lens[i] = ref.endpos # ref_lens are 1 based endpos
Expand All @@ -325,8 +326,8 @@ def depth2heatmap(depth, reference, min_cov=1):
for row in depth.itertuples():
this_ref_id = ref_ids[row.ref]
this_ref_len = ref_lens[this_ref_id]
# convert row.pos to 0 based to ensure no window can be 100
this_window = floor((row.pos - 1) / this_ref_len * 100)
# convert row.pos to 0 based to ensure no window can be (window)
this_window = floor((row.pos - 1) / this_ref_len * windows)
ref_heatmap[this_ref_id, this_window] += row.depth

# calculate the average over all windows for the ref
Expand All @@ -335,7 +336,7 @@ def depth2heatmap(depth, reference, min_cov=1):
ref_mask = ref_mean_cov >= min_cov

# now convert window count cells to averages for plotting
ref_heatmap /= (ref_lens // 100)[:, None]
ref_heatmap /= (ref_lens // windows)[:, None]

# apply the mask to remove refs that do not meet the threshold
ref_heatmap = ref_heatmap[ref_mask]
Expand Down Expand Up @@ -467,3 +468,26 @@ def n_reads_pass(metadata):
df[cols_to_make_pc + ' (%)'] = df[cols_to_make_pc].apply(
lambda x: round(x / df[reference_column] * 100, 2))
return df


def is_not_empty_or_exit(input_file_or_dir):
    """Make sure the files/directories indeed contain something.

    Intended as an ``argparse`` ``type=`` callback: returns the argument
    unchanged when the check passes, otherwise terminates with an error.

    :param input_file_or_dir: path to an input file or directory.
    :return: `input_file_or_dir` unchanged if it is a non-empty file or a
        directory containing at least one entry.
    :raises SystemExit: if the file is empty, the directory is empty, or the
        path exists but is neither a file nor a directory.
    :raises FileNotFoundError: if the path does not exist.
    """
    input_path = Path(input_file_or_dir)
    # guard clauses instead of nested if/else; also drop the redundant
    # `Path(input_path)` re-wrap the original used for the directory case
    if not input_path.exists():
        raise FileNotFoundError(f"File/Dir not found: {input_file_or_dir}")
    if input_path.is_file():
        # NOTE: a compressed file always has non-zero size, so an "empty"
        # compressed input cannot be detected by this check.
        if input_path.stat().st_size > 0:
            return input_file_or_dir
        raise SystemExit(f"Empty input file: {input_file_or_dir}")
    if input_path.is_dir():
        if any(input_path.iterdir()):
            return input_file_or_dir
        raise SystemExit(f"Empty directory: {input_file_or_dir}")
    raise SystemExit(
        f"{input_file_or_dir} appears to be neither a file nor a directory")
11 changes: 11 additions & 0 deletions bin/workflow_glue/tests/conftest.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,11 @@
#!/usr/bin/env python
"""Pytests argument definitions."""


def pytest_addoption(parser):
    """Register the test-data location as a pytest command line option."""
    parser.addoption(
        "--test_data", action="store", default="/host/test_data")
2 changes: 2 additions & 0 deletions bin/workflow_glue/tests/pytest.ini
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
[pytest]
tmp_path_retention_policy=none
70 changes: 70 additions & 0 deletions bin/workflow_glue/tests/test_empty_files.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,70 @@
"""Test report errors when empty files are input."""

import pytest
from workflow_glue import report


# Arguments always passed to the report argparser: the positional output
# file name plus the simple (non file/dir) required flags. The file/dir
# inputs are appended per-case in `test_empty_inputs`.
BASE_PARAMS = [
    "wf-metagenomics-report.html",
    "--workflow_name", "wf-metagenomics-test",
    "--pipeline", "minimap2",
    "--taxonomic_rank", "S",
]

# One entry per file/directory input of the report argparser, as
# `[filename, flag, is_dir, no_test]`:
#   filename - basename used for the temp file/dir created for this input
#   flag     - the CLI flag it is passed under
#   is_dir   - True if the input is a directory (empty-dir case), else a file
#   no_test  - despite the name, True means the empty-input check IS run for
#              this entry; entries with False are skipped because they are
#              allowed to be empty (e.g. `metadata` in real-time)
INPUT_PARAMS = [
    # `[filename, flag, is_dir, no_test]`
    ["versions", "--versions", True, True],
    ["params", "--params", False, True],
    ["metadata", "--metadata", False, False],
    ["stats", "--read_stats", True, False],
    ["lineages", "--lineages", True, True],
    ["abundance_table_species", "--abundance_table", False, True],
    ["alignment_stats", "--align_stats", True, True],
    ["amr", "--amr", True, True],
]


def test_empty_inputs(tmp_path):
    """Test that the report script properly checks that relevant inputs aren't empty.

    For each file/directory input param that is not allowed to be empty, create
    an argument list with current param pointing to an empty file/dir and all
    other params pointing to valid file/directory.
    tmp_path is a pytest fixture to create tmp dir for testing
    """
    for empty_param, flag, is_empty_dir, no_test in INPUT_PARAMS:
        if not no_test:
            # don't need to test this one as it can be empty (e.g. `metadata` in
            # real-time)
            continue
        # Reset args list in each loop, to determine which param
        # will have the empty file/dir
        args = BASE_PARAMS.copy()
        # use an empty file / dir for `empty_param`
        empty_input_dir = tmp_path / f'{empty_param}_empty'
        empty_input_dir.mkdir()
        if is_empty_dir:
            args += [flag, str(empty_input_dir)]
        else:
            # Add empty file
            empty_input_file = empty_input_dir / 'empty.txt'
            empty_input_file.touch()
            args += [flag, str(empty_input_file)]
        # Build args for the remaining non-empty params; use distinct loop
        # names so the outer `flag` / `is_empty_dir` are not shadowed
        for other_param, other_flag, other_is_dir, _ in INPUT_PARAMS:
            if other_param == empty_param:
                continue
            # Again can be a dir with files or just a file
            fname = tmp_path / other_param
            if other_is_dir:
                # The dir should contain non empty files
                fname.mkdir(parents=True, exist_ok=True)
                (fname / f"{other_param}.txt").write_text('blo')
            else:
                fname.write_text('blo')
            # Add the flag and folder/file to args.
            args += [other_flag, str(fname)]
        # Define pattern for the expected message
        pattern = f"Empty {'directory' if is_empty_dir else 'input file'}"
        with pytest.raises(SystemExit, match=pattern):
            report.argparser().parse_args(args)
Loading

0 comments on commit e1bf542

Please sign in to comment.