Skip to content

Commit

Permalink
Merge pull request #243 from MontgomeryLab/issue-242
Browse files Browse the repository at this point in the history
Configuration: Samples Sheet validation
  • Loading branch information
taimontgomery authored Oct 22, 2022
2 parents 81ef5e8 + c65ae81 commit 1588297
Show file tree
Hide file tree
Showing 4 changed files with 249 additions and 106 deletions.
27 changes: 27 additions & 0 deletions tests/unit_test_helpers.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,10 +7,15 @@
import signal
import psutil
import shlex
import csv
import sys
import io
import os

from typing import List

from tiny.rna.configuration import CSVReader

rules_template = [{'Identity': ("Name", "N/A"),
'Strand': "both",
'Hierarchy': 0,
Expand All @@ -20,6 +25,28 @@
'Overlap': "partial"}]


def csv_factory(type: str, rows: List[dict], header=()):
    """Returns the file contents of the specified config csv as a string.

    Args:
        type: the config file to emulate; either "features.csv" or "samples.csv".
            Any other value terminates via sys.exit. (NOTE: the name shadows the
            `type` builtin, but is kept for backward compatibility with callers
            passing it as a keyword argument.)
        rows: row dicts keyed by the internal short names defined in
            Configuration.CSVReader (for brevity). The written header row, by
            contrast, carries the user-facing long column names, so the header
            does NOT match the fieldnames used in `rows`.
        header: unused; it is always rebuilt from CSVReader. Retained so the
            signature stays backward compatible.

    Returns:
        The csv file contents as a single string.
    """

    # Map the two supported filenames to their CSVReader sheet keys so the
    # per-file logic isn't duplicated across branches.
    sheet_keys = {"features.csv": "Features Sheet", "samples.csv": "Samples Sheet"}
    if type not in sheet_keys:
        sys.exit("Unsupported config file")

    field_map = CSVReader.tinyrna_sheet_fields[sheet_keys[type]]
    fields = list(field_map.values())                            # internal short names
    header = {short: long for long, short in field_map.items()}  # short -> user-facing

    csv_string = io.StringIO()
    writer = csv.DictWriter(csv_string, fieldnames=fields)
    writer.writerow(header)  # header row holds long names, hence not writeheader()
    writer.writerows(rows)

    return csv_string.getvalue()


def get_dir_tree(root_path: str) -> dict:
"""Returns a nested dictionary representation of a given directory tree.
Expand Down
69 changes: 67 additions & 2 deletions tests/unit_tests_configuration.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,10 +4,11 @@
import unittest
from unittest.mock import patch, mock_open, call

from tiny.rna.configuration import Configuration
from tiny.rna.configuration import Configuration, SamplesSheet
from unit_test_helpers import csv_factory


class ConfigurationTests(unittest.TestCase):
class BowtieIndexesTest(unittest.TestCase):
@classmethod
def setUpClass(self):
self.root_cfg_dir = os.path.abspath("../tiny/templates")
Expand Down Expand Up @@ -135,5 +136,69 @@ def test_verify_bowtie_build_outputs(self):
self.assertListEqual(config['bt_index_files'], expected_ebwt)
self.assertListEqual(mo.call_args_list, expected_writes)


class SamplesSheetTest(unittest.TestCase):

"""Does SamplesSheet catch multi-assignment of control condition?"""

def test_validate_control_group(self):
sheet = csv_factory("samples.csv", [
{'File': '1.fastq', 'Group': 'G1', 'Replicate': '1', 'Control': True, 'Normalization': ''}, # Good
{'File': '2.fastq', 'Group': 'G1', 'Replicate': '2', 'Control': True, 'Normalization': ''}, # Good
{'File': '3.fastq', 'Group': 'G2', 'Replicate': '1', 'Control': True, 'Normalization': ''} # Bad
]) # ^^^

exp_contains = r".*(multiple control conditions).*"
with self.assertRaisesRegex(AssertionError, exp_contains), \
patch('tiny.rna.configuration.open', mock_open(read_data=sheet)), \
patch('tiny.rna.configuration.os.path.isfile', return_value=True):
SamplesSheet('mock_filename')

"""Does SamplesSheet catch duplicate entries for the same group and rep?"""
def test_validate_group_rep(self):
sheet = csv_factory("samples.csv", [
{'File': '1.fastq', 'Group': 'G1', 'Replicate': '1', 'Control': True, 'Normalization': ''}, # Good
{'File': '2.fastq', 'Group': 'G1', 'Replicate': '2', 'Control': True, 'Normalization': ''}, # Good
{'File': '3.fastq', 'Group': 'G1', 'Replicate': '2', 'Control': True, 'Normalization': ''} # Bad
]) # ^^^ ^^^

exp_contains = r".*(same group and replicate).*"
with self.assertRaisesRegex(AssertionError, exp_contains), \
patch('tiny.rna.configuration.open', mock_open(read_data=sheet)), \
patch('tiny.rna.configuration.os.path.isfile', return_value=True):
SamplesSheet('mock_filename')

"""Does SamplesSheet catch fastq files that don't exist, have a bad file extension, or are listed more than once?"""
def test_validate_fastq_filepath(self):
csv_rows = [
{'File': '1.fastq', 'Group': 'G1', 'Replicate': '1', 'Control': True, 'Normalization': ''}, # Good
{'File': '1.fastq', 'Group': 'G1', 'Replicate': '2', 'Control': True, 'Normalization': ''}, # Bad
{'File': '2.fasta', 'Group': 'G2', 'Replicate': '1', 'Control': True, 'Normalization': ''} # Bad
] # ^^^^^^^
sheet = csv_factory("samples.csv", csv_rows)

# File doesn't exist
exp_contains = r".*(was not found).*"
with self.assertRaisesRegex(AssertionError, exp_contains), \
patch('tiny.rna.configuration.open', mock_open(read_data=sheet)):
SamplesSheet('mock_filename')

# Duplicate filename
exp_contains = r".*(listed more than once).*"
with self.assertRaisesRegex(AssertionError, exp_contains), \
patch('tiny.rna.configuration.open', mock_open(read_data=sheet)), \
patch('tiny.rna.configuration.os.path.isfile', return_value=True):
SamplesSheet('mock_filename')

# Bad file extension
exp_contains = r".*(\.fastq\(\.gz\) extension).*"
csv_rows.pop(0)
sheet = csv_factory("samples.csv", csv_rows)
with self.assertRaisesRegex(AssertionError, exp_contains), \
patch('tiny.rna.configuration.open', mock_open(read_data=sheet)), \
patch('tiny.rna.configuration.os.path.isfile', return_value=True):
SamplesSheet('mock_filename')


# Allow this test module to be run directly (outside a test runner / IDE)
if __name__ == '__main__':
    unittest.main()
95 changes: 40 additions & 55 deletions tests/unit_tests_counter.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,5 @@
import io
import os
import csv
import unittest

from unittest.mock import patch, mock_open
Expand All @@ -26,76 +25,62 @@ def setUpClass(self):
self.short_sam = helpers.read(self.short_sam_file)

self.strand = {'sense': tuple('+'), 'antisense': tuple('-'), 'both': ('+', '-')}
self.csv = staticmethod(helpers.csv_factory)

# Represents an unparsed Features Sheet row
# Key is the user-facing column header
self.csv_feat_row_dict = {
'Select for...': "Class",
'with value...': "CSR",
'Alias by...': "Alias",
'Tag': '',
'Hierarchy': "1",
'Strand': "antisense",
"5' End Nucleotide": '"C,G,U"', # Needs to be double-quoted due to commas
'Length': "all",
'Overlap': "Partial",
'Feature Source': "test_file.gff3"
'Key': "Class",
'Value': "CSR",
'Name': "Alias",
'Tag': "",
'Hierarchy': "1",
'Strand': "antisense",
"nt5end": '"C,G,U"', # Needs to be double-quoted due to commas
'Length': "all",
'Overlap': "Partial",
'Source': "test_file.gff3"
}

# Represents the parsed Features Sheet row above
# Key is the internal short name
_row = self.csv_feat_row_dict
self.parsed_feat_rule = [{
'Identity': (_row['Select for...'], _row['with value...']),
'Identity': (_row['Key'], _row['Value']),
'Tag': _row['Tag'],
'Hierarchy': int(_row['Hierarchy']),
'Strand': _row['Strand'],
'nt5end': _row["5' End Nucleotide"].upper().translate({ord('U'): 'T'}),
'nt5end': _row["nt5end"].upper().translate({ord('U'): 'T'}),
'Length': _row['Length'],
'Overlap': _row['Overlap'].lower()
}]

# Represents an unparsed Samples Sheet row
# Key is the user-facing column header
self.csv_samp_row_dict = {
'Input FASTQ Files': "test_file.fastq",
'Sample/Group Name': "test_group",
'Replicate Number': "0",
'Control': "",
'Normalization': ''
'File': "test_file.fastq",
'Group': "test_group",
'Replicate': "0",
'Control': "",
'Normalization': ""
}

# This is the same Samples Sheet row above, but with internal names
# It does NOT represent the parsed result of loading the Samples Sheet
_row = self.csv_samp_row_dict
self.parsed_samp_rule = {
'File': _row['Input FASTQ Files'],
'Group': _row['Sample/Group Name'],
'Replicate': _row['Replicate Number'],
'File': _row['File'],
'Group': _row['Group'],
'Replicate': _row['Replicate'],
'Control': _row['Control'],
'Normalization': _row['Normalization']
}

# === HELPERS ===

@staticmethod
def csv(type, rows, header=()):
if type == "features.csv":
header = ['Select for...', 'with value...', 'Alias by...', 'Tag', 'Hierarchy',
'Strand', "5' End Nucleotide", 'Length', 'Overlap', 'Feature Source']
elif type == "samples.csv":
header = ['Input FASTQ Files', 'Sample/Group Name', 'Replicate Number', 'Control', 'Normalization']

csv_string = io.StringIO()
writer = csv.DictWriter(csv_string, fieldnames=header)
writer.writeheader()
writer.writerows(rows)

return csv_string.getvalue()

def get_parsed_samples_row(self, row, exp_file):
def get_loaded_samples_row(self, row, exp_file):
return [{
'Name': "_rep_".join(row[i] for i in ["Sample/Group Name", "Replicate Number"]),
'Name': "_rep_".join(row[i] for i in ["Group", "Replicate"]),
'File': exp_file,
'Norm': row['Normalization']
}]
Expand All @@ -109,13 +94,13 @@ def test_load_samples_single_cmd(self):
inp_file = "test.fastq"
exp_file = from_here(mock_samp_sheet_path, "test_aligned_seqs.sam")

row = dict(self.csv_samp_row_dict, **{'Input FASTQ Files': inp_file})
row = dict(self.csv_samp_row_dict, **{'File': inp_file})
csv = self.csv("samples.csv", [row])

with patch('tiny.rna.configuration.open', mock_open(read_data=csv)):
inputs_step = counter.load_samples(mock_samp_sheet_path, is_pipeline=False)

expected_result = self.get_parsed_samples_row(row, exp_file)
expected_result = self.get_loaded_samples_row(row, exp_file)
self.assertEqual(inputs_step, expected_result)

"""Does load_samples correctly parse a single record samples.csv for pipeline invocation?"""
Expand All @@ -125,13 +110,13 @@ def test_load_samples_single_pipeline(self):
inp_file = "test.fastq"
exp_file = "test_aligned_seqs.sam"

row = dict(self.csv_samp_row_dict, **{'Input FASTQ Files': inp_file})
row = dict(self.csv_samp_row_dict, **{'File': inp_file})
csv = self.csv("samples.csv", [row])

with patch('tiny.rna.configuration.open', mock_open(read_data=csv)):
inputs_pipeline = counter.load_samples(mock_samp_sheet_path, is_pipeline=True)

expected_result = self.get_parsed_samples_row(row, exp_file)
expected_result = self.get_loaded_samples_row(row, exp_file)
self.assertEqual(inputs_pipeline, expected_result)

"""Does load_samples correctly handle duplicate samples? There should be no duplicates."""
Expand All @@ -150,21 +135,21 @@ def test_load_samples_duplicate(self):

def test_load_samples_sam(self):
sam_filename = "/fake/absolute/path/sample.sam"
row = dict(self.csv_samp_row_dict, **{'Input FASTQ Files': sam_filename})
row = dict(self.csv_samp_row_dict, **{'File': sam_filename})
csv = self.csv("samples.csv", [row])

with patch('tiny.rna.configuration.open', mock_open(read_data=csv)):
dummy_file = '/dev/null'
inputs = counter.load_samples(dummy_file, is_pipeline=False)

expected_result = self.get_parsed_samples_row(row, sam_filename)
expected_result = self.get_loaded_samples_row(row, sam_filename)
self.assertEqual(inputs, expected_result)

"""Does load_samples throw ValueError if a non-absolute path to a SAM file is provided?"""

def test_load_samples_nonabs_path(self):
bad = "./dne.sam"
row = dict(self.csv_samp_row_dict, **{'Input FASTQ Files': bad})
row = dict(self.csv_samp_row_dict, **{'File': bad})
csv = self.csv("samples.csv", [row])

expected_error = "The following file must be expressed as an absolute path:\n" + bad
Expand All @@ -178,7 +163,7 @@ def test_load_samples_nonabs_path(self):

def test_load_samples_bad_extension(self):
bad = "./bad_extension.xyz"
row = dict(self.csv_samp_row_dict, **{'Input FASTQ Files': bad})
row = dict(self.csv_samp_row_dict, **{'File': bad})
csv = self.csv("samples.csv", [row])

expected_error = r"The filenames defined in your Samples Sheet must have a \.fastq\(\.gz\) or \.sam extension\.\n" \
Expand All @@ -201,8 +186,8 @@ def test_load_config_single_cmd(self):
ruleset, gff_files = counter.load_config(dummy_file, is_pipeline=False)

expected_ruleset = self.parsed_feat_rule
expected_gff_file = from_here(dummy_file, row['Feature Source'])
expected_gff_ret = defaultdict(list, zip([expected_gff_file], [[row['Alias by...']]]))
expected_gff_file = from_here(dummy_file, row['Source'])
expected_gff_ret = defaultdict(list, zip([expected_gff_file], [[row['Name']]]))

self.assertEqual(gff_files, expected_gff_ret)
self.assertEqual(ruleset, expected_ruleset)
Expand All @@ -219,8 +204,8 @@ def test_load_config_single_pipeline(self):
ruleset, gff_files = counter.load_config(dummy_file, is_pipeline=True)

expected_ruleset = self.parsed_feat_rule
expected_gff_file = os.path.basename(row['Feature Source'])
expected_gff_ret = defaultdict(list, zip([expected_gff_file], [[row['Alias by...']]]))
expected_gff_file = os.path.basename(row['Source'])
expected_gff_ret = defaultdict(list, zip([expected_gff_file], [[row['Name']]]))

self.assertEqual(gff_files, expected_gff_ret)
self.assertEqual(ruleset, expected_ruleset)
Expand All @@ -237,8 +222,8 @@ def test_load_config_duplicate_rules(self):
ruleset, gff_files = counter.load_config(dummy_filename, False)

expected_ruleset = self.parsed_feat_rule
expected_gff_file = from_here(dummy_filename, row['Feature Source'])
expected_gff_ret = defaultdict(list, zip([expected_gff_file], [[row['Alias by...']]]))
expected_gff_file = from_here(dummy_filename, row['Source'])
expected_gff_ret = defaultdict(list, zip([expected_gff_file], [[row['Name']]]))

self.assertEqual(gff_files, expected_gff_ret)
self.assertEqual(ruleset, expected_ruleset)
Expand All @@ -247,7 +232,7 @@ def test_load_config_duplicate_rules(self):

def test_load_config_rna_to_cDNA(self):
row = self.csv_feat_row_dict.copy()
row["5' End Nucleotide"] = 'U'
row["nt5end"] = 'U'
csv = self.csv("features.csv", [row])

with patch('tiny.rna.configuration.open', mock_open(read_data=csv)):
Expand All @@ -260,15 +245,15 @@ def test_load_config_rna_to_cDNA(self):

def test_load_config_id_name_attr(self):
row = self.csv_feat_row_dict.copy()
row['Alias by...'] = 'ID'
row['Name'] = 'ID'
csv = self.csv("features.csv", [row])

with patch('tiny.rna.configuration.open', mock_open(read_data=csv)):
dummy_file = '/dev/null'
_, gff_files = counter.load_config(dummy_file, False)

# Expect {file: [empty Name Attribute list]}
from_dummy = from_here(dummy_file, row['Feature Source'])
from_dummy = from_here(dummy_file, row['Source'])
expected = defaultdict(list, zip([from_dummy], [[]]))
self.assertEqual(gff_files, expected)

Expand Down
Loading

0 comments on commit 1588297

Please sign in to comment.