Skip to content

Commit

Permalink
Merge pull request #243 from MontgomeryLab/issue-242
Browse files Browse the repository at this point in the history
Configuration: Samples Sheet validation
  • Loading branch information
taimontgomery authored Oct 22, 2022
2 parents 81ef5e8 + c65ae81 commit 1588297
Show file tree
Hide file tree
Showing 4 changed files with 249 additions and 106 deletions.
27 changes: 27 additions & 0 deletions tests/unit_test_helpers.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,10 +7,15 @@
import signal
import psutil
import shlex
import csv
import sys
import io
import os

from typing import List

from tiny.rna.configuration import CSVReader

rules_template = [{'Identity': ("Name", "N/A"),
'Strand': "both",
'Hierarchy': 0,
Expand All @@ -20,6 +25,28 @@
'Overlap': "partial"}]


def csv_factory(type: str, rows: List[dict], header=()):
    """Returns the file contents of the specified config csv as a string.

    Args:
        type: the config file to emulate; either "features.csv" or "samples.csv".
            Any other value terminates via sys.exit. (NOTE: the name shadows the
            `type` builtin, but is kept for backward compatibility with callers
            passing it as a keyword argument.)
        rows: row dicts keyed by the internal short names defined in
            Configuration.CSVReader (for brevity). The written header row, by
            contrast, carries the user-facing long column names, so the header
            does NOT match the fieldnames used in `rows`.
        header: unused; it is always rebuilt from CSVReader. Retained so the
            signature stays backward compatible.

    Returns:
        The csv file contents as a single string.
    """

    # Map the two supported filenames to their CSVReader sheet keys so the
    # per-file logic isn't duplicated across branches.
    sheet_keys = {"features.csv": "Features Sheet", "samples.csv": "Samples Sheet"}
    if type not in sheet_keys:
        sys.exit("Unsupported config file")

    field_map = CSVReader.tinyrna_sheet_fields[sheet_keys[type]]
    fields = list(field_map.values())                            # internal short names
    header = {short: long for long, short in field_map.items()}  # short -> user-facing

    csv_string = io.StringIO()
    writer = csv.DictWriter(csv_string, fieldnames=fields)
    writer.writerow(header)  # header row holds long names, hence not writeheader()
    writer.writerows(rows)

    return csv_string.getvalue()


def get_dir_tree(root_path: str) -> dict:
"""Returns a nested dictionary representation of a given directory tree.
Expand Down
69 changes: 67 additions & 2 deletions tests/unit_tests_configuration.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,10 +4,11 @@
import unittest
from unittest.mock import patch, mock_open, call

from tiny.rna.configuration import Configuration
from tiny.rna.configuration import Configuration, SamplesSheet
from unit_test_helpers import csv_factory


class ConfigurationTests(unittest.TestCase):
class BowtieIndexesTest(unittest.TestCase):
@classmethod
def setUpClass(self):
self.root_cfg_dir = os.path.abspath("../tiny/templates")
Expand Down Expand Up @@ -135,5 +136,69 @@ def test_verify_bowtie_build_outputs(self):
self.assertListEqual(config['bt_index_files'], expected_ebwt)
self.assertListEqual(mo.call_args_list, expected_writes)


class SamplesSheetTest(unittest.TestCase):

"""Does SamplesSheet catch multi-assignment of control condition?"""

def test_validate_control_group(self):
sheet = csv_factory("samples.csv", [
{'File': '1.fastq', 'Group': 'G1', 'Replicate': '1', 'Control': True, 'Normalization': ''}, # Good
{'File': '2.fastq', 'Group': 'G1', 'Replicate': '2', 'Control': True, 'Normalization': ''}, # Good
{'File': '3.fastq', 'Group': 'G2', 'Replicate': '1', 'Control': True, 'Normalization': ''} # Bad
]) # ^^^

exp_contains = r".*(multiple control conditions).*"
with self.assertRaisesRegex(AssertionError, exp_contains), \
patch('tiny.rna.configuration.open', mock_open(read_data=sheet)), \
patch('tiny.rna.configuration.os.path.isfile', return_value=True):
SamplesSheet('mock_filename')

"""Does SamplesSheet catch duplicate entries for the same group and rep?"""
def test_validate_group_rep(self):
sheet = csv_factory("samples.csv", [
{'File': '1.fastq', 'Group': 'G1', 'Replicate': '1', 'Control': True, 'Normalization': ''}, # Good
{'File': '2.fastq', 'Group': 'G1', 'Replicate': '2', 'Control': True, 'Normalization': ''}, # Good
{'File': '3.fastq', 'Group': 'G1', 'Replicate': '2', 'Control': True, 'Normalization': ''} # Bad
]) # ^^^ ^^^

exp_contains = r".*(same group and replicate).*"
with self.assertRaisesRegex(AssertionError, exp_contains), \
patch('tiny.rna.configuration.open', mock_open(read_data=sheet)), \
patch('tiny.rna.configuration.os.path.isfile', return_value=True):
SamplesSheet('mock_filename')

"""Does SamplesSheet catch fastq files that don't exist, have a bad file extension, or are listed more than once?"""
def test_validate_fastq_filepath(self):
csv_rows = [
{'File': '1.fastq', 'Group': 'G1', 'Replicate': '1', 'Control': True, 'Normalization': ''}, # Good
{'File': '1.fastq', 'Group': 'G1', 'Replicate': '2', 'Control': True, 'Normalization': ''}, # Bad
{'File': '2.fasta', 'Group': 'G2', 'Replicate': '1', 'Control': True, 'Normalization': ''} # Bad
] # ^^^^^^^
sheet = csv_factory("samples.csv", csv_rows)

# File doesn't exist
exp_contains = r".*(was not found).*"
with self.assertRaisesRegex(AssertionError, exp_contains), \
patch('tiny.rna.configuration.open', mock_open(read_data=sheet)):
SamplesSheet('mock_filename')

# Duplicate filename
exp_contains = r".*(listed more than once).*"
with self.assertRaisesRegex(AssertionError, exp_contains), \
patch('tiny.rna.configuration.open', mock_open(read_data=sheet)), \
patch('tiny.rna.configuration.os.path.isfile', return_value=True):
SamplesSheet('mock_filename')

# Bad file extension
exp_contains = r".*(\.fastq\(\.gz\) extension).*"
csv_rows.pop(0)
sheet = csv_factory("samples.csv", csv_rows)
with self.assertRaisesRegex(AssertionError, exp_contains), \
patch('tiny.rna.configuration.open', mock_open(read_data=sheet)), \
patch('tiny.rna.configuration.os.path.isfile', return_value=True):
SamplesSheet('mock_filename')


# Allow this test module to be run directly (outside a test runner / IDE)
if __name__ == '__main__':
    unittest.main()
95 changes: 40 additions & 55 deletions tests/unit_tests_counter.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,5 @@
import io
import os
import csv
import unittest

from unittest.mock import patch, mock_open
Expand All @@ -26,76 +25,62 @@ def setUpClass(self):
self.short_sam = helpers.read(self.short_sam_file)

self.strand = {'sense': tuple('+'), 'antisense': tuple('-'), 'both': ('+', '-')}
self.csv = staticmethod(helpers.csv_factory)

# Represents an unparsed Features Sheet row
# Key is the user-facing column header
self.csv_feat_row_dict = {
'Select for...': "Class",
'with value...': "CSR",
'Alias by...': "Alias",
'Tag': '',
'Hierarchy': "1",
'Strand': "antisense",
"5' End Nucleotide": '"C,G,U"', # Needs to be double-quoted due to commas
'Length': "all",
'Overlap': "Partial",
'Feature Source': "test_file.gff3"
'Key': "Class",
'Value': "CSR",
'Name': "Alias",
'Tag': "",
'Hierarchy': "1",
'Strand': "antisense",
"nt5end": '"C,G,U"', # Needs to be double-quoted due to commas
'Length': "all",
'Overlap': "Partial",
'Source': "test_file.gff3"
}

# Represents the parsed Features Sheet row above
# Key is the internal short name
_row = self.csv_feat_row_dict
self.parsed_feat_rule = [{
'Identity': (_row['Select for...'], _row['with value...']),
'Identity': (_row['Key'], _row['Value']),
'Tag': _row['Tag'],
'Hierarchy': int(_row['Hierarchy']),
'Strand': _row['Strand'],
'nt5end': _row["5' End Nucleotide"].upper().translate({ord('U'): 'T'}),
'nt5end': _row["nt5end"].upper().translate({ord('U'): 'T'}),
'Length': _row['Length'],
'Overlap': _row['Overlap'].lower()
}]

# Represents an unparsed Samples Sheet row
# Key is the user-facing column header
self.csv_samp_row_dict = {
'Input FASTQ Files': "test_file.fastq",
'Sample/Group Name': "test_group",
'Replicate Number': "0",
'Control': "",
'Normalization': ''
'File': "test_file.fastq",
'Group': "test_group",
'Replicate': "0",
'Control': "",
'Normalization': ""
}

# This is the same Samples Sheet row above, but with internal names
# It does NOT represent the parsed result of loading the Samples Sheet
_row = self.csv_samp_row_dict
self.parsed_samp_rule = {
'File': _row['Input FASTQ Files'],
'Group': _row['Sample/Group Name'],
'Replicate': _row['Replicate Number'],
'File': _row['File'],
'Group': _row['Group'],
'Replicate': _row['Replicate'],
'Control': _row['Control'],
'Normalization': _row['Normalization']
}

# === HELPERS ===

@staticmethod
def csv(type, rows, header=()):
if type == "features.csv":
header = ['Select for...', 'with value...', 'Alias by...', 'Tag', 'Hierarchy',
'Strand', "5' End Nucleotide", 'Length', 'Overlap', 'Feature Source']
elif type == "samples.csv":
header = ['Input FASTQ Files', 'Sample/Group Name', 'Replicate Number', 'Control', 'Normalization']

csv_string = io.StringIO()
writer = csv.DictWriter(csv_string, fieldnames=header)
writer.writeheader()
writer.writerows(rows)

return csv_string.getvalue()

def get_parsed_samples_row(self, row, exp_file):
def get_loaded_samples_row(self, row, exp_file):
return [{
'Name': "_rep_".join(row[i] for i in ["Sample/Group Name", "Replicate Number"]),
'Name': "_rep_".join(row[i] for i in ["Group", "Replicate"]),
'File': exp_file,
'Norm': row['Normalization']
}]
Expand All @@ -109,13 +94,13 @@ def test_load_samples_single_cmd(self):
inp_file = "test.fastq"
exp_file = from_here(mock_samp_sheet_path, "test_aligned_seqs.sam")

row = dict(self.csv_samp_row_dict, **{'Input FASTQ Files': inp_file})
row = dict(self.csv_samp_row_dict, **{'File': inp_file})
csv = self.csv("samples.csv", [row])

with patch('tiny.rna.configuration.open', mock_open(read_data=csv)):
inputs_step = counter.load_samples(mock_samp_sheet_path, is_pipeline=False)

expected_result = self.get_parsed_samples_row(row, exp_file)
expected_result = self.get_loaded_samples_row(row, exp_file)
self.assertEqual(inputs_step, expected_result)

"""Does load_samples correctly parse a single record samples.csv for pipeline invocation?"""
Expand All @@ -125,13 +110,13 @@ def test_load_samples_single_pipeline(self):
inp_file = "test.fastq"
exp_file = "test_aligned_seqs.sam"

row = dict(self.csv_samp_row_dict, **{'Input FASTQ Files': inp_file})
row = dict(self.csv_samp_row_dict, **{'File': inp_file})
csv = self.csv("samples.csv", [row])

with patch('tiny.rna.configuration.open', mock_open(read_data=csv)):
inputs_pipeline = counter.load_samples(mock_samp_sheet_path, is_pipeline=True)

expected_result = self.get_parsed_samples_row(row, exp_file)
expected_result = self.get_loaded_samples_row(row, exp_file)
self.assertEqual(inputs_pipeline, expected_result)

"""Does load_samples correctly handle duplicate samples? There should be no duplicates."""
Expand All @@ -150,21 +135,21 @@ def test_load_samples_duplicate(self):

def test_load_samples_sam(self):
sam_filename = "/fake/absolute/path/sample.sam"
row = dict(self.csv_samp_row_dict, **{'Input FASTQ Files': sam_filename})
row = dict(self.csv_samp_row_dict, **{'File': sam_filename})
csv = self.csv("samples.csv", [row])

with patch('tiny.rna.configuration.open', mock_open(read_data=csv)):
dummy_file = '/dev/null'
inputs = counter.load_samples(dummy_file, is_pipeline=False)

expected_result = self.get_parsed_samples_row(row, sam_filename)
expected_result = self.get_loaded_samples_row(row, sam_filename)
self.assertEqual(inputs, expected_result)

"""Does load_samples throw ValueError if a non-absolute path to a SAM file is provided?"""

def test_load_samples_nonabs_path(self):
bad = "./dne.sam"
row = dict(self.csv_samp_row_dict, **{'Input FASTQ Files': bad})
row = dict(self.csv_samp_row_dict, **{'File': bad})
csv = self.csv("samples.csv", [row])

expected_error = "The following file must be expressed as an absolute path:\n" + bad
Expand All @@ -178,7 +163,7 @@ def test_load_samples_nonabs_path(self):

def test_load_samples_bad_extension(self):
bad = "./bad_extension.xyz"
row = dict(self.csv_samp_row_dict, **{'Input FASTQ Files': bad})
row = dict(self.csv_samp_row_dict, **{'File': bad})
csv = self.csv("samples.csv", [row])

expected_error = r"The filenames defined in your Samples Sheet must have a \.fastq\(\.gz\) or \.sam extension\.\n" \
Expand All @@ -201,8 +186,8 @@ def test_load_config_single_cmd(self):
ruleset, gff_files = counter.load_config(dummy_file, is_pipeline=False)

expected_ruleset = self.parsed_feat_rule
expected_gff_file = from_here(dummy_file, row['Feature Source'])
expected_gff_ret = defaultdict(list, zip([expected_gff_file], [[row['Alias by...']]]))
expected_gff_file = from_here(dummy_file, row['Source'])
expected_gff_ret = defaultdict(list, zip([expected_gff_file], [[row['Name']]]))

self.assertEqual(gff_files, expected_gff_ret)
self.assertEqual(ruleset, expected_ruleset)
Expand All @@ -219,8 +204,8 @@ def test_load_config_single_pipeline(self):
ruleset, gff_files = counter.load_config(dummy_file, is_pipeline=True)

expected_ruleset = self.parsed_feat_rule
expected_gff_file = os.path.basename(row['Feature Source'])
expected_gff_ret = defaultdict(list, zip([expected_gff_file], [[row['Alias by...']]]))
expected_gff_file = os.path.basename(row['Source'])
expected_gff_ret = defaultdict(list, zip([expected_gff_file], [[row['Name']]]))

self.assertEqual(gff_files, expected_gff_ret)
self.assertEqual(ruleset, expected_ruleset)
Expand All @@ -237,8 +222,8 @@ def test_load_config_duplicate_rules(self):
ruleset, gff_files = counter.load_config(dummy_filename, False)

expected_ruleset = self.parsed_feat_rule
expected_gff_file = from_here(dummy_filename, row['Feature Source'])
expected_gff_ret = defaultdict(list, zip([expected_gff_file], [[row['Alias by...']]]))
expected_gff_file = from_here(dummy_filename, row['Source'])
expected_gff_ret = defaultdict(list, zip([expected_gff_file], [[row['Name']]]))

self.assertEqual(gff_files, expected_gff_ret)
self.assertEqual(ruleset, expected_ruleset)
Expand All @@ -247,7 +232,7 @@ def test_load_config_duplicate_rules(self):

def test_load_config_rna_to_cDNA(self):
row = self.csv_feat_row_dict.copy()
row["5' End Nucleotide"] = 'U'
row["nt5end"] = 'U'
csv = self.csv("features.csv", [row])

with patch('tiny.rna.configuration.open', mock_open(read_data=csv)):
Expand All @@ -260,15 +245,15 @@ def test_load_config_rna_to_cDNA(self):

def test_load_config_id_name_attr(self):
row = self.csv_feat_row_dict.copy()
row['Alias by...'] = 'ID'
row['Name'] = 'ID'
csv = self.csv("features.csv", [row])

with patch('tiny.rna.configuration.open', mock_open(read_data=csv)):
dummy_file = '/dev/null'
_, gff_files = counter.load_config(dummy_file, False)

# Expect {file: [empty Name Attribute list]}
from_dummy = from_here(dummy_file, row['Feature Source'])
from_dummy = from_here(dummy_file, row['Source'])
expected = defaultdict(list, zip([from_dummy], [[]]))
self.assertEqual(gff_files, expected)

Expand Down
Loading

0 comments on commit 1588297

Please sign in to comment.