From 113ae1c65b31a5bc2c53a17f4aa18f96eadd6303 Mon Sep 17 00:00:00 2001 From: Alex Tate <0xalextate@gmail.com> Date: Sat, 22 Apr 2023 17:46:41 -0700 Subject: [PATCH 01/18] Updating the SamplesSheet class so that it can be used in tiny-count (or any other pipeline step). This means the Samples Sheet will be subject to many of the same validation steps in tiny-count standalone runs as it is at pipeline startup. Additionally: - The normalization column is now validated and stored by the class - The samples sheet can contain .sam and .bam files if it is used in a standalone run context - File paths are properly resolved when used in a pipeline step context - tiny-deseq-related validations (control condition and deseq2 compatibility checks) are skipped when used in a standalone run or pipeline step context. They are only validated at pipeline startup. - The class now reports the filename and row number if any exception is raised during csv parsing --- tiny/rna/configuration.py | 125 +++++++++++++++++++++++++++----------- 1 file changed, 91 insertions(+), 34 deletions(-) diff --git a/tiny/rna/configuration.py b/tiny/rna/configuration.py index 53b7e0fc..04e56ac3 100644 --- a/tiny/rna/configuration.py +++ b/tiny/rna/configuration.py @@ -10,12 +10,12 @@ from pkg_resources import resource_filename from collections import Counter, OrderedDict, defaultdict -from typing import Union, Any, List, Dict +from typing import Union, Any, List, Dict, Optional from glob import glob from tiny.rna.compatibility import RunConfigCompatibility from tiny.rna.counter.validation import GFFValidator -from tiny.rna.util import get_timestamp, get_r_safename +from tiny.rna.util import get_timestamp, get_r_safename, append_to_exception class ConfigBase: @@ -265,13 +265,13 @@ def absorb_paths_file(self): def process_samples_sheet(self): samples_sheet_path = self.paths['samples_csv'] - samples_sheet = SamplesSheet(samples_sheet_path) + samples_sheet = SamplesSheet(samples_sheet_path, context="Pipeline Start") self['sample_basenames'] = samples_sheet.sample_basenames self['control_condition'] = samples_sheet.control_condition self['run_deseq'] = samples_sheet.is_compatible_df - self['in_fq'] = [self.cwl_file(fq, verify=False) for fq in samples_sheet.fastq_files] + self['in_fq'] = [self.cwl_file(fq, verify=False) for fq in samples_sheet.hts_samples] self['fastp_report_titles'] = [f"{g}_rep_{r}" for g, r in samples_sheet.groups_reps] def process_features_sheet(self) -> List[dict]: @@ -641,47 +641,93 @@ def append_to(self, key: str, val: Any): class SamplesSheet: - def __init__(self, file): + def __init__(self, file, context): self.csv = CSVReader(file, "Samples Sheet") self.basename = os.path.basename(file) self.dir = os.path.dirname(file) self.file = file - self.fastq_files = [] + self.hts_samples = [] self.groups_reps = [] + self.normalizations = [] self.sample_basenames = [] self.control_condition = None self.is_compatible_df = False + map_path, validate_path = self.get_context_methods(context).values() + self.validate_sample_path = validate_path + self.map_path = map_path + self.context = context + self.read_csv() + def get_context_methods(self, context): + return { + "Standalone Run": {'map_path': self.from_here, 'validate_path': self.validate_alignments_filepath}, + "Pipeline Start": {'map_path': self.from_here, 'validate_path': self.validate_fastq_filepath}, + "Pipeline Step": {'map_path': self.from_pipeline, 'validate_path': lambda _: True}, # skip validation + }[context] + def read_csv(self): reps_per_group = Counter() - 
for row in self.csv.rows(): - fastq_file = Configuration.joinpath(self.dir, row['File']) - group_name = row['Group'] - rep_number = row['Replicate'] - is_control = row['Control'].lower() == 'true' - basename = self.get_sample_basename(fastq_file) + try: + for row in self.csv.rows(): + hts_sample = self.map_path(row['File']) + group_name = row['Group'] + rep_number = row['Replicate'] + is_control = row['Control'].lower() == 'true' + norm_prefs = row['Normalization'] + basename = self.get_sample_basename(hts_sample) + + self.validate_sample_path(hts_sample) + self.validate_group_rep(group_name, rep_number) + self.validate_control_group(is_control, group_name) + self.validate_normalization(norm_prefs) + + self.hts_samples.append(hts_sample) + self.sample_basenames.append(basename) + self.groups_reps.append((group_name, rep_number)) + self.normalizations.append(norm_prefs) + reps_per_group[group_name] += 1 + + if is_control: self.control_condition = group_name + except Exception as e: + msg = f"Error occurred on line {self.csv.line_num} of {self.basename}" + append_to_exception(e, msg) + raise - self.validate_fastq_filepath(fastq_file) - self.validate_group_rep(group_name, rep_number) - self.validate_control_group(is_control, group_name) + self.is_compatible_df = self.validate_deseq_compatibility(reps_per_group) - self.fastq_files.append(fastq_file) - self.sample_basenames.append(basename) - self.groups_reps.append((group_name, rep_number)) - reps_per_group[group_name] += 1 + def from_here(self, input_file): + """Resolves .sam/.bam paths in pipeline startup and standalone context""" - if is_control: self.control_condition = group_name + return ConfigBase.joinpath(self.dir, input_file) - self.is_compatible_df = self.validate_deseq_compatibility(reps_per_group) + def from_pipeline(self, input_file): + """Resolves .fastq(.gz) file paths in pipeline step context""" + + basename_root, ext = os.path.splitext(os.path.basename(input_file)) + return f"{basename_root}_aligned_seqs.sam" + + def validate_alignments_filepath(self, file: str): + """Checks file existence, extension, and duplicate entries in standalone context""" + + root, ext = os.path.splitext(file) + + assert os.path.isfile(file), \ + "The file on row {row_num} of {selfname} was not found:\n\t{file}" \ + .format(row_num=self.csv.row_num, selfname=self.basename, file=file) + + assert ext in (".sam", ".bam"), \ + "Files in {selfname} must have a .sam or .bam extension (row {row_num})" \ + .format(selfname=self.basename, row_num=self.csv.row_num) + + assert file not in self.hts_samples, \ + "Alignment files cannot be listed more than once in {selfname} (row {row_num})" \ + .format(selfname=self.basename, row_num=self.csv.row_num) def validate_fastq_filepath(self, file: str): - """Checks file existence, extension, and duplicate entries. - Args: - file: fastq file path. 
For which has already been resolved relative to self.dir - """ + """Checks file existence, extension, and duplicate entries in pipeline startup context""" root, ext = os.path.splitext(file) @@ -690,30 +736,41 @@ def validate_fastq_filepath(self, file: str): .format(row_num=self.csv.row_num, selfname=self.basename, file=file) assert ext in (".fastq", ".gz"), \ - "Files in {selfname} must have a .fastq(.gz) extension (row {row_num})"\ + "Files in {selfname} must have a .fastq(.gz) extension (row {row_num})" \ .format(selfname=self.basename, row_num=self.csv.row_num) - assert file not in self.fastq_files, \ - "Fastq files cannot be listed more than once in {selfname} (row {row_num})"\ + assert file not in self.hts_samples, \ + "Fastq files cannot be listed more than once in {selfname} (row {row_num})" \ .format(selfname=self.basename, row_num=self.csv.row_num) def validate_group_rep(self, group:str, rep:str): assert (group, rep) not in self.groups_reps, \ "The same group and replicate number cannot appear on " \ - "more than one row in {selfname} (row {row_num})"\ + "more than one row in {selfname} (row {row_num})" \ .format(selfname=self.basename, row_num=self.csv.row_num) def validate_control_group(self, is_control: bool, group: str): - if not is_control: return + + if not is_control or self.context != "Pipeline Start": + return + assert self.control_condition in (group, None), \ - "tinyRNA does not support multiple control conditions " \ + "Experiments with multiple control conditions aren't supported " \ "(row {row_num} in {selfname}).\nHowever, if the control condition " \ "is unspecified, all possible comparisons will be made and this " \ - "should accomplish your goal."\ + "should accomplish your goal." \ .format(row_num=self.csv.row_num, selfname=self.basename) - @staticmethod - def validate_deseq_compatibility(sample_groups: Counter) -> bool: + def validate_normalization(self, norm): + assert re.fullmatch(r"(\s*[\d.]+\s*)|(rpm)", norm, re.IGNORECASE) or not norm, \ + "Invalid normalization value in {selfname} (row {row_num})" \ + .format(selfname=self.basename, row_num=self.csv.row_num) + + def validate_deseq_compatibility(self, sample_groups: Counter) -> Optional[bool]: + + if self.context != "Pipeline Start": + return None + SamplesSheet.validate_r_safe_sample_groups(sample_groups) total_samples = sum(sample_groups.values()) From 3c3a458864df2bc02301391a47d2b195c19661a8 Mon Sep 17 00:00:00 2001 From: Alex Tate <0xalextate@gmail.com> Date: Sat, 22 Apr 2023 17:51:36 -0700 Subject: [PATCH 02/18] Updating load_samples() to use the SamplesSheet class. Cleaner, more consistent, and more thorough! 
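For reference, a sketch of the records load_samples() now builds from the
SamplesSheet class (sample names and file paths below are hypothetical):

```python
from tiny.rna.counter.counter import load_samples

# Standalone run: the Samples Sheet lists .sam/.bam alignment files directly
libraries = load_samples("samples.csv", in_pipeline=False)
# -> [{'Name': 'condition1_rep_1', 'File': '/data/cond1_rep1.sam', 'Norm': 'rpm'}, ...]

# Pipeline step: .fastq(.gz) entries resolve to their pipeline-produced SAM files
libraries = load_samples("samples.csv", in_pipeline=True)
# -> [{'Name': 'condition1_rep_1', 'File': 'cond1_rep1_aligned_seqs.sam', 'Norm': ''}, ...]
```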
This means that the from_here() method can be removed from util.py, which is a good thing in the end because ConfigBase.from_here() (which SamplesSheet uses) is more developed --- tiny/rna/counter/counter.py | 57 ++++++++++--------------------------- tiny/rna/util.py | 11 ------- 2 files changed, 15 insertions(+), 53 deletions(-) diff --git a/tiny/rna/counter/counter.py b/tiny/rna/counter/counter.py index 4bb95f7d..c6b8a836 100644 --- a/tiny/rna/counter/counter.py +++ b/tiny/rna/counter/counter.py @@ -12,14 +12,13 @@ from tiny.rna.counter.features import Features, FeatureCounter from tiny.rna.counter.statistics import MergedStatsManager from tiny.rna.counter.hts_parsing import ReferenceFeatures, ReferenceSeqs, ReferenceBase -from tiny.rna.configuration import CSVReader, PathsFile, get_templates +from tiny.rna.configuration import PathsFile, SamplesSheet, CSVReader, get_templates from tiny.rna.util import ( report_execution_time, add_transparent_help, append_to_exception, get_timestamp, - ReadOnlyDict, - from_here + ReadOnlyDict ) # Global variables for multiprocessing @@ -89,15 +88,15 @@ def get_args(): return ReadOnlyDict(args_dict) -def load_samples(samples_csv: str, is_pipeline: bool) -> List[Dict[str, str]]: +def load_samples(samples_csv: str, in_pipeline: bool) -> List[Dict[str, str]]: """Parses the Samples Sheet to determine library names and alignment files for counting Sample files may have a .fastq(.gz) extension (i.e. when tiny-count is called as part of a - pipeline run) or a .sam extension (i.e. when tiny-count is called as a standalone tool). + pipeline run) or a .sam/.bam extension (i.e. when tiny-count is called as a standalone tool). Args: samples_csv: a csv file which defines sample group, replicate, and file location - is_pipeline: helps locate sample SAM files. If true, files are assumed to reside + in_pipeline: helps locate sample SAM files. If true, files are assumed to reside in the working directory. If false, files are assumed to reside in the same directory as their source FASTQ files with '_aligned_seqs.sam' appended (i.e. /dir/sample1.fastq -> /dir/sample1_aligned_seqs.sam). @@ -107,44 +106,18 @@ def load_samples(samples_csv: str, is_pipeline: bool) -> List[Dict[str, str]]: library name and the location of its SAM file for counting. 
""" - def get_library_filename(csv_row_file: str, samples_csv: str) -> str: - """The input samples.csv may contain either fastq or sam files""" - - file_ext = os.path.splitext(csv_row_file)[1].lower() - - # If the sample file has a fastq(.gz) extension, infer the name of its pipeline-produced .sam file - if file_ext in [".fastq", ".gz"]: - # Fix relative paths to be relative to sample_csv's path, rather than relative to cwd - csv_row_file = os.path.basename(csv_row_file) if is_pipeline else from_here(samples_csv, csv_row_file) - csv_row_file = os.path.splitext(csv_row_file)[0] + "_aligned_seqs.sam" - elif file_ext == ".sam": - if not os.path.isabs(csv_row_file): - raise ValueError("The following file must be expressed as an absolute path:\n%s" % (csv_row_file,)) - else: - raise ValueError("The filenames defined in your Samples Sheet must have a .fastq(.gz) or .sam extension.\n" - "The following filename contained neither:\n%s" % (csv_row_file,)) - return csv_row_file - + context = "Pipeline Step" if in_pipeline else "Standalone Run" + samples = SamplesSheet(samples_csv, context=context) inputs = list() - sheet = CSVReader(samples_csv, "Samples Sheet") - try: - for row in sheet.rows(): - library_name = f"{row['Group']}_rep_{row['Replicate']}" - library_file_name = get_library_filename(row['File'], samples_csv) - library_normalization = row['Normalization'] - - record = { - "Name": library_name, - "File": library_file_name, - "Norm": library_normalization - } - - if record not in inputs: inputs.append(record) - except Exception as e: - msg = f"Error occurred on line {sheet.line_num} of {os.path.basename(samples_csv)}" - append_to_exception(e, msg) - raise + for file, group_rep, norm in zip(samples.hts_samples, samples.groups_reps, samples.normalizations): + record = { + "Name": "_rep_".join(group_rep), + "File": file, + "Norm": norm + } + + if record not in inputs: inputs.append(record) return inputs diff --git a/tiny/rna/util.py b/tiny/rna/util.py index 1d8700df..1670ef3b 100644 --- a/tiny/rna/util.py +++ b/tiny/rna/util.py @@ -148,17 +148,6 @@ def _fill_text(self, text, width, indent): return '\n\n'.join(output_lines) -def from_here(config_file, input_file): - """Calculates paths relative to the config file which contains them""" - config_file, input_file = (os.path.expanduser(p) for p in [config_file, input_file]) - - if not os.path.isabs(input_file): - from_here = os.path.dirname(config_file) - input_file = os.path.normpath(os.path.join(from_here, input_file)) - - return input_file - - def make_filename(args, ext='.csv'): return '_'.join([str(chnk) for chnk in args if chnk is not None]) + ext From 6662696f6a112c93cd13ae6f4f61b6ee4c920e53 Mon Sep 17 00:00:00 2001 From: Alex Tate <0xalextate@gmail.com> Date: Sat, 22 Apr 2023 17:55:57 -0700 Subject: [PATCH 03/18] Changing usages of is_pipeline to in_pipeline to be consistent with the PathsFile class (and because it just makes more sense) --- doc/Parameters.md | 4 ++-- tiny/cwl/tools/tiny-count.cwl | 4 ++-- tiny/cwl/workflows/tinyrna_wf.cwl | 4 ++-- tiny/rna/counter/counter.py | 16 ++++++++-------- 4 files changed, 14 insertions(+), 14 deletions(-) diff --git a/doc/Parameters.md b/doc/Parameters.md index d085c17a..94ea2bef 100644 --- a/doc/Parameters.md +++ b/doc/Parameters.md @@ -101,7 +101,7 @@ A custom Cython implementation of HTSeq's StepVector is used for finding feature ### Is Pipeline | Run Config Key | Commandline Argument | |----------------|----------------------| -| | `--is-pipeline` | +| | `--in-pipeline` | This commandline 
argument tells tiny-count that it is running as a workflow step rather than a standalone/manual run. Under these conditions tiny-count will look for all input files in the current working directory regardless of the paths defined in the Samples Sheet and Features Sheet. @@ -152,7 +152,7 @@ Optional arguments: -sv {Cython,HTSeq}, --stepvector {Cython,HTSeq} Select which StepVector implementation is used to find features overlapping an interval. (default: Cython) - -p, --is-pipeline Indicates that tiny-count was invoked as part of a + -p, --in-pipeline Indicates that tiny-count was invoked as part of a pipeline run and that input files should be sourced as such. (default: False) -d, --report-diags Produce diagnostic information about diff --git a/tiny/cwl/tools/tiny-count.cwl b/tiny/cwl/tools/tiny-count.cwl index aeb2e694..f2a695a6 100644 --- a/tiny/cwl/tools/tiny-count.cwl +++ b/tiny/cwl/tools/tiny-count.cwl @@ -55,10 +55,10 @@ inputs: inputBinding: prefix: --all-features - is_pipeline: + in_pipeline: type: boolean? inputBinding: - prefix: --is-pipeline + prefix: --in-pipeline diagnostics: type: boolean? diff --git a/tiny/cwl/workflows/tinyrna_wf.cwl b/tiny/cwl/workflows/tinyrna_wf.cwl index 0a877b86..582ded97 100644 --- a/tiny/cwl/workflows/tinyrna_wf.cwl +++ b/tiny/cwl/workflows/tinyrna_wf.cwl @@ -82,7 +82,7 @@ inputs: features_csv: File gff_files: File[]? aligned_seqs: File[]? - is_pipeline: boolean? + in_pipeline: boolean? counter_diags: boolean? counter_decollapse: boolean? counter_stepvector: string? @@ -223,7 +223,7 @@ steps: valueFrom: $(String(self)) # convert boolean -> string decollapse: counter_decollapse stepvector: counter_stepvector - is_pipeline: {default: true} + in_pipeline: {default: true} diagnostics: counter_diags fastp_logs: preprocessing/json_report_file collapsed_fa: preprocessing/uniq_seqs diff --git a/tiny/rna/counter/counter.py b/tiny/rna/counter/counter.py index c6b8a836..6a756830 100644 --- a/tiny/rna/counter/counter.py +++ b/tiny/rna/counter/counter.py @@ -67,7 +67,7 @@ def get_args(): optional_args.add_argument('-a', '--all-features', action='store_true', help=argparse.SUPPRESS) #help='Represent all features in output counts table, ' # 'even if they did not match in Stage 1 selection.') - optional_args.add_argument('-p', '--is-pipeline', action='store_true', + optional_args.add_argument('-p', '--in-pipeline', action='store_true', help='Indicates that tiny-count was invoked as part of a pipeline run ' 'and that input files should be sourced as such.') optional_args.add_argument('-d', '--report-diags', action='store_true', @@ -122,12 +122,12 @@ def load_samples(samples_csv: str, in_pipeline: bool) -> List[Dict[str, str]]: return inputs -def load_config(features_csv: str, is_pipeline: bool) -> List[dict]: +def load_config(features_csv: str, in_pipeline: bool) -> List[dict]: """Parses the Features Sheet to provide inputs to FeatureSelector and reference parsers Args: features_csv: a csv file which defines feature sources and selection rules - is_pipeline: not currently used; helps properly locate input files + in_pipeline: not currently used; helps properly locate input files Returns: rules: a list of dictionaries, each representing a parsed row from input. @@ -212,13 +212,13 @@ def map_and_reduce(libraries, paths, prefs): def main(): # Get command line arguments. 
args = get_args() - is_pipeline = args['is_pipeline'] + in_pipeline = args['in_pipeline'] try: # Load and validate config files and input files - paths = PathsFile(args['paths_file'], is_pipeline) - libraries = load_samples(paths['samples_csv'], is_pipeline) - selection = load_config(paths['features_csv'], is_pipeline) + paths = PathsFile(args['paths_file'], in_pipeline) + libraries = load_samples(paths['samples_csv'], in_pipeline) + selection = load_config(paths['features_csv'], in_pipeline) reference = load_references(paths, libraries, selection, args) # global for multiprocessing @@ -233,7 +233,7 @@ def main(): except Exception as e: if type(e) is SystemExit: return traceback.print_exception(*sys.exc_info()) - if args['is_pipeline']: + if in_pipeline: print("\n\ntiny-count encountered an error. Don't worry! You don't have to start over.\n" "You can resume the pipeline at tiny-count. To do so:\n\t" "1. cd into your Run Directory\n\t" From fdf2c21fee0488571a8f8c1178d910bc58d757e1 Mon Sep 17 00:00:00 2001 From: Alex Tate <0xalextate@gmail.com> Date: Mon, 24 Apr 2023 13:09:25 -0700 Subject: [PATCH 04/18] Renaming SAM_reader to AlignmentReader and adding better checks for proper ordering of alignments --- tiny/rna/counter/counter.py | 2 +- tiny/rna/counter/features.py | 8 ++-- tiny/rna/counter/hts_parsing.py | 65 +++++++++++++++++++++++++++------ tiny/rna/counter/statistics.py | 2 +- tiny/rna/counter/validation.py | 6 +-- 5 files changed, 62 insertions(+), 21 deletions(-) diff --git a/tiny/rna/counter/counter.py b/tiny/rna/counter/counter.py index 6a756830..4a16c15b 100644 --- a/tiny/rna/counter/counter.py +++ b/tiny/rna/counter/counter.py @@ -59,7 +59,7 @@ def get_args(): optional_args.add_argument('-vs', '--verify-stats', metavar='T/F', default='T', help='Verify that all reported stats are internally consistent.') optional_args.add_argument('-dc', '--decollapse', action='store_true', - help='Create a decollapsed copy of all SAM files listed in your Samples Sheet. ' + help='Create a decollapsed SAM copy of all files listed in your Samples Sheet. 
' 'This option is ignored for non-collapsed inputs.') optional_args.add_argument('-sv', '--stepvector', choices=['Cython', 'HTSeq'], default='Cython', help='Select which StepVector implementation is used to find ' diff --git a/tiny/rna/counter/features.py b/tiny/rna/counter/features.py index 512292bd..1ce7be63 100644 --- a/tiny/rna/counter/features.py +++ b/tiny/rna/counter/features.py @@ -4,7 +4,7 @@ from collections import defaultdict from typing import List, Tuple, Set, Dict, Mapping, Union -from tiny.rna.counter.hts_parsing import ReferenceFeatures, ReferenceSeqs, SAM_reader +from tiny.rna.counter.hts_parsing import ReferenceFeatures, ReferenceSeqs, AlignmentReader from .statistics import LibraryStats from .matching import * from ..util import append_to_exception @@ -32,7 +32,7 @@ class FeatureCounter: def __init__(self, references, selection_rules, **prefs): self.stats = LibraryStats(**prefs) - self.sam_reader = SAM_reader(**prefs) + self.alignment_reader = AlignmentReader(**prefs) self.selector = FeatureSelector(selection_rules, **prefs) if isinstance(references, ReferenceFeatures): @@ -48,7 +48,7 @@ def __init__(self, references, selection_rules, **prefs): def count_reads(self, library: dict): """Collects statistics on features assigned to each alignment associated with each read""" - read_seq = self.sam_reader.bundle_multi_alignments(library["File"]) + read_seq = self.alignment_reader.bundle_multi_alignments(library["File"]) self.stats.assign_library(library) # For each sequence in the sam file... @@ -85,7 +85,7 @@ class FeatureSelector: """Performs hierarchical selection given a set of candidate features for a locus Two sources of data serve as targets for selection: feature attributes (sourced from - input GFF files), and sequence attributes (sourced from input SAM files). + input GFF files), and sequence attributes (sourced from input alignment files). 
All candidate features are assumed to have matched at least one Identity selector, as determined by hts_parsing.ReferenceFeatures.get_matches_and_classes() diff --git a/tiny/rna/counter/hts_parsing.py b/tiny/rna/counter/hts_parsing.py index 469e184f..838c8b26 100644 --- a/tiny/rna/counter/hts_parsing.py +++ b/tiny/rna/counter/hts_parsing.py @@ -19,21 +19,23 @@ _re_attr_main = re.compile(r"\s*([^\s=]+)[\s=]+(.*)") _re_attr_empty = re.compile(r"^\s*$") -# For SAM_reader +# For AlignmentReader AlignmentDict = Dict[str, Union[str, int]] Bundle = Tuple[List[AlignmentDict], int] _re_tiny = r"\d+_count=\d+" _re_fastx = r"seq\d+_x(\d+)" -class SAM_reader: - """A minimal SAM reader that bundles multiple-alignments and only parses data relevant to the workflow.""" +class AlignmentReader: + """A minimal SAM/BAM reader that bundles multiple-alignments and only parses data relevant to the workflow.""" def __init__(self, **prefs): self.decollapse = prefs.get("decollapse", False) self.out_prefix = prefs.get("out_prefix", None) self.collapser_type = None self.references = [] + self.file_type = None + self.basename = None self.has_nm = None self.file = None @@ -50,8 +52,8 @@ def bundle_multi_alignments(self, file: str) -> Iterator[Bundle]: """Bundles multi-alignments (determined by a shared QNAME) and reports the associated read's count""" pysam_reader = pysam.AlignmentFile(file) - self.file = file + self._assign_library(file) self._gather_metadata(pysam_reader) self._iter = AlignmentIter(pysam_reader, self.has_nm, self._decollapsed_callback, self._decollapsed_reads) bundle, read_count = self._new_bundle(next(self._iter)) @@ -78,18 +80,25 @@ def _new_bundle(self, aln: AlignmentDict) -> Bundle: return [aln], count + def _assign_library(self, file): + self.file_type = file.rsplit('.')[-1].lower() + self.basename = os.path.basename(file) + self.file = file + def _gather_metadata(self, reader: pysam.AlignmentFile) -> None: - """Saves header information, examines the first alignment to determine which collapser utility was used (if any), and whether the alignment has a query sequence and NM tag. + """Saves header information, examines the first alignment to determine which collapser utility was used (if any) + and whether the alignment has a query sequence and NM tag. The input file's header is written to the decollapsed output file if necessary.""" header = reader.header first_aln = next(reader.head(1)) if first_aln.query_sequence is None: - raise ValueError("SAM alignments must include the read sequence.") + raise ValueError(f"Alignments must include the read sequence ({self.basename}).") self._header = header - self._header_dict = header.to_dict() # performs validation + self._header_dict = header.to_dict() # performs header validation + self._check_for_incompatible_order() self._determine_collapser_type(first_aln.query_name) self.has_nm = first_aln.has_tag("NM") self.references = header.references @@ -97,6 +106,43 @@ def _gather_metadata(self, reader: pysam.AlignmentFile) -> None: if self.decollapse: self._write_header_for_decollapsed_sam(str(header)) + def _check_for_incompatible_order(self): + """Inspects headers to determine if alignments of multi-mapping reads are in adjacent order + + Headers that report unsorted/unknown sorting order are likely still compatible, but + since the SAM/BAM standard doesn't address alignment adjacency, we have to assume + that it is implementation dependent. 
Therefore, we check the last reported @PG + header against a short (incomplete) list of alignment tools that are known to + follow the adjacency convention.""" + + hd_header = self._header_dict.get('HD', {}) + so_header = hd_header.get('SO', "") + go_header = hd_header.get('GO', "") + + # Check for strictly compatible order + if "queryname" in so_header or "query" in go_header: + return + + error = self.basename + " is incompatible because {reason}.\n" \ + "Please ensure that alignments for multi-mapping reads are ordered so " \ + "that they are adjacent to each other. One way to achieve this is to " \ + 'sort your files by QNAME with the command "samtools sort -n"' + + # Check for strictly incompatible order + if not ({'SO', 'GO'} & hd_header.keys()): + raise ValueError(error.format(reason="its sorting/grouping couldn't be determined from headers")) + if "coordinate" in so_header: + raise ValueError(error.format(reason="it is sorted by coordinate")) + if "reference" in go_header: + raise ValueError(error.format(reason="it is grouped by reference")) + + # If SO is unknown/unsorted or GO is none, check to see if the last program + # to handle the alignments is one that produces adjacent alignments by default + pg_header = self._header_dict.get('PG', [{}]) + last_prog = pg_header[-1].get('ID', "") + if last_prog.lower() not in ("bowtie", "bowtie2", "star"): + raise ValueError(error.format(reason="multi-mapping adjacency couldn't be determined")) + def _determine_collapser_type(self, qname: str) -> None: """Attempts to determine the collapsing utility that was used before producing the input alignment file, then checks basic requirements for the utility's outputs.""" @@ -108,11 +154,6 @@ def _determine_collapser_type(self, qname: str) -> None: elif re.match(_re_fastx, qname) is not None: self.collapser_type = "fastx" self._collapser_token = "_x" - - sort_order = self._header_dict.get('HD', {}).get('SO', None) - if sort_order is None or sort_order != "queryname": - raise ValueError("SAM files from fastx collapsed outputs must be sorted by queryname\n" - "(and the @HD [...] SO header must be set accordingly).") else: self.collapser_type = None diff --git a/tiny/rna/counter/statistics.py b/tiny/rna/counter/statistics.py index 41fbcfa7..a3afd0fd 100644 --- a/tiny/rna/counter/statistics.py +++ b/tiny/rna/counter/statistics.py @@ -634,7 +634,7 @@ def record_alignment_details(self, assignments, aln, bundle, n_candidates): This is called once per locus per read (every alignment) when the user elects to save diagnostics. The recorded information is later written to {library['Name']}_aln_table.txt - after the entire SAM file has been processed.""" + after the entire alignment file has been processed.""" # Map internal strand representation to +/-/. 
strand = self.map_strand[aln['Strand']] diff --git a/tiny/rna/counter/validation.py b/tiny/rna/counter/validation.py index a55b488e..c2d7868c 100644 --- a/tiny/rna/counter/validation.py +++ b/tiny/rna/counter/validation.py @@ -7,7 +7,7 @@ from collections import Counter, defaultdict from typing import List, Dict -from tiny.rna.counter.hts_parsing import parse_gff, ReferenceFeatures, SAM_reader +from tiny.rna.counter.hts_parsing import parse_gff, ReferenceFeatures from tiny.rna.counter.features import FeatureSelector from tiny.rna.util import ReportFormatter, sorted_natural, gzip_open @@ -18,7 +18,7 @@ class GFFValidator: targets = { "ID attribute": "Features missing a valid identifier attribute", - "sam files": "Potentially incompatible SAM alignment files", + "alignment files": "Potentially incompatible alignment files", "seq chromosomes": "Chromosomes present in sequence files", "gff chromosomes": "Chromosomes present in GFF files", "strand": "Features missing strand information", @@ -210,7 +210,7 @@ def generate_chrom_heuristics_report(self, suspect_files): for file, chroms in suspect_files.items()} summary = { - "sam files": chroms, + "alignment files": chroms, "gff chromosomes": sorted(self.chrom_set) } From 7f805be38e064f3177d468c9e8ea8ebbef6a252d Mon Sep 17 00:00:00 2001 From: Alex Tate <0xalextate@gmail.com> Date: Mon, 24 Apr 2023 14:35:07 -0700 Subject: [PATCH 05/18] Renaming SamSqValidator to AlignmentSqValidator, and uses of "sam" to "alignment" --- tiny/rna/counter/counter.py | 20 +++++++++----------- tiny/rna/counter/validation.py | 22 +++++++++++----------- 2 files changed, 20 insertions(+), 22 deletions(-) diff --git a/tiny/rna/counter/counter.py b/tiny/rna/counter/counter.py index 4a16c15b..d0bf3ab8 100644 --- a/tiny/rna/counter/counter.py +++ b/tiny/rna/counter/counter.py @@ -8,7 +8,7 @@ from typing import List, Dict -from tiny.rna.counter.validation import GFFValidator, SamSqValidator +from tiny.rna.counter.validation import GFFValidator, AlignmentSqValidator from tiny.rna.counter.features import Features, FeatureCounter from tiny.rna.counter.statistics import MergedStatsManager from tiny.rna.counter.hts_parsing import ReferenceFeatures, ReferenceSeqs, ReferenceBase @@ -91,19 +91,17 @@ def get_args(): def load_samples(samples_csv: str, in_pipeline: bool) -> List[Dict[str, str]]: """Parses the Samples Sheet to determine library names and alignment files for counting - Sample files may have a .fastq(.gz) extension (i.e. when tiny-count is called as part of a + Sample files can have a .fastq(.gz) extension (i.e. when tiny-count is called as part of a pipeline run) or a .sam/.bam extension (i.e. when tiny-count is called as a standalone tool). Args: samples_csv: a csv file which defines sample group, replicate, and file location - in_pipeline: helps locate sample SAM files. If true, files are assumed to reside - in the working directory. If false, files are assumed to reside in the same - directory as their source FASTQ files with '_aligned_seqs.sam' appended - (i.e. /dir/sample1.fastq -> /dir/sample1_aligned_seqs.sam). + in_pipeline: helps locate sample alignment files. If true, files are assumed to + reside in the working directory. Returns: - inputs: a list of dictionaries for each library, where each dictionary defines the - library name and the location of its SAM file for counting. + inputs: a list of dictionaries for each library. Each dictionary holds + the library's name, file location, and normalization preferences. 
""" context = "Pipeline Step" if in_pipeline else "Standalone Run" @@ -170,13 +168,13 @@ def load_references(paths: PathsFile, libraries: List[dict], rules: List[dict], """ gff_files = paths.get_gff_config() - sam_files = [lib['File'] for lib in libraries] + aln_files = [lib['File'] for lib in libraries] if gff_files: - GFFValidator(gff_files, rules, alignments=sam_files).validate() + GFFValidator(gff_files, rules, alignments=aln_files).validate() references = ReferenceFeatures(gff_files, **prefs) else: - sq_validator = SamSqValidator(sam_files) + sq_validator = AlignmentSqValidator(aln_files) # Reuse sq_validator's parsing results to save time sq_validator.validate() diff --git a/tiny/rna/counter/validation.py b/tiny/rna/counter/validation.py index c2d7868c..005cc5e5 100644 --- a/tiny/rna/counter/validation.py +++ b/tiny/rna/counter/validation.py @@ -229,24 +229,24 @@ def build_column_filters(rules): return FeatureSelector.build_selectors([selector_defs])[0] -class SamSqValidator: +class AlignmentSqValidator: """Validates @SQ headers for tiny-count's sequence-based counting mode""" targets = { "inter sq": "Sequence identifiers with inconsistent lengths", - "intra sq": "SAM files with repeated sequence identifiers", - "incomplete sq": "SAM files with incomplete @SQ headers", - "missing sq": "SAM files that lack @SQ headers" + "intra sq": "alignment files with repeated sequence identifiers", + "incomplete sq": "alignment files with incomplete @SQ headers", + "missing sq": "alignment files that lack @SQ headers" } - def __init__(self, sam_files): + def __init__(self, alignment_files): self.report = ReportFormatter(self.targets) - self.sam_files = sam_files + self.alignment_files = alignment_files self.reference_seqs = {} self.sq_headers = {} def validate(self): - print("Validating sequence identifiers in SAM files... ", end='') + print("Validating sequence identifiers in alignment files... 
", end='') self.read_sq_headers() self.validate_sq_headers() print("done.") @@ -279,7 +279,7 @@ def get_ambiguous_lengths(self) -> List[str]: Sequences with consistent length definitions are added to self.reference_seqs""" seq_lengths = defaultdict(set) - for sam in self.sam_files: + for sam in self.alignment_files: for sq in self.sq_headers[sam]: seq_id = sq['SN'] seq_len = int(sq['LN']) @@ -299,7 +299,7 @@ def get_duplicate_identifiers(self) -> Dict[str, List[str]]: """Returns a dictionary of SAM files that contain duplicate sequence identifiers""" bad_files = {} - for file in self.sam_files: + for file in self.alignment_files: sq = self.sq_headers[file] id_count = Counter(seq['SN'] for seq in sq) duplicates = [seq_id for seq_id, count in id_count.items() if count > 1] @@ -326,7 +326,7 @@ def generate_header_syntax_report(self, missing, incomplete): if incomplete: report['incomplete sq'] = sorted_natural(incomplete) - header = "Every SAM file must have complete @SQ headers with SN and LN\n" \ + header = "Every alignment file must have complete @SQ headers with SN and LN\n" \ "fields when performing sequence-based counting.\n" self.report.add_error_section(header, report) @@ -341,6 +341,6 @@ def generate_identifier_report(self, duplicate, ambiguous): self.report.add_error_section(header, report) def read_sq_headers(self): - for file in self.sam_files: + for file in self.alignment_files: pr = pysam.AlignmentFile(file, check_sq=False) self.sq_headers[file] = pr.header.to_dict().get('SQ', []) From 7a7a3742fffb820e0b2a177276ac9db6651c9bd9 Mon Sep 17 00:00:00 2001 From: Alex Tate <0xalextate@gmail.com> Date: Mon, 24 Apr 2023 14:39:05 -0700 Subject: [PATCH 06/18] Updating infer_strandedness() (even though it isn't currently in use) --- tiny/rna/counter/hts_parsing.py | 27 ++++++++++++--------------- 1 file changed, 12 insertions(+), 15 deletions(-) diff --git a/tiny/rna/counter/hts_parsing.py b/tiny/rna/counter/hts_parsing.py index 838c8b26..b1311a0e 100644 --- a/tiny/rna/counter/hts_parsing.py +++ b/tiny/rna/counter/hts_parsing.py @@ -1,4 +1,5 @@ import functools +import itertools import os.path import HTSeq import pysam @@ -197,15 +198,15 @@ def _write_decollapsed_sam(self) -> None: self._decollapsed_reads.clear() -def infer_strandedness(sam_file: str, intervals: dict) -> str: - """Infers strandedness from a sample SAM file and intervals from a parsed GFF file +def infer_strandedness(alignment_file: str, intervals: dict) -> str: + """Infers strandedness from a sample alignment file and intervals from a parsed GFF file Credit: this technique is an adaptation of those in RSeQC's infer_experiment.py. It has been modified to accept a GFF reference file rather than a BED file, - and to use HTSeq rather than bx-python. + and to use our AlignmentReader rather than bx-python. Args: - sam_file: the path of the SAM file to evaluate + alignment_file: the path of the alignment file to evaluate intervals: the intervals instance attribute of ReferenceFeatures, populated by .get() """ @@ -216,18 +217,14 @@ def infer_strandedness(sam_file: str, intervals: dict) -> str: iv_convert.strand = '.' 
unstranded[iv_convert] = orig_iv.strand - # Assumes read_SAM() defaults to non-reverse strandedness - sample_read = read_SAM(sam_file) + # Assumes AlignmentReader defaults to non-reverse strandedness + sample_read = AlignmentReader().bundle_multi_alignments(alignment_file) gff_sam_map = Counter() - for count in range(1, 20000): - try: - rec = next(sample_read) - strandless = HTSeq.GenomicInterval(rec['Chrom'], rec['Start'], rec['End']) - sam_strand = rec['strand'] - gff_strand = ':'.join(unstranded[strandless].get_steps()) - gff_sam_map[sam_strand + gff_strand] += 1 - except StopIteration: - break + for rec, _ in zip(itertools.chain(*sample_read), range(20000)): + strandless = HTSeq.GenomicInterval(rec['Chrom'], rec['Start'], rec['End']) + sam_strand = rec['strand'] + gff_strand = ':'.join(unstranded[strandless].get_steps()) + gff_sam_map[sam_strand + gff_strand] += 1 non_rev = (gff_sam_map['++'] + gff_sam_map['--']) / sum(gff_sam_map.values()) reverse = (gff_sam_map['+-'] + gff_sam_map['-+']) / sum(gff_sam_map.values()) From 59f8bd06b1c1877dc49954f0d722bdfc28ac09b6 Mon Sep 17 00:00:00 2001 From: Alex Tate <0xalextate@gmail.com> Date: Sat, 29 Apr 2023 11:23:31 -0700 Subject: [PATCH 07/18] Restructuring testdata folder for tiny-count --- .../counter/{ => gff}/discontinuous.gff3 | 0 .../{ => gff}/identity_choice_test.gff3 | 0 tests/testdata/counter/{ => gff}/single.gff3 | 0 tests/testdata/counter/{ => gff}/single2.gff3 | 0 .../counter/{ => sam}/Lib304_test.sam | 0 .../{ => sam}/identity_choice_test.sam | 0 .../counter/{ => sam}/non-collapsed.sam | 2 + tests/testdata/counter/{ => sam}/single.sam | 2 + tests/unit_tests_counter.py | 8 ++-- tests/unit_tests_features.py | 8 ++-- tests/unit_tests_hts_parsing.py | 38 +++++++++---------- tests/unit_tests_validation.py | 16 +++++--- 12 files changed, 42 insertions(+), 32 deletions(-) rename tests/testdata/counter/{ => gff}/discontinuous.gff3 (100%) rename tests/testdata/counter/{ => gff}/identity_choice_test.gff3 (100%) rename tests/testdata/counter/{ => gff}/single.gff3 (100%) rename tests/testdata/counter/{ => gff}/single2.gff3 (100%) rename tests/testdata/counter/{ => sam}/Lib304_test.sam (100%) rename tests/testdata/counter/{ => sam}/identity_choice_test.sam (100%) rename tests/testdata/counter/{ => sam}/non-collapsed.sam (81%) rename tests/testdata/counter/{ => sam}/single.sam (80%) diff --git a/tests/testdata/counter/discontinuous.gff3 b/tests/testdata/counter/gff/discontinuous.gff3 similarity index 100% rename from tests/testdata/counter/discontinuous.gff3 rename to tests/testdata/counter/gff/discontinuous.gff3 diff --git a/tests/testdata/counter/identity_choice_test.gff3 b/tests/testdata/counter/gff/identity_choice_test.gff3 similarity index 100% rename from tests/testdata/counter/identity_choice_test.gff3 rename to tests/testdata/counter/gff/identity_choice_test.gff3 diff --git a/tests/testdata/counter/single.gff3 b/tests/testdata/counter/gff/single.gff3 similarity index 100% rename from tests/testdata/counter/single.gff3 rename to tests/testdata/counter/gff/single.gff3 diff --git a/tests/testdata/counter/single2.gff3 b/tests/testdata/counter/gff/single2.gff3 similarity index 100% rename from tests/testdata/counter/single2.gff3 rename to tests/testdata/counter/gff/single2.gff3 diff --git a/tests/testdata/counter/Lib304_test.sam b/tests/testdata/counter/sam/Lib304_test.sam similarity index 100% rename from tests/testdata/counter/Lib304_test.sam rename to tests/testdata/counter/sam/Lib304_test.sam diff --git 
a/tests/testdata/counter/identity_choice_test.sam b/tests/testdata/counter/sam/identity_choice_test.sam similarity index 100% rename from tests/testdata/counter/identity_choice_test.sam rename to tests/testdata/counter/sam/identity_choice_test.sam diff --git a/tests/testdata/counter/non-collapsed.sam b/tests/testdata/counter/sam/non-collapsed.sam similarity index 81% rename from tests/testdata/counter/non-collapsed.sam rename to tests/testdata/counter/sam/non-collapsed.sam index 94e2d107..48368cde 100644 --- a/tests/testdata/counter/non-collapsed.sam +++ b/tests/testdata/counter/sam/non-collapsed.sam @@ -1,2 +1,4 @@ +@HD SO:unsorted @SQ SN:I LN:21 +@PG ID:bowtie NON_COLLAPSED_QNAME 16 I 15064570 255 21M * 0 0 CAAGACAGAGCTTCACCGTTC IIIIIIIIIIIIIIIIIIIII XA:i:0 MD:Z:21 NM:i:0 XM:i:2 diff --git a/tests/testdata/counter/single.sam b/tests/testdata/counter/sam/single.sam similarity index 80% rename from tests/testdata/counter/single.sam rename to tests/testdata/counter/sam/single.sam index 6d8dc69b..ccf1b730 100644 --- a/tests/testdata/counter/single.sam +++ b/tests/testdata/counter/sam/single.sam @@ -1,2 +1,4 @@ +@HD SO:unsorted @SQ SN:I LN:21 +@PG ID:bowtie 0_count=5 16 I 15064570 255 21M * 0 0 CAAGACAGAGCTTCACCGTTC IIIIIIIIIIIIIIIIIIIII XA:i:0 MD:Z:21 NM:i:0 XM:i:2 \ No newline at end of file diff --git a/tests/unit_tests_counter.py b/tests/unit_tests_counter.py index e32a0fa6..ad83ee01 100644 --- a/tests/unit_tests_counter.py +++ b/tests/unit_tests_counter.py @@ -16,12 +16,12 @@ class CounterTests(unittest.TestCase): @classmethod def setUpClass(self): - self.gff_file = f"{resources}/identity_choice_test.gff3" - self.short_gff_file = f"{resources}/single.gff3" + self.gff_file = f"{resources}/gff/identity_choice_test.gff3" + self.short_gff_file = f"{resources}/gff/single.gff3" self.short_gff = helpers.read(self.short_gff_file) - self.sam_file = f"{resources}/identity_choice_test.sam" - self.short_sam_file = f"{resources}/single.sam" + self.sam_file = f"{resources}/sam/identity_choice_test.sam" + self.short_sam_file = f"{resources}/sam/single.sam" self.short_sam = helpers.read(self.short_sam_file) self.strand = {'sense': tuple('+'), 'antisense': tuple('-'), 'both': ('+', '-')} diff --git a/tests/unit_tests_features.py b/tests/unit_tests_features.py index 236be93e..ce752dbd 100644 --- a/tests/unit_tests_features.py +++ b/tests/unit_tests_features.py @@ -11,12 +11,12 @@ class FeaturesTests(unittest.TestCase): @classmethod def setUpClass(self): - self.gff_file = f"{resources}/identity_choice_test.gff3" - self.short_gff_file = f"{resources}/single.gff3" + self.gff_file = f"{resources}/gff/identity_choice_test.gff3" + self.short_gff_file = f"{resources}/gff/single.gff3" self.short_gff = read(self.short_gff_file) - self.sam_file = f"{resources}/identity_choice_test.sam" - self.short_sam_file = f"{resources}/single.sam" + self.sam_file = f"{resources}/sam/identity_choice_test.sam" + self.short_sam_file = f"{resources}/sam/single.sam" self.short_sam = read(self.short_sam_file) """Helper functions""" diff --git a/tests/unit_tests_hts_parsing.py b/tests/unit_tests_hts_parsing.py index 4af12415..b9b7eac3 100644 --- a/tests/unit_tests_hts_parsing.py +++ b/tests/unit_tests_hts_parsing.py @@ -25,8 +25,8 @@ class SamReaderTests(unittest.TestCase): @classmethod def setUpClass(self): - self.sam_file = f"{resources}/identity_choice_test.sam" - self.short_sam_file = f"{resources}/single.sam" + self.sam_file = f"{resources}/sam/identity_choice_test.sam" + self.short_sam_file = f"{resources}/sam/single.sam" 
self.short_sam = helpers.read(self.short_sam_file) @staticmethod @@ -59,8 +59,8 @@ def test_sam_reader(self): """ def test_sam_parser_comparison(self): - file = f"{resources}/Lib304_test.sam" - ours = SAM_reader().bundle_multi_alignments(file) + file = f"{resources}/sam/Lib304_test.sam" + ours = AlignmentReader().bundle_multi_alignments(file) theirs = HTSeq.bundle_multiple_alignments(HTSeq.BAM_Reader(file)) for (our_bundle, _), their_bundle in zip(ours, theirs): @@ -183,8 +183,8 @@ def test_SAM_reader_no_decollapse_non_collapsed_SAM_files(self): with patch.object(SAM_reader, "_write_decollapsed_sam") as write_sam, \ patch.object(SAM_reader, "_write_header_for_decollapsed_sam") as write_header: with contextlib.redirect_stderr(stdout_capture): - reader = SAM_reader(decollapse=True) - records = reader.bundle_multi_alignments(f"{resources}/non-collapsed.sam") + reader = AlignmentReader(decollapse=True) + records = reader.bundle_multi_alignments(f"{resources}/sam/non-collapsed.sam") self.exhaust_iterator(records) write_sam.assert_not_called() @@ -199,8 +199,8 @@ class ReferenceFeaturesTests(unittest.TestCase): @classmethod def setUpClass(self): - self.gff_file = f"{resources}/identity_choice_test.gff3" - self.short_gff_file = f"{resources}/single.gff3" + self.gff_file = f"{resources}/gff/identity_choice_test.gff3" + self.short_gff_file = f"{resources}/gff/single.gff3" self.short_gff = helpers.read(self.short_gff_file) self.maxDiff = None @@ -365,7 +365,7 @@ def test_ref_tables_alias_multisource_concat_all_features_false(self): def test_ref_tables_identity_matches_multisource_concat(self): feature_source = { self.short_gff_file: ["ID"], - f"{resources}/single2.gff3": ["ID"] + f"{resources}/gff/single2.gff3": ["ID"] } kwargs = {'all_features': True} @@ -392,7 +392,7 @@ def test_ref_tables_identity_matches_multisource_concat(self): def test_ref_tables_discontinuous_aliases(self): kwargs = {'all_features': True} - feature_source = {f"{resources}/discontinuous.gff3": ["Name"]} + feature_source = {f"{resources}/gff/discontinuous.gff3": ["Name"]} mock_selector = self.selector_with_template(helpers.rules_template) _, alias, _ = ReferenceFeatures(feature_source, **kwargs).get(mock_selector) @@ -408,7 +408,7 @@ def test_ref_tables_discontinuous_aliases(self): def test_ref_tables_discontinuous_no_match_all_features_false(self): kwargs = {'all_features': False} - feature_source = {f"{resources}/discontinuous.gff3": ["Name"]} + feature_source = {f"{resources}/gff/discontinuous.gff3": ["Name"]} mock_selector = self.selector_with_template([{'Identity': ('No', 'Match')}]) expected_err = "No features were retained while parsing your GFF file.\n" \ @@ -425,7 +425,7 @@ def test_ref_tables_discontinuous_no_match_all_features_false(self): def test_ref_tables_discontinuous_features(self): kwargs = {'all_features': True} - feature_source = {f"{resources}/discontinuous.gff3": ["Name"]} + feature_source = {f"{resources}/gff/discontinuous.gff3": ["Name"]} feature_selector = self.selector_with_template([{'Identity': ('No', 'Match')}]) # Features that fail to match on identity are not added to the StepVector, @@ -440,7 +440,7 @@ def test_ref_tables_discontinuous_features(self): def test_ref_tables_discontinuous_intervals(self): kwargs = {'all_features': True} - feature_source = {f"{resources}/discontinuous.gff3": ["Name"]} + feature_source = {f"{resources}/gff/discontinuous.gff3": ["Name"]} feature_selector = self.selector_with_template(helpers.rules_template) grandparent_iv = HTSeq.GenomicInterval('I', 0, 10, '-') @@ 
-467,7 +467,7 @@ def test_ref_tables_discontinuous_intervals(self): performed for intervals in this test.""" def test_ref_tables_discontinuous_identity_matches(self): - feature_source = {f"{resources}/discontinuous.gff3": ["Name"]} + feature_source = {f"{resources}/gff/discontinuous.gff3": ["Name"]} feature_selector = self.selector_with_template([ {'Identity': ('Class', 'NA'), 'Hierarchy': 2}, # Rule 1 {'Identity': ('Name', 'Sibling3'), 'Hierarchy': 3}, # Rule 2 @@ -511,7 +511,7 @@ def test_ref_tables_discontinuous_identity_matches(self): def test_ref_tables_source_filter(self): kwargs = {'all_features': False} - feature_source = {f"{resources}/discontinuous.gff3": ["Name"]} + feature_source = {f"{resources}/gff/discontinuous.gff3": ["Name"]} feature_selector = self.selector_with_template([{'Filter_s': "Source2Name"}]) child2_iv = HTSeq.GenomicInterval('I', 39, 50, '-') @@ -536,7 +536,7 @@ def test_ref_tables_source_filter(self): def test_ref_tables_type_filter(self): kwargs = {'all_features': False} - feature_source = {f"{resources}/discontinuous.gff3": ["Name"]} + feature_source = {f"{resources}/gff/discontinuous.gff3": ["Name"]} feature_selector = self.selector_with_template([{'Filter_t': "CDS"}]) child1_iv = HTSeq.GenomicInterval('I', 29, 40, '-') @@ -561,7 +561,7 @@ def test_ref_tables_type_filter(self): def test_ref_tables_both_filter(self): kwargs = {'all_features': False} - feature_source = {f"{resources}/discontinuous.gff3": ["Name"]} + feature_source = {f"{resources}/gff/discontinuous.gff3": ["Name"]} feature_selector = self.selector_with_template([{'Filter_s': "SourceName", 'Filter_t': "gene"}]) rt = ReferenceFeatures(feature_source, **kwargs) @@ -577,7 +577,7 @@ def test_ref_tables_both_filter(self): def test_ref_tables_tagged_match_single(self): kwargs = {'all_features': False} feat_id = "Gene:WBGene00023193" - feature_source = {f"{resources}/single.gff3": ["sequence_name"]} + feature_source = {f"{resources}/gff/single.gff3": ["sequence_name"]} feature_selector = self.selector_with_template([ {'Identity': ("ID", feat_id), 'Class': "tagged_match", 'Hierarchy': 1}, {'Identity': ("ID", feat_id), 'Class': "", 'Hierarchy': 2} @@ -604,7 +604,7 @@ def test_ref_tables_tagged_match_single(self): """Does ReferenceFeatures.get() correctly merge records for discontinuous features matching multiple tagged rules?""" def test_ref_tables_tagged_match_merging(self): - feature_source = {f"{resources}/discontinuous.gff3": ['Name']} + feature_source = {f"{resources}/gff/discontinuous.gff3": ['Name']} # All rules match the same root feature feature_selector = self.selector_with_template([ diff --git a/tests/unit_tests_validation.py b/tests/unit_tests_validation.py index 32a0845f..a17dbeef 100644 --- a/tests/unit_tests_validation.py +++ b/tests/unit_tests_validation.py @@ -6,7 +6,13 @@ from glob import glob from unittest.mock import patch, mock_open -from tiny.rna.counter.validation import GFFValidator, ReportFormatter, SamSqValidator +from tiny.rna.counter.validation import GFFValidator, ReportFormatter, AlignmentSqValidator + +resources = "./testdata/counter" + +# To run all test suites +if __name__ == '__main__': + unittest.main() class GFFValidatorTest(unittest.TestCase): @@ -104,7 +110,7 @@ def test_gff_id_validation(self): def test_ebwt_chroms(self): validator = self.make_gff_validator() - ebwt_prefix = "./testdata/counter/validation/ebwt/ram1" + ebwt_prefix = f"{resources}/validation/ebwt/ram1" # Chroms are shared validator.chrom_set = {'ram1'} @@ -122,7 +128,7 @@ def test_ebwt_chroms(self): 
def test_genome_chroms(self): validator = self.make_gff_validator() - fasta_file = "./testdata/counter/validation/genome/genome.fasta" + fasta_file = f"{resources}/validation/genome/genome.fasta" # Chroms are shared validator.chrom_set = {'chr1', 'chr2', 'chr3'} @@ -140,8 +146,8 @@ def test_genome_chroms(self): def test_alignments_heuristic(self): validator = self.make_gff_validator() - sam_files = ['./testdata/counter/identity_choice_test.sam', - './testdata/counter/single.sam'] + sam_files = [f'{resources}/sam/identity_choice_test.sam', + f'{resources}/sam/single.sam'] sam_chroms = { sam_files[0]: {'I', 'V', 'MtDNA'}, From 78a7f87099d446d9c7f70bbef3b931defbab148d Mon Sep 17 00:00:00 2001 From: Alex Tate <0xalextate@gmail.com> Date: Sat, 29 Apr 2023 11:28:55 -0700 Subject: [PATCH 08/18] Correcting load_config to use the CSVReader class' row_num attribute instead of the superclass' line_num in error catching. This is more meaningful to the user since CSV records can technically span multiple lines --- tiny/rna/counter/counter.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tiny/rna/counter/counter.py b/tiny/rna/counter/counter.py index d0bf3ab8..de4d71bd 100644 --- a/tiny/rna/counter/counter.py +++ b/tiny/rna/counter/counter.py @@ -146,7 +146,7 @@ def load_config(features_csv: str, in_pipeline: bool) -> List[dict]: # Duplicate rule entries are not allowed if rule not in rules: rules.append(rule) except Exception as e: - msg = f"Error occurred on line {sheet.line_num} of {os.path.basename(features_csv)}" + msg = f"Error occurred on line {sheet.row_num} of {os.path.basename(features_csv)}" append_to_exception(e, msg) raise From 51d265750fd12473f9c2f9c0cc3456ca63000a68 Mon Sep 17 00:00:00 2001 From: Alex Tate <0xalextate@gmail.com> Date: Sat, 29 Apr 2023 11:31:40 -0700 Subject: [PATCH 09/18] Corrections in SamplesSheet class. Validation methods already report row_num so it is redundant to add that info to AssertionErrors. 
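For context on the regex change described below: the stricter pattern accepts a
numeric scaling factor or "rpm" (case-insensitive), still treats empty cells as
valid, and no longer matches stray dot runs. A quick sketch, not part of the
patch itself:

```python
import re

# Pattern added to SamplesSheet.validate_normalization()
norm_re = r"\s*((?:\d+(?:\.\d*)?|\.\d+)|(rpm))\s*"

for value in ["10", "1.5", ".5", " RPM ", "", "...", "1.2.3"]:
    ok = bool(re.fullmatch(norm_re, value, re.IGNORECASE)) or not value
    print(f"{value!r}: {'accepted' if ok else 'rejected'}")
```

The previous pattern's character class ([\d.]+) accepted dot-only values like
"...", which is what this change rules out.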
Also adding more robust regex for matching normalization definitions --- tiny/rna/configuration.py | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/tiny/rna/configuration.py b/tiny/rna/configuration.py index 04e56ac3..fbd1e3db 100644 --- a/tiny/rna/configuration.py +++ b/tiny/rna/configuration.py @@ -692,8 +692,10 @@ def read_csv(self): if is_control: self.control_condition = group_name except Exception as e: - msg = f"Error occurred on line {self.csv.line_num} of {self.basename}" - append_to_exception(e, msg) + if not isinstance(e, AssertionError): + # Validation steps already indicate row, exception likely from something else + msg = f"Error occurred on line {self.csv.row_num} of {self.basename}" + append_to_exception(e, msg) raise self.is_compatible_df = self.validate_deseq_compatibility(reps_per_group) @@ -762,7 +764,7 @@ def validate_control_group(self, is_control: bool, group: str): .format(row_num=self.csv.row_num, selfname=self.basename) def validate_normalization(self, norm): - assert re.fullmatch(r"(\s*[\d.]+\s*)|(rpm)", norm, re.IGNORECASE) or not norm, \ + assert re.fullmatch(r"\s*((?:\d+(?:\.\d*)?|\.\d+)|(rpm))\s*", norm, re.IGNORECASE) or not norm, \ "Invalid normalization value in {selfname} (row {row_num})" \ .format(selfname=self.basename, row_num=self.csv.row_num) From 9350f131d3d24842c47ea1ee67c6e69785243626 Mon Sep 17 00:00:00 2001 From: Alex Tate <0xalextate@gmail.com> Date: Sat, 29 Apr 2023 11:33:28 -0700 Subject: [PATCH 10/18] Bringing some outdated and inconsistent section comments up to date in Paths File and Run Config --- START_HERE/paths.yml | 9 ++++++--- START_HERE/run_config.yml | 5 ++--- tests/testdata/config_files/paths.yml | 9 ++++++--- tests/testdata/config_files/run_config_template.yml | 5 ++--- tiny/templates/paths.yml | 9 ++++++--- tiny/templates/run_config_template.yml | 5 ++--- 6 files changed, 24 insertions(+), 18 deletions(-) diff --git a/START_HERE/paths.yml b/START_HERE/paths.yml index 0246ca0e..ca45c9f6 100644 --- a/START_HERE/paths.yml +++ b/START_HERE/paths.yml @@ -1,14 +1,17 @@ ############################## MAIN INPUT FILES FOR ANALYSIS ############################## # -# Edit this section to provide the path to your Samples and Features sheets. Relative and -# absolute paths are both allowed. All relative paths are relative to THIS config file. +# Relative and absolute paths are both allowed. +# All relative paths are evaluated relative to THIS config file. # # Directions: # 1. Fill out the Samples Sheet with files to process + naming scheme. [samples.csv] # 2. Fill out the Features Sheet with selection rules [features.csv] -# 3. Set samples_csv and features_csv (below) to point to these files +# 3. Set samples_csv and features_csv to point to these files # 4. Add annotation files and per-file alias preferences to gff_files (optional) # +# If using the tinyRNA workflow, additionally set ebwt and/or reference_genome_files +# in the BOWTIE-BUILD section. 
+# ######-------------------------------------------------------------------------------###### ##-- Path to Sample & Features Sheets (relative paths are relative to this config file) --## diff --git a/START_HERE/run_config.yml b/START_HERE/run_config.yml index 8d7d5bc5..260b146a 100644 --- a/START_HERE/run_config.yml +++ b/START_HERE/run_config.yml @@ -41,9 +41,8 @@ run_native: false ######------------------------- BOWTIE INDEX BUILD OPTIONS --------------------------###### # -# If you do not already have bowtie indexes, they can be built for you by setting -# run_bowtie_build (above) to true and adding your reference genome file(s) to your -# paths_config file. +# If you do not already have bowtie indexes, they can be built for you +# (see the BOWTIE-BUILD section in the Paths File) # # We have specified default parameters for small RNA data based on our own "best practices". # You can change the parameters here. diff --git a/tests/testdata/config_files/paths.yml b/tests/testdata/config_files/paths.yml index 46c14bf2..2dc39b9a 100644 --- a/tests/testdata/config_files/paths.yml +++ b/tests/testdata/config_files/paths.yml @@ -1,14 +1,17 @@ ############################## MAIN INPUT FILES FOR ANALYSIS ############################## # -# Edit this section to provide the path to your Samples and Features sheets. Relative and -# absolute paths are both allowed. All relative paths are relative to THIS config file. +# Relative and absolute paths are both allowed. +# All relative paths are evaluated relative to THIS config file. # # Directions: # 1. Fill out the Samples Sheet with files to process + naming scheme. [samples.csv] # 2. Fill out the Features Sheet with selection rules [features.csv] -# 3. Set samples_csv and features_csv (below) to point to these files +# 3. Set samples_csv and features_csv to point to these files # 4. Add annotation files and per-file alias preferences to gff_files (optional) # +# If using the tinyRNA workflow, additionally set ebwt and/or reference_genome_files +# in the BOWTIE-BUILD section. +# ######-------------------------------------------------------------------------------###### ##-- Path to Sample & Features Sheets (relative paths are relative to this config file) --## diff --git a/tests/testdata/config_files/run_config_template.yml b/tests/testdata/config_files/run_config_template.yml index e8195a36..04fda2f8 100644 --- a/tests/testdata/config_files/run_config_template.yml +++ b/tests/testdata/config_files/run_config_template.yml @@ -41,9 +41,8 @@ run_native: false ######------------------------- BOWTIE INDEX BUILD OPTIONS --------------------------###### # -# If you do not already have bowtie indexes, they can be built for you by setting -# run_bowtie_build (above) to true and adding your reference genome file(s) to your -# paths_config file. +# If you do not already have bowtie indexes, they can be built for you +# (see the BOWTIE-BUILD section in the Paths File) # # We have specified default parameters for small RNA data based on our own "best practices". # You can change the parameters here. diff --git a/tiny/templates/paths.yml b/tiny/templates/paths.yml index a05f83fc..c72eebc0 100644 --- a/tiny/templates/paths.yml +++ b/tiny/templates/paths.yml @@ -1,14 +1,17 @@ ############################## MAIN INPUT FILES FOR ANALYSIS ############################## # -# Edit this section to provide the path to your Samples and Features sheets. Relative and -# absolute paths are both allowed. All relative paths are relative to THIS config file. 
+# Relative and absolute paths are both allowed. +# All relative paths are evaluated relative to THIS config file. # # Directions: # 1. Fill out the Samples Sheet with files to process + naming scheme. [samples.csv] # 2. Fill out the Features Sheet with selection rules [features.csv] -# 3. Set samples_csv and features_csv (below) to point to these files +# 3. Set samples_csv and features_csv to point to these files # 4. Add annotation files and per-file alias preferences to gff_files (optional) # +# If using the tinyRNA workflow, additionally set ebwt and/or reference_genome_files +# in the BOWTIE-BUILD section. +# ######-------------------------------------------------------------------------------###### ##-- Path to Sample & Features Sheets (relative paths are relative to this config file) --## diff --git a/tiny/templates/run_config_template.yml b/tiny/templates/run_config_template.yml index 5eb18add..b8eb7c22 100644 --- a/tiny/templates/run_config_template.yml +++ b/tiny/templates/run_config_template.yml @@ -41,9 +41,8 @@ run_native: false ######------------------------- BOWTIE INDEX BUILD OPTIONS --------------------------###### # -# If you do not already have bowtie indexes, they can be built for you by setting -# run_bowtie_build (above) to true and adding your reference genome file(s) to your -# paths_config file. +# If you do not already have bowtie indexes, they can be built for you +# (see the BOWTIE-BUILD section in the Paths File) # # We have specified default parameters for small RNA data based on our own "best practices". # You can change the parameters here. From cb9aeb610c6696b9a2cf512e69f33fbbe6e172a8 Mon Sep 17 00:00:00 2001 From: Alex Tate <0xalextate@gmail.com> Date: Sun, 30 Apr 2023 21:15:31 -0700 Subject: [PATCH 11/18] Renaming the FASTQ/SAM Files column in Samples Sheet to Input Files --- START_HERE/samples.csv | 2 +- doc/Configuration.md | 2 +- tests/testdata/config_files/samples.csv | 2 +- tiny/rna/configuration.py | 11 ++++++++++- tiny/templates/samples.csv | 2 +- 5 files changed, 14 insertions(+), 5 deletions(-) diff --git a/START_HERE/samples.csv b/START_HERE/samples.csv index 6455c9ae..47caddd8 100755 --- a/START_HERE/samples.csv +++ b/START_HERE/samples.csv @@ -1,4 +1,4 @@ -FASTQ/SAM Files,Sample/Group Name,Replicate number,Control,Normalization +Input Files,Sample/Group Name,Replicate number,Control,Normalization ./fastq_files/cond1_rep1.fastq.gz,condition1,1,TRUE, ./fastq_files/cond1_rep2.fastq.gz,condition1,2,, ./fastq_files/cond1_rep3.fastq.gz,condition1,3,, diff --git a/doc/Configuration.md b/doc/Configuration.md index 26faf28a..7d9c663e 100644 --- a/doc/Configuration.md +++ b/doc/Configuration.md @@ -119,7 +119,7 @@ The final output directory name has three components: The `run_directory` suffix in the Paths File supports subdirectories; if provided, the final output directory will be named as indicated above, but the subdirectory structure specified in `run_directory` will be retained. 
## Samples Sheet Details -| _Column:_ | FASTQ/SAM Files | Sample/Group Name | Replicate Number | Control | Normalization | +| _Column:_ | Input Files | Sample/Group Name | Replicate Number | Control | Normalization | |-----------:|---------------------|-------------------|------------------|---------|---------------| | _Example:_ | cond1_rep1.fastq.gz | condition1 | 1 | True | RPM | diff --git a/tests/testdata/config_files/samples.csv b/tests/testdata/config_files/samples.csv index d96b524e..f4ed8522 100755 --- a/tests/testdata/config_files/samples.csv +++ b/tests/testdata/config_files/samples.csv @@ -1,4 +1,4 @@ -FASTQ/SAM Files,Sample/Group Name,Replicate Number,Control,Normalization +Input Files,Sample/Group Name,Replicate Number,Control,Normalization ../../../START_HERE/fastq_files/cond1_rep1.fastq.gz,condition1,1,TRUE, ../../../START_HERE/fastq_files/cond1_rep2.fastq.gz,condition1,2,, ../../../START_HERE/fastq_files/cond1_rep3.fastq.gz,condition1,3,, diff --git a/tiny/rna/configuration.py b/tiny/rna/configuration.py index fbd1e3db..b317fcb4 100644 --- a/tiny/rna/configuration.py +++ b/tiny/rna/configuration.py @@ -833,7 +833,7 @@ class CSVReader(csv.DictReader): "Mismatches": "Mismatch" }), "Samples Sheet": OrderedDict({ - "FASTQ/SAM Files": "File", + "Input Files": "File", "Sample/Group Name": "Group", "Replicate Number": "Replicate", "Control": "Control", @@ -960,6 +960,15 @@ def check_backward_compatibility(self, header_vals): "the new column to your Features Sheet to avoid this error." ])) + if self.doctype == "Samples Sheet": + if 'fastq/sam files' in header_vals_lc: + compat_errors.append('\n'.join([ + "It looks like you're using a Samples Sheet from an earlier version of", + 'tinyRNA. The "FASTQ/SAM files" column has been renamed to "Input Files"', + 'due to the addition of BAM file support. Please rename the column in', + "your Samples Sheet to avoid this error." + ])) + if compat_errors: raise ValueError('\n\n'.join(compat_errors)) diff --git a/tiny/templates/samples.csv b/tiny/templates/samples.csv index 32305007..9d6a8c00 100755 --- a/tiny/templates/samples.csv +++ b/tiny/templates/samples.csv @@ -1,2 +1,2 @@ -FASTQ/SAM Files,Sample/Group Name,Replicate Number,Control,Normalization +Input Files,Sample/Group Name,Replicate Number,Control,Normalization ,,,, \ No newline at end of file From 4764dd34835cbd6a86c9504b93f334134c2dd041 Mon Sep 17 00:00:00 2001 From: Alex Tate <0xalextate@gmail.com> Date: Sun, 30 Apr 2023 21:27:52 -0700 Subject: [PATCH 12/18] More mass renaming of SAM -> alignment --- START_HERE/tiny-count_TUTORIAL.md | 4 +-- tests/unit_test_helpers.py | 2 +- tests/unit_tests_features.py | 52 +++++++++++++++---------------- tests/unit_tests_hts_parsing.py | 46 +++++++++++++-------------- tests/unit_tests_validation.py | 30 +++++++++--------- tiny/rna/counter/counter.py | 4 +-- tiny/rna/counter/features.py | 2 +- tiny/rna/counter/validation.py | 10 +++--- 8 files changed, 75 insertions(+), 75 deletions(-) diff --git a/START_HERE/tiny-count_TUTORIAL.md b/START_HERE/tiny-count_TUTORIAL.md index 1a8b6072..1fae5f30 100644 --- a/START_HERE/tiny-count_TUTORIAL.md +++ b/START_HERE/tiny-count_TUTORIAL.md @@ -11,7 +11,7 @@ Alternatively, if you have already installed tinyRNA, you can use the `tiny-coun ## Your Data Files Gather the following files for the analysis: -1. **SAM files** containing small RNA reads aligned to a reference genome, one file per sample +1. **SAM or BAM files** containing small RNA reads aligned to a reference genome, one file per sample 2. 
**GFF3 or GFF2/GTF file(s)** containing annotations for features that you want to assign reads to ## Configuration Files @@ -24,7 +24,7 @@ tiny-count --get-templates Next, fill out the configuration files that were copied: ### 1. The Samples Sheet (samples.csv) -Edit this file to add the paths to your SAM files, and to define the group name, replicate number, etc. for each sample. +Edit this file to add the paths to your SAM or BAM files, and to define the group name, replicate number, etc. for each sample. ### 2. The Paths File (paths.yml) Edit this file to add the paths to your GFF annotation(s) under the `gff_files` key. You can leave the `alias` key as-is for now. All other keys in this file are used in the tinyRNA workflow. diff --git a/tests/unit_test_helpers.py b/tests/unit_test_helpers.py index 0b6aaf45..b750d10e 100644 --- a/tests/unit_test_helpers.py +++ b/tests/unit_test_helpers.py @@ -131,7 +131,7 @@ def get_dir_checksum_tree(root_path: str) -> dict: return dir_tree -def make_parsed_sam_record(Name="0_count=1", Seq="CAAGACAGAGCTTCACCGTTC", Chrom='I', Start=15064570, Strand=True, NM=0): +def make_parsed_alignment(Name="0_count=1", Seq="CAAGACAGAGCTTCACCGTTC", Chrom='I', Start=15064570, Strand=True, NM=0): return { "Name": Name, "Length": len(Seq), diff --git a/tests/unit_tests_features.py b/tests/unit_tests_features.py index ce752dbd..ed7ef9e9 100644 --- a/tests/unit_tests_features.py +++ b/tests/unit_tests_features.py @@ -3,7 +3,7 @@ from unittest.mock import patch, call from tiny.rna.counter.features import * -from unit_test_helpers import read, make_parsed_sam_record, rules_template, strand_to_bool +from unit_test_helpers import read, make_parsed_alignment, rules_template, strand_to_bool resources = "./testdata/counter" @@ -115,7 +115,7 @@ def test_assign_features_no_match(self): htsgas[iv_none] += "Should not match" # Create mock SAM alignment with non-overlapping interval - sam_aln = make_parsed_sam_record(**{'Start': 2, 'Chrom': chrom, 'Strand': strand_to_bool(strand)}) + sam_aln = make_parsed_alignment(**{'Start': 2, 'Chrom': chrom, 'Strand': strand_to_bool(strand)}) """ iv_none: 0 |--| 2 @@ -143,7 +143,7 @@ def test_assign_features_single_base_overlap(self): htsgas[iv_none] += "Non-overlapping feature" # Create mock SAM alignment which overlaps iv_olap by one base - sam_aln = make_parsed_sam_record(**{'Chrom': chrom, 'Strand': strand_to_bool(strand), 'Start': 0, 'Seq': 'AT'}) + sam_aln = make_parsed_alignment(**{'Chrom': chrom, 'Strand': strand_to_bool(strand), 'Start': 0, 'Seq': 'AT'}) """ iv_none: 2 |-| 3 @@ -170,7 +170,7 @@ def test_count_reads_generic(self, mock): mock_bundle = ["mock_alignment"] mock_read_count = 1 - instance.sam_reader.bundle_multi_alignments.return_value = iter([(mock_bundle, mock_read_count)]) + instance.alignment_reader.bundle_multi_alignments.return_value = iter([(mock_bundle, mock_read_count)]) instance.assign_features.return_value = ({"mock_feat"}, 1) expected_calls_to_stats = [ @@ -195,11 +195,11 @@ def test_feature_selector_nested_interval(self): feat, fs = self.make_feature_for_interval_test(iv_rule, "Nested Overlap", chrom, strand, start, stop) aln_base = {'Seq': 'ATGC', 'Chrom': chrom, 'Strand': strand_to_bool(strand)} - aln_spill_lo = make_parsed_sam_record(**dict(aln_base, Start=start - 1, Name="spill")) - aln_spill_hi = make_parsed_sam_record(**dict(aln_base, Start=start + 2, Name="spill")) - aln_contained = make_parsed_sam_record(**dict(aln_base, Seq="N", Start=7, Name="contained")) - aln_contained_lo = 
make_parsed_sam_record(**dict(aln_base, Start=start, Name="contained")) - aln_contained_hi = make_parsed_sam_record(**dict(aln_base, Start=start + 1, Name="contained")) + aln_spill_lo = make_parsed_alignment(**dict(aln_base, Start=start - 1, Name="spill")) + aln_spill_hi = make_parsed_alignment(**dict(aln_base, Start=start + 2, Name="spill")) + aln_contained = make_parsed_alignment(**dict(aln_base, Seq="N", Start=7, Name="contained")) + aln_contained_lo = make_parsed_alignment(**dict(aln_base, Start=start, Name="contained")) + aln_contained_hi = make_parsed_alignment(**dict(aln_base, Start=start + 1, Name="contained")) """ aln: |ATGC| @@ -225,10 +225,10 @@ def test_feature_selector_partial_interval(self): feat, fs = self.make_feature_for_interval_test(iv_rule, "Partial Overlap", chrom, strand, start, stop) aln_base = {'Seq': 'ATGC', 'Chrom': chrom, 'Strand': strand_to_bool(strand)} - aln_spill_lo = make_parsed_sam_record(**dict(aln_base, Start=start - 1, Name="spill")) - aln_spill_hi = make_parsed_sam_record(**dict(aln_base, Start=start + 2, Name="spill")) - aln_contained_lo = make_parsed_sam_record(**dict(aln_base, Start=start, Name="contained")) - aln_contained_hi = make_parsed_sam_record(**dict(aln_base, Start=start + 1, Name="contained")) + aln_spill_lo = make_parsed_alignment(**dict(aln_base, Start=start - 1, Name="spill")) + aln_spill_hi = make_parsed_alignment(**dict(aln_base, Start=start + 2, Name="spill")) + aln_contained_lo = make_parsed_alignment(**dict(aln_base, Start=start, Name="contained")) + aln_contained_hi = make_parsed_alignment(**dict(aln_base, Start=start + 1, Name="contained")) """ aln: |ATGC| @@ -253,11 +253,11 @@ def test_feature_selector_exact_interval(self): aln_match = {'Seq': 'ATGC', 'Chrom': chrom, 'Strand': strand_to_bool(strand)} aln_short = {'Seq': 'NNN', 'Chrom': chrom, 'Strand': strand_to_bool(strand)} - aln_exact = make_parsed_sam_record(**dict(aln_match, Start=start, Name="exact")) - aln_short_lo = make_parsed_sam_record(**dict(aln_short, Start=start, Name="match lo")) - aln_short_hi = make_parsed_sam_record(**dict(aln_short, Start=start + 1, Name="match hi")) - aln_spill_lo = make_parsed_sam_record(**dict(aln_match, Start=start - 1, Name="spill lo")) - aln_spill_hi = make_parsed_sam_record(**dict(aln_match, Start=start + 1, Name="spill hi")) + aln_exact = make_parsed_alignment(**dict(aln_match, Start=start, Name="exact")) + aln_short_lo = make_parsed_alignment(**dict(aln_short, Start=start, Name="match lo")) + aln_short_hi = make_parsed_alignment(**dict(aln_short, Start=start + 1, Name="match hi")) + aln_spill_lo = make_parsed_alignment(**dict(aln_match, Start=start - 1, Name="spill lo")) + aln_spill_hi = make_parsed_alignment(**dict(aln_match, Start=start + 1, Name="spill hi")) """ aln_match: |ATGC| @@ -300,10 +300,10 @@ def test_feature_selector_5p_interval(self): feat_A, fs = self.make_feature_for_interval_test(iv_rule, "5' Anchored Overlap (+)", chrom, '+', start, end) aln_base = {'Start': start, 'Chrom': chrom, 'Strand': True} aln = { - 'aln_none': make_parsed_sam_record(**dict(aln_base, Start=start + 1, Seq="ATGC")), - 'aln_long': make_parsed_sam_record(**dict(aln_base, Seq="ATGCNN")), - 'aln_exact': make_parsed_sam_record(**dict(aln_base, Seq="ATGCN")), - 'aln_short': make_parsed_sam_record(**dict(aln_base, Seq="ATGC")), + 'aln_none': make_parsed_alignment(**dict(aln_base, Start=start + 1, Seq="ATGC")), + 'aln_long': make_parsed_alignment(**dict(aln_base, Seq="ATGCNN")), + 'aln_exact': make_parsed_alignment(**dict(aln_base, Seq="ATGCN")), + 
'aln_short': make_parsed_alignment(**dict(aln_base, Seq="ATGC")), } self.assertEqual(fs.choose(feat_A, aln['aln_none']), {}) @@ -347,10 +347,10 @@ def test_feature_selector_3p_interval(self): feat_A, fs = self.make_feature_for_interval_test(iv_rule, "3' Anchored Overlap (+)", chrom, '+', start, end) aln_base = {'Start': start, 'Chrom': chrom, 'Strand': True} aln = { - 'aln_none': make_parsed_sam_record(**dict(aln_base, Seq="ATGC")), - 'aln_long': make_parsed_sam_record(**dict(aln_base, Start=start - 1, Seq="ATGCNN")), - 'aln_exact': make_parsed_sam_record(**dict(aln_base, Seq="ATGCN")), - 'aln_short': make_parsed_sam_record(**dict(aln_base, Start=start + 1, Seq="ATGC")), + 'aln_none': make_parsed_alignment(**dict(aln_base, Seq="ATGC")), + 'aln_long': make_parsed_alignment(**dict(aln_base, Start=start - 1, Seq="ATGCNN")), + 'aln_exact': make_parsed_alignment(**dict(aln_base, Seq="ATGCN")), + 'aln_short': make_parsed_alignment(**dict(aln_base, Start=start + 1, Seq="ATGC")), } self.assertEqual(fs.choose(feat_A, aln['aln_none']), {}) diff --git a/tests/unit_tests_hts_parsing.py b/tests/unit_tests_hts_parsing.py index b9b7eac3..b8898497 100644 --- a/tests/unit_tests_hts_parsing.py +++ b/tests/unit_tests_hts_parsing.py @@ -22,7 +22,7 @@ unittest.main() -class SamReaderTests(unittest.TestCase): +class AlignmentReaderTests(unittest.TestCase): @classmethod def setUpClass(self): self.sam_file = f"{resources}/sam/identity_choice_test.sam" @@ -35,10 +35,10 @@ def exhaust_iterator(it): # === TESTS === - """Did SAM_reader correctly skip header values and parse all pertinent info from a single record SAM file?""" + """Did AlignmentReader correctly skip header values and parse all pertinent info from a single record SAM file?""" - def test_sam_reader(self): - sam_bundle, read_count = next(SAM_reader().bundle_multi_alignments(self.short_sam_file)) + def test_AlignmentReader_single_sam(self): + sam_bundle, read_count = next(AlignmentReader().bundle_multi_alignments(self.short_sam_file)) sam_record = sam_bundle[0] self.assertEqual(sam_record['Chrom'], "I") @@ -79,21 +79,21 @@ def test_sam_parser_comparison(self): """Does SAM_reader._get_decollapsed_filename() create an appropriate filename?""" - def test_SAM_reader_get_decollapsed_filename(self): - reader = SAM_reader() + def test_AlignmentReader_get_decollapsed_filename(self): + reader = AlignmentReader() reader.file = "~/path/to/input/sam_file.sam" sam_out = reader._get_decollapsed_filename() self.assertEqual(sam_out, "sam_file_decollapsed.sam") - """Does SAM_reader._new_bundle report the correct read count for different collapser types?""" + """Does AlignmentReader._new_bundle report the correct read count for different collapser types?""" - def test_SAM_reader_new_bundle(self): + def test_AlignmentReader_new_bundle(self): qnames = ["0_count=3", "seq0_x5", "non-collapsed"] counts = [3, 5, 1] - reader = SAM_reader() + reader = AlignmentReader() reader._header_dict = {'HD': {'SO': 'queryname'}} for qname, expected in zip(qnames, counts): @@ -101,10 +101,10 @@ def test_SAM_reader_new_bundle(self): _, read_count = reader._new_bundle({'Name': qname}) self.assertEqual(read_count, expected) - """Does SAM_reader._gather_metadata() correctly identify metadata and write the decollapsed file header?""" + """Does AlignmentReader._gather_metadata() correctly identify metadata and write the decollapsed file header?""" - def test_SAM_reader_gather_metadata(self): - reader = SAM_reader(decollapse=True) + def test_AlignmentReader_gather_metadata(self): + reader = 
AlignmentReader(decollapse=True) reader._decollapsed_filename = "mock_outfile_name.sam" sam_in = pysam.AlignmentFile(self.short_sam_file) @@ -124,14 +124,14 @@ def test_SAM_reader_gather_metadata(self): self.assertEqual(reader.references, ('I',)) self.assertTrue(reader.has_nm) - """Does SAM_reader._write_decollapsed_sam() write the correct number of duplicates to the decollapsed file?""" + """Does AlignmentReader._write_decollapsed_sam() write the correct number of duplicates to the decollapsed file?""" - def test_SAM_reader_write_decollapsed_sam(self): + def test_AlignmentReader_write_decollapsed_sam(self): header = pysam.AlignmentHeader() alignment = pysam.AlignedSegment(header) alignment.query_name = "0_count=5" - reader = SAM_reader(decollapse=True) + reader = AlignmentReader(decollapse=True) reader.collapser_type = "tiny-collapse" reader._collapser_token = "=" reader._decollapsed_reads = [alignment] @@ -150,12 +150,12 @@ def test_SAM_reader_write_decollapsed_sam(self): outfile.assert_has_calls(expected_writelines) self.assertTrue(len(reader._decollapsed_reads) == 0) - """Does SAM_reader._parse_alignments() save lines and write them to the decollapsed file when appropriate?""" + """Does AlignmentReader._parse_alignments() save lines and write them to the decollapsed file when appropriate?""" - def test_SAM_reader_parse_alignments_decollapse(self): - with patch.object(SAM_reader, "_write_decollapsed_sam") as write_fn: - # Set up SAM_reader class - reader = SAM_reader(decollapse=True) + def test_AlignmentReader_parse_alignments_decollapse(self): + with patch.object(AlignmentReader, "_write_decollapsed_sam") as write_fn: + # Set up AlignmentReader class + reader = AlignmentReader(decollapse=True) reader.collapser_type = "tiny-collapse" reader._decollapsed_reads = buffer = [0] * 99999 # At 100,001, expect buffer to be written reader.file = self.short_sam_file # File with single alignment @@ -178,10 +178,10 @@ def test_SAM_reader_parse_alignments_decollapse(self): """Are decollapsed outputs skipped when non-collapsed SAM files are supplied?""" - def test_SAM_reader_no_decollapse_non_collapsed_SAM_files(self): + def test_AlignmentReader_no_decollapse_non_collapsed_SAM_files(self): stdout_capture = io.StringIO() - with patch.object(SAM_reader, "_write_decollapsed_sam") as write_sam, \ - patch.object(SAM_reader, "_write_header_for_decollapsed_sam") as write_header: + with patch.object(AlignmentReader, "_write_decollapsed_sam") as write_sam, \ + patch.object(AlignmentReader, "_write_header_for_decollapsed_sam") as write_header: with contextlib.redirect_stderr(stdout_capture): reader = AlignmentReader(decollapse=True) records = reader.bundle_multi_alignments(f"{resources}/sam/non-collapsed.sam") diff --git a/tests/unit_tests_validation.py b/tests/unit_tests_validation.py index a17dbeef..807f4c28 100644 --- a/tests/unit_tests_validation.py +++ b/tests/unit_tests_validation.py @@ -237,7 +237,7 @@ def test_chrom_heuristics_report_output(self): exp_warnings = '\n'.join([ "GFF files and sequence files might not contain the same chromosome identifiers.", "This is determined from a subset of each sequence file, so false positives may be reported.", - "\t" + f"{validator.targets['sam files']}: ", + "\t" + f"{validator.targets['alignment files']}: ", "\t\t" + "sam1: ", "\t\t\t" + f"Chromosomes sampled: {', '.join(sorted(suspect_files['sam1']))}", "\t\t" + "sam2: ", @@ -304,7 +304,7 @@ def test_chrom_heuristics_runtime(self): class SamSqValidatorTest(unittest.TestCase): @classmethod def setUpClass(self): 
- self.syntax_header = "Every SAM file must have complete @SQ headers with SN and LN\n" \ + self.syntax_header = "Every alignment file must have complete @SQ headers with SN and LN\n" \ "fields when performing sequence-based counting.\n" self.identifier_header = "Sequence identifiers must be unique and have consistent length definitions.\n" @@ -333,12 +333,12 @@ def test_read_sq_headers(self): ], } - validator = SamSqValidator([sam_file]) + validator = AlignmentSqValidator([sam_file]) validator.read_sq_headers() self.assertDictEqual(validator.sq_headers, expected) - """Does SamSqValidator correctly identify missing @SQ headers?""" + """Does AlignmentSqValidator correctly identify missing @SQ headers?""" def test_missing_sq_headers(self): mock_files = ['sam1', 'sam2'] @@ -349,18 +349,18 @@ def test_missing_sq_headers(self): expected = '\n'.join([ self.syntax_header, - "\t" + f"{SamSqValidator.targets['missing sq']}: ", + "\t" + f"{AlignmentSqValidator.targets['missing sq']}: ", "\t\t" + mock_files[1] ]) - validator = SamSqValidator(mock_files) + validator = AlignmentSqValidator(mock_files) validator.sq_headers = mock_sam_headers validator.validate_sq_headers() self.assertListEqual(validator.report.errors, [expected]) self.assertListEqual(validator.report.warnings, []) - """Does SamSqValidator correctly identify incomplete @SQ headers?""" + """Does AlignmentSqValidator correctly identify incomplete @SQ headers?""" def test_incomplete_sq_headers(self): mock_files = ['sam1', 'sam2', 'sam3'] @@ -372,19 +372,19 @@ def test_incomplete_sq_headers(self): expected = '\n'.join([ self.syntax_header, - "\t" + f"{SamSqValidator.targets['incomplete sq']}: ", + "\t" + f"{AlignmentSqValidator.targets['incomplete sq']}: ", "\t\t" + mock_files[1], "\t\t" + mock_files[2], ]) - validator = SamSqValidator(mock_files) + validator = AlignmentSqValidator(mock_files) validator.sq_headers = mock_sam_headers validator.validate_sq_headers() self.assertListEqual(validator.report.errors, [expected]) self.assertListEqual(validator.report.warnings, []) - """Does SamSqValidator correctly identify duplicate identifiers?""" + """Does AlignmentSqValidator correctly identify duplicate identifiers?""" def test_duplicate_identifiers(self): mock_files = ['sam1', 'sam2', 'sam3'] @@ -396,18 +396,18 @@ def test_duplicate_identifiers(self): expected = '\n'.join([ self.identifier_header, - "\t" + f"{SamSqValidator.targets['intra sq']}: ", + "\t" + f"{AlignmentSqValidator.targets['intra sq']}: ", "\t\t" + mock_files[2], ]) - validator = SamSqValidator(mock_files) + validator = AlignmentSqValidator(mock_files) validator.sq_headers = mock_sam_headers validator.validate_sq_headers() self.assertListEqual(validator.report.errors, [expected]) self.assertListEqual(validator.report.warnings, []) - """Does SamSqValidator correctly identify identifiers with inconsistent length definitions?""" + """Does AlignmentSqValidator correctly identify identifiers with inconsistent length definitions?""" def test_inconsistent_identifier_length(self): mock_files = ['sam1', 'sam2', 'sam3'] @@ -419,12 +419,12 @@ def test_inconsistent_identifier_length(self): expected = '\n'.join([ self.identifier_header, - "\t" + f"{SamSqValidator.targets['inter sq']}: ", + "\t" + f"{AlignmentSqValidator.targets['inter sq']}: ", "\t\t" + 'chr1', "\t\t" + 'chr2', ]) - validator = SamSqValidator(mock_files) + validator = AlignmentSqValidator(mock_files) validator.sq_headers = mock_sam_headers validator.validate_sq_headers() diff --git a/tiny/rna/counter/counter.py 
b/tiny/rna/counter/counter.py index de4d71bd..5e577cdc 100644 --- a/tiny/rna/counter/counter.py +++ b/tiny/rna/counter/counter.py @@ -154,7 +154,7 @@ def load_config(features_csv: str, in_pipeline: bool) -> List[dict]: def load_references(paths: PathsFile, libraries: List[dict], rules: List[dict], prefs) -> ReferenceBase: - """Determines the reference source (GFF or SAM @SQ headers) and constructs the appropriate object + """Determines the reference source (GFF or alignment @SQ headers) and constructs the appropriate object Args: paths: a PathsFile object that optionally contains a `gff_files` configuration @@ -191,7 +191,7 @@ def map_and_reduce(libraries, paths, prefs): # MergedStatsManager handles final output files, regardless of multiprocessing status summary = MergedStatsManager(Features, paths['features_csv'], prefs) - # Use a multiprocessing pool if multiple sam files were provided + # Use a multiprocessing pool if multiple alignment files were provided if len(libraries) > 1: mp.set_start_method("fork") with mp.Pool(len(libraries)) as pool: diff --git a/tiny/rna/counter/features.py b/tiny/rna/counter/features.py index 1ce7be63..8eca57dc 100644 --- a/tiny/rna/counter/features.py +++ b/tiny/rna/counter/features.py @@ -51,7 +51,7 @@ def count_reads(self, library: dict): read_seq = self.alignment_reader.bundle_multi_alignments(library["File"]) self.stats.assign_library(library) - # For each sequence in the sam file... + # For each sequence in the alignment file... for bundle, read_count in read_seq: bstat = self.stats.count_bundle(bundle, read_count) diff --git a/tiny/rna/counter/validation.py b/tiny/rna/counter/validation.py index 005cc5e5..f53a4049 100644 --- a/tiny/rna/counter/validation.py +++ b/tiny/rna/counter/validation.py @@ -171,10 +171,10 @@ def chroms_shared_with_genomes(self, genome_fastas): def alignment_chroms_mismatch_heuristic(self, sam_files: List[str], subset_size=50000) -> Dict[str, set]: """Since alignment files can be very large, we only check that there's at least one shared chromosome identifier and only the first subset_size lines are read from each file. The - returned dictionary contains only the SAM files whose sampled chromosome set failed to + returned dictionary contains only the alignment files whose sampled chromosome set failed to intersect with the chromosomes parsed from GFF files. 
Returns: - a dictionary of {SAM filename: SAM chromosomes sampled}""" + a dictionary of {alignment filename: alignment chromosomes sampled}""" files_wo_overlap = {} @@ -296,7 +296,7 @@ def get_ambiguous_lengths(self) -> List[str]: return bad_seqs def get_duplicate_identifiers(self) -> Dict[str, List[str]]: - """Returns a dictionary of SAM files that contain duplicate sequence identifiers""" + """Returns a dictionary of alignment files that contain duplicate sequence identifiers""" bad_files = {} for file in self.alignment_files: @@ -308,13 +308,13 @@ def get_duplicate_identifiers(self) -> Dict[str, List[str]]: return bad_files def get_incomplete_sq_headers(self) -> List[str]: - """Returns a list of SAM files that have incomplete @SQ headers""" + """Returns a list of alignment files that have incomplete @SQ headers""" return [file for file, sqs in self.sq_headers.items() if not all("SN" in sq and "LN" in sq for sq in sqs)] def get_missing_headers(self) -> List[str]: - """Returns a list of SAM files that lack @SQ headers""" + """Returns a list of alignment files that lack @SQ headers""" return [file for file, sqs in self.sq_headers.items() if len(sqs) == 0] From 1c112f8aba45d383a3bf03a368272cdfe0780943 Mon Sep 17 00:00:00 2001 From: Alex Tate <0xalextate@gmail.com> Date: Sun, 30 Apr 2023 21:34:30 -0700 Subject: [PATCH 13/18] Updating unit tests in accordance with changes to both the SamplesSheet class and the AlignmentReader class --- tests/unit_tests_configuration.py | 170 +++++++++++++++++++++++------- tests/unit_tests_counter.py | 28 ++--- tests/unit_tests_features.py | 1 + tests/unit_tests_hts_parsing.py | 129 ++++++++++++++++++++++- 4 files changed, 263 insertions(+), 65 deletions(-) diff --git a/tests/unit_tests_configuration.py b/tests/unit_tests_configuration.py index b9da1a9a..a10c6f38 100644 --- a/tests/unit_tests_configuration.py +++ b/tests/unit_tests_configuration.py @@ -10,6 +10,14 @@ from tiny.rna.util import r_reserved_keywords +def patch_isfile(): + return patch('tiny.rna.configuration.os.path.isfile', return_value=True) + + +def patch_open(read_data): + return patch('tiny.rna.configuration.open', mock_open(read_data=read_data)) + + class BowtieIndexesTest(unittest.TestCase): @classmethod def setUpClass(self): @@ -150,37 +158,60 @@ def test_verify_bowtie_build_outputs(self): class SamplesSheetTest(unittest.TestCase): + @classmethod + def setUpClass(cls): + cls.contexts = ["Pipeline Start", "Pipeline Step", "Standalone Run"] + """Does SamplesSheet catch multi-assignment of control condition?""" def test_validate_control_group(self): - sheet = csv_factory("samples.csv", [ - {'File': '1.fastq', 'Group': 'G1', 'Replicate': '1', 'Control': True, 'Normalization': ''}, # Good - {'File': '2.fastq', 'Group': 'G1', 'Replicate': '2', 'Control': True, 'Normalization': ''}, # Good - {'File': '3.fastq', 'Group': 'G2', 'Replicate': '1', 'Control': True, 'Normalization': ''} # Bad - ]) # ^^^ - - exp_contains = r".*(multiple control conditions).*" - with self.assertRaisesRegex(AssertionError, exp_contains), \ - patch('tiny.rna.configuration.open', mock_open(read_data=sheet)), \ - patch('tiny.rna.configuration.os.path.isfile', return_value=True): - SamplesSheet('mock_filename') + rows = [ + {'File': '1', 'Group': 'G1', 'Replicate': '1', 'Control': True, 'Normalization': ''}, # Good + {'File': '2', 'Group': 'G1', 'Replicate': '2', 'Control': True, 'Normalization': ''}, # Good + {'File': '3', 'Group': 'G2', 'Replicate': '1', 'Control': True, 'Normalization': ''} # Bad + ] # ^^^^ ^^^^ + + 
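        # The rows above are materialized with context-appropriate extensions below:
        # .fastq sheets exercise the "Pipeline Start" context, while .sam sheets
        # stand in for "Pipeline Step" and "Standalone Run".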
start_sheet = csv_factory("samples.csv", [dict(r, File=f"{i}.fastq") for i, r in enumerate(rows)]) + step_sheet = csv_factory("samples.csv", [dict(r, File=f"{i}.sam") for i, r in enumerate(rows)]) + run_sheet = step_sheet + + with patch_isfile(): + # Control condition should not be evaluated in Pipeline Step or Standalone Run context + with patch_open(step_sheet): + SamplesSheet('mock_filename', context="Pipeline Step") + with patch_open(run_sheet): + SamplesSheet('mock_filename', context="Standalone Run") + + # Control condition should be evaluated in Pipeline Start context + exp_contains = r".*(multiple control conditions).*" + with self.assertRaisesRegex(AssertionError, exp_contains), patch_open(start_sheet): + SamplesSheet('mock_filename', context="Pipeline Start") """Does SamplesSheet catch duplicate entries for the same group and rep?""" + def test_validate_group_rep(self): - sheet = csv_factory("samples.csv", [ - {'File': '1.fastq', 'Group': 'G1', 'Replicate': '1', 'Control': True, 'Normalization': ''}, # Good - {'File': '2.fastq', 'Group': 'G1', 'Replicate': '2', 'Control': True, 'Normalization': ''}, # Good - {'File': '3.fastq', 'Group': 'G1', 'Replicate': '2', 'Control': True, 'Normalization': ''} # Bad - ]) # ^^^ ^^^ + rows = [ + {'File': '1', 'Group': 'G1', 'Replicate': '1', 'Control': True, 'Normalization': ''}, # Good + {'File': '2', 'Group': 'G1', 'Replicate': '2', 'Control': True, 'Normalization': ''}, # Good + {'File': '3', 'Group': 'G1', 'Replicate': '2', 'Control': True, 'Normalization': ''} # Bad + ] # ^^^^ ^^^ + + start_sheet = csv_factory("samples.csv", [dict(r, File=f"{i}.fastq") for i, r in enumerate(rows)]) + step_sheet = csv_factory("samples.csv", [dict(r, File=f"{i}.sam") for i, r in enumerate(rows)]) + run_sheet = step_sheet + contexts = list(zip(self.contexts, [start_sheet, step_sheet, run_sheet])) exp_contains = r".*(same group and replicate).*" - with self.assertRaisesRegex(AssertionError, exp_contains), \ - patch('tiny.rna.configuration.open', mock_open(read_data=sheet)), \ - patch('tiny.rna.configuration.os.path.isfile', return_value=True): - SamplesSheet('mock_filename') - """Does SamplesSheet catch fastq files that don't exist, have a bad file extension, or are listed more than once?""" + # Validation should take place in all contexts + for context, sheet in contexts: + with self.assertRaisesRegex(AssertionError, exp_contains), patch_open(sheet), patch_isfile(): + SamplesSheet('mock_filename', context=context) + + """Does SamplesSheet catch bad fastq entries in Pipeline Start context?""" + def test_validate_fastq_filepath(self): + context = "Pipeline Start" csv_rows = [ {'File': '1.fastq', 'Group': 'G1', 'Replicate': '1', 'Control': True, 'Normalization': ''}, # Good {'File': '1.fastq', 'Group': 'G1', 'Replicate': '2', 'Control': True, 'Normalization': ''}, # Bad @@ -189,28 +220,50 @@ def test_validate_fastq_filepath(self): sheet = csv_factory("samples.csv", csv_rows) # File doesn't exist - exp_contains = r".*(was not found).*" - with self.assertRaisesRegex(AssertionError, exp_contains), \ - patch('tiny.rna.configuration.open', mock_open(read_data=sheet)): - SamplesSheet('mock_filename') + exp_contains = r".*(fastq file on row 1 of mock_filename was not found).*" + with self.assertRaisesRegex(AssertionError, exp_contains), patch_open(sheet): + SamplesSheet('mock_filename', context=context) # Duplicate filename - exp_contains = r".*(listed more than once).*" - with self.assertRaisesRegex(AssertionError, exp_contains), \ - 
patch('tiny.rna.configuration.open', mock_open(read_data=sheet)), \ - patch('tiny.rna.configuration.os.path.isfile', return_value=True): - SamplesSheet('mock_filename') + exp_contains = r".*(listed more than once).*\(row 2\)" + with self.assertRaisesRegex(AssertionError, exp_contains), patch_open(sheet), patch_isfile(): + SamplesSheet('mock_filename', context=context) # Bad file extension - exp_contains = r".*(\.fastq\(\.gz\) extension).*" - csv_rows.pop(0) + sheet = csv_factory("samples.csv", csv_rows[1:]) # remove duplicate entry + exp_contains = r".*(\.fastq\(\.gz\) extension).*\(row 2\)" + with self.assertRaisesRegex(AssertionError, exp_contains), patch_open(sheet), patch_isfile(): + SamplesSheet('mock_filename', context=context) + + """Does SamplesSheet catch bad alignment file entries in Standalone Run context?""" + + def test_validate_alignments_filepath(self): + context = "Standalone Run" + csv_rows = [ + {'File': '1.sam', 'Group': 'G1', 'Replicate': '1', 'Control': True, 'Normalization': ''}, # Good + {'File': '1.sam', 'Group': 'G1', 'Replicate': '2', 'Control': True, 'Normalization': ''}, # Good + {'File': '1.bam', 'Group': 'G1', 'Replicate': '3', 'Control': True, 'Normalization': ''}, # Bad + {'File': '2.xyz', 'Group': 'G2', 'Replicate': '1', 'Control': True, 'Normalization': ''} # Bad + ] # ^^^^^^^ sheet = csv_factory("samples.csv", csv_rows) - with self.assertRaisesRegex(AssertionError, exp_contains), \ - patch('tiny.rna.configuration.open', mock_open(read_data=sheet)), \ - patch('tiny.rna.configuration.os.path.isfile', return_value=True): - SamplesSheet('mock_filename') - """Does validate_r_safe_sample_groups detect group names that will cause namespace collisions in R?""" + # File doesn't exist + exp_contains = r".*(file on row 1 of mock_filename was not found).*" + with self.assertRaisesRegex(AssertionError, exp_contains), patch_open(sheet): + SamplesSheet('mock_filename', context=context) + + # Duplicate filename + exp_contains = r".*(listed more than once).*\(row 2\)" + with self.assertRaisesRegex(AssertionError, exp_contains), patch_open(sheet), patch_isfile(): + SamplesSheet('mock_filename', context=context) + + # Bad file extension + sheet = csv_factory("samples.csv", csv_rows[1:]) # remove duplicate entry + exp_contains = r".*(\.sam or \.bam extension).*\(row 3\)" + with self.assertRaisesRegex(AssertionError, exp_contains), patch_open(sheet), patch_isfile(): + SamplesSheet('mock_filename', context=context) + + """Does validate_r_safe_sample_groups() detect group names that will cause namespace collisions in R?""" def test_validate_r_safe_sample_groups(self): non_alphanum_chars = [bad.join(('a', 'b')) for bad in "~!@#$%^&*()+-=`<>?/,:;\"'[]{}\| \t\n\r\f\v"] @@ -227,6 +280,45 @@ def test_validate_r_safe_sample_groups(self): with self.assertRaisesRegex(AssertionError, msg): SamplesSheet.validate_r_safe_sample_groups(dict.fromkeys(bad)) + """Does validate_normalization() do what it should?""" + + def test_validate_normalization(self): + good = [ + {'File': '1.fastq', 'Group': 'G1', 'Replicate': '1', 'Control': False, 'Normalization': 'rpm'}, # Good + {'File': '2.fastq', 'Group': 'G1', 'Replicate': '2', 'Control': False, 'Normalization': 'RPM'}, # Good + {'File': '3.fastq', 'Group': 'G1', 'Replicate': '3', 'Control': False, 'Normalization': ' RPM '}, # Good + {'File': '4.fasta', 'Group': 'G2', 'Replicate': '1', 'Control': False, 'Normalization': ' 1'}, # Good + {'File': '5.fasta', 'Group': 'G2', 'Replicate': '2', 'Control': False, 'Normalization': '1.1'}, # Good + {'File': 
'6.fasta', 'Group': 'G2', 'Replicate': '3', 'Control': False, 'Normalization': ''}, # Good + ] # ^^^^ + + start_sheet = csv_factory("samples.csv", [dict(r, File=f"{i}.fastq") for i, r in enumerate(good)]) + step_sheet = csv_factory("samples.csv", [dict(r, File=f"{i}.sam") for i, r in enumerate(good)]) + run_sheet = step_sheet + + # These SHOULD NOT throw an error + for context, sheet in zip(self.contexts, [start_sheet, step_sheet, run_sheet]): + with patch_open(sheet), patch_isfile(): + SamplesSheet("mock_filename", context=context) + + bad = [ + {'File': '1.fastq', 'Group': 'G1', 'Replicate': '1', 'Control': False, 'Normalization': 'abc'}, # Bad + {'File': '2.fastq', 'Group': 'G1', 'Replicate': '2', 'Control': False, 'Normalization': '123.rpm'}, # Bad + {'File': '3.fastq', 'Group': 'G1', 'Replicate': '3', 'Control': False, 'Normalization': '.'}, # Bad + {'File': '1.fastq', 'Group': 'G2', 'Replicate': '1', 'Control': False, 'Normalization': '_'}, # Bad + ] + + start_sheet = csv_factory("samples.csv", [dict(r, File=f"{i}.fastq") for i, r in enumerate(bad)]) + step_sheet = csv_factory("samples.csv", [dict(r, File=f"{i}.sam") for i, r in enumerate(bad)]) + run_sheet = step_sheet + + exp_contains = r".*(Invalid normalization value).*" + + # These SHOULD throw an error + for context, sheet in zip(self.contexts, [start_sheet, step_sheet, run_sheet]): + with self.assertRaisesRegex(AssertionError, exp_contains), patch_open(sheet), patch_isfile(): + SamplesSheet("mock_filename", context=context) + class PathsFileTest(unittest.TestCase): @@ -344,11 +436,11 @@ def test_backward_incompatibility(self): with self.assertRaisesRegex(AssertionError, r".*(check the release notes).*"): config.check_backward_compatibility() - """Does PathsFile map all paths to the working directory when in_pipeline=True?""" + """Does PathsFile map all paths to the working directory when context=True?""" def test_pipeline_mapping(self): # There are entries in the template Paths File that will cause FileNotFound - # when in_pipeline=True (they're not in the template directory). Since + # when context=True (they're not in the template directory). Since # file existence isn't relevant for this test, we patch that out. 
with patch('tiny.rna.configuration.os.path.isfile', return_value=True): config = make_paths_file(in_pipeline=True) diff --git a/tests/unit_tests_counter.py b/tests/unit_tests_counter.py index ad83ee01..9702e2b0 100644 --- a/tests/unit_tests_counter.py +++ b/tests/unit_tests_counter.py @@ -7,7 +7,7 @@ import unit_test_helpers as helpers import tiny.rna.counter.counter as counter -from tiny.rna.util import from_here +from tiny.rna.configuration import ConfigBase resources = "./testdata/counter" @@ -96,13 +96,13 @@ def get_loaded_samples_row(self, row, exp_file): def test_load_samples_single_cmd(self): mock_samp_sheet_path = '/dev/null' inp_file = "test.fastq" - exp_file = from_here(mock_samp_sheet_path, "test_aligned_seqs.sam") + exp_file = ConfigBase.joinpath(mock_samp_sheet_path, "test_aligned_seqs.sam") row = dict(self.csv_samp_row_dict, **{'File': inp_file}) csv = self.csv("samples.csv", [row]) with patch('tiny.rna.configuration.open', mock_open(read_data=csv)): - inputs_step = counter.load_samples(mock_samp_sheet_path, is_pipeline=False) + inputs_step = counter.load_samples(mock_samp_sheet_path, in_pipeline=False) expected_result = self.get_loaded_samples_row(row, exp_file) self.assertEqual(inputs_step, expected_result) @@ -118,7 +118,7 @@ def test_load_samples_single_pipeline(self): csv = self.csv("samples.csv", [row]) with patch('tiny.rna.configuration.open', mock_open(read_data=csv)): - inputs_pipeline = counter.load_samples(mock_samp_sheet_path, is_pipeline=True) + inputs_pipeline = counter.load_samples(mock_samp_sheet_path, in_pipeline=True) expected_result = self.get_loaded_samples_row(row, exp_file) self.assertEqual(inputs_pipeline, expected_result) @@ -144,25 +144,11 @@ def test_load_samples_sam(self): with patch('tiny.rna.configuration.open', mock_open(read_data=csv)): dummy_file = '/dev/null' - inputs = counter.load_samples(dummy_file, is_pipeline=False) + inputs = counter.load_samples(dummy_file, in_pipeline=False) expected_result = self.get_loaded_samples_row(row, sam_filename) self.assertEqual(inputs, expected_result) - """Does load_samples throw ValueError if a non-absolute path to a SAM file is provided?""" - - def test_load_samples_nonabs_path(self): - bad = "./dne.sam" - row = dict(self.csv_samp_row_dict, **{'File': bad}) - csv = self.csv("samples.csv", [row]) - - expected_error = "The following file must be expressed as an absolute path:\n" + bad - - with patch('tiny.rna.configuration.open', mock_open(read_data=csv)): - with self.assertRaisesRegex(ValueError, expected_error): - dummy_file = '/dev/null' - counter.load_samples(dummy_file, False) - """Does load_samples throw ValueError if sample filename does not have a .fastq or .sam extension?""" def test_load_samples_bad_extension(self): @@ -187,7 +173,7 @@ def test_load_config_single_cmd(self): with patch('tiny.rna.configuration.open', mock_open(read_data=csv)): dummy_file = '/dev/null' - ruleset = counter.load_config(dummy_file, is_pipeline=False) + ruleset = counter.load_config(dummy_file, in_pipeline=False) expected_ruleset = self.parsed_feat_rule self.assertEqual(ruleset, expected_ruleset) @@ -201,7 +187,7 @@ def test_load_config_single_pipeline(self): with patch('tiny.rna.configuration.open', mock_open(read_data=csv)): dummy_file = '/dev/null' - ruleset = counter.load_config(dummy_file, is_pipeline=True) + ruleset = counter.load_config(dummy_file, in_pipeline=True) expected_ruleset = self.parsed_feat_rule self.assertEqual(ruleset, expected_ruleset) diff --git a/tests/unit_tests_features.py 
b/tests/unit_tests_features.py index ed7ef9e9..9f2cd3e0 100644 --- a/tests/unit_tests_features.py +++ b/tests/unit_tests_features.py @@ -7,6 +7,7 @@ resources = "./testdata/counter" + class FeaturesTests(unittest.TestCase): @classmethod diff --git a/tests/unit_tests_hts_parsing.py b/tests/unit_tests_hts_parsing.py index b8898497..6f06c066 100644 --- a/tests/unit_tests_hts_parsing.py +++ b/tests/unit_tests_hts_parsing.py @@ -28,6 +28,7 @@ def setUpClass(self): self.sam_file = f"{resources}/sam/identity_choice_test.sam" self.short_sam_file = f"{resources}/sam/single.sam" self.short_sam = helpers.read(self.short_sam_file) + self.short_bam_file = f"{resources}/bam/single.bam" @staticmethod def exhaust_iterator(it): @@ -50,7 +51,22 @@ def test_AlignmentReader_single_sam(self): self.assertEqual(sam_record['Length'], 21) self.assertEqual(sam_record['nt5end'], 'G') - """Does our custom SAM parser produce the same pertinent info as HTSeq's BAM_reader? + """Did AlignmentReader correctly skip header values and parse all pertinent info from a single record BAM file?""" + + def test_AlignmentReader_single_bam(self): + bam_bundle, read_count = next(AlignmentReader().bundle_multi_alignments(self.short_bam_file)) + bam_record = bam_bundle[0] + + self.assertEqual(bam_record['Chrom'], "I") + self.assertEqual(bam_record['Start'], 15064569) + self.assertEqual(bam_record['End'], 15064590) + self.assertEqual(bam_record['Strand'], False) + self.assertEqual(bam_record['Name'], "0_count=5") + self.assertEqual(bam_record['Seq'], "CAAGACAGAGCTTCACCGTTC") + self.assertEqual(bam_record['Length'], 21) + self.assertEqual(bam_record['nt5end'], 'G') + + """Does our AlignmentReader produce the same pertinent info from a SAM file as HTSeq's BAM_reader? A note on SAM files: reads are always stored 5' to 3', so antisense reads are actually recorded in reverse complement. 
HTSeq automatically performs this conversion, but we @@ -77,7 +93,28 @@ def test_sam_parser_comparison(self): else: self.assertEqual(our['Seq'], their.read.seq.decode()) - """Does SAM_reader._get_decollapsed_filename() create an appropriate filename?""" + """Does our AlignmentReader produce the same pertinent info from a BAM file as HTSeq's BAM_reader?""" + + def test_bam_parser_comparison(self): + file = f"{resources}/bam/Lib304_test.bam" + ours = AlignmentReader().bundle_multi_alignments(file) + theirs = HTSeq.bundle_multiple_alignments(HTSeq.BAM_Reader(file)) + + for (our_bundle, _), their_bundle in zip(ours, theirs): + self.assertEqual(len(our_bundle), len(their_bundle)) + for our, their in zip(our_bundle, their_bundle): + self.assertEqual(our['Chrom'], their.iv.chrom) + self.assertEqual(our['Start'], their.iv.start) + self.assertEqual(our['End'], their.iv.end) + self.assertEqual(our['Name'], their.read.name) + self.assertEqual(our['nt5end'], chr(their.read.seq[0])) # See note above + self.assertEqual(our['Strand'], helpers.strand_to_bool(their.iv.strand)) + if our['Strand'] is False: # See note above + self.assertEqual(our['Seq'][::-1].translate(helpers.complement), their.read.seq.decode()) + else: + self.assertEqual(our['Seq'], their.read.seq.decode()) + + """Does AlignmentReader._get_decollapsed_filename() create an appropriate filename?""" def test_AlignmentReader_get_decollapsed_filename(self): reader = AlignmentReader() @@ -106,21 +143,28 @@ def test_AlignmentReader_new_bundle(self): def test_AlignmentReader_gather_metadata(self): reader = AlignmentReader(decollapse=True) reader._decollapsed_filename = "mock_outfile_name.sam" - + reader._assign_library(self.short_sam_file) sam_in = pysam.AlignmentFile(self.short_sam_file) + with patch('builtins.open', mock_open()) as sam_out: reader._gather_metadata(sam_in) expected_writelines = [ call('mock_outfile_name.sam', 'w'), call().__enter__(), - call().write("@SQ\tSN:I\tLN:21\n"), + call().write("@HD\tSO:unsorted\n@SQ\tSN:I\tLN:21\n@PG\tID:bowtie\n"), call().__exit__(None, None, None) ] + expected_header_dict = { + 'HD': {'SO': 'unsorted'}, + 'SQ': [{'LN': 21, 'SN': 'I'}], + 'PG': [{'ID': 'bowtie'}] + } + sam_out.assert_has_calls(expected_writelines) self.assertEqual(reader.collapser_type, 'tiny-collapse') - self.assertDictEqual(reader._header_dict, {'SQ': [{'SN': "I", 'LN': 21}]}) + self.assertDictEqual(reader._header_dict, expected_header_dict) self.assertEqual(reader.references, ('I',)) self.assertTrue(reader.has_nm) @@ -194,6 +238,81 @@ def test_AlignmentReader_no_decollapse_non_collapsed_SAM_files(self): "Alignments do not appear to be derived from a supported collapser input. 
" "Decollapsed SAM files will therefore not be produced.\n") + """Is incompatible alignment file ordering correctly identified from @HD header values?""" + + def test_AlignmentReader_incompatible_HD_header(self): + # Valid SO values: ["unknown", "unsorted", "queryname", "coordinate"] + # Valid GO values: ["none", "query", "reference"] + + strictly_compatible = [ + {'HD': {'SO': "queryname"}}, + {'HD': {'GO': "query"}}, + ] + + # Should not throw error + for header in strictly_compatible: + reader = AlignmentReader() + reader._header_dict = header + reader._assign_library("mock_infile.sam") + reader._check_for_incompatible_order() + + strictly_incompatible = [ + ({'HD': {'SO': "coordinate"}}, "by coordinate"), + ({'HD': {'GO': "reference"}}, "by reference"), + ({'HD': {}}, "sorting/grouping couldn't be determined"), + ({}, "sorting/grouping couldn't be determined"), + ] + + # Should throw error + for header, message in strictly_incompatible: + reader = AlignmentReader() + reader._header_dict = header + reader._assign_library("mock_infile.sam") + with self.assertRaisesRegex(ValueError, message): + reader._check_for_incompatible_order() + + """Is incompatible alignment file ordering correctly identified from @PG header values?""" + + def test_AlignmentReader_incompatible_PG_header(self): + # SO = ["unknown", "unsorted", "queryname", "coordinate"] + # GO = ["none", "query", "reference"] + + self.assertEqual(AlignmentReader.compatible_unordered, ("bowtie", "bowtie2", "star")) + SO_unordered = [{'SO': "unknown"}, {'SO': "unsorted"}] + GO_unordered = [{'GO': "none"}] + compatible = [ + {'PG': [{'ID': tool}], 'HD': un_so, 'GO': un_go} + for tool in AlignmentReader.compatible_unordered + for un_so in SO_unordered + for un_go in GO_unordered + ] + + # Only the last reported PG ID matters + multiple_tools = [{'ID': "INCOMPATIBLE"}, {'ID': "bowtie"}] + compatible += [{'PG': multiple_tools, 'HD': {'SO': "unsorted"}}] + + # Should not throw error + for header in compatible: + reader = AlignmentReader() + reader._header_dict = header + reader._assign_library("mock_infile.sam") + reader._check_for_incompatible_order() + + incompatible = [ + {'PG': [{'ID': "INCOMPATIBLE"}], 'HD': un_so, 'GO': un_go} + for un_so in SO_unordered + for un_go in GO_unordered + ] + + # Should throw error + expected_error = "adjacency couldn't be determined" + for header in incompatible: + reader = AlignmentReader() + reader._header_dict = header + reader._assign_library("mock_infile.sam") + with self.assertRaisesRegex(ValueError, expected_error): + reader._check_for_incompatible_order() + class ReferenceFeaturesTests(unittest.TestCase): From deb2425ee71f82c05bdd71109b6fd3176b0bf4c0 Mon Sep 17 00:00:00 2001 From: Alex Tate <0xalextate@gmail.com> Date: Sun, 30 Apr 2023 21:40:41 -0700 Subject: [PATCH 14/18] Minor docstring update and lifting state for compatible alignment tools that produce "unordered" outputs --- tiny/rna/counter/hts_parsing.py | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/tiny/rna/counter/hts_parsing.py b/tiny/rna/counter/hts_parsing.py index b1311a0e..81b1b789 100644 --- a/tiny/rna/counter/hts_parsing.py +++ b/tiny/rna/counter/hts_parsing.py @@ -28,7 +28,9 @@ class AlignmentReader: - """A minimal SAM/BAM reader that bundles multiple-alignments and only parses data relevant to the workflow.""" + """A SAM/BAM wrapper for Pysam that bundles multiple-alignments and only retrieves data relevant to the workflow.""" + + compatible_unordered = ("bowtie", "bowtie2", "star") # unordered files 
from these tools don't require sorting def __init__(self, **prefs): self.decollapse = prefs.get("decollapse", False) @@ -112,7 +114,8 @@ def _check_for_incompatible_order(self): Headers that report unsorted/unknown sorting order are likely still compatible, but since the SAM/BAM standard doesn't address alignment adjacency, we have to assume - that it is implementation dependent. Therefore, we check the last reported @PG + that it is implementation dependent. Therefore, if all other checks for strict + compatibility/incompatibility pass, we check the last reported @PG header against a short (incomplete) list of alignment tools that are known to follow the adjacency convention.""" @@ -141,7 +144,7 @@ def _check_for_incompatible_order(self): # to handle the alignments is one that produces adjacent alignments by default pg_header = self._header_dict.get('PG', [{}]) last_prog = pg_header[-1].get('ID', "") - if last_prog.lower() not in ("bowtie", "bowtie2", "star"): + if last_prog.lower() not in self.compatible_unordered: raise ValueError(error.format(reason="multi-mapping adjacency couldn't be determined")) def _determine_collapser_type(self, qname: str) -> None: From c309ffa376a64563b471f29382cebe05fa367aa5 Mon Sep 17 00:00:00 2001 From: Alex Tate <0xalextate@gmail.com> Date: Sun, 30 Apr 2023 21:42:23 -0700 Subject: [PATCH 15/18] Adding relevant updates to tiny-count documentation and adding test files that were missed earlier --- doc/tiny-count.md | 11 ++++++++--- tests/testdata/counter/bam/Lib304_test.bam | Bin 0 -> 22386 bytes tests/testdata/counter/bam/single.bam | Bin 0 -> 217 bytes 3 files changed, 8 insertions(+), 3 deletions(-) create mode 100644 tests/testdata/counter/bam/Lib304_test.bam create mode 100644 tests/testdata/counter/bam/single.bam diff --git a/doc/tiny-count.md b/doc/tiny-count.md index 79f80f0e..3537c16c 100644 --- a/doc/tiny-count.md +++ b/doc/tiny-count.md @@ -7,7 +7,10 @@ For an explanation of tiny-count's parameters in the Run Config and by commandli tiny-count offers a variety of options for refining your analysis. You might find that repeat analyses are required while tuning these options to your goals. Using the command `tiny recount`, tinyRNA will run the workflow starting at the tiny-count step using inputs from a prior end-to-end run to save time. See the [pipeline resume documentation](Pipeline.md#resuming-a-prior-analysis) for details and prerequisites. ## Running as a Standalone Tool -If you would like to run tiny-count as a standalone tool, not as part of an end-to-end or resumed analysis, you can do so with the command `tiny-count`. The command has [one required argument](Parameters.md#full-tiny-count-help-string): your Paths File. Your Samples Sheet will need to list SAM files rather than FASTQ files in the `FASTQ/SAM Files` column. SAM files from third party sources are also supported, and if they have been produced from reads collapsed by tiny-collapse or fastx, tiny-count will honor the reported read counts. +If you would like to run tiny-count as a standalone tool, not as part of an end-to-end or resumed analysis, you can do so with the command `tiny-count`. The command has [one required argument](Parameters.md#full-tiny-count-help-string): your Paths File. Your Samples Sheet will need to list SAM or BAM alignment files rather than FASTQ files in the `Input Files` column. 
Alignment files from third-party sources are also supported, and if they have been produced from reads collapsed by tiny-collapse or fastx, tiny-count will honor the reported read counts.
+
+#### Input File Requirements
+The SAM/BAM files provided during standalone runs _must_ be ordered so that multi-mapping read alignments are listed adjacent to one another. This adjacency convention is required for proper normalization by genomic hits. For this reason, files with ambiguous sorting or grouping (`SO:unsorted`, `SO:unknown`, or `GO:none`) will be rejected unless they were produced by an alignment tool that we recognize as following the adjacency convention. At this time, this includes Bowtie, Bowtie2, and STAR (an admittedly incomplete list).

#### Using Non-collapsed Sequence Alignments
While third-party SAM files from non-collapsed reads are supported, there are some caveats. These files will result in substantially higher resource usage and runtimes; we strongly recommend collapsing prior to alignment. Additionally, the sequence-related stats produced by tiny-count will no longer represent _unique_ sequences. These stats will instead refer to all sequences with unique QNAMEs (that is, multi-alignment bundles still carry a sequence count of 1).
@@ -139,8 +142,10 @@ Examples:

## Count Normalization
Small RNA reads passing selection will receive a normalized count increment. By default, read counts are normalized twice before being assigned to a feature. Both normalization steps can be disabled in `run_config.yml` if desired. Counts for each small RNA sequence are divided:
-1. By the number of loci it aligns to in the genome.
-2. By the number of _selected_ features for each of its alignments.
+1. By the number of loci it aligns to in the genome (genomic hits).
+2. By the number of _selected_ features for each of its alignments (feature hits).
+
+>**Important**: For proper normalization by genomic hits, input files must be ordered such that multi-mapping read alignments are listed adjacent to one another.
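To make the arithmetic concrete, here is a minimal sketch of how the two steps apply to a single sequence (illustrative numbers only, not tiny-count's implementation):

```python
# A sequence seen 4 times aligns to 2 genomic loci; the first alignment
# overlaps one selected feature, the second overlaps two.
read_count = 4                  # collapsed count for one unique sequence
genomic_hits = 2                # step 1 divisor: loci this sequence aligns to
per_locus = read_count / genomic_hits          # 4 / 2 = 2.0 per alignment

feature_hits = [1, 2]           # step 2 divisors: selected features per alignment
increments = [per_locus / n for n in feature_hits]

print(increments)               # [2.0, 1.0]: the single feature at the first locus
                                # gains 2.0; each of the two features at the second
                                # locus gains 1.0
```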
## The Details You may encounter the following cases when you have more than one unique GFF file listed in your Paths File: diff --git a/tests/testdata/counter/bam/Lib304_test.bam b/tests/testdata/counter/bam/Lib304_test.bam new file mode 100644 index 0000000000000000000000000000000000000000..48e2b97e87e261342653c2b997cad233b3d66674 GIT binary patch literal 22386 zcmV)EK)}BriwFb&00000{{{d;LjnLb0aZ}TP69C$9o!9H(XcNa+7V<{h$Kpas3CDX zxzk=?;?Rz50r@Te$-i*nMK`#(?{iL0?)3JtgPkOqJj~M9WijYy>FTN2+u(de)19nd z($%tW;C-KbGvP^lnk~glnS!3))!&N}xRi5o>$$7DQEB zQT=i_H1)>vz&Jm8Xj}BGf>3LoF}`N}z^D?YVLVb}9p77^feKsHmCEumVulp4Wc1Iz4ZeoT3;U^qKLaz7^mORm(+#;L!~}WBE*ftOpo-< zfJ?0;&!yJ9!pd-M#zPq9<4nqtxOAai(?A>0oDEza{+T0Ry)OMZxuTdSpWk1}51R$M zCwl<^03VA81ONa4009360763o0EsNUoe8wvRJq4HFk_|3&81=NO$avdikx`>1d>J# z0!3viATxr>RG|ncKAI3Qt)LH!Kp91)i~#mjG&CcH6{^sw1F)MzKI%4-R>wJ99&{+I$1^@YY^oRVea^RuIo^Z^p z&s4|kh+o^D87QWyg|5l@nPE*Ev#Yajtuki%uG7ZM*mc^tLq`d--?|%w8N-Ayi6K-Y z!c6?y4Ln;4P1Ur`5tIKzG4%*@@F?bx%YQm%-H-2A2xgeYbRy0d2`BV5IdB6hoIMay zrI7Iya_Oww0kMY*fgCalxni%q0Qp((ldR7`YBA&>{N7Sz-?!yJ^(}KCr53FhrL3Vj zpzH(6#1y4oNl+Bi)@)BxEK3%p&;`3=%2}YCsSpagV2x^G!E8;ERYSI1e`Fw}ntGK_ zExQzuhZNqcG^@!4vt-3HWLNctUuBsi3Ccyi&oYsrwBiLDgWs!?>==O|Wjwsv zioQ90_|{Y7by~^8LqjukH&j>taJoKcu1=1|ecbiwG!yHinz9|ZuBmIOKR!X>Y^a&{ zUIEIRd`dg{6#`S2b<0p~Uy!mUrTnlI%Cu4_jZ!GTFNLyxp%i*d9z1FzP!`F&GuTZY z6WcKiOV*8yhsVbxdia_rZGeYgTg<~pc8PC}RsQh(=8(Vqx?=wFU+=Ub9)5$&6HUy+ z4afBjL$}2rzV-5jc=*KORr+wMdF;7hhcTb{`19P0v|8;ZHl-PwzF|S=1t$L|wJFC_ z%(`Q@1E$NpNm-1dDM|>CA}OL6dcsbha{(}mxmRhmR!QEahDHFEEUJ7&x7;b)ZVO7Y z_cH6Tlx8EMyo>*D%CZ&M{%roktIg=)2aP^_tv|hjhriJK+Dhu-@lDiIHx1mOp=0ue zM7Fvqzds#}fdA23NySDDeLu8h{8wn2gD|xeX355T0&@``(`J;8(6FI*NUAOR(Z_vt z6m$HXYk?WxTd|b#+>Sp7QJ;N9aV6JKLo*atpDF0vwbljb1evR)w_6F25_*!VNTK1# zvxPx);ZCi-1kmHd+=c5f>a8kUz5;#35M8)6DCYdlOToNQF9q|_lu|IaO)UlU*8Hyl z^9EmFc9TamupC8KLeo)2m4!$l-S4hFfVpspCopT(dO~1Ywj#-rp=zP1@j+ka_HR7| z%3SW6)oQiG%h*z2g{~%9iZ1#xRCI2?l}+1lr7(0wSBk#OJI{OqlvB8yt6pg_o6AyFSNBXq_+=<&?w2N# z%aVKa>y_jUqvK2}hAmmPoEP(tnY#mX4)-Ff*OT%RPG>K)RLj&=Q4IYw-?{k=V7|k3 zIoBEQre*rjmnHm5)FL}@l(P79dx5gT9X!NR`UFgcPz*Kj;iolr%tlnZDdpm{^Tq

z_TyG6g)V9lioTpwo+tWD_r?C$s>m~4YVf3yXWFQ{1Z7Q+CHT*@Lu_1Yyl}8 zo5XRYGY`^P_q+&W?|+B1b>{e$Zkc}~X)el9h65*&mV?VV%JBLq($8fm+dv5ubmUkt za@tnX5N6LMC+=&Ggs1l=-j-uTeuQU^B&yZ#GyDic$B+gWLp%5pF8J$GAbieo9oCNW zbtyiC&$r{f(jynq+nX2Rm@kMb_y~u+M75?JfBZ5DaIxcpCC|Lgf2%b?F(Jf-u^=b* zp#48)_wTgY{r~_UiwFb&00000{{{d;LjnMr083_ofHgOG7jW(G0~!UsA86CL|PmD0s;8ZM)W4?Q{BCp1u!@IQX9MeB1o# z$(N6tjf|cYaTGlNQnJzOl)nB+pQlMISqV%@{%nD4Y-|jZ_wm>{1C5kNGdqF7JE5Ur ze))W(g1_|&r%oswP~dvel~z{A&PSQuEX65Ug74Dt#X006e6Ofvug literal 0 HcmV?d00001 From 1c1d06204d34f8cbfacbc009bfbc020c2ad1f934 Mon Sep 17 00:00:00 2001 From: Alex Tate <0xalextate@gmail.com> Date: Mon, 1 May 2023 10:01:49 -0700 Subject: [PATCH 16/18] Bugfix for decollapsed outputs: proper linebreaks between buffered writes --- tiny/rna/counter/hts_parsing.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tiny/rna/counter/hts_parsing.py b/tiny/rna/counter/hts_parsing.py index 81b1b789..ec27df54 100644 --- a/tiny/rna/counter/hts_parsing.py +++ b/tiny/rna/counter/hts_parsing.py @@ -193,11 +193,11 @@ def _write_decollapsed_sam(self) -> None: if name != prev_name: seq_count = int(name.rsplit(self._collapser_token, 1)[1]) - aln_out.extend([aln.to_string()] * seq_count) + aln_out.extend([aln.to_string() + '\n'] * seq_count) prev_name = name with open(self._get_decollapsed_filename(), 'a') as sam_o: - sam_o.write('\n'.join(aln_out)) + sam_o.writelines(aln_out) self._decollapsed_reads.clear() From 7f226216d9603998de2d40106c38f6642efa40c7 Mon Sep 17 00:00:00 2001 From: Alex Tate <0xalextate@gmail.com> Date: Mon, 1 May 2023 10:38:52 -0700 Subject: [PATCH 17/18] Upgrading GFFValidator.alignment_chroms_mismatch_heuristic() to use Pysam so that it can parse BAM files --- tiny/rna/counter/validation.py | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/tiny/rna/counter/validation.py b/tiny/rna/counter/validation.py index f53a4049..a068be6e 100644 --- a/tiny/rna/counter/validation.py +++ b/tiny/rna/counter/validation.py @@ -180,11 +180,10 @@ def alignment_chroms_mismatch_heuristic(self, sam_files: List[str], subset_size= for file in sam_files: file_chroms = set() - with open(file, 'rb') as f: - for i, line in zip(range(subset_size), f): - if line[0] == ord('@'): continue - file_chroms.add(line.split(b'\t')[2].strip().decode()) - if i % 10000 == 0 and len(file_chroms & self.chrom_set): break + reader = pysam.AlignmentFile(file) + for i, aln in enumerate(reader.head(subset_size)): + file_chroms.add(aln.reference_name) + if i % 10000 == 0 and len(file_chroms & self.chrom_set): break if not len(file_chroms & self.chrom_set): files_wo_overlap[file] = file_chroms From eb0287e40d21a53b14f4f37e13fa961046364e41 Mon Sep 17 00:00:00 2001 From: Alex Tate <0xalextate@gmail.com> Date: Mon, 1 May 2023 11:30:14 -0700 Subject: [PATCH 18/18] Minor streamlining of info about BAM file requirements --- doc/tiny-count.md | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/doc/tiny-count.md b/doc/tiny-count.md index 3537c16c..8f367aa9 100644 --- a/doc/tiny-count.md +++ b/doc/tiny-count.md @@ -7,10 +7,16 @@ For an explanation of tiny-count's parameters in the Run Config and by commandli tiny-count offers a variety of options for refining your analysis. You might find that repeat analyses are required while tuning these options to your goals. Using the command `tiny recount`, tinyRNA will run the workflow starting at the tiny-count step using inputs from a prior end-to-end run to save time. 
From eb0287e40d21a53b14f4f37e13fa961046364e41 Mon Sep 17 00:00:00 2001
From: Alex Tate <0xalextate@gmail.com>
Date: Mon, 1 May 2023 11:30:14 -0700
Subject: [PATCH 18/18] Minor streamlining of info about BAM file requirements

---
 doc/tiny-count.md | 8 +++++++-
 1 file changed, 7 insertions(+), 1 deletion(-)

diff --git a/doc/tiny-count.md b/doc/tiny-count.md
index 3537c16c..8f367aa9 100644
--- a/doc/tiny-count.md
+++ b/doc/tiny-count.md
@@ -7,10 +7,16 @@ For an explanation of tiny-count's parameters in the Run Config and by commandli
 tiny-count offers a variety of options for refining your analysis. You might find that repeat analyses are required while tuning these options to your goals. Using the command `tiny recount`, tinyRNA will run the workflow starting at the tiny-count step using inputs from a prior end-to-end run to save time. See the [pipeline resume documentation](Pipeline.md#resuming-a-prior-analysis) for details and prerequisites.
 
 ## Running as a Standalone Tool
+Skip to [Feature Selection](#feature-selection) if you are using the tinyRNA workflow.
+
 If you would like to run tiny-count as a standalone tool, not as part of an end-to-end or resumed analysis, you can do so with the command `tiny-count`. The command has [one required argument](Parameters.md#full-tiny-count-help-string): your Paths File. Your Samples Sheet will need to list SAM or BAM alignment files rather than FASTQ files in the `Input Files` column. Alignment files from third-party sources are also supported, and if they have been produced from reads collapsed by tiny-collapse or fastx, tiny-count will honor the reported read counts.
 
 #### Input File Requirements
-The SAM/BAM files provided during standalone runs _must_ be ordered so that multi-mapping read alignments are listed adjacent to one another. This adjacency convention is required for proper normalization by genomic hits. For this reason, files with ambiguous sorting or grouping (`SO:unsorted`, `SO:unknown`, or `GO:none`) will be rejected unless they were produced by an alignment tool that we recognize as following the adjacency convention. At this time, this includes Bowtie, Bowtie2, and STAR (an admittedly incomplete list).
+The SAM/BAM files provided during standalone runs _must_ be ordered so that multi-mapping read alignments are listed adjacent to one another. This adjacency convention is required for proper normalization by genomic hits. For this reason, files with ambiguous order will be rejected unless they were produced by an alignment tool that we recognize as following the adjacency convention. At this time, this includes Bowtie, Bowtie2, and STAR (an admittedly incomplete list).
+
+#### BAM File Tips
+- Use the `--no-PG` option with `samtools view` when converting alignments
+- Pysam will issue two warnings about missing index files; they can be ignored
 
 #### Using Non-collapsed Sequence Alignments
 While third-party SAM files from non-collapsed reads are supported, there are some caveats. These files will result in substantially higher resource usage and runtimes; we strongly recommend collapsing prior to alignment. Additionally, the sequence-related stats produced by tiny-count will no longer represent _unique_ sequences. These stats will instead refer to all sequences with unique QNAMEs (that is, multi-alignment bundles still carry a sequence count of 1).
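A closing note on the "Pysam will issue two warnings about missing index files" tip above: those warnings come from htslib and, if unwanted, can be silenced by temporarily lowering pysam's verbosity around the open call. This is a general pysam workaround, not something added by this patch series:

```python
# General pysam workaround for "could not retrieve index file"
# warnings when opening unindexed BAMs; not part of this patch series.
import pysam

def open_alignments_quietly(path: str) -> pysam.AlignmentFile:
    previous = pysam.set_verbosity(0)   # silence htslib warnings
    try:
        return pysam.AlignmentFile(path)
    finally:
        pysam.set_verbosity(previous)   # restore global verbosity
```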