From 499b6381a3c9cc0df5deba642306126ab43e3732 Mon Sep 17 00:00:00 2001 From: Chris Tomkins-Tinch Date: Mon, 11 Mar 2024 16:55:54 -0400 Subject: [PATCH] add parameter to remove duplicate bit as exclusion criterion in filter_bam_to_proper_primary_mapped_reads (#94) add a parameter to remove the read duplicate flag bit from the read exclusion criteria in filter_bam_to_proper_primary_mapped_reads; CLI commands making use of this function now now accept an optional --keepDuplicates flag --- read_utils.py | 30 ++++++++++++++++++++++++------ tools/samtools.py | 4 ++-- 2 files changed, 26 insertions(+), 8 deletions(-) diff --git a/read_utils.py b/read_utils.py index 5c4db47d..44e357d9 100755 --- a/read_utils.py +++ b/read_utils.py @@ -1319,7 +1319,7 @@ def parser_align_and_fix(parser=argparse.ArgumentParser()): # ========================= -def filter_bam_to_proper_primary_mapped_reads(inBam, outBam, doNotRequirePairsToBeProper=False, keepSingletons=False): +def filter_bam_to_proper_primary_mapped_reads(inBam, outBam, doNotRequirePairsToBeProper=False, keepSingletons=False, keepDuplicates=False): ''' Take a BAM file and filter to only reads that are properly paired and mapped. Optionally reject singletons, and optionally require reads to be properly paired and mapped. @@ -1338,7 +1338,8 @@ def filter_bam_to_proper_primary_mapped_reads(inBam, outBam, doNotRequirePairsTo samtools.filter_to_proper_primary_mapped_reads(inBam, outBam, require_pairs_to_be_proper=not doNotRequirePairsToBeProper, - reject_singletons=not keepSingletons) + reject_singletons=not keepSingletons, + reject_duplicates=not keepDuplicates) return 0 def parser_filter_bam_to_proper_primary_mapped_reads(parser=argparse.ArgumentParser()): @@ -1354,6 +1355,11 @@ def parser_filter_bam_to_proper_primary_mapped_reads(parser=argparse.ArgumentPar help='Keep singleton reads when filtering (default: %(default)s)', action='store_true' ) + parser.add_argument( + '--keepDuplicates', + help='When filtering, do not exclude reads due to being flagged as duplicates (default: %(default)s)', + action='store_true' + ) util.cmd.common_args(parser, (('loglevel', None), ('version', None), ('tmp_dir', None))) util.cmd.attach_main(parser, filter_bam_to_proper_primary_mapped_reads, split_args=True) @@ -1363,7 +1369,7 @@ def parser_filter_bam_to_proper_primary_mapped_reads(parser=argparse.ArgumentPar def minimap2_idxstats(inBam, refFasta, outBam=None, outStats=None, - filterReadsAfterAlignment=False, doNotRequirePairsToBeProper=False, keepSingletons=False): + filterReadsAfterAlignment=False, doNotRequirePairsToBeProper=False, keepSingletons=False, keepDuplicates=False): ''' Take reads, align to reference with minimap2 and perform samtools idxstats. Optionally filter reads after alignment, prior to reporting idxstats, to include only those flagged as properly paired. ''' @@ -1388,7 +1394,8 @@ def minimap2_idxstats(inBam, refFasta, outBam=None, outStats=None, samtools.filter_to_proper_primary_mapped_reads(bam_aligned, bam_filtered, require_pairs_to_be_proper=not doNotRequirePairsToBeProper, - reject_singletons=not keepSingletons) + reject_singletons=not keepSingletons, + reject_duplicates=not keepDuplicates) os.unlink(bam_aligned) else: shutil.move(bam_aligned, bam_filtered) @@ -1420,6 +1427,11 @@ def parser_minimap2_idxstats(parser=argparse.ArgumentParser()): help='Keep singleton reads when filtering (default: %(default)s)', action='store_true' ) + parser.add_argument( + '--keepDuplicates', + help='When filtering, do not exclude reads due to being flagged as duplicates (default: %(default)s)', + action='store_true' + ) parser.add_argument('--outBam', help='Output aligned, indexed BAM file', default=None) parser.add_argument('--outStats', help='Output idxstats file', default=None) @@ -1432,7 +1444,7 @@ def parser_minimap2_idxstats(parser=argparse.ArgumentParser()): def bwamem_idxstats(inBam, refFasta, outBam=None, outStats=None, min_score_to_filter=None, aligner_options=None, - filterReadsAfterAlignment=False, doNotRequirePairsToBeProper=False, keepSingletons=False): + filterReadsAfterAlignment=False, doNotRequirePairsToBeProper=False, keepSingletons=False, keepDuplicates=False): ''' Take reads, align to reference with BWA-MEM and perform samtools idxstats. Optionally filter reads after alignment, prior to reporting idxstats, to include only those flagged as properly paired. ''' @@ -1460,7 +1472,8 @@ def bwamem_idxstats(inBam, refFasta, outBam=None, outStats=None, samtools.filter_to_proper_primary_mapped_reads(bam_aligned, bam_filtered, require_pairs_to_be_proper=not doNotRequirePairsToBeProper, - reject_singletons=not keepSingletons) + reject_singletons=not keepSingletons, + reject_duplicates=not keepDuplicates) os.unlink(bam_aligned) else: shutil.move(bam_aligned, bam_filtered) @@ -1513,6 +1526,11 @@ def parser_bwamem_idxstats(parser=argparse.ArgumentParser()): help='Keep singleton reads when filtering (default: %(default)s)', action='store_true' ) + parser.add_argument( + '--keepDuplicates', + help='When filtering, do not exclude reads due to being flagged as duplicates (default: %(default)s)', + action='store_true' + ) util.cmd.common_args(parser, (('loglevel', None), ('version', None), ('tmp_dir', None))) util.cmd.attach_main(parser, bwamem_idxstats, split_args=True) diff --git a/tools/samtools.py b/tools/samtools.py index 5f2e6943..efc53fb0 100644 --- a/tools/samtools.py +++ b/tools/samtools.py @@ -182,7 +182,7 @@ def removeDoublyMappedReads(self, inBam, outBam): opts = ['-b', '-F' '1028', '-f', '2', '-@', '3'] self.view(opts, inBam, outBam) - def filter_to_proper_primary_mapped_reads(self, inBam, outBam, require_pairs_to_be_proper=True, reject_singletons=True): + def filter_to_proper_primary_mapped_reads(self, inBam, outBam, require_pairs_to_be_proper=True, reject_singletons=True, reject_duplicates=True): ''' This function writes a bam file filtered to include only reads that are: - not flagged as duplicates @@ -210,7 +210,7 @@ def filter_to_proper_primary_mapped_reads(self, inBam, outBam, require_pairs_to_ is_single_end=not read.is_paired # if a PCR/optical duplicate, do not write - if read.is_duplicate: + if reject_duplicates and read.is_duplicate: continue # if a read is a secondary or supplementary mapping (split/chimeric), do not write