Skip to content

Commit

Permalink
add parameter to remove duplicate bit as exclusion criterion in filte…
Browse files Browse the repository at this point in the history
…r_bam_to_proper_primary_mapped_reads (#94)

add a parameter to remove the read duplicate flag bit from the read exclusion criteria in filter_bam_to_proper_primary_mapped_reads; CLI commands making use of this function now now accept an optional --keepDuplicates flag
  • Loading branch information
tomkinsc committed Mar 11, 2024
1 parent 52877a9 commit 499b638
Show file tree
Hide file tree
Showing 2 changed files with 26 additions and 8 deletions.
30 changes: 24 additions & 6 deletions read_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -1319,7 +1319,7 @@ def parser_align_and_fix(parser=argparse.ArgumentParser()):

# =========================

def filter_bam_to_proper_primary_mapped_reads(inBam, outBam, doNotRequirePairsToBeProper=False, keepSingletons=False):
def filter_bam_to_proper_primary_mapped_reads(inBam, outBam, doNotRequirePairsToBeProper=False, keepSingletons=False, keepDuplicates=False):
''' Take a BAM file and filter to only reads that are properly
paired and mapped. Optionally reject singletons, and
optionally require reads to be properly paired and mapped.
Expand All @@ -1338,7 +1338,8 @@ def filter_bam_to_proper_primary_mapped_reads(inBam, outBam, doNotRequirePairsTo
samtools.filter_to_proper_primary_mapped_reads(inBam,
outBam,
require_pairs_to_be_proper=not doNotRequirePairsToBeProper,
reject_singletons=not keepSingletons)
reject_singletons=not keepSingletons,
reject_duplicates=not keepDuplicates)
return 0

def parser_filter_bam_to_proper_primary_mapped_reads(parser=argparse.ArgumentParser()):
Expand All @@ -1354,6 +1355,11 @@ def parser_filter_bam_to_proper_primary_mapped_reads(parser=argparse.ArgumentPar
help='Keep singleton reads when filtering (default: %(default)s)',
action='store_true'
)
parser.add_argument(
'--keepDuplicates',
help='When filtering, do not exclude reads due to being flagged as duplicates (default: %(default)s)',
action='store_true'
)

util.cmd.common_args(parser, (('loglevel', None), ('version', None), ('tmp_dir', None)))
util.cmd.attach_main(parser, filter_bam_to_proper_primary_mapped_reads, split_args=True)
Expand All @@ -1363,7 +1369,7 @@ def parser_filter_bam_to_proper_primary_mapped_reads(parser=argparse.ArgumentPar


def minimap2_idxstats(inBam, refFasta, outBam=None, outStats=None,
filterReadsAfterAlignment=False, doNotRequirePairsToBeProper=False, keepSingletons=False):
filterReadsAfterAlignment=False, doNotRequirePairsToBeProper=False, keepSingletons=False, keepDuplicates=False):
''' Take reads, align to reference with minimap2 and perform samtools idxstats.
Optionally filter reads after alignment, prior to reporting idxstats, to include only those flagged as properly paired.
'''
Expand All @@ -1388,7 +1394,8 @@ def minimap2_idxstats(inBam, refFasta, outBam=None, outStats=None,
samtools.filter_to_proper_primary_mapped_reads(bam_aligned,
bam_filtered,
require_pairs_to_be_proper=not doNotRequirePairsToBeProper,
reject_singletons=not keepSingletons)
reject_singletons=not keepSingletons,
reject_duplicates=not keepDuplicates)
os.unlink(bam_aligned)
else:
shutil.move(bam_aligned, bam_filtered)
Expand Down Expand Up @@ -1420,6 +1427,11 @@ def parser_minimap2_idxstats(parser=argparse.ArgumentParser()):
help='Keep singleton reads when filtering (default: %(default)s)',
action='store_true'
)
parser.add_argument(
'--keepDuplicates',
help='When filtering, do not exclude reads due to being flagged as duplicates (default: %(default)s)',
action='store_true'
)
parser.add_argument('--outBam', help='Output aligned, indexed BAM file', default=None)
parser.add_argument('--outStats', help='Output idxstats file', default=None)

Expand All @@ -1432,7 +1444,7 @@ def parser_minimap2_idxstats(parser=argparse.ArgumentParser()):

def bwamem_idxstats(inBam, refFasta, outBam=None, outStats=None,
min_score_to_filter=None, aligner_options=None,
filterReadsAfterAlignment=False, doNotRequirePairsToBeProper=False, keepSingletons=False):
filterReadsAfterAlignment=False, doNotRequirePairsToBeProper=False, keepSingletons=False, keepDuplicates=False):
''' Take reads, align to reference with BWA-MEM and perform samtools idxstats.
Optionally filter reads after alignment, prior to reporting idxstats, to include only those flagged as properly paired.
'''
Expand Down Expand Up @@ -1460,7 +1472,8 @@ def bwamem_idxstats(inBam, refFasta, outBam=None, outStats=None,
samtools.filter_to_proper_primary_mapped_reads(bam_aligned,
bam_filtered,
require_pairs_to_be_proper=not doNotRequirePairsToBeProper,
reject_singletons=not keepSingletons)
reject_singletons=not keepSingletons,
reject_duplicates=not keepDuplicates)
os.unlink(bam_aligned)
else:
shutil.move(bam_aligned, bam_filtered)
Expand Down Expand Up @@ -1513,6 +1526,11 @@ def parser_bwamem_idxstats(parser=argparse.ArgumentParser()):
help='Keep singleton reads when filtering (default: %(default)s)',
action='store_true'
)
parser.add_argument(
'--keepDuplicates',
help='When filtering, do not exclude reads due to being flagged as duplicates (default: %(default)s)',
action='store_true'
)

util.cmd.common_args(parser, (('loglevel', None), ('version', None), ('tmp_dir', None)))
util.cmd.attach_main(parser, bwamem_idxstats, split_args=True)
Expand Down
4 changes: 2 additions & 2 deletions tools/samtools.py
Original file line number Diff line number Diff line change
Expand Up @@ -182,7 +182,7 @@ def removeDoublyMappedReads(self, inBam, outBam):
opts = ['-b', '-F' '1028', '-f', '2', '-@', '3']
self.view(opts, inBam, outBam)

def filter_to_proper_primary_mapped_reads(self, inBam, outBam, require_pairs_to_be_proper=True, reject_singletons=True):
def filter_to_proper_primary_mapped_reads(self, inBam, outBam, require_pairs_to_be_proper=True, reject_singletons=True, reject_duplicates=True):
'''
This function writes a bam file filtered to include only reads that are:
- not flagged as duplicates
Expand Down Expand Up @@ -210,7 +210,7 @@ def filter_to_proper_primary_mapped_reads(self, inBam, outBam, require_pairs_to_
is_single_end=not read.is_paired

# if a PCR/optical duplicate, do not write
if read.is_duplicate:
if reject_duplicates and read.is_duplicate:
continue

# if a read is a secondary or supplementary mapping (split/chimeric), do not write
Expand Down

0 comments on commit 499b638

Please sign in to comment.