From 3e1b9f0dc519d6e80d7935f3e564b6da0013e1cc Mon Sep 17 00:00:00 2001 From: Patrick Sorn Date: Fri, 24 Nov 2023 19:06:20 +0100 Subject: [PATCH] Added mismatches to read info; Fixed bug in read plotting; Updated README --- README.md | 1 + pyproject.toml | 2 +- src/bp_quant/command_line.py | 1 + src/bp_quant/plot_reads.py | 5 ++++- src/bp_quant/requantify.py | 33 ++++++++++++++++++++++++++++----- src/bp_quant/version.py | 2 +- 6 files changed, 36 insertions(+), 8 deletions(-) diff --git a/README.md b/README.md index b300ca8..5745fbf 100644 --- a/README.md +++ b/README.md @@ -20,6 +20,7 @@ breakpoint (spanning pairs). - Count reads using `bp_quant count` - Output: - Table with read counts per input sequence + - Table with info on each read (`read_info.tsv`) ## Dependencies diff --git a/pyproject.toml b/pyproject.toml index 5164bae..be8b8c6 100755 --- a/pyproject.toml +++ b/pyproject.toml @@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta" [project] name = "bp_quant" -version = "0.5.0" +version = "0.5.1" authors = [ {name = "TRON - Translational Oncology at the University Medical Center of the Johannes Gutenberg University Mainz", email = "patrick.sorn@tron-mainz.de"}, ] diff --git a/src/bp_quant/command_line.py b/src/bp_quant/command_line.py index 6d26ef7..f9ee8b5 100644 --- a/src/bp_quant/command_line.py +++ b/src/bp_quant/command_line.py @@ -67,4 +67,5 @@ def bp_quant_cli(): try: args.func(args) except AttributeError as e: + print(e) parser.parse_args(["--help"]) diff --git a/src/bp_quant/plot_reads.py b/src/bp_quant/plot_reads.py index de02e16..50112ed 100755 --- a/src/bp_quant/plot_reads.py +++ b/src/bp_quant/plot_reads.py @@ -53,6 +53,7 @@ def plot(seq_to_pos, read_info, output_path): read_dict = {} with open(read_info) as inf: + next(inf) for line in inf: elements = line.rstrip().split("\t") read_name = elements[0] @@ -60,7 +61,8 @@ def plot(seq_to_pos, read_info, output_path): seq_name = elements[2] start = int(elements[3]) stop = int(elements[4]) - read_type = elements[5] + mismatches = int(elements[5]) + read_type = elements[6] if seq_name not in read_dict: read_dict[seq_name] = {} @@ -137,6 +139,7 @@ def add_plot_reads_args(parser): help="Output file", required=True ) + parser.set_defaults(func=plot_reads_command) def plot_reads_command(args): diff --git a/src/bp_quant/requantify.py b/src/bp_quant/requantify.py index 646e645..e8669d4 100755 --- a/src/bp_quant/requantify.py +++ b/src/bp_quant/requantify.py @@ -68,12 +68,12 @@ def classify_read(aln_start, aln_stop, aln_pairs, intervals, allow_mismatches, b # define match bases for get_aligned_pairs() MATCH_BASES = ['A', 'C', 'G', 'T'] - read_info = {"junc": False, "within": False, "interval": "", "anchor": 0} + read_info = {"junc": False, "within": False, "interval": "", "anchor": 0, "nm_in_bp_area": 0} for (interval_name, ref_start, ref_stop) in intervals: # Check if read spans ref start if aln_start <= ref_stop - bp_dist and aln_stop >= ref_stop + bp_dist: - + num_mismatches = 0 # test that read maps exact (or no del/ins) in pos +/- bp_dist reg_start = ref_stop - bp_dist @@ -90,12 +90,16 @@ def classify_read(aln_start, aln_stop, aln_pairs, intervals, allow_mismatches, b # test if reference sequence are all capital (means match) according # to https://pysam.readthedocs.io/en/latest/api.html#pysam.AlignedSegment.get_aligned_pairs aln_seq = [s for (q, r, s) in aln_pairs if r is not None and reg_start <= r and r < reg_end] - no_snp = all(s in MATCH_BASES for s in aln_seq) + # Get number of mismatches for this read + match_list = [s in MATCH_BASES for s in aln_seq] + no_snp = all(match_list) + num_mismatches = match_list.count(False) if (no_ins_or_del and no_snp) or allow_mismatches: anchor = min(aln_stop - ref_stop, ref_stop - aln_start) read_info["junc"] = True read_info["anchor"] = anchor read_info["interval"] = interval_name + read_info["nm_in_bp_area"] = num_mismatches # read maps completely within the interval @@ -117,6 +121,15 @@ def __init__(self, seq_table_file, bam_file, output_path, bp_dist, allow_mismatc self.quant_file = os.path.join(output_path, "quantification.tsv") self.reads_file = os.path.join(output_path, "read_info.tsv.gz") self.reads_out = gzip.open(self.reads_file, "wb") + self.reads_out.write(("\t".join([ + "name", + "mate", + "reference", + "start", + "end", + "num_mismatches_in_bp_area", + "classification" + ]) + "\n").encode()) self.bp_dist = bp_dist self.allow_mismatches = allow_mismatches self.interval_mode = interval_mode @@ -382,8 +395,18 @@ def quantify(self, r1, r2): r1_type = "softjunc" if not r2_type: r2_type = "softjunc" - self.reads_out.write("{}\t{}\t{}\t{}\t{}\t{}\n".format(read_name, "R1", seq_name, r1_start, r1_stop, r1_type).encode()) - self.reads_out.write("{}\t{}\t{}\t{}\t{}\t{}\n".format(read_name, "R2", seq_name, r2_start, r2_stop, r2_type).encode()) + r1_mismatches = r1_info["nm_in_bp_area"] + r2_mismatches = r2_info["nm_in_bp_area"] + self.reads_out.write( + "{}\t{}\t{}\t{}\t{}\t{}\t{}\n".format( + read_name, "R1", seq_name, r1_start, r1_stop, r1_mismatches, r1_type + ).encode() + ) + self.reads_out.write( + "{}\t{}\t{}\t{}\t{}\t{}\t{}\n".format( + read_name, "R2", seq_name, r2_start, r2_stop, r2_mismatches, r2_type + ).encode() + ) def write_results(self): diff --git a/src/bp_quant/version.py b/src/bp_quant/version.py index bba4b60..e5a4c08 100644 --- a/src/bp_quant/version.py +++ b/src/bp_quant/version.py @@ -1,2 +1,2 @@ -version_info = (0, 5, 0) +version_info = (0, 5, 1) version = '.'.join(str(c) for c in version_info)