From 3e1b9f0dc519d6e80d7935f3e564b6da0013e1cc Mon Sep 17 00:00:00 2001
From: Patrick Sorn <patrick.sorn@tron-mainz.de>
Date: Fri, 24 Nov 2023 19:06:20 +0100
Subject: [PATCH] Added mismatches to read info; Fixed bug in read plotting;
 Updated README

---
 README.md                    |  1 +
 pyproject.toml               |  2 +-
 src/bp_quant/command_line.py |  1 +
 src/bp_quant/plot_reads.py   |  5 ++++-
 src/bp_quant/requantify.py   | 33 ++++++++++++++++++++++++++++-----
 src/bp_quant/version.py      |  2 +-
 6 files changed, 36 insertions(+), 8 deletions(-)

diff --git a/README.md b/README.md
index b300ca8..5745fbf 100644
--- a/README.md
+++ b/README.md
@@ -20,6 +20,7 @@ breakpoint (spanning pairs).
 - Count reads using `bp_quant count`
 - Output: 
     - Table with read counts per input sequence
+    - Table with info on each read (`read_info.tsv`)
 
 ## Dependencies
 
diff --git a/pyproject.toml b/pyproject.toml
index 5164bae..be8b8c6 100755
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"
 
 [project]
 name = "bp_quant"
-version = "0.5.0"
+version = "0.5.1"
 authors = [
   {name = "TRON - Translational Oncology at the University Medical Center of the Johannes Gutenberg University Mainz", email = "patrick.sorn@tron-mainz.de"},
 ]
diff --git a/src/bp_quant/command_line.py b/src/bp_quant/command_line.py
index 6d26ef7..f9ee8b5 100644
--- a/src/bp_quant/command_line.py
+++ b/src/bp_quant/command_line.py
@@ -67,4 +67,5 @@ def bp_quant_cli():
     try:
         args.func(args)
     except AttributeError as e:
+        print(e)
         parser.parse_args(["--help"])
diff --git a/src/bp_quant/plot_reads.py b/src/bp_quant/plot_reads.py
index de02e16..50112ed 100755
--- a/src/bp_quant/plot_reads.py
+++ b/src/bp_quant/plot_reads.py
@@ -53,6 +53,7 @@ def plot(seq_to_pos, read_info, output_path):
     read_dict = {}
 
     with open(read_info) as inf:
+        next(inf)
         for line in inf:
             elements = line.rstrip().split("\t")
             read_name = elements[0]
@@ -60,7 +61,8 @@ def plot(seq_to_pos, read_info, output_path):
             seq_name = elements[2]
             start = int(elements[3])
             stop = int(elements[4])
-            read_type = elements[5]
+            mismatches = int(elements[5])
+            read_type = elements[6]
             
             if seq_name not in read_dict:
                 read_dict[seq_name] = {}
@@ -137,6 +139,7 @@ def add_plot_reads_args(parser):
         help="Output file",
         required=True
     )
+    parser.set_defaults(func=plot_reads_command)
 
     
 def plot_reads_command(args):
diff --git a/src/bp_quant/requantify.py b/src/bp_quant/requantify.py
index 646e645..e8669d4 100755
--- a/src/bp_quant/requantify.py
+++ b/src/bp_quant/requantify.py
@@ -68,12 +68,12 @@ def classify_read(aln_start, aln_stop, aln_pairs, intervals, allow_mismatches, b
     # define match bases for get_aligned_pairs()
     MATCH_BASES = ['A', 'C', 'G', 'T']
 
-    read_info = {"junc": False, "within": False, "interval": "", "anchor": 0}
+    read_info = {"junc": False, "within": False, "interval": "", "anchor": 0, "nm_in_bp_area": 0}
 
     for (interval_name, ref_start, ref_stop) in intervals:
         # Check if read spans ref start
         if aln_start <= ref_stop - bp_dist and aln_stop >= ref_stop + bp_dist:
-        
+            num_mismatches = 0
             # test that read maps exact (or no del/ins) in pos +/- bp_dist
 
             reg_start = ref_stop - bp_dist
@@ -90,12 +90,16 @@ def classify_read(aln_start, aln_stop, aln_pairs, intervals, allow_mismatches, b
             # test if reference sequence are all capital (means match) according
             # to https://pysam.readthedocs.io/en/latest/api.html#pysam.AlignedSegment.get_aligned_pairs
             aln_seq = [s for (q, r, s) in aln_pairs if r is not None and reg_start <= r and r < reg_end]
-            no_snp = all(s in MATCH_BASES for s in aln_seq)
+            # Get number of mismatches for this read
+            match_list = [s in MATCH_BASES for s in aln_seq]
+            no_snp = all(match_list)
+            num_mismatches = match_list.count(False)
             if (no_ins_or_del and no_snp) or allow_mismatches:
                 anchor = min(aln_stop - ref_stop, ref_stop - aln_start)
                 read_info["junc"] = True
                 read_info["anchor"] = anchor
                 read_info["interval"] = interval_name
+                read_info["nm_in_bp_area"] = num_mismatches
 
         
         # read maps completely within the interval
@@ -117,6 +121,15 @@ def __init__(self, seq_table_file, bam_file, output_path, bp_dist, allow_mismatc
         self.quant_file = os.path.join(output_path, "quantification.tsv")
         self.reads_file = os.path.join(output_path, "read_info.tsv.gz")
         self.reads_out = gzip.open(self.reads_file, "wb")
+        self.reads_out.write(("\t".join([
+            "name",
+            "mate",
+            "reference",
+            "start",
+            "end",
+            "num_mismatches_in_bp_area",
+            "classification"
+        ]) + "\n").encode())
         self.bp_dist = bp_dist
         self.allow_mismatches = allow_mismatches
         self.interval_mode = interval_mode
@@ -382,8 +395,18 @@ def quantify(self, r1, r2):
             r1_type = "softjunc"
         if not r2_type:
             r2_type = "softjunc"
-        self.reads_out.write("{}\t{}\t{}\t{}\t{}\t{}\n".format(read_name, "R1", seq_name, r1_start, r1_stop, r1_type).encode())
-        self.reads_out.write("{}\t{}\t{}\t{}\t{}\t{}\n".format(read_name, "R2", seq_name, r2_start, r2_stop, r2_type).encode())
+        r1_mismatches = r1_info["nm_in_bp_area"]
+        r2_mismatches = r2_info["nm_in_bp_area"]
+        self.reads_out.write(
+            "{}\t{}\t{}\t{}\t{}\t{}\t{}\n".format(
+                read_name, "R1", seq_name, r1_start, r1_stop, r1_mismatches, r1_type
+            ).encode()
+        )
+        self.reads_out.write(
+            "{}\t{}\t{}\t{}\t{}\t{}\t{}\n".format(
+                read_name, "R2", seq_name, r2_start, r2_stop, r2_mismatches, r2_type
+            ).encode()
+        )
 
 
     def write_results(self):
diff --git a/src/bp_quant/version.py b/src/bp_quant/version.py
index bba4b60..e5a4c08 100644
--- a/src/bp_quant/version.py
+++ b/src/bp_quant/version.py
@@ -1,2 +1,2 @@
-version_info = (0, 5, 0)
+version_info = (0, 5, 1)
 version = '.'.join(str(c) for c in version_info)