Merge pull request #6 from TRON-Bioinformatics/dev

Dev into master
TRON-Bioinformatics · Jun 7, 2023 · 451a3e6 · 451a3e6
2 parents 9c80beb + 3b104b7
commit 451a3e6
Show file tree

Hide file tree

Showing 7 changed files with 151 additions and 29 deletions.
diff --git a/README.md b/README.md
@@ -26,23 +26,22 @@ breakpoint (spanning pairs).
    - pysam (>= 0.16.0.1)
  - STAR (>= 2.7.8a)
  - samtools (>= 1.9)
+ - bowtie2 (>= 2.4.4)
+ - bwa (>= 0.7.17)
 
 ## Installation
 
 ```
 git clone https://github.com/TRON-Bioinformatics/easyquant.git
-```
 
-```
 mv config.ini.sample config.ini
-```
 
-Update `config.ini` with installation paths
+# If you have conda installed you can simply install the environment like this
+conda env create -f environment.yml
 
+# Otherwise you have to adjust the path to the different tools in your config.ini
 ```
-samtools_cmd=/path/to/samtools/1.9/samtools
-star_cmd=/path/to/STAR/2.7.8a/bin/Linux_x86_64_static/STAR
-```
+
 
 ## Usage
 

diff --git a/config.ini.sample b/config.ini.sample
@@ -2,8 +2,7 @@
 version=0.4.1
 
 [commands]
-samtools=/path/to/samtools
-star=/path/to/STAR
-bwa=/path/to/bwa
-bowtie2=/path/to/bowtie2
-quantification=/path/to/easyquant/requantify.py
+samtools=samtools
+star=STAR
+bwa=bwa
+bowtie2=bowtie2
diff --git a/easy_quant.py b/easy_quant.py
@@ -35,6 +35,7 @@ class Easyquant(object):
     def __init__(self, fq1, fq2, bam, seq_tab, bp_distance, working_dir, allow_mismatches, interval_mode):
 
         self.working_dir = os.path.abspath(working_dir)
+        self.module_dir = os.path.dirname(os.path.realpath(__file__))
         IOMethods.create_folder(self.working_dir)
 
         logfile = os.path.join(self.working_dir, 'run.log')
@@ -188,7 +189,8 @@ def run(self, method, num_threads, star_cmd_params):
 
 
             if self.fq1 and self.fq2:
-
+                # TODO: Test quantification performance / runtime
+                # of outputting unaligned reads / singletons
                 align_cmd = "{0} --outFileNamePrefix {1} \
                 --limitOutSAMoneReadBytes 1000000 \
                 --genomeDir {2} \
@@ -198,7 +200,9 @@ def run(self, method, num_threads, star_cmd_params):
                 --alignEndsType EndToEnd \
                 --outFilterMultimapNmax -1 \
                 --outSAMattributes NH HI AS nM NM MD \
-                --outSAMunmapped None \
+                --outSAMunmapped Within KeepPairs \
+                --outFilterScoreMinOverLread 0.3 \
+                --outFilterMatchNminOverLread 0.3 \
                 {5} \
                 --runThreadN {6}".format(
                     self.cfg.get('commands','star'), 
@@ -249,7 +253,7 @@ def run(self, method, num_threads, star_cmd_params):
             interval_mode_str = " --interval_mode"
 
         quant_cmd = "{0} -i {1} -t {2} -d {3} -o {4}{5}{6}".format(
-            self.cfg.get('commands', 'quantification'),
+            os.path.join(self.module_dir, "requantify.py"),
             sam_file,
             self.seq_tab,
             self.bp_distance,

diff --git a/environment.yml b/environment.yml
@@ -0,0 +1,11 @@
+name: Easyquant
+channels:
+  - bioconda
+  - conda-forge
+  - defaults
+dependencies:
+  - bowtie2=2.4.4
+  - bwa=0.7.17
+  - pysam=0.21.0
+  - samtools=1.9
+  - star=2.7.8a
diff --git a/requantify.py b/requantify.py
@@ -74,12 +74,15 @@ def classify_read(aln_start, aln_stop, aln_pairs, intervals, allow_mismatches, b
     Classifies read and returns dict with information on mapping position
     """
 
+    # TODO: Can we extract start and stop position of alignment from aln_pairs to remove redundancy?
+    # Check performance
+
     # define match bases for get_aligned_pairs()
     MATCH_BASES = ['A', 'C', 'G', 'T']
 
     read_info = {"junc": False, "within": False, "interval": "", "anchor": 0}
 
-    for interval_name, ref_start, ref_stop in intervals:
+    for (interval_name, ref_start, ref_stop) in intervals:
         # Check if read spans ref start
         if aln_start <= ref_stop - bp_dist and aln_stop >= ref_stop + bp_dist:
 
@@ -191,12 +194,17 @@ def parse_alignment(self):
 
 
             if r1 and r2:
-                # Ignore unmapped reads and read pairs 
+                # Ignore unmapped read pairs and read pairs 
                 # with unmatching reference or read name
-                if not (r1.is_unmapped or r2.is_unmapped or
-                    r1.query_name != r2.query_name or
-                    r1.reference_name != r2.reference_name):
-                    self.quantify(r1, r2)
+                # TODO: Check if unmapped reference is '*'
+                # and therefore needs more relaxed clause
+                if not (r1.is_unmapped and r2.is_unmapped or
+                    r1.query_name != r2.query_name):
+
+                    if (r1.reference_name == r2.reference_name or
+                        r1.is_unmapped and not r2.is_unmapped or
+                        not r1.is_unmapped and r2.is_unmapped):
+                        self.quantify(r1, r2)
                 r1 = None
                 r2 = None
         logger.info("Quantification done.")
@@ -230,6 +238,11 @@ def quantify(self, r1, r2):
         """
 
 
+#        if r1.is_unmapped or r2.is_unmapped:
+#            print(r1)
+#            print(r2)
+#            print(r1.reference_start, r1.reference_end, r1.reference_name, r2.reference_start, r2.reference_end, r2.reference_name)
+
         read_name = r1.query_name
         seq_name = r1.reference_name
 
@@ -241,17 +254,38 @@ def quantify(self, r1, r2):
             right_interval = self.seq_to_pos[seq_name][1][0]
 
 
-        r1_start = r1.reference_start
-        r1_stop = r1.reference_end
-        r1_pairs = r1.get_aligned_pairs(with_seq=True)
-        r2_start = r2.reference_start
-        r2_stop = r2.reference_end
-        r2_pairs = r2.get_aligned_pairs(with_seq=True)
+        r1_start = -1
+        r1_stop = -1
+        r1_pairs = None
+        if not r1.is_unmapped:
+            r1_start = r1.reference_start
+            r1_stop = r1.reference_end
+            r1_pairs = None
+            try:
+                r1_pairs = r1.get_aligned_pairs(with_seq=True)
+            except:
+                logger.error("Not possible to get read information. Skipping...")
+                return
+
+        r2_start = -1
+        r2_stop = -1
+        r2_pairs = None
+        if not r2.is_unmapped:
+            r2_start = r2.reference_start
+            r2_stop = r2.reference_end
+            r2_pairs = None
+            try:
+                r2_pairs = r2.get_aligned_pairs(with_seq=True)
+            except:
+                logger.error("Not possible to get read information. Skipping...")
+                return
 
         # Get read information [junc, within, interval]
         r1_info = classify_read(r1_start, r1_stop, r1_pairs, self.seq_to_pos[seq_name], self.allow_mismatches, self.bp_dist)
         r2_info = classify_read(r2_start, r2_stop, r2_pairs, self.seq_to_pos[seq_name], self.allow_mismatches, self.bp_dist)
 
+#        print(r1_info, r2_info)
+
         if not self.interval_mode:
             self.counts[seq_name][2] = max([r1_info["anchor"], r2_info["anchor"], self.counts[seq_name][2]])
 

diff --git a/run_test.sh b/run_test.sh
@@ -103,4 +103,20 @@ python easy_quant.py \
   -s example_data/CLDN18_Context_seq.csv \
   -d 10 \
   -o example_out_csv_bam \
-  -t 12 \
+  -t 12
+
+
+#================================================================
+# Test run using uBAM file as input
+#================================================================
+
+# Remove existing output folder
+#rm -rf example_out_csv_ubam
+
+# Run pipeline
+#python easy_quant.py \
+#  -b example_data/example_rna-seq.ubam \
+#  -s example_data/ubam_Context_seq.csv \
+#  -d 10 \
+#  -o example_out_csv_ubam \
+#  -t 12
diff --git a/tests.py b/tests.py
@@ -1,8 +1,17 @@
 #!/usr/bin/env python
 
+import os
 import unittest
 
-from requantify import perc_true, mean, median
+from requantify import perc_true
+from requantify import mean
+from requantify import median
+from requantify import get_seq_to_pos
+from requantify import classify_read
+
+
+SEQ_TABLE_FILE = os.path.join(os.path.dirname(__file__), "example_data", "CLDN18_Context_seq.csv")
+
 
 class TestRequantify(unittest.TestCase):
 
@@ -18,5 +27,55 @@ def test_median(self):
         data = [4, 3, 0, 5, 9, 0, 1, 0, 0, 0]
         self.assertEqual(median(data), 0.5)
 
+    def test_get_seq_to_pos(self):
+        result = {
+            'CLDN18_1': [
+                ('0_400', 0, 400),
+                ('400_786', 400, 786)
+            ],
+            'CLDN18_2': [
+                ('0_361', 0, 361),
+                ('361_747', 361, 747)
+            ],
+            'CLDN18_total': [
+                ('0_400', 0, 400),
+                ('400_786', 400, 786)
+            ],
+            'CLDN18_1_fake': [
+                ('0_400', 0, 400),
+                ('400_786', 400, 786)
+            ],
+            'CLDN18_2_fake': [
+                ('0_361', 0, 361),
+                ('361_747', 361, 747)
+            ],
+            'HPRT1':[
+                ('0_400', 0, 400),
+                ('400_793', 400, 793)
+            ]
+        }
+        self.assertEqual(get_seq_to_pos(SEQ_TABLE_FILE), result)
+
+    def test_classify_read(self):
+        aln_pairs = [(0, 365, 'C'), (1, 366, 'C'), (2, 367, 'C'), (3, 368, 'T'), (4, 369, 'A'), (5, 370, 'T'), (6, 371, 'T'), (7, 372, 'T'), (8, 373, 'C'), (9, 374, 'A'), (10, 375, 'C'), (11, 376, 'C'), (12, 377, 'A'), (13, 378, 'T'), (14, 379, 'C'), (15, 380, 'C'), (16, 381, 'T'), (17, 382, 'G'), (18, 383, 'G'), (19, 384, 'G'), (20, 385, 'A'), (21, 386, 'C'), (22, 387, 'T'), (23, 388, 'T'), (24, 389, 'C'), (25, 390, 'C'), (26, 391, 'A'), (27, 392, 'G'), (28, 393, 'C'), (29, 394, 'C'), (30, 395, 'A'), (31, 396, 'T'), (32, 397, 'G'), (33, 398, 'C'), (34, 399, 'T'), (35, 400, 'G'), (36, 401, 'C'), (37, 402, 'A'), (38, 403, 'G'), (39, 404, 'G'), (40, 405, 'C'), (41, 406, 'A'), (42, 407, 'G'), (43, 408, 'T'), (44, 409, 'G'), (45, 410, 'C'), (46, 411, 'G'), (47, 412, 'A'), (48, 413, 'G'), (49, 414, 'C'), (50, 415, 'C')]
+        interval = [
+            ('0_400', 0, 400),
+            ('400_786', 400, 786)
+        ]
+        result = {"junc": True, "within": False, "interval": "0_400", "anchor": 15}
+        self.assertEqual(classify_read(365, 415, aln_pairs, interval, True, 10), result)
+
+
+        aln_pairs = [(0, 609, 'C'), (1, 610, 'T'), (2, 611, 'C'), (3, 612, 'A'), (4, 613, 'C'), (5, 614, 'C'), (6, 615, 'C'), (7, 616, 'A'), (8, 617, 'A'), (9, 618, 'A'), (10, 619, 'A'), (11, 620, 'A'), (12, 621, 'A'), (13, None, None), (14, 622, 'C'), (15, 623, 'A'), (16, 624, 'A'), (17, 625, 'G'), (18, 626, 'G'), (19, 627, 'A'), (20, 628, 'G'), (21, 629, 'A'), (22, 630, 'T'), (23, 631, 'C'), (24, 632, 'C'), (25, 633, 'C'), (26, 634, 'A'), (27, 635, 'T'), (28, 636, 'C'), (29, 637, 'T'), (30, 638, 'A'), (31, 639, 'G'), (32, 640, 'A'), (33, 641, 'T'), (34, 642, 'T'), (35, 643, 'T'), (36, 644, 'C'), (37, 645, 'T'), (38, 646, 'T'), (39, 647, 'C'), (40, 648, 'T'), (41, 649, 'T'), (42, 650, 'G'), (43, 651, 'C'), (44, 652, 'T'), (45, 653, 'T'), (46, 654, 'T'), (47, 655, 'T'), (48, 656, 'G'), (49, 657, 'A'), (50, 658, 'C')]
+
+        interval = [
+            ('0_400', 0, 400),
+            ('400_786', 400, 786)
+        ]
+        result = {"junc": False, "within": True, "interval": "400_786", "anchor": 0}
+        self.assertEqual(classify_read(609, 658, aln_pairs, interval, True, 10), result)
+
+
+
 if __name__ == "__main__":
     unittest.main()