diff --git a/README.md b/README.md index 510f88f..8122ea6 100644 --- a/README.md +++ b/README.md @@ -26,23 +26,22 @@ breakpoint (spanning pairs). - pysam (>= 0.16.0.1) - STAR (>= 2.7.8a) - samtools (>= 1.9) + - bowtie2 (>= 2.4.4) + - bwa (>= 0.7.17) ## Installation ``` git clone https://github.com/TRON-Bioinformatics/easyquant.git -``` -``` mv config.ini.sample config.ini -``` -Update `config.ini` with installation paths +# If you have conda installed you can simply install the environment like this +conda env create -f environment.yml +# Otherwise you have to adjust the path to the different tools in your config.ini ``` -samtools_cmd=/path/to/samtools/1.9/samtools -star_cmd=/path/to/STAR/2.7.8a/bin/Linux_x86_64_static/STAR -``` + ## Usage diff --git a/config.ini.sample b/config.ini.sample index d81dc06..be20811 100755 --- a/config.ini.sample +++ b/config.ini.sample @@ -2,8 +2,7 @@ version=0.4.1 [commands] -samtools=/path/to/samtools -star=/path/to/STAR -bwa=/path/to/bwa -bowtie2=/path/to/bowtie2 -quantification=/path/to/easyquant/requantify.py +samtools=samtools +star=STAR +bwa=bwa +bowtie2=bowtie2 diff --git a/easy_quant.py b/easy_quant.py index 8a87dc8..e574570 100755 --- a/easy_quant.py +++ b/easy_quant.py @@ -35,6 +35,7 @@ class Easyquant(object): def __init__(self, fq1, fq2, bam, seq_tab, bp_distance, working_dir, allow_mismatches, interval_mode): self.working_dir = os.path.abspath(working_dir) + self.module_dir = os.path.dirname(os.path.realpath(__file__)) IOMethods.create_folder(self.working_dir) logfile = os.path.join(self.working_dir, 'run.log') @@ -188,7 +189,8 @@ def run(self, method, num_threads, star_cmd_params): if self.fq1 and self.fq2: - + # TODO: Test quantification performance / runtime + # of outputting unaligned reads / singletons align_cmd = "{0} --outFileNamePrefix {1} \ --limitOutSAMoneReadBytes 1000000 \ --genomeDir {2} \ @@ -198,7 +200,9 @@ def run(self, method, num_threads, star_cmd_params): --alignEndsType EndToEnd \ --outFilterMultimapNmax -1 \ --outSAMattributes NH HI AS nM NM MD \ - --outSAMunmapped None \ + --outSAMunmapped Within KeepPairs \ + --outFilterScoreMinOverLread 0.3 \ + --outFilterMatchNminOverLread 0.3 \ {5} \ --runThreadN {6}".format( self.cfg.get('commands','star'), @@ -249,7 +253,7 @@ def run(self, method, num_threads, star_cmd_params): interval_mode_str = " --interval_mode" quant_cmd = "{0} -i {1} -t {2} -d {3} -o {4}{5}{6}".format( - self.cfg.get('commands', 'quantification'), + os.path.join(self.module_dir, "requantify.py"), sam_file, self.seq_tab, self.bp_distance, diff --git a/environment.yml b/environment.yml new file mode 100644 index 0000000..0c18174 --- /dev/null +++ b/environment.yml @@ -0,0 +1,11 @@ +name: Easyquant +channels: + - bioconda + - conda-forge + - defaults +dependencies: + - bowtie2=2.4.4 + - bwa=0.7.17 + - pysam=0.21.0 + - samtools=1.9 + - star=2.7.8a diff --git a/requantify.py b/requantify.py index 8333b71..225ed58 100755 --- a/requantify.py +++ b/requantify.py @@ -74,12 +74,15 @@ def classify_read(aln_start, aln_stop, aln_pairs, intervals, allow_mismatches, b Classifies read and returns dict with information on mapping position """ + # TODO: Can we extract start and stop position of alignment from aln_pairs to remove redundancy? + # Check performance + # define match bases for get_aligned_pairs() MATCH_BASES = ['A', 'C', 'G', 'T'] read_info = {"junc": False, "within": False, "interval": "", "anchor": 0} - for interval_name, ref_start, ref_stop in intervals: + for (interval_name, ref_start, ref_stop) in intervals: # Check if read spans ref start if aln_start <= ref_stop - bp_dist and aln_stop >= ref_stop + bp_dist: @@ -191,12 +194,17 @@ def parse_alignment(self): if r1 and r2: - # Ignore unmapped reads and read pairs + # Ignore unmapped read pairs and read pairs # with unmatching reference or read name - if not (r1.is_unmapped or r2.is_unmapped or - r1.query_name != r2.query_name or - r1.reference_name != r2.reference_name): - self.quantify(r1, r2) + # TODO: Check if unmapped reference is '*' + # and therefore needs more relaxed clause + if not (r1.is_unmapped and r2.is_unmapped or + r1.query_name != r2.query_name): + + if (r1.reference_name == r2.reference_name or + r1.is_unmapped and not r2.is_unmapped or + not r1.is_unmapped and r2.is_unmapped): + self.quantify(r1, r2) r1 = None r2 = None logger.info("Quantification done.") @@ -230,6 +238,11 @@ def quantify(self, r1, r2): """ +# if r1.is_unmapped or r2.is_unmapped: +# print(r1) +# print(r2) +# print(r1.reference_start, r1.reference_end, r1.reference_name, r2.reference_start, r2.reference_end, r2.reference_name) + read_name = r1.query_name seq_name = r1.reference_name @@ -241,17 +254,38 @@ def quantify(self, r1, r2): right_interval = self.seq_to_pos[seq_name][1][0] - r1_start = r1.reference_start - r1_stop = r1.reference_end - r1_pairs = r1.get_aligned_pairs(with_seq=True) - r2_start = r2.reference_start - r2_stop = r2.reference_end - r2_pairs = r2.get_aligned_pairs(with_seq=True) + r1_start = -1 + r1_stop = -1 + r1_pairs = None + if not r1.is_unmapped: + r1_start = r1.reference_start + r1_stop = r1.reference_end + r1_pairs = None + try: + r1_pairs = r1.get_aligned_pairs(with_seq=True) + except: + logger.error("Not possible to get read information. Skipping...") + return + + r2_start = -1 + r2_stop = -1 + r2_pairs = None + if not r2.is_unmapped: + r2_start = r2.reference_start + r2_stop = r2.reference_end + r2_pairs = None + try: + r2_pairs = r2.get_aligned_pairs(with_seq=True) + except: + logger.error("Not possible to get read information. Skipping...") + return # Get read information [junc, within, interval] r1_info = classify_read(r1_start, r1_stop, r1_pairs, self.seq_to_pos[seq_name], self.allow_mismatches, self.bp_dist) r2_info = classify_read(r2_start, r2_stop, r2_pairs, self.seq_to_pos[seq_name], self.allow_mismatches, self.bp_dist) +# print(r1_info, r2_info) + if not self.interval_mode: self.counts[seq_name][2] = max([r1_info["anchor"], r2_info["anchor"], self.counts[seq_name][2]]) diff --git a/run_test.sh b/run_test.sh index ff22110..c1512c1 100644 --- a/run_test.sh +++ b/run_test.sh @@ -103,4 +103,20 @@ python easy_quant.py \ -s example_data/CLDN18_Context_seq.csv \ -d 10 \ -o example_out_csv_bam \ - -t 12 \ + -t 12 + + +#================================================================ +# Test run using uBAM file as input +#================================================================ + +# Remove existing output folder +#rm -rf example_out_csv_ubam + +# Run pipeline +#python easy_quant.py \ +# -b example_data/example_rna-seq.ubam \ +# -s example_data/ubam_Context_seq.csv \ +# -d 10 \ +# -o example_out_csv_ubam \ +# -t 12 diff --git a/tests.py b/tests.py index 70ca2b1..5942d2d 100644 --- a/tests.py +++ b/tests.py @@ -1,8 +1,17 @@ #!/usr/bin/env python +import os import unittest -from requantify import perc_true, mean, median +from requantify import perc_true +from requantify import mean +from requantify import median +from requantify import get_seq_to_pos +from requantify import classify_read + + +SEQ_TABLE_FILE = os.path.join(os.path.dirname(__file__), "example_data", "CLDN18_Context_seq.csv") + class TestRequantify(unittest.TestCase): @@ -18,5 +27,55 @@ def test_median(self): data = [4, 3, 0, 5, 9, 0, 1, 0, 0, 0] self.assertEqual(median(data), 0.5) + def test_get_seq_to_pos(self): + result = { + 'CLDN18_1': [ + ('0_400', 0, 400), + ('400_786', 400, 786) + ], + 'CLDN18_2': [ + ('0_361', 0, 361), + ('361_747', 361, 747) + ], + 'CLDN18_total': [ + ('0_400', 0, 400), + ('400_786', 400, 786) + ], + 'CLDN18_1_fake': [ + ('0_400', 0, 400), + ('400_786', 400, 786) + ], + 'CLDN18_2_fake': [ + ('0_361', 0, 361), + ('361_747', 361, 747) + ], + 'HPRT1':[ + ('0_400', 0, 400), + ('400_793', 400, 793) + ] + } + self.assertEqual(get_seq_to_pos(SEQ_TABLE_FILE), result) + + def test_classify_read(self): + aln_pairs = [(0, 365, 'C'), (1, 366, 'C'), (2, 367, 'C'), (3, 368, 'T'), (4, 369, 'A'), (5, 370, 'T'), (6, 371, 'T'), (7, 372, 'T'), (8, 373, 'C'), (9, 374, 'A'), (10, 375, 'C'), (11, 376, 'C'), (12, 377, 'A'), (13, 378, 'T'), (14, 379, 'C'), (15, 380, 'C'), (16, 381, 'T'), (17, 382, 'G'), (18, 383, 'G'), (19, 384, 'G'), (20, 385, 'A'), (21, 386, 'C'), (22, 387, 'T'), (23, 388, 'T'), (24, 389, 'C'), (25, 390, 'C'), (26, 391, 'A'), (27, 392, 'G'), (28, 393, 'C'), (29, 394, 'C'), (30, 395, 'A'), (31, 396, 'T'), (32, 397, 'G'), (33, 398, 'C'), (34, 399, 'T'), (35, 400, 'G'), (36, 401, 'C'), (37, 402, 'A'), (38, 403, 'G'), (39, 404, 'G'), (40, 405, 'C'), (41, 406, 'A'), (42, 407, 'G'), (43, 408, 'T'), (44, 409, 'G'), (45, 410, 'C'), (46, 411, 'G'), (47, 412, 'A'), (48, 413, 'G'), (49, 414, 'C'), (50, 415, 'C')] + interval = [ + ('0_400', 0, 400), + ('400_786', 400, 786) + ] + result = {"junc": True, "within": False, "interval": "0_400", "anchor": 15} + self.assertEqual(classify_read(365, 415, aln_pairs, interval, True, 10), result) + + + aln_pairs = [(0, 609, 'C'), (1, 610, 'T'), (2, 611, 'C'), (3, 612, 'A'), (4, 613, 'C'), (5, 614, 'C'), (6, 615, 'C'), (7, 616, 'A'), (8, 617, 'A'), (9, 618, 'A'), (10, 619, 'A'), (11, 620, 'A'), (12, 621, 'A'), (13, None, None), (14, 622, 'C'), (15, 623, 'A'), (16, 624, 'A'), (17, 625, 'G'), (18, 626, 'G'), (19, 627, 'A'), (20, 628, 'G'), (21, 629, 'A'), (22, 630, 'T'), (23, 631, 'C'), (24, 632, 'C'), (25, 633, 'C'), (26, 634, 'A'), (27, 635, 'T'), (28, 636, 'C'), (29, 637, 'T'), (30, 638, 'A'), (31, 639, 'G'), (32, 640, 'A'), (33, 641, 'T'), (34, 642, 'T'), (35, 643, 'T'), (36, 644, 'C'), (37, 645, 'T'), (38, 646, 'T'), (39, 647, 'C'), (40, 648, 'T'), (41, 649, 'T'), (42, 650, 'G'), (43, 651, 'C'), (44, 652, 'T'), (45, 653, 'T'), (46, 654, 'T'), (47, 655, 'T'), (48, 656, 'G'), (49, 657, 'A'), (50, 658, 'C')] + + interval = [ + ('0_400', 0, 400), + ('400_786', 400, 786) + ] + result = {"junc": False, "within": True, "interval": "400_786", "anchor": 0} + self.assertEqual(classify_read(609, 658, aln_pairs, interval, True, 10), result) + + + if __name__ == "__main__": unittest.main()