Skip to content

Commit

Permalink
Merge pull request #6 from TRON-Bioinformatics/dev
Browse files Browse the repository at this point in the history
Dev into master
  • Loading branch information
patricksorn authored Jun 7, 2023
2 parents 9c80beb + 3b104b7 commit 451a3e6
Show file tree
Hide file tree
Showing 7 changed files with 151 additions and 29 deletions.
13 changes: 6 additions & 7 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -26,23 +26,22 @@ breakpoint (spanning pairs).
- pysam (>= 0.16.0.1)
- STAR (>= 2.7.8a)
- samtools (>= 1.9)
- bowtie2 (>= 2.4.4)
- bwa (>= 0.7.17)

## Installation

```
git clone https://github.com/TRON-Bioinformatics/easyquant.git
```
```
mv config.ini.sample config.ini
```
Update `config.ini` with installation paths
# If you have conda installed you can simply install the environment like this
conda env create -f environment.yml
# Otherwise you have to adjust the path to the different tools in your config.ini
```
samtools_cmd=/path/to/samtools/1.9/samtools
star_cmd=/path/to/STAR/2.7.8a/bin/Linux_x86_64_static/STAR
```


## Usage

Expand Down
9 changes: 4 additions & 5 deletions config.ini.sample
Original file line number Diff line number Diff line change
Expand Up @@ -2,8 +2,7 @@
version=0.4.1

[commands]
samtools=/path/to/samtools
star=/path/to/STAR
bwa=/path/to/bwa
bowtie2=/path/to/bowtie2
quantification=/path/to/easyquant/requantify.py
samtools=samtools
star=STAR
bwa=bwa
bowtie2=bowtie2
10 changes: 7 additions & 3 deletions easy_quant.py
Original file line number Diff line number Diff line change
Expand Up @@ -35,6 +35,7 @@ class Easyquant(object):
def __init__(self, fq1, fq2, bam, seq_tab, bp_distance, working_dir, allow_mismatches, interval_mode):

self.working_dir = os.path.abspath(working_dir)
self.module_dir = os.path.dirname(os.path.realpath(__file__))
IOMethods.create_folder(self.working_dir)

logfile = os.path.join(self.working_dir, 'run.log')
Expand Down Expand Up @@ -188,7 +189,8 @@ def run(self, method, num_threads, star_cmd_params):


if self.fq1 and self.fq2:

# TODO: Test quantification performance / runtime
# of outputting unaligned reads / singletons
align_cmd = "{0} --outFileNamePrefix {1} \
--limitOutSAMoneReadBytes 1000000 \
--genomeDir {2} \
Expand All @@ -198,7 +200,9 @@ def run(self, method, num_threads, star_cmd_params):
--alignEndsType EndToEnd \
--outFilterMultimapNmax -1 \
--outSAMattributes NH HI AS nM NM MD \
--outSAMunmapped None \
--outSAMunmapped Within KeepPairs \
--outFilterScoreMinOverLread 0.3 \
--outFilterMatchNminOverLread 0.3 \
{5} \
--runThreadN {6}".format(
self.cfg.get('commands','star'),
Expand Down Expand Up @@ -249,7 +253,7 @@ def run(self, method, num_threads, star_cmd_params):
interval_mode_str = " --interval_mode"

quant_cmd = "{0} -i {1} -t {2} -d {3} -o {4}{5}{6}".format(
self.cfg.get('commands', 'quantification'),
os.path.join(self.module_dir, "requantify.py"),
sam_file,
self.seq_tab,
self.bp_distance,
Expand Down
11 changes: 11 additions & 0 deletions environment.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,11 @@
name: Easyquant
channels:
- bioconda
- conda-forge
- defaults
dependencies:
- bowtie2=2.4.4
- bwa=0.7.17
- pysam=0.21.0
- samtools=1.9
- star=2.7.8a
58 changes: 46 additions & 12 deletions requantify.py
Original file line number Diff line number Diff line change
Expand Up @@ -74,12 +74,15 @@ def classify_read(aln_start, aln_stop, aln_pairs, intervals, allow_mismatches, b
Classifies read and returns dict with information on mapping position
"""

# TODO: Can we extract start and stop position of alignment from aln_pairs to remove redundancy?
# Check performance

# define match bases for get_aligned_pairs()
MATCH_BASES = ['A', 'C', 'G', 'T']

read_info = {"junc": False, "within": False, "interval": "", "anchor": 0}

for interval_name, ref_start, ref_stop in intervals:
for (interval_name, ref_start, ref_stop) in intervals:
# Check if read spans ref start
if aln_start <= ref_stop - bp_dist and aln_stop >= ref_stop + bp_dist:

Expand Down Expand Up @@ -191,12 +194,17 @@ def parse_alignment(self):


if r1 and r2:
# Ignore unmapped reads and read pairs
# Ignore unmapped read pairs and read pairs
# with unmatching reference or read name
if not (r1.is_unmapped or r2.is_unmapped or
r1.query_name != r2.query_name or
r1.reference_name != r2.reference_name):
self.quantify(r1, r2)
# TODO: Check if unmapped reference is '*'
# and therefore needs more relaxed clause
if not (r1.is_unmapped and r2.is_unmapped or
r1.query_name != r2.query_name):

if (r1.reference_name == r2.reference_name or
r1.is_unmapped and not r2.is_unmapped or
not r1.is_unmapped and r2.is_unmapped):
self.quantify(r1, r2)
r1 = None
r2 = None
logger.info("Quantification done.")
Expand Down Expand Up @@ -230,6 +238,11 @@ def quantify(self, r1, r2):
"""


# if r1.is_unmapped or r2.is_unmapped:
# print(r1)
# print(r2)
# print(r1.reference_start, r1.reference_end, r1.reference_name, r2.reference_start, r2.reference_end, r2.reference_name)

read_name = r1.query_name
seq_name = r1.reference_name

Expand All @@ -241,17 +254,38 @@ def quantify(self, r1, r2):
right_interval = self.seq_to_pos[seq_name][1][0]


r1_start = r1.reference_start
r1_stop = r1.reference_end
r1_pairs = r1.get_aligned_pairs(with_seq=True)
r2_start = r2.reference_start
r2_stop = r2.reference_end
r2_pairs = r2.get_aligned_pairs(with_seq=True)
r1_start = -1
r1_stop = -1
r1_pairs = None
if not r1.is_unmapped:
r1_start = r1.reference_start
r1_stop = r1.reference_end
r1_pairs = None
try:
r1_pairs = r1.get_aligned_pairs(with_seq=True)
except:
logger.error("Not possible to get read information. Skipping...")
return

r2_start = -1
r2_stop = -1
r2_pairs = None
if not r2.is_unmapped:
r2_start = r2.reference_start
r2_stop = r2.reference_end
r2_pairs = None
try:
r2_pairs = r2.get_aligned_pairs(with_seq=True)
except:
logger.error("Not possible to get read information. Skipping...")
return

# Get read information [junc, within, interval]
r1_info = classify_read(r1_start, r1_stop, r1_pairs, self.seq_to_pos[seq_name], self.allow_mismatches, self.bp_dist)
r2_info = classify_read(r2_start, r2_stop, r2_pairs, self.seq_to_pos[seq_name], self.allow_mismatches, self.bp_dist)

# print(r1_info, r2_info)

if not self.interval_mode:
self.counts[seq_name][2] = max([r1_info["anchor"], r2_info["anchor"], self.counts[seq_name][2]])

Expand Down
18 changes: 17 additions & 1 deletion run_test.sh
Original file line number Diff line number Diff line change
Expand Up @@ -103,4 +103,20 @@ python easy_quant.py \
-s example_data/CLDN18_Context_seq.csv \
-d 10 \
-o example_out_csv_bam \
-t 12 \
-t 12


#================================================================
# Test run using uBAM file as input
#================================================================

# Remove existing output folder
#rm -rf example_out_csv_ubam

# Run pipeline
#python easy_quant.py \
# -b example_data/example_rna-seq.ubam \
# -s example_data/ubam_Context_seq.csv \
# -d 10 \
# -o example_out_csv_ubam \
# -t 12
61 changes: 60 additions & 1 deletion tests.py
Original file line number Diff line number Diff line change
@@ -1,8 +1,17 @@
#!/usr/bin/env python

import os
import unittest

from requantify import perc_true, mean, median
from requantify import perc_true
from requantify import mean
from requantify import median
from requantify import get_seq_to_pos
from requantify import classify_read


SEQ_TABLE_FILE = os.path.join(os.path.dirname(__file__), "example_data", "CLDN18_Context_seq.csv")


class TestRequantify(unittest.TestCase):

Expand All @@ -18,5 +27,55 @@ def test_median(self):
data = [4, 3, 0, 5, 9, 0, 1, 0, 0, 0]
self.assertEqual(median(data), 0.5)

def test_get_seq_to_pos(self):
result = {
'CLDN18_1': [
('0_400', 0, 400),
('400_786', 400, 786)
],
'CLDN18_2': [
('0_361', 0, 361),
('361_747', 361, 747)
],
'CLDN18_total': [
('0_400', 0, 400),
('400_786', 400, 786)
],
'CLDN18_1_fake': [
('0_400', 0, 400),
('400_786', 400, 786)
],
'CLDN18_2_fake': [
('0_361', 0, 361),
('361_747', 361, 747)
],
'HPRT1':[
('0_400', 0, 400),
('400_793', 400, 793)
]
}
self.assertEqual(get_seq_to_pos(SEQ_TABLE_FILE), result)

def test_classify_read(self):
aln_pairs = [(0, 365, 'C'), (1, 366, 'C'), (2, 367, 'C'), (3, 368, 'T'), (4, 369, 'A'), (5, 370, 'T'), (6, 371, 'T'), (7, 372, 'T'), (8, 373, 'C'), (9, 374, 'A'), (10, 375, 'C'), (11, 376, 'C'), (12, 377, 'A'), (13, 378, 'T'), (14, 379, 'C'), (15, 380, 'C'), (16, 381, 'T'), (17, 382, 'G'), (18, 383, 'G'), (19, 384, 'G'), (20, 385, 'A'), (21, 386, 'C'), (22, 387, 'T'), (23, 388, 'T'), (24, 389, 'C'), (25, 390, 'C'), (26, 391, 'A'), (27, 392, 'G'), (28, 393, 'C'), (29, 394, 'C'), (30, 395, 'A'), (31, 396, 'T'), (32, 397, 'G'), (33, 398, 'C'), (34, 399, 'T'), (35, 400, 'G'), (36, 401, 'C'), (37, 402, 'A'), (38, 403, 'G'), (39, 404, 'G'), (40, 405, 'C'), (41, 406, 'A'), (42, 407, 'G'), (43, 408, 'T'), (44, 409, 'G'), (45, 410, 'C'), (46, 411, 'G'), (47, 412, 'A'), (48, 413, 'G'), (49, 414, 'C'), (50, 415, 'C')]
interval = [
('0_400', 0, 400),
('400_786', 400, 786)
]
result = {"junc": True, "within": False, "interval": "0_400", "anchor": 15}
self.assertEqual(classify_read(365, 415, aln_pairs, interval, True, 10), result)


aln_pairs = [(0, 609, 'C'), (1, 610, 'T'), (2, 611, 'C'), (3, 612, 'A'), (4, 613, 'C'), (5, 614, 'C'), (6, 615, 'C'), (7, 616, 'A'), (8, 617, 'A'), (9, 618, 'A'), (10, 619, 'A'), (11, 620, 'A'), (12, 621, 'A'), (13, None, None), (14, 622, 'C'), (15, 623, 'A'), (16, 624, 'A'), (17, 625, 'G'), (18, 626, 'G'), (19, 627, 'A'), (20, 628, 'G'), (21, 629, 'A'), (22, 630, 'T'), (23, 631, 'C'), (24, 632, 'C'), (25, 633, 'C'), (26, 634, 'A'), (27, 635, 'T'), (28, 636, 'C'), (29, 637, 'T'), (30, 638, 'A'), (31, 639, 'G'), (32, 640, 'A'), (33, 641, 'T'), (34, 642, 'T'), (35, 643, 'T'), (36, 644, 'C'), (37, 645, 'T'), (38, 646, 'T'), (39, 647, 'C'), (40, 648, 'T'), (41, 649, 'T'), (42, 650, 'G'), (43, 651, 'C'), (44, 652, 'T'), (45, 653, 'T'), (46, 654, 'T'), (47, 655, 'T'), (48, 656, 'G'), (49, 657, 'A'), (50, 658, 'C')]

interval = [
('0_400', 0, 400),
('400_786', 400, 786)
]
result = {"junc": False, "within": True, "interval": "400_786", "anchor": 0}
self.assertEqual(classify_read(609, 658, aln_pairs, interval, True, 10), result)



if __name__ == "__main__":
unittest.main()

0 comments on commit 451a3e6

Please sign in to comment.