From dc5566147dff3af83b71db59a8c178ace309b268 Mon Sep 17 00:00:00 2001 From: martin-mann Date: Tue, 4 Feb 2020 17:26:59 +0100 Subject: [PATCH 01/16] improved Zall estimate for model=X --- ChangeLog | 11 ++++++++++ .../PredictorMfe2dHeuristicSeedExtension.cpp | 21 ++++++++++++++++--- 2 files changed, 29 insertions(+), 3 deletions(-) diff --git a/ChangeLog b/ChangeLog index 7784a54d..7b2de279 100644 --- a/ChangeLog +++ b/ChangeLog @@ -10,9 +10,20 @@ # changes in development version since last release ################################################################################ +# IntaRNA +- improved Zall estimate (and depending values) for `--model=X` (default) + ################################################################################ ################################################################################ +################################################################################ + +200131 Martin Raden + * IntaRNA/PredictorMfe2dHeuristicSeedExtension : + * fillHybridE_left() : + + extended Zall update (additional save cases considered) + + ################################################################################ ### version 3.1.5 ################################################################################ diff --git a/src/IntaRNA/PredictorMfe2dHeuristicSeedExtension.cpp b/src/IntaRNA/PredictorMfe2dHeuristicSeedExtension.cpp index 4e124e0f..0dc4a7c4 100644 --- a/src/IntaRNA/PredictorMfe2dHeuristicSeedExtension.cpp +++ b/src/IntaRNA/PredictorMfe2dHeuristicSeedExtension.cpp @@ -267,8 +267,16 @@ fillHybridE_left( const size_t si1, const size_t si2 ) } // update mfe if needed if ( E_isNotINF( curMinE ) ) { + // safe cases for Zall update: + // - i==si && j==sj : seed only + // - i!=si && !seedBound(i) && + // ? j==sj : left seed extension + // ? or no seed between i and si : si == left-most seed + bool isSeedOrLeftExt = (i1==si1 && i2==si2) + || (j1opt == sj1 && !seedHandler.isSeedBound(i1,i2)); + // leftE (incl. E_init) + seedE - updateOptima( i1,sj1,i2,sj2, curMinE + seedE, true, i1==si1 && i2==si2 ); + updateOptima( i1,sj1,i2,sj2, curMinE + seedE, true, isSeedOrLeftExt ); // check if right opt != right seed boundary // check for max interaction length @@ -276,9 +284,16 @@ fillHybridE_left( const size_t si1, const size_t si2 ) && (j1opt+1-i1)<=energy.getAccessibility1().getMaxLength() && (j2opt+1-i2)<=energy.getAccessibility2().getMaxLength() ) { + // check if no seed between i and si + // or no Z update for left extensions to avoid duplicated handling (underestimates Zall) + bool noSeedLeftOfsi = (i1 < si1 && !seedHandler.isSeedBound(i1,i2)); + if (noSeedLeftOfsi) { + size_t x1=si1, x2=si2; + // check of no seed start between i and si + noSeedLeftOfsi = !(seedHandler.updateToNextSeed(x1,x2, i1+1,si1-1, i2+1, si2-1)); + } // leftE (incl. E_init) + seedE + rightOptE - // no Z update for left extensions to avoid duplicated handling (underestimates Zall) - updateOptima( i1,j1opt,i2,j2opt, curMinE + seedE + rightOptE, true, false ); + updateOptima( i1,j1opt,i2,j2opt, curMinE + seedE + rightOptE, true, noSeedLeftOfsi ); } } } From 7da60ebbaebc9a23a485c84d3ac64205d1a96c91 Mon Sep 17 00:00:00 2001 From: martin-mann Date: Thu, 6 Feb 2020 13:18:12 +0100 Subject: [PATCH 02/16] fix Zall fix --- src/IntaRNA/PredictorMfe2dHeuristicSeedExtension.cpp | 8 ++------ 1 file changed, 2 insertions(+), 6 deletions(-) diff --git a/src/IntaRNA/PredictorMfe2dHeuristicSeedExtension.cpp b/src/IntaRNA/PredictorMfe2dHeuristicSeedExtension.cpp index 0dc4a7c4..86abd0c6 100644 --- a/src/IntaRNA/PredictorMfe2dHeuristicSeedExtension.cpp +++ b/src/IntaRNA/PredictorMfe2dHeuristicSeedExtension.cpp @@ -269,14 +269,10 @@ fillHybridE_left( const size_t si1, const size_t si2 ) if ( E_isNotINF( curMinE ) ) { // safe cases for Zall update: // - i==si && j==sj : seed only - // - i!=si && !seedBound(i) && - // ? j==sj : left seed extension - // ? or no seed between i and si : si == left-most seed - bool isSeedOrLeftExt = (i1==si1 && i2==si2) - || (j1opt == sj1 && !seedHandler.isSeedBound(i1,i2)); + // - i!=si && and no seed between i (incl.) and si (excl) : si == left-most seed // leftE (incl. E_init) + seedE - updateOptima( i1,sj1,i2,sj2, curMinE + seedE, true, isSeedOrLeftExt ); + updateOptima( i1,sj1,i2,sj2, curMinE + seedE, true, (i1==si1 && i2==si2) ); // check if right opt != right seed boundary // check for max interaction length From 8062aa406dd7502867eecdfcce471818033d973c Mon Sep 17 00:00:00 2001 From: martin-mann Date: Thu, 6 Feb 2020 18:14:08 +0100 Subject: [PATCH 03/16] copomus added --- configure.ac | 21 +- python/CopomuS.py | 239 ++++++++++++++++++ python/Makefile.am | 21 +- python/README.md | 86 +------ python/bin/Makefile.am | 10 + python/copomus/CopomuS-logo.jpg | Bin 0 -> 8353 bytes python/copomus/IntaRNA.py | 127 ++++++++++ python/copomus/Makefile.am | 30 +++ python/copomus/README.md | 98 ++++++++ python/copomus/__init__.py | 1 + python/copomus/candidate_filters.py | 154 ++++++++++++ python/copomus/candidate_selectors.py | 103 ++++++++ python/copomus/indexing.py | 50 ++++ python/copomus/measures.py | 334 ++++++++++++++++++++++++++ python/copomus/mutation.py | 35 +++ python/copomus/mutation_generators.py | 92 +++++++ python/intarnapvalue/README.md | 86 +++++++ 17 files changed, 1404 insertions(+), 83 deletions(-) create mode 100644 python/CopomuS.py create mode 100644 python/copomus/CopomuS-logo.jpg create mode 100644 python/copomus/IntaRNA.py create mode 100644 python/copomus/Makefile.am create mode 100644 python/copomus/README.md create mode 100644 python/copomus/__init__.py create mode 100644 python/copomus/candidate_filters.py create mode 100644 python/copomus/candidate_selectors.py create mode 100644 python/copomus/indexing.py create mode 100644 python/copomus/measures.py create mode 100644 python/copomus/mutation.py create mode 100644 python/copomus/mutation_generators.py create mode 100644 python/intarnapvalue/README.md diff --git a/configure.ac b/configure.ac index 1b48235f..44c1f7db 100644 --- a/configure.ac +++ b/configure.ac @@ -8,7 +8,7 @@ BOOST_REQUIRED_VERSION=1.50.0 # minimal required version of the Vienna RNA library VRNA_REQUIRED_VERSION=2.4.14 # minimal required version of python interpreter -PYTHON_REQUIRED_VERSION=3.6 +PYTHON_REQUIRED_VERSION=3.7 AC_CANONICAL_HOST @@ -283,6 +283,23 @@ AC_MSG_RESULT([$intarnapvalue]) AM_CONDITIONAL([INSTALL_INTARNAPVALUE],[test x"$intarnapvalue" = x"yes"]) +############################# +# COPOMUS SCRIPT +############################# + +AC_MSG_CHECKING([whether to compile and install 'CopomuS' python module]) +copomus=yes +AC_ARG_ENABLE([CopomuS], + [AS_HELP_STRING([--enable-CopomuS], + [build and install the CopomuS python3 module])], + [AM_COND_IF([HAVE_PYTHON], [copomus="$enableval"], [copomus="no (python not available)"])], + [AM_COND_IF([HAVE_PYTHON], [copomus=yes], [copomus="no (python not available)"])] + ) +AC_MSG_RESULT([$copomus]) + +AM_CONDITIONAL([INSTALL_COPOMUS],[test x"$copomus" = x"yes"]) + + ############################################################################### @@ -436,6 +453,7 @@ AC_CONFIG_FILES([R/Makefile]) AC_CONFIG_FILES([python/Makefile]) AC_CONFIG_FILES([python/bin/Makefile]) AC_CONFIG_FILES([python/intarnapvalue/Makefile]) +AC_CONFIG_FILES([python/copomus/Makefile]) AC_CONFIG_FILES([tests/Makefile]) AC_CONFIG_FILES([tests/data/Makefile]) AC_CONFIG_FILES([doc/Makefile]) @@ -461,6 +479,7 @@ Features * log coloring : $logColoring * OpenMP multi-threading : $multithreadingEnabled * intarnapvalue : $intarnapvalue + * CopomuS : $copomus Install Directories for given 'prefix' -------------------------------------- diff --git a/python/CopomuS.py b/python/CopomuS.py new file mode 100644 index 00000000..e21efc2f --- /dev/null +++ b/python/CopomuS.py @@ -0,0 +1,239 @@ +#!/usr/bin/env python3 + +# Copyright 2019 +# Author: Fabio Gutmann + +import os +import sys +import csv +from tempfile import gettempdir +from argparse import ArgumentParser, FileType, SUPPRESS +from typing import List +from platform import python_version +from copomus.mutation import Mutation +from copomus.IntaRNA import IntaRNA +from copomus.candidate_selectors import get_selector +from copomus.candidate_filters import get_filter +from copomus.mutation_generators import get_generator + +__version__ = 'v1.0' +__intarna_min__ = '3.1.2' +__python_min__ = '3.7.0' + + +class CopomuS: + def __init__(self): + self.query = '' + self.target = '' + self.qidxpos0 = 1 + self.tidxpos0 = 1 + self.param_file = '' + self.measures = [] + self.candidate_selection = '' + self.candidate_filters = [] + self.output = '' + self.delimiter = '' + self.mutation_generator = '' + self.mutation_encoding = '' + self.alpha, self.beta = 0.0, 0.0 + self.threads = 0 + + def main(self): + # PARSE COMMAND LINE ARGS + self.parse_cmd_args() + + # CHECK INTARNA BIN AND VERSION + IntaRNA.check_binary(__intarna_min__) + + # SELECT CANDIDATES + bp_func = get_selector(self.candidate_selection) if not self.mutation_encoding else get_selector('mutEnc') + bp = bp_func(self.query, self.target, self.qidxpos0, self.tidxpos0, self.param_file, self.threads, + enc=self.mutation_encoding) + + # FILTER OUT CANDIDATES + if not self.mutation_encoding: + for f in self.candidate_filters: + func = get_filter(f) + bp = func(bp, self.query, self.target, self.qidxpos0, self.tidxpos0) + + if not bp: + sys.stderr.write('There are no base pair candidates left after filtering!\n') + sys.exit(1) + + # CREATE MUTATIONS + if not self.mutation_encoding: + gen_func = get_generator(self.mutation_generator) + mutations = gen_func(self.query, self.target, self.qidxpos0, self.tidxpos0, bp, self.measures, + self.alpha, self.beta) + else: + gen_func = get_generator('specific') + mutations = gen_func(self.query, self.target, self.qidxpos0, self.tidxpos0, self.measures, + self.alpha, self.beta, self.mutation_encoding) + + # RUN MEASUREMENTS + self.run_tests(mutations) + + # SORT MUTATIONS + mutations.sort() + + # CALCULATE RANKS + self.calculate_ranks(mutations) + + # OUTPUT + self.output_csv(mutations) + + def parse_cmd_args(self): + defaults = { + 'measures': ['mfeCover', 'E', 'minDeltaE'], + 'selector': 'mfe', + 'filters': [], + 'generator': 'flip', + 'alpha': 1.0, + 'beta': 1.0, + 'threads': 0 + } + + parser = ArgumentParser(description='Checks different measures for rating mutations') + parser.add_argument('-q', '--query', dest='query', required=True, help='The query sequence.') + parser.add_argument('-t', '--target', dest='target', required=True, help='The target sequence.') + parser.add_argument('--qIdxPos0', dest='qidxpos0', default=1, + type=int, help='The starting index for the query. (Default: 1)') + parser.add_argument('--tIdxPos0', dest='tidxpos0', default=1, type=int, + help='The starting index for the target. (Default: 1)') + parser.add_argument('-m', '--measure', dest='measures', action='append', default=[], + help='Which measure to add to the output, can be used multiple times. ' + 'Output will be sorted in order of measures specified. ' + f"(Default: {defaults['measures']})", + choices=['E', 'minDeltaE', 'mfeCover', 'EDqi', 'cEDqi', 'Eqi', 'Eti']) + parser.add_argument('-s', '--candidateSelection', dest='candidate_selection', default=defaults['selector'], + help='Defines the method used to select candidate base pairs. ' + f"(Default: {defaults['selector']})", + choices=['mfe', 'mfeSO']) + parser.add_argument('-f', '--candidateFilter', dest='candidate_filters', action='append', + default=[], + help='Filters candidate base pairs, can be used multiple times. ' + f"(Default: {defaults['filters']})", + choices=['GU', 'AU', 'CG', 'lp', 'lpMfe', 'he', 'heMfe']) + parser.add_argument('-g', '--generator', dest='mutation_generator', default=defaults['generator'], + choices=['flip', 'any'], help='Defines the method used for generating mutated sequences. ' + f"(Default: {defaults['generator']})") + parser.add_argument('--mutationEncoding', dest='mutation_encoding', + help='Allows direct candidate selection by specifying a mutation encoding. ' + 'Overwrites options -s, -f, and -g') + parser.add_argument('-o', '--output', dest='output', nargs='?', type=FileType('w'), default=sys.stdout, + help='Which file the output should be written to. (Default: STDOUT)') + parser.add_argument('-d', '--delimiter', dest='delimiter', default='\t', + help='Defines the delimiter used to separate columns in the output, default tab. ' + '(Default: \\t)') + parser.add_argument('-p', '--parameterFile', dest='param_file', default='', + help='Optional parameter file for IntaRNA to provide further parameters and ' + 'prediction constraints.') + parser.add_argument('--threads', dest='threads', default=defaults['threads'], type=int, + help='Threads used for IntaRNA call') + parser.add_argument('-v', '--version', action='version', version=f'CopomuS {__version__}') + parser.add_argument('--alpha', dest='alpha', type=float, default=defaults['alpha'], help=SUPPRESS) + parser.add_argument('--beta', dest='beta', type=float, default=defaults['beta'], help=SUPPRESS) + + args = parser.parse_args() + + mm = [] # filter out any duplicate entries for measures + args.measures = defaults['measures'] if not args.measures else args.measures + for m in args.measures: + if m not in mm: + mm.append(m) + args.measures = mm + + ff = [] # filter out any duplicate entries for filters + args.candidate_filters = defaults['filters'] if not args.candidate_filters else args.candidate_filters + for f in args.candidate_filters: + if f not in ff: + ff.append(f) + args.candidate_filters = ff + + args.query = args.query.upper().replace('T', 'U') # capitalize sequences and replace T -> U + args.target = args.target.upper().replace('T', 'U') + + if args.param_file and not os.path.exists(args.param_file): + sys.stderr.write(f'Cannot find provided parameter file: {args.param_file}\n') + sys.exit(1) + + for key, value in args.__dict__.items(): + self.__setattr__(key, value) + + def run_tests(self, mutations: List[Mutation]): + for mut in mutations: + outcsvcols = list(set([j for i in [mm.outcsvcols for mm in mut.measures] for j in i])) # combine lists + outcsvcols = ','.join(outcsvcols) # join columns + outcsvcols = 'id1' if not outcsvcols else outcsvcols # if no col -> select id1 + + out_pipes = [j for i in [m.out_pipes for m in mut.measures] for j in i] # combine lists + + # create args for IntaRNA call + out_param = [f'--out={x}:{os.path.join(gettempdir(), f"CopomuS_{x}")}_{os.getpid()}.temp' for x in out_pipes] + + for mode in ['ww', 'wm', 'mw', 'mm']: # go through each mutation mode + q = mut.qw if mode[0] == 'w' else mut.qm # get actual sequences + t = mut.tw if mode[1] == 'w' else mut.tm + + i = IntaRNA() # call IntaRNA, we have 4 calls per base pair, one for each mutation mode + data = i.run(q, t, mut.qidxpos0, mut.tidxpos0, outcsvcols, self.threads, 1, self.param_file, out_param) + + for m in mut.measures: # add results to each measure + m.calculate_score(data, mode, mut, self.param_file) + + for m in mut.measures: + m.post_compute() + + def calculate_ranks(self, mutations: List[Mutation]): + # Calculate total ranking order + rank = 1 + mutations[0].ranks['total'] = rank + for i in range(1, len(mutations)): + if mutations[i - 1] < mutations[i]: + rank += 1 + mutations[i].ranks['total'] = rank + + # Calculate ranking for each measurement + for i in range(len(self.measures)): + rank = 1 + m_id = self.measures[i] + + measures = [(x.measures[i], j) for j, x in enumerate(mutations)] + measures.sort(key=lambda x: x[0]) + + mutations[measures[0][1]].ranks[m_id] = rank + for j in range(1, len(measures)): + measure, mut_id = measures[j] + if measures[j - 1][0] < measures[j][0]: + rank += 1 + mutations[mut_id].ranks[m_id] = rank + + def output_csv(self, mutations: List[Mutation]): + # CREATE CSV WRITER + writer = csv.writer(self.output, delimiter=self.delimiter) + + # WRITE CSV + headers = [j for i in + [[f'{m.id}_rank'] + [f'{m.id}_{k}' for k in m.score.keys()] for m in mutations[0].measures] + for j in i] + writer.writerow(['mutation', 'rank', 'qIndex', 'tIndex', 'bpWildtype', 'bpMutated'] + headers) + + for m in mutations: + values = [] + for x in m.measures: + values.append(m.ranks[x.id]) + scores = list(x.score.values()) + for i in range(len(scores)): # replace None with 'NA' + if scores[i] is None: + scores[i] = 'NA' + values += scores + # values = [j for i in [list(x.score.values()) for x in r.measures] for j in i] + writer.writerow([str(m), m.ranks['total'], m.query_n, m.target_n, m.bp_ww, m.bp_mm] + values) # write csv + + +if __name__ == '__main__': + if python_version() < __python_min__: + sys.stderr.write(f'This script requires python >= {__python_min__}\n') + sys.exit(1) + c = CopomuS() + c.main() diff --git a/python/Makefile.am b/python/Makefile.am index a9ce82fa..2664f026 100644 --- a/python/Makefile.am +++ b/python/Makefile.am @@ -1,12 +1,13 @@ # sub directories to check for Makefiles -SUBDIRS = intarnapvalue bin - -noinst_PYTHON = setup.py +SUBDIRS = copomus intarnapvalue bin # install python modules if HAVE_PYTHON + +noinst_PYTHON = setup.py + if INSTALL_INTARNAPVALUE # ensure pip is installed @@ -18,4 +19,18 @@ install-exec-hook: $(PYTHON) setup.py install --prefix=${prefix} endif + +# copomus installation +if INSTALL_INTARNAPVALUE + +python_PYTHON = \ + CopomuS.py + +endif + +else + +# ensure files are part of distribution if python not available +EXTRA_DIST = *.py + endif diff --git a/python/README.md b/python/README.md index 7c67eb11..d74a0b2f 100644 --- a/python/README.md +++ b/python/README.md @@ -1,86 +1,14 @@ -# IntaRNApvalue -A tool for calculating p-values of IntaRNA energy scores. - -### How it works: -This tool shuffles one or more of the given sequences (depending on shuffle mode) while preserving mono- and dinucleotide frequences. -Thus the resulting sequences are very similar with similar properties to the original one. -It then calculates the IntaRNA interaction scores for these newly generated sequences and uses them to approximate the p-value of the original target/query combination. -This p-value is a score for the likelihood that a sequence with a better interaction score than the original ones can randomly occur. -It can thus be used as a measurement for how good an interaction between two sequences is. - -## Dependencies: -- IntaRNA -##### And if you want to run it with python or compile it yourself: -- Python 3 (>= 3.6) -- numpy -- scipy -## Installation: -You can either run this tool directly with python, install it with setuptools or run it as a compiled binary. -##### Run with python directly: -You don't have to install it, if you just plan on running it from a certain directory. -Simply copy the "intarnapvalue" folder where you want to run it from. -You have to get the dependencies yourself in this case. +# auxiliary tools -To use the tool from anywhere however, you need to install it as a python module. -All dependencies (except IntaRNA) will be installed automatically. -This way it can also be installed in a virtual environment. -```console -python setup.py install -``` +## [IntaRNApvalue](intarnapvalue) -##### Run as binary: -Go into the bin directory. You can find pre-compiled builds for linux and windows for x64 directly. -You don't need to install any dependencies for running these, on Windows however you will need Microsoft Visual C++ Redistributable. -If you want to compile it yourself, you need to get the dependencies and PyInstaller. -I had problems with numpy 1.17.0, so I had to use 1.16.4. -Simply run the build.py with the python binary that has the dependencies and PyInstaller installed. -You will find your binary in the build folder. - -## Usage: -Run it as a module without installing (you need to be in the parent dir of intarnapvalue): -```console -python3 -m intarnapvalue -``` - -Run it as a module from anywhere with python (need to run setup.py first, see [Installation](#installation)): -```console -python3 -m intarnapvalue -``` -Or use as a compiled binary: -```console -./IntaRNApvalue -``` +A tool for calculating p-values of IntaRNA energy scores. -Or import it and use it from within other python code: -```python -from intarnapvalue.intarna_pvalue import IntaRNApvalue -IntaRNApvalue(['--flag1', 'arg1']) -``` +## [CopomuS](copomus) -## Arguments: +The Compensatory Point Mutation Selector helps to identify the most promising +base pairs from an IntaRNA mfe prediction to verify the interaction via +compensatory mutation experiments. -| Flag | Value | Default | Description | -| ------------------ |:---------------------- | :------ | -------------------- | -| -h, --help | | | Gives detailed command help. | -| -q, --query | Sequence or FASTA file | | Query as a raw sequence or a file in FASTA format. Takes the first sequence from the file. | -| -t, --target | Sequence or FASTA file | | Target as a raw sequence or a file in FASTA format. Takes the first sequence from the file. | -| -c, --cardinality | Integer | | How many sequence pairs are randomly permuted and considered for p-value calculation. | -| -m, --shuffle-mode | {q, t, b} | | Which sequence will be shuffled: Query, Target or both. | -| -d, --distribution | {gev, gumbel, gauss} | gev | (optional) The distribution used for p-value calculation: Generalized Extreme Value Distribution, Gumbel Distribution or Gauss. | -| -o, --output | {pvalue, scores} | pvalue | (optional) If set to p-value, will only output p-value. If set to scores, will output every score from randomly generated sequences, but no p-value. | -| --threads | 0 - {max threads} | 0 | (optional) How many threads IntaRNA uses for score calculation. If set to 0 it will use all available threads. | -| --randSeed | Integer | None | (optional) The seed used for generating random sequences. | -| -p, --parameterFile | file name | None | (optional) parameter file to be used in IntaRNA calls to further guide predictions. | -## Example -An example use-case could look like this with sequences in raw format: -```console -$ python3 -m intarnapvalue --query GCUGAAAAACAUAACCCAUAAAAUGCUAGCUGUACCAGGAACCA --target GGUUUCUUCGCCUCUGCGUUCACCAAAGUGUUCACCC --scores 10000 --shuffle-mode b --threads 0 -0.2421277140114205 -``` -Or you could specify any of the sequences as files: -```console -$ python3 -m intarnapvalue --query query.fasta --target target.fasta --scores 10000 --shuffle-mode b --threads 0 -0.2421277140114205 -``` \ No newline at end of file diff --git a/python/bin/Makefile.am b/python/bin/Makefile.am index e2a46584..afab0a4c 100644 --- a/python/bin/Makefile.am +++ b/python/bin/Makefile.am @@ -1,2 +1,12 @@ + +# install python modules +if HAVE_PYTHON + noinst_PYTHON = build.py + +else + +EXTRA_DIST = build.py + +endif diff --git a/python/copomus/CopomuS-logo.jpg b/python/copomus/CopomuS-logo.jpg new file mode 100644 index 0000000000000000000000000000000000000000..383ab203becbe379d4be74e3e35d2ae65ec1c3f4 GIT binary patch literal 8353 zcmb7ocU)7?()US7AoSjQiG&ga3yg^ES}Hm^S}GbE z24*-b10yFB4GkMV8|Mvf9$p@LR-_;jAqYqCAg)0`czAdOFak;f0!jn}4Flr;yIgex zETA zWB>q~hFK?*0Tfa(0XWO%vf=>9QXfnFDGPwL5QVqM&xZ)v*yAIGq|%@kH5tK68&It_ zIw3c_WVdyjZ?yv<_cK9Cwl{(v^~I}E$6H$s%ox+7G4wv?8aGv`z7UDdlJryFXNJ^E zaYvW9pBLVuRJm0L@;6EvT*0W?=Nsn5K<%1bZ&117s{68My5}+gLHfj8p?Sv-GQ=>f z@~fNT`HWo0uS-M9C2vP+Olt8}vySCPR8*X}x6+|sry9{&_8ewK3uN2khJd?JTaQ_X z4kX1vKiE9BA^mXA`&m>YrNlO8fLM|t2NqFziVEr5T!o`h`tw$ zPhT$|cv-vwgz&l_1P~UkjQ?jrSNMpR;3RnffJ~E5yrxpGxHFNKbkdF45P7Gcv>E@5 zP=K0l7!L8`V^N&sQeL~r$;cnrg^<6KnZ=~9-b zSl!E(-;cpaB){w9Ql#7ORU~r|U`VgHAJyL%ve8BnXK{XV=V@6a)a~ zv>(DI$=NoPBwl<6kJMr5{}h}7(C(qn{ooA%yTN$L01yrY3Ym z(eKp0Y}|3pYhteqa%k30{o9IiG-t5u%6U7RW&7p2eN8M@vcCetzUt!4ujM=-YB|d+ zNGn~FN{CZcvw5+*v79L1dQCV`7hJ>By)}P_$vDj+-l$$THxhlAW^DZPBU7UycNdp; z?bs~$wj%T7%|gS*Vk|&)Z0${Xwa1zgP3Vu(iJ$s^2FVN%>7JN$O%7HTjk}?Ih=CKK; z>ZT^s{)Q_6v+d|}%9*kUuNk>xpZM6|yR+!6qR4zPdKM|~71LtV?Tb%b)#x9M|QFA?&pL?2iF}=>48_Zi!uxxr=qef zla0H7>Q{J(;yZ4i{RBnq2$PhtWkK#GM!bPJc%wrzdf%$PvDEi<9ZIrY0TL)lcVV=pM0ZNRbD0ahNMEduTh9PXb6WE%4migk7vznS} zz3-~Ito9x!#rxB8T1JV0aHZCVNZqB~C-s@&?%mjjj6y5)AnE6;yw&W4%QsUu0Yvvr z1Zi-a#B9PhaoJT6&2}~D@=p0<`k-ghTmJ0OB=Zm7Bz)t-nLTHr7Yc6=S0yC%KCiua zm|UK$>!DnQ)+L7a zVd<4B62l23fp=j#hsHeqpum4|26-`SeC(F#l{`$ul@uC`_U=vT*N@Z@o1{Za75+cV zD?1`tnBjtK@8MEa#<+32J4~3^0343fgMd|o1O3Ln`G?Mha*R#fm})0FmnpWb0w^Zp z9}p~(YQHfHcH61`ks)_0+~?4dYwM#Iwpk_~h}ITX&voId`)Yb$Ysq*`t~BHq3XP5o z6VAZbm6z1djoFSnid%yi+PuLRd9zS%#T1n)w@g|~Ja5gcDQ`;86!n4-vDZ7ww1o8F z(20FsQlT;-S8q?`zB}Ekv;vJi%+Rk1B`z>qUC!g2hcYt^&SE*AoWt~lX4V1It z`g>GG6bwCJicA%NSCJ$s%(;V?#_#tdPgk2xJrIh>NgxzI=nD97R(3JFEPr<4ABer{ZjfW|IvfZHg7fdI5bHp3AW(7& zO6(m+CM4xXrDgv2p2NCaNzf_VZnLsuqu<+8&6K0HO`%TJXFig|J?s_=pGgfF=ZU!o zce=Eiv#$Uid5x1>w^8$=3|~N@1mBf-Y$v~3R(c;VkR`2hl^4CcBWG>i#o~QRw;aXf zp}P{DWjsp9x$;2W{u{-e!i7;GmwesFhUMu-OUo!xMaKKRLf4&y5Jpe)urJ!VE7m_BU5|(oN*9^edGdO2gLB0(0pZv`g zwPL+qUGdFsv}9IwDq!Z3WK?-2xh=iMGe=?;q=8lYOi>Y`-Jbr!=KQ^&jhbm@Tt$Bk z8Pl1AO(+3U%p zd)pfJ!kvYvX!C={2<>ujrX5Lb_eCMoMc<;E=I_NI*?q|&BsOCU9_m(Tg|U+T4(y~P zu3Zoq1cO0wuwLl;)m*<8a*AXKCA-iqlm~`GSX#?HIDeVUA|$Dnic{ON3rfuJ9_a}HqKRZ)p%NT<_`(VQ@Hl`B^<s&7Ir>J@>B)<`K5_GC)o+aZ z<(|KY>7zLy5yY~?u}DlKaRDOpj=MvmIxK-c+tEWkcer zOmIqwx(@>iId`7+<5GFh*zqhTa@2-NNg`3aVG~d0OYWe{hqbJgq9ue#e%0NOfAyc| z5@t?R4oQOF)l>$D|D2U7{gHEdm*&T%(xX*RzOr(}3M)}Txb*j9jXfCpR;5j$uJZ+) zW^MnW;y^`iZ$%zya%k9VM)9=dtt~#+P2OV@&&S-yP}0mH(Nz9q2eC@vxAMZNa*G2; z`Ih!BHQ}P&Obg|a&WBTFC-+hPHqD0D?H#<=%Ck?tI;tR=E|ar`eFd6L!&7{(0Gm{I z(g>~Yyn}OYd02TF&1lQG{|=~dN?LmtA+9{+pQ?Zx!ho8da9W5n6a`IJHU3Gx0szdx z12iL}W1J!yOQ&fw7}xO!1)z5rhV7c1d$afZ<|njW4DV264euB1^4)=zSK%4WH8VJt z*a4XY3>76^{-itZ3Oo-rGF^Mq9r_hWr`oTqS?hMI4be z@>p7c#;B-ugnKkZ8Y463dq(ZjSnPi#{cuB;$jfnfGGf1TlCf0Q&ykrzZ{tasQ>EAH z?jGK4G<;2nZf&J3vRVbEjK;jInVY=%I4zjJ&4*y6oh{Zg$HwP5JZlbgD0)Vu zT?!u#358y&Ybl>_AT6q5O1isoau%Cv4vY_2^@V(sb+$ybCw{ogBRLk*X z4&ZUkJ@Y-bbV_K=d`fj|XqaWMsDrMa`r<~+k^dDi`%H!TS8Hau6ea~>lT2P1xFz1l zd!|;fUVH@zSvZp_R5q{PtJ`#m+S=6DINiQv!B5^nQIF?~X%FOYfuR~kx z8uPB6$|4stN1~P30b0Y#rhIaTMA?jLe1Uae6eRzR_`=aDFbXmK$oR8zJnK}%qP_9d zI&tgb(Dk5eEzMd9b1ns}{~j_Vn^iZZ_V`%Ct+eqruI`&r7h~W1UCwJ~$o5>D=9C`= z$fU8B!-|qY>Fx(>!Zc}_4tfzGk}ovhJu%sbJ) zN%(DB;NfC-ljea84?UeT{&*Ibvpsxz5=&5}XpCz<#+qY&4i`%j38OUOvmq79NklPB z$#|*83r{RI$djT@hy{G7Kr@B6?a#E<)G3KC%sPAzTN2|LE1O#NuTPJ4!n^N|7^82!8H^$* zO>T*~p_bW-#^Gqtg|aC)->S9B4T{CIt`L6sMVDs+ZA|L1)izT8a#TbeNp-u_XFA`p zl}&rHW2g5Odu#lp!!t@1R+D_p3Q!+*nJu{j!m<7jK?YpMUmz$p3Gk0!2a&Vy3Q22OOze@l2Pa`_ zyO!Cc{vCixvcDft4T}{OFL1Kb);3U7x+^_NlIRMOTGIWNsv4i_6cksO?Z=Ih z%KXdZMx|uU-!4t7T=SwVWM)%!%f?rINaIv(6oVJW!0a!>jL;e0a3ICMU3>p%gG>4; zqRjdpi=x$l&Wn|B`O1@6^rKS}yGgg++BJiH7FIp>fVMsRv>IlWwI9C3KIiRWcH-KX z(H2R_0@!GpczjRNbKVv~{mur%sNPcTP@M7fq(g6}*wuD|=dS6`D?hwBrIk=>*1_HW z;k5H9_(ocO_|I^+k7jF9TMU)wX==*Q^WqnTa!$WK^_zJ_l>anD-S5~uZi%o1Al)G3{~f{nTjq&yl824}WtCymZ6NwEMU7%~a@x(;HYCbR>~liopk7K6pi+ zd8^37t4e9nF;Z(^s>qc4r%3}--)Rr%cA#-mnq1-cpZ(@Czsd~!Xw;zhi)2TJwS+iD zTSALGRJmlH*wjl?)I2z2wr9CuO5IO!`z>Q97S`~r^zB^Q0)f6opNZDwo4fiP8fXG~ z+DfyC>|(AmIcdawed%4`TNKg3<^U$svmmwZE(0MyP%-=yPQ51*NFh*Av2;CH|IPs?yCq<+* z`YpL|XNt;hsE%w;RCR98wF;82LB+xZ6(v@>7+qVGV&wxK#<%`mEEeSJyx<3^RSqj# zda?}E3BN zJ?1whgJu#kGC^CW=_I4~xRSY~_@$lXW;2aXlF?kbl&d$bKT>eR%08qxr@jasY#Oti zT+Fqw&C->Th&ie&GB{;%L>KCJrW@*$?||<%&5?g>`Vf~v?KSfhpFz;Fw0J+wJ4&zriydR z1}X3w<<9GlY>l0mgKgudN^J?M=&pC19}iUdaS!LbG)xgE%ta& z9hTY62%%TRXA-!yB0B91L)v}CB|1zH)Vxej*Jb)%$|qsW6iNHz)oOe@f<#S6M&iJv zunIN$+4%Oy%WqJZz-SuMY|d>wfQN0wA--KacCnTLR`!`Zq3*Ym=JSvcdq!rWWxg1; zf(H+)Eo8#|IEyLAbZsU3qq1P=Xn~uX?QQPVYur2?N}Pf`yl0HV&3DpWTm?Qan!IUm z^Yryz3s#ZK3FxEBb0O{V^Hr24YR7**E5%bi~ti-j|za?kGVyWerLh{JY{=>4$TW!`=TU@z;x=~>`9 zp222lA&K|^SPDWW#RkTQfbqG4@VP>;Rf!F(#C4sKM016p*(d<65D*syjEe$btA?Qf z#kFK5fD8uVmBIzomXf6*K~j)R00lrG6jEG4FaQhyL4;gbaO~K))I17&fC7zeL+~}Z zLNwMW&|Clvk|@oF1q{OCgSe7GTp=(Z5rB{>{cU3f(2#5G*HDlkGz5Fv{#GDFi3{tP z;N;jq41nVN&v=>qI?mLRvPdG+benL;)cz-2COc*af0|2Wf7Xg)a1=E}b9DC}7-1is zu{Er;V<~4T&T>guqO9C-_Y(Z|^%9$8>uBkW_k4$DKEc6Qlor(2#`boLlCIcT!9=Nv zi~Xd-NP8xCv@_28JAasjgYuSwwVNbbqP34CTKZi$5RdC@Wix;6nPF|@!rBOeU{jd? zjp@k&c4;j(At`sPi5QEdf0=k450f3US-y}*%UUFqzJ_wTe|?SWmap4BQG=6kSFfgo z&9NDMG7}Nh7ur&P^uT4LQ|V3tM$b0)4Uzg+y)hMzb@x9?11)&vN-tiqqlhTqm-J|a z^*d7vU$#j@NU2TR!|8c!f3>Q9G^CnRo7V0N!o+coF5l+8R5}c)Mx}y7{hG=e$}_X4 zhb7-%CnFGK*u}-BBe2QVf4n{jAirK)3-?_zOcz^l(!^fv@_)TP*_}{ZoEMLvcU#+W zo+?<#hzVpo3OWA(J$XfVAWpk}+nsr9ax*q63hb!YCC=)*)N!lpnWaMWqm#56f_b`b zl?a2qZd2}mkD!04Nm9K|DW#oCY@D6Hn>(^=ma)_!{5zg>4(KZL5y`WKhvfIm%q`^# zjeAW8TJLIPEJ+GoiV|^4qunAlPm9Xl;2k2TR0Ue&^cAY?{$Ql%f`vFWi_&eDcj$aw z*fWdosn`zI6iwNSlKmpxmNMHY)l@-Qwi58+BNK-uElE?x@e4NcM1nq=w+lkN-IPSc zE?P#1XUOwz?ubgF@MQP9%#7+y>;x-~RHb9zofM1o$=WSSnVUMkRN`GcBZXyV`t*{d z-ZR$M^)nPOaO_z34Pc27@1TE#qL#-<2fj*j6Z(>@`-#lx*Y57Sw>U-<<1y_I*%=n- z2^&NrBJ7^p*Bhnck?AKSl-%^V0${#w5jgOuqxAR{BftAvmKNu-XjaZK%e=YIQxh5P=T zV;_mP+te*;HEYV2j8!r5A&(l&pW$|vldRjetGD}GB!QWg75VGmD&*(UYTCXUX>zNh zV#t)M|0SMF7r)@pLRD-ds6=-5@kOWT&kp;l+S4x(r=Mo_JBtw^S+|GSv>svl#0_p& zida#-OY}NIoACvh1GHZ>h%QEktGt{%P4cH0`TcY0<`Sfmx@NiFk_czdq$tGOtw>0F z;RegIuu(i=AA}~L>+a$k?L6$eWo}NKO0D4gb}MyFG$_5wMm9eZ{Wz2h;GwOiI-8v+)fl92Z-!7EQ2a zSG(ty&SYpNtzF&4R}5H=d9uryDyw#+8*RfJZrHp1S+}*7Dd1ZYo&kQbtkvM9%b zvd@qxoQXVOSDpP1ZUxZ|ISBu>OcZ=Q5&9!JJy4K1%B>w)|M9Kk4q<^PL8_a4BOJ~| zkxkA*pd~GSg15*qTTm!9GlSdoMx{0C?vOj;FDzVOTR2BKa2I<~94n^w?)NwX&u{33DSr5+PbRcdvmipr)ubWwK zTXs6kF;f>GH-zP>z=;j5`*t*BYG$1+`hyHCZl>~_BwbjG>uoqPbo>Yo^x%)|PB5G> zlKnh)Ltd{uD6=NEwMX~eER@(rYJ&(*`Q;DD*KI|!i~`9g)V@kL?Xmw5U%C=Iz616f zDZ~Vm8*gpq9~r{zUAX^XVfi)17&@Dl-eB6ETivWi6;ctHu2tc3s&1j)k`^b0I<#jv? zpFGmq=JUhJ4|n(z{t}@>U`iQ{&X9kDXW@0nF&b+LZMW&;W#@6H#NIn9$-194%RH)f z4@kVbzkrD98cs@XJXFi!{ra(I{YS2a7=H%4wBZ?BSR8!LT0x%@&f0gQoMje{tQvQ> zjmDmnFjXU6IWKiv``P#)f}o0mOHL8;DDbDkF$vz^Ok#fNj$2jFX>4@bdJdnr_QVM` zqI9K55U)5xCsvyt)3!29@ec<&!JEuk{^*g|xFXtV4k7}%VIB+$DgI1Nua7@~wZuZV<>`myRv7ymWbcorrBXg zJvHs_N0Ade3rd38E_|OOQ#S+)z3#z+vS?rT;}f$vzp6c?AD?erUgp2K(6g^attnHZ zS6>s0L@$^PpzBr3s2GtLmWE{Z>H3{E2%_dXg-`YK6C|CfV19PWuDV} zq|_7Y>vv4pq;ZkGAgaoHZ&T0Q9Jl%mOhqx`ni%x*V?_8Fz+_K|ldc|`#{cMRnRX|n zN}yqE?>k73q#dQ1OdC)G-LnbN{6KU0=p06JN7&AmVOF)Yvy7nN`K|3NO>oqniCeNH z#q$s{XOb(xs7Uz*uWs&YGrZj1JBWlBpFl8lOXszAku*z?mo~AyP;0}3S5rwGd@!5> zT3O76mA<2DZE(VK)^?YGU#G*~PnxKnnBoTnajd+^REbV~9V|8p72qr5ezM`H|NRP( aJS~pjEMNE|LaA8j)pt2^xY%(u_x}J7YscdN literal 0 HcmV?d00001 diff --git a/python/copomus/IntaRNA.py b/python/copomus/IntaRNA.py new file mode 100644 index 00000000..67efd434 --- /dev/null +++ b/python/copomus/IntaRNA.py @@ -0,0 +1,127 @@ +# Copyright 2019 +# Author: Fabio Gutmann + +import sys +from typing import Dict, Union, List +from subprocess import Popen, PIPE, run + + +class IntaRNA: + """IntaRNA python3 wrapper""" + def __init__(self): + pass + + @staticmethod + def check_binary(required_version: str): + """ + Checks weather or not the IntaRNA binary can be found in PATH and exits with returncode 1 if not found. + :param required_version: The minimum required version for IntaRNA + """ + p = run('IntaRNA --version', shell=True, stdout=PIPE, stderr=PIPE, universal_newlines=True) + if p.returncode: + sys.stderr.write('Please add the IntaRNA binary to your PATH.\n') + sys.exit(1) + if p.stdout.split('\n')[0].split()[1] < required_version: + sys.stderr.write(f'This tool requires IntaRNA version {required_version}, please upgrade\n') + sys.exit(1) + + def run(self, query: str, target: str, qidxpos0: int, tidxpos0: int, outcsvcols: str, threads: int, n: int = 1, + param_file: str = '', extra_params: list = '', raw_stdout: bool = False) -> \ + Union[Dict[str, any], List[Dict[str, any]], str]: + """ + Runs IntaRNA on a pair of sequences and returns a dictionary with the output + :param query: The query sequence + :param target: The target sequence + :param qidxpos0: The starting index for the query sequence + :param tidxpos0: The starting index for the target sequence + :param outcsvcols: The csvcols returned by IntaRNA + :param threads: Thread count used by IntaRNA + :param n: The number of suboptimal interactions returned + :param param_file: Optional path to a file with additional parameters + :param extra_params: Optional list of additional parameters + :param raw_stdout: Weather or not stdout should be returned in raw format, not as a dict + :return: Dictionary with column name as key and payload as value + """ + if not extra_params: + extra_params = [] + param_file = f'--parameterFile={param_file}' if param_file else '' + p = Popen(['IntaRNA', '-q', query, '-t', target, '--outMode=C', f'--outcsvcols={outcsvcols}', + f'--qIdxPos0={qidxpos0}', f'--tIdxPos0={tidxpos0}', f'--outNumber={n}', f'--threads={threads}', param_file] + + extra_params, stdout=PIPE, stderr=PIPE, universal_newlines=True) + + stdout, stderr = p.communicate() + if p.returncode: + sys.stderr.write(f'IntaRNA Error: {stdout} {stderr}\n') + sys.exit(p.returncode) + if not raw_stdout: + if n == 1: + d = self.output_to_dict(stdout) + if type(d) != bool: + return d[0] + else: + return d + return self.output_to_dict(stdout) + else: + return stdout + + @staticmethod + def output_to_dict(data: str) -> Union[List[Dict[str, any]], bool]: + """ + Transforms IntaRNA output into a dictionary + :param data: Raw IntaRNA output + :return: dictionary with colname as key and payload as value (casted into fitting type) + >>> d = IntaRNA.output_to_dict('start1;end1;id1;id2;E\\n34;42;target;query;-4.7\\n') + >>> d + [{'start1': 34, 'end1': 42, 'id1': 'target', 'id2': 'query', 'E': -4.7}] + >>> [type(x) for x in d[0].values()] + [, , , , ] + """ + subopts = [] + + lines = data.rstrip().split('\n') + if len(lines) == 2: + headers, entries = lines + entries = [entries] + elif len(lines) > 2: + headers = lines[0] + entries = lines[1:] + else: + headers = lines[0] + entries = [''] + headers = headers.split(';') + + for values in entries: + d = {} + values = values.split(';') + if len(headers) != len(values): # no interaction between query and target + return False + + for i, h in enumerate(headers): + try: + value = int(values[i]) # try to cast to int + except ValueError: + try: + value = float(values[i]) # try to cast to float + except ValueError: + value = values[i] # leave as string + d[h] = value + subopts.append(d) + + for subopt in subopts: + for k, v in subopt.items(): + if not v: + subopt[k] = None + + return subopts + + @staticmethod + def to_fasta(sequences: Dict[str, str]) -> str: + """ + Transforms sequences into a FASTA string + :param sequences: A dictionary with sequence name as key and raw sequence as tuple + :return: A FASTA string containing all sequences + """ + fasta_str = '' + for name, seq in sequences.items(): + fasta_str += f'>{name}\n{seq}\n' + return fasta_str diff --git a/python/copomus/Makefile.am b/python/copomus/Makefile.am new file mode 100644 index 00000000..c315226e --- /dev/null +++ b/python/copomus/Makefile.am @@ -0,0 +1,30 @@ + +sourcefiles = \ + __init__.py \ + candidate_filters.py \ + candidate_selectors.py \ + indexing.py \ + measures.py \ + mutation.py \ + mutation_generators.py + +# install python modules +if HAVE_PYTHON + +# copomus installation +if INSTALL_INTARNAPVALUE + +copomus_PYTHON = $(sourcefiles) + +copomusdir = $(pythondir)/copomus + +else +noinst_PYTHON = $(sourcefiles) +endif + +else + +# ensure files are part of distribution if python not available +EXTRA_DIST = $(sourcefiles) + +endif diff --git a/python/copomus/README.md b/python/copomus/README.md new file mode 100644 index 00000000..820f8a04 --- /dev/null +++ b/python/copomus/README.md @@ -0,0 +1,98 @@ +# CopomuS - Compensatory point mutation selector for sRNA-RNA interaction verification + +![CopomuS logo](CopomuS-logo.jpg) + +## How it works + +CopomuS is a tool for selecting mutation candidate points for sRNA-RNA interactions. +For this it takes different measures for evaluating and ranking each position. +CopomuS is highly configurable for specific purposes, see below for an explanation of all arguments. + +## Overview +- [Dependencies](#dependencies) +- [Usage](#usage) +- [Arguments](#arguments) + - [Measures](#measures) + - [Candidate Selection](#candidate-selection) + - [Candidate Filters](#candidate-filters) + - [Mutation Generators](#mutation-generators) + +## Dependencies +- [IntaRNA](https://github.com/BackofenLab/IntaRNA) >= 3.1.2 (needs to be in PATH) +- [python](https://www.python.org/) >= 3.7.0 + +## Usage +```console +./CopomoS.py -q -t + +mutation rank qIndex tIndex bpWildtype bpMutated mfeCover_rank mfeCover_ww mfeCover_wm mfeCover_mw mfeCover_mm mfeCover_is_valid E_rank E_ww E_wm E_mw E_mm E_is_valid minDeltaE_rank minDeltaE_wm/ww minDeltaE_mw/ww minDeltaE_wm/mm minDeltaE_mw/mm minDeltaE_ww/mm minDeltaE_min +U4A&A11U 1 4 11 UA AU 1 True False False True 1 1 -9.32 -5.43 -7.25 -8.46 1 3 3.89 2.07 3.03 1.21 -0.86 1.21 +C16G&G1C 2 16 1 CG GC 1 True False True True 1 2 -9.32 -8.11 -5.31 -5.99 0 5 1.21 4.01 -2.12 0.68 -3.33 -2.12 +G15C&C2G 2 15 2 GC CG 1 True True False True 1 2 -9.32 -12.37 -8.12 -12.2 0 5 -3.05 1.2 -0.17 4.08 2.88 -3.05 +... +``` + +## Arguments + +| Flag | Value | Default | Description | +| :------------------ |:---------------------- | :------ | :-------------------- | +| -h, --help | | | Gives detailed command help. | +| -q, --query | sequence | | Query as a raw sequence. | +| -t, --target | sequence | | Target as a raw sequence. | +| -m, --measure | {E, ed1, ed2, maxED, mfeCover, mfeOverlap, spotProb, qAccessProfile, tAccessProfile, qEnergyProfile, tEnergyProfile} | E | The measure used to calculate the rank of a base pair. Can be used multiple times. In that case, base pairs are ranked in order of measurements specified. | +| --qIdxPos0 | integer | 1 | (Optional) starting index for the query. | +| --tIdxPos0 | integer | 1 | (Optional) starting index for the target. | +| -c, --candidateSelection | {mfe, mfeSO} | mfe | (Optional) method by which base pair candidates are selected from the sequences. | +| -f, --candidateFilters | {GU, AU, CG, lp, lpMfe, he, heMfe} | None | (Optional) filter by which base pair candidates are filtered. Can be used multiple times to combine filters. | +| -g, --generator | {flip, any} | flip | (Optional) method used to generate mutated sequences by which base pairs are ranked. | +| --mutationEncoding | string (ex. G1C&U7G) | | (Optional) allows specific selection of candidates by specifying a mutation encoding. If this option is specified, -c, -f and -g will be ignored. | +| -o, --output | file name | STDOUT | (Optional) file to which the output should be written to. | +| -d, --delimiter | character | "\t" (tab) | (Optional) delimiter used to separate csv output. | +| -p, --parameterFile | file name | None | (optional) parameter file to be used in IntaRNA calls to further guide predictions. | +| -t, --threads | integer | 0 (all) | (Optional) thread count used by IntaRNA. | +| -v, --version | | | Prints current version and exits. | + +#### Measures + +| Name | Description | +| :---- | :-------------------------------------------| +| E | (class) Interaction Energy between query and target. | +| minDeltaE | (sort) Energy difference between different combinations of mutation types (ww, wm, mw, mm). | +| mfeCover | (class) Mutation position has to be in MFE. | +| Eqi | (class) Energy profile on query at mutation position. | +| Eti | (class) Energy profile on target at mutation position. | +| EDqi | (sort) Accessibility profile on query at mutation position. | +| cEDqi | (class) Accessibility profile on query at mutation position needs to be constant. | + +The default measure combination is `-m mfeCover -m E -m minDeltaE`. +It is advised to always use a sorting measure at the end, to improve the resolution of the result, +since class measures only split mutations in 3 ranks (valid, invalid, no more interaction). + +#### Candidate Selection +The choice specified by this argument defines the method, which is used to select candidate base pairs from the given sequences. + +| Name | Description | +| :---- | :-------------------------------------------| +| mfe | Selects all base pairs from the MFE region. | +| mfeSO | Selects all base pairs from all suboptimal interactions that are in the MFE region. | + +#### Candidate Filters +The filters filter out base pairs from all candidates that do not comply with the constraints. + +| Name | Description | +| :---- | :-------------------------------------------| +| GU | Filters out any GU or UG base pairs. | +| AU | Filters out any AU or UA base pairs. | +| GC | Filters out any GC or CG base pairs. | +| lp | Filters lonely base pairs that can not stack (both neighbors not in MFE and both neighbors cant pair). | +| lpMfe | Filters lonely base pairs in MFE interaction (both neighbors not in MFE). | +| he | Filters helix ends (One neighbor can pair, the other one can not). | +| heMfe | Filters helix ends (One neighbor in MFE, the other one is not). | + +#### Mutation Generators +The mutation generator chooses the method by which mutated sequences are generated. + +| Name | Description | +| :---- | :-------------------------------------------| +| flip | Flips a base pair to generate the mutated sequences (for example GC --> CG) | +| any | Generates all logical base pair combinations from a given base pair (for example GC --> CG, UG, UA) | \ No newline at end of file diff --git a/python/copomus/__init__.py b/python/copomus/__init__.py new file mode 100644 index 00000000..6f56e119 --- /dev/null +++ b/python/copomus/__init__.py @@ -0,0 +1 @@ +__all__ = ['mutation', 'indexing', 'IntaRNA', 'measures.py'] diff --git a/python/copomus/candidate_filters.py b/python/copomus/candidate_filters.py new file mode 100644 index 00000000..70baabcc --- /dev/null +++ b/python/copomus/candidate_filters.py @@ -0,0 +1,154 @@ +# Copyright 2019 +# Author: Fabio Gutmann + +from typing import Tuple, List +from utils.indexing import idx_to_array_index + + +def _filter_bp_type(bps: List[Tuple[int, int]], q: str, t: str, qidxpos0: int, tidxpos0: int, bp_type='') -> \ + List[Tuple[int, int]]: + """ + Filters out base pairs of a specified type + :param bps: List of tuples in form (q_index, t_index) representing the base pairs + :param q: The query sequence + :param t: The target sequence + :param qidxpos0: Starting index for the query + :param tidxpos0: Starting index for the target + :param bp_type: The type of base pairs to filter out, for example GU, CG, AU, GC, ... + :return: List of tuples in form (q_index, t_index) representing the base pairs + >>> q, t = 'GCUACGAUC', 'UUUGCGAGCAGCUAGG' + >>> bps = [(1,1),(2,2),(6,13),(8,15),(9,16)] + >>> _filter_bp_type(bps, q, t, 1, 1, 'GU') + [(2, 2), (9, 16)] + >>> _filter_bp_type(bps, q, t, 1, 1, 'GC') + [(1, 1), (2, 2), (6, 13), (8, 15)] + """ + new_bps = [] + + for q_index, t_index in bps: + q_array_index = idx_to_array_index(q_index, qidxpos0) + t_array_index = idx_to_array_index(t_index, tidxpos0) + if f'{q[q_array_index]}{t[t_array_index]}' in [bp_type, bp_type[::-1]]: + continue + new_bps.append((q_index, t_index)) + return new_bps + + +def _neighbors_in_mfe(q_index: int, t_index: int, bps: List[Tuple[int, int]]) -> int: + """ + Counts how many neighboring base pairs are in the MFE region + :param q_index: Base pair index on the query + :param t_index: Base pair index on the target + :param bps: List of tuples in form (q_index, t_index) representing the base pairs + :return: Count of neighboring base pairs that are in the MFE region + >>> _neighbors_in_mfe(3, 5, [(1, 7), (6, 9), (2, 6), (4, 4), (7, 4)]) + 2 + >>> _neighbors_in_mfe(3, 5, [(1, 7), (6, 9), (4, 4), (7, 4)]) + 1 + """ + count = 0 + for q_o, t_o in [(+1, -1), (-1, +1)]: # get neighbors by offset + if (q_index+q_o, t_index+t_o) in bps: + count += 1 + return count + + +def _neighbors_can_pair(q_index: int, t_index: int, q: str, t: str, qidxpos0: int, tidxpos0: int) -> int: + """ + Counts how many neighboring base pairs can pair + :param q_index: Base pair index on the query + :param t_index: Base pair index on the target + :param q: The query sequence + :param t: The target sequence + :param qidxpos0: Starting index for the query + :param tidxpos0: Starting index for the target + :return: Count of neighboring base pairs that can pair + >>> _neighbors_can_pair(1, 5, 'GCAUCGAUC', 'CGUACGAUCGAUCC', 1, 1) + 0 + >>> _neighbors_can_pair(3, 3, 'GCAUCGAUC', 'CGUACGAUCGAUCC', 1, 1) + 1 + >>> _neighbors_can_pair(6, 9, 'GCAUCGAUC', 'CGUACGAUCGAUCC', 1, 1) + 2 + """ + count = 0 + q_array_index = idx_to_array_index(q_index, qidxpos0) + t_array_index = idx_to_array_index(t_index, tidxpos0) + pairable_bps = ['GC', 'CG', 'AU', 'UA', 'GU', 'UG'] + for q_o, t_o in [(+1, -1), (-1, +1)]: # get neighbors by offset + if q_array_index+q_o in range(len(q)) and t_array_index+t_o in range(len(t)): + if f'{q[q_array_index+q_o]}{t[t_array_index+t_o]}' in pairable_bps: + count += 1 + return count + + +def filter_gu(bps: List[Tuple[int, int]], q: str, t: str, qidxpos0: int, tidxpos0: int) -> List[Tuple[int, int]]: + """Filters any GU or UG base pair""" + return _filter_bp_type(bps, q, t, qidxpos0, tidxpos0, bp_type='GU') + + +def filter_au(bps: List[Tuple[int, int]], q: str, t: str, qidxpos0: int, tidxpos0: int) -> List[Tuple[int, int]]: + """Filters any AU or UA base pair""" + return _filter_bp_type(bps, q, t, qidxpos0, tidxpos0, bp_type='AU') + + +def filter_gc(bps: List[Tuple[int, int]], q: str, t: str, qidxpos0: int, tidxpos0: int) -> List[Tuple[int, int]]: + """Filters ay GC or CG base pair""" + return _filter_bp_type(bps, q, t, qidxpos0, tidxpos0, bp_type='GC') + + +def filter_lp(bps: List[Tuple[int, int]], q: str, t: str, qidxpos0: int, tidxpos0: int) -> List[Tuple[int, int]]: + """Filters lonely base pairs that can not stack (both neighbors not in MFE and both neighbors cant pair)""" + new_bps = [] + + for q_index, t_index in bps: + if not _neighbors_in_mfe(q_index, t_index, bps) and \ + not _neighbors_can_pair(q_index, t_index, q, t, qidxpos0, tidxpos0): + continue + new_bps.append((q_index, t_index)) + return new_bps + + +def filter_lp_mfe(bps: List[Tuple[int, int]], q: str, t: str, qidxpos0: int, tidxpos0: int) -> List[Tuple[int, int]]: + """Filters lonely base pairs in MFE interaction (both neighbors not in MFE)""" + new_bps = [] + + for q_index, t_index in bps: + if not _neighbors_in_mfe(q_index, t_index, bps): + continue + new_bps.append((q_index, t_index)) + return new_bps + + +def filter_he(bps: List[Tuple[int, int]], q: str, t: str, qidxpos0: int, tidxpos0: int) -> List[Tuple[int, int]]: + """Filters helix ends (One neighbor can pair, the other one can not)""" + new_bps = [] + + for q_index, t_index in bps: + if _neighbors_can_pair(q_index, t_index, q, t, qidxpos0, tidxpos0) == 1: + continue + new_bps.append((q_index, t_index)) + return new_bps + + +def filter_he_mfe(bps: List[Tuple[int, int]], q: str, t: str, qidxpos0: int, tidxpos0: int) -> List[Tuple[int, int]]: + """Filters helix ends (One neighbor in MFE, the other one is not)""" + new_bps = [] + + for q_index, t_index in bps: + if _neighbors_in_mfe(q_index, t_index, bps) == 1: + continue + new_bps.append((q_index, t_index)) + return new_bps + + +def get_filter(string: str) -> any: + candidate_filters = { + 'GU': filter_gu, + 'AU': filter_au, + 'GC': filter_gc, + 'lp': filter_lp, + 'lpMfe': filter_lp_mfe, + 'he': filter_he, + 'heMfe': filter_he_mfe + } + return candidate_filters.get(string) diff --git a/python/copomus/candidate_selectors.py b/python/copomus/candidate_selectors.py new file mode 100644 index 00000000..898a77fc --- /dev/null +++ b/python/copomus/candidate_selectors.py @@ -0,0 +1,103 @@ +# Copyright 2019 +# Author: Fabio Gutmann + +import sys +from typing import List, Tuple +from utils.IntaRNA import IntaRNA +from utils.indexing import idx_to_array_index as idx_aidx + + +def get_mfe_bps(query: str, target: str, qidxpos0: int, tidxpos0: int, param_file: str, threads: int, **kwargs) -> \ + List[Tuple[int, int]]: + """ + Gets a list of all base pairs for a given interaction + :param query: The query sequence + :param target: The target sequence + :param qidxpos0: The counting start index for the query + :param tidxpos0: The counting start index for the target + :param param_file: Parameter file used for IntaRNA call + :param threads: Thread count used by IntaRNA + :return: List of tuples of indexes in form (query_index, target_index) with each tuple representing a base pair + """ + candidates = [] + i = IntaRNA() + data = i.run(query, target, qidxpos0, tidxpos0, 'bpList', threads, param_file=param_file) + if not data or not data['bpList']: + print('No favorable interaction between the specified sequences!') + sys.exit(1) + for t in data['bpList'].strip().split(':'): + t_index, q_index = [int(x) for x in t.strip('(').strip(')').split(',')] + candidates.append((q_index, t_index)) # note we do query first, then target + return candidates + + +def get_mfe_bps_so(query: str, target: str, qidxpos0: int, tidxpos0: int, param_file: str, threads: int, **kwargs) -> \ + List[Tuple[int, int]]: + """ + Gets a list of all suboptimal base pairs for a given interaction + :param query: The query sequence + :param target: The target sequence + :param qidxpos0: The counting start index for the query + :param tidxpos0: The counting start index for the target + :param param_file: Parameter file used for IntaRNA call + :param threads: Thread count used by IntaRNA + :return: List of tuples of indexes in form (query_index, target_index) with each tuple representing a base pair + """ + candidates = [] + i = IntaRNA() + data = i.run(query, target, qidxpos0, tidxpos0, 'start1,end1,start2,end2', threads, param_file=param_file) + if not data: + print('No favorable interaction between the specified sequences!') + sys.exit(1) + data = i.run(query, target, qidxpos0, tidxpos0, 'bpList', threads, 1000, param_file, + ['--qRegion', f"{data['start2']}-{data['end2']}", '--tRegion', f"{data['start1']}-{data['end1']}"]) + for subopt in data: + for t in subopt['bpList'].strip().split(':'): + t_index, q_index = [int(x) for x in t.strip('(').strip(')').split(',')] + if (q_index, t_index) not in candidates: + candidates.append((q_index, t_index)) # note we do query first, then target + return candidates + + +def get_mutation_enc(query: str, target: str, qidxpos0: int, tidxpos0: int, param_file: str, **kwargs) \ + -> List[Tuple[int, int]]: + """ + Only selects the base pairs specified by the mutation encoding + :param query: The query sequence + :param target: The target sequence + :param qidxpos0: The counting start index for the query + :param tidxpos0: The counting start index for the target + :param param_file: Parameter file used for IntaRNA call + :return: List of tuples of indexes in form (query_index, target_index) with each tuple representing a base pair + """ + try: + candidates = [] + enc = kwargs['enc'] + for mut in enc.split(','): # split block mutations into single mutations + q, t = mut.split('&') + q_index = int(q[1:len(q) - 1]) # mutation indices + t_index = int(t[1:len(t) - 1]) + + if query[idx_aidx(q_index, qidxpos0)] != q[0] or target[idx_aidx(t_index, tidxpos0)] != t[0]: + print('The encoding you specified does not match with the sequences you specified!') + sys.exit(1) + candidates.append((q_index, t_index)) + return candidates + + except (IndexError, ValueError, KeyError): + print('Error parsing the encoding you specified!') + sys.exit(1) + + +def get_selector(string: str) -> any: + """ + Gets a selector by its string name + :param string: The name of the selector + :return: A reference to a selector function + """ + selectors = { + 'mfe': get_mfe_bps, + 'mfeSO': get_mfe_bps_so, + 'mutEnc': get_mutation_enc + } + return selectors.get(string) diff --git a/python/copomus/indexing.py b/python/copomus/indexing.py new file mode 100644 index 00000000..6b08be81 --- /dev/null +++ b/python/copomus/indexing.py @@ -0,0 +1,50 @@ +# Copyright 2019 +# Author: Fabio Gutmann + + +def idx_to_array_index(idx: int, idxpos0: int): + """ + Converts a nucleotide index to an 0-included array index + :param idx: The index to convert + :param idxpos0: The start index + :return: The index of the element in the array + >>> idx_to_array_index(10, 5) + 5 + >>> idx_to_array_index(10, -200) + 209 + >>> idx_to_array_index(1, -1) + 1 + """ + return idx - idxpos0 - (1 if (idxpos0 < 0 < idx or idx < 0 < idxpos0) else 0) + + +def array_index_to_idx(i: int, idxpos0: int): + """ + Converts a nucleotide index to an 0-included array index + :param i: The array index + :param idxpos0: The start index + :return: The index of the element in the array + >>> array_index_to_idx(5, 5) + 10 + >>> array_index_to_idx(209, -200) + 10 + >>> array_index_to_idx(1, -1) + 1 + """ + return idxpos0 + i + (1 if (idxpos0 < 0 < i or i < 0 < idxpos0) else 0) + + +def add_sub_idx(i1: int, change: int): + """ + Adds or subtracts an index from another one and skips 0 if encountered. + :param i1: The start index + :param change: The change (+ or -) + :return: The new index + >>> add_sub_idx(-10, 10) + 1 + >>> add_sub_idx(10, -10) + -1 + >>> add_sub_idx(-5, 10) + 6 + """ + return i1 + change + (1 if i1 < 0 < i1+change+1 else -1 if i1+change-1 < 0 < i1 else 0) diff --git a/python/copomus/measures.py b/python/copomus/measures.py new file mode 100644 index 00000000..602a1902 --- /dev/null +++ b/python/copomus/measures.py @@ -0,0 +1,334 @@ +# Copyright 2019 +# Author: Fabio Gutmann + +import os +from tempfile import gettempdir +from utils.mutation import Mutation +from utils.indexing import idx_to_array_index + + +class Measure: + def __init__(self): + self.outcsvcols = [] # list with all needed outcsvcols + self.out_pipes = [] + self.score = {'ww': None, 'wm': None, 'mw': None, 'mm': None} # measurement scores + + def calculate_score(self, data: dict, mut_type: str, m: Mutation = None, param_file: str = ''): + if data: + self.score[mut_type] = data[self.outcsvcols[0]] + + def post_compute(self): + pass + + def read_temp_file(self): + data = [] + for pipe in self.out_pipes: + file_name = f'CopomuS_{pipe}_{os.getpid()}.temp' + with open(os.path.join(gettempdir(), file_name), 'r') as f: + data.append(f.read()) + os.remove(os.path.join(gettempdir(), file_name)) + return data + + # noinspection PyTypeChecker + def is_valid(self, alpha: float, beta: float = None) -> bool: + beta = alpha if not beta else beta + + res = (self.score['ww'] < 0 and self.score['mm'] < 0 and + (self.score['mw'] >= 0 or + ((self.score['ww'] + alpha < self.score['mw']) and (self.score['mm'] + beta < self.score['mw']))) + and + (self.score['wm'] >= 0 or + ((self.score['ww'] + alpha < self.score['wm']) and (self.score['mm'] + beta < self.score['wm']))) + ) + return res + + def non_i(self) -> bool: + """Checks if the Measure as values from non-interactions""" + return None in self.score.values() + + def __lt__(self, other): + return False # overwrite for custom comparisons + + def __eq__(self, other): + return self.score == other.score + + +class Energy(Measure): + def __init__(self, alpha, beta): + super().__init__() + self.outcsvcols = ['E'] + self.id = 'E' + self.alpha = alpha + self.beta = beta + + def post_compute(self): + if self.non_i(): + # noinspection PyTypeChecker + self.score['is_valid'] = 0 + return + # noinspection PyTypeChecker + self.score['is_valid'] = int(self.is_valid(self.alpha, self.beta)) + + def __lt__(self, other): + if self.non_i(): + return False + elif other.non_i(): + return True + return self.is_valid(self.alpha, self.beta) and not other.is_valid(self.alpha, self.beta) + + def __eq__(self, other): + if self.non_i() and not other.non_i() or not self.non_i() and other.non_i(): + return False + if self.non_i() and other.non_i(): + return True + return self.is_valid(self.alpha, self.beta) == other.is_valid(self.alpha, self.beta) + + +class MinDeltaEnergy(Measure): + def __init__(self): + super().__init__() + self.outcsvcols = ['E'] + self.id = 'minDeltaE' + + def post_compute(self): + score = { + 'wm/ww': round(self.score['wm'] - self.score['ww'], 2) if self.score['wm'] and self.score['ww'] else '', + 'mw/ww': round(self.score['mw'] - self.score['ww'], 2) if self.score['mw'] and self.score['ww'] else '', + 'wm/mm': round(self.score['wm'] - self.score['mm'], 2) if self.score['wm'] and self.score['mm'] else '', + 'mw/mm': round(self.score['mw'] - self.score['mm'], 2) if self.score['mw'] and self.score['mm'] else '', + 'ww/mm': round(self.score['ww'] - self.score['mm'], 2) if self.score['ww'] and self.score['mm'] else '' + } + self.score = score + entries = [v for k, v in self.score.items() if v != '' and k != 'ww/mm'] + self.score['min'] = min(entries) if entries else None + + def __lt__(self, other): + if self.non_i(): + return False + elif other.non_i(): + return True + return self.score['min'] > 0 > other.score['min'] or 0 < other.score['min'] < self.score['min'] + + def __eq__(self, other): + if self.non_i() and not other.non_i() or not self.non_i() and other.non_i(): + return False + if self.non_i() and other.non_i(): + return True + return self.score['min'] == other.score['min'] + + +class MFECover(Measure): + def __init__(self): + super().__init__() + self.outcsvcols = ['start1', 'end1', 'start2', 'end2'] + self.id = 'mfeCover' + + def post_compute(self): + if self.non_i(): + # noinspection PyTypeChecker + self.score['is_valid'] = 0 + return + # noinspection PyTypeChecker + self.score['is_valid'] = int(self.in_mfe()) + + def in_mfe(self): + return self.score['ww'] and self.score['mm'] + + def calculate_score(self, data: dict, mut_type: str, m: Mutation = None, param_file: str = ''): + if not data: + return + mfe_cover = (m.target_n in range(data['start1'], data['end1']+1)) and \ + (m.query_n in range(data['start2'], data['end2']+1)) + # noinspection PyTypeChecker + self.score[mut_type] = mfe_cover + + def __lt__(self, other): + if self.non_i(): + return False + elif other.non_i(): + return True + return self.in_mfe() and not other.in_mfe() + + def __eq__(self, other): + if self.non_i() and not other.non_i() or not self.non_i() and other.non_i(): + return False + if self.non_i() and other.non_i(): + return True + return self.in_mfe() == other.in_mfe() + + +class EnergyProfileQuery(Measure): + def __init__(self, alpha, beta): + super().__init__() + self.id = 'Eqi' + self.out_pipes = ['qMinE'] + self.alpha = alpha + self.beta = beta + + def post_compute(self): + if self.non_i(): + # noinspection PyTypeChecker + self.score['is_valid'] = 0 + return + # noinspection PyTypeChecker + self.score['is_valid'] = int(self.is_valid(self.alpha, self.beta)) + + def calculate_score(self, data: dict, mut_type: str, m: Mutation = None, param_file: str = ''): + data = self.read_temp_file()[0] + data = data.strip().split('\n') + value = data[idx_to_array_index(m.query_n, m.qidxpos0) + 1].split(';')[2] + if value == 'NA': + self.score[mut_type] = None + else: + # noinspection PyTypeChecker + self.score[mut_type] = float(value) + + def __lt__(self, other): + if self.non_i(): + return False + elif other.non_i(): + return True + return self.is_valid(self.alpha, self.beta) and not other.is_valid(self.alpha, self.beta) + + def __eq__(self, other): + if self.non_i() and not other.non_i() or not self.non_i() and other.non_i(): + return False + if self.non_i() and other.non_i(): + return True + return self.is_valid(self.alpha, self.beta) == other.is_valid(self.alpha, self.beta) + + +class EnergyProfileTarget(Measure): + def __init__(self, alpha, beta): + super().__init__() + self.id = 'Eti' + self.out_pipes = ['tMinE'] + self.alpha = alpha + self.beta = beta + + def post_compute(self): + if self.non_i(): + # noinspection PyTypeChecker + self.score['is_valid'] = 0 + return + # noinspection PyTypeChecker + self.score['is_valid'] = int(self.is_valid(self.alpha, self.beta)) + + # noinspection PyTypeChecker + def calculate_score(self, data: dict, mut_type: str, m: Mutation = None, param_file: str = ''): + data = self.read_temp_file()[0] + data = data.strip().split('\n') + value = data[idx_to_array_index(m.target_n, m.tidxpos0) + 1].split(';')[2] + if value == 'NA': + self.score[mut_type] = None + else: + # noinspection PyTypeChecker + self.score[mut_type] = float(value) + + def __lt__(self, other): + if self.non_i(): + return False + elif other.non_i(): + return True + return self.is_valid(self.alpha, self.beta) and not other.is_valid(self.alpha, self.beta) + + def __eq__(self, other): + if self.non_i() and not other.non_i() or not self.non_i() and other.non_i(): + return False + if self.non_i() and other.non_i(): + return True + return self.is_valid(self.alpha, self.beta) == other.is_valid(self.alpha, self.beta) + + +class AccessProfileQuery(Measure): + def __init__(self): + super().__init__() + self.id = 'EDqi' + self.out_pipes = ['qAcc'] + + def post_compute(self): + if self.non_i(): + # noinspection PyTypeChecker + self.score['is_valid'] = 0 + return + # noinspection PyTypeChecker + self.score['is_valid'] = int(self.is_valid(0)) + + def calculate_score(self, data: dict, mut_type: str, m: Mutation = None, param_file: str = ''): + data = self.read_temp_file()[0] + data = data.strip().split('\n') + # noinspection PyTypeChecker + self.score[mut_type] = float(data[idx_to_array_index(m.query_n, m.qidxpos0) + 2].split('\t')[1]) + + # noinspection PyTypeChecker + def is_valid(self, alpha: float, beta: float = None, gamma: float = 0) -> bool: + return self.score['ww'] < self.score['mm'] + + def __lt__(self, other): + if self.non_i(): + return False + elif other.non_i(): + return True + return (self.is_valid(0) and not other.is_valid(0)) or ( + self.is_valid(0) and other.is_valid(0) and + self.score['mm'] - self.score['ww'] < other.score['mm'] - other.score['ww']) + + def __eq__(self, other): + if self.non_i() and not other.non_i() or not self.non_i() and other.non_i(): + return False + if self.non_i() and other.non_i(): + return True + return self.is_valid(0) == other.is_valid(0) and \ + self.score['mm'] - self.score['ww'] == other.score['mm'] - other.score['ww'] + + +class ConstantAccessProfileQuery(Measure): + def __init__(self): + super().__init__() + self.id = 'cEDqi' + self.out_pipes = ['qAcc'] + + def post_compute(self): + if self.non_i(): + # noinspection PyTypeChecker + self.score['is_valid'] = 0 + return + # noinspection PyTypeChecker + self.score['is_valid'] = int(self.is_valid(1)) + + def calculate_score(self, data: dict, mut_type: str, m: Mutation = None, param_file: str = ''): + data = self.read_temp_file()[0] + data = data.strip().split('\n') + # noinspection PyTypeChecker + self.score[mut_type] = float(data[idx_to_array_index(m.query_n, m.qidxpos0) + 2].split('\t')[1]) + + # noinspection PyTypeChecker + def is_valid(self, alpha: float, beta: float = None, gamma: float = 0) -> bool: + return abs(self.score['ww'] - self.score['mm']) < alpha + + def __lt__(self, other): + if self.non_i(): + return False + elif other.non_i(): + return True + return self.is_valid(1) and not other.is_valid(1) + + def __eq__(self, other): + if self.non_i() and not other.non_i() or not self.non_i() and other.non_i(): + return False + if self.non_i() and other.non_i(): + return True + return self.is_valid(1) == other.is_valid(1) + + +def get_measure(string: str, alpha: float, beta: float) -> Measure: + mm = { + 'E': Energy(alpha, beta), + 'minDeltaE': MinDeltaEnergy(), + 'mfeCover': MFECover(), + 'EDqi': AccessProfileQuery(), + 'cEDqi': ConstantAccessProfileQuery(), + 'Eqi': EnergyProfileQuery(alpha, beta), + 'Eti': EnergyProfileTarget(alpha, beta) + } + return mm.get(string) diff --git a/python/copomus/mutation.py b/python/copomus/mutation.py new file mode 100644 index 00000000..af163340 --- /dev/null +++ b/python/copomus/mutation.py @@ -0,0 +1,35 @@ +# Copyright 2019 +# Author: Fabio Gutmann + +from dataclasses import dataclass, field +from typing import List +from utils.indexing import idx_to_array_index + + +@dataclass() +class Mutation: + measures: List[any] + qw: str # wildtype sequences + tw: str + qidxpos0: int # indexing start of query + tidxpos0: int + query_n: int # index of mutation on query + target_n: int + bp_mm: str # mm base pair + bp_ww: str = '' # ww base pair + qm: str = '' # mutated sequences + tm: str = '' + ranks: dict = field(default_factory=dict) + + def __post_init__(self): + mut_pos_q = idx_to_array_index(self.query_n, self.qidxpos0) + mut_pos_t = idx_to_array_index(self.target_n, self.tidxpos0) + self.bp_ww = f'{self.qw[mut_pos_q]}{self.tw[mut_pos_t]}' + self.qm = self.qw[0:mut_pos_q] + self.bp_mm[0] + self.qw[mut_pos_q+1:] + self.tm = self.tw[0:mut_pos_t] + self.bp_mm[1] + self.tw[mut_pos_t+1:] + + def __lt__(self, other): + return self.measures < other.measures + + def __str__(self): + return f'{self.bp_ww[0]}{self.query_n}{self.bp_mm[0]}&{self.bp_ww[1]}{self.target_n}{self.bp_mm[1]}' diff --git a/python/copomus/mutation_generators.py b/python/copomus/mutation_generators.py new file mode 100644 index 00000000..0825de43 --- /dev/null +++ b/python/copomus/mutation_generators.py @@ -0,0 +1,92 @@ +# Copyright 2019 +# Author: Fabio Gutmann + +from copy import deepcopy +from utils.indexing import idx_to_array_index +from utils.measures import get_measure +from utils.mutation import Mutation +from typing import List, Tuple + + +def gen_flip(query: str, target: str, qidxpos0: int, tidxpos0: int, bp_list: List[Tuple[int, int]], mms: List[str], + alpha: float, beta: float) -> List[Mutation]: + mutations = [] + + for q_index, t_index in bp_list: + mm = [] + for m in mms: + mm.append(get_measure(m, alpha, beta)) + + q_index_a = idx_to_array_index(q_index, qidxpos0) + t_index_a = idx_to_array_index(t_index, tidxpos0) + + bp_mm = f'{target[t_index_a]}{query[q_index_a]}' + + m = Mutation(deepcopy(mm), query, target, qidxpos0, tidxpos0, q_index, t_index, bp_mm) + mutations.append(m) + return mutations + + +def gen_any(query: str, target: str, qidxpos0: int, tidxpos0: int, bp_list: List[Tuple[int, int]], mms: List[str], + alpha: float, beta: float) -> List[Mutation]: + mutations = [] + + possible_mutations = { + 'AU': ['UA', 'UG', 'CG'], + 'UA': ['AU', 'GU', 'GC'], + 'GC': ['UA', 'UG', 'CG'], + 'CG': ['AU', 'GU', 'GC'], + 'GU': ['UA', 'UG', 'CG'], + 'UG': ['AU', 'GU', 'GC'] + } + + for q_index, t_index in bp_list: + mm = [] + for m in mms: + mm.append(get_measure(m, alpha, beta)) + + q_index_a = idx_to_array_index(q_index, qidxpos0) + t_index_a = idx_to_array_index(t_index, tidxpos0) + + pos = possible_mutations[f'{query[q_index_a]}{target[t_index_a]}'] + + for p in pos: + bp_mm = f'{p[0]}{p[1]}' + + m = Mutation(deepcopy(mm), query, target, qidxpos0, tidxpos0, q_index, t_index, bp_mm) + mutations.append(m) + return mutations + + +def gen_specific(query: str, target: str, qidxpos0: int, tidxpos0: int, mms: List[str], alpha: float, beta: float, + encoding: str) -> List[Mutation]: + mutations = [] + + mm = [] + for m in mms: + mm.append(get_measure(m, alpha, beta)) + + for mut in encoding.split(','): # split block mutations into single mutations + q, t = mut.split('&') + q_index = int(q[1:len(q)-1]) # mutation indices + t_index = int(t[1:len(t)-1]) + + bp_mm = f'{q[len(q)-1]}{t[len(t)-1]}' + + m = Mutation(deepcopy(mm), query, target, qidxpos0, tidxpos0, q_index, t_index, bp_mm) + mutations.append(m) + return mutations + + +def get_generator(string: str) -> any: + """ + Gets a mutation generator by its string name + :param string: The string name + :return: A reference to a generator function + """ + generators = { + 'flip': gen_flip, + 'any': gen_any, + 'specific': gen_specific + } + return generators.get(string) diff --git a/python/intarnapvalue/README.md b/python/intarnapvalue/README.md new file mode 100644 index 00000000..7c67eb11 --- /dev/null +++ b/python/intarnapvalue/README.md @@ -0,0 +1,86 @@ +# IntaRNApvalue +A tool for calculating p-values of IntaRNA energy scores. + +### How it works: +This tool shuffles one or more of the given sequences (depending on shuffle mode) while preserving mono- and dinucleotide frequences. +Thus the resulting sequences are very similar with similar properties to the original one. +It then calculates the IntaRNA interaction scores for these newly generated sequences and uses them to approximate the p-value of the original target/query combination. +This p-value is a score for the likelihood that a sequence with a better interaction score than the original ones can randomly occur. +It can thus be used as a measurement for how good an interaction between two sequences is. + +## Dependencies: +- IntaRNA +##### And if you want to run it with python or compile it yourself: +- Python 3 (>= 3.6) +- numpy +- scipy + +## Installation: +You can either run this tool directly with python, install it with setuptools or run it as a compiled binary. +##### Run with python directly: +You don't have to install it, if you just plan on running it from a certain directory. +Simply copy the "intarnapvalue" folder where you want to run it from. +You have to get the dependencies yourself in this case. + +To use the tool from anywhere however, you need to install it as a python module. +All dependencies (except IntaRNA) will be installed automatically. +This way it can also be installed in a virtual environment. +```console +python setup.py install +``` + +##### Run as binary: +Go into the bin directory. You can find pre-compiled builds for linux and windows for x64 directly. +You don't need to install any dependencies for running these, on Windows however you will need Microsoft Visual C++ Redistributable. +If you want to compile it yourself, you need to get the dependencies and PyInstaller. +I had problems with numpy 1.17.0, so I had to use 1.16.4. +Simply run the build.py with the python binary that has the dependencies and PyInstaller installed. +You will find your binary in the build folder. + +## Usage: +Run it as a module without installing (you need to be in the parent dir of intarnapvalue): +```console +python3 -m intarnapvalue +``` + +Run it as a module from anywhere with python (need to run setup.py first, see [Installation](#installation)): +```console +python3 -m intarnapvalue +``` +Or use as a compiled binary: +```console +./IntaRNApvalue +``` + +Or import it and use it from within other python code: +```python +from intarnapvalue.intarna_pvalue import IntaRNApvalue +IntaRNApvalue(['--flag1', 'arg1']) +``` + +## Arguments: + +| Flag | Value | Default | Description | +| ------------------ |:---------------------- | :------ | -------------------- | +| -h, --help | | | Gives detailed command help. | +| -q, --query | Sequence or FASTA file | | Query as a raw sequence or a file in FASTA format. Takes the first sequence from the file. | +| -t, --target | Sequence or FASTA file | | Target as a raw sequence or a file in FASTA format. Takes the first sequence from the file. | +| -c, --cardinality | Integer | | How many sequence pairs are randomly permuted and considered for p-value calculation. | +| -m, --shuffle-mode | {q, t, b} | | Which sequence will be shuffled: Query, Target or both. | +| -d, --distribution | {gev, gumbel, gauss} | gev | (optional) The distribution used for p-value calculation: Generalized Extreme Value Distribution, Gumbel Distribution or Gauss. | +| -o, --output | {pvalue, scores} | pvalue | (optional) If set to p-value, will only output p-value. If set to scores, will output every score from randomly generated sequences, but no p-value. | +| --threads | 0 - {max threads} | 0 | (optional) How many threads IntaRNA uses for score calculation. If set to 0 it will use all available threads. | +| --randSeed | Integer | None | (optional) The seed used for generating random sequences. | +| -p, --parameterFile | file name | None | (optional) parameter file to be used in IntaRNA calls to further guide predictions. | + +## Example +An example use-case could look like this with sequences in raw format: +```console +$ python3 -m intarnapvalue --query GCUGAAAAACAUAACCCAUAAAAUGCUAGCUGUACCAGGAACCA --target GGUUUCUUCGCCUCUGCGUUCACCAAAGUGUUCACCC --scores 10000 --shuffle-mode b --threads 0 +0.2421277140114205 +``` +Or you could specify any of the sequences as files: +```console +$ python3 -m intarnapvalue --query query.fasta --target target.fasta --scores 10000 --shuffle-mode b --threads 0 +0.2421277140114205 +``` \ No newline at end of file From fbafa6fac37bdc9f19d6856176e07b9bf2eb080a Mon Sep 17 00:00:00 2001 From: martin-mann Date: Thu, 6 Feb 2020 18:17:28 +0100 Subject: [PATCH 04/16] copomus disable flag --- configure.ac | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/configure.ac b/configure.ac index 44c1f7db..ae287e77 100644 --- a/configure.ac +++ b/configure.ac @@ -290,10 +290,9 @@ AM_CONDITIONAL([INSTALL_INTARNAPVALUE],[test x"$intarnapvalue" = x"yes"]) AC_MSG_CHECKING([whether to compile and install 'CopomuS' python module]) copomus=yes AC_ARG_ENABLE([CopomuS], - [AS_HELP_STRING([--enable-CopomuS], - [build and install the CopomuS python3 module])], - [AM_COND_IF([HAVE_PYTHON], [copomus="$enableval"], [copomus="no (python not available)"])], - [AM_COND_IF([HAVE_PYTHON], [copomus=yes], [copomus="no (python not available)"])] + [AS_HELP_STRING([--disable-CopomuS], + [build and install the CopomuS python3 module (def=enabled)])], + [AM_COND_IF([HAVE_PYTHON], [copomus="$enableval"], [copomus="no (python not available)"])] ) AC_MSG_RESULT([$copomus]) From dcdc8c9e4a00af35af1fb6726cc161a38b020046 Mon Sep 17 00:00:00 2001 From: martin-mann Date: Fri, 7 Feb 2020 10:45:13 +0100 Subject: [PATCH 05/16] bump version --- configure.ac | 23 +++-------------------- 1 file changed, 3 insertions(+), 20 deletions(-) diff --git a/configure.ac b/configure.ac index ae287e77..4e2ad09f 100644 --- a/configure.ac +++ b/configure.ac @@ -1,14 +1,14 @@ AC_PREREQ([2.65]) # 5 argument version only available with aclocal >= 2.64 -AC_INIT([IntaRNA], [3.1.5], [], [intaRNA], [http://www.bioinf.uni-freiburg.de] ) +AC_INIT([IntaRNA], [3.2.0], [], [intaRNA], [http://www.bioinf.uni-freiburg.de] ) # minimal required version of the boost library BOOST_REQUIRED_VERSION=1.50.0 # minimal required version of the Vienna RNA library VRNA_REQUIRED_VERSION=2.4.14 -# minimal required version of python interpreter -PYTHON_REQUIRED_VERSION=3.7 +# minimal required version of python interpreter (to install intarnapvalue) +PYTHON_REQUIRED_VERSION=3.6 AC_CANONICAL_HOST @@ -283,22 +283,6 @@ AC_MSG_RESULT([$intarnapvalue]) AM_CONDITIONAL([INSTALL_INTARNAPVALUE],[test x"$intarnapvalue" = x"yes"]) -############################# -# COPOMUS SCRIPT -############################# - -AC_MSG_CHECKING([whether to compile and install 'CopomuS' python module]) -copomus=yes -AC_ARG_ENABLE([CopomuS], - [AS_HELP_STRING([--disable-CopomuS], - [build and install the CopomuS python3 module (def=enabled)])], - [AM_COND_IF([HAVE_PYTHON], [copomus="$enableval"], [copomus="no (python not available)"])] - ) -AC_MSG_RESULT([$copomus]) - -AM_CONDITIONAL([INSTALL_COPOMUS],[test x"$copomus" = x"yes"]) - - ############################################################################### @@ -478,7 +462,6 @@ Features * log coloring : $logColoring * OpenMP multi-threading : $multithreadingEnabled * intarnapvalue : $intarnapvalue - * CopomuS : $copomus Install Directories for given 'prefix' -------------------------------------- From 8b33d6ed32934b856b6ab3dd770229cc1375b4ba Mon Sep 17 00:00:00 2001 From: martin-mann Date: Fri, 7 Feb 2020 10:45:43 +0100 Subject: [PATCH 06/16] final copomus install setup --- ChangeLog | 6 ++++++ python/CopomuS.py | 18 ++++++++++-------- python/Makefile.am | 23 ++++++++--------------- python/bin/Makefile.am | 10 ---------- python/copomus/Makefile.am | 22 ++-------------------- python/copomus/README.md | 1 - 6 files changed, 26 insertions(+), 54 deletions(-) diff --git a/ChangeLog b/ChangeLog index 7b2de279..42a64f02 100644 --- a/ChangeLog +++ b/ChangeLog @@ -18,6 +18,12 @@ ################################################################################ +200207 Martin Raden + + python/CopomuS.py : Compensatory mutation selector to support interaction + validation experiments + + python/copomus/* : utility functions of CopomuS + * python/README.md : links to dedicated README.md of subfolders + 200131 Martin Raden * IntaRNA/PredictorMfe2dHeuristicSeedExtension : * fillHybridE_left() : diff --git a/python/CopomuS.py b/python/CopomuS.py index e21efc2f..26c3af4b 100644 --- a/python/CopomuS.py +++ b/python/CopomuS.py @@ -3,22 +3,28 @@ # Copyright 2019 # Author: Fabio Gutmann + +__intarna_min__ = '3.1.2' +__python_min__ = '3.7.0' + import os import sys +from platform import python_version + +if python_version() < __python_min__: + sys.stderr.write(f'CopomuS requires python >= {__python_min__}\n') + sys.exit(1) + import csv from tempfile import gettempdir from argparse import ArgumentParser, FileType, SUPPRESS from typing import List -from platform import python_version from copomus.mutation import Mutation from copomus.IntaRNA import IntaRNA from copomus.candidate_selectors import get_selector from copomus.candidate_filters import get_filter from copomus.mutation_generators import get_generator -__version__ = 'v1.0' -__intarna_min__ = '3.1.2' -__python_min__ = '3.7.0' class CopomuS: @@ -130,7 +136,6 @@ def parse_cmd_args(self): 'prediction constraints.') parser.add_argument('--threads', dest='threads', default=defaults['threads'], type=int, help='Threads used for IntaRNA call') - parser.add_argument('-v', '--version', action='version', version=f'CopomuS {__version__}') parser.add_argument('--alpha', dest='alpha', type=float, default=defaults['alpha'], help=SUPPRESS) parser.add_argument('--beta', dest='beta', type=float, default=defaults['beta'], help=SUPPRESS) @@ -232,8 +237,5 @@ def output_csv(self, mutations: List[Mutation]): if __name__ == '__main__': - if python_version() < __python_min__: - sys.stderr.write(f'This script requires python >= {__python_min__}\n') - sys.exit(1) c = CopomuS() c.main() diff --git a/python/Makefile.am b/python/Makefile.am index 2664f026..f5681a3d 100644 --- a/python/Makefile.am +++ b/python/Makefile.am @@ -3,11 +3,17 @@ SUBDIRS = copomus intarnapvalue bin -# install python modules -if HAVE_PYTHON +# scripts to be installed in $(bindir) directly +dist_bin_SCRIPTS = \ + CopomuS.py + +# auxiliary scripts to be part of distribution noinst_PYTHON = setup.py +# install python modules +if HAVE_PYTHON + if INSTALL_INTARNAPVALUE # ensure pip is installed @@ -20,17 +26,4 @@ install-exec-hook: endif -# copomus installation -if INSTALL_INTARNAPVALUE - -python_PYTHON = \ - CopomuS.py - -endif - -else - -# ensure files are part of distribution if python not available -EXTRA_DIST = *.py - endif diff --git a/python/bin/Makefile.am b/python/bin/Makefile.am index afab0a4c..e2a46584 100644 --- a/python/bin/Makefile.am +++ b/python/bin/Makefile.am @@ -1,12 +1,2 @@ - -# install python modules -if HAVE_PYTHON - noinst_PYTHON = build.py - -else - -EXTRA_DIST = build.py - -endif diff --git a/python/copomus/Makefile.am b/python/copomus/Makefile.am index c315226e..3ab43800 100644 --- a/python/copomus/Makefile.am +++ b/python/copomus/Makefile.am @@ -1,5 +1,5 @@ -sourcefiles = \ +copomus_DATA = \ __init__.py \ candidate_filters.py \ candidate_selectors.py \ @@ -8,23 +8,5 @@ sourcefiles = \ mutation.py \ mutation_generators.py -# install python modules -if HAVE_PYTHON +copomusdir = $(bindir)/copomus -# copomus installation -if INSTALL_INTARNAPVALUE - -copomus_PYTHON = $(sourcefiles) - -copomusdir = $(pythondir)/copomus - -else -noinst_PYTHON = $(sourcefiles) -endif - -else - -# ensure files are part of distribution if python not available -EXTRA_DIST = $(sourcefiles) - -endif diff --git a/python/copomus/README.md b/python/copomus/README.md index 820f8a04..bc237923 100644 --- a/python/copomus/README.md +++ b/python/copomus/README.md @@ -50,7 +50,6 @@ G15C&C2G 2 15 2 GC CG 1 True True False True 1 2 -9.32 -12.37 -8.12 -12.2 0 5 -3 | -d, --delimiter | character | "\t" (tab) | (Optional) delimiter used to separate csv output. | | -p, --parameterFile | file name | None | (optional) parameter file to be used in IntaRNA calls to further guide predictions. | | -t, --threads | integer | 0 (all) | (Optional) thread count used by IntaRNA. | -| -v, --version | | | Prints current version and exits. | #### Measures From e206aaf1901e4bd270486d848c8d414dce9a8c6d Mon Sep 17 00:00:00 2001 From: Martin Raden Date: Fri, 7 Feb 2020 10:47:26 +0100 Subject: [PATCH 07/16] Fabio added --- ChangeLog | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ChangeLog b/ChangeLog index 42a64f02..f6777b35 100644 --- a/ChangeLog +++ b/ChangeLog @@ -18,7 +18,7 @@ ################################################################################ -200207 Martin Raden +200207 Martin Raden + Fabio Gutmann + python/CopomuS.py : Compensatory mutation selector to support interaction validation experiments + python/copomus/* : utility functions of CopomuS From c482679e523fbb877bf7a09be4cc89cfb9e607a4 Mon Sep 17 00:00:00 2001 From: Martin Raden Date: Fri, 14 Feb 2020 10:04:18 +0100 Subject: [PATCH 08/16] link to latest release --- README.md | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/README.md b/README.md index bbdcbeb1..f4643cc8 100644 --- a/README.md +++ b/README.md @@ -193,7 +193,7 @@ not needed to be installed separately): The data provided within the github repository (or within the `Source code` archives provided at the -[IntaRNA release page](https://github.com/BackofenLab/IntaRNA/releases)) +[IntaRNA release page](https://github.com/BackofenLab/IntaRNA/releases/latest)) is no complete distribution and lacks all system specifically generated files. Thus, in order to get started with a fresh clone of the IntaRNA source code repository you have to run the GNU autotools @@ -214,7 +214,7 @@ Afterwards, you can continue as if you would have downloaded an ## IntaRNA package distribution (e.g. `intaRNA-2.0.0.tar.gz`) When downloading an IntaRNA package distribution (e.g. `intaRNA-2.0.0.tar.gz`) from the -[IntaRNA release page](https://github.com/BackofenLab/IntaRNA/releases), you should +[IntaRNA release page](https://github.com/BackofenLab/IntaRNA/releases/latest), you should first ensure, that you have all [dependencies](#deps) installed. If so, you can simply run the following (assuming `bash` shell). ```bash @@ -277,10 +277,10 @@ and follow either [install from github](#instgithub) or ### ... using pre-compiled binaries For some releases, we also provide precompiled binary packages for Microsoft Windows at the -[IntaRNA release page](https://github.com/BackofenLab/IntaRNA/releases) +[IntaRNA release page](https://github.com/BackofenLab/IntaRNA/releases/latest) that enable 'out-of-the-box' usage. If you want to use them: -- [download](https://github.com/BackofenLab/IntaRNA/releases) the according ZIP archive and extract +- [download](https://github.com/BackofenLab/IntaRNA/releases/latest) the according ZIP archive and extract - open a [Windows command prompt](https://www.lifewire.com/how-to-open-command-prompt-2618089) - [run IntaRNA](#usage) @@ -331,7 +331,7 @@ brew install viennarna brew install doxygen ``` -Download and extract the IntaRNA source code package (e.g. `intaRNA-2.0.2.tar.gz`) from the [release page](releases/). +Download and extract the IntaRNA source code package (e.g. `intaRNA-2.0.2.tar.gz`) from the [release page](https://github.com/BackofenLab/IntaRNA/releases/latest). ```[bash] ./configure CC=gcc-6 CXX=g++-6 From b6565e1091515a8fb8e2d29d57737e10ece2de46 Mon Sep 17 00:00:00 2001 From: martin-mann Date: Mon, 17 Feb 2020 14:26:17 +0100 Subject: [PATCH 09/16] - new arguments: - `qId|tId` : optional id (FASTA prefix) setup for sequence naming - `acc|accW|accL` : meta accessibility setup for both query and target - `intLenMax|intLoopMax` : meta interaction and interior loop length setup --- ChangeLog | 40 ++++ src/bin/CommandLineParsing.cpp | 374 ++++++++++++++++++++++----------- src/bin/CommandLineParsing.h | 347 ++++++++---------------------- 3 files changed, 383 insertions(+), 378 deletions(-) diff --git a/ChangeLog b/ChangeLog index f6777b35..290cbcac 100644 --- a/ChangeLog +++ b/ChangeLog @@ -12,12 +12,52 @@ # IntaRNA - improved Zall estimate (and depending values) for `--model=X` (default) +- new arguments: + - `qId|tId` : optional id (FASTA prefix) setup for sequence naming + - `acc|accW|accL` : meta accessibility setup for both query and target + - `intLenMax|intLoopMax` : meta interaction and interior loop length setup + +# CopomuS +- Compensatory mutation selector to support interaction validation experiments + ################################################################################ ################################################################################ ################################################################################ +200217 Martin Raden + * bin/CommandLineParsing : + * NumericParameter : + * CharParameter : + + isSet() : checks if value and default differ + + qId|tId : id (prefix) for sequence naming + + acc|accW|accL : meta for q|t* + + intLenMax|intLoopMax : meta for q|t* + * resetParamDefault() : + + overwrite value (since set to default in constructor) + + validate_region|shape|shapeMethod|shapeConversion : generic checks for q|t + + validate_seedRange : generic checks for q|t + + validate_numberArgumentExcludeRange() : generic check excluding a range + + validate_id() : checks for line breaks in q|tId + * parseSequences() : + * parseSequencesFasta() : + + idPrefix handling + - obsolete functions + - validate_query|target + - validate_qAccW|L|Constr + - validate_q|tRegion|Shape|ShapeMethod|ShapeConversion + - validate_seedQ|TRange + * constructor() : + + extended default reset for new arguments + * q|tAcc* arguments now hidden + * validation calls refactored + + model|acc|intLenMax|intLoopMax now general arguments + * parse() : + + additional checks for meta arguments with setup of q|t variables + + usage of .isSet() where appropriate + + check that outSep is not within id prefix in outMode=C + 200207 Martin Raden + Fabio Gutmann + python/CopomuS.py : Compensatory mutation selector to support interaction validation experiments diff --git a/src/bin/CommandLineParsing.cpp b/src/bin/CommandLineParsing.cpp index 9d56491f..60c103a7 100644 --- a/src/bin/CommandLineParsing.cpp +++ b/src/bin/CommandLineParsing.cpp @@ -142,6 +142,13 @@ CommandLineParsing::CommandLineParsing( const Personality personality ) tShapeMethod("Zb0.89"), tShapeConversion("Os1.6i-2.29"), + // meta constraints + acc("acc","NC", 'C'), + accW("accW", 0, 99999, 150), + accL("accL", 0, 99999, 100), + intLenMax("intLenMax", 0, 99999, 0), + intLoopMax("intLoopMax", 0, 30, 10), + // Helix Constraints helixMinBP("helixMinBP",2,4,2), helixMaxBP("helixMaxBP",2,20,10), @@ -228,6 +235,10 @@ CommandLineParsing::CommandLineParsing( const Personality personality ) resetParamDefault<>(tAccW, 0); resetParamDefault<>(tAccL, 0); resetParamDefault<>(tIntLoopMax, 16); + resetParamDefault<>(accW, 0); + resetParamDefault<>(accL, 0); + resetParamDefault<>(intLenMax, 0); + resetParamDefault<>(intLoopMax, 16); #if INTARNA_MULITHREADING resetParamDefault<>(threads, 1); #endif @@ -259,6 +270,10 @@ CommandLineParsing::CommandLineParsing( const Personality personality ) resetParamDefault<>(tAccW, 150); resetParamDefault<>(tAccL, 100); resetParamDefault<>(tIntLoopMax, 16); + resetParamDefault<>(accW, 150); + resetParamDefault<>(accL, 100); + resetParamDefault<>(intLenMax, 0); + resetParamDefault<>(intLoopMax, 16); resetParamDefault<>(energyFile, std::string(VrnaHandler::Turner04), "energyVRNA"); #if INTARNA_MULITHREADING resetParamDefault<>(threads, 1); @@ -302,6 +317,9 @@ CommandLineParsing::CommandLineParsing( const Personality personality ) resetParamDefault<>(tAccW, 0); resetParamDefault<>(tAccL, 0); resetParamDefault<>(tIntLenMax, 60); + resetParamDefault<>(accW, 0); + resetParamDefault<>(accL, 0); + resetParamDefault<>(intLenMax, 60); break; case IntaRNAseed : // seed-only prediction @@ -316,6 +334,8 @@ CommandLineParsing::CommandLineParsing( const Personality personality ) resetParamDefault<>(tIntLoopMax, 8); resetParamDefault<>(qIntLenMax, 60); resetParamDefault<>(qIntLoopMax, 8); + resetParamDefault<>(intLenMax, 60); + resetParamDefault<>(intLoopMax, 8); resetParamDefault<>(outMinPu, 0.001); // new features added with 3.1.0 resetParamDefault<>(outNoLP, true, "outNoLP"); @@ -346,11 +366,29 @@ CommandLineParsing::CommandLineParsing( const Personality personality ) ("query,q" , value(&queryArg) ->required() - ->notifier(boost::bind(&CommandLineParsing::validate_query,this,_1)) + ->notifier(boost::bind(&CommandLineParsing::validate_sequenceArgument,this,"query",_1)) , "either an RNA sequence or the stream/file name from where to read" " the query sequences (should be the shorter sequences to increase efficiency);" " use 'STDIN' to read from standard input stream; sequences have to use" " IUPAC nucleotide encoding; output alias is [seq2]") + ; + opts_cmdline_short.add(opts_query); + opts_query.add_options() + ("qId" + , value(&qId) + ->default_value("") + ->notifier(boost::bind(&CommandLineParsing::validate_id,this,"qId",_1)) + , std::string("id (FASTA-prefix) to be used for query sequence naming.").c_str()) + (qIdxPos0.name.c_str() + , value(&(qIdxPos0.val)) + ->default_value(qIdxPos0.def) + ->notifier(boost::bind(&CommandLineParsing::validate_numberArgument,this,qIdxPos0,_1)) + , std::string("index of first (5') sequence position of all query sequences" + " (arg in range ["+toString(qIdxPos0.min)+","+toString(qIdxPos0.max)+"];").c_str()) + ("qSet" + , value(&(qSetString)) + ->notifier(boost::bind(&CommandLineParsing::validate_qSet,this,_1)) + , std::string("query subset : List of sequence indices to consider for prediction in the format 'from1-to1,from2-to2,..' assuming indexing starts with 1").c_str()) (qAcc.name.c_str() , value(&(qAcc.val)) ->default_value(qAcc.def) @@ -364,7 +402,7 @@ CommandLineParsing::CommandLineParsing( const Personality personality ) (qAccW.name.c_str() , value(&(qAccW.val)) ->default_value(qAccW.def) - ->notifier(boost::bind(&CommandLineParsing::validate_qAccW,this,_1)) + ->notifier(boost::bind(&CommandLineParsing::validate_numberArgumentExcludeRange,this,qAccW,_1,1,2)) , std::string("accessibility computation : sliding window size for query accessibility computation" " (arg in range ["+toString(qAccW.min)+","+toString(qAccW.max)+"];" " 0 will use to the full sequence length)." @@ -373,25 +411,12 @@ CommandLineParsing::CommandLineParsing( const Personality personality ) (qAccL.name.c_str() , value(&(qAccL.val)) ->default_value(qAccL.def) - ->notifier(boost::bind(&CommandLineParsing::validate_qAccL,this,_1)) + ->notifier(boost::bind(&CommandLineParsing::validate_numberArgumentExcludeRange,this,qAccL,_1,1,2)) , std::string("accessibility computation : maximal loop length (base pair span) for query accessibility computation" " (arg in range ["+toString(qAccL.min)+","+toString(qAccL.max)+"]; 0 will use to sliding window size 'qAccW')").c_str()) - ; - opts_cmdline_short.add(opts_query); - opts_query.add_options() - (qIdxPos0.name.c_str() - , value(&(qIdxPos0.val)) - ->default_value(qIdxPos0.def) - ->notifier(boost::bind(&CommandLineParsing::validate_numberArgument,this,qIdxPos0,_1)) - , std::string("index of first (5') sequence position of all query sequences" - " (arg in range ["+toString(qIdxPos0.min)+","+toString(qIdxPos0.max)+"];").c_str()) - ("qSet" - , value(&(qSetString)) - ->notifier(boost::bind(&CommandLineParsing::validate_qSet,this,_1)) - , std::string("query subset : List of sequence indices to consider for prediction in the format 'from1-to1,from2-to2,..' assuming indexing starts with 1").c_str()) ("qAccConstr" , value(&(qAccConstr)) - ->notifier(boost::bind(&CommandLineParsing::validate_qAccConstr,this,_1)) + ->notifier(boost::bind(&CommandLineParsing::validate_structureConstraintArgument,this,"qAccConstr",_1)) , std::string("accessibility computation : structure constraint :" "\n EITHER a string of query sequence length encoding for each position:" " '.' no constraint," @@ -424,7 +449,7 @@ CommandLineParsing::CommandLineParsing( const Personality personality ) " (arg in range ["+toString(qIntLoopMax.min)+","+toString(qIntLoopMax.max)+"]; 0 enforces stackings only)").c_str()) ("qRegion" , value(&(qRegionString)) - ->notifier(boost::bind(&CommandLineParsing::validate_qRegion,this,_1)) + ->notifier(boost::bind(&CommandLineParsing::validateRegion,this,"qRegion",_1)) , std::string("interaction site : query regions to be considered for" " interaction prediction. Format =" " 'from1-to1,from2-to2,..' where indexing starts with 'qIdxPos0'." @@ -450,8 +475,26 @@ CommandLineParsing::CommandLineParsing( const Personality personality ) ("target,t" , value(&targetArg) ->required() - ->notifier(boost::bind(&CommandLineParsing::validate_target,this,_1)) + ->notifier(boost::bind(&CommandLineParsing::validate_sequenceArgument,this,"target",_1)) , "either an RNA sequence or the stream/file name from where to read the target sequences (should be the longer sequences to increase efficiency); use 'STDIN' to read from standard input stream; sequences have to use IUPAC nucleotide encoding; output alias is [seq1]") + ; + opts_cmdline_short.add(opts_target); + opts_target.add_options() + ("tId" + , value(&tId) + ->default_value("") + ->notifier(boost::bind(&CommandLineParsing::validate_id,this,"tId",_1)) + , std::string("id (FASTA-prefix) to be used for target sequence naming.").c_str()) + (tIdxPos0.name.c_str() + , value(&(tIdxPos0.val)) + ->default_value(tIdxPos0.def) + ->notifier(boost::bind(&CommandLineParsing::validate_numberArgument,this,tIdxPos0,_1)) + , std::string("index of first (5') sequence position of all target sequences" + " (arg in range ["+toString(tIdxPos0.min)+","+toString(tIdxPos0.max)+"];").c_str()) + ("tSet" + , value(&(tSetString)) + ->notifier(boost::bind(&CommandLineParsing::validate_tSet,this,_1)) + , std::string("target subset : List of sequence indices to consider for prediction in the format 'from1-to1,from2-to2,..' assuming indexing starts with 1").c_str()) (tAcc.name.c_str() , value(&(tAcc.val)) ->default_value(tAcc.def) @@ -465,8 +508,8 @@ CommandLineParsing::CommandLineParsing( const Personality personality ) (tAccW.name.c_str() , value(&(tAccW.val)) ->default_value(tAccW.def) - ->notifier(boost::bind(&CommandLineParsing::validate_tAccW,this,_1)) - , std::string("accessibility computation : sliding window size for query accessibility computation" + ->notifier(boost::bind(&CommandLineParsing::validate_numberArgumentExcludeRange,this,tAccW,_1,1,2)) + , std::string("accessibility computation : sliding window size for target accessibility computation" " (arg in range ["+toString(tAccW.min)+","+toString(tAccW.max)+"];" " 0 will use the full sequence length)" " Note, this also restricts the maximal interaction length (see --tIntLenMax)." @@ -474,25 +517,12 @@ CommandLineParsing::CommandLineParsing( const Personality personality ) (tAccL.name.c_str() , value(&(tAccL.val)) ->default_value(tAccL.def) - ->notifier(boost::bind(&CommandLineParsing::validate_tAccL,this,_1)) - , std::string("accessibility computation : maximal loop size (base pair span) for query accessibility computation" + ->notifier(boost::bind(&CommandLineParsing::validate_numberArgumentExcludeRange,this,tAccL,_1,1,2)) + , std::string("accessibility computation : maximal loop size (base pair span) for target accessibility computation" " (arg in range ["+toString(tAccL.min)+","+toString(tAccL.max)+"]; 0 will use the sliding window size 'tAccW')").c_str()) - ; - opts_cmdline_short.add(opts_target); - opts_target.add_options() - (tIdxPos0.name.c_str() - , value(&(tIdxPos0.val)) - ->default_value(tIdxPos0.def) - ->notifier(boost::bind(&CommandLineParsing::validate_numberArgument,this,tIdxPos0,_1)) - , std::string("index of first (5') sequence position of all target sequences" - " (arg in range ["+toString(tIdxPos0.min)+","+toString(tIdxPos0.max)+"];").c_str()) - ("tSet" - , value(&(tSetString)) - ->notifier(boost::bind(&CommandLineParsing::validate_tSet,this,_1)) - , std::string("target subset : List of sequence indices to consider for prediction in the format 'from1-to1,from2-to2,..' assuming indexing starts with 1").c_str()) ("tAccConstr" , value(&(tAccConstr)) - ->notifier(boost::bind(&CommandLineParsing::validate_tAccConstr,this,_1)) + ->notifier(boost::bind(&CommandLineParsing::validate_structureConstraintArgument,this,"tAccConstr",_1)) , std::string("accessibility computation : structure constraint :" "\n EITHER a string of target sequence length encoding for each position:" " '.' no constraint," @@ -525,7 +555,7 @@ CommandLineParsing::CommandLineParsing( const Personality personality ) " (arg in range ["+toString(tIntLoopMax.min)+","+toString(tIntLoopMax.max)+"]; 0 enforces stackings only)").c_str()) ("tRegion" , value(&(tRegionString)) - ->notifier(boost::bind(&CommandLineParsing::validate_tRegion,this,_1)) + ->notifier(boost::bind(&CommandLineParsing::validateRegion,this,"tRegion",_1)) , std::string("interaction site : target regions to be considered for" " interaction prediction. Format =" " 'from1-to1,from2-to2,..' where indexing starts with 'tIdxPos0'." @@ -653,11 +683,11 @@ CommandLineParsing::CommandLineParsing( const Personality personality ) " (arg in range ["+toString(seedMinPu.min)+","+toString(seedMinPu.max)+"]).").c_str()) ("seedQRange" , value(&(seedQRange)) - ->notifier(boost::bind(&CommandLineParsing::validate_seedQRange,this,_1)) + ->notifier(boost::bind(&CommandLineParsing::validate_seedRange,this,"seedQRange",_1)) , std::string("interval(s) in the query to search for seeds in format 'from1-to1,from2-to2,...' (Note, only for single query)").c_str()) ("seedTRange" , value(&(seedTRange)) - ->notifier(boost::bind(&CommandLineParsing::validate_seedTRange,this,_1)) + ->notifier(boost::bind(&CommandLineParsing::validate_seedRange,this,"seedTRange",_1)) , std::string("interval(s) in the target to search for seeds in format 'from1-to1,from2-to2,...' (Note, only for single target)").c_str()) ("seedNoGU", value(&seedNoGU) ->default_value(seedNoGU) @@ -674,15 +704,15 @@ CommandLineParsing::CommandLineParsing( const Personality personality ) opts_shape.add_options() ("qShape" , value(&qShape) - ->notifier(boost::bind(&CommandLineParsing::validate_qShape,this,_1)) + ->notifier(boost::bind(&CommandLineParsing::validate_shape,this,"qShape",_1)) , "file name from where to read the query sequence's SHAPE reactivity data to guide its accessibility computation") ("tShape" , value(&tShape) - ->notifier(boost::bind(&CommandLineParsing::validate_tShape,this,_1)) + ->notifier(boost::bind(&CommandLineParsing::validate_shape,this,"tShape",_1)) , "SHAPE: file name from where to read the target sequence's SHAPE reactivity data to guide its accessibility computation") ("qShapeMethod" , value(&qShapeMethod) - ->notifier(boost::bind(&CommandLineParsing::validate_qShapeMethod,this,_1)) + ->notifier(boost::bind(&CommandLineParsing::validate_shapeMethod,this,"qShapeMethod",_1)) , std::string("SHAPE: method how to integrate SHAPE reactivity data into query accessibility computation via pseudo energies:" "\n" " 'D': Convert by using a linear equation according to Deigan et al. (2009). The calculated pseudo energies will " @@ -697,13 +727,13 @@ CommandLineParsing::CommandLineParsing( const Personality personality ) ).c_str()) ("tShapeMethod" , value(&tShapeMethod) - ->notifier(boost::bind(&CommandLineParsing::validate_tShapeMethod,this,_1)) + ->notifier(boost::bind(&CommandLineParsing::validate_shapeMethod,this,"tShapeMethod",_1)) , std::string("SHAPE: method how to integrate SHAPE reactivity data into target accessibility computation via pseudo energies.\n" "[for encodings see --qShapeMethod]" ).c_str()) ("qShapeConversion" , value(&qShapeConversion) - ->notifier(boost::bind(&CommandLineParsing::validate_qShapeConversion,this,_1)) + ->notifier(boost::bind(&CommandLineParsing::validate_shapeConversion,this,"qShapeConversion",_1)) , std::string("SHAPE: method how to convert SHAPE reactivities to pairing probabilities for query accessibility computation. " "This parameter is useful when dealing with the SHAPE incorporation according to Zarringhalam et al. (2012). " "The following methods can be used to convert SHAPE reactivities into the probability for a certain nucleotide to be unpaired:" @@ -720,7 +750,7 @@ CommandLineParsing::CommandLineParsing( const Personality personality ) ).c_str()) ("tShapeConversion" , value(&tShapeConversion) - ->notifier(boost::bind(&CommandLineParsing::validate_tShapeConversion,this,_1)) + ->notifier(boost::bind(&CommandLineParsing::validate_shapeConversion,this,"tShapeConversion",_1)) , std::string("SHAPE: method how to convert SHAPE reactivities to pairing probabilities for target accessibility computation.\n" "[for encodings see --qShapeConversion]" ).c_str()) @@ -738,9 +768,6 @@ CommandLineParsing::CommandLineParsing( const Personality personality ) "\n 'M' = exact (slow), " "\n 'S' = seed-only" ).c_str()) - ; - opts_cmdline_short.add(opts_inter); - opts_inter.add_options() (model.name.c_str() , value(&(model.val)) ->default_value(model.def) @@ -751,6 +778,48 @@ CommandLineParsing::CommandLineParsing( const Personality personality ) "\n 'B' = single-site, helix-block-based, minimum-free-energy interaction (blocks of stable helices and interior loops only), " "\n 'P' = single-site interaction with minimal free ensemble energy per site (interior loops only)" ).c_str()) + (acc.name.c_str() + , value(&(acc.val)) + ->default_value(acc.def) + ->notifier(boost::bind(&CommandLineParsing::validate_charArgument,this,acc,_1)) + , std::string("accessibility computation :" + "\n 'N' no accessibility contributions" + "\n 'C' computation of accessibilities (see --accW and --accL)" + ).c_str()) + (intLenMax.name.c_str() + , value(&(intLenMax.val)) + ->default_value(intLenMax.def) + ->notifier(boost::bind(&CommandLineParsing::validate_numberArgument,this,intLenMax,_1)) + , std::string("interaction site : maximal window size to be considered for" + " interaction" + " (arg in range ["+toString(intLenMax.min)+","+toString(intLenMax.max)+"];" + " 0 refers to the full sequence length)." + " If --accW is provided, the smaller window size of both is used." + ).c_str()) + (intLoopMax.name.c_str() + , value(&(intLoopMax.val)) + ->default_value(intLoopMax.def) + ->notifier(boost::bind(&CommandLineParsing::validate_numberArgument,this,intLoopMax,_1)) + , std::string("interaction site : maximal number of unpaired bases between neighbored interacting bases to be considered in interactions" + " (arg in range ["+toString(intLoopMax.min)+","+toString(intLoopMax.max)+"]; 0 enforces stackings only)").c_str()) + ; + opts_cmdline_short.add(opts_inter); + opts_inter.add_options() + (accW.name.c_str() + , value(&(accW.val)) + ->default_value(accW.def) + ->notifier(boost::bind(&CommandLineParsing::validate_numberArgumentExcludeRange,this,accW,_1,1,2)) + , std::string("accessibility computation : sliding window size for accessibility computation" + " (arg in range ["+toString(accW.min)+","+toString(accW.max)+"];" + " 0 will use the full sequence length)" + " Note, this also restricts the maximal interaction length (see --intLenMax)." + ).c_str()) + (accL.name.c_str() + , value(&(accL.val)) + ->default_value(accL.def) + ->notifier(boost::bind(&CommandLineParsing::validate_numberArgumentExcludeRange,this,accL,_1,1,2)) + , std::string("accessibility computation : maximal loop size (base pair span) for accessibility computation" + " (arg in range ["+toString(tAccL.min)+","+toString(accL.max)+"]; 0 will use the sliding window size 'accW')").c_str()) ((energy.name+",e").c_str() , value(&(energy.val)) ->default_value(energy.def) @@ -1104,7 +1173,6 @@ parse(int argc, char** argv) // parsing escape literals outSep = unescaped_string::getUnescaped( outSep ); - // open output stream { // open according stream @@ -1118,13 +1186,33 @@ parse(int argc, char** argv) } // parse the sequences - parseSequences("query",queryArg,query,qSet,qIdxPos0.val); - parseSequences("target",targetArg,target,tSet,tIdxPos0.val); + parseSequences("query",qId,queryArg,query,qSet,qIdxPos0.val); + parseSequences("target",tId,targetArg,target,tSet,tIdxPos0.val); // validate accessibility input from file (requires parsed sequences) validate_qAccFile( qAccFile ); validate_tAccFile( tAccFile ); + //////////////// INTERACTION CONSTRAINTS /////////////////// + + // interaction length: meta vs. special + if (intLenMax.isSet()) { + if (qIntLenMax.isSet() && intLenMax.val != qIntLenMax.val) { throw error("--intLenMax and --qIntLenMax are set to different values"); } + if (tIntLenMax.isSet() && intLenMax.val != tIntLenMax.val) { throw error("--intLenMax and --tIntLenMax are set to different values"); } + qIntLenMax.val = intLenMax.val; + tIntLenMax.val = intLenMax.val; + } + + // interior loop length: meta vs. special + if (intLoopMax.isSet()) { + if (qIntLoopMax.isSet() && intLoopMax.val != qIntLoopMax.val) { throw error("--intLoopMax and --qIntLoopMax are set to different values"); } + if (tIntLoopMax.isSet() && intLoopMax.val != tIntLoopMax.val) { throw error("--intLoopMax and --tIntLoopMax are set to different values"); } + qIntLoopMax.val = intLoopMax.val; + tIntLoopMax.val = intLoopMax.val; + } + + //////////////// SEED //////////////////////// + // check seed setup if (noSeedRequired) { // reset model if needed @@ -1135,13 +1223,13 @@ parse(int argc, char** argv) if (mode.val == 'S') throw error("mode=S not applicable for non-seed predictions!"); // input sanity check : maybe seed constraints defined -> warn if (!seedTQ.empty()) LOG(INFO) <<"no seed constraint wanted, but explicit seedTQ provided (will be ignored)"; - if (seedBP.val != seedBP.def) LOG(INFO) <<"no seed constraint wanted, but seedBP provided (will be ignored)"; - if (seedMaxUP.val != seedMaxUP.def) LOG(INFO) <<"no seed constraint wanted, but seedMaxUP provided (will be ignored)"; - if (seedQMaxUP.val != seedQMaxUP.def) LOG(INFO) <<"no seed constraint wanted, but seedQMaxUP provided (will be ignored)"; - if (seedTMaxUP.val != seedTMaxUP.def) LOG(INFO) <<"no seed constraint wanted, but seedTMaxUP provided (will be ignored)"; - if (seedMaxE.val != seedMaxE.def) LOG(INFO) <<"no seed constraint wanted, but seedMaxE provided (will be ignored)"; - if (seedMaxEhybrid.val != seedMaxEhybrid.def) LOG(INFO) <<"no seed constraint wanted, but seedMaxEhybrid provided (will be ignored)"; - if (seedMinPu.val != seedMinPu.def) LOG(INFO) <<"no seed constraint wanted, but seedMinPu provided (will be ignored)"; + if (seedBP.isSet()) LOG(INFO) <<"no seed constraint wanted, but seedBP provided (will be ignored)"; + if (seedMaxUP.isSet()) LOG(INFO) <<"no seed constraint wanted, but seedMaxUP provided (will be ignored)"; + if (seedQMaxUP.isSet()) LOG(INFO) <<"no seed constraint wanted, but seedQMaxUP provided (will be ignored)"; + if (seedTMaxUP.isSet()) LOG(INFO) <<"no seed constraint wanted, but seedTMaxUP provided (will be ignored)"; + if (seedMaxE.isSet()) LOG(INFO) <<"no seed constraint wanted, but seedMaxE provided (will be ignored)"; + if (seedMaxEhybrid.isSet()) LOG(INFO) <<"no seed constraint wanted, but seedMaxEhybrid provided (will be ignored)"; + if (seedMinPu.isSet()) LOG(INFO) <<"no seed constraint wanted, but seedMinPu provided (will be ignored)"; if (!seedQRange.empty()) LOG(INFO) <<"no seed constraint wanted, but seedQRange provided (will be ignored)"; if (!seedTRange.empty()) LOG(INFO) <<"no seed constraint wanted, but seedTRange provided (will be ignored)"; if (seedNoGU) LOG(INFO) <<"no seed constraint wanted, but seedNoGU provided (will be ignored)"; @@ -1183,13 +1271,13 @@ parse(int argc, char** argv) throw error("explicit seed definition only for single query/target available"); } // input sanity check : maybe seed constraints defined -> warn - if (seedBP.val != seedBP.def) LOG(INFO) <<"explicit seeds defined, but seedBP provided (will be ignored)"; - if (seedMaxUP.val != seedMaxUP.def) LOG(INFO) <<"explicit seeds defined, but seedMaxUP provided (will be ignored)"; - if (seedQMaxUP.val != seedQMaxUP.def) LOG(INFO) <<"explicit seeds defined, but seedQMaxUP provided (will be ignored)"; - if (seedTMaxUP.val != seedTMaxUP.def) LOG(INFO) <<"explicit seeds defined, but seedTMaxUP provided (will be ignored)"; - if (seedMaxE.val != seedMaxE.def) LOG(INFO) <<"explicit seeds defined, but seedMaxE provided (will be ignored)"; - if (seedMaxEhybrid.val != seedMaxEhybrid.def) LOG(INFO) <<"explicit seeds defined, but seedMaxEhybrid provided (will be ignored)"; - if (seedMinPu.val != seedMinPu.def) LOG(INFO) <<"explicit seeds defined, but seedMinPu provided (will be ignored)"; + if (seedBP.isSet()) LOG(INFO) <<"explicit seeds defined, but seedBP provided (will be ignored)"; + if (seedMaxUP.isSet()) LOG(INFO) <<"explicit seeds defined, but seedMaxUP provided (will be ignored)"; + if (seedQMaxUP.isSet()) LOG(INFO) <<"explicit seeds defined, but seedQMaxUP provided (will be ignored)"; + if (seedTMaxUP.isSet()) LOG(INFO) <<"explicit seeds defined, but seedTMaxUP provided (will be ignored)"; + if (seedMaxE.isSet()) LOG(INFO) <<"explicit seeds defined, but seedMaxE provided (will be ignored)"; + if (seedMaxEhybrid.isSet()) LOG(INFO) <<"explicit seeds defined, but seedMaxEhybrid provided (will be ignored)"; + if (seedMinPu.isSet()) LOG(INFO) <<"explicit seeds defined, but seedMinPu provided (will be ignored)"; if (!seedQRange.empty()) LOG(INFO) <<"explicit seeds defined, but seedQRange provided (will be ignored)"; if (!seedTRange.empty()) LOG(INFO) <<"explicit seeds defined, but seedTRange provided (will be ignored)"; if (seedNoGU) LOG(INFO) <<"explicit seeds defined, but seedNoGU provided (will be ignored)"; @@ -1216,55 +1304,27 @@ parse(int argc, char** argv) parseRegion( "qRegion", qRegionString, query, qRegion ); parseRegion( "tRegion", tRegionString, target, tRegion ); + //////////////// ACCESSIBILITY CONSTRAINTS /////////////////// - //////////////// WINDOW-BASED COMPUTATION /////////////////// - - // check if window-based computation enabled - if (windowWidth.val > 0) { - // minimal window width - if (windowWidth.val < 10) { - throw error("window-based computation: --windowWidth should be at least 10 but is "+toString(windowWidth.val)); - } - // ensure interaction length is restricted - size_t maxIntLength = 0; - if (qAcc.val != 'N') { // check if accessibility computation enabled - if (qIntLenMax.val == 0 && qAccW.val == 0) { - throw error("window-based computation: maximal query interaction length has to be restricted either via --qAccW or --qIntLenMax"); - } else { // update maximum - maxIntLength = std::max(qIntLenMax.val, qAccW.val); - } - } else { // max interaction length of query - if (qIntLenMax.val == 0) { - throw error("window-based computation: maximal query interaction length has to be restricted via --qIntLenMax"); - } else { - maxIntLength = qIntLenMax.val; - } - } - if (tAcc.val != 'N') { - if (tIntLenMax.val == 0 && tAccW.val == 0) { - throw error("window-based computation: maximal target interaction length has to be restricted either via --tAccW or --tIntLenMax"); - } else { // update maximum - maxIntLength = maxIntLength < std::max(tAccW.val,tIntLenMax.val) ? std::max(tAccW.val,tIntLenMax.val) : maxIntLength; - } - } else { - if (tIntLenMax.val == 0) { - throw error("window-based computation: maximal target interaction length has to be restricted via --tIntLenMax"); - } else { // update maximum - maxIntLength = maxIntLength < tIntLenMax.val ? tIntLenMax.val : maxIntLength; - } - } - if (windowOverlap.val < maxIntLength) { - throw error("window-based computation: --windowOverlap ("+toString(windowOverlap.val)+") has to be at least as large as the maximal interaction length, i.e. max of --q|tAccW and --q|tIntLenMax ("+toString(maxIntLength)+")"); - } - if (windowWidth.val <= windowOverlap.val) { - throw error("window-based computation: --windowWidth ("+toString(windowWidth.val)+") has to exceed --windowOverlap ("+toString(windowOverlap.val)+")"); - } - if (outNumber.val > 1 && outOverlap.val != 'B') { - throw error("window-based computation: non-overlapping subopt output (-n > 1) only supported for --outOverlap=B"); - } + // accessibility setup: meta vs. special + if (acc.isSet()) { + if (qAcc.isSet() && acc.val != qAcc.val) { throw error("--acc and --qAcc are set to different values"); } + if (tAcc.isSet() && acc.val != tAcc.val) { throw error("--acc and --tAcc are set to different values"); } + qAcc.val = acc.val; + tAcc.val = acc.val; + } + if (accW.isSet()) { + if (qAccW.isSet() && accW.val != qAccW.val) { throw error("--accW and --qAccW are set to different values"); } + if (tAccW.isSet() && accW.val != tAccW.val) { throw error("--accW and --tAccW are set to different values"); } + qAccW.val = accW.val; + tAccW.val = accW.val; + } + if (accL.isSet()) { + if (qAccL.isSet() && accL.val != qAccL.val) { throw error("--accL and --qAccL are set to different values"); } + if (tAccL.isSet() && accL.val != tAccL.val) { throw error("--accL and --tAccL are set to different values"); } + qAccL.val = accL.val; + tAccL.val = accL.val; } - - //////////////// ACCESSIBILITY CONSTRAINTS /////////////////// // check qAccConstr - query sequence compatibility if (vm.count("qAccConstr") > 0) { @@ -1306,8 +1366,8 @@ parse(int argc, char** argv) break; } // drop to next handling case 'N' : { - if (qAccL.val != qAccL.def) LOG(INFO) <<"qAcc = "< 1 || query.size() > 1) { throw error("outmode=E allows only single sequence input for query and target"); } + // ensure separator is not used within sequence name + if (qId.find(outSep) != std::string::npos) { throw error("--qId contains --outSep separator..."); } + if (tId.find(outSep) != std::string::npos) { throw error("--tId contains --outSep separator..."); } + break; + case 'C' : + // ensure separator is not used within sequence name + if (qId.find(outSep) != std::string::npos) { throw error("--qId contains --outSep separator..."); } + if (tId.find(outSep) != std::string::npos) { throw error("--tId contains --outSep separator..."); } + break; + default: + break; } // check CSV stuff @@ -1394,6 +1469,57 @@ parse(int argc, char** argv) } } + + //////////////// WINDOW-BASED COMPUTATION /////////////////// + + // check if window-based computation enabled + if (windowWidth.val > 0) { + // minimal window width + if (windowWidth.val < 10) { + throw error("window-based computation: --windowWidth should be at least 10 but is "+toString(windowWidth.val)); + } + // ensure interaction length is restricted + size_t maxIntLength = 0; + if (qAcc.val != 'N') { // check if accessibility computation enabled + if (qIntLenMax.val == 0 && qAccW.val == 0) { + throw error("window-based computation: maximal query interaction length has to be restricted either via --qAccW or --qIntLenMax"); + } else { // update maximum + maxIntLength = std::max(qIntLenMax.val, qAccW.val); + } + } else { // max interaction length of query + if (qIntLenMax.val == 0) { + throw error("window-based computation: maximal query interaction length has to be restricted via --qIntLenMax"); + } else { + maxIntLength = qIntLenMax.val; + } + } + if (tAcc.val != 'N') { + if (tIntLenMax.val == 0 && tAccW.val == 0) { + throw error("window-based computation: maximal target interaction length has to be restricted either via --tAccW or --tIntLenMax"); + } else { // update maximum + maxIntLength = maxIntLength < std::max(tAccW.val,tIntLenMax.val) ? std::max(tAccW.val,tIntLenMax.val) : maxIntLength; + } + } else { + if (tIntLenMax.val == 0) { + throw error("window-based computation: maximal target interaction length has to be restricted via --tIntLenMax"); + } else { // update maximum + maxIntLength = maxIntLength < tIntLenMax.val ? tIntLenMax.val : maxIntLength; + } + } + if (windowOverlap.val < maxIntLength) { + throw error("window-based computation: --windowOverlap ("+toString(windowOverlap.val)+") has to be at least as large as the maximal interaction length, i.e. max of --q|tAccW and --q|tIntLenMax ("+toString(maxIntLength)+")"); + } + if (windowWidth.val <= windowOverlap.val) { + throw error("window-based computation: --windowWidth ("+toString(windowWidth.val)+") has to exceed --windowOverlap ("+toString(windowOverlap.val)+")"); + } + if (outNumber.val > 1 && outOverlap.val != 'B') { + throw error("window-based computation: non-overlapping subopt output (-n > 1) only supported for --outOverlap=B"); + } + } + + + //////////////// MODEL //////////////////////// + // final checks if parameter set compatible with model switch (model.val) { // ensemble based predictions @@ -1908,6 +2034,7 @@ getOutputConstraint() const void CommandLineParsing:: parseSequences(const std::string & paramName, + const std::string & idPrefix, const std::string& paramArg, RnaSequenceVec& sequences, const IndexRangeList & seqSubset, @@ -1919,13 +2046,13 @@ parseSequences(const std::string & paramName, // read FASTA from STDIN stream if (boost::iequals(paramArg,"STDIN")) { - parseSequencesFasta(paramName, std::cin, sequences, seqSubset,idxPos0); + parseSequencesFasta(paramName,idPrefix, std::cin, sequences, seqSubset,idxPos0); } else if (RnaSequence::isValidSequenceIUPAC(paramArg)) { // check if sequence is to be stored if (seqSubset.empty() || seqSubset.covers(1)) { // direct sequence input - sequences.push_back(RnaSequence(paramName,paramArg,idxPos0)); + sequences.push_back(RnaSequence(idPrefix.empty()?paramName:idPrefix,paramArg,idxPos0)); } else { // would result in no sequence -> error LOG(ERROR) <<"Parsing of "< qIdxPos0; //! subset of query sequence indices to be processed @@ -541,6 +553,8 @@ class CommandLineParsing { std::string targetArg; //! the container holding all target sequences RnaSequenceVec target; + //! the id (prefix) to be used for target naming + std::string tId; //! in/output index of pos 0 (of all targets) NumberParameter tIdxPos0; //! subset of target sequence indices to be processed @@ -578,6 +592,22 @@ class CommandLineParsing { //! probabilities for according accessibility prediction std::string tShapeConversion; + // META PARAMETER applied to both query and target + //! accessibility computation mode + CharParameter acc; + //! window length for accessibility computation (plFold) + NumberParameter accW; + //! maximal base pair span for accessibility computation (plFold) + NumberParameter accL; + //! whether or not lonely base pairs are allowed in accessibility computation + bool accNoLP; + //! whether or not GU base pairs are allowed to close loops in accessibility computation + bool accNoGUend; + //! window length to be considered accessible/interacting + NumberParameter intLenMax; + //! maximal internal loop length to be considered accessible/interacting + NumberParameter intLoopMax; + //! the minimal number of base pairs allowed in the helix (>2) NumberParameter helixMinBP; @@ -650,11 +680,6 @@ class CommandLineParsing { bool energyNoDangles; - //! whether or not lonely base pairs are allowed in accessibility computation - bool accNoLP; - //! whether or not GU base pairs are allowed to close loops in accessibility computation - bool accNoGUend; - //! where to write the output to and for each in what format // std::string out; std::vector< std::string > out; @@ -731,6 +756,7 @@ class CommandLineParsing { resetParamDefault( Param & param, Value value ) { if (param.def != value) { param.def = value; + param.val = value; VLOG(1) <<" "< void validate_numberArgument(const NumberParameter & param, const T& value) { - // alphabet check + // range check if ( ! param.isInRange(value) ) { LOG(ERROR) < + void validate_numberArgumentExcludeRange(const NumberParameter & param, const T& value, const T& minExcl, const T& maxExcl) + { + // standard check + validate_numberArgument( param, value ); + + // check excluded range + if (param.val >= minExcl && param.val <= maxExcl) { + LOG(ERROR) <<"\n "< "<(qAccW,value); - // check lower bound - if (qAccW.val > 0 && qAccW.val < 3) { - LOG(ERROR) <<"\n qAccW = " < 3"; - updateParsingCode(ReturnCode::STOP_PARSING_ERROR); - } -} - -//////////////////////////////////////////////////////////////////////////// - -inline -void CommandLineParsing::validate_qAccL(const int & value) -{ - // standard check - validate_numberArgument(qAccL,value); - // check lower bound - if (qAccL.val > 0 && qAccL.val < 3) { - LOG(ERROR) <<"qAccL = " < 3"; - updateParsingCode(ReturnCode::STOP_PARSING_ERROR); - } -} - -//////////////////////////////////////////////////////////////////////////// - -inline -void CommandLineParsing::validate_qAccConstr(const std::string & value) -{ - // forward check to general method - validate_structureConstraintArgument("qAccConstr", value); -} - -//////////////////////////////////////////////////////////////////////////// - inline void CommandLineParsing::validate_qAccFile(const std::string & value) { @@ -1234,7 +1179,7 @@ void CommandLineParsing::validate_qAccFile(const std::string & value) // if not STDIN if ( boost::iequals(value,"STDIN") ) { if (getTargetSequences().size()>1) { - LOG(ERROR) <<"reading quary accessibilities for multiple sequences from '"<(tAccW,value); - // check lower bound - if (tAccW.val > 0 && tAccW.val < 3) { - LOG(ERROR) <<"tAccW = " < 3"; - updateParsingCode(ReturnCode::STOP_PARSING_ERROR); - } -} - -//////////////////////////////////////////////////////////////////////////// - -inline -void CommandLineParsing::validate_tAccL(const int & value) -{ - // standard check - validate_numberArgument(tAccL,value); - // check lower bound - if (tAccL.val > 0 && tAccL.val < 3) { - LOG(ERROR) <<"tAccL = " < 3"; - updateParsingCode(ReturnCode::STOP_PARSING_ERROR); - } -} - -//////////////////////////////////////////////////////////////////////////// - -inline -void CommandLineParsing::validate_tAccConstr(const std::string & value) -{ - // forward check to general method - validate_structureConstraintArgument("tAccConstr", value); -} - -//////////////////////////////////////////////////////////////////////////// - inline void CommandLineParsing::validate_tAccFile(const std::string & value) { @@ -1387,48 +1279,6 @@ void CommandLineParsing::validate_tAccFile(const std::string & value) //////////////////////////////////////////////////////////////////////////// -inline -void CommandLineParsing::validate_tRegion(const std::string & value) { - // check and store region information - validateRegion( "tRegion", value ); -} - -//////////////////////////////////////////////////////////////////////////// - -inline -void CommandLineParsing::validate_tShape( const std::string & value ) -{ - if (!validateFile( value )) { - LOG(ERROR) <<"Can not access/read target's SHAPE reactivity file '" < Date: Mon, 17 Feb 2020 14:31:44 +0100 Subject: [PATCH 10/16] ensure intLenMax >= seedBP --- ChangeLog | 1 + src/bin/CommandLineParsing.cpp | 3 +++ 2 files changed, 4 insertions(+) diff --git a/ChangeLog b/ChangeLog index 290cbcac..a7a68e74 100644 --- a/ChangeLog +++ b/ChangeLog @@ -57,6 +57,7 @@ + additional checks for meta arguments with setup of q|t variables + usage of .isSet() where appropriate + check that outSep is not within id prefix in outMode=C + + ensure intLenMax >= seedBP 200207 Martin Raden + Fabio Gutmann + python/CopomuS.py : Compensatory mutation selector to support interaction diff --git a/src/bin/CommandLineParsing.cpp b/src/bin/CommandLineParsing.cpp index 60c103a7..697e13b5 100644 --- a/src/bin/CommandLineParsing.cpp +++ b/src/bin/CommandLineParsing.cpp @@ -1283,6 +1283,9 @@ parse(int argc, char** argv) if (seedNoGU) LOG(INFO) <<"explicit seeds defined, but seedNoGU provided (will be ignored)"; if (seedNoGUend) LOG(INFO) <<"explicit seeds defined, but seedNoGUend provided (will be ignored)"; } + // compare maximal interaction length with minimal seed length + if (qIntLenMax.val > 0 && qIntLenMax.val < seedBP.val) { throw error("maximal query interaction length < seedBP"); } + if (tIntLenMax.val > 0 && tIntLenMax.val < seedBP.val) { throw error("maximal target interaction length < seedBP"); } } /////////////// PARSE AND PREPARE PREDICTION RANGES ////////////// From ff6f3929d4014149a3488861086b5a4ac30ba00c Mon Sep 17 00:00:00 2001 From: martin-mann Date: Mon, 17 Feb 2020 14:41:36 +0100 Subject: [PATCH 11/16] docu meta arguments --- README.md | 28 ++++++++++++++-------------- 1 file changed, 14 insertions(+), 14 deletions(-) diff --git a/README.md b/README.md index f4643cc8..27ab8054 100644 --- a/README.md +++ b/README.md @@ -497,10 +497,8 @@ mode = M # no seed constraint noSeed = true # full global accessibility computation -qAccW = 0 -qAccL = 0 -tAccW = 0 -tAccL = 0 +accW = 0 +accL = 0 ``` where you use the long parameter names without the leading `--`. @@ -750,7 +748,7 @@ by running IntaRNA when disabling both the seed constraint as well as the accessibility integration using ```bash # prediction results similar to TargetScan/RNAhybrid -IntaRNA [..] --noSeed --qAcc=N --tAcc=N +IntaRNA [..] --noSeed --acc=N ``` We *add seed-constraint support to TargetScan/RNAhybrid-like computations* by removing the `--noSeed` flag from the above call. @@ -767,7 +765,7 @@ Finally, RNAup only predicts interactions for subsequences of length 25. All this can be setup using ```bash # prediction results similar to 'RNAup -b' (incorporating accessibility of both RNAs) -IntaRNA --mode=M --noSeed --qAccW=0 --qAccL=0 --qIntLenMax=25 --tAccW=0 --tAccL=0 --tIntLenMax=25 +IntaRNA --mode=M --noSeed --accW=0 --accL=0 --intLenMax=25 ``` We *add seed-constraint support to RNAup-like computations* by removing the `--noSeed` flag from the above call. @@ -1055,8 +1053,8 @@ which results in two shorter ranges. If a range is shorter than `--seedBP`, it is completely removed. Finally, it is possible to restrict the overall length an interaction is allowed -to have. This can be done independently for the query and target sequence using -`--qIntLenMax` and `--tIntLenMax`, respectively. By setting both to 0 (default), +to have via `--intLenMax`. This can be done independently for the query and target sequence using +`--qIntLenMax` and `--tIntLenMax`, respectively. By setting to 0 (default), the smaller of the full sequence length and the maximal accessibility-window size (`--tAccW`, `--qAccW`) is used. @@ -1610,8 +1608,8 @@ If no value for `--energyVRNA` is provided, the default model of the underlying Vienna RNA package is used (see respective documentation). To increase prediction quality and to reduce the computational complexity, the -number of unpaired bases between intermolecular base pairs is restricted -(similar to internal loop length restrictions in single RNA folding algorithm). The +number of unpaired bases between intermolecular base pairs is restricted via +`--intLoopMax` (similar to internal loop length restrictions in single RNA folding algorithm). The upper bound can be set independently for the query and target sequence via `--qIntLoopMax` and `--tIntLoopMax`, respectively, and defaults to 16. @@ -1856,10 +1854,11 @@ penalty for a subsequence partaking in an interaction is given by *ED=-RT log(Pu where *Pu* denotes the unpaired probability of the subsequence. Within the IntaRNA energy model, *ED* values for both interacting subsequences are considered. -Accessibility incorporation can be disabled for query or target sequences using +To globally turn off accessibility consideration, set `--acc=N`. +Accessibility incorporation can be disabled separately for query or target sequences using `--qAcc=N` or `--tAcc=N`, respectively. -A setup of `--qAcc=C` or `--tAcc=C` (default) enables accessibility computation +A setup of `--acc=C` (default, as for `--qAcc=C` and `--tAcc=C`) enables accessibility computation using the selected [energy model](#energy) for query or target sequences, respectively. Using `--accNoLP` and `--accNoGUend`, the consideration of lonely base pairs @@ -1885,7 +1884,8 @@ unpaired probabilities within the window (while only intramolecular base pairs within the window are considered). IntaRNA enables both global as well as local unpaired probability computation. -To this end, the sliding window length has to be specified in order to enable/disable +To this end, the sliding window length `--accW` and the maximal base pair span +`--accL` (<= `--accW`) have to be specified in order to enable/disable local folding. ##### Use case examples global/local unpaired probability computation @@ -1899,7 +1899,7 @@ i.e. the maximal number of positions enclosed by a base pair independently while respecting `AccL <= AccW`. ```bash # using global accessibilities for target and query -IntaRNA [..] --qAccW=0 --qAccL=0 --tAccW=0 --qAccL=0 +IntaRNA [..] --accW=0 --accL=0 # using global accessibilities for query and local ones for target IntaRNA [..] --qAccW=0 --qAccL=0 --tAccW=150 --qAccL=100 ``` From 0ab91bed15ebaaae0673ed3a130b71e53f078029 Mon Sep 17 00:00:00 2001 From: martin-mann Date: Mon, 2 Mar 2020 09:34:34 +0100 Subject: [PATCH 12/16] + missing parsing of negative indices --- src/IntaRNA/IndexRange.h | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/src/IntaRNA/IndexRange.h b/src/IntaRNA/IndexRange.h index c5258b4e..5b4285db 100644 --- a/src/IntaRNA/IndexRange.h +++ b/src/IntaRNA/IndexRange.h @@ -176,8 +176,12 @@ class IndexRange { if( ! boost::regex_match(stringEncoding, regex, boost::match_perl) ) { throw std::runtime_error("IndexRange::fromString("+stringEncoding+") uses no valid index range string encoding"); } - // find split position - const size_t splitPos = stringEncoding.find('-'); + // find split position assuming second index to be negative + size_t splitPos = stringEncoding.find("--"); + if (splitPos == std::string::npos) { + // last index should not be negative, thus, split at last '-' + splitPos = stringEncoding.find_last_of('-'); + } // parse interval boundaries if (seq == NULL) { from = boost::lexical_cast(stringEncoding.substr(0,splitPos)); From 72ef3820cff59af73721ed4f168ed55bbd490121 Mon Sep 17 00:00:00 2001 From: martin-mann Date: Mon, 2 Mar 2020 09:37:19 +0100 Subject: [PATCH 13/16] areComplementary() positions accessible --- ChangeLog | 10 ++++++++++ src/IntaRNA/InteractionEnergy.h | 7 +++++-- 2 files changed, 15 insertions(+), 2 deletions(-) diff --git a/ChangeLog b/ChangeLog index a7a68e74..465d3d23 100644 --- a/ChangeLog +++ b/ChangeLog @@ -11,6 +11,7 @@ ################################################################################ # IntaRNA +- BUGFIX: accessibility blocking constraints were only applied to seed location - improved Zall estimate (and depending values) for `--model=X` (default) - new arguments: - `qId|tId` : optional id (FASTA prefix) setup for sequence naming @@ -26,6 +27,15 @@ ################################################################################ +200302 Martin Raden + * IntaRNA/IndexRange : + * fromString() : + * bugfix: + missing parsing of negative indices + * IntaRNA/InteractionEnergy : + * areComplementary() : + + check if both positions are accessible (to avoid additional checks in + predictor recursions) + 200217 Martin Raden * bin/CommandLineParsing : * NumericParameter : diff --git a/src/IntaRNA/InteractionEnergy.h b/src/IntaRNA/InteractionEnergy.h index 53bbfd17..0241bde5 100644 --- a/src/IntaRNA/InteractionEnergy.h +++ b/src/IntaRNA/InteractionEnergy.h @@ -148,7 +148,8 @@ class InteractionEnergy { * Checks whether or not two positions can form a base pair * @param i1 index in first sequence * @param i2 index in second sequence - * @return true if seq1(i1) can form a base pair with seq2(i2) + * @return true if seq1(i1) can form a base pair with seq2(i2) and both + * positions are accessible */ virtual bool @@ -685,7 +686,9 @@ bool InteractionEnergy:: areComplementary( const size_t i1, const size_t i2 ) const { - return RnaSequence::areComplementary( accS1.getSequence(), accS2.getSequence(), i1, i2); + return RnaSequence::areComplementary( accS1.getSequence(), accS2.getSequence(), i1, i2) + && isAccessible1(i1) + && isAccessible2(i2); } //////////////////////////////////////////////////////////////////////////// From 59a12d615ee3566caf06ea166bb8cb85b859ece6 Mon Sep 17 00:00:00 2001 From: martin-mann Date: Mon, 2 Mar 2020 09:42:00 +0100 Subject: [PATCH 14/16] style --- ChangeLog | 54 +++++++++++++++++++++++++++--------------------------- 1 file changed, 27 insertions(+), 27 deletions(-) diff --git a/ChangeLog b/ChangeLog index 465d3d23..37e6e16c 100644 --- a/ChangeLog +++ b/ChangeLog @@ -41,33 +41,33 @@ * NumericParameter : * CharParameter : + isSet() : checks if value and default differ - + qId|tId : id (prefix) for sequence naming - + acc|accW|accL : meta for q|t* - + intLenMax|intLoopMax : meta for q|t* - * resetParamDefault() : - + overwrite value (since set to default in constructor) - + validate_region|shape|shapeMethod|shapeConversion : generic checks for q|t - + validate_seedRange : generic checks for q|t - + validate_numberArgumentExcludeRange() : generic check excluding a range - + validate_id() : checks for line breaks in q|tId - * parseSequences() : - * parseSequencesFasta() : - + idPrefix handling - - obsolete functions - - validate_query|target - - validate_qAccW|L|Constr - - validate_q|tRegion|Shape|ShapeMethod|ShapeConversion - - validate_seedQ|TRange - * constructor() : - + extended default reset for new arguments - * q|tAcc* arguments now hidden - * validation calls refactored - + model|acc|intLenMax|intLoopMax now general arguments - * parse() : - + additional checks for meta arguments with setup of q|t variables - + usage of .isSet() where appropriate - + check that outSep is not within id prefix in outMode=C - + ensure intLenMax >= seedBP + + qId|tId : id (prefix) for sequence naming + + acc|accW|accL : meta for q|t* + + intLenMax|intLoopMax : meta for q|t* + * resetParamDefault() : + + overwrite value (since set to default in constructor) + + validate_region|shape|shapeMethod|shapeConversion : generic checks for q|t + + validate_seedRange : generic checks for q|t + + validate_numberArgumentExcludeRange() : generic check excluding a range + + validate_id() : checks for line breaks in q|tId + * parseSequences() : + * parseSequencesFasta() : + + idPrefix handling + - obsolete functions + - validate_query|target + - validate_qAccW|L|Constr + - validate_q|tRegion|Shape|ShapeMethod|ShapeConversion + - validate_seedQ|TRange + * constructor() : + + extended default reset for new arguments + * q|tAcc* arguments now hidden + * validation calls refactored + + model|acc|intLenMax|intLoopMax now general arguments + * parse() : + + additional checks for meta arguments with setup of q|t variables + + usage of .isSet() where appropriate + + check that outSep is not within id prefix in outMode=C + + ensure intLenMax >= seedBP 200207 Martin Raden + Fabio Gutmann + python/CopomuS.py : Compensatory mutation selector to support interaction From 306d98c54ec444a9c979f5d9bd0b76ac5c6c0634 Mon Sep 17 00:00:00 2001 From: martin-mann Date: Mon, 2 Mar 2020 09:52:07 +0100 Subject: [PATCH 15/16] * ambiguous nt warning now in verbose --- ChangeLog | 2 ++ README.md | 11 +++++++---- src/bin/IntaRNA.cpp | 6 +++--- 3 files changed, 12 insertions(+), 7 deletions(-) diff --git a/ChangeLog b/ChangeLog index 37e6e16c..5d685260 100644 --- a/ChangeLog +++ b/ChangeLog @@ -35,6 +35,8 @@ * areComplementary() : + check if both positions are accessible (to avoid additional checks in predictor recursions) + * bin/IntaRNA : + * ambiguous nt warning now in verbose log (was info log) 200217 Martin Raden * bin/CommandLineParsing : diff --git a/README.md b/README.md index 27ab8054..9cb92b71 100644 --- a/README.md +++ b/README.md @@ -412,9 +412,12 @@ interaction energy = -10.85 kcal/mol ``` -or provide (multiple) sequence(s) in [FASTA-format](#https://en.wikipedia.org/wiki/FASTA_format). -It is possible to provide either file input or to read the FASTA input from the -STDIN stream. +In case you need specific RNA names in your output, you can provide ID strings +for each RNA using e.g. `--tId="mRNA with GC"` or `--qId="sRNA-example"`. + +Multiple sequences can be provided in [FASTA-format](#https://en.wikipedia.org/wiki/FASTA_format). +It is possible to use either file input or to read the FASTA input from the +`STDIN` stream. ```bash # running IntaRNA with FASTA files @@ -1056,7 +1059,7 @@ Finally, it is possible to restrict the overall length an interaction is allowed to have via `--intLenMax`. This can be done independently for the query and target sequence using `--qIntLenMax` and `--tIntLenMax`, respectively. By setting to 0 (default), the smaller of the full sequence length and the maximal accessibility-window -size (`--tAccW`, `--qAccW`) is used. +size is used (see `--accW`, `--tAccW`, or `--qAccW`). diff --git a/src/bin/IntaRNA.cpp b/src/bin/IntaRNA.cpp index 970c5aa0..92f250d4 100644 --- a/src/bin/IntaRNA.cpp +++ b/src/bin/IntaRNA.cpp @@ -113,7 +113,7 @@ int main(int argc, char **argv){ #if INTARNA_MULITHREADING #pragma omp critical(intarna_omp_logOutput) #endif - LOG(INFO) <<"Sequence '"<getSequence().getId() + VLOG(1) <<"Sequence '"<getSequence().getId() <<"' contains ambiguous nucleotide encodings. These positions are ignored for interaction computation."; } #if INTARNA_MULITHREADING @@ -182,8 +182,8 @@ int main(int argc, char **argv){ #if INTARNA_MULITHREADING #pragma omp critical(intarna_omp_logOutput) #endif - { LOG(INFO) <<"Sequence '"<getSequence().getId() - <<"' contains ambiguous IUPAC nucleotide encodings. These positions are ignored for interaction computation and replaced by 'N'.";} + { VLOG(1) <<"Sequence '"<getSequence().getId() + <<"' contains ambiguous IUPAC nucleotide encodings. These positions are ignored for interaction computation and are replaced by 'N'.";} } // second: iterate over all query sequences From 8f616376e4362f29b2282bb3b8adfd7bb21e4461 Mon Sep 17 00:00:00 2001 From: martin-mann Date: Mon, 2 Mar 2020 09:53:26 +0100 Subject: [PATCH 16/16] v3.2.0 --- ChangeLog | 13 +++++++++---- 1 file changed, 9 insertions(+), 4 deletions(-) diff --git a/ChangeLog b/ChangeLog index 5d685260..0b2b9d38 100644 --- a/ChangeLog +++ b/ChangeLog @@ -10,6 +10,15 @@ # changes in development version since last release ################################################################################ + + +################################################################################ +################################################################################ + +################################################################################ +### version 3.2.0 +################################################################################ + # IntaRNA - BUGFIX: accessibility blocking constraints were only applied to seed location - improved Zall estimate (and depending values) for `--model=X` (default) @@ -21,10 +30,6 @@ # CopomuS - Compensatory mutation selector to support interaction validation experiments - -################################################################################ -################################################################################ - ################################################################################ 200302 Martin Raden