star: Add optional arguments to control memory usage, related to #134

The new optional arguments genomeSAsparseD and genomeSAindexNbases let users control how much memory STAR requires when generating a genome index.
tomazc · Sep 20, 2017 · ef20687 · ef20687
1 parent 915c976
commit ef20687
Show file tree

Hide file tree

Showing 3 changed files with 18 additions and 8 deletions.
diff --git a/iCount/cli.py b/iCount/cli.py
@@ -117,7 +117,7 @@ def _extract_parameter_data(function):
 
     Every parameter in returned object can have the following entries:
 
-        * name - the name of parameter, preceeded by '--' if it is optional
+        * name - the name of parameter, preceded by '--' if it is optional
         * default - the default value (only for optional parameters). Extracted
           from function signature.
         * type - type of parameter, extracted from function docstring. If not
@@ -391,7 +391,7 @@ def verbose_help(mode):
 
     # all_args command:
     def all_args():
-        """Print all posssible parameter names and CLI commands where they are used."""
+        """Print all possible parameter names and CLI commands where they are used."""
         for param_name, commands in sorted(PARAMETERS.items(), key=lambda x: x[0].lstrip('-')):
             if param_name in SHORT_OPTARG_NAMES:
                 short_name = ' ({})'.format(SHORT_OPTARG_NAMES[param_name])

diff --git a/iCount/examples/tutorial.sh b/iCount/examples/tutorial.sh
@@ -4,16 +4,16 @@ set -vx
 mkdir tutorial_example
 cd tutorial_example
 
-iCount releases
+iCount releases --source ensembl
 
-iCount species -r 88
+iCount species --source ensembl -r 88
 
-iCount genome homo_sapiens -r 88 --chromosomes 21 MT
+iCount genome --source ensembl homo_sapiens 88 --chromosomes 21 MT
 
-iCount annotation homo_sapiens -r 88
+iCount annotation --source ensembl homo_sapiens 88
 
 mkdir hs88
-iCount indexstar homo_sapiens.88.chr21_MT.fa.gz hs88 --annotation homo_sapiens.88.gtf.gz
+iCount indexstar homo_sapiens.88.chr21_MT.fa.gz hs88 --annotation homo_sapiens.88.gtf.gz --genomeSAsparseD 2 --genomeSAindexNbases 15
 
 # the whole data set [880 MB] is available here:
 #wget http://icount.fri.uni-lj.si/data/20101116_LUjh03/\

diff --git a/iCount/externals/star.py b/iCount/externals/star.py
@@ -58,7 +58,8 @@ def get_version():
         return None
 
 
-def build_index(genome, genome_index, annotation='', overhang=100, overhang_min=8, threads=1):
+def build_index(genome, genome_index, annotation='', overhang=100, overhang_min=8, threads=1,
+                genomeSAsparseD=1, genomeSAindexNbases=14):
     """
     Call STAR to generate genome index, which is used for mapping.
 
@@ -77,6 +78,13 @@ def build_index(genome, genome_index, annotation='', overhang=100, overhang_min=
         TODO
     threads : int
         Number of threads that STAR can use for generating index.
+    genomeSAsparseD : int
+        Suffix array sparsity, i.e. the distance between indices. Larger
+        values decrease RAM requirements at the cost of slower mapping.
+        Suggested values are 1 (30 GB RAM) or 2 (16 GB RAM).
+    genomeSAindexNbases : int
+        Length (in bases) of the suffix array pre-indexing string, typically
+        between 10 and 15. Longer strings use more memory but speed up searches.
 
     Returns
     -------
@@ -95,6 +103,8 @@ def build_index(genome, genome_index, annotation='', overhang=100, overhang_min=
     args = [
         'STAR',
         '--runThreadN', '{:d}'.format(threads),
+        '--genomeSAsparseD', '{:d}'.format(genomeSAsparseD),
+        '--genomeSAindexNbases', '{:d}'.format(genomeSAindexNbases),
         '--runMode', 'genomeGenerate',
         '--genomeDir', '{:s}'.format(genome_index),
         '--genomeFastaFiles', '{:s}'.format(genome_fname2),