In cmd.py argparse common_args(), set default number of threads to all available if --threads is unspecified (#104)

* In cmd.py argparse common_args(), set the default number of threads to all available if `--threads` is unspecified. Previously, if the threads arg was None, it was up to the consuming function to set the thread count to all available. With this change, the new default is to use all available cores. Additionally, this sanitizes the user-requested thread count via util.misc.sanitize_thread_count(), if a value is specified. This was already the behavior in most multi-threaded functions, through separate calls to util.misc.sanitize_thread_count() wherever a threads arg is consumed; those calls could potentially be refactored out if we relied solely on the argparse interface, though they should be preserved for Python import usage of the same functions (including some test cases). Changing the default causes no changes where existing separate sanitize_thread_count() calls are used. This also corrects a call to count_and_sort_barcodes() where the threads arg was not being passed. * Add pandas to Python dependencies.
broadinstitute · Jun 7, 2024 · 853bea1 · 853bea1
1 parent f91e419
commit 853bea1
Show file tree

Hide file tree

Showing 3 changed files with 10 additions and 11 deletions.
diff --git a/illumina.py b/illumina.py
@@ -453,7 +453,7 @@ def parser_common_barcodes(parser=argparse.ArgumentParser()):
     parser.add_argument('--JVMmemory',
                         help='JVM virtual memory size (default: %(default)s)',
                         default=tools.picard.ExtractIlluminaBarcodesTool.jvmMemDefault)
-    util.cmd.common_args(parser, (('loglevel', None), ('version', None), ('tmp_dir', None)))
+    util.cmd.common_args(parser, (('threads',None), ('loglevel', None), ('version', None), ('tmp_dir', None)))
     util.cmd.attach_main(parser, main_common_barcodes)
     return parser
 
@@ -506,7 +506,7 @@ def main_common_barcodes(args):
     except IndexError:
         barcode2_len = 0
 
-    count_and_sort_barcodes(barcodes_tmpdir, args.outSummary, barcode1_len, barcode2_len, args.truncateToLength, args.includeNoise, args.omitHeader)
+    count_and_sort_barcodes(barcodes_tmpdir, args.outSummary, barcode1_len, barcode2_len, args.truncateToLength, args.includeNoise, args.omitHeader, args.threads)
 
     # clean up
     os.unlink(barcode_file)

diff --git a/read_utils.py b/read_utils.py
@@ -919,7 +919,7 @@ def _merge_fastqs_and_mvicuna(lb, files):
 
     return readList
 
-def rmdup_mvicuna_bam(inBam, outBam, JVMmemory=None):
+def rmdup_mvicuna_bam(inBam, outBam, JVMmemory=None, threads=None):
     ''' Remove duplicate reads from BAM file using M-Vicuna. The
         primary advantage to this approach over Picard's MarkDuplicates tool
         is that Picard requires that input reads are aligned to a reference,
@@ -943,7 +943,7 @@ def rmdup_mvicuna_bam(inBam, outBam, JVMmemory=None):
     # For each library, merge FASTQs and run rmdup for entire library
     readListAll = mkstempfname('.keep_reads_all.txt')
     per_lb_read_lists = []
-    with concurrent.futures.ProcessPoolExecutor(max_workers=util.misc.available_cpu_count()) as executor:
+    with concurrent.futures.ProcessPoolExecutor(max_workers=threads or util.misc.available_cpu_count()) as executor:
         futures = [executor.submit(_merge_fastqs_and_mvicuna, lb, files) for lb, files in lb_to_files.items()]
         for future in concurrent.futures.as_completed(futures):
             log.info("mvicuna finished processing library")
@@ -972,7 +972,7 @@ def parser_rmdup_mvicuna_bam(parser=argparse.ArgumentParser()):
         default=tools.picard.FilterSamReadsTool.jvmMemDefault,
         help='JVM virtual memory size (default: %(default)s)'
     )
-    util.cmd.common_args(parser, (('loglevel', None), ('version', None), ('tmp_dir', None)))
+    util.cmd.common_args(parser, (('threads',None), ('loglevel', None), ('version', None), ('tmp_dir', None)))
     util.cmd.attach_main(parser, rmdup_mvicuna_bam, split_args=True)
     return parser
 

diff --git a/util/cmd.py b/util/cmd.py
@@ -17,6 +17,7 @@
 
 import util.version
 import util.file
+import util.misc
 
 __author__ = "[email protected]"
 __version__ = util.version.get_version()
@@ -76,15 +77,13 @@ def common_args(parser, arglist=(('tmp_dir', None), ('loglevel', None))):
                     the end, even if there's a failure.""",
                                 default=False)
         elif k == 'threads':
-            if v is None:
-                text_default = "all available cores"
-            else:
-                text_default = v
+            # if v is None, sanitize_thread_count() sets count to all available cores
+            thread_count = util.misc.sanitize_thread_count(v)
             parser.add_argument('--threads',
                                 dest="threads",
                                 type=int,
-                                help="Number of threads (default: {})".format(text_default),
-                                default=v)
+                                help="Number of threads; by default all cores are used",
+                                default=thread_count)
         elif k == 'version':
             if not v:
                 v = __version__