Merge pull request #26 from broadinstitute/ct-downsample-jvm-parsefix
JVMmemory parse fix for downsample_bams
tomkinsc committed Jun 9, 2020
2 parents 00e55c9 + a77edd5 commit 4eb88a8
Showing 2 changed files with 30 additions and 4 deletions.
9 changes: 5 additions & 4 deletions read_utils.py
@@ -442,9 +442,10 @@ def downsample_bams(data_pairs, downsample_target, threads=None, JVMmemory=None)
     downsamplesam = tools.picard.DownsampleSamTool()
     workers = util.misc.sanitize_thread_count(threads)
     JVMmemory = JVMmemory if JVMmemory else tools.picard.DownsampleSamTool.jvmMemDefault
-    jvm_worker_memory = str(max(1,int(JVMmemory.rstrip("g"))/workers))+'g'
+
+    jvm_worker_memory_mb = str(int(util.misc.convert_size_str(JVMmemory,"m")[:-1])//workers)+"m"
     with concurrent.futures.ThreadPoolExecutor(max_workers=workers) as executor:
-        future_to_file = {executor.submit(downsamplesam.downsample_to_approx_count, *(list(fp)+[downsample_target]), JVMmemory=jvm_worker_memory): fp[0] for fp in data_pairs}
+        future_to_file = {executor.submit(downsamplesam.downsample_to_approx_count, *(list(fp)+[downsample_target]), JVMmemory=jvm_worker_memory_mb): fp[0] for fp in data_pairs}
         for future in concurrent.futures.as_completed(future_to_file):
             f = future_to_file[future]
             try:
@@ -455,9 +456,9 @@ def downsample_bams(data_pairs, downsample_target, threads=None, JVMmemory=None)
 def dedup_bams(data_pairs, threads=None, JVMmemory=None):
     workers = util.misc.sanitize_thread_count(threads)
     JVMmemory = JVMmemory if JVMmemory else tools.picard.DownsampleSamTool.jvmMemDefault
-    jvm_worker_memory = str(max(1,int(JVMmemory.rstrip("g"))/workers))+'g'
+    jvm_worker_memory_mb = str(int(util.misc.convert_size_str(JVMmemory,"m")[:-1])//workers)+"m"
     with concurrent.futures.ThreadPoolExecutor(max_workers=workers) as executor:
-        future_to_file = {executor.submit(rmdup_mvicuna_bam, *fp, JVMmemory=jvm_worker_memory): fp[0] for fp in data_pairs}
+        future_to_file = {executor.submit(rmdup_mvicuna_bam, *fp, JVMmemory=jvm_worker_memory_mb): fp[0] for fp in data_pairs}
         for future in concurrent.futures.as_completed(future_to_file):
             f = future_to_file[future]
             try:
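
Aside (not part of the commit): a minimal sketch of what the parse fix changes. The old per-worker expression used Python 3's true division, so it could produce a fractional size string that java's -Xmx flag does not parse; the new code converts the budget to megabytes and integer-divides it across workers, so the result is always a whole-number spec. Illustrative values only (JVMmemory="4g", workers=3):

# hypothetical values for illustration; convert_size_str is added in util/misc.py below
import util.misc
JVMmemory, workers = "4g", 3
old = str(max(1, int(JVMmemory.rstrip("g")) / workers)) + 'g'
# -> "1.3333333333333333g" -- fractional, not a valid JVM heap size
new = str(int(util.misc.convert_size_str(JVMmemory, "m")[:-1]) // workers) + "m"
# -> "1365m" -- always an integer number of megabytes
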
25 changes: 25 additions & 0 deletions util/misc.py
@@ -562,3 +562,28 @@ def wraps(f):
 def unwrap(f):
     """Find the original function under layers of wrappers"""
     return f if not hasattr(f, '__wrapped__') else unwrap(f.__wrapped__)
+
+def convert_size_str(input_size_str, output_unit="m", round_number=True):
+    """ intended to convert a jvm-style size spec to an int value of the desired unit """
+    unit2factor = {'k': 1024, 'm': 1024**2, 'g': 1024**3, 't': 1024**4}
+    output_unit = output_unit.lower()
+
+    size_spec_pattern = re.compile(r"(.*)([kmgt])")
+
+    m = re.search(size_spec_pattern, input_size_str)
+    if m:
+        if m.group(1) and m.group(2):
+            input_size=float(m.group(1))
+            input_unit=m.group(2).lower()
+
+            if input_unit not in unit2factor.keys():
+                raise TypeError("Error parsing size from string: %s" % input_size_str)
+
+            # convert to bytes
+            size_in_bytes = input_size * unit2factor[input_unit]
+
+            # convert to desired size
+            if round_number:
+                return str(round(max(1,float(size_in_bytes)/unit2factor[output_unit])))+output_unit
+            else:
+                return str(float(size_in_bytes)/unit2factor[output_unit])+output_unit
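
For reference, a quick usage sketch of the helper added above (illustrative values, not part of the commit):

# assumes the repo root is on PYTHONPATH so util.misc is importable
from util.misc import convert_size_str
convert_size_str("4g")                            # -> "4096m" (default output unit is megabytes)
convert_size_str("512m", "k")                     # -> "524288k"
convert_size_str("4g", "g", round_number=False)   # -> "4.0g"
# the callers in read_utils.py strip the unit suffix before splitting across workers:
int(convert_size_str("4g", "m")[:-1]) // 3        # -> 1365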
