From a77edd5dbe5863a4a331b5dad17b8b96377d15ad Mon Sep 17 00:00:00 2001 From: Christopher Tomkins-Tinch Date: Tue, 9 Jun 2020 15:00:34 -0400 Subject: [PATCH] JVMmemory parse fix for downsample_bams fix to parsing of passed JVMmemory value so it is interpreted correctly before dividing among worker threads; add util.misc.convert_size_str() to read and convert JVM-like mem spec strings --- read_utils.py | 9 +++++---- util/misc.py | 25 +++++++++++++++++++++++++ 2 files changed, 30 insertions(+), 4 deletions(-) diff --git a/read_utils.py b/read_utils.py index ad4cce4a..923deaab 100755 --- a/read_utils.py +++ b/read_utils.py @@ -442,9 +442,10 @@ def downsample_bams(data_pairs, downsample_target, threads=None, JVMmemory=None) downsamplesam = tools.picard.DownsampleSamTool() workers = util.misc.sanitize_thread_count(threads) JVMmemory = JVMmemory if JVMmemory else tools.picard.DownsampleSamTool.jvmMemDefault - jvm_worker_memory = str(max(1,int(JVMmemory.rstrip("g"))/workers))+'g' + + jvm_worker_memory_mb = str(int(util.misc.convert_size_str(JVMmemory,"m")[:-1])//workers)+"m" with concurrent.futures.ThreadPoolExecutor(max_workers=workers) as executor: - future_to_file = {executor.submit(downsamplesam.downsample_to_approx_count, *(list(fp)+[downsample_target]), JVMmemory=jvm_worker_memory): fp[0] for fp in data_pairs} + future_to_file = {executor.submit(downsamplesam.downsample_to_approx_count, *(list(fp)+[downsample_target]), JVMmemory=jvm_worker_memory_mb): fp[0] for fp in data_pairs} for future in concurrent.futures.as_completed(future_to_file): f = future_to_file[future] try: @@ -455,9 +456,9 @@ def downsample_bams(data_pairs, downsample_target, threads=None, JVMmemory=None) def dedup_bams(data_pairs, threads=None, JVMmemory=None): workers = util.misc.sanitize_thread_count(threads) JVMmemory = JVMmemory if JVMmemory else tools.picard.DownsampleSamTool.jvmMemDefault - jvm_worker_memory = str(max(1,int(JVMmemory.rstrip("g"))/workers))+'g' + jvm_worker_memory_mb = str(int(util.misc.convert_size_str(JVMmemory,"m")[:-1])//workers)+"m" with concurrent.futures.ThreadPoolExecutor(max_workers=workers) as executor: - future_to_file = {executor.submit(rmdup_mvicuna_bam, *fp, JVMmemory=jvm_worker_memory): fp[0] for fp in data_pairs} + future_to_file = {executor.submit(rmdup_mvicuna_bam, *fp, JVMmemory=jvm_worker_memory_mb): fp[0] for fp in data_pairs} for future in concurrent.futures.as_completed(future_to_file): f = future_to_file[future] try: diff --git a/util/misc.py b/util/misc.py index 8fc59099..a60f6cb7 100644 --- a/util/misc.py +++ b/util/misc.py @@ -562,3 +562,28 @@ def wraps(f): def unwrap(f): """Find the original function under layers of wrappers""" return f if not hasattr(f, '__wrapped__') else unwrap(f.__wrapped__) + +def convert_size_str(input_size_str, output_unit="m", round_number=True): + """ intended to convert a jvm-style size spec to an int value of the desired unit """ + unit2factor = {'k': 1024, 'm': 1024**2, 'g': 1024**3, 't': 1024**4} + output_unit = output_unit.lower() + + size_spec_pattern = re.compile(r"(.*)([kmgt])") + + m = re.search(size_spec_pattern, input_size_str) + if m: + if m.group(1) and m.group(2): + input_size=float(m.group(1)) + input_unit=m.group(2).lower() + + if input_unit not in unit2factor.keys(): + raise TypeError("Error parsing size from string: %s" % input_size_str) + + # convert to bytes + size_in_bytes = input_size * unit2factor[input_unit] + + # convert to desired size + if round_number: + return str(round(max(1,float(size_in_bytes)/unit2factor[output_unit])))+output_unit + else: + return str(float(size_in_bytes)/unit2factor[output_unit])+output_unit