diff --git a/metaquast.py b/metaquast.py index 462ada6f8e..09e6d84ea8 100755 --- a/metaquast.py +++ b/metaquast.py @@ -26,7 +26,7 @@ from quast_libs.options_parser import parse_options, remove_from_quast_py_args, prepare_regular_quast_args from quast_libs import contigs_analyzer, search_references_meta, plotter_data, qutils, run_busco -from quast_libs.qutils import cleanup, check_dirpath, is_python2, run_parallel +from quast_libs.qutils import cleanup, check_dirpath, run_parallel from quast_libs.log import get_logger logger = get_logger(qconfig.LOGGER_META_NAME) diff --git a/quast_libs/contigs_analyzer.py b/quast_libs/contigs_analyzer.py index 1a8f8ed296..0dc4b1b92d 100644 --- a/quast_libs/contigs_analyzer.py +++ b/quast_libs/contigs_analyzer.py @@ -36,7 +36,7 @@ from quast_libs.fastaparser import get_genome_stats from quast_libs.log import get_logger -from quast_libs.qutils import is_python2, run_parallel +from quast_libs.qutils import run_parallel logger = get_logger(qconfig.LOGGER_DEFAULT_NAME) diff --git a/quast_libs/qutils.py b/quast_libs/qutils.py index aa8b6715b9..8ea268a296 100644 --- a/quast_libs/qutils.py +++ b/quast_libs/qutils.py @@ -206,22 +206,15 @@ def correct_contigs(contigs_fpaths, corrected_dirpath, labels, reporting): return corrected_contigs_fpaths, old_contigs_fpaths -def convert_to_unicode(value): - if is_python2(): - return unicode(value) - else: - return str(value) - - def slugify(value): """ Prepare string to use in file names: normalizes string, removes non-alpha characters, and converts spaces to hyphens. 
""" import unicodedata - value = unicodedata.normalize('NFKD', convert_to_unicode(value)).encode('ascii', 'ignore').decode('utf-8') - value = convert_to_unicode(re.sub('[^\w\s-]', '-', value).strip()) - value = convert_to_unicode(re.sub('[-\s]+', '-', value)) + value = unicodedata.normalize('NFKD', str(value)).encode('ascii', 'ignore').decode('utf-8') + value = str(re.sub('[^\w\s-]', '-', value).strip()) + value = str(re.sub('[-\s]+', '-', value)) return str(value) @@ -831,10 +824,6 @@ def safe_create(fpath, logger, is_required=False): logger.notice(msg) -def is_python2(): - return sys.version_info[0] < 3 - - def fix_configure_timestamps(dirpath): try: os.utime(join(dirpath, 'aclocal.m4'), None) @@ -890,8 +879,18 @@ def compile_tool(name, dirpath, requirements, just_notice=False, logger=logger, def check_dirpath(path, message="", exit_code=3): - if not is_ascii_string(path): - logger.error('QUAST does not support non-ASCII characters in path.\n' + message, to_stderr=True, exit_with_code=exit_code) + """ + This function checks that the path string contains only ASCII characters and doesn't contain spaces. 
+ :param path: path string to check + :param message: extra text appended to the logged error if the path is rejected + :param exit_code: exit code passed to logger.error when the path is rejected + """ + try: + path.encode('ascii') + except UnicodeEncodeError: + logger.error('QUAST does not support non-ASCII characters in path.\n' + message, to_stderr=True, + exit_with_code=exit_code) if ' ' in path: logger.error('QUAST does not support spaces in paths.\n' + message, to_stderr=True, exit_with_code=exit_code) return True @@ -1063,10 +1062,7 @@ def run_parallel(_fn, fn_args, n_jobs=None, filter_results=False): except TypeError: pass except ImportError: - if is_python2(): - from joblib2 import Parallel, delayed - else: - from joblib3 import Parallel, delayed + from joblib3 import Parallel, delayed results_tuples = Parallel(**parallel_args)(delayed(_fn)(*args) for args in fn_args) results = [] if results_tuples: @@ -1081,18 +1077,6 @@ def run_parallel(_fn, fn_args, n_jobs=None, filter_results=False): return results -# based on http://stackoverflow.com/questions/196345/how-to-check-if-a-string-in-python-is-in-ascii -def is_ascii_string(line): - try: - line.encode('ascii') - except UnicodeDecodeError: # python2 - return False - except UnicodeEncodeError: # python3 - return False - else: - return True - - def md5(fpath): hash_md5 = hashlib.md5() with open(fpath, 'rb') as f: @@ -1120,18 +1104,3 @@ def verify_md5(fpath, md5_fpath=None): logger.warning('Failed to check md5 for %s! Either this file or its md5 file (%s) is missing or empty.' 
% (fpath, md5_fpath)) return False - - -def percentile(values, percent): - import math - percentile_idx = int(math.ceil((len(values) * percent) / 100)) - 1 - return values[max(0, percentile_idx)] - - -def calc_median(values): - if len(values) % 2 == 1: # odd number of values - median = values[(len(values) - 1) // 2] - else: # even number of values - take the avg of central - median = (values[len(values) // 2] + values[len(values) // 2 - 1]) // 2 - return median - diff --git a/quast_libs/qutils_test.py b/quast_libs/qutils_test.py new file mode 100644 index 0000000000..77592f7267 --- /dev/null +++ b/quast_libs/qutils_test.py @@ -0,0 +1,27 @@ +import sys +import os +import unittest + +sys.path.append(os.path.normpath(os.path.join(os.path.dirname(__file__), os.path.pardir))) + +import quast_libs.qutils as qq + + +class TestCheckDirPath(unittest.TestCase): + def test_check_wrong_format(self): + s = "♥O◘♦♥O◘♦" + with self.assertRaises(SystemExit): + qq.check_dirpath(s) + + def test_check_spaces(self): + s = " misha@misha:~$" + with self.assertRaises(SystemExit): + qq.check_dirpath(s) + + def test_check_right_format(self): + s = "misha@misha:~$" + self.assertTrue(qq.check_dirpath(s)) + + +if __name__ == '__main__': + unittest.main() diff --git a/quast_libs/reads_analyzer.py b/quast_libs/reads_analyzer.py index c4a4d539de..eb864738a5 100644 --- a/quast_libs/reads_analyzer.py +++ b/quast_libs/reads_analyzer.py @@ -11,6 +11,7 @@ import re import shutil import shlex +import numpy as np from collections import defaultdict from math import sqrt from os.path import isfile, join, basename, abspath, isdir, dirname, exists @@ -20,7 +21,7 @@ from quast_libs.fastaparser import create_fai_file from quast_libs.ra_utils.misc import * from quast_libs.qutils import is_non_empty_file, add_suffix, get_chr_len_fpath, run_parallel, \ - get_path_to_program, check_java_version, percentile, calc_median + get_path_to_program, check_java_version from quast_libs.log import get_logger from 
quast_libs.reporting import save_reads @@ -849,8 +850,8 @@ def proceed_cov_file(raw_cov_fpath, cov_fpath, correct_chr_names): def get_max_min_is(insert_sizes): - decile_1 = percentile(insert_sizes, 10) - decile_9 = percentile(insert_sizes, 90) + decile_1 = np.percentile(insert_sizes, 10, interpolation='lower') + decile_9 = np.percentile(insert_sizes, 90, interpolation='lower') return decile_1, decile_9 @@ -883,7 +884,7 @@ def calculate_insert_size(sam_fpath, output_dir, ref_name, reads_suffix=''): if insert_sizes: insert_sizes.sort() - median_is = calc_median(insert_sizes) + median_is = np.percentile(insert_sizes, 50, interpolation='lower') if median_is <= 0: return None, None, None min_insert_size, max_insert_size = get_max_min_is(insert_sizes)