From 6f9e646bcbb6b1d3e4f167325e9355d572f7e5c3 Mon Sep 17 00:00:00 2001 From: Jonathan Golob Date: Thu, 23 May 2024 17:39:28 -0400 Subject: [PATCH] Modifications to support iqtree. (#159) * Modifications to support iqtree. * Bumped up dockerfile to use python-3.12 * break system packages for making docker * Reverting to python 3.11 for docker --------- Co-authored-by: Jonathan Golob --- Dockerfile | 2 +- setup.py | 1 + taxtastic/refpkg.py | 11 ++-- taxtastic/subcommands/update.py | 2 +- taxtastic/utils.py | 93 +++++++++++++++++++++++---------- 5 files changed, 75 insertions(+), 34 deletions(-) diff --git a/Dockerfile b/Dockerfile index f103f570..9b66dce6 100644 --- a/Dockerfile +++ b/Dockerfile @@ -9,7 +9,7 @@ RUN /opt/build/install_pplacer.sh /usr/local COPY setup.py MANIFEST.in README.rst requirements.txt /opt/build COPY taxtastic /opt/build/taxtastic -RUN python3 -m pip install --constraint requirements.txt . +RUN python3 -m pip install --constraint requirements.txt . WORKDIR /opt/run RUN mkdir -p /app /fh /mnt /run/shm diff --git a/setup.py b/setup.py index db76b9ad..5a41ab0c 100644 --- a/setup.py +++ b/setup.py @@ -45,6 +45,7 @@ 'Programming Language :: Python :: 3.9', 'Programming Language :: Python :: 3.10', 'Programming Language :: Python :: 3.11', + 'Programming Language :: Python :: 3.12', 'Topic :: Scientific/Engineering :: Bio-Informatics'], 'download_url': 'https://github.com/fhcrc/taxtastic', # 'package_data': { diff --git a/taxtastic/refpkg.py b/taxtastic/refpkg.py index 04614d4c..e7decad9 100644 --- a/taxtastic/refpkg.py +++ b/taxtastic/refpkg.py @@ -510,12 +510,12 @@ def update_phylo_model(self, stats_type, stats_file, frequency_type=None): This function takes a log generated by RAxML or FastTree, parses it, and inserts an appropriate JSON file into the refpkg. The first - parameter must be 'RAxML', 'PhyML' or 'FastTree', depending on which + parameter must be 'RAxML', 'PhyML', 'IQTREE' or 'FastTree', depending on which program generated the log. It may also be None to attempt to guess which program generated the log. - :param stats_type: Statistics file type. One of 'RAxML', 'FastTree', 'PhyML' - :param stats_file: path to statistics/log file + :param stats_type: Statistics file type. One of 'RAxML', 'FastTree', 'PhyML', 'IQTREE' + :param stats_file: path to statistics/log/iqtree file :param frequency_type: For ``stats_type == 'PhyML'``, amino acid alignments only: was the alignment inferred with ``model`` or ``empirical`` frequencies? @@ -545,6 +545,9 @@ def update_phylo_model(self, stats_type, stats_file, frequency_type=None): elif 'PhyML' in line: stats_type = 'PhyML' break + elif line.startswith("IQ-TREE"): + stats_type = 'IQTREE' + break else: raise ValueError( "couldn't guess log type for %r" % (stats_file,)) @@ -555,6 +558,8 @@ def update_phylo_model(self, stats_type, stats_file, frequency_type=None): parser = utils.parse_raxmlng elif stats_type == 'FastTree': parser = utils.parse_fasttree + elif stats_type == 'IQTREE': + parser = utils.parse_iqtree elif stats_type == 'PhyML': parser = functools.partial(utils.parse_phyml, frequency_type=frequency_type) diff --git a/taxtastic/subcommands/update.py b/taxtastic/subcommands/update.py index f4c7dac0..e5bce5be 100644 --- a/taxtastic/subcommands/update.py +++ b/taxtastic/subcommands/update.py @@ -71,7 +71,7 @@ def build_parser(parser): stats_group = parser.add_argument_group('Tree inference log file parsing ' '(for updating `tree_stats`)') - stats_group.add_argument("--stats-type", choices=('PhyML', 'FastTree', 'RAxML'), + stats_group.add_argument("--stats-type", choices=('PhyML', 'FastTree', 'RAxML', 'IQTREE'), help="""stats file type [default: attempt to guess from file contents]""") stats_group.add_argument("--frequency-type", choices=('empirical', 'model'), diff --git a/taxtastic/utils.py b/taxtastic/utils.py index 4483452e..74c12049 100644 --- a/taxtastic/utils.py +++ b/taxtastic/utils.py @@ -109,35 +109,6 @@ def try_set_fields(d, regex, text, hook=lambda x: x): class InvalidLogError(ValueError): pass -def parse_raxmlng(handle): - """Parse RAxMLng's summary output. - - *handle* should be an open file handle containing the RAxMLng - log. It is parsed and a dictionary returned. - """ - s = handle.read() - result = {} - try_set_fields(result, r'(?PRAxML-NG v. [\d\.]+)', s) - # Less ideal, but for now force DNA and GTR for now - result['datatype'] = 'DNA' - result["subs_model"] = "GTR" - try_set_fields(result, r'\\nModel: (?P[\w\+]+)\\n', s) - result['empirical_frequencies'] = re.search(r'Base frequencies \(ML\)', s) is not None - rates = {} - try_set_fields(rates, r'Substitution rates \(ML\): (?P\d+\.\d+) (?P\d+\.\d+) (?P\d+\.\d+) (?P\d+\.\d+) (?P\d+\.\d+) (?P\d+\.\d+)', s, hook=float) - if len(rates) > 0: - result['subs_rates'] = rates - gamma = {} - try_set_fields(gamma, r'Rate heterogeneity: GAMMA \((?P\d+) cats, mean\), alpha: (?P\d+\.\d+)', s, hook=float) - try: - gamma['n_cats'] = int(gamma['n_cats']) - except: - pass - result['gamma'] = gamma - result['ras_model'] = 'gamma' - - return result - # https://github.com/amkozlov/raxml-ng/wiki/Input-data#single-mode DNA = ['JC', 'K80', 'F81', 'HKY', 'TN93ef', 'TN93', 'K81', 'K81uf', 'TPM2', 'TPM2uf', 'TPM3', 'TPM3uf', 'TIM1', 'TIM1uf', 'TIM2', 'TIM2uf', @@ -179,6 +150,70 @@ def parse_raxmlng(handle): return result +def parse_iqtree(handle): + """Parse iqtrees's summary output. + *handle* should be an open file handle containing the ".iqtree" output file + It is parsed and a dictionary returned. + """ + s = handle.read() + result = {} + try_set_fields(result, r'(?PIQ-TREE [\d\.]+)', s) + try_set_fields(result, r'Model of substitution: (?P[\w\+]+)\n', s) + if 'model' in result: + models = set(result['model'].split('+')) + # Assume here that the substitution models are the same as RAxML-ng. + # Maybe need to be corrected in the future. + subs_model = list(set(DNA).intersection(models)) + if len(subs_model) > 0: + result['subs_model'] = subs_model[0] + result['datatype'] = 'DNA' + else: + subs_model = list(set(PROT).intersection(models)) + if len(subs_model) > 0: + result['subs_model'] = subs_model[0] + result['datatype'] = 'AA' + # Implicit else here both if no model, and/or no model overlapping with substitutions + + # Were these empiric frequencies? + result['empirical_frequencies'] = re.search( + r'State frequencies: \(empirical counts from alignment\)\n', s) is not None + + # Grab the rates (if available) + re_rate = re.compile(r'(?P[A-Z])\-(?P[A-Z]): (?P\d+\.\d+)\n') + rates = { + f'{f.lower()}{s.lower()}': float(r) + for f, s, r in + re_rate.findall(s) + } + if len(rates) > 0: + result['subs_rates'] = rates + + frequencies = { + b: float(f) + for (b, f) in + re.findall( + r'pi\((?P[ACTG])\) = (\d+\.\d+)\n', + s + ) + } + if len(frequencies) > 0: + result['frequencies'] = frequencies + # Gamma (if done) + gamma = {} + try_set_fields( + gamma, + r'Model of rate heterogeneity: Gamma with (?P\d+) categories\nGamma shape alpha: (?P\d+\.\d+)\n', + s, + hook=float + ) + if gamma: + gamma['n_cats'] = int(gamma['n_cats']) + result['gamma'] = gamma + result['ras_model'] = 'gamma' + + return result + + def parse_raxml(handle): """Parse RAxML's summary output.