Skip to content

Commit

Permalink
Modifications to support iqtree. (#159)
Browse files Browse the repository at this point in the history
* Modifications to support iqtree.

* Bumped up dockerfile to use python-3.12

* break system packages for making docker

* Reverting to python 3.11 for docker

---------

Co-authored-by: Jonathan Golob <[email protected]>
  • Loading branch information
jgolob and Jonathan Golob authored May 23, 2024
1 parent 32c3d7f commit 6f9e646
Show file tree
Hide file tree
Showing 5 changed files with 75 additions and 34 deletions.
2 changes: 1 addition & 1 deletion Dockerfile
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,7 @@ RUN /opt/build/install_pplacer.sh /usr/local

COPY setup.py MANIFEST.in README.rst requirements.txt /opt/build
COPY taxtastic /opt/build/taxtastic
RUN python3 -m pip install --constraint requirements.txt .
RUN python3 -m pip install --constraint requirements.txt .

WORKDIR /opt/run
RUN mkdir -p /app /fh /mnt /run/shm
Expand Down
1 change: 1 addition & 0 deletions setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -45,6 +45,7 @@
'Programming Language :: Python :: 3.9',
'Programming Language :: Python :: 3.10',
'Programming Language :: Python :: 3.11',
'Programming Language :: Python :: 3.12',
'Topic :: Scientific/Engineering :: Bio-Informatics'],
'download_url': 'https://github.com/fhcrc/taxtastic',
# 'package_data': {
Expand Down
11 changes: 8 additions & 3 deletions taxtastic/refpkg.py
Original file line number Diff line number Diff line change
Expand Up @@ -510,12 +510,12 @@ def update_phylo_model(self, stats_type, stats_file, frequency_type=None):
This function takes a log generated by RAxML or FastTree, parses it,
and inserts an appropriate JSON file into the refpkg. The first
parameter must be 'RAxML', 'PhyML' or 'FastTree', depending on which
parameter must be 'RAxML', 'PhyML', 'IQTREE' or 'FastTree', depending on which
program generated the log. It may also be None to attempt to guess
which program generated the log.
:param stats_type: Statistics file type. One of 'RAxML', 'FastTree', 'PhyML'
:param stats_file: path to statistics/log file
:param stats_type: Statistics file type. One of 'RAxML', 'FastTree', 'PhyML', 'IQTREE'
:param stats_file: path to statistics/log/iqtree file
:param frequency_type: For ``stats_type == 'PhyML'``, amino acid
alignments only: was the alignment inferred with ``model`` or
``empirical`` frequencies?
Expand Down Expand Up @@ -545,6 +545,9 @@ def update_phylo_model(self, stats_type, stats_file, frequency_type=None):
elif 'PhyML' in line:
stats_type = 'PhyML'
break
elif line.startswith("IQ-TREE"):
stats_type = 'IQTREE'
break
else:
raise ValueError(
"couldn't guess log type for %r" % (stats_file,))
Expand All @@ -555,6 +558,8 @@ def update_phylo_model(self, stats_type, stats_file, frequency_type=None):
parser = utils.parse_raxmlng
elif stats_type == 'FastTree':
parser = utils.parse_fasttree
elif stats_type == 'IQTREE':
parser = utils.parse_iqtree
elif stats_type == 'PhyML':
parser = functools.partial(utils.parse_phyml,
frequency_type=frequency_type)
Expand Down
2 changes: 1 addition & 1 deletion taxtastic/subcommands/update.py
Original file line number Diff line number Diff line change
Expand Up @@ -71,7 +71,7 @@ def build_parser(parser):

stats_group = parser.add_argument_group('Tree inference log file parsing '
'(for updating `tree_stats`)')
stats_group.add_argument("--stats-type", choices=('PhyML', 'FastTree', 'RAxML'),
stats_group.add_argument("--stats-type", choices=('PhyML', 'FastTree', 'RAxML', 'IQTREE'),
help="""stats file type [default: attempt to guess from
file contents]""")
stats_group.add_argument("--frequency-type", choices=('empirical', 'model'),
Expand Down
93 changes: 64 additions & 29 deletions taxtastic/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -109,35 +109,6 @@ def try_set_fields(d, regex, text, hook=lambda x: x):
class InvalidLogError(ValueError):
pass

def parse_raxmlng(handle):
"""Parse RAxMLng's summary output.
*handle* should be an open file handle containing the RAxMLng
log. It is parsed and a dictionary returned.
"""
s = handle.read()
result = {}
try_set_fields(result, r'(?P<program>RAxML-NG v. [\d\.]+)', s)
# Less ideal, but for now force DNA and GTR for now
result['datatype'] = 'DNA'
result["subs_model"] = "GTR"
try_set_fields(result, r'\\nModel: (?P<subs_model>[\w\+]+)\\n', s)
result['empirical_frequencies'] = re.search(r'Base frequencies \(ML\)', s) is not None
rates = {}
try_set_fields(rates, r'Substitution rates \(ML\): (?P<ac>\d+\.\d+) (?P<ag>\d+\.\d+) (?P<at>\d+\.\d+) (?P<cg>\d+\.\d+) (?P<ct>\d+\.\d+) (?P<gt>\d+\.\d+)', s, hook=float)
if len(rates) > 0:
result['subs_rates'] = rates
gamma = {}
try_set_fields(gamma, r'Rate heterogeneity: GAMMA \((?P<n_cats>\d+) cats, mean\), alpha: (?P<alpha>\d+\.\d+)', s, hook=float)
try:
gamma['n_cats'] = int(gamma['n_cats'])
except:
pass
result['gamma'] = gamma
result['ras_model'] = 'gamma'

return result

# https://github.com/amkozlov/raxml-ng/wiki/Input-data#single-mode
DNA = ['JC', 'K80', 'F81', 'HKY', 'TN93ef', 'TN93', 'K81', 'K81uf', 'TPM2',
'TPM2uf', 'TPM3', 'TPM3uf', 'TIM1', 'TIM1uf', 'TIM2', 'TIM2uf',
Expand Down Expand Up @@ -179,6 +150,70 @@ def parse_raxmlng(handle):
return result


def parse_iqtree(handle):
"""Parse iqtrees's summary output.
*handle* should be an open file handle containing the ".iqtree" output file
It is parsed and a dictionary returned.
"""
s = handle.read()
result = {}
try_set_fields(result, r'(?P<program>IQ-TREE [\d\.]+)', s)
try_set_fields(result, r'Model of substitution: (?P<model>[\w\+]+)\n', s)
if 'model' in result:
models = set(result['model'].split('+'))
# Assume here that the substitution models are the same as RAxML-ng.
# Maybe need to be corrected in the future.
subs_model = list(set(DNA).intersection(models))
if len(subs_model) > 0:
result['subs_model'] = subs_model[0]
result['datatype'] = 'DNA'
else:
subs_model = list(set(PROT).intersection(models))
if len(subs_model) > 0:
result['subs_model'] = subs_model[0]
result['datatype'] = 'AA'
# Implicit else here both if no model, and/or no model overlapping with substitutions

# Were these empiric frequencies?
result['empirical_frequencies'] = re.search(
r'State frequencies: \(empirical counts from alignment\)\n', s) is not None

# Grab the rates (if available)
re_rate = re.compile(r'(?P<first>[A-Z])\-(?P<second>[A-Z]): (?P<rate>\d+\.\d+)\n')
rates = {
f'{f.lower()}{s.lower()}': float(r)
for f, s, r in
re_rate.findall(s)
}
if len(rates) > 0:
result['subs_rates'] = rates

frequencies = {
b: float(f)
for (b, f) in
re.findall(
r'pi\((?P<base>[ACTG])\) = (\d+\.\d+)\n',
s
)
}
if len(frequencies) > 0:
result['frequencies'] = frequencies
# Gamma (if done)
gamma = {}
try_set_fields(
gamma,
r'Model of rate heterogeneity: Gamma with (?P<n_cats>\d+) categories\nGamma shape alpha: (?P<alpha>\d+\.\d+)\n',
s,
hook=float
)
if gamma:
gamma['n_cats'] = int(gamma['n_cats'])
result['gamma'] = gamma
result['ras_model'] = 'gamma'

return result


def parse_raxml(handle):
"""Parse RAxML's summary output.
Expand Down

0 comments on commit 6f9e646

Please sign in to comment.