Skip to content

Commit

Permalink
Added option with unknown taxids to continue building taxtable with o…
Browse files Browse the repository at this point in the history
…nly known tax_ids
  • Loading branch information
crosenth committed Dec 13, 2023
1 parent 8092b43 commit be4753b
Show file tree
Hide file tree
Showing 3 changed files with 22 additions and 1 deletion.
1 change: 1 addition & 0 deletions CHANGES.rst
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@ Upcoming
* Updated NCBI taxdmp.zip download protocol from ftp to https [GH-156]
* New subcommand `taxit named` that filters unclassified, unnamed taxonomy ids
* Subcommand update_taxids can now process a tab delimited headerless file
* Subcommand taxtable has a new option (--unknown-action) to warn and continue when tax_ids are not present in the database

0.10.1
======
Expand Down
15 changes: 14 additions & 1 deletion taxtastic/subcommands/taxtable.py
Original file line number Diff line number Diff line change
Expand Up @@ -102,6 +102,13 @@ def build_parser(parser):
help=('Read tax_ids from sequence info file, minimally '
'containing a column named "tax_id"'))

parser.add_argument(
'-a', '--unknown-action',
choices=['error', 'warn'],
default='error',
help='action to perform for tax_ids '
'not present in database [%(default)s]')

output_group = parser.add_argument_group(
"Output options").add_mutually_exclusive_group()

Expand All @@ -121,13 +128,19 @@ def action(args):
elif args.tax_id_file:
tax_ids = set(args.tax_id_file.read().split())
elif args.seq_info:
tax_ids = {row['tax_id'] for row in csv.DictReader(args.seq_info)}
tax_ids = set(row['tax_id'] for row in csv.DictReader(args.seq_info))
else:
sys.exit('Error: no tax_ids were specified')

engine = sqlalchemy.create_engine(args.url, echo=args.verbosity > 3)
tax = Taxonomy(engine, schema=args.schema)

if args.unknown_action == 'warn':
unknowns = tax.unknowns(tax_ids)
log.warn('Unknown tax_ids not '
'represented in output: ' + str(sorted(unknowns)))
tax_ids = set(tax_ids) - set(unknowns)

rows = tax._get_lineage_table(tax_ids)

log.info('grouping lineages')
Expand Down
7 changes: 7 additions & 0 deletions taxtastic/taxonomy.py
Original file line number Diff line number Diff line change
Expand Up @@ -503,6 +503,13 @@ def has_node(self, tax_id):
.filter_by(tax_id=tax_id))
return bool(result)

def unknowns(self, tax_ids):
    """Return the tax_ids that are not found in ``self.nodes``.

    Args:
        tax_ids: iterable of tax_id values to look up.

    Returns:
        A list of the supplied tax_ids with no matching ``tax_id`` in
        ``self.nodes``, in the order they were supplied (duplicates
        are kept).
    """
    # Materialize once: the ids are needed both to build the IN clause
    # and to compute the difference, so a one-shot iterable would
    # otherwise be exhausted before the second use.
    tax_ids = list(tax_ids)
    if not tax_ids:
        # Nothing to look up; skip the database round trip.
        return []
    rows = self.fetchall(
        select(self.nodes.c.tax_id)
        .filter(self.nodes.c.tax_id.in_(tax_ids)))
    # Each row is a 1-tuple holding the tax_id column.
    known = {tax_id for tax_id, in rows}
    return [tax_id for tax_id in tax_ids if tax_id not in known]

def add_node(self, tax_id, parent_id, rank, names, source_name,
children=None, is_valid=True, execute=True, **ignored):
"""Add a node to the taxonomy.
Expand Down

0 comments on commit be4753b

Please sign in to comment.