Skip to content

Commit

Permalink
New subcommand taxit named that filters unclassified, unnamed taxonom…
Browse files Browse the repository at this point in the history
…y ids
  • Loading branch information
crosenth committed Nov 15, 2023
1 parent faf1947 commit bc1bdeb
Show file tree
Hide file tree
Showing 3 changed files with 90 additions and 0 deletions.
1 change: 1 addition & 0 deletions CHANGES.rst
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@ Upcoming
========

* Updated NCBI taxdmp.zip download protocol from ftp to https [GH-156]
* New subcommand `taxit named` that filters unclassified, unnamed taxonomy ids

0.10.1
======
Expand Down
79 changes: 79 additions & 0 deletions taxtastic/subcommands/named.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,79 @@
# This file is part of taxtastic.
#
# taxtastic is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# taxtastic is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with taxtastic. If not, see <http://www.gnu.org/licenses/>.
"""
Filters unclassified, unnamed taxonomy ids
"""
import argparse
import csv
import sqlalchemy
import sys
from taxtastic.utils import add_database_args
from taxtastic.taxonomy import Taxonomy


def build_parser(parser):
    """Register the command-line arguments of this subcommand on ``parser``.

    Database options are added by ``add_database_args``. Exactly one source
    of tax_ids must be given: ``--tax-ids``, ``--tax-id-file`` or
    ``--seq-info`` (they form a required, mutually exclusive group).

    :param parser: argparse parser (or subparser) to populate.
    """
    parser = add_database_args(parser)
    input_group = parser.add_mutually_exclusive_group(required=True)
    input_group.add_argument(
        '-t', '--tax-ids',
        nargs='+',
        help='one or more space-delimited tax_ids (eg "-t 47770 33945")')
    input_group.add_argument(
        '-f', '--tax-id-file',
        metavar='FILE',
        type=argparse.FileType('rt'),
        # Closing parenthesis was missing in the rendered help text.
        help=('File containing a whitespace-delimited list of '
              'tax_ids (ie, separated by tabs, spaces, or newlines).'))
    input_group.add_argument(
        '-i', '--seq-info',
        type=argparse.FileType('rt'),
        help=('Read tax_ids from sequence info file, minimally '
              'containing a column named "tax_id"'))
    parser.add_argument(
        '--ranked',
        action='store_true',
        help='Ignore "no rank" taxonomies [%(default)s]')
    parser.add_argument(
        '-o', '--outfile',
        type=argparse.FileType('wt'),
        default=sys.stdout,
        metavar='FILE',
        # Adjacent string literals are concatenated as-is, so the separating
        # space has to be part of the first fragment.
        help=('Output file containing named taxonomy ids; '
              'writes to stdout if unspecified'))


def action(args):
    """Write the input tax_ids (or seq-info rows) that the taxonomy keeps.

    The tax_ids come from ``args.tax_ids``, ``args.tax_id_file`` or the
    ``tax_id`` column of ``args.seq_info``. They are passed to
    ``Taxonomy.named`` and only the ones it returns are written to
    ``args.outfile``:

    * for ``--seq-info`` input, the matching rows are written as CSV with
      the input header;
    * otherwise the matching tax_ids are written one per line, in input
      order.

    :param args: parsed argparse namespace produced by ``build_parser``.
    """
    engine = sqlalchemy.create_engine(args.url, echo=args.verbosity > 3)
    tax = Taxonomy(engine, schema=args.schema)

    rows = None
    fieldnames = None
    if args.tax_ids:
        tax_ids = args.tax_ids
    elif args.tax_id_file:
        # The option is documented as whitespace-delimited (tabs, spaces or
        # newlines), so split on any whitespace rather than only on lines.
        tax_ids = args.tax_id_file.read().split()
    else:
        # The parser's required, mutually exclusive input group leaves
        # --seq-info as the only remaining source.
        reader = csv.DictReader(args.seq_info)
        # Read the rows once and reuse them for the output pass; rewinding
        # with seek(0) fails on non-seekable input such as stdin or a pipe.
        rows = list(reader)
        fieldnames = reader.fieldnames
        tax_ids = [row['tax_id'] for row in rows]

    named = set(tax.named(set(tax_ids), no_rank=not args.ranked))

    if rows is not None:
        if fieldnames is None:
            # Empty input: there is no header to echo and no rows to filter.
            return
        out = csv.DictWriter(args.outfile, fieldnames=fieldnames)
        out.writeheader()
        out.writerows(row for row in rows if row['tax_id'] in named)
    else:
        args.outfile.writelines(
            tax_id + '\n' for tax_id in tax_ids if tax_id in named)
10 changes: 10 additions & 0 deletions taxtastic/taxonomy.py
Original file line number Diff line number Diff line change
Expand Up @@ -824,3 +824,13 @@ def species_below(self, tax_id):
newc = self.species_below(c)
assert self.is_ancestor_of(newc, tax_id)
return newc

def named(self, taxids, no_rank=True):
    """Return the subset of ``taxids`` whose ``nodes`` row has ``is_valid`` set.

    :param taxids: collection of tax_ids to look up in the ``nodes`` table.
    :param no_rank: when false, rows whose rank is ``'no_rank'`` are
        excluded as well.
    :returns: list of matching tax_ids, in the order the query returns them.
    """
    nodes = self.nodes
    # Collect every filter first so a single AND clause is emitted.
    criteria = [nodes.c.tax_id.in_(taxids), nodes.c.is_valid]
    if not no_rank:
        criteria.append(nodes.c.rank != 'no_rank')
    query = select(nodes.c.tax_id).where(and_(*criteria))
    return [row[0] for row in self.fetchall(query)]

0 comments on commit bc1bdeb

Please sign in to comment.