Removing references to fastalite that are not reading or writing fasta files
fhcrc · Nov 17, 2023 · 7037e85 · 7037e85
1 parent 24727db
commit 7037e85
Show file tree

Hide file tree

Showing 5 changed files with 47 additions and 14 deletions.
diff --git a/taxtastic/subcommands/add_nodes.py b/taxtastic/subcommands/add_nodes.py
@@ -27,10 +27,9 @@
 import traceback
 
 import yaml
-from fastalite import Opener
 
 from taxtastic.taxonomy import Taxonomy
-from taxtastic.utils import add_database_args
+from taxtastic.utils import add_database_args, Opener
 
 log = logging.getLogger(__name__)
 

diff --git a/taxtastic/subcommands/extract_nodes.py b/taxtastic/subcommands/extract_nodes.py
@@ -24,10 +24,9 @@
 # from collections import OrderedDict
 
 import yaml
-from fastalite import Opener
 
 from taxtastic.taxonomy import Taxonomy
-from taxtastic.utils import add_database_args
+from taxtastic.utils import add_database_args, Opener
 
 log = logging.getLogger(__name__)
 

diff --git a/taxtastic/subcommands/named.py b/taxtastic/subcommands/named.py
@@ -15,11 +15,10 @@
 """
 Filters unclassified, unnamed taxonomy ids
 """
-import argparse
 import csv
 import sqlalchemy
 import sys
-from taxtastic.utils import add_database_args
+from taxtastic.utils import add_database_args, Opener
 from taxtastic.taxonomy import Taxonomy
 
 
@@ -33,12 +32,12 @@ def build_parser(parser):
     input_group.add_argument(
         '-f', '--tax-id-file',
         metavar='FILE',
-        type=argparse.FileType('rt'),
+        type=Opener('rt'),
         help=('File containing a whitespace-delimited list of '
               'tax_ids (ie, separated by tabs, spaces, or newlines.'))
     input_group.add_argument(
         '-i', '--seq-info',
-        type=argparse.FileType('rt'),
+        type=Opener('rt'),
         help=('Read tax_ids from sequence info file, minimally '
               'containing a column named "tax_id"'))
     parser.add_argument(
@@ -47,7 +46,7 @@ def build_parser(parser):
         help='Ignore "no rank" taxonomies [%(default)s]')
     parser.add_argument(
         '-o', '--outfile',
-        type=argparse.FileType('wt'),
+        type=Opener('wt'),
         default=sys.stdout,
         metavar='FILE',
         help=('Output file containing named taxonomy ids;'

diff --git a/taxtastic/subcommands/update_taxids.py b/taxtastic/subcommands/update_taxids.py
@@ -26,8 +26,6 @@
 
 import sqlalchemy as sa
 
-from fastalite import Opener
-
 import taxtastic
 from taxtastic.taxonomy import Taxonomy
 
@@ -36,12 +34,14 @@
 
 def build_parser(parser):
     parser.add_argument(
-        'infile', type=Opener('r'),
+        'infile', type=taxtastic.utils.Opener('r'),
         help=('Input CSV file to process, minimally containing the field '
               '`tax_id`. Use "-" for stdin.'))
     parser = taxtastic.utils.add_database_args(parser)
     parser.add_argument(
-        '-o', '--outfile', default=sys.stdout, type=Opener('wt'),
+        '-o', '--outfile',
+        default=sys.stdout,
+        type=taxtastic.utils.Opener('wt'),
         help='Modified version of input file [default: stdout]')
     input_format = parser.add_mutually_exclusive_group(required=False)
     input_format.add_argument(
@@ -52,7 +52,7 @@ def build_parser(parser):
         help='Infile is a headerless text file '
              'of tax_ids separated by newlines. [%(default)s]')
     parser.add_argument(
-        '--unknowns', type=Opener('wt'),
+        '--unknowns', type=taxtastic.utils.Opener('wt'),
         help=('optional output file containing rows with unknown tax_ids '
               'having no replacements in merged table'))
     parser.add_argument(

diff --git a/taxtastic/utils.py b/taxtastic/utils.py
@@ -12,21 +12,57 @@
 #
 #    You should have received a copy of the GNU General Public License
 #    along with taxtastic.  If not, see <http://www.gnu.org/licenses/>.
+import bz2
 import csv
 import errno
+import gzip
 import logging
 import os
 import re
 import subprocess
 import string
 import random
 import configparser
+import sys
 from collections import OrderedDict
 
 
 log = logging
 
 
+class Opener(object):
+    """Factory for creating file objects. Transparently opens compressed
+    files for reading or writing based on suffix (.gz and .bz2 only).
+    Other paths use the builtin open(); '-' means stdin or stdout.
+
+    Example::
+
+        with Opener()('in.txt') as infile, Opener('w')('out.gz') as outfile:
+            outfile.write(infile.read())
+    """
+
+    def __init__(self, mode='r', *args, **kwargs):
+        self.mode = mode
+        self.args = args
+        self.kwargs = kwargs
+        self.writable = 'w' in self.mode
+
+    def __call__(self, obj):
+        if obj is sys.stdout or obj is sys.stdin:
+            return obj
+        if obj == '-':
+            return sys.stdout if self.writable else sys.stdin
+        openers = {'bz2': bz2.open, 'gz': gzip.open}
+        # a name without '.' has no suffix: fall back to open(), don't raise
+        suffix = obj.rsplit('.', 1)[1] if '.' in obj else ''
+        # in python3, bz2 and gzip default to binary mode; ask for text
+        mode = self.mode
+        if sys.version_info.major == 3 and suffix in openers \
+           and mode in {'w', 'r'}:
+            mode += 't'
+        return openers.get(suffix, open)(obj, mode=mode, *self.args, **self.kwargs)
+
+
 def get_new_nodes(fname):
     """
     Return an iterator of dicts given a .csv-format file.