update baseimage to 0.1.17; move some code from assembly.py and inter…

…host.py into util.file and util.misc
broadinstitute · Nov 5, 2019 · c0875e1 · c0875e1
1 parent c184492
commit c0875e1
Show file tree

Hide file tree

Showing 5 changed files with 64 additions and 104 deletions.
diff --git a/Dockerfile b/Dockerfile
@@ -1,4 +1,4 @@
-FROM quay.io/broadinstitute/viral-baseimage:0.1.16
+FROM quay.io/broadinstitute/viral-baseimage:0.1.17
 
 LABEL maintainer "[email protected]"
 

diff --git a/reports.py b/reports.py
@@ -30,8 +30,6 @@
 import tools.samtools
 import tools.bwa
 import tools.fastqc
-import assembly
-import interhost
 from util.stats import mean, median
 
 log = logging.getLogger(__name__)
@@ -103,7 +101,7 @@ def get_assembly_stats(sample,
             out['n_contigs'] = 0
     if os.path.isfile(assembly_fname):
         with open(assembly_fname, 'rt') as inf:
-            counts = [(len(s), assembly.unambig_count(s.seq)) for s in Bio.SeqIO.parse(inf, 'fasta') if len(s) > 0]
+            counts = [(len(s), util.misc.unambig_count(s.seq)) for s in Bio.SeqIO.parse(inf, 'fasta') if len(s) > 0]
         out['n_contigs'] = len(counts)
         out['contig_len'] = ','.join(str(x) for x, y in counts)
         out['unambig_bases'] = ','.join(str(y) for x, y in counts)
@@ -253,7 +251,7 @@ def alignment_summary(inFastaFileOne, inFastaFileTwo, outfileName=None, printCou
     ambiguous = 'N'
     aligner = tools.muscle.MuscleTool()
 
-    per_chr_fastas = interhost.transposeChromosomeFiles([inFastaFileOne, inFastaFileTwo])
+    per_chr_fastas = util.file.transposeChromosomeFiles([inFastaFileOne, inFastaFileTwo])
 
     results = OrderedDict()
     results["same_unambig"]  = 0

diff --git a/util/file.py b/util/file.py
@@ -692,6 +692,61 @@ def fastas_with_sanitized_ids(input_fasta_paths, use_tmp=False):
             sanitized_fasta_paths.append(write_fasta_with_sanitized_ids(fasta_in, out_filepath))
         yield sanitized_fasta_paths
 
+class TranspositionError(Exception):
+    def __init___(self, *args, **kwargs):
+        super(TranspositionError, self).__init__(self, *args, **kwargs)
+
+def transposeChromosomeFiles(inputFilenamesList, sampleRelationFile=None, sampleNameListFile=None):
+    ''' Input:  a list of FASTA files representing a genome for each sample.
+                Each file contains the same number of sequences (chromosomes, segments,
+                etc) in the same order.
+                If the parameter sampleRelationFile is specified (as a file path),
+                a JSON file will be written mapping sample name to sequence position
+                in the output.
+        Output: a list of FASTA files representing all samples for each
+                chromosome/segment for input to a multiple sequence aligner.
+                The number of FASTA files corresponds to the number of chromosomes
+                in the genome.  Each file contains the same number of samples
+                in the same order.  Each output file is a tempfile.
+    '''
+    outputFilenames = []
+
+    # open all files
+    inputFilesList = [util.file.open_or_gzopen(x, 'r') for x in inputFilenamesList]
+    # get BioPython iterators for each of the FASTA files specified in the input
+    fastaFiles = [SeqIO.parse(x, 'fasta') for x in inputFilesList]
+
+    # write out json file containing relation of
+    # sample name to position in output
+    if sampleRelationFile:
+        with open(os.path.realpath(sampleRelationFile), "w") as outFile:
+            # dict mapping sample->index, zero indexed
+            sampleIdxMap = dict((os.path.basename(v).replace(".fasta", ""), k)
+                                for k, v in enumerate(inputFilenamesList))
+            json.dump(sampleIdxMap, outFile, sort_keys=True, indent=4, separators=(',', ': '))
+
+    if sampleNameListFile:
+        with open(os.path.realpath(sampleNameListFile), "w") as outFile:
+            sampleNameList = [os.path.basename(v).replace(".fasta", "\n") for v in inputFilenamesList]
+            outFile.writelines(sampleNameList)
+
+    # for each interleaved record
+    for chrRecordList in zip_longest(*fastaFiles):
+        if any(rec is None for rec in chrRecordList):
+            raise TranspositionError("input fasta files must all have the same number of sequences")
+
+        outputFilename = util.file.mkstempfname('.fasta')
+        outputFilenames.append(outputFilename)
+        with open(outputFilename, "w") as outf:
+            # write the corresonding records to a new FASTA file
+            SeqIO.write(chrRecordList, outf, 'fasta')
+
+    # close all input files
+    for x in inputFilesList:
+        x.close()
+
+    return outputFilenames
+
 def string_to_file_name(string_value, file_system_path=None, length_margin=0):
     """Constructs a valid file name from a given string, replacing or deleting invalid characters.
     If `file_system_path` is given, makes sure the file name is valid on that file system.

diff --git a/util/misc.py b/util/misc.py
@@ -15,13 +15,18 @@
 import time
 
 import util.file
+from subprocess import run
 
 log = logging.getLogger(__name__)
 
 __author__ = "[email protected]"
 
 MAX_INT32 = (2 ** 31)-1
 
+def unambig_count(seq):
+    unambig = set(('A', 'T', 'C', 'G'))
+    return sum(1 for s in seq if s.upper() in unambig)
+
 @contextlib.contextmanager
 def timer(prefix):
     start = time.time()
@@ -144,84 +149,6 @@ def list_contains(sublist, list_):
     return False
 
 
-try:
-    from subprocess import run
-except ImportError:
-    CompletedProcess = collections.namedtuple(
-        'CompletedProcess', ['args', 'returncode', 'stdout', 'stderr'])
-
-    def run(args, stdin=None, stdout=None, stderr=None, shell=False,
-            env=None, cwd=None, timeout=None, check=False, executable=None):
-        '''A poor man's substitute of python 3.5's subprocess.run().
-
-        Should only be used for capturing stdout. If stdout is unneeded, a
-        simple subprocess.call should suffice.
-        '''
-        assert stdout is not None, (
-            'Why are you using this util function if not capturing stdout?')
-
-        stdout_pipe = stdout == subprocess.PIPE
-        stderr_pipe = stderr == subprocess.PIPE
-        # A little optimization when we don't need temporary files.
-        if stdout_pipe and (
-                stderr == subprocess.STDOUT or stderr is None):
-            try:
-                output = subprocess.check_output(
-                    args, stdin=stdin, stderr=stderr, shell=shell,
-                    env=env, cwd=cwd, executable=executable)
-                return CompletedProcess(args, 0, output, b'')
-            # Py3.4 doesn't have stderr attribute
-            except subprocess.CalledProcessError as e:
-                if check:
-                    raise
-                returncode = e.returncode
-                stderr_text = getattr(e, 'stderr', b'')
-                return CompletedProcess(args, e.returncode, e.output, stderr_text)
-
-        # Otherwise use temporary files as buffers, since subprocess.call
-        # cannot use PIPE.
-        if stdout_pipe:
-            stdout_fn = util.file.mkstempfname('.stdout')
-            stdout = open(stdout_fn, 'wb')
-        if stderr_pipe:
-            stderr_fn = util.file.mkstempfname('.stderr')
-            stderr = open(stderr_fn, 'wb')
-        try:
-            returncode = subprocess.call(
-                args, stdin=stdin, stdout=stdout,
-                stderr=stderr, shell=shell, env=env, cwd=cwd,
-                executable=executable)
-            if stdout_pipe:
-                stdout.close()
-                with open(stdout_fn, 'rb') as f:
-                    output = f.read()
-            else:
-                output = ''
-            if stderr_pipe:
-                stderr.close()
-                with open(stderr_fn, 'rb') as f:
-                    error = f.read()
-            else:
-                error = ''
-            if check and returncode != 0:
-                print(output.decode("utf-8"))
-                print(error.decode("utf-8"))
-                try:
-                    raise subprocess.CalledProcessError(
-                        returncode, args, output, error) #pylint: disable-msg=E1121
-                except TypeError: # py2 CalledProcessError does not accept error
-                    raise subprocess.CalledProcessError(
-                        returncode, args, output)
-            return CompletedProcess(args, returncode, output, error)
-        finally:
-            if stdout_pipe:
-                stdout.close()
-                os.remove(stdout_fn)
-            if stderr_pipe:
-                stderr.close()
-                os.remove(stderr_fn)
-
-
 def run_and_print(args, stdout=None, stderr=subprocess.STDOUT,
                   stdin=None, shell=False, env=None, cwd=None,
                   timeout=None, silent=False, buffered=False, check=False,

diff --git a/util/stats.py b/util/stats.py
@@ -5,27 +5,7 @@
 
 __author__ = "[email protected], [email protected]"
 
-try:
-    # Python 3.4
-    from statistics import mean, median
-except ImportError:
-    # Python <3.4, avoid numpy if these two methods are all we really need
-    def mean(l):
-        if len(l) > 0:
-            return float(sum(l)) / len(l)
-        else:
-            raise Exception("empty list for mean")
-
-    def median(l):
-        if len(l) > 0:
-            half = len(l) // 2
-            l.sort()
-            if len(l) % 2 == 0:
-                return (l[half - 1] + l[half]) / 2.0
-            else:
-                return l[half]
-        else:
-            raise Exception("empty list for median")
+from statistics import mean, median
 
 
 def product(iterable):