Skip to content

Commit

Permalink
update baseimage to 0.1.17; move some code from assembly.py and inter…
Browse files Browse the repository at this point in the history
…host.py into util.file and util.misc
  • Loading branch information
dpark01 committed Nov 5, 2019
1 parent c184492 commit c0875e1
Show file tree
Hide file tree
Showing 5 changed files with 64 additions and 104 deletions.
2 changes: 1 addition & 1 deletion Dockerfile
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
FROM quay.io/broadinstitute/viral-baseimage:0.1.16
FROM quay.io/broadinstitute/viral-baseimage:0.1.17

LABEL maintainer "[email protected]"

Expand Down
6 changes: 2 additions & 4 deletions reports.py
Original file line number Diff line number Diff line change
Expand Up @@ -30,8 +30,6 @@
import tools.samtools
import tools.bwa
import tools.fastqc
import assembly
import interhost
from util.stats import mean, median

log = logging.getLogger(__name__)
Expand Down Expand Up @@ -103,7 +101,7 @@ def get_assembly_stats(sample,
out['n_contigs'] = 0
if os.path.isfile(assembly_fname):
with open(assembly_fname, 'rt') as inf:
counts = [(len(s), assembly.unambig_count(s.seq)) for s in Bio.SeqIO.parse(inf, 'fasta') if len(s) > 0]
counts = [(len(s), util.misc.unambig_count(s.seq)) for s in Bio.SeqIO.parse(inf, 'fasta') if len(s) > 0]
out['n_contigs'] = len(counts)
out['contig_len'] = ','.join(str(x) for x, y in counts)
out['unambig_bases'] = ','.join(str(y) for x, y in counts)
Expand Down Expand Up @@ -253,7 +251,7 @@ def alignment_summary(inFastaFileOne, inFastaFileTwo, outfileName=None, printCou
ambiguous = 'N'
aligner = tools.muscle.MuscleTool()

per_chr_fastas = interhost.transposeChromosomeFiles([inFastaFileOne, inFastaFileTwo])
per_chr_fastas = util.file.transposeChromosomeFiles([inFastaFileOne, inFastaFileTwo])

results = OrderedDict()
results["same_unambig"] = 0
Expand Down
55 changes: 55 additions & 0 deletions util/file.py
Original file line number Diff line number Diff line change
Expand Up @@ -692,6 +692,61 @@ def fastas_with_sanitized_ids(input_fasta_paths, use_tmp=False):
sanitized_fasta_paths.append(write_fasta_with_sanitized_ids(fasta_in, out_filepath))
yield sanitized_fasta_paths

class TranspositionError(Exception):
def __init___(self, *args, **kwargs):
super(TranspositionError, self).__init__(self, *args, **kwargs)

def transposeChromosomeFiles(inputFilenamesList, sampleRelationFile=None, sampleNameListFile=None):
''' Input: a list of FASTA files representing a genome for each sample.
Each file contains the same number of sequences (chromosomes, segments,
etc) in the same order.
If the parameter sampleRelationFile is specified (as a file path),
a JSON file will be written mapping sample name to sequence position
in the output.
Output: a list of FASTA files representing all samples for each
chromosome/segment for input to a multiple sequence aligner.
The number of FASTA files corresponds to the number of chromosomes
in the genome. Each file contains the same number of samples
in the same order. Each output file is a tempfile.
'''
outputFilenames = []

# open all files
inputFilesList = [util.file.open_or_gzopen(x, 'r') for x in inputFilenamesList]
# get BioPython iterators for each of the FASTA files specified in the input
fastaFiles = [SeqIO.parse(x, 'fasta') for x in inputFilesList]

# write out json file containing relation of
# sample name to position in output
if sampleRelationFile:
with open(os.path.realpath(sampleRelationFile), "w") as outFile:
# dict mapping sample->index, zero indexed
sampleIdxMap = dict((os.path.basename(v).replace(".fasta", ""), k)
for k, v in enumerate(inputFilenamesList))
json.dump(sampleIdxMap, outFile, sort_keys=True, indent=4, separators=(',', ': '))

if sampleNameListFile:
with open(os.path.realpath(sampleNameListFile), "w") as outFile:
sampleNameList = [os.path.basename(v).replace(".fasta", "\n") for v in inputFilenamesList]
outFile.writelines(sampleNameList)

# for each interleaved record
for chrRecordList in zip_longest(*fastaFiles):
if any(rec is None for rec in chrRecordList):
raise TranspositionError("input fasta files must all have the same number of sequences")

outputFilename = util.file.mkstempfname('.fasta')
outputFilenames.append(outputFilename)
with open(outputFilename, "w") as outf:
# write the corresonding records to a new FASTA file
SeqIO.write(chrRecordList, outf, 'fasta')

# close all input files
for x in inputFilesList:
x.close()

return outputFilenames

def string_to_file_name(string_value, file_system_path=None, length_margin=0):
"""Constructs a valid file name from a given string, replacing or deleting invalid characters.
If `file_system_path` is given, makes sure the file name is valid on that file system.
Expand Down
83 changes: 5 additions & 78 deletions util/misc.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,13 +15,18 @@
import time

import util.file
from subprocess import run

log = logging.getLogger(__name__)

__author__ = "[email protected]"

MAX_INT32 = (2 ** 31)-1

def unambig_count(seq):
unambig = set(('A', 'T', 'C', 'G'))
return sum(1 for s in seq if s.upper() in unambig)

@contextlib.contextmanager
def timer(prefix):
start = time.time()
Expand Down Expand Up @@ -144,84 +149,6 @@ def list_contains(sublist, list_):
return False


try:
from subprocess import run
except ImportError:
CompletedProcess = collections.namedtuple(
'CompletedProcess', ['args', 'returncode', 'stdout', 'stderr'])

def run(args, stdin=None, stdout=None, stderr=None, shell=False,
env=None, cwd=None, timeout=None, check=False, executable=None):
'''A poor man's substitute of python 3.5's subprocess.run().
Should only be used for capturing stdout. If stdout is unneeded, a
simple subprocess.call should suffice.
'''
assert stdout is not None, (
'Why are you using this util function if not capturing stdout?')

stdout_pipe = stdout == subprocess.PIPE
stderr_pipe = stderr == subprocess.PIPE
# A little optimization when we don't need temporary files.
if stdout_pipe and (
stderr == subprocess.STDOUT or stderr is None):
try:
output = subprocess.check_output(
args, stdin=stdin, stderr=stderr, shell=shell,
env=env, cwd=cwd, executable=executable)
return CompletedProcess(args, 0, output, b'')
# Py3.4 doesn't have stderr attribute
except subprocess.CalledProcessError as e:
if check:
raise
returncode = e.returncode
stderr_text = getattr(e, 'stderr', b'')
return CompletedProcess(args, e.returncode, e.output, stderr_text)

# Otherwise use temporary files as buffers, since subprocess.call
# cannot use PIPE.
if stdout_pipe:
stdout_fn = util.file.mkstempfname('.stdout')
stdout = open(stdout_fn, 'wb')
if stderr_pipe:
stderr_fn = util.file.mkstempfname('.stderr')
stderr = open(stderr_fn, 'wb')
try:
returncode = subprocess.call(
args, stdin=stdin, stdout=stdout,
stderr=stderr, shell=shell, env=env, cwd=cwd,
executable=executable)
if stdout_pipe:
stdout.close()
with open(stdout_fn, 'rb') as f:
output = f.read()
else:
output = ''
if stderr_pipe:
stderr.close()
with open(stderr_fn, 'rb') as f:
error = f.read()
else:
error = ''
if check and returncode != 0:
print(output.decode("utf-8"))
print(error.decode("utf-8"))
try:
raise subprocess.CalledProcessError(
returncode, args, output, error) #pylint: disable-msg=E1121
except TypeError: # py2 CalledProcessError does not accept error
raise subprocess.CalledProcessError(
returncode, args, output)
return CompletedProcess(args, returncode, output, error)
finally:
if stdout_pipe:
stdout.close()
os.remove(stdout_fn)
if stderr_pipe:
stderr.close()
os.remove(stderr_fn)


def run_and_print(args, stdout=None, stderr=subprocess.STDOUT,
stdin=None, shell=False, env=None, cwd=None,
timeout=None, silent=False, buffered=False, check=False,
Expand Down
22 changes: 1 addition & 21 deletions util/stats.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,27 +5,7 @@

__author__ = "[email protected], [email protected]"

try:
# Python 3.4
from statistics import mean, median
except ImportError:
# Python <3.4, avoid numpy if these two methods are all we really need
def mean(l):
if len(l) > 0:
return float(sum(l)) / len(l)
else:
raise Exception("empty list for mean")

def median(l):
if len(l) > 0:
half = len(l) // 2
l.sort()
if len(l) % 2 == 0:
return (l[half - 1] + l[half]) / 2.0
else:
return l[half]
else:
raise Exception("empty list for median")
from statistics import mean, median


def product(iterable):
Expand Down

0 comments on commit c0875e1

Please sign in to comment.