Skip to content

Commit

Permalink
Allow ReferenceSequenceFileFactory to load from streams (#1123)
Browse files Browse the repository at this point in the history
* Add a method to open a ReferenceSequence by passing a FASTA and its index as SeekableStreams
* This is useful for clients that are using filesystems that don't have an nio.Path provider available but can produce a stream
* part of #1112
  • Loading branch information
tomwhite authored and lbergelson committed Jun 18, 2018
1 parent 3a20218 commit 72818a0
Show file tree
Hide file tree
Showing 10 changed files with 274 additions and 39 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -32,6 +32,7 @@
import htsjdk.samtools.util.IOUtil;

import java.io.File;
import java.io.InputStream;
import java.nio.file.Files;
import java.nio.file.Path;

Expand All @@ -41,6 +42,7 @@
*/
abstract class AbstractFastaSequenceFile implements ReferenceSequenceFile {
private final Path path;
private final String source;
protected SAMSequenceDictionary sequenceDictionary;

/**
Expand All @@ -57,27 +59,32 @@ abstract class AbstractFastaSequenceFile implements ReferenceSequenceFile {
*/
AbstractFastaSequenceFile(final Path path) {
this.path = path;
this.source = path == null ? "unknown" : path.toAbsolutePath().toString();
final Path dictionary = findSequenceDictionary(path);

if (dictionary != null) {
IOUtil.assertFileIsReadable(dictionary);

try {
final SAMTextHeaderCodec codec = new SAMTextHeaderCodec();
final BufferedLineReader reader = new BufferedLineReader(Files.newInputStream(dictionary));
final SAMFileHeader header = codec.decode(reader,
dictionary.toString());
if (header.getSequenceDictionary() != null && !header.getSequenceDictionary().isEmpty()) {
this.sequenceDictionary = header.getSequenceDictionary();
}
reader.close();
try (InputStream dictionaryIn = Files.newInputStream(dictionary)) {
this.sequenceDictionary = ReferenceSequenceFileFactory.loadDictionary(dictionaryIn);
}
catch (Exception e) {
throw new SAMException("Could not open sequence dictionary file: " + dictionary, e);
}
}
}

/**
* Constructs an {@link AbstractFastaSequenceFile} from values supplied by the caller. The three arguments
* are stored as given; this constructor does not look for or read a sequence dictionary file itself.
* @param path Fasta file to read, or null when the reference is not backed by a path (subclasses that
* read from a stream pass null here).
* @param source Named source used for error messages; this is the value returned by {@code getSource()}.
* @param sequenceDictionary The sequence dictionary, or null if there isn't one.
*/
AbstractFastaSequenceFile(final Path path, final String source, final SAMSequenceDictionary sequenceDictionary) {
this.path = path;
this.source = source;
this.sequenceDictionary = sequenceDictionary;
}

protected static File findSequenceDictionary(final File file) {
if (file == null) {
return null;
Expand Down Expand Up @@ -111,6 +118,11 @@ protected Path getPath() {
return path;
}

/**
 * Returns the named source of the reference file.
 *
 * @return the source name given at construction time; it is the name used in error messages
 */
protected String getSource() {
    return this.source;
}

/**
* Returns the list of sequence records associated with the reference sequence if found
* otherwise null.
Expand All @@ -122,12 +134,15 @@ public SAMSequenceDictionary getSequenceDictionary() {

/**
 * Returns the full path to the reference file.
 *
 * @return the absolute path as a string, or null when this reference has no path
 */
protected String getAbsolutePath() {
    return path == null ? null : path.toAbsolutePath().toString();
}

/** Returns the full path to the reference file. */
/** Returns the full path to the reference file, or the source if no path was specified. */
public String toString() {
return getAbsolutePath();
return source;
}

/** default implementation -- override if index is supported */
Expand All @@ -137,13 +152,13 @@ public String toString() {
/** default implementation -- override if index is supported */
@Override
public ReferenceSequence getSequence( String contig ) {
throw new UnsupportedOperationException("Index does not appear to exist for " + getAbsolutePath() + ". samtools faidx can be used to create an index");
throw new UnsupportedOperationException("Index does not appear to exist for " + getSource() + ". samtools faidx can be used to create an index");
}

/** default implementation -- override if index is supported */
@Override
public ReferenceSequence getSubsequenceAt( String contig, long start, long stop ) {
throw new UnsupportedOperationException("Index does not appear to exist for " + getAbsolutePath() + ". samtools faidx can be used to create an index");
throw new UnsupportedOperationException("Index does not appear to exist for " + getSource() + ". samtools faidx can be used to create an index");
}

}
Original file line number Diff line number Diff line change
Expand Up @@ -69,6 +69,18 @@ protected AbstractIndexedFastaSequenceFile(final Path path, final FastaSequenceI
}
}

/**
* Initialise an indexed fasta sequence file that is identified by a source name rather than a path
* (for example, one that is read from a stream).
* @param source The named source of the reference file (used in error messages).
* @param index The fasta index.
* @param dictionary The sequence dictionary, or null if there isn't one.
*/
protected AbstractIndexedFastaSequenceFile(String source, final FastaSequenceIndex index, SAMSequenceDictionary dictionary) {
// There is no path for this kind of source, so null is handed to the superclass as the path.
super(null, source, dictionary);
this.index = index;
// reset() runs here, i.e. before any subclass constructor body has assigned its own fields.
// NOTE(review): presumably reset() only depends on the index at this point -- confirm.
reset();
}

protected static Path findRequiredFastaIndexFile(Path fastaFile) throws FileNotFoundException {
Path ret = findFastaIndex(fastaFile);
if (ret == null) throw new FileNotFoundException(ReferenceSequenceFileFactory.getFastaIndexFileName(fastaFile) + " not found.");
Expand Down Expand Up @@ -192,7 +204,7 @@ public ReferenceSequence getSubsequenceAt( String contig, long start, long stop
startOffset += readFromPosition(channelBuffer, indexEntry.getLocation()+startOffset);
}
catch(IOException ex) {
throw new SAMException("Unable to load " + contig + "(" + start + ", " + stop + ") from " + getAbsolutePath(), ex);
throw new SAMException("Unable to load " + contig + "(" + start + ", " + stop + ") from " + getSource(), ex);
}

// Reset the buffer for outbound transfers.
Expand Down
36 changes: 30 additions & 6 deletions src/main/java/htsjdk/samtools/reference/FastaSequenceFile.java
Original file line number Diff line number Diff line change
Expand Up @@ -26,12 +26,15 @@

import htsjdk.samtools.Defaults;
import htsjdk.samtools.SAMException;
import htsjdk.samtools.SAMSequenceDictionary;
import htsjdk.samtools.SAMSequenceRecord;
import htsjdk.samtools.seekablestream.SeekableStream;
import htsjdk.samtools.util.FastLineReader;
import htsjdk.samtools.util.IOUtil;
import htsjdk.samtools.util.StringUtil;

import java.io.File;
import java.io.IOException;
import java.nio.file.Path;

/**
Expand All @@ -42,6 +45,7 @@
public class FastaSequenceFile extends AbstractFastaSequenceFile {

private final boolean truncateNamesAtWhitespace;
private final SeekableStream seekableStream;
private FastLineReader in;
private int sequenceIndex = -1;
private final byte[] basesBuffer = new byte[Defaults.NON_ZERO_BUFFER_SIZE];
Expand All @@ -56,9 +60,21 @@ public FastaSequenceFile(final File file, final boolean truncateNamesAtWhitespac
public FastaSequenceFile(final Path path, final boolean truncateNamesAtWhitespace) {
super(path);
this.truncateNamesAtWhitespace = truncateNamesAtWhitespace;
this.seekableStream = null;
this.in = new FastLineReader(IOUtil.openFileForReading(path));
}

/**
* Constructs a FastaSequenceFile that reads from the specified stream (which must not be compressed, i.e.
* the caller is responsible for decompressing the stream). No path is associated with the resulting file.
* @param source The named source of the reference file (used in error messages).
* @param seekableStream The stream to read the fasta from. A reference to it is kept so that
* {@link #reset()} can seek back to the start of the stream.
* @param dictionary The sequence dictionary, or null if there isn't one.
* @param truncateNamesAtWhitespace Whether sequence names should be truncated at whitespace.
*/
public FastaSequenceFile(String source, final SeekableStream seekableStream, SAMSequenceDictionary dictionary, final boolean truncateNamesAtWhitespace) {
// null path: this instance is backed by the stream only.
super(null, source, dictionary);
this.truncateNamesAtWhitespace = truncateNamesAtWhitespace;
this.seekableStream = seekableStream;
this.in = new FastLineReader(seekableStream);
}

/**
* It's good to call this to free up memory.
*/
Expand Down Expand Up @@ -88,9 +104,17 @@ public ReferenceSequence nextSequence() {
@Override
public void reset() {
this.sequenceIndex = -1;
this.in.close();
this.in = new FastLineReader(IOUtil.openFileForReading(getPath()));

if (getPath() != null) {
this.in.close();
this.in = new FastLineReader(IOUtil.openFileForReading(getPath()));
} else {
try {
this.seekableStream.seek(0);
} catch (IOException e) {
throw new SAMException("Problem seeking to start of stream during reset", e);
}
this.in = new FastLineReader(this.seekableStream);
}
}

private String readSequenceName() {
Expand All @@ -100,7 +124,7 @@ private String readSequenceName() {
}
final byte b = in.getByte();
if (b != '>') {
throw new SAMException("Format exception reading FASTA " + getAbsolutePath() + ". Expected > but saw chr(" +
throw new SAMException("Format exception reading FASTA " + getSource() + ". Expected > but saw chr(" +
b + ") at start of sequence with index " + this.sequenceIndex);
}
final byte[] nameBuffer = new byte[4096];
Expand All @@ -111,11 +135,11 @@ private String readSequenceName() {
}
nameLength += in.readToEndOfOutputBufferOrEoln(nameBuffer, nameLength);
if (nameLength == nameBuffer.length && !in.atEoln()) {
throw new SAMException("Sequence name too long in FASTA " + getAbsolutePath());
throw new SAMException("Sequence name too long in FASTA " + getSource());
}
} while (!in.atEoln());
if (nameLength == 0) {
throw new SAMException("Missing sequence name in FASTA " + getAbsolutePath());
throw new SAMException("Missing sequence name in FASTA " + getSource());
}
String name = StringUtil.bytesToString(nameBuffer, 0, nameLength).trim();
if (truncateNamesAtWhitespace) {
Expand Down
27 changes: 17 additions & 10 deletions src/main/java/htsjdk/samtools/reference/FastaSequenceIndex.java
Original file line number Diff line number Diff line change
Expand Up @@ -31,6 +31,7 @@
import java.io.File;
import java.io.FileNotFoundException;
import java.io.IOException;
import java.io.InputStream;
import java.io.OutputStream;
import java.io.PrintStream;
import java.nio.file.Files;
Expand Down Expand Up @@ -66,7 +67,19 @@ public FastaSequenceIndex( File indexFile ) {
*/
public FastaSequenceIndex( Path indexFile ) {
IOUtil.assertFileIsReadable(indexFile);
parseIndexFile(indexFile);
try (InputStream in = Files.newInputStream(indexFile)) {
parseIndexFile(in);
} catch (IOException e) {
throw new SAMException("Fasta index file could not be opened: " + indexFile, e);
}
}

/**
* Build a sequence index from the specified input stream.
* The stream is parsed immediately, inside this constructor. The parser wraps it in a
* try-with-resources {@link Scanner}, so the stream is closed once parsing finishes.
* @param in InputStream to read the index contents from.
*/
public FastaSequenceIndex(InputStream in) {
parseIndexFile(in);
}

/**
Expand Down Expand Up @@ -124,12 +137,10 @@ public boolean equals(Object other) {

/**
* Parse the contents of an index file, caching the results internally.
* @param indexFile File to parse.
* @throws IOException Thrown if file could not be opened.
* @param in InputStream to parse.
*/
private void parseIndexFile(Path indexFile) {
try {
Scanner scanner = new Scanner(indexFile);
private void parseIndexFile(InputStream in) {
try (Scanner scanner = new Scanner(in)) {
int sequenceIndex = 0;
while( scanner.hasNext() ) {
// Tokenize and validate the index line.
Expand All @@ -154,10 +165,6 @@ private void parseIndexFile(Path indexFile) {
// Build sequence structure
add(new FastaSequenceIndexEntry(contig,location,size,basesPerLine,bytesPerLine, sequenceIndex++) );
}
scanner.close();
} catch (IOException e) {
throw new SAMException("Fasta index file could not be opened: " + indexFile, e);

}
}

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -25,6 +25,9 @@
package htsjdk.samtools.reference;

import htsjdk.samtools.SAMException;
import htsjdk.samtools.SAMSequenceDictionary;
import htsjdk.samtools.seekablestream.ReadableSeekableStreamByteChannel;
import htsjdk.samtools.seekablestream.SeekableStream;
import htsjdk.samtools.util.BlockCompressedInputStream;
import htsjdk.samtools.util.IOUtil;

Expand Down Expand Up @@ -95,6 +98,18 @@ public IndexedFastaSequenceFile(final Path path) throws FileNotFoundException {
this(path, new FastaSequenceIndex((findRequiredFastaIndexFile(path))));
}

/**
* Initialise the given indexed fasta sequence file stream.
* @param source The named source of the reference file (used in error messages).
* @param in The input stream to read the fasta file from.
* @param index The fasta index.
* @param dictionary The sequence dictionary, or null if there isn't one.
*/
public IndexedFastaSequenceFile(String source, final SeekableStream in, final FastaSequenceIndex index, SAMSequenceDictionary dictionary) {
super(source, index, dictionary);
// Wrap the seekable stream in a byte channel and keep it as this file's channel.
this.channel = new ReadableSeekableStreamByteChannel(in);
}

/**
* @deprecated use {@link ReferenceSequenceFileFactory#canCreateIndexedFastaReader(Path)} instead.
*/
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -27,6 +27,11 @@
import htsjdk.samtools.SAMException;
import htsjdk.samtools.util.BlockCompressedInputStream;
import htsjdk.samtools.util.GZIIndex;
import htsjdk.samtools.SAMFileHeader;
import htsjdk.samtools.SAMSequenceDictionary;
import htsjdk.samtools.SAMTextHeaderCodec;
import htsjdk.samtools.seekablestream.SeekableStream;
import htsjdk.samtools.util.BufferedLineReader;
import htsjdk.samtools.util.IOUtil;

import java.io.BufferedInputStream;
Expand Down Expand Up @@ -166,6 +171,35 @@ public static boolean canCreateIndexedFastaReader(final Path fastaFile) {
return false;
}

/**
* Return an instance of ReferenceSequenceFile using the given fasta sequence file stream, optional index,
* and no sequence dictionary. Sequence names are truncated at whitespace: this overload delegates to the
* five-argument version with a null dictionary and {@code truncateNamesAtWhitespace} set to true.
*
* @param source The named source of the reference file (used in error messages).
* @param in The input stream to read the fasta file from.
* @param index The index, or null to return a non-indexed reader.
* @return a reader over the given stream.
*/
public static ReferenceSequenceFile getReferenceSequenceFile(final String source, final SeekableStream in, final FastaSequenceIndex index) {
return getReferenceSequenceFile(source, in, index, null, true);
}

/**
 * Creates a ReferenceSequenceFile that reads a fasta from the given stream, with an optional index and an
 * optional sequence dictionary.
 * <p>
 * An {@link IndexedFastaSequenceFile} is returned only when an index is supplied <em>and</em>
 * {@code truncateNamesAtWhitespace} is true. In every other case a {@link FastaSequenceFile} is returned
 * and the index, if any, is not used.
 *
 * @param source The named source of the reference file (used in error messages).
 * @param in The input stream to read the fasta file from.
 * @param index The index, or null to return a non-indexed reader.
 * @param dictionary The sequence dictionary, or null if there isn't one.
 * @param truncateNamesAtWhitespace if true, only include the first word of the sequence name
 * @return an indexed or non-indexed reader, chosen as described above.
 */
public static ReferenceSequenceFile getReferenceSequenceFile(final String source, final SeekableStream in, final FastaSequenceIndex index, final SAMSequenceDictionary dictionary, final boolean truncateNamesAtWhitespace) {
    final boolean useIndexedReader = index != null && truncateNamesAtWhitespace;
    if (!useIndexedReader) {
        return new FastaSequenceFile(source, in, dictionary, truncateNamesAtWhitespace);
    }
    return new IndexedFastaSequenceFile(source, in, index, dictionary);
}

/**
* Returns the default dictionary name for a FASTA file.
*
Expand All @@ -186,6 +220,22 @@ public static Path getDefaultDictionaryForReferenceSequence(final Path path) {
return path.resolveSibling(name.substring(0, extensionIndex) + IOUtil.DICT_FILE_EXTENSION);
}

/**
 * Loads a sequence dictionary from the input stream of a sequence dictionary file. The stream contents are
 * decoded with {@link SAMTextHeaderCodec}, and the dictionary of the resulting header is returned.
 * <p>
 * This method does not close the stream; closing it is left to the caller.
 *
 * @param in the sequence dictionary input stream.
 * @return the sequence dictionary, or <code>null</code> if the header has no dictionary or it was empty.
 */
public static SAMSequenceDictionary loadDictionary(final InputStream in) {
    final SAMTextHeaderCodec codec = new SAMTextHeaderCodec();
    final BufferedLineReader reader = new BufferedLineReader(in);
    // No source name is available for a bare stream, hence the null second argument.
    final SAMFileHeader header = codec.decode(reader, null);
    final SAMSequenceDictionary dictionary = header.getSequenceDictionary();
    // Guard against a missing dictionary as well as an empty one, so that the documented
    // "null if the header has no dictionary" contract holds instead of a NullPointerException.
    if (dictionary == null || dictionary.isEmpty()) {
        return null;
    }
    return dictionary;
}

/**
* Returns the FASTA extension for the path.
*
Expand Down
Loading

0 comments on commit 72818a0

Please sign in to comment.