Skip to content

Commit

Permalink
Pefer FileSystemProvider plugins for http/ftp over legacy handlers(#1693
Browse files Browse the repository at this point in the history
)

* Re-Ordering priority in SeekableStreamFactory so FileSystemProvider plugins will be preferred over the built in http/ftp handlers.
  * Now when constructing streams it will prefer NIO plugins if they are available. 
   * If no http(s) / ftp plugin exists it will fall back to the htsjdk built in.

* Similarly updating ParsingUtils.openInputStream() and exists()

* Update HtsPath to throw on non file schemes with malformed URIs instead of trying to interpret them as file://

* Deprecating SeekableStreamFactory.isFilePath() since it is no longer used and interacts poorly
  with nio filesystem providers

* Note previously, unencoded FTP paths were allowed.  Now FTP paths with spaces must be percent encoded.
  • Loading branch information
lbergelson authored Dec 12, 2023
1 parent 3964abe commit 40e3a8b
Show file tree
Hide file tree
Showing 11 changed files with 214 additions and 97 deletions.
24 changes: 24 additions & 0 deletions src/main/java/htsjdk/io/HtsPath.java
Original file line number Diff line number Diff line change
Expand Up @@ -253,6 +253,10 @@ private URI getURIForString(final String pathString) {
tempURI = getCachedPath().toUri();
}
} catch (URISyntaxException uriException) {
//check that the uri wasn't a badly encoded absolute uri of some sort
//if you don't do this it will be treated as a badly formed file:// url
assertNoNonFileScheme(pathString, uriException);

// the input string isn't a valid URI; assume its a local (non-URI) file reference, and
// use the URI resulting from the corresponding Path
try {
Expand All @@ -276,5 +280,25 @@ private URI getURIForString(final String pathString) {

return tempURI;
}

/**
* check that there isn't a non file scheme at the start of the path
* @param pathString
* @param cause
*/
private static void assertNoNonFileScheme(String pathString, URISyntaxException cause){
final String[] split = pathString.split(":");
if(split.length > 1){
if(split[0] == null || split[0].isEmpty()){
throw new IllegalArgumentException("Malformed url " + pathString + " includes an empty scheme." +
"\nCheck that it is fully encoded.", cause);
}
if(!split[0].equals("file")){
throw new IllegalArgumentException("Malformed url " + pathString + " includes a scheme: " + split[0] + ":// but was an invalid URI." +
"\nCheck that it is fully encoded.", cause);
}
}

}

}
1 change: 1 addition & 0 deletions src/main/java/htsjdk/samtools/SAMRecordSetBuilder.java
Original file line number Diff line number Diff line change
Expand Up @@ -54,6 +54,7 @@ public class SAMRecordSetBuilder implements Iterable<SAMRecord> {
"chr21", "chr22", "chrX", "chrY", "chrM"
};


private static final String READ_GROUP_ID = "1";
private static final String SAMPLE = "FREE_SAMPLE";
private final Random random = new Random(TestUtil.RANDOM_SEED);
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -23,12 +23,15 @@
*/
package htsjdk.samtools.seekablestream;

import htsjdk.samtools.util.IOUtil;
import java.io.File;
import htsjdk.io.HtsPath;
import htsjdk.io.IOPath;
import htsjdk.tribble.TribbleException;

import java.io.IOException;
import java.net.URI;
import java.net.URL;
import java.nio.channels.SeekableByteChannel;
import java.nio.file.Path;
import java.util.Set;
import java.util.function.Function;

/**
Expand All @@ -40,6 +43,14 @@
public class SeekableStreamFactory{

private static final ISeekableStreamFactory DEFAULT_FACTORY;
private static final String HTTP = "http";
private static final String HTTPS = "https";
private static final String FTP = "ftp";
/**
* the set of url schemes that have special support in htsjdk that isn't through a FileSystemProvider
*/
private static final Set<String> URL_SCHEMES_WITH_LEGACY_SUPPORT = Set.of(HTTP, FTP, HTTPS);
public static final String FILE_SCHEME = "file";
private static ISeekableStreamFactory currentFactory;

static{
Expand All @@ -61,9 +72,28 @@ public static ISeekableStreamFactory getInstance(){
* Does this path point to a regular file on disk and not something like a URL?
* @param path the path to test
* @return true if the path is to a file on disk
* @deprecated this method is simplistic and no longer particularly useful since IOPath allows similar access to
* various non-file data sources, internal use has been replaced with {@link #isBeingHandledByLegacyUrlSupport(String)}
*/
@Deprecated
public static boolean isFilePath(final String path) {
return ! ( path.startsWith("http:") || path.startsWith("https:") || path.startsWith("ftp:") );
return !canBeHandledByLegacyUrlSupport(path);
}

/**
* is this path being handled by one of the legacy SeekableStream types (http(s) / ftp)
*
* @param path a path to check
* @return if the path is not being handled by a FileSystemProvider and it can be read by legacy streams
*/
public static boolean isBeingHandledByLegacyUrlSupport(final String path){
return !new HtsPath(path).hasFileSystemProvider() //if we have a provider for it that's what we'll use
&& canBeHandledByLegacyUrlSupport(path); // otherwise we fall back to the special handlers
}

//is this one of the url types that has legacy htsjdk support built in?
public static boolean canBeHandledByLegacyUrlSupport(final String path) {
return URL_SCHEMES_WITH_LEGACY_SUPPORT.stream().anyMatch(scheme-> path.startsWith(scheme +"://"));
}

private static class DefaultSeekableStreamFactory implements ISeekableStreamFactory {
Expand All @@ -79,7 +109,7 @@ public SeekableStream getStreamFor(final String path) throws IOException {
}

/**
* The wrapper will only be applied to the stream if the stream is treated as a {@link java.nio.file.Path}
* The wrapper will only be applied to the stream if the stream is treated as a {@link Path}
*
* This currently means any uri with a scheme that is not http, https, ftp, or file will have the wrapper applied to it
*
Expand All @@ -89,26 +119,30 @@ public SeekableStream getStreamFor(final String path) throws IOException {
@Override
public SeekableStream getStreamFor(final String path,
Function<SeekableByteChannel, SeekableByteChannel> wrapper) throws IOException {
// todo -- add support for SeekableBlockInputStream

if (path.startsWith("http:") || path.startsWith("https:")) {
final URL url = new URL(path);
return new SeekableHTTPStream(url);
} else if (path.startsWith("ftp:")) {
return new SeekableFTPStream(new URL(path));
} else if (path.startsWith("file:")) {
try {
// convert to URI in order to obtain a decoded version of the path string suitable
// for use with the File constructor
final String decodedPath = new URI(path).getPath();
return new SeekableFileStream(new File(decodedPath));
} catch (java.net.URISyntaxException e) {
throw new IllegalArgumentException(String.format("The input string %s contains a URI scheme but is not a valid URI", path), e);
}
} else if (IOUtil.hasScheme(path)) {
return new SeekablePathStream(IOUtil.getPath(path), wrapper);
return getStreamFor(new HtsPath(path), wrapper);
}


/**
* The wrapper will only be applied to the stream if the stream is treated as a non file:// {@link Path}
*
* This has a fall back to htsjdk's built in http and ftp providers if no FileSystemProvder is available for them
*
* @param path an IOPath to be opened
* @param wrapper a wrapper to apply to the stream allowing direct transformations on the byte stream to be applied
* @throws IOException
*/
public static SeekableStream getStreamFor(final IOPath path, Function<SeekableByteChannel, SeekableByteChannel> wrapper) throws IOException {
if(path.hasFileSystemProvider()) {
return path.getScheme().equals(FILE_SCHEME)
? new SeekableFileStream(path.toPath().toFile()) //don't apply the wrapper to local files
: new SeekablePathStream(path.toPath(), wrapper);
} else {
return new SeekableFileStream(new File(path));
return switch(path.getScheme()){
case HTTP, HTTPS -> new SeekableHTTPStream(new URL(path.getRawInputString()));
case FTP -> new SeekableFTPStream((new URL(path.getRawInputString())));
default -> throw new TribbleException("Unknown path type. No FileSystemProvider available for " + path.getRawInputString());
};
}
}

Expand Down
1 change: 1 addition & 0 deletions src/main/java/htsjdk/tribble/FeatureCodec.java
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,7 @@

package htsjdk.tribble;

import htsjdk.io.IOPath;
import htsjdk.samtools.util.LocationAware;
import htsjdk.tribble.index.tabix.TabixFormat;

Expand Down
13 changes: 7 additions & 6 deletions src/main/java/htsjdk/tribble/TribbleIndexedFeatureReader.java
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,7 @@
*/
package htsjdk.tribble;

import htsjdk.io.HtsPath;
import htsjdk.samtools.seekablestream.SeekableStream;
import htsjdk.samtools.seekablestream.SeekableStreamFactory;
import htsjdk.samtools.util.IOUtil;
Expand All @@ -40,6 +41,7 @@
import java.net.URI;
import java.net.URLEncoder;
import java.nio.channels.SeekableByteChannel;
import java.nio.charset.StandardCharsets;
import java.util.ArrayList;
import java.util.Iterator;
import java.util.List;
Expand All @@ -60,9 +62,9 @@ public class TribbleIndexedFeatureReader<T extends Feature, SOURCE> extends Abst
private Index index;

/**
* is the path pointing to our source data a regular file?
* is the path backed by old style built in http(s) / ftp support instead of a FileSystemProvider
*/
private final boolean pathIsRegularFile;
private final boolean pathIsOldStyleHttpOrFtp;

/**
* a potentially reusable seekable stream for queries over regular files
Expand Down Expand Up @@ -97,8 +99,7 @@ public TribbleIndexedFeatureReader(final String featurePath, final FeatureCodec<
}
}

// does path point to a regular file?
this.pathIsRegularFile = SeekableStreamFactory.isFilePath(path);
this.pathIsOldStyleHttpOrFtp = SeekableStreamFactory.isBeingHandledByLegacyUrlSupport(path);

readHeader();
}
Expand Down Expand Up @@ -203,7 +204,7 @@ private SeekableStream getSeekableStream() throws IOException {
* @return true if
*/
private boolean reuseStreamInQuery() {
return pathIsRegularFile;
return !pathIsOldStyleHttpOrFtp;
}

@Override
Expand Down Expand Up @@ -252,7 +253,7 @@ private void readHeader() throws IOException {
PositionalBufferedStream pbs = null;
try {
is = ParsingUtils.openInputStream(path, wrapper);
if (IOUtil.hasBlockCompressedExtension(new URI(URLEncoder.encode(path, "UTF-8")))) {
if (IOUtil.hasBlockCompressedExtension(new HtsPath(path).getURI())) {
// TODO: TEST/FIX THIS! https://github.com/samtools/htsjdk/issues/944
// TODO -- warning I don't think this can work, the buffered input stream screws up position
is = new GZIPInputStream(new BufferedInputStream(is));
Expand Down
8 changes: 7 additions & 1 deletion src/main/java/htsjdk/tribble/util/FTPHelper.java
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@

import java.io.IOException;
import java.io.InputStream;
import java.net.URISyntaxException;
import java.net.URL;

/**
Expand Down Expand Up @@ -35,7 +36,12 @@ public long getContentLength() throws IOException {

@Override
public InputStream openInputStream() throws IOException {
String file = url.getPath();
String file = null;
try {
file = url.toURI().getPath();
} catch (URISyntaxException e) {
throw new IOException(e);
}
FTPClient ftp = FTPUtils.connect(url.getHost(), url.getUserInfo(), null);
ftp.pasv();
ftp.retr(file);
Expand Down
45 changes: 25 additions & 20 deletions src/main/java/htsjdk/tribble/util/ParsingUtils.java
Original file line number Diff line number Diff line change
Expand Up @@ -23,17 +23,20 @@
*/
package htsjdk.tribble.util;

import htsjdk.io.HtsPath;
import htsjdk.io.IOPath;
import htsjdk.samtools.seekablestream.SeekablePathStream;
import htsjdk.samtools.seekablestream.SeekableStreamFactory;
import htsjdk.samtools.util.IOUtil;

import java.awt.Color;
import java.io.File;
import java.io.FileInputStream;
import java.io.IOException;
import java.io.InputStream;
import java.lang.reflect.Constructor;
import java.net.MalformedURLException;
import java.net.URI;
import java.net.URL;
import java.net.URLEncoder;
import java.nio.channels.SeekableByteChannel;
import java.nio.file.Files;
import java.util.*;
Expand All @@ -49,7 +52,7 @@ public class ParsingUtils {
private static URLHelperFactory urlHelperFactory = RemoteURLHelper::new;

// HTML 4.1 color table, + orange and magenta
private static Map<String, String> colorSymbols = new HashMap();
private static final Map<String, String> colorSymbols = new HashMap<>();

static {
colorSymbols.put("white", "FFFFFF");
Expand Down Expand Up @@ -81,32 +84,35 @@ public static InputStream openInputStream(String path)
return openInputStream(path, null);
}

static private final Set<String> URL_SCHEMES = new HashSet<>(Arrays.asList("http", "ftp", "https"));

/**
* open an input stream from the given path and wrap the raw byte stream with a wrapper if given
*
* the wrapper will only be applied to paths that are not http, https, ftp, or file, i.e. any {@link java.nio.file.Path}
* using a custom filesystem plugin
* the wrapper will only be applied to paths that are
* 1. not local files
* 2. not being handled by the legacy http(s)/ftp providers
* i.e. any {@link java.nio.file.Path} using a custom FileSystem plugin
* @param uri a uri like string
* @param wrapper to wrap the input stream in, may be used to implement caching or prefetching, etc
* @return An inputStream appropriately created from uri and conditionally wrapped with wrapper (only in certain cases)
* @throws IOException when stream cannot be opened against uri
*/
public static InputStream openInputStream(final String uri, final Function<SeekableByteChannel, SeekableByteChannel> wrapper)
throws IOException {

final InputStream inputStream;

if (URL_SCHEMES.stream().anyMatch(uri::startsWith)) {
inputStream = getURLHelper(new URL(uri)).openInputStream();
} else if (!IOUtil.hasScheme(uri)) {
File file = new File(uri);
inputStream = Files.newInputStream(file.toPath());
final IOPath path = new HtsPath(uri);
if(path.hasFileSystemProvider()){
if(path.isPath()) {
return path.getScheme().equals("file")
? Files.newInputStream(path.toPath())
: new SeekablePathStream(path.toPath(), wrapper);
} else {
throw new IOException("FileSystemProvider for path " + path.getRawInputString() + " exits but failed to " +
" create path. \n" + path.getToPathFailureReason());
}
} else if( SeekableStreamFactory.canBeHandledByLegacyUrlSupport(uri)){
return getURLHelper(new URL(uri)).openInputStream();
} else {
inputStream = new SeekablePathStream(IOUtil.getPath(uri), wrapper);
throw new IOException("No FileSystemProvider available to handle path: " + path.getRawInputString());
}
return inputStream;
}

public static <T> String join(String separator, Collection<T> objects) {
Expand Down Expand Up @@ -402,10 +408,9 @@ private static Color hexToColor(String string) {
}

public static boolean resourceExists(String resource) throws IOException{

boolean remoteFile = resource.startsWith("http://") || resource.startsWith("https://") || resource.startsWith("ftp://");
boolean remoteFile = SeekableStreamFactory.isBeingHandledByLegacyUrlSupport(resource);
if (remoteFile) {
URL url = null;
URL url;
try {
url = new URL(resource);
} catch (MalformedURLException e) {
Expand Down
3 changes: 3 additions & 0 deletions src/test/java/htsjdk/io/HtsPathUnitTest.java
Original file line number Diff line number Diff line change
Expand Up @@ -92,6 +92,8 @@ public Object[][] validHtsPath() {
{"gcs://abucket/bucket", "gcs://abucket/bucket", false, false},
{"gendb://somegdb", "gendb://somegdb", false, false},
{"chr1:1-100", "chr1:1-100", false, false},
{"ftp://broad.org/file", "ftp://broad.org/file", false, false},
{"ftp://broad.org/with%20space", "ftp://broad.org/with%20space", false, false},

//**********************************************************************************************
// Valid URIs which ARE valid NIO URIs (there *IS* an installed file system provider), but are
Expand Down Expand Up @@ -167,6 +169,7 @@ public Object[][] invalidHtsPath() {
// the nul character is rejected on all of the supported platforms in both local
// filenames and URIs, so use it to test HtsPath constructor failure on all platforms
{"\0"},
{"ftp://broad.org/file with space"} // this has a non-file scheme but isn't encoded properly
};
}

Expand Down
Loading

0 comments on commit 40e3a8b

Please sign in to comment.