From b23c36aab4d012023fdf8733c85006c2f6f1995a Mon Sep 17 00:00:00 2001 From: Louis Bergelson Date: Wed, 13 Dec 2023 13:15:28 -0500 Subject: [PATCH] Improving the last changes to HtsPath (#1694) * fixing edge cases as well as documentation and tests from the change to HtsPath in #1693 --- src/main/java/htsjdk/io/HtsPath.java | 42 ++++++++--- src/test/java/htsjdk/io/HtsPathUnitTest.java | 77 +++++++++++++++++++- 2 files changed, 103 insertions(+), 16 deletions(-) diff --git a/src/main/java/htsjdk/io/HtsPath.java b/src/main/java/htsjdk/io/HtsPath.java index c16090b450..f8e5cef5c6 100644 --- a/src/main/java/htsjdk/io/HtsPath.java +++ b/src/main/java/htsjdk/io/HtsPath.java @@ -27,7 +27,7 @@ * encoding/escaping. * * For example, a URI that contains a scheme and has an embedded "#" in the path will be treated as a URI - * having a fragment delimiter. If the URI contains an scheme, the "#" will be escaped and the encoded "#" + * having a fragment delimiter. If the URI contains no scheme, the "#" will be escaped and the encoded "#" * will be interpreted as part of the URI path. * * There are 3 succeeding levels of input validation/conversion: @@ -69,6 +69,7 @@ */ public class HtsPath implements IOPath, Serializable { private static final long serialVersionUID = 1L; + private static final String HIERARCHICAL_SCHEME_SEPARATOR = "://"; private final String rawInputString; // raw input string provided by th user; may or may not have a scheme private final URI uri; // working URI; always has a scheme ("file" if not otherwise specified) @@ -255,7 +256,7 @@ private URI getURIForString(final String pathString) { } catch (URISyntaxException uriException) { //check that the uri wasn't a badly encoded absolute uri of some sort //if you don't do this it will be treated as a badly formed file:// url - assertNoNonFileScheme(pathString, uriException); + assertNoProblematicScheme(pathString, uriException); // the input string isn't a valid URI; assume its a local (non-URI) file reference, and // use the URI resulting from the corresponding Path @@ -282,19 +283,36 @@ private URI getURIForString(final String pathString) { } /** - * check that there isn't a non file scheme at the start of the path - * @param pathString - * @param cause + * Check for problems associated with the presence of a hierarchical scheme. + * + * It's better to reject cases like `://` or `ftp://I forgot to encode this` than to treat them as relative file paths + * It's almost certainly an error on the users part instead of an atttempt to intentionally reference a file named + * `file:///workingidr/ftp:/I forgot to encode this` + * + * Note this is only meant to be called in the case of a URLSyntaxException already having occured during initial + * parsing of the URI + * + * @param pathString the path being examined + * @param cause the original failure reason */ - private static void assertNoNonFileScheme(String pathString, URISyntaxException cause){ - final String[] split = pathString.split(":"); + static void assertNoProblematicScheme(String pathString, URISyntaxException cause){ + if(pathString.equals(HIERARCHICAL_SCHEME_SEPARATOR)){ + throw new IllegalArgumentException(HIERARCHICAL_SCHEME_SEPARATOR + " is not a valid path.", cause); + } + + final String[] split = pathString.split(HIERARCHICAL_SCHEME_SEPARATOR, -1); + final String scheme = split[0]; + + if(split.length == 2 && pathString.endsWith(HIERARCHICAL_SCHEME_SEPARATOR)) { + throw new IllegalArgumentException("A path consisting of only a scheme is not allowed: " + pathString, cause); + } + if(split.length > 1){ - if(split[0] == null || split[0].isEmpty()){ - throw new IllegalArgumentException("Malformed url " + pathString + " includes an empty scheme." + - "\nCheck that it is fully encoded.", cause); + if(scheme == null || scheme.isEmpty()){ + throw new IllegalArgumentException("Malformed path " + pathString + " includes an empty scheme.", cause); } - if(!split[0].equals("file")){ - throw new IllegalArgumentException("Malformed url " + pathString + " includes a scheme: " + split[0] + ":// but was an invalid URI." + + if(!scheme.equals("file")){ + throw new IllegalArgumentException("Malformed path " + pathString + " includes a scheme: " + scheme + ":// but was an invalid URI." + "\nCheck that it is fully encoded.", cause); } } diff --git a/src/test/java/htsjdk/io/HtsPathUnitTest.java b/src/test/java/htsjdk/io/HtsPathUnitTest.java index c5ffe2c67d..b7ea525b42 100644 --- a/src/test/java/htsjdk/io/HtsPathUnitTest.java +++ b/src/test/java/htsjdk/io/HtsPathUnitTest.java @@ -21,8 +21,6 @@ import java.nio.file.Path; import java.nio.file.Paths; import java.nio.file.ProviderNotFoundException; -import java.util.HashMap; -import java.util.Map; import java.util.Optional; public class HtsPathUnitTest extends HtsjdkTest { @@ -114,7 +112,7 @@ public Object[][] validHtsPath() { //***************************************************************************************** // Reference that contain characters that require URI-encoding. If the input string is presented - // with no scheme, it will be be automatically encoded by HtsPath, otherwise it + // with no scheme, it will be automatically encoded by HtsPath, otherwise it // must already be URI-encoded. //***************************************************************************************** @@ -126,6 +124,8 @@ public Object[][] validHtsPath() { {"file:project/gvcf-pcr/23232_1#1/1.g.vcf.gz", "file:project/gvcf-pcr/23232_1#1/1.g.vcf.gz", true, false}, {"file:/project/gvcf-pcr/23232_1#1/1.g.vcf.gz", "file:/project/gvcf-pcr/23232_1#1/1.g.vcf.gz", true, false}, {"file:///project/gvcf-pcr/23232_1%231/1.g.vcf.g", "file:///project/gvcf-pcr/23232_1%231/1.g.vcf.g", true, true}, + + {"http://host/path://", "http://host/path://", false, false} }; } @@ -169,7 +169,15 @@ public Object[][] invalidHtsPath() { // the nul character is rejected on all of the supported platforms in both local // filenames and URIs, so use it to test HtsPath constructor failure on all platforms {"\0"}, - {"ftp://broad.org/file with space"} // this has a non-file scheme but isn't encoded properly + + // this has a non-file scheme but isn't encoded properly, best to reject these with + // a helpful error message than to continue on and treat it as a file:// path + {"ftp://broad.org/file with space"}, + + // if you have a scheme you need something with it + {"file://"}, + {"http://"} + }; } @@ -193,11 +201,14 @@ public Object[][] invalidPath() { {"unknownscheme://foobar"}, {"gendb://adb"}, + {"gcs://abucket/bucket"}, // URIs with schemes that are backed by an valid NIO provider, but for which the // scheme-specific part is not valid. {"file://nonexistent_authority/path/to/file.bam"}, // unknown authority "nonexistent_authority" + + }; } @@ -649,4 +660,62 @@ private String getLocalFileAsURIPathString(final Path localPath) { return localPath.toUri().normalize().getPath(); } + @DataProvider + public Object[][] getNonProblematic() { + return new Object[][]{ + // URI is unencoded but no problems with a scheme + {"file/ name-sare/ :wierd-"}, + {"hello there"}, + + //schemes schemes everywheret + {"file://://"}, + {"file://://://"}, + {"file://something://"}, + {"file://something://somoethingelse"}, + + // file scheme is deliberately ignored here since it's handled specially later + {"file://unencoded file names/ are/ok!"}, + + //these aren't invalid URIs so they should never be tested by this method, they'll pass it though + + {"eep/ee:e:e:ep"}, + {"file:///"} + }; + } + @Test(dataProvider = "getNonProblematic") + public void testAssertNoProblematicScheme(String path){ + HtsPath.assertNoProblematicScheme(path, null); + } + + @DataProvider + public Object[][] getProblematic(){ + return new Object[][]{ + + // This is the primary use case, to detect unencoded uris that were intended to be encoded. + // Note that assertNoProblematicScheme doesn't check for issues constructing the URI itself + // it is only called after a URI parsing exception has already occured. + {"http://forgot.com/to encode"}, + {"ftp://server.com/file name.txt"}, + + {"://"}, + {"://://"}, + {"://://://"}, + + //this is technically a valid file name, but it seems very unlikely that anyone would do this delierately + //better to call it out + {"://forgotmyscheme"}, + + //invalid URI, needs the rest of the path + {"file://"}, + {"http://"}, + + //This gets rejected but it should never reach here because it's not an invalid URI + {"http://thisIsOkButItwouldNeverGetHere/something?file=thisone#righthere"}, + }; + } + + @Test(dataProvider = "getProblematic", expectedExceptions = IllegalArgumentException.class) + public void testAssertNoProblematicSchemeRejectedCases(String path){ + HtsPath.assertNoProblematicScheme(path, null); + } }