Skip to content

Commit

Permalink
GH-2979: Test for surrogates after escape processing
Browse files Browse the repository at this point in the history
  • Loading branch information
afs committed Feb 24, 2025
1 parent 2c1f02f commit 55694c2
Show file tree
Hide file tree
Showing 9 changed files with 65 additions and 14 deletions.
15 changes: 15 additions & 0 deletions jena-arq/Grammar/Final/README.txt
Original file line number Diff line number Diff line change
@@ -1 +1,16 @@
Final-for-spec-publication versions of the grammar.

sparql_10-final.jj - SPARQL 1.0 "sparql_10.jj" ("main.jj" after cpp)

sparql_11-final.jj - SPARQL 1.1 "sparql_11.jj" ("main.jj" after cpp)

sparql-main-11.jj - SPARQL 1.1 "main.jj" (com.hp)


sparql_11-dev-final.jj - End SPARQL 1.1 development. (org.apache.jena.graph "main.jj" at SPARQLParser11)
sparql_11-dev-final.txt - jjdoc
tokens_11.txt - Tokens file.

-- Coming soon.
sparql_12-final.jj - SPARQL 1.2 "sparql_12.jj" ("main.jj" after cpp)
sparql-main-12.jj - SPARQL 1.2 "main.jj"
4 changes: 2 additions & 2 deletions jena-arq/Grammar/arq.jj
Original file line number Diff line number Diff line change
Expand Up @@ -1910,8 +1910,8 @@ String String() : { Token t ; String lex ; }
| t = <STRING_LITERAL_LONG1> { lex = stripQuotes3(t.image) ; }
| t = <STRING_LITERAL_LONG2> { lex = stripQuotes3(t.image) ; }
)
{ checkString(lex, t.beginLine, t.beginColumn) ;
lex = unescapeStr(lex, t.beginLine, t.beginColumn) ;
{ lex = unescapeStr(lex, t.beginLine, t.beginColumn) ;
checkString(lex, t.beginLine, t.beginColumn) ;
return lex ;
}
}
Expand Down
3 changes: 1 addition & 2 deletions jena-arq/Grammar/main.jj
Original file line number Diff line number Diff line change
Expand Up @@ -2621,8 +2621,7 @@ String String() : { Token t ; String lex ; }
| t = <STRING_LITERAL_LONG1> { lex = stripQuotes3(t.image) ; }
| t = <STRING_LITERAL_LONG2> { lex = stripQuotes3(t.image) ; }
)
{ checkString(lex, t.beginLine, t.beginColumn) ;
lex = unescapeStr(lex, t.beginLine, t.beginColumn) ;
{ lex = prepareLexicalForm(lex, t.beginLine, t.beginColumn) ;
return lex ;
}
}
Expand Down
4 changes: 2 additions & 2 deletions jena-arq/Grammar/sparql_12.jj
Original file line number Diff line number Diff line change
Expand Up @@ -1625,8 +1625,8 @@ String String() : { Token t ; String lex ; }
| t = <STRING_LITERAL_LONG1> { lex = stripQuotes3(t.image) ; }
| t = <STRING_LITERAL_LONG2> { lex = stripQuotes3(t.image) ; }
)
{ checkString(lex, t.beginLine, t.beginColumn) ;
lex = unescapeStr(lex, t.beginLine, t.beginColumn) ;
{ lex = unescapeStr(lex, t.beginLine, t.beginColumn) ;
checkString(lex, t.beginLine, t.beginColumn) ;
return lex ;
}
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -209,10 +209,15 @@ protected Node stripSign(Node node) {
}

protected void checkString(String string, int line, int column) {
// Checks for bare surrogate pairs.
for ( int i = 0 ; i < string.length() ; i++ ) {
// Not "codePointAt" which does surrogate processing.
char ch = string.charAt(i);
// Check surrogate pairs are pairs.

// Check surrogate pairs are in pairs. Pairs are high-low.
if ( Character.isLowSurrogate(ch) )
throw new QueryParseException("Bad surrogate pair (low surrogate without high surrogate)", line, column);

if ( Character.isHighSurrogate(ch) ) {
i++;
if ( i == string.length() )
Expand All @@ -221,8 +226,6 @@ protected void checkString(String string, int line, int column) {
if ( !Character.isLowSurrogate(ch1) ) {
throw new QueryParseException("Bad surrogate pair (high surrogate not followed by low surrogate)", line, column);
}
} else if ( Character.isLowSurrogate(ch) ) {
throw new QueryParseException("Bad surrogate pair (low surrogate without high surrogate)", line, column);
}
}
}
Expand Down Expand Up @@ -345,7 +348,6 @@ protected static String stripChars(String s, int n) {

protected Var createVariable(String s, int line, int column) {
s = s.substring(1); // Drop the marker

// This is done by the parser input stream nowadays.
// s = unescapeCodePoint(s, line, column);
// Check \ u did not put in any illegals.
Expand All @@ -361,6 +363,8 @@ protected Node createTripleTerm(Node s, Node p, Node o, int line, int column) {
/**
 * Processes a quoted IRI token and resolves it.
 *
 * The token is passed through {@code stripQuotes} and {@code unescapeUnicode},
 * then validated with {@code checkString} before being handed to
 * {@code resolveIRI}.
 *
 * @param iriStr the IRI token text as it appeared in the query
 * @param line   line of the token, used for error reporting
 * @param column column of the token, used for error reporting
 * @return the result of {@code resolveIRI} on the unquoted, unescaped string
 */
protected String resolveQuotedIRI(String iriStr, int line, int column) {
    final String unquoted = stripQuotes(iriStr);
    final String unescaped = unescapeUnicode(unquoted, line, column);
    // Validate surrogates only after escape processing, so that surrogates
    // written as Unicode escapes are checked as well as raw ones.
    checkString(unescaped, line, column);
    return resolveIRI(unescaped, line, column);
}

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -7442,8 +7442,8 @@ final public Node BooleanLiteral() throws ParseException {
jj_consume_token(-1);
throw new ParseException();
}
checkString(lex, t.beginLine, t.beginColumn) ;
lex = unescapeStr(lex, t.beginLine, t.beginColumn) ;
lex = unescapeStr(lex, t.beginLine, t.beginColumn) ;
checkString(lex, t.beginLine, t.beginColumn) ;
{if ("" != null) return lex ;}
throw new Error("Missing return statement in function");
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -5837,8 +5837,8 @@ final public Node BooleanLiteral() throws ParseException {
jj_consume_token(-1);
throw new ParseException();
}
checkString(lex, t.beginLine, t.beginColumn) ;
lex = unescapeStr(lex, t.beginLine, t.beginColumn) ;
lex = unescapeStr(lex, t.beginLine, t.beginColumn) ;
checkString(lex, t.beginLine, t.beginColumn) ;
{if ("" != null) return lex ;}
throw new Error("Missing return statement in function");
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,7 @@
package org.apache.jena.sparql.syntax;

import static org.junit.jupiter.api.Assertions.assertThrows;
import static org.junit.jupiter.api.Assertions.assertTrue;

import org.junit.jupiter.api.Test;

Expand All @@ -40,6 +41,36 @@ private static void silent(Runnable action) {
LogCtl.withLevel(loggerSPARQL, "fatal", action);
}

// Single backslash: this is a Java string escape, so the query string itself contains a raw surrogate.
@Test
public void syntax_unicode_raw_surrogate_uri() {
    // The IRI contains a lone high surrogate (U+D800) as a raw character.
    final String queryString = "SELECT * { <http://example/\uD800> ?p ?o}";
    QueryParseException parseError =
            assertThrows(QueryParseException.class, () -> testParse(queryString));
    String message = parseError.getMessage();
    assertTrue(message.contains("surrogate"));
}

@Test
public void syntax_unicode_raw_surrogate_string() {
    // The string literal contains a lone high surrogate (U+D800) as a raw character.
    final String queryString = "SELECT * { ?s ?p '\uD800' }";
    QueryParseException parseError =
            assertThrows(QueryParseException.class, () -> testParse(queryString));
    String message = parseError.getMessage();
    assertTrue(message.contains("surrogate"));
}

// Double backslash so the query string has an escape in it.
@Test
public void syntax_unicode_escaped_surrogate_uri() {
    // The query text carries the six-character escape sequence for U+D800
    // inside the IRI; it is not a raw surrogate character.
    final String queryString = "SELECT * { <http://example/\\uD800> ?p ?o}";
    QueryParseException parseError =
            assertThrows(QueryParseException.class, () -> testParse(queryString));
    String message = parseError.getMessage();
    assertTrue(message.contains("surrogate"));
}

@Test
public void syntax_unicode_escaped_surrogate_strings() {
    // The query text carries the six-character escape sequence for U+D800
    // inside the string literal; it is not a raw surrogate character.
    final String queryString = "SELECT * { ?s ?p '\\uD800'}";
    QueryParseException parseError =
            assertThrows(QueryParseException.class, () -> testParse(queryString));
    String message = parseError.getMessage();
    assertTrue(message.contains("surrogate"));
}

// Parses the query string with QueryFactory.create. The returned value is
// discarded, so the only thing a caller can observe is a thrown exception.
private static void testParse(String string) {
QueryFactory.create(string);
}

// IRI with square brackets in the fragment part.
// NOTE(review): testParseIRIs is not visible in this view; presumably it
// checks how the IRI is handled by the parser -- confirm against its definition.
@Test public void syntax_uri_brackets_1() {
testParseIRIs("<http://example/#[]>");
}
Expand Down
2 changes: 2 additions & 0 deletions jena-cmds/src/test/java/arq/rdftests.java
Original file line number Diff line number Diff line change
Expand Up @@ -81,6 +81,8 @@ public class rdftests extends CmdGeneral
RIOT.getContext().set(RIOT.symTurtleDirectiveStyle, "sparql");
}

// Test runners are in jena-arq, package org.apache.jena.arq.junit.runners

public static void main(String...argv) {
try { new rdftests(argv).mainRun(); }
catch (TerminationException ex) {
Expand Down

0 comments on commit 55694c2

Please sign in to comment.