diff --git a/.github/workflows/ci_test.yml b/.github/workflows/linux_ci.yml
similarity index 100%
rename from .github/workflows/ci_test.yml
rename to .github/workflows/linux_ci.yml
diff --git a/.github/workflows/windows_ci.yml b/.github/workflows/windows_ci.yml
new file mode 100644
index 00000000..2409e1f1
--- /dev/null
+++ b/.github/workflows/windows_ci.yml
@@ -0,0 +1,21 @@
+# Builds and tests Koral with Maven on a Windows runner for every push and pull request.
+name: CI Windows
+
+on: [push, pull_request]
+
+jobs:
+  build:
+    runs-on: windows-latest
+    steps:
+      # Must run before the checkout: disables CRLF conversion so the working tree keeps LF.
+      - name: Set git to use LF
+        run: |
+          git config --global core.autocrlf false
+          git config --global core.eol lf
+      # v2 of this action targets a retired Node runtime; v4 is the maintained major.
+      - name: Checkout Koral
+        uses: actions/checkout@v4
+      # setup-java v2+ requires an explicit distribution.
+      # NOTE(review): 'zulu' is chosen on the assumption that setup-java@v1 installed Zulu builds - confirm.
+      - name: Set up JDK 11
+        uses: actions/setup-java@v4
+        with:
+          distribution: 'zulu'
+          java-version: '11'
+
+      - name: Build and install Koral
+        run: mvn --file pom.xml install
diff --git a/Changes b/Changes
index 3e98baaa..c7704005 100644
--- a/Changes
+++ b/Changes
@@ -1,5 +1,11 @@
+0.42 2024-01-11
+ - [feature] Support #REG in C2 (bodmer)
+ - [bugfix] Fix comma in #BED in C2 (bodmer)
+
0.41 2023-09-13
+ - [feature] Finish support for CQP (irimia)
- [bugfix] Disallow empty regex in PQ+ (diewald)
+ - [cleanup] Change of groupID.
0.40 2023-07-26
- [feature] Initial support for CQP
diff --git a/README.md b/README.md
index d7370080..280ea328 100644
--- a/README.md
+++ b/README.md
@@ -132,7 +132,7 @@ J. Bingel and N. Diewald, "KoralQuery – a General Corpus Query Protocol," in P
## Authorship
Koral and KoralQuery were developed by Joachim Bingel,
-Nils Diewald, Michael Hanl and Eliza Margaretha at the
+Nils Diewald, Michael Hanl, Eliza Margaretha, and Franck Bodmer at the
[Leibniz Institute for the German Language (IDS)](https://www.ids-mannheim.de/),
member of the [Leibniz Association](https://www.leibniz-gemeinschaft.de).
@@ -142,7 +142,7 @@ The ANTLR grammars for parsing ANNIS QL and COSMAS II QL were developed by
Thomas Krause (HU Berlin) and Franck Bodmer (IDS Mannheim), respectively.
Minor adaptations of those grammars were implemented by the Koral authors.
-The authors wish to thank Piotr Bański, Franck Bodmer, Elena Frick and
+The authors wish to thank Piotr Bański, Elena Frick and
Carsten Schnober for their valuable input.
## License
diff --git a/pom.xml b/pom.xml
index 6bdd5574..7b89fc9a 100644
--- a/pom.xml
+++ b/pom.xml
@@ -2,9 +2,9 @@
xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
4.0.0
- de.ids_mannheim.korap
+ de.ids-mannheim.korap.koral
Koral
- 0.41.0
+ 0.42.0
jar
Koral
https://korap.ids-mannheim.de
@@ -39,11 +39,11 @@
UTF-8
- 2.15.2
+ 2.16.1
4.9.3
3.5.3
11
- 2.20.0
+ 2.22.1
@@ -83,7 +83,7 @@
com.google.guava
guava
- 32.1.2-jre
+ 33.0.0-jre
com.fasterxml.jackson.core
@@ -145,7 +145,7 @@
org.slf4j
slf4j-api
- 2.0.9
+ 2.0.11
eu.clarin.sru.fcs
@@ -160,7 +160,7 @@
org.apache.maven.plugins
maven-clean-plugin
- 3.3.1
+ 3.3.2
@@ -177,6 +177,7 @@
**/c2ps_opIN.java
**/c2ps_opOV.java
**/c2ps_opPROX.java
+ **/c2ps_opREG.java
**/c2ps_opWF.java
**/c2ps_optCase.java
**/.gitignore
@@ -190,7 +191,7 @@
org.apache.maven.plugins
maven-surefire-plugin
- 3.1.2
+ 3.2.5
false
@@ -201,7 +202,7 @@
org.apache.maven.plugins
maven-compiler-plugin
- 3.11.0
+ 3.12.1
diff --git a/src/main/antlr/cosmas/c2ps.g b/src/main/antlr/cosmas/c2ps.g
index c264ea63..269f27f5 100644
--- a/src/main/antlr/cosmas/c2ps.g
+++ b/src/main/antlr/cosmas/c2ps.g
@@ -1,16 +1,26 @@
- // * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * *
-// //
-// COSMAS II zeilenorientierten Suchanfragesprache (C2 plain syntax) //
-// globale Grammatik (ruft lokale c2ps_x.g Grammatiken auf). //
-// 17.12.12/FB //
-// v-0.6 //
-// TODO: //
-// - se1: Einsetzen des Default-Operators in den kumulierten AST. //
+// * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * *
+//
+// COSMAS II zeilenorientierten Suchanfragesprache (C2 plain syntax)
+// globale Grammatik (ruft lokale c2ps_x.g Grammatiken auf).
+// 17.12.12/FB
+// v-0.6
+// TODO:
+// - se1: Einsetzen des Default-Operators in den kumulierten AST.
+//
+// v0.7 - 25.07.23/FB
+// - added: #REG(x)
+// v0.8 - 06.11.23/FB
+// - accepts #BED(searchword, sa) : comma attached to searchword.
+// - more generally: comma at end of searchword, which is not enclosed by "..." is
+// excluded from searchword now.
+// - a comma inside a searchword is accepted if enclosed by "...".
+//
// * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * *
grammar c2ps;
options { output=AST; backtrack=true; k=5;}
+// tokens that will appear as node names in the resulting AST:
tokens {C2PQ; OPBED; OPTS; OPBEG; OPEND; OPNHIT; OPALL; OPLEM; OPPROX;
ARG1; ARG2;
OPWF; OPLEM; OPANNOT;
@@ -21,6 +31,7 @@ tokens {C2PQ; OPBED; OPTS; OPBEG; OPEND; OPNHIT; OPALL; OPLEM; OPPROX;
OPNOT;
OPEXPR1;
OPMORPH; OPELEM;
+ OPREG;
}
@header {package de.ids_mannheim.korap.query.parse.cosmas;}
@@ -63,19 +74,50 @@ WS : (' '|'\r'|'\n')+ {skip();};
fragment DISTVALUE
: ('0' .. '9')+ (':' ('0'..'9')+)? ;
-
+
+fragment DISTTYPE // 30.11.23/FB
+ : ('w'|'s'|'p'|'t');
+
+fragment DISTDIR // 30.11.23/FB
+ : ('+'|'-');
+
+/* old version (before 30.11.23/FB)
fragment DIST
: ('+'|'-')? (DISTVALUE ('w'|'s'|'p'|'t') | ('w'|'s'|'p'|'t') DISTVALUE);
-
+*/
+
+// accept these 3 options in any order.
+// afterwards, we will have to check if any of them is missing.
+// 30.11.23/FB
+
+fragment DIST // 30.11.23/FB
+ : (DISTDIR | DISTTYPE | DISTVALUE )+;
+
fragment GROUP
: ('min' | 'max');
-OP_PROX : ('/' | '%') DIST (',' DIST)* (',' GROUP)? ;
+// version (12.01.24/FB):
+// accept correct and incorrect chars till the next blank, that way the incorrect chars
+// are submitted to the sub-grammar c2ps_opPROX where they are detected and an appropriate
+// error message is inserted:
+OP_PROX : ('/' | '%') DIST (~' ')*;
+
+// old version: accepts only correctly formulated options, so the incorrect
+// chars/options are hard to detect:
+// OP_PROX : ('/' | '%') DIST (',' DIST)* (',' GROUP)? ;
OP_IN : '#IN' | '#IN(' OP_IN_OPTS? ')' ;
OP_OV : '#OV' | '#OV(' OP_OV_OPTS? ')' ;
+// #REG(abc['"]) or #REG('abc\'s') or #REG("abc\"s"):
+// Lexer token covering the whole #REG(...) operator including its argument; three alternatives:
+OP_REG : '#REG(' ' '* '\'' ('\\\''|~'\'')+ '\'' (' ')* ')' // 1. argument enclosed in '...'; \' is accepted inside.
+ | // 2. argument enclosed in "..."; \" is accepted inside:
+ '#REG(' ' '* '"' ('\\"'|~'"')+ '"' (' ')* ')'
+ | // 3. unquoted argument: must not start with ', " or a blank and runs up to the first ')':
+ '#REG(' ' '* ~('\''|'"'|' ') (~(')'))* ')';
+
// EAVEXP wird hier eingesetzt für eine beliebige Sequenz von Zeichen bis zu ')'.
fragment OP_IN_OPTS
: EAVEXPR ;
@@ -111,8 +153,23 @@ SEARCHLEMMA
: '&' SEARCHWORD1 ; // rewrite rules funktionieren im lexer nicht: -> ^(OPLEM $SEARCHWORD1.text);
// SEARCHWORD2: schluckt Blanks. Diese müssen nachträglich als Wortdelimiter erkannt werden.
+
+// current syntax, drawback is:
+// e.g. aber, -> SEARCHWORD1 = "aber,"
+// but correct should be -> SEARCHWORD1 = "aber"
+//SEARCHWORD1
+// : ~('"' | ' ' | '#' | ')' | '(' )+ ;
+
+// new syntax (06.11.23/FB):
+// accept for searchword1 either a single ',' or exclude trailing ',' from searchword1:
+// E.g. Haus, -> searchword1=Haus.
+// For a ',' inside a search word, see searchword2.
+// exclude trailing "," from searchword1.
SEARCHWORD1
- : ~('"' | ' ' | '#' | ')' | '(' )+ ;
+ : (',' | ~('"' | ' ' | '#' | ')' | '(' | ',')+) ;
+
+// searchword2 accepts a ',' inside a searchword enclosed by "...".
+// E.g. "Haus,tür": OK.
SEARCHWORD2
: '"' (~('"') | '\\"')+ '"' ;
@@ -226,7 +283,7 @@ searchLabel
op2 : (opPROX | opIN | opOV | opAND | opOR | opNOT) ;
// AST with Options for opPROX is returned by c2ps_opPROX.check():
-opPROX : OP_PROX -> ^(OPPROX {c2ps_opPROX.check($OP_PROX.text, $OP_PROX.index)} );
+opPROX : OP_PROX -> ^(OPPROX {c2ps_opPROX.check($OP_PROX.text, $OP_PROX.pos)} );
opIN : OP_IN -> {c2ps_opIN.check($OP_IN.text, $OP_IN.index)};
@@ -241,7 +298,7 @@ opNOT : ('nicht' | 'NICHT' | 'not' | 'NOT') -> ^(OPNOT);
// OP1: Suchoperatoren mit 1 Argument:
// -----------------------------------
-op1 : opBEG | opEND | opNHIT | opALL | opBED;
+op1 : opBEG | opEND | opNHIT | opALL | opBED | opREG;
// #BED(searchExpr, B).
// B muss nachträglich in einer lokalen Grammatik überprüft werden.
@@ -259,3 +316,5 @@ opEND : ( '#END(' | '#RECHTS(' ) searchExpr ')' -> ^(OPEND searchExpr) ;
opNHIT : ( '#NHIT(' | '#INKLUSIVE(' ) searchExpr ')' -> ^(OPNHIT searchExpr) ;
opALL : ( '#ALL(' | '#EXKLUSIVE(' ) searchExpr ')' -> ^(OPALL searchExpr) ;
+
+opREG : OP_REG -> ^(OPREG {c2ps_opREG.encode($OP_REG.text, OPREG)}) ;
diff --git a/src/main/antlr/cosmas/c2ps_opPROX.g b/src/main/antlr/cosmas/c2ps_opPROX.g
index f7a42f5d..1569d1a5 100644
--- a/src/main/antlr/cosmas/c2ps_opPROX.g
+++ b/src/main/antlr/cosmas/c2ps_opPROX.g
@@ -1,9 +1,10 @@
// * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * *
-// //
-// lokale Grammatik der COSMAS II zeilenorientierten Suchanfragesprache (= c2ps) //
-// für den Abstandsoperator /w... und %w... //
-// v-1.0 - 07.12.12/FB //
-// //
+//
+// lokale Grammatik der COSMAS II zeilenorientierten Suchanfragesprache (= c2ps)
+// für den Abstandsoperator /w... und %w...
+// v-1.0 - 07.12.12/FB
+// v-1.1 - 30.11.23/FB opPROX accepts any order of direction, measure and value.
+//
// * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * *
grammar c2ps_opPROX;
@@ -15,10 +16,15 @@ tokens { PROX_OPTS;
DIST_LIST; DIST; RANGE; VAL0;
MEAS; // measure
DIR; PLUS; MINUS; BOTH;
- GRP; MIN; MAX; }
-@header {package de.ids_mannheim.korap.query.parse.cosmas;}
+ GRP; MIN; MAX;
+ }
+
+@header {package de.ids_mannheim.korap.query.parse.cosmas;
+ import de.ids_mannheim.korap.util.C2RecognitionException;}
+
@lexer::header {package de.ids_mannheim.korap.query.parse.cosmas;}
+
// * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * *
//
// PROX-Lexer
@@ -28,6 +34,12 @@ tokens { PROX_OPTS;
DISTVALUE
: ('0' .. '9')+ ;
+// Catches everything (at the end of the option sequence) that should not appear inside the prox. options:
+// e.g. /w5umin -> remain = 'umin'.
+// Form: optional ',' + one start letter + any run of non-blank chars. The start letter set leaves out a,i,m,p,s,t,w,x (both cases) - presumably because they occur in w/s/p/t and min/max; TODO confirm.
+PROX_REMAIN
+ : (',')? ('b'..'h'|'j'..'l'|'n'|'o'|'q'|'r'|'u'|'v'|'y'|'z'|'B'..'H'|'J'..'L'|'N'|'O'|'Q'|'R'|'U'|'V'|'Y'|'Z') (~ ' ')* ;
+
// * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * *
//
// PROX-Parser
@@ -35,36 +47,49 @@ DISTVALUE
// * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * *
-opPROX : proxTyp proxDist (',' proxDist)* (',' proxGroup)?
+opPROX[int pos] : proxTyp proxDist[$pos] (',' proxDist[$pos])* (',' proxGroup)? (proxRemain[$pos])?
- -> ^(PROX_OPTS {$proxTyp.tree} ^(DIST_LIST proxDist+) {$proxGroup.tree});
+ -> ^(PROX_OPTS {$proxTyp.tree} ^(DIST_LIST proxDist+) {$proxGroup.tree} {$proxRemain.tree});
-proxTyp : '/' -> ^(TYP PROX) // klassischer Abstand.
- | '%' -> ^(TYP EXCL); // ausschließender Abstand.
+proxRemain[int pos] : PROX_REMAIN
+
+ -> { c2ps_opPROX.checkRemain(DIST, $PROX_REMAIN.text, $pos) };
+
+proxTyp : '/' -> ^(TYP PROX) // klassischer Abstand.
+ | '%' -> ^(TYP EXCL); // ausschließender Abstand.
// proxDist: e.g. +5w or -s0 or /w2:4 etc.
// kein proxDirection? hier, weil der Default erst innerhalb von Regel proxDirection erzeugt werden kann.
-proxDist: proxDirection (v1=proxDistValue m1=proxMeasure | m2=proxMeasure v2=proxDistValue)
- -> {$v1.tree != null}? ^(DIST {$proxDirection.tree} {$v1.tree} {$m1.tree})
- -> ^(DIST {$proxDirection.tree} {$v2.tree} {$m2.tree});
+// new rule: accepts options in any order:
+// count each option type and find out if any one is missing or occurs multiple times.
+// 28.11.23/FB
+
+proxDist[int pos]
+@init{ int countM=0; int countD=0; int countV=0;}
+ :
+ ((m=proxMeasure {countM++;})|(d=proxDirection {countD++;})|(v=proxDistValue {countV++;}) )+
+
+ -> {c2ps_opPROX.encodeDIST(DIST, DIR, $d.tree, $m.tree, $v.tree, $proxDist.text, countD, countM, countV, $pos)};
+
+
+// new rule accepts only '+' and '-'; default tree for direction is
+// set in c2ps_opPROX.encodeDIST() now.
+// 28.11.23/FB
proxDirection
- : (p='+'|m='-')? -> {$p != null}? ^(DIR PLUS)
- -> {$m != null}? ^(DIR MINUS)
- -> ^(DIR BOTH) ;
-/*
-proxDistValue // proxDistMin ( ':' proxDistMax)? ;
- : (m1=proxDistMin -> ^(DIST_RANGE VAL0 $m1)) (':' m2=proxDistMax -> ^(DIST_RANGE $m1 $m2))? ;
-*/
-proxDistValue // proxDistMin ( ':' proxDistMax)? ;
- : (m1=proxDistMin ) (':' m2=proxDistMax)?
+ : '+' -> ^(DIR PLUS)
+ | '-' -> ^(DIR MINUS);
+
+proxDistValue : (m1=proxDistMin ) (':' m2=proxDistMax)?
-> {$m2.text != null}? ^(RANGE $m1 $m2)
- -> ^(RANGE VAL0 $m1);
-
+ -> ^(RANGE VAL0 $m1);
+
+// mentioning >1 measures will be checked/rejected in c2ps_opPROX.encodeDIST().
+
proxMeasure
- : (m='w'|m='s'|m='p'|m='t') -> ^(MEAS $m);
+ : (meas='w'|meas='s'|meas='p'|meas='t') -> ^(MEAS $meas) ;
proxDistMin
: DISTVALUE;
@@ -73,6 +98,8 @@ proxDistMax
: DISTVALUE;
proxGroup
- : 'min' -> ^(GRP MIN)
- | 'max' -> ^(GRP MAX);
+ : ('min'|'MIN') -> ^(GRP MIN)
+ | ('max'|'MAX') -> ^(GRP MAX);
+
+
\ No newline at end of file
diff --git a/src/main/java/de/ids_mannheim/korap/query/parse/cosmas/c2ps_opBED.java b/src/main/java/de/ids_mannheim/korap/query/parse/cosmas/c2ps_opBED.java
index fb9df4e8..35f64379 100644
--- a/src/main/java/de/ids_mannheim/korap/query/parse/cosmas/c2ps_opBED.java
+++ b/src/main/java/de/ids_mannheim/korap/query/parse/cosmas/c2ps_opBED.java
@@ -17,7 +17,8 @@ public static Tree check (String input, int index) {
c2ps_opBEDParser.opBEDOpts_return c2PQReturn = null;
/*
- System.out.println("check opBED: " + index + ": " + input);
+ System.out.format("opBED: check: input='%s', index=%d.\n", input, index);
+ System.out.format("opBED: tokens ='%s'.\n", tokens.toString());
System.out.flush();
*/
@@ -68,7 +69,7 @@ public static Tree checkTPos (String input, int index) {
public static void main (String args[]) throws Exception {
- String[] input = { ",sa,se,-ta,-te/pa,-pe)", ",sa)", ",/pa,-pe)" };
+ String[] input = { ",sa,se,-ta,-te/pa,-pe)", ",sa)", ",/pa,-pe)"};
Tree tree;
for (int i = 0; i < input.length; i++) {
diff --git a/src/main/java/de/ids_mannheim/korap/query/parse/cosmas/c2ps_opPROX.java b/src/main/java/de/ids_mannheim/korap/query/parse/cosmas/c2ps_opPROX.java
index 2a5b1634..62297195 100644
--- a/src/main/java/de/ids_mannheim/korap/query/parse/cosmas/c2ps_opPROX.java
+++ b/src/main/java/de/ids_mannheim/korap/query/parse/cosmas/c2ps_opPROX.java
@@ -3,29 +3,177 @@
import org.antlr.runtime.*;
import org.antlr.runtime.tree.*;
+import de.ids_mannheim.korap.query.serialize.Antlr3AbstractQueryProcessor;
+import de.ids_mannheim.korap.query.serialize.util.Antlr3DescriptiveErrorListener;
+import de.ids_mannheim.korap.query.serialize.util.StatusCodes;
+import de.ids_mannheim.korap.util.*;
+
/*
* parses Opts of PROX: /w3:4,s0,min or %w3:4,s0,min.
*/
-public class c2ps_opPROX
+public class c2ps_opPROX
{
-
- public static Tree check (String input, int index) {
+ final static boolean bDebug = false;
+
+ // type of an Error CommonToken:
+ final static int typeERROR = 1;
+ // Prox error codes defined in StatusCodes.java.
+
+ private static CommonTree buildErrorTree(String text, int errCode, int typeDIST, int pos)
+
+ {
+ CommonTree
+ errorTree = new CommonTree(new CommonToken(typeDIST, "DIST"));
+ CommonTree
+ errorNode = new CommonTree(new CommonToken(typeERROR, "ERROR"));
+ CommonTree
+ errorPos = new CommonTree(new CommonToken(typeERROR, String.valueOf(pos)));
+ CommonTree
+ errorCode = new CommonTree(new CommonToken(typeERROR, String.valueOf(errCode)));
+ CommonTree
+ errorMes;
+ String
+ mess;
+
+ switch( errCode )
+ {
+ case StatusCodes.ERR_PROX_MEAS_NULL:
+ mess = String.format("Abstandsoperator an der Stelle '%s' es fehlt eine der folgenden Angaben: w,s,p!", text);
+ errorMes = new CommonTree(new CommonToken(typeERROR, mess));
+ break;
+ case StatusCodes.ERR_PROX_MEAS_TOOGREAT:
+ mess = String.format("Abstandsoperator an der Stelle '%s': Bitte nur 1 der folgenden Angaben einsetzen: w,s,p! " +
+ "Falls Mehrfachangabe erwünscht, müssen diese durch Kommata getrennt werden (z.B.: /+w2,s0).", text);
+ errorMes = new CommonTree(new CommonToken(typeERROR, mess));
+ break;
+ case StatusCodes.ERR_PROX_VAL_NULL:
+ mess = String.format("Abstandsoperator an der Stelle '%s': Bitte einen numerischen Wert einsetzen (z.B. /+w5)! ", text);
+ errorMes = new CommonTree(new CommonToken(typeERROR, mess));
+ break;
+ case StatusCodes.ERR_PROX_VAL_TOOGREAT:
+ mess = String.format("Abstandsoperator an der Stelle '%s': Bitte nur 1 numerischen Wert einsetzen (z.B. /+w5)! ", text);
+ errorMes = new CommonTree(new CommonToken(typeERROR, mess));
+ break;
+ case StatusCodes.ERR_PROX_DIR_TOOGREAT:
+ mess = String.format("Abstandsoperator an der Stelle '%s': Bitte nur 1 Angabe '+' oder '-' oder keine! ", text);
+ errorMes = new CommonTree(new CommonToken(typeERROR, mess));
+ break;
+ case StatusCodes.ERR_PROX_WRONG_CHARS:
+ mess = String.format("Abstandsoperator an der Stelle '%s': unbekannte Abstandsoption(en)!", text);
+ errorMes = new CommonTree(new CommonToken(typeERROR, mess));
+ break;
+ default:
+ mess = String.format("Abstandsoperator an der Stelle '%s': unbekannter Fehler. Korrekte Syntax z.B.: /+w2 oder /w10,s0.", text);
+
+ errorMes = new CommonTree(new CommonToken(typeERROR, mess));
+ }
+
+ errorTree.addChild(errorNode);
+ errorNode.addChild(errorPos);
+ errorNode.addChild(errorCode);
+ errorNode.addChild(errorMes);
+
+ return errorTree;
+ }
+
+ /* encodeDIST():
+ * - returns a CommonTree built from the Direction/Measure/Distance value.
+ * - accepts options in any order.
+ * - creates CommonTree in that order: Direction .. Distance value .. Measure.
+ * - sets default direction to BOTH if not set yet.
+ * - unfortunately, in ANTLR3 it seems that there is no way inside the Parser Grammar to get
+ * the absolute token position from the beginning of the query. Something like $ProxDist.pos or
+ * $start.pos is not available, so we have no info in this function about the position at which
+ * an error occurs.
+ * - For multiple prox options, e.g. /w2,s2,p0, this function is called 3 times.
+ * Arguments:
+ * countD : how many occurrences of direction: + or - or nothing. If 0 insert the default BOTH.
+ * countM : how many occurrences of measure: w,s,p,t: should be 1.
+ * countV : how many occurrences of distance value: should be 1.
+ * 28.11.23/FB
+ */
+
+ /**
+  * Assembles the DIST subtree for a single distance option from its parsed parts,
+  * or an error tree if a part is missing or given more than once.
+  *
+  * @param typeDIST token type for the DIST node.
+  * @param typeDIR  token type for the default DIR/BOTH nodes.
+  * @param ctDir    direction subtree (CommonTree); replaced by (DIR BOTH) when countD is 0.
+  * @param ctMeas   measure subtree (CommonTree).
+  * @param ctVal    distance value subtree (CommonTree).
+  * @param text     scanned text of the option, used in error messages.
+  * @param countD   number of directions seen; more than 1 is an error.
+  * @param countM   number of measures seen; 0 or more than 1 is an error.
+  * @param countV   number of distance values seen; 0 or more than 1 is an error.
+  * @param pos      position value passed on to the error tree.
+  * @return (DIST direction value measure), or the tree built by buildErrorTree().
+  */
+ public static Object encodeDIST(int typeDIST, int typeDIR, Object ctDir, Object ctMeas, Object ctVal, String text,
+         int countD, int countM, int countV, int pos)
+ {
+     CommonTree direction = (CommonTree)ctDir;
+     final CommonTree measure = (CommonTree)ctMeas;
+     final CommonTree value = (CommonTree)ctVal;
+
+     if( bDebug )
+     {
+         System.err.printf("Debug: encodeDIST: scanned input='%s' countM=%d countD=%d countV=%d pos=%d.\n",
+                 text, countM, countD, countV, pos);
+     }
+
+     // measure first, then value: each must occur exactly once.
+     if( countM == 0 )
+     {
+         return buildErrorTree(text, StatusCodes.ERR_PROX_MEAS_NULL, typeDIST, pos);
+     }
+     else if( countM > 1 )
+     {
+         return buildErrorTree(text, StatusCodes.ERR_PROX_MEAS_TOOGREAT, typeDIST, pos);
+     }
+     else if( countV == 0 )
+     {
+         return buildErrorTree(text, StatusCodes.ERR_PROX_VAL_NULL, typeDIST, pos);
+     }
+     else if( countV > 1 )
+     {
+         return buildErrorTree(text, StatusCodes.ERR_PROX_VAL_TOOGREAT, typeDIST, pos);
+     }
+
+     // direction is optional, but must not be repeated:
+     if( countD > 1 )
+     {
+         return buildErrorTree(text, StatusCodes.ERR_PROX_DIR_TOOGREAT, typeDIST, pos);
+     }
+
+     if( countD == 0 )
+     {
+         // no direction given: fall back to the default (DIR BOTH).
+         final CommonTree defaultDir = new CommonTree(new CommonToken(typeDIR, "DIR"));
+         defaultDir.addChild(new CommonTree(new CommonToken(typeDIR, "BOTH")));
+
+         if( bDebug )
+         {
+             System.err.printf("Debug: encodeDIST: tree for DIR: '%s'.\n", defaultDir.toStringTree());
+         }
+         direction = defaultDir;
+     }
+
+     // child order is direction, value, measure: the value has to precede the
+     // measure because the serialization expects it that way.
+     final CommonTree dist = new CommonTree(new CommonToken(typeDIST, "DIST"));
+     dist.addChild(direction);
+     dist.addChild(value);
+     dist.addChild(measure);
+
+     if( bDebug )
+     {
+         System.err.printf("Debug: encodeDIST: returning '%s'.\n", dist.toStringTree());
+     }
+
+     return dist;
+ } // encodeDIST
+
+ /* checkRemain:
+ *
+ * - the chars in proxRemain are not allowed in prox. options.
+ * - return an error tree.
+ * 12.01.24/FB
+ */
+
+ /**
+  * Turns the unexpected trailing characters of a distance operator into an error tree.
+  *
+  * @param typeDIST   token type for the enclosing DIST node.
+  * @param proxRemain the characters that are not allowed in the prox. options.
+  * @param pos        position value passed on to the error tree.
+  * @return the error tree for StatusCodes.ERR_PROX_WRONG_CHARS.
+  */
+ public static Object checkRemain(int typeDIST, String proxRemain, int pos)
+ {
+     if( bDebug )
+     {
+         System.out.printf("Debug: checkRemain: '%s' at pos %d.\n", proxRemain, pos);
+     }
+
+     final CommonTree wrongCharsError =
+         buildErrorTree(proxRemain, StatusCodes.ERR_PROX_WRONG_CHARS, typeDIST, pos);
+     return wrongCharsError;
+ }
+
+ public static Tree check (String input, int pos) throws RecognitionException
+ {
ANTLRStringStream ss = new ANTLRStringStream(input);
c2ps_opPROXLexer lex = new c2ps_opPROXLexer(ss);
CommonTokenStream tokens = new CommonTokenStream(lex);
c2ps_opPROXParser g = new c2ps_opPROXParser(tokens);
c2ps_opPROXParser.opPROX_return c2PQReturn = null;
- /*
- System.out.println("check opPROX:" + index + ": " + input);
- System.out.flush();
- */
+ if( bDebug )
+ System.out.printf("check opPROX: pos=%d input='%s'.\n", pos, input);
try {
- c2PQReturn = g.opPROX();
- }
+ c2PQReturn = g.opPROX(pos);
+ }
catch (RecognitionException e) {
e.printStackTrace();
}
@@ -37,7 +185,19 @@ public static Tree check (String input, int index) {
return tree;
}
-
+ public static boolean checkFalse()
+ {
+ // Takes no input and always yields false.
+ return false; // marked "testwise" by the author, i.e. kept for testing purposes.
+ }
+
+ /**
+  * Debugging hook: in debug mode, traces whether a measure object was supplied.
+  *
+  * @param measure object to inspect; may be null.
+  * @return always true, regardless of the argument.
+  */
+ public static boolean checkMeasure( Object measure)
+ {
+     // Trace only when bDebug is set, so that regular parsing does not write to stderr.
+     if( bDebug )
+     {
+         System.err.printf("Debug: checkMeasure: measure = %s.\n",
+                 measure == null ? "null" : "not null");
+     }
+     return true;
+ }
+
/*
* main testprogram:
*/
diff --git a/src/main/java/de/ids_mannheim/korap/query/parse/cosmas/c2ps_opREG.java b/src/main/java/de/ids_mannheim/korap/query/parse/cosmas/c2ps_opREG.java
new file mode 100644
index 00000000..a798647a
--- /dev/null
+++ b/src/main/java/de/ids_mannheim/korap/query/parse/cosmas/c2ps_opREG.java
@@ -0,0 +1,235 @@
+package de.ids_mannheim.korap.query.parse.cosmas;
+
+import org.antlr.runtime.*;
+import org.antlr.runtime.tree.*;
+
+import de.ids_mannheim.korap.query.serialize.util.Antlr3DescriptiveErrorListener;
+import de.ids_mannheim.korap.util.StringUtils;
+
+/*
+ * 1. transforms and encodes a regular COSMAS II like expression #REG(regexpr)
+ * into an AST -> encode().
+ * 2. transforms tree into the corresponding Koral:token/Koral:term, like:
+ * e.g. #REG(abc[']?s) ->
+ * {
+ * "@type": "koral:term",
+ * "match": "match:eq",
+ * "type" : "type:regex",
+ * "key" : "abc[']?s",
+ * "layer": "orth"
+ * }...
+ *
+ * - see doc: http://korap.github.io/Koral/
+ * - generation of koral:term -> processOPREG().
+ * 06.09.23/FB
+ */
+
+public class c2ps_opREG
+
+{
+ private static boolean DEBUG = false;
+
+ /*
+ * encode():
+ *
+ * input = e.g. "#REG('abc(d|e)*')" -> return AST = (OPREG abc(d|e)*):
+ *
+ * Returned String: no enclosing "..." needed, so no escaping of " nor \ needed.
+ * 06.09.23/FB
+ */
+ public static Tree encode (String input, int tokenType)
+
+ {
+ if( DEBUG )
+ {
+ System.out.printf("opREG.encode: input = >>%s<<, token type=%d.\n", input, tokenType);
+ System.out.flush();
+ }
+
+ if( input.substring(0, 5).compareToIgnoreCase("#REG(") != 0 || input.charAt(input.length()-1) != ')' )
+ {
+ // error: '#REG(' and ')' not found: return input unchanged.
+ if( DEBUG ) System.out.printf("opREG.encode: unexpected input = >>%s<<: nothing encoded!\n", input);
+ return new CommonTree(new CommonToken(tokenType, input));
+ }
+
+
+ StringBuffer sb = new StringBuffer(input.substring(5));
+ sb.deleteCharAt(sb.length()-1);
+
+ // #REG("a"), #REG(a), #REG('a') -> >>a<<.
+ // enclosing ".." are appended at the end of this function.
+ // a. remove blanks around ".." and '..',
+ // e.g. a. #REG( ' abc ' ) -> #REG(' abc ').
+
+ StringUtils.removeBlanksAtBothSides(sb);
+
+ if( sb.charAt(0) == '\'' || sb.charAt(0) == '"')
+ {
+ // remove pairwise at both ends.
+ sb.deleteCharAt(0);
+ if( sb.charAt(sb.length()-1) == '\'' || sb.charAt(sb.length()-1) == '"' )
+ sb.deleteCharAt(sb.length()-1);
+ }
+
+ // b. remove blanks inside '..' or "..",
+ // E.g. #REG(' abc ') -> #REG('abc'):
+
+ StringUtils.removeBlanksAtBothSides(sb);
+
+ /* unescape >>'<<, >>"<< and >>\<<.
+ * e.g. #REG('that\'s') -> "that\'s" -> >>that's<<.
+ */
+
+ for(int i=0; i>%s<<.\n", sb.toString());
+
+ return new CommonTree(new CommonToken(tokenType, sb.toString()));
+
+ } // encode
+
+ /*
+ * printTokens:
+ * Notes:
+ * - must build a separate CommonTokenStream here, because
+ * tokens.fill() will consume all tokens.
+ * - prints to stdout list of tokens from lexer.
+ * - mainly for debugging.
+ * 14.09.23/FB
+ *
+ */
+
+ private static void printTokens(String query, Antlr3DescriptiveErrorListener errorListener)
+
+ {
+ ANTLRStringStream
+ ss = new ANTLRStringStream(query);
+ c2psLexer
+ lex = new c2psLexer(ss);
+ org.antlr.runtime.CommonTokenStream
+ tokens = new org.antlr.runtime.CommonTokenStream(lex); // v3
+
+ lex.setErrorReporter(errorListener);
+
+ // get all tokens from lexer:
+ tokens.fill();
+
+ System.out.printf("opREG.check: no. of tokens = %d.\n", tokens.size());
+ for(int i=0; i>#REG(\" a"s\")<<.
+ lex.setErrorReporter(errorListener);
+ ((c2psParser) g).setErrorReporter(errorListener);
+
+ if( DEBUG )
+ {
+ //System.out.format("opREG.check: input='%s', index=%d.\n", query, index);
+ printTokens(query, errorListener);
+ System.out.flush();
+ }
+
+
+ try {
+ c2psParser.c2ps_query_return
+ c2Return = ((c2psParser) g).c2ps_query(); // statt t().
+
+ // AST Tree anzeigen:
+ tree = (Tree) c2Return.getTree();
+ //if (DEBUG)
+ // System.out.printf("opREG.check: tree = '%s'.\n", tree.toStringTree());
+ }
+ catch (RecognitionException e) {
+ System.err.printf("c2po_opREG.check: Recognition Exception!\n");
+ }
+
+ return tree;
+ } // check
+
+
+ /**
+ * main: debugging driver.
+ * Passes each sample query below to check(query, 0) and prints the query
+ * and the string form of the returned tree to stdout.
+ * The trailing comments record the expected outcome per sample; the Java string
+ * escapes have to be resolved to obtain the query text that is actually parsed.
+ */
+
+ public static void main (String args[]) throws Exception
+
+ {
+ String input[] = { "#REG(abc)",
+ "#REG(def's)",
+ "#REG( def's )", // all blanks should be removed.
+ "#REG( ' def\\'s ' )", // same
+ "#REG( \" def's \" )", // same
+ "#REG(abc[\"]ef)",
+ "#REG('abc')", // author's note: "' missing: generates syntax error". NOTE(review): this literal contains both quotes - confirm the intended input.
+ "#REG('abc\')", // intended user input = #REG('abc\') : OK, nothing escaped. NOTE(review): in Java source \' is just ', so this literal is #REG('abc').
+ "#REG('abc\'')", // intended user input = #REG('abc\') : OK, nothing escaped. NOTE(review): this literal is #REG('abc'').
+ "#REG('abc\\')", // User input = #REG('abc\') : OK, same behavior: \\ == \.
+ "#REG((a|b))", // broken input, should use ".." or '..'.
+ "#REG('(a|b)')", // OK.
+ "#REG(\"(a|b)\")", // OK.
+ "#REG(^[A-Z]+abc[\']*ung$)",
+ "#REG('ab(cd|ef)*')",
+ "#REG('abc(def|g)*[)(]')",
+ "#REG(\"abc(def|g)*[)(]\")",
+ "#REG('abc[\"]')", // User input = #REG('abc["]') : OK, needs escape => #REG("...\"...")
+ "#REG(\"abc[\\\"]\")", // intended user input = #REG("abc["]") : broken because of 2nd " -> syntax error. NOTE(review): literal is identical to the next entry, i.e. #REG("abc[\"]").
+ "#REG(\"abc[\\\"]\")", // User input = #REG("abc[\"]"): OK, already escaped by user => #REG("...\"...")
+ "#REG(\"abc[\\\\\"]\")" // User input = #REG("abc[\\"]") : broken. with escaped " => #REG("...\"...")
+ };
+ Tree tree;
+ // parse the samples one by one; the second argument of check() is always 0 here.
+ for (int i = 0; i < input.length; i++)
+ {
+ System.out.printf("c2ps_opREG: Parsing input %02d: >>%s<<\n", i, input[i]);
+ tree = check(input[i], 0);
+ System.out.printf("c2ps_opREG: tree %02d: >>%s<<.\n\n", i, tree.toStringTree());
+ }
+
+
+ } // main
+
+}
diff --git a/src/main/java/de/ids_mannheim/korap/query/serialize/Cosmas2QueryProcessor.java b/src/main/java/de/ids_mannheim/korap/query/serialize/Cosmas2QueryProcessor.java
index 69a6293a..285a3e71 100644
--- a/src/main/java/de/ids_mannheim/korap/query/serialize/Cosmas2QueryProcessor.java
+++ b/src/main/java/de/ids_mannheim/korap/query/serialize/Cosmas2QueryProcessor.java
@@ -1,5 +1,6 @@
package de.ids_mannheim.korap.query.serialize;
+import de.ids_mannheim.korap.query.parse.cosmas.c2ps_opPROX; // error codes.
import de.ids_mannheim.korap.query.object.ClassRefCheck;
import de.ids_mannheim.korap.query.object.ClassRefOp;
import de.ids_mannheim.korap.query.object.CosmasPosition;
@@ -15,14 +16,19 @@
import de.ids_mannheim.korap.query.serialize.util.KoralObjectGenerator;
import de.ids_mannheim.korap.query.serialize.util.ResourceMapper;
import de.ids_mannheim.korap.query.serialize.util.StatusCodes;
+import de.ids_mannheim.korap.util.StringUtils;
import org.antlr.runtime.ANTLRStringStream;
+import org.antlr.runtime.FailedPredicateException;
import org.antlr.runtime.RecognitionException;
+import org.antlr.runtime.Token;
import org.antlr.runtime.tree.Tree;
import org.antlr.v4.runtime.tree.ParseTree;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
+import com.fasterxml.jackson.core.JsonProcessingException;
+import com.fasterxml.jackson.databind.ObjectMapper;
import com.google.common.collect.HashBasedTable;
import com.google.common.collect.Table;
@@ -127,6 +133,82 @@ public class Cosmas2QueryProcessor extends Antlr3AbstractQueryProcessor {
public static Pattern wildcardPlusPattern = Pattern.compile("([+])");
public static Pattern wildcardQuestionPattern = Pattern.compile("([?])");
+ /**
+ * reportErrorsinTree:
+ * - traverse the AST tree and search for nodes of type ERROR, they contain
+ * the errCode, the error message and the error char position.
+ * - returns true if an error node is found in the tree referenced by 'node'.
+ * - adds error code, error position and error message to the error list.
+ * Arguments:
+ * node : might be null if it has been reset previously by another error handler.
+ * @param node
+ * @return true: error node was found,
+ * false: no error node found.
+ * 19.12.23/FB
+ */
+
+ private boolean reportErrorsinTree(Tree node)
+
+ {
+ final String func = "reportErrorsinTree";
+
+ //System.err.printf("Debug: %s: '%s' has %d children.\n",
+ // func, node.getText(), node.getChildCount());
+ if( node == null )
+ {
+ // System.err.printf("Warning: %s: node == null: no action requested.\n", func);
+ return false;
+ }
+
+ if( node.getType() == 1 && node.getText().compareTo("ERROR") == 0 )
+ {
+ // error node found:
+ // child[0] : error pos.
+ // child[1] : error code.
+ // child[2] : error message, containing offending string.
+ /*
+ System.err.printf("Debug: %s: child[0]='%s' child[1]='%s' child[2]='%s'.\n", func,
+ node.getChild(0) != null ? node.getChild(0).getText() : "???",
+ node.getChild(1) != null ? node.getChild(1).getText() : "???",
+ node.getChild(2) != null ? node.getChild(2).getText() : "???");
+ */
+
+ int
+ errPos = node.getChild(0) != null ? Integer.parseInt(node.getChild(0).getText()) : 0;
+ int
+ errCode = node.getChild(1) != null ? Integer.parseInt(node.getChild(1).getText()) : StatusCodes.ERR_PROX_UNKNOWN;
+ String
+ errMess = node.getChild(2) != null ? node.getChild(2).getText() : "Genaue Fehlermeldung nicht auffindbar.";
+
+ ArrayList