Skip to content

Commit

Permalink
[SPARK-50269][SQL][TESTS] Further improve collation support testing for various collations
Browse files Browse the repository at this point in the history

### What changes were proposed in this pull request?

Extend the collation-related unit and end-to-end SQL tests to cover various collations in addition to the 4 common collations already used.
This is a follow-up PR to #48608, where it was decided to split the changes into separate PRs. This PR includes the additional test suites mentioned in the comments of the original PR.

### Why are the changes needed?

Further expand collation testing coverage for various collations, incorporating different languages, scripts, case/accent sensitivity, etc.

### Does this PR introduce _any_ user-facing change?

No

### How was this patch tested?

By extending the existing collation-related unit and end-to-end SQL tests.

### Was this patch authored or co-authored using generative AI tooling?

No

Closes #48799 from dejankrak-db/collation-additional-tests.

Authored-by: Dejan Krakovic <[email protected]>
Signed-off-by: Max Gekk <[email protected]>
  • Loading branch information
dejankrak-db authored and MaxGekk committed Nov 8, 2024
1 parent e4638c8 commit 5d75799
Showing 1 changed file with 24 additions and 0 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -94,6 +94,7 @@ public void testCompare() throws SparkException {
assertCompare("bxx", "bü", "UNICODE", 1);
assertCompare("äü", "bü", "UNICODE_CI", -1);
assertCompare("bxx", "bü", "UNICODE_CI", 1);
assertCompare("cČć", "ČćC", "SR_CI_AI", 0);
// Case variation.
assertCompare("AbCd", "aBcD", "UTF8_BINARY", -1);
assertCompare("ABCD", "abcd", "UTF8_LCASE", 0);
Expand All @@ -104,6 +105,7 @@ public void testCompare() throws SparkException {
assertCompare("AbCδ", "ABCΔ", "UTF8_LCASE", 0);
assertCompare("äBCd", "ÄBCD", "UNICODE", -1);
assertCompare("Ab́cD", "AB́CD", "UNICODE_CI", 0);
assertCompare("ÈÉÊË", "EeEe", "AF_CI_AI", 0);
// One-to-many case mapping (e.g. Turkish dotted I).
assertCompare("i\u0307", "İ", "UTF8_BINARY", -1);
assertCompare("İ", "i\u0307", "UTF8_BINARY", 1);
Expand Down Expand Up @@ -334,6 +336,7 @@ public void testContains() throws SparkException {
assertContains("2 Kelvin.", "2 Kelvin", "UTF8_LCASE", true);
assertContains("2 Kelvin.", "2 Kelvin", "UTF8_LCASE", true);
assertContains("The KKelvin.", "KKelvin,", "UTF8_LCASE", false);
assertContains("abčćd", "ABCCD", "SR_CI_AI", true);
// Case variation.
assertContains("aBcDe", "bcd", "UTF8_BINARY", false);
assertContains("aBcDe", "BcD", "UTF8_BINARY", true);
Expand All @@ -352,6 +355,7 @@ public void testContains() throws SparkException {
assertContains("aBcDe", "BĆD", "UTF8_LCASE", false);
assertContains("aBcDe", "abćde", "UNICODE_CI", false);
assertContains("aBcDe", "AbĆdE", "UNICODE_CI", false);
assertContains("abEEE", "Bèêë", "AF_CI_AI", true);
// One-to-many case mapping (e.g. Turkish dotted I).
assertContains("i\u0307", "i", "UNICODE_CI", false);
assertContains("i\u0307", "\u0307", "UNICODE_CI", false);
Expand Down Expand Up @@ -580,6 +584,11 @@ public void testStartsWith() throws SparkException {
assertStartsWith("2 Kelvin.", "2 Kelvin", "UTF8_LCASE", true);
assertStartsWith("2 Kelvin.", "2 Kelvin", "UTF8_LCASE", true);
assertStartsWith("KKelvin.", "KKelvin,", "UTF8_LCASE", false);
assertStartsWith("Ћао", "Ца", "sr_Cyrl_CI_AI", false);
assertStartsWith("Ћао", "ћа", "sr_Cyrl_CI_AI", true);
assertStartsWith("Ćao", "Ca", "SR_CI", false);
assertStartsWith("Ćao", "Ca", "SR_CI_AI", true);
assertStartsWith("Ćao", "Ća", "SR", true);
// Case variation.
assertStartsWith("aBcDe", "abc", "UTF8_BINARY", false);
assertStartsWith("aBcDe", "aBc", "UTF8_BINARY", true);
Expand Down Expand Up @@ -832,6 +841,11 @@ public void testEndsWith() throws SparkException {
assertEndsWith("The 2 Kelvin", "2 Kelvin", "UTF8_LCASE", true);
assertEndsWith("The 2 Kelvin", "2 Kelvin", "UTF8_LCASE", true);
assertEndsWith("The KKelvin", "KKelvin,", "UTF8_LCASE", false);
assertEndsWith("Ћевапчићи", "цици", "sr_Cyrl_CI_AI", false);
assertEndsWith("Ћевапчићи", "чИЋи", "sr_Cyrl_CI_AI", true);
assertEndsWith("Ćevapčići", "cici", "SR_CI", false);
assertEndsWith("Ćevapčići", "cici", "SR_CI_AI", true);
assertEndsWith("Ćevapčići", "čići", "SR", true);
// Case variation.
assertEndsWith("aBcDe", "cde", "UTF8_BINARY", false);
assertEndsWith("aBcDe", "cDe", "UTF8_BINARY", true);
Expand Down Expand Up @@ -1393,6 +1407,8 @@ public void testInitCap() throws SparkException {
assertInitCap("ÄBĆΔE", "UTF8_LCASE", "Äbćδe");
assertInitCap("ÄBĆΔE", "UNICODE", "Äbćδe");
assertInitCap("ÄBĆΔE", "UNICODE_CI", "Äbćδe");
assertInitCap("êéfgh", "AF_CI_AI", "Êéfgh");
assertInitCap("öoAÄ", "DE_CI_AI", "Öoaä");
// Case-variable character length
assertInitCap("İo", "UTF8_BINARY", "İo", "I\u0307o");
assertInitCap("İo", "UTF8_LCASE", "İo");
Expand Down Expand Up @@ -1580,6 +1596,8 @@ public void testStringInstr() throws SparkException {
assertStringInstr("aaadS", "Ds", "UTF8_LCASE", 4);
assertStringInstr("aaadS", "Ds", "UNICODE", 0);
assertStringInstr("aaadS", "Ds", "UNICODE_CI", 4);
assertStringInstr("aaaČŠčšcs", "cs", "SR", 8);
assertStringInstr("aaaČŠčšcs", "cs", "SR_CI_AI", 4);
// Advanced tests.
assertStringInstr("test大千世界X大千世界", "大千", "UTF8_BINARY", 5);
assertStringInstr("test大千世界X大千世界", "大千", "UTF8_LCASE", 5);
Expand Down Expand Up @@ -2038,6 +2056,7 @@ public void testStringReplace() throws SparkException {
assertStringReplace("aBc世abc", "b", "12", "UNICODE_CI", "a12c世a12c");
assertStringReplace("a世Bcdabcd", "bC", "", "UNICODE_CI", "a世dad");
assertStringReplace("repl世ace", "Pl", "", "UNICODE_CI", "re世ace");
assertStringReplace("abcčšdabĆŠscd", "cs", "", "SR_CI_AI", "abcdabscd");
// One-to-many case mapping (e.g. Turkish dotted I).
assertStringReplace("abi̇12", "i", "X", "UNICODE_CI", "abi̇12");
assertStringReplace("abi̇12", "\u0307", "X", "UNICODE_CI", "abi̇12");
Expand Down Expand Up @@ -2231,6 +2250,8 @@ public void testStringLocate() throws SparkException {
assertStringLocate("aa", "Aaads", 1, "UTF8_LCASE", 1);
assertStringLocate("aa", "Aaads", 1, "UNICODE", 2);
assertStringLocate("aa", "Aaads", 1, "UNICODE_CI", 1);
assertStringLocate("ćČ", "CćČČćCČĆČcČcććČč", 3, "SR", 14);
assertStringLocate("ćČ", "CćČČćCČĆČcČcććČč", 3, "SR_CI_AI", 3);
// Advanced tests.
assertStringLocate("界x", "test大千世界X大千世界", 1, "UTF8_BINARY", 0);
assertStringLocate("界X", "test大千世界X大千世界", 1, "UTF8_BINARY", 8);
Expand Down Expand Up @@ -2581,6 +2602,7 @@ public void testSubstringIndex() throws SparkException {
assertSubstringIndex("test大千世界X大千世界", "X", 1, "UNICODE_CI", "test大千世界");
assertSubstringIndex("test大千世界大千世界", "千", 2, "UNICODE_CI", "test大千世界大");
assertSubstringIndex("www||APACHE||org", "||", 2, "UNICODE_CI", "www||APACHE");
assertSubstringIndex("wwwèapacheËorg", "Ê", -3, "AF_CI_AI", "apacheËorg");
// One-to-many case mapping (e.g. Turkish dotted I).
assertSubstringIndex("abİo12", "i\u0307o", 1, "UNICODE_CI", "ab");
assertSubstringIndex("abİo12", "i\u0307o", -1, "UNICODE_CI", "12");
Expand Down Expand Up @@ -2803,6 +2825,7 @@ public void testStringTrim() throws SparkException {
assertStringTrim("UNICODE_CI", "asd", "A", "sd");
assertStringTrim("UNICODE_CI", "ASD", "a", "SD");
assertStringTrim("UNICODE_CI", "ddsXXXaa", "ASD", "XXX");
assertStringTrim("SR_CI_AI", "cSCšćČXXXsčšČŠsć", "čš", "XXX");
// One-to-many case mapping (e.g. Turkish dotted I).
assertStringTrim("UTF8_BINARY", "ẞaaaẞ", "ß", "ẞaaaẞ");
assertStringTrim("UTF8_BINARY", "ßaaaß", "ẞ", "ßaaaß");
Expand Down Expand Up @@ -3730,6 +3753,7 @@ public void testStringTranslate() throws SparkException {
assertStringTranslate("abcdef", "abcde", "123", "UTF8_LCASE", "123f");
assertStringTranslate("abcdef", "abcde", "123", "UNICODE", "123f");
assertStringTranslate("abcdef", "abcde", "123", "UNICODE_CI", "123f");
assertStringTranslate("abcdëÈêf", "ÊèË", "123", "AF_CI", "abcd321f");
// One-to-many case mapping (e.g. Turkish dotted I).
assertStringTranslate("İ", "i\u0307", "xy", "UTF8_BINARY", "İ");
assertStringTranslate("İ", "i\u0307", "xy", "UTF8_LCASE", "İ");
Expand Down

0 comments on commit 5d75799

Please sign in to comment.