Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

ICU-22503 Add support for property Indic_Conjunct_Break #3049

Merged
merged 1 commit into from
Jul 26, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion .bazeliskrc
Original file line number Diff line number Diff line change
Expand Up @@ -6,4 +6,4 @@
# for running Bazel commands while ensuring, through configuration, that only a
# specific version of Bazel is executed.

USE_BAZEL_VERSION=7.1.1
USE_BAZEL_VERSION=7.2.1
2,047 changes: 1,026 additions & 1,021 deletions icu4c/source/common/propname_data.h

Large diffs are not rendered by default.

4,465 changes: 2,234 additions & 2,231 deletions icu4c/source/common/uchar_props_data.h

Large diffs are not rendered by default.

33 changes: 32 additions & 1 deletion icu4c/source/common/unicode/uchar.h
Original file line number Diff line number Diff line change
Expand Up @@ -677,13 +677,19 @@ typedef enum UProperty {
* @draft ICU 75
*/
UCHAR_IDENTIFIER_STATUS=0x1019,
/**
* Enumerated property Indic_Conjunct_Break.
* Used in the grapheme cluster break algorithm in UAX #29.
* @draft ICU 76
*/
UCHAR_INDIC_CONJUNCT_BREAK=0x101A,
#endif // U_HIDE_DRAFT_API
#ifndef U_HIDE_DEPRECATED_API
/**
* One more than the last constant for enumerated/integer Unicode properties.
* @deprecated ICU 58 The numeric value may change over time, see ICU ticket #12420.
*/
UCHAR_INT_LIMIT=0x101A,
UCHAR_INT_LIMIT=0x101B,
#endif // U_HIDE_DEPRECATED_API

/** Bitmask property General_Category_Mask.
Expand Down Expand Up @@ -2729,6 +2735,31 @@ typedef enum UIndicSyllabicCategory {
U_INSC_REORDERING_KILLER,
} UIndicSyllabicCategory;

#ifndef U_HIDE_DRAFT_API
/**
 * Indic Conjunct Break constants.
 * These are the values of the enumerated property Indic_Conjunct_Break (InCB),
 * which is used in the grapheme cluster break algorithm in UAX #29.
 * See https://unicode.org/reports/tr44/#Indic_Conjunct_Break
 *
 * The constants carry no explicit initializers, so they are numbered
 * in declaration order starting at 0.
 *
 * @see UCHAR_INDIC_CONJUNCT_BREAK
 * @draft ICU 76
 */
typedef enum UIndicConjunctBreak {
/*
 * Note: UIndicConjunctBreak constants are parsed by preparseucd.py.
 * It matches lines like
 * U_INCB_<Unicode Indic_Conjunct_Break value name>
 * Keep one constant per line so that this matching keeps working.
 */

/**
 * Indic_Conjunct_Break=None.
 * @draft ICU 76
 */
U_INCB_NONE,
/**
 * Indic_Conjunct_Break=Consonant.
 * @draft ICU 76
 */
U_INCB_CONSONANT,
/**
 * Indic_Conjunct_Break=Extend.
 * @draft ICU 76
 */
U_INCB_EXTEND,
/**
 * Indic_Conjunct_Break=Linker.
 * @draft ICU 76
 */
U_INCB_LINKER,
} UIndicConjunctBreak;
#endif // U_HIDE_DRAFT_API

/**
* Vertical Orientation constants.
*
Expand Down
1 change: 1 addition & 0 deletions icu4c/source/common/uprops.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -728,6 +728,7 @@ static const IntProperty intProps[UCHAR_INT_LIMIT-UCHAR_INT_START]={
{ UPROPS_SRC_INSC, 0, 0, getInSC, layoutGetMaxValue },
{ UPROPS_SRC_VO, 0, 0, getVo, layoutGetMaxValue },
{ UPROPS_SRC_PROPSVEC, 0, static_cast<int32_t>(U_ID_STATUS_ALLOWED), getIDStatusValue, getMaxValueFromShift },
{ 0, UPROPS_INCB_MASK, UPROPS_INCB_SHIFT,defaultGetValue, defaultGetMaxValue },
};

U_CAPI int32_t U_EXPORT2
Expand Down
6 changes: 5 additions & 1 deletion icu4c/source/common/uprops.h
Original file line number Diff line number Diff line change
Expand Up @@ -127,7 +127,8 @@ namespace {
// Bits
// 31..26 Age major version (major=0..63)
// 25..24 Age minor version (minor=0..3)
// 23..15 reserved
// 23..17 reserved
// 16..15 Indic Conjunct Break
// 14..12 East Asian Width
// 11..10 3..1: Bits 9..0 = Script_Extensions index
// 3: Script value from Script_Extensions
Expand Down Expand Up @@ -158,6 +159,9 @@ inline constexpr uint8_t UPROPS_AGE_MINOR_MAX = 3;
inline constexpr uint32_t UPROPS_EA_MASK = 0x00007000;
inline constexpr int32_t UPROPS_EA_SHIFT = 12;

/** Indic_Conjunct_Break: two-bit field occupying bits 16..15. */
inline constexpr int32_t UPROPS_INCB_SHIFT = 15;
inline constexpr uint32_t UPROPS_INCB_MASK = static_cast<uint32_t>(0x3) << UPROPS_INCB_SHIFT;

/** Script_Extensions: mask includes Script */
inline constexpr uint32_t UPROPS_SCRIPT_X_MASK = 0x00000fff;

Expand Down
Binary file modified icu4c/source/data/in/pnames.icu
Binary file not shown.
Binary file modified icu4c/source/data/in/uprops.icu
Binary file not shown.
5 changes: 3 additions & 2 deletions icu4c/source/data/unidata/changes.txt
Original file line number Diff line number Diff line change
Expand Up @@ -103,6 +103,9 @@ export UNICODE_TOOLS=~/oss/unicodetools/mine/src
so that the makefiles see the new version number.
cd $ICU_OUT/icu4c
ICU_DATA_BUILDTOOL_OPTS=--include_uni_core_data CXXFLAGS="-DU_USING_ICU_NAMESPACE=0 -Wimplicit-fallthrough" CPPFLAGS="-DU_NO_DEFAULT_INCLUDE_UTF_HEADERS=1 -fsanitize=bounds" LDFLAGS=-fsanitize=bounds ../../src/icu4c/source/runConfigureICU --enable-debug --disable-release Linux/clang --prefix=/usr/local/google/home/mscherer/icu/mine/inst/icu4c > config.out 2>&1 ; tail config.out
+ Elango's version (different default C++ compiler & in-source build paths):
cd $ICU_OUT/icu4c/source
ICU_DATA_BUILDTOOL_OPTS=--include_uni_core_data CXXFLAGS="-DU_USING_ICU_NAMESPACE=0 -Wimplicit-fallthrough" CPPFLAGS="-DU_NO_DEFAULT_INCLUDE_UTF_HEADERS=1 -fsanitize=bounds" LDFLAGS=-fsanitize=bounds ./runConfigureICU --enable-debug --disable-release Linux/gcc --prefix=/usr/local/google/home/elango/oss/icu/icu4c > config.out 2>&1 ; tail config.out

*** data files & enums & parser code

Expand Down Expand Up @@ -360,8 +363,6 @@ copying that version number into the $ICU_SRC/.bazeliskrc config file.

* run & fix ICU4J tests

TODO

*** API additions
- send notice to icu-design about new born-@stable API (enum constants etc.)

Expand Down
8 changes: 8 additions & 0 deletions icu4c/source/test/cintltst/cucdtst.c
Original file line number Diff line number Diff line change
Expand Up @@ -2802,6 +2802,14 @@ TestAdditionalProperties(void) {
{ 0x0606, UCHAR_PREPENDED_CONCATENATION_MARK, false },
{ 0x110BD, UCHAR_PREPENDED_CONCATENATION_MARK, true },

/* Indic_Conjunct_Break values */
{ 0x094D, UCHAR_INDIC_CONJUNCT_BREAK, U_INCB_LINKER },
{ 0x09B9, UCHAR_INDIC_CONJUNCT_BREAK, U_INCB_CONSONANT },
{ 0x05BE, UCHAR_INDIC_CONJUNCT_BREAK, U_INCB_NONE },
{ 0x05BF, UCHAR_INDIC_CONJUNCT_BREAK, U_INCB_EXTEND },
{ 0x05C0, UCHAR_INDIC_CONJUNCT_BREAK, U_INCB_NONE },
{ 0xD800, UCHAR_INDIC_CONJUNCT_BREAK, U_INCB_NONE },

/* undefined UProperty values */
{ 0x61, 0x4a7, 0 },
{ 0x234bc, 0x15ed, 0 }
Expand Down
9 changes: 8 additions & 1 deletion icu4c/source/test/intltest/ucdtest.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,9 @@
#include "testutil.h"
#include "uparse.h"
#include "ucdtest.h"
#include "usettest.h"

#include <iostream>

static const char *ignorePropNames[]={
"FC_NFKC",
Expand Down Expand Up @@ -1092,6 +1095,10 @@ void UnicodeTest::TestPropertiesUsingPpucd() {
{ UCHAR_NFC_QUICK_CHECK, UNORM_MAYBE },
{ UCHAR_NFKC_QUICK_CHECK, UNORM_MAYBE },
#endif // !UCONFIG_NO_NORMALIZATION
{ UCHAR_INDIC_CONJUNCT_BREAK, U_INCB_NONE },
{ UCHAR_INDIC_CONJUNCT_BREAK, U_INCB_CONSONANT },
{ UCHAR_INDIC_CONJUNCT_BREAK, U_INCB_EXTEND },
{ UCHAR_INDIC_CONJUNCT_BREAK, U_INCB_LINKER },
};

// Iterate through PPUCD file, accumulating each line's data into each UnicodeSet per property
Expand Down Expand Up @@ -1133,7 +1140,7 @@ void UnicodeTest::TestPropertiesUsingPpucd() {
if (!tp.isBinary()) {
msg = msg + "=" + u_getPropertyValueName(tp.prop, tp.value, U_LONG_PROPERTY_NAME);
}
assertTrue(msg.c_str(), tp.set == icuPropSet);
UnicodeSetTest::checkEqual(*this, tp.set, icuPropSet, msg.c_str());
}
}

Expand Down
22 changes: 14 additions & 8 deletions icu4c/source/test/intltest/usettest.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -2114,20 +2114,26 @@ void UnicodeSetTest::copyWithIterator(UnicodeSet& t, const UnicodeSet& s, UBool
}

UBool UnicodeSetTest::checkEqual(const UnicodeSet& s, const UnicodeSet& t, const char* message) {
assertEquals(UnicodeString("RangeCount: ","") + message, s.getRangeCount(), t.getRangeCount());
assertEquals(UnicodeString("size: ","") + message, s.size(), t.size());
return checkEqual(*this, s, t, message);
}

UBool UnicodeSetTest::checkEqual(
IntlTest& intlTest,
const UnicodeSet& s, const UnicodeSet& t, const char* message) {
intlTest.assertEquals(UnicodeString("RangeCount: ","") + message, s.getRangeCount(), t.getRangeCount());
intlTest.assertEquals(UnicodeString("size: ","") + message, s.size(), t.size());
UnicodeString source; s.toPattern(source, true);
UnicodeString result; t.toPattern(result, true);
if (s != t) {
errln(UnicodeString("FAIL: ") + message
+ "; source = " + source
+ "; result = " + result
intlTest.errln((UnicodeString)"FAIL: " + message
+ "\nsource = " + source
+ "\nresult = " + result
);
return false;
} else {
logln(UnicodeString("Ok: ") + message
+ "; source = " + source
+ "; result = " + result
intlTest.logln((UnicodeString)"Ok: " + message
+ "\nsource = " + source
+ "\nresult = " + result
);
}
return true;
Expand Down
2 changes: 2 additions & 0 deletions icu4c/source/test/intltest/usettest.h
Original file line number Diff line number Diff line change
Expand Up @@ -33,6 +33,8 @@ class UnicodeSetTest: public IntlTest {
UnicodeSetTest();
~UnicodeSetTest();

static UBool checkEqual(IntlTest& intlTest, const UnicodeSet& s, const UnicodeSet& t, const char* message);

private:
void runIndexedTest(int32_t index, UBool exec, const char* &name, char* par=nullptr) override;

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -904,6 +904,7 @@ int getMaxValue(int which) {
return IdentifierStatus.ALLOWED.ordinal();
}
},
new IntProperty(0, INCB_MASK, INCB_SHIFT), // INDIC_CONJUNCT_BREAK
};

public int getIntPropertyValue(int c, int which) {
Expand Down Expand Up @@ -1378,7 +1379,8 @@ private static final int ntvGetType(int ntv) {
// Bits
// 31..26 Age major version (major=0..63)
// 25..24 Age minor version (minor=0..3)
// 23..15 reserved
// 23..17 reserved
// 16..15 Indic Conjunct Break
// 14..12 East Asian Width
// 11..10 3..1: Bits 9..0 = Script_Extensions index
// 3: Script value from Script_Extensions
Expand All @@ -1390,6 +1392,9 @@ private static final int ntvGetType(int ntv) {
private static final int EAST_ASIAN_MASK_ = 0x00007000;
private static final int EAST_ASIAN_SHIFT_ = 12;

private static final int INCB_MASK = 0x00018000;
private static final int INCB_SHIFT = 15;

/** Script_Extensions: mask includes Script */
public static final int SCRIPT_X_MASK = 0x00000fff;

Expand Down
18 changes: 18 additions & 0 deletions icu4j/main/core/src/main/java/com/ibm/icu/lang/UCharacter.java
Original file line number Diff line number Diff line change
Expand Up @@ -4124,6 +4124,24 @@ public static interface IndicSyllabicCategory {
public static final int REORDERING_KILLER = 36;
}

/**
 * Indic Conjunct Break constants.
 * These are the values of the enumerated property Indic_Conjunct_Break (InCB).
 * See https://unicode.org/reports/tr44/#Indic_Conjunct_Break
 *
 * @see UProperty#INDIC_CONJUNCT_BREAK
 * @draft ICU 76
 */
public enum IndicConjunctBreak {
// NOTE(review): callers appear to compare ordinal() with the integer value of
// UProperty.INDIC_CONJUNCT_BREAK, so the declaration order is presumably
// significant -- confirm before reordering or inserting constants.

/**
 * Indic_Conjunct_Break=None.
 * @draft ICU 76
 */
NONE,
/**
 * Indic_Conjunct_Break=Consonant.
 * @draft ICU 76
 */
CONSONANT,
/**
 * Indic_Conjunct_Break=Extend.
 * @draft ICU 76
 */
EXTEND,
/**
 * Indic_Conjunct_Break=Linker.
 * @draft ICU 76
 */
LINKER,
}

/**
* Vertical Orientation constants.
*
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -859,12 +859,19 @@ public interface UProperty
*/
public static final int IDENTIFIER_STATUS = 0x1019;

/**
* Enumerated property Indic_Conjunct_Break.
* Used in the grapheme cluster break algorithm in UAX #29.
* @draft ICU 76
*/
public static final int INDIC_CONJUNCT_BREAK = 0x101A;

/**
* One more than the last constant for enumerated/integer Unicode properties.
* @deprecated ICU 58 The numeric value may change over time, see ICU ticket #12420.
*/
@Deprecated
public static final int INT_LIMIT = 0x101A;
public static final int INT_LIMIT = 0x101B;

/**
* Bitmask property General_Category_Mask.
Expand Down
Binary file not shown.
Binary file not shown.
Original file line number Diff line number Diff line change
Expand Up @@ -2212,6 +2212,14 @@ public void TestAdditionalProperties()
{ 0x0606, UProperty.PREPENDED_CONCATENATION_MARK, FALSE },
{ 0x110BD, UProperty.PREPENDED_CONCATENATION_MARK, TRUE },

/* Indic_Conjunct_Break values */
{ 0x094D, UProperty.INDIC_CONJUNCT_BREAK, UCharacter.IndicConjunctBreak.LINKER.ordinal() },
{ 0x09B9, UProperty.INDIC_CONJUNCT_BREAK, UCharacter.IndicConjunctBreak.CONSONANT.ordinal() },
{ 0x05BE, UProperty.INDIC_CONJUNCT_BREAK, UCharacter.IndicConjunctBreak.NONE.ordinal() },
{ 0x05BF, UProperty.INDIC_CONJUNCT_BREAK, UCharacter.IndicConjunctBreak.EXTEND.ordinal() },
{ 0x05C0, UProperty.INDIC_CONJUNCT_BREAK, UCharacter.IndicConjunctBreak.NONE.ordinal() },
{ 0xD800, UProperty.INDIC_CONJUNCT_BREAK, UCharacter.IndicConjunctBreak.NONE.ordinal() },

/* undefined UProperty values */
{ 0x61, 0x4a7, 0 },
{ 0x234bc, 0x15ed, 0 }
Expand Down
5 changes: 5 additions & 0 deletions tools/unicode/c/genprops/corepropsbuilder.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -312,6 +312,10 @@ although the trie can hold 16-bit values.

Props vector 0 bits shuffled so that script and script extensions bits are contiguous.

Used 2 bits from props vector 0 to add Indic_Conjunct_Break. The bits used were freed up
by the preceding move of the Block property out of props vector 0 and the bit shuffling
("defragmentation") of Script and Script_Extensions.

----------------------------------------------------------------------------- */

U_NAMESPACE_USE
Expand Down Expand Up @@ -712,6 +716,7 @@ struct PropToEnum {
const PropToEnum
propToEnums[]={
{ UCHAR_EAST_ASIAN_WIDTH, 0, UPROPS_EA_SHIFT, UPROPS_EA_MASK },
{ UCHAR_INDIC_CONJUNCT_BREAK, 0, UPROPS_INCB_SHIFT, UPROPS_INCB_MASK },
{ UCHAR_DECOMPOSITION_TYPE, 2, 0, UPROPS_DT_MASK },
{ UCHAR_GRAPHEME_CLUSTER_BREAK, 2, UPROPS_GCB_SHIFT, UPROPS_GCB_MASK },
{ UCHAR_WORD_BREAK, 2, UPROPS_WB_SHIFT, UPROPS_WB_MASK },
Expand Down
10 changes: 9 additions & 1 deletion tools/unicode/c/genprops/pnames_data.h
Original file line number Diff line number Diff line change
Expand Up @@ -1186,6 +1186,13 @@ static const Value VALUES_ID_Status[2] = {
Value(U_ID_STATUS_ALLOWED, "Allowed Allowed"),
};

// Value names for the Indic_Conjunct_Break (InCB) property,
// one entry per UIndicConjunctBreak constant.
// NOTE(review): each string presumably lists the value's space-separated
// names, short name first -- confirm against the Value class.
static const Value VALUES_InCB[4] = {
Value(U_INCB_NONE, "None None"),
Value(U_INCB_CONSONANT, "Consonant Consonant"),
Value(U_INCB_EXTEND, "Extend Extend"),
Value(U_INCB_LINKER, "Linker Linker"),
};

static const Value VALUES_gcm[38] = {
Value((int32_t)U_GC_C_MASK, "C Other"),
Value((int32_t)U_GC_CC_MASK, "Cc Control cntrl"),
Expand Down Expand Up @@ -1242,7 +1249,7 @@ static const Value VALUES_ID_Type[12] = {
Value(U_ID_TYPE_RECOMMENDED, "Recommended Recommended"),
};

static const Property PROPERTIES[119] = {
static const Property PROPERTIES[120] = {
Property(UCHAR_ALPHABETIC, "Alpha Alphabetic"),
Property(UCHAR_ASCII_HEX_DIGIT, "AHex ASCII_Hex_Digit"),
Property(UCHAR_BIDI_CONTROL, "Bidi_C Bidi_Control"),
Expand Down Expand Up @@ -1344,6 +1351,7 @@ static const Property PROPERTIES[119] = {
Property(UCHAR_INDIC_SYLLABIC_CATEGORY, "InSC Indic_Syllabic_Category", VALUES_InSC, 37),
Property(UCHAR_VERTICAL_ORIENTATION, "vo Vertical_Orientation", VALUES_vo, 4),
Property(UCHAR_IDENTIFIER_STATUS, "ID_Status Identifier_Status", VALUES_ID_Status, 2),
Property(UCHAR_INDIC_CONJUNCT_BREAK, "InCB Indic_Conjunct_Break", VALUES_InCB, 4),
Property(UCHAR_GENERAL_CATEGORY_MASK, "gcm General_Category_Mask", VALUES_gcm, 38),
Property(UCHAR_NUMERIC_VALUE, "nv Numeric_Value"),
Property(UCHAR_AGE, "age Age"),
Expand Down
2 changes: 1 addition & 1 deletion tools/unicode/py/preparseucd.py
Original file line number Diff line number Diff line change
Expand Up @@ -2012,7 +2012,7 @@ def PrintNameStats():
# Sample line to match:
# U_EA_AMBIGUOUS,
_prop_and_value_re = re.compile(
" *(U_(BPT|DT|EA|GCB|HST|ID_STATUS|ID_TYPE|INPC|INSC|LB|JG|JT|NT|SB|VO|WB)_([0-9A-Z_]+))")
" *(U_(BPT|DT|EA|GCB|HST|ID_STATUS|ID_TYPE|INCB|INPC|INSC|LB|JG|JT|NT|SB|VO|WB)_([0-9A-Z_]+))")

# Sample line to match if it has matched _prop_and_value_re
# (we want to exclude aliases):
Expand Down