Skip to content

Commit

Permalink
internal/export/unicode: add CategoryAliases, Cn, and LC
Browse files Browse the repository at this point in the history
CategoryAliases is for regexp to use, for things like \p{Letter} as an alias for \p{L}.
Cn and LC are special-case categories that were never implemented
but should have been.

For golang/go#70780.

Change-Id: I1401c1be42106a0ebecabb085c25e97485c363cf
Reviewed-on: https://go-review.googlesource.com/c/text/+/641395
Auto-Submit: Russ Cox <[email protected]>
Reviewed-by: Marcel van Lohuizen <[email protected]>
LUCI-TryBot-Result: Go LUCI <[email protected]>
Reviewed-by: Ian Lance Taylor <[email protected]>
  • Loading branch information
rsc authored and gopherbot committed Feb 27, 2025
1 parent 518d9c0 commit ae68efb
Showing 1 changed file with 51 additions and 13 deletions.
64 changes: 51 additions & 13 deletions internal/export/unicode/gen.go
Original file line number Diff line number Diff line change
Expand Up @@ -13,8 +13,10 @@ import (
"flag"
"fmt"
"log"
"maps"
"os"
"regexp"
"slices"
"sort"
"strings"
"unicode"
Expand Down Expand Up @@ -90,13 +92,15 @@ func println(args ...interface{}) {
// category holds the merged and special-case category names.
// Individual two-letter categories such as Nd or Lu are not listed in this
// literal (presumably they are added elsewhere as data is loaded — confirm).
var category = map[string]bool{
	// One-character names identify merged categories.
	// LC and Cn are two-letter special cases: LC is the union of the cased
	// letter categories and Cn marks unassigned code points.
	"L":  true, // Lu Ll Lt Lm Lo
	"LC": true, // Lu Ll Lt
	"P":  true, // Pc Pd Ps Pe Pi Pf Po
	"M":  true, // Mn Mc Me
	"N":  true, // Nd Nl No
	"S":  true, // Sm Sc Sk So
	"Z":  true, // Zs Zl Zp
	"C":  true, // Cc Cf Cs Co Cn
	"Cn": true, // unassigned
}

// This contains only the properties we're interested in.
Expand Down Expand Up @@ -149,6 +153,9 @@ func categoryOp(code rune, class uint8) bool {
}

func loadChars() {
for code := range chars {
chars[code].category = "Cn" // unassigned
}
ucd.Parse(gen.OpenUCDFile("UnicodeData.txt"), func(p *ucd.Parser) {
c := Char{codePoint: p.Rune(0)}

Expand Down Expand Up @@ -201,6 +208,7 @@ func loadCasefold() {
}

var categoryMapping = map[string]string{
"LC": "Letter, cased: Ll | Lt | Lu",
"Lu": "Letter, uppercase",
"Ll": "Letter, lowercase",
"Lt": "Letter, titlecase",
Expand Down Expand Up @@ -257,6 +265,7 @@ func printCategories() {
printf("\t%q: %s,\n", k, k)
}
print("}\n\n")
printCategoryAliases()
}

decl := make(sort.StringSlice, len(list))
Expand Down Expand Up @@ -315,14 +324,14 @@ func printCategories() {
}
decl[ndecl] = varDecl
ndecl++
match := func(cat string) bool { return cat == name }
if len(name) == 1 { // unified categories
dumpRange(
"_"+name,
func(code rune) bool { return categoryOp(code, name[0]) })
continue
match = func(cat string) bool { return strings.HasPrefix(cat, name) }
}
dumpRange("_"+name,
func(code rune) bool { return chars[code].category == name })
if name == "LC" { // special unified category
match = func(cat string) bool { return cat == "Ll" || cat == "Lt" || cat == "Lu" }
}
dumpRange("_"+name, func(code rune) bool { return match(chars[code].category) })
}
decl.Sort()
println("// These variables have type *RangeTable.")
Expand All @@ -333,6 +342,35 @@ func printCategories() {
print(")\n\n")
}

// printCategoryAliases prints the declaration of the CategoryAliases map.
//
// It scans PropertyValueAliases.txt and considers only the rows whose first
// field is "gc". For each such row, the third field and, when non-empty, the
// fourth field become keys that map to the row's second field. A second field
// that is not among the names returned by allCategories is reported through
// logger, but its entries are still recorded. The map entries are printed in
// sorted key order.
func printCategoryAliases() {
	valid := make(map[string]bool)
	for _, cat := range allCategories() {
		valid[cat] = true
	}

	aliases := make(map[string]string)
	record := func(p *ucd.Parser) {
		if p.String(0) == "gc" {
			cat := p.String(1)
			if !valid[cat] {
				logger.Print("unknown category: ", cat)
			}
			aliases[p.String(2)] = cat
			if extra := p.String(3); extra != "" {
				aliases[extra] = cat
			}
		}
	}
	ucd.Parse(gen.OpenUCDFile("PropertyValueAliases.txt"), record)

	// Sort the keys so the generated output is deterministic.
	keys := slices.Sorted(maps.Keys(aliases))

	println("// CategoryAliases maps category aliases to standard category names.")
	println("var CategoryAliases = map[string]string{")
	for _, alias := range keys {
		printf("\t%q: %q,\n", alias, aliases[alias])
	}
	print("}\n\n")
}

// Op is a predicate on a code point: it takes a rune and returns a bool.
// dumpRange accepts an Op as its inCategory argument.
type Op func(code rune) bool

func dumpRange(name string, inCategory Op) {
Expand Down

0 comments on commit ae68efb

Please sign in to comment.