Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Unicode performance issue #668

Draft
wants to merge 5 commits into
base: dev
Choose a base branch
from
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,8 @@
.vs/
.vscode/
support/vscode/koka.language-koka/whatsnew.md
EastAsianWidth.txt
DerivedCoreProperties.txt
src/Syntax/Lexer.hs.gen
node_modules/
out/
Expand Down
4 changes: 2 additions & 2 deletions koka.cabal
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
cabal-version: 1.12

-- This file has been generated from package.yaml by hpack version 0.36.0.
-- This file has been generated from package.yaml by hpack version 0.37.0.
--
-- see: https://github.com/sol/hpack

Expand Down Expand Up @@ -187,7 +187,7 @@ executable koka
Paths_koka
hs-source-dirs:
src/Main/langserver
ghc-options: -rtsopts -j8 -O2 -threaded "-with-rtsopts=-N8"
ghc-options: -rtsopts -j8 -O2
build-depends:
FloatingHex >=0.5
, aeson
Expand Down
314,712 changes: 314,712 additions & 0 deletions koka.prof

Large diffs are not rendered by default.

16 changes: 8 additions & 8 deletions lib/std/core/char.kk
Original file line number Diff line number Diff line change
Expand Up @@ -77,37 +77,37 @@ pub fip fun (-)(c : char, d : char) : total char
(c.int - d.int).char

// Is the character a lower-case ASCII character?
pub fip fun is-lower( c : char ) : bool
pub fip fun ascii/is-lower( c : char ) : bool
c >= 'a' && c <= 'z'

// Is the character an upper-case ASCII character?
pub fip fun is-upper( c : char ) : bool
pub fip fun ascii/is-upper( c : char ) : bool
c >= 'A' && c <= 'Z'

// Is the character an ASCII digit ?
pub fip fun is-digit( c : char ) : bool
pub fip fun ascii/is-digit( c : char ) : bool
c >= '0' && c <= '9'

// Is the character an ASCII hexa-decimal digit?
pub fip fun is-hex-digit( c : char ) : bool
pub fip fun ascii/is-hex-digit( c : char ) : bool
c.is-digit || (c >= 'a' && c <= 'f') || (c >= 'A' && c <= 'F')

// Is the character an ASCII letter?
pub fip fun is-alpha( c : char ) : bool
pub fip fun ascii/is-alpha( c : char ) : bool
c.is-lower || c.is-upper

// Is the character ASCII letter or digit?
pub fip fun is-alpha-num( c : char ) : bool
pub fip fun ascii/is-alpha-num( c : char ) : bool
c.is-alpha || c.is-digit

// Is the character an ASCII character, e.g. `c <= '\x7F'`?
pub fip fun is-ascii( c : char ) : bool
c <= '\x7F'

// Is the character an ASCII control character, e.g. `c < ' '`?
pub fip fun is-control( c : char ) : bool
pub fip fun ascii/is-control( c : char ) : bool
c < ' '

// Tests if a character is an element of `" \t\n\r"`
pub fip fun is-white( c : char ) : bool
pub fip fun ascii/is-white( c : char ) : bool
c == ' ' || c == '\t' || c == '\n' || c == '\r'
1,952 changes: 1,938 additions & 14 deletions lib/std/text/unicode.kk

Large diffs are not rendered by default.

12 changes: 10 additions & 2 deletions package.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,8 @@
# - support/vscode/koka.language-koka/package.json
# - whatsnew.md, readme.md

# Also update the unicode property tables (East Asian wide, derived core properties) in `std/text/unicode`
# using the output of `stack exec koka -- -e util/update-unicode -- -a` (and, in a separate run, `-- -d`)

name: koka
version: 3.1.3
Expand Down Expand Up @@ -85,8 +87,14 @@ executables:
- -rtsopts
- -j8
- -O2
- -threaded
- '"-with-rtsopts=-N8"'
# - -threaded
# - '"-with-rtsopts=-N8"'
# rm -rf .stack-work
# stack build --profile
# rm -rf .koka && mkdir .koka
# Build until it starts unicode, then cancel the build, and rebuild starting from the unicode module
# Memory grows past 12GB
# stack exec --profile koka -- samples/basic/unicode -v4 +RTS -p

koka-plain:
main: Main.hs
Expand Down
4 changes: 4 additions & 0 deletions samples/basic/unicode.kk
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
import std/text/unicode

// Minimal sample that imports `std/text/unicode` and prints the result of the
// `unicode/`-qualified `is-upper` test for the character 'a'.
fun main()
  val upper = 'a'.unicode/is-upper
  upper.println
11 changes: 11 additions & 0 deletions test/lib/unicode.kk
Original file line number Diff line number Diff line change
@@ -0,0 +1,11 @@
// https://github.com/koka-lang/koka/issues/457
// https://github.com/koka-lang/koka/issues/458
import std/text/unicode

// Prints, in order: the characters of the sample string, its grapheme count,
// and the display width of a single emoji (expected output is in `unicode.kk.out`).
fun main()
  // heart, variation, zero width join, fire
  // ['h','i','/u2764','/uFE0F','/u200D','/U01F525']
  val sample = "hi❤️‍🔥"
  println(sample.list)
  println(sample.graphemes.length)

  println("👾".width)
3 changes: 3 additions & 0 deletions test/lib/unicode.kk.out
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
['h','i','/u2764','/uFE0F','/u200D','/U01F525']
3
2
241 changes: 241 additions & 0 deletions util/update-unicode.kk
Original file line number Diff line number Diff line change
@@ -0,0 +1,241 @@
import std/os/path
import std/os/dir
import std/os/file
import std/os/process
import std/os/env
import std/os/flags
import std/core/undiv

// Command-line options of this script; both default to `False`.
struct iflags
// Set by `-a` / `--asian-wide`: print the East Asian wide ranges.
asian-wide: bool = False
// Set by `-d` / `--derived-core-properties`: print the derived core property ranges.
derived-core-properties: bool = False

// Text placed at the top of the usage message (passed to `flag-descs.usage` in `process-flags`).
val header = "usage:\n stack exec koka -- -e util/update-unicode [-- [options]]\n\noptions:"

// Descriptions of the supported command-line flags: each entry gives the short
// name, the long name(s), the setter that records the flag in `iflags`, and the help text.
val flag-descs =
  fun with-asian-wide( opts : iflags, on : bool ) : iflags
    opts(asian-wide = on)
  fun with-derived-core-properties( opts : iflags, on : bool ) : iflags
    opts(derived-core-properties = on)
  [ Flag( "a", ["asian-wide"], Bool(with-asian-wide), "print updated asian wide information" ),
    Flag( "d", ["derived-core-properties"], Bool(with-derived-core-properties), "print updated derived core property information") ]

// Parse the program arguments against `flag-descs`.
// Returns `Just(flags)` only when there are no parse errors and no left-over
// (non-flag) arguments; otherwise prints the errors followed by the usage text
// and returns `Nothing`.
pub fun process-flags() : <ndet,console> maybe<iflags>
  val (parsed, rest, problems) = parse( Iflags(), flag-descs, get-args() )
  val ok = problems.is-nil && rest.is-nil
  if !ok then
    println( problems.join("\n") ++ "\n" ++ flag-descs.usage(header) )
    Nothing
  else
    Just(parsed)

// One entry parsed from a Unicode data file by `hex-ranges`.
// `bottom` and `top` hold the first and last code point as hexadecimal text
// without a prefix (they are equal for a single code point); `description`
// is the property text that follows the first `;` on the line.
type range
Range(bottom: string, top: string, description: string)

// Read a Unicode data file (e.g. `EastAsianWidth.txt`, `DerivedCoreProperties.txt`)
// and return one `Range` per data line, in file order.
// Comment-only lines, blank lines, and lines without a `;` separated property
// field are skipped. Raises an exception if the file cannot be read or if a
// code point field has more than one `..` separator.
fun hex-ranges(file: path): <exn,fsys> list<range>
  file.read-text-file().split("\n").filter-map(parse-range-line)

// Parse a single line of the form `XXXX[..YYYY] ; property[; value] # comment`.
// Returns `Nothing` for lines that carry no data.
fun parse-range-line(line: string): exn maybe<range>
  // Everything from the first `#` onwards is a comment; a line that starts
  // with `#` therefore leaves an empty data part and is skipped below.
  val data = match line.split("#")
    Cons(before, _) -> before
    Nil -> ""
  match data.split(";")
    Cons(codes, Cons(p, ps)) ->
      // Re-join the remaining fields so multi-field properties such as
      // `InCB; Consonant` keep their `;` in the description.
      val desc = Cons(p, ps).join(";").trim()
      match codes.split("..").map(fn(s) s.trim())
        Cons(bottom, Nil) -> Just(Range(bottom, bottom, desc))
        Cons(bottom, Cons(top, Nil)) -> Just(Range(bottom, top, desc))
        _ -> throw("Err unsupported range")
    _ -> Nothing


// Keep only the ranges whose description is exactly `desc`.
fun filter-desc-eq(ranges: list<range>, desc: string): list<range>
  ranges.filter fn(entry)
    entry.description == desc

// Keep only the ranges whose description satisfies `pred`.
fun filter-desc(ranges: list<range>, pred: (string) -> bool): list<range>
  ranges.filter fn(entry)
    val text = entry.description
    pred(text)

// Split `l` into consecutive batches of `n` elements, preserving element order.
// The final batch may be shorter than `n`; an empty input gives an empty result.
// For `n <= 0` the countdown never reaches 1, so all elements end up in a single batch.
fun batch(l: list<a>, n: int): list<list<a>>
  // `accl` is the current batch in reverse order, `acc` the finished batches
  // in reverse order, and `left` the number of elements still missing from `accl`.
  fun recur(l1 : list<a>, accl : list<a>, acc: list<list<a>>, left : int): list<list<a>>
    match l1
      Nil ->
        if accl.is-nil then acc.reverse
        // `accl` was built by prepending, so it must be reversed here as well
        // or a trailing partial batch would come out back to front.
        else Cons(accl.reverse, acc).reverse
      Cons(x, rst) ->
        if left == 1 then
          recur(rst, [], Cons(Cons(x, accl).reverse, acc), n)
        else
          recur(rst, Cons(x, accl), acc, left - 1)
  recur(l, [], [], n)

// Print a Koka definition `val <koka-name> : delayed<total,rtree>` for `ranges`.
// The `doc` text is emitted first, one `//` comment line per input line, and the
// ranges are then printed eight per line as arguments to `build-rtree`.
fun property-ranges(doc: string, koka-name: string, ranges: list<range>)
  println("// Doc from unicode data file")
  doc.split("\n").foreach fn(doc-line)
    println("// " ++ doc-line)
  println("val " ++ koka-name ++ r" : delayed<total,rtree> = delay{
build-rtree([")
  ranges.batch(8).foreach fn(chunk)
    val cells = chunk.map(show-range)
    println(" " ++ cells.join(",") ++ ",")
  println(" ])\n}")

// Render a range as source text: `point(0xB)` when both ends are equal,
// otherwise `single(0xB,0xT)`.
fun show-range(r: range)
  val lo = "0x" ++ r.bottom
  if r.bottom != r.top then "single(" ++ lo ++ ",0x" ++ r.top ++ ")"
  else "point(" ++ lo ++ ")"

// Entry point: parses the command-line flags, downloads the requested Unicode
// data file with `curl` into the current directory, and prints Koka `val`
// definitions (via `property-ranges`) for the selected properties.
//
// NOTE(review): the two flags are handled with `if ... elif`, so when both are
// given only the asian-wide branch runs, and when neither is given nothing is printed.
// NOTE(review): the value bound to `file` is never used; whether the download
// worked is judged only by the data file existing afterwards.
// NOTE(review): the East_Asian_Width doc text ends at "Plane 3: " with no range
// after it; it looks truncated -- confirm against the data file header.
fun main()
// `process-flags` returns `Nothing` after printing usage; presumably `unjust` then raises -- confirm.
val flags = process-flags().unjust
// East Asian Width: only the ranges whose property value is "W" are printed.
if flags.asian-wide then
val file = run-system(r#"curl "https://www.unicode.org/Public/UNIDATA/EastAsianWidth.txt" > EastAsianWidth.txt"#)
if "EastAsianWidth.txt".path.is-file then
val ranges = hex-ranges("EastAsianWidth.txt".path)
property-ranges(r#"East_Asian_Width property, consisting of one of the following values:
"A", "F", "H", "N", "Na", "W"
- All code points, assigned or unassigned, that are not listed
explicitly are given the value "N".
- The unassigned code points in the following blocks default to "W":
CJK Unified Ideographs Extension A: U+3400..U+4DBF
CJK Unified Ideographs: U+4E00..U+9FFF
CJK Compatibility Ideographs: U+F900..U+FAFF
- All undesignated code points in Planes 2 and 3, whether inside or
outside of allocated blocks, default to "W":
Plane 2: U+20000..U+2FFFD
Plane 3: "#, "asian-wide", ranges.filter-desc-eq("W"))
else
println("Could not find EastAsianWidth.txt\n\tPlease download from https://www.unicode.org/Public/UNIDATA/EastAsianWidth.txt to your current directory")
// Derived core properties: one `property-ranges` call per property, each
// selecting the ranges whose description equals the property name.
elif flags.derived-core-properties then
val file = run-system(r#"curl "https://www.unicode.org/Public/UNIDATA/DerivedCoreProperties.txt" > DerivedCoreProperties.txt"#)
if "DerivedCoreProperties.txt".path.is-file then
val ranges = hex-ranges("DerivedCoreProperties.txt".path)
property-ranges(r"
Derived Property: Math
Generated from: Sm + Other_Math", "math", ranges.filter-desc-eq("Math"))
property-ranges(r"Derived Property: Alphabetic
Generated from: Uppercase + Lowercase + Lt + Lm + Lo + Nl + Other_Alphabetic", "alphabetic", ranges.filter-desc-eq("Alphabetic"))
property-ranges(r"Derived Property: Lowercase
Generated from: Ll + Other_Lowercase", "lower", ranges.filter-desc-eq("Lowercase"))
property-ranges(r"Derived Property: Uppercase
Generated from: Lu + Other_Uppercase", "upper", ranges.filter-desc-eq("Uppercase"))
property-ranges(r"Derived Property: Cased (Cased)
As defined by Unicode Standard Definition D135
C has the Lowercase or Uppercase property or has a General_Category value of Titlecase_Letter.", "cased", ranges.filter-desc-eq("Cased"))
property-ranges(r"Derived Property: Case_Ignorable (CI)
As defined by Unicode Standard Definition D136
C is defined to be case-ignorable if
Word_Break(C) = MidLetter or MidNumLet or Single_Quote, or
General_Category(C) = Nonspacing_Mark (Mn), Enclosing_Mark (Me), Format (Cf), Modifier_Letter (Lm), or Modifier_Symbol (Sk).",
"case-ignore", ranges.filter-desc-eq("Case_Ignorable"))
property-ranges(r#"Derived Property: Changes_When_Lowercased (CWL)
Characters whose normalized forms are not stable under a toLowercase mapping.
For more information, see D139 in Section 3.13, "Default Case Algorithms".
Changes_When_Lowercased(X) is true when toLowercase(toNFD(X)) != toNFD(X)"#,
"lowercase-changes", ranges.filter-desc-eq("Changes_When_Lowercased"))
property-ranges(r#"Derived Property: Changes_When_Uppercased (CWU)
Characters whose normalized forms are not stable under a toUppercase mapping.
For more information, see D140 in Section 3.13, "Default Case Algorithms".
Changes_When_Uppercased(X) is true when toUppercase(toNFD(X)) != toNFD(X)"#,
"uppercase-changes", ranges.filter-desc-eq("Changes_When_Uppercased"))
property-ranges(r#"Derived Property: Changes_When_Titlecased (CWT)
Characters whose normalized forms are not stable under a toTitlecase mapping.
For more information, see D141 in Section 3.13, "Default Case Algorithms".
Changes_When_Titlecased(X) is true when toTitlecase(toNFD(X)) != toNFD(X)"#,
"titlecase-changes", ranges.filter-desc-eq("Changes_When_Titlecased"))
property-ranges(r#"Derived Property: Changes_When_Casefolded (CWCF)
Characters whose normalized forms are not stable under case folding.
For more information, see D142 in Section 3.13, "Default Case Algorithms".
Changes_When_Casefolded(X) is true when toCasefold(toNFD(X)) != toNFD(X)"#,
"casefolded-changes", ranges.filter-desc-eq("Changes_When_Casefolded"))
property-ranges(r#"Derived Property: Changes_When_Casemapped (CWCM)
Characters whose normalized forms are not stable under case mapping.
For more information, see D143 in Section 3.13, "Default Case Algorithms".
Changes_When_Casemapped(X) is true when CWL(X), or CWT(X), or CWU(X)"#,
"casemapped-changes", ranges.filter-desc-eq("Changes_When_Casemapped"))
property-ranges(r#"Derived Property: ID_Start
Characters that can start an identifier.
Generated from:
Lu + Ll + Lt + Lm + Lo + Nl
+ Other_ID_Start
- Pattern_Syntax
- Pattern_White_Space
NOTE: See UAX #31 for more information"#,
"id-start", ranges.filter-desc-eq("ID_Start"))
property-ranges(r#"Derived Property: ID_Continue
Characters that can continue an identifier.
Generated from:
ID_Start
+ Mn + Mc + Nd + Pc
+ Other_ID_Continue
- Pattern_Syntax
- Pattern_White_Space
NOTE: See UAX #31 for more information"#,
"id-continue", ranges.filter-desc-eq("ID_Continue"))
property-ranges(r#"Derived Property: XID_Start
ID_Start modified for closure under NFKx
Modified as described in UAX #15
NOTE: Does NOT remove the non-NFKx characters.
Merely ensures that if isIdentifer(string) then isIdentifier(NFKx(string))
NOTE: See UAX #31 for more information"#,
"xid-start", ranges.filter-desc-eq("XID_Start"))
property-ranges(r#"Derived Property: XID_Continue
Mod_ID_Continue modified for closure under NFKx
Modified as described in UAX #15
NOTE: Does NOT remove the non-NFKx characters.
Merely ensures that if isIdentifer(string) then isIdentifier(NFKx(string))
NOTE: See UAX #31 for more information"#,
"xid-continue", ranges.filter-desc-eq("XID_Continue"))
property-ranges(r#"Derived Property: Default_Ignorable_Code_Point
Generated from
Other_Default_Ignorable_Code_Point
+ Cf (Format characters)
+ Variation_Selector
- White_Space
- FFF9..FFFB (Interlinear annotation format characters)
- 13430..13440 (Egyptian hieroglyph format characters)
- Prepended_Concatenation_Mark (Exceptional format characters that should be visible)
#
There are currently no stability guarantees for DICP. However, the
values of DICP interact with the derivation of XID_Continue
and NFKC_CF, for which there are stability guarantees.
Maintainers of this property should note that in the
unlikely case that the DICP value changes for an existing character
which is also XID_Continue=Yes, then exceptions must be put
in place to ensure that the NFKC_CF mapping value for that
existing character does not change."#,
"default-ignorable", ranges.filter-desc-eq("Default_Ignorable_Code_Point"))

property-ranges(r#"Derived Property: Grapheme_Extend
Generated from: Me + Mn + Other_Grapheme_Extend
Note: depending on an application's interpretation of Co (private use),
they may be either in Grapheme_Base, or in Grapheme_Extend, or in neither."#,
"grapheme-extend", ranges.filter-desc-eq("Grapheme_Extend"))
property-ranges(r#"Derived Property: Grapheme_Base
Generated from: [0..10FFFF] - Cc - Cf - Cs - Co - Cn - Zl - Zp - Grapheme_Extend
Note: depending on an application's interpretation of Co (private use),
they may be either in Grapheme_Base, or in Grapheme_Extend, or in neither."#,
"grapheme-base", ranges.filter-desc-eq("Grapheme_Base"))
property-ranges(r#"Derived Property: Grapheme_Link (deprecated)
Generated from: Canonical_Combining_Class=Virama
Use Canonical_Combining_Class=Virama directly instead"#,
"grapheme-link", ranges.filter-desc-eq("Grapheme_Link"))
property-ranges(r#"Derived Property: Indic_Conjunct_Break
Generated from the Grapheme_Cluster_Break, Indic_Syllabic_Category,
Canonical_Combining_Class, and Script properties as described in UAX #44:
https://www.unicode.org/reports/tr44/.

All code points not explicitly listed for Indic_Conjunct_Break
have the value None.

@missing: 0000..10FFFF; InCB; None

================================================

Indic_Conjunct_Break=Linker"#,
"indic-break-linker", ranges.filter-desc-eq("InCB; Linker"))
property-ranges(r#"Indic_Conjunct_Break=Consonant"#,
"indic-break-consonant", ranges.filter-desc-eq("InCB; Consonant"))
property-ranges(r#"Indic_Conjunct_Break=Extend"#,
"indic-break-extend", ranges.filter-desc-eq("InCB; Extend"))
else
println("Could not find DerivedCoreProperties.txt\n\tPlease download from https://www.unicode.org/Public/UNIDATA/DerivedCoreProperties.txt to your current directory")