koka-lang · TimWhiting · Feb 3, 2024 · Jan 31, 2025 · Feb 4, 2025 · Feb 4, 2025
diff --git a/.gitignore b/.gitignore
@@ -5,6 +5,8 @@
 .vs/
 .vscode/
 support/vscode/koka.language-koka/whatsnew.md
+EastAsianWidth.txt
+DerivedCoreProperties.txt
 src/Syntax/Lexer.hs.gen
 node_modules/
 out/

diff --git a/koka.cabal b/koka.cabal
@@ -1,6 +1,6 @@
 cabal-version: 1.12
 
--- This file has been generated from package.yaml by hpack version 0.36.0.
+-- This file has been generated from package.yaml by hpack version 0.37.0.
 --
 -- see: https://github.com/sol/hpack
 
@@ -187,7 +187,7 @@ executable koka
       Paths_koka
   hs-source-dirs:
       src/Main/langserver
-  ghc-options: -rtsopts -j8 -O2 -threaded "-with-rtsopts=-N8"
+  ghc-options: -rtsopts -j8 -O2
   build-depends:
       FloatingHex >=0.5
     , aeson

diff --git a/koka.prof b/koka.prof
diff --git a/lib/std/core/char.kk b/lib/std/core/char.kk
@@ -77,37 +77,37 @@ pub fip fun (-)(c : char, d : char) : total char
   (c.int - d.int).char
 
 // Is the character a lower-case ASCII character?
-pub fip fun is-lower( c : char ) : bool
+pub fip fun ascii/is-lower( c : char ) : bool
   c >= 'a' && c <= 'z'
 
 // Is the character an upper-case ASCII character?
-pub fip fun is-upper( c : char ) : bool
+pub fip fun ascii/is-upper( c : char ) : bool
   c >= 'A' && c <= 'Z'
 
 // Is the character an ASCII digit ?
-pub fip fun is-digit( c : char ) : bool
+pub fip fun ascii/is-digit( c : char ) : bool
   c >= '0' && c <= '9'
 
 // Is the character an ASCII hexa-decimal digit?
-pub fip fun is-hex-digit( c : char ) : bool
+pub fip fun ascii/is-hex-digit( c : char ) : bool
   c.is-digit || (c >= 'a' && c <= 'f') || (c >= 'A'  && c <= 'F')
 
 // Is the character an ASCII letter?
-pub fip fun is-alpha( c : char ) : bool
+pub fip fun ascii/is-alpha( c : char ) : bool
   c.is-lower || c.is-upper
 
 // Is the character ASCII letter or digit?
-pub fip fun is-alpha-num( c : char ) : bool
+pub fip fun ascii/is-alpha-num( c : char ) : bool
   c.is-alpha || c.is-digit
 
 // Is the character an ASCII character, e.g. `c <= '\x7F'`?
 pub fip fun is-ascii( c : char )     : bool
   c <= '\x7F'
 
 // Is the character an ASCII control character, e.g. `c < ' '`?
-pub fip fun is-control( c : char )   : bool
+pub fip fun ascii/is-control( c : char )   : bool
   c < ' '
 
 // Tests if a character is an element of `" \t\n\r"`
-pub fip fun is-white( c : char )     : bool
+pub fip fun ascii/is-white( c : char )     : bool
   c == ' ' || c == '\t' || c == '\n' || c == '\r'
diff --git a/lib/std/text/unicode.kk b/lib/std/text/unicode.kk
diff --git a/package.yaml b/package.yaml
@@ -8,6 +8,8 @@
 # - support/vscode/koka.language-koka/package.json
 # - whatsnew.md, readme.md
 
+# Also update the unicode East Asian wide list in `std/text/unicode`
+#  using the output of `stack exec koka -- -e util/update-unicode -- -a`
 
 name:       koka
 version:    3.1.3
@@ -85,8 +87,14 @@ executables:
       - -rtsopts
       - -j8
       - -O2
-      - -threaded
-      - '"-with-rtsopts=-N8"'
+      # - -threaded
+      # - '"-with-rtsopts=-N8"'
+    # rm -rf .stack-work
+    # stack build --profile
+    # rm -rf .koka && mkdir .koka
+    # Build until it starts unicode, then cancel the build, and rebuild starting from the unicode module
+    # Memory grows past 12GB
+    # stack exec --profile koka -- samples/basic/unicode -v4 +RTS -p
 
   koka-plain:
     main: Main.hs

diff --git a/samples/basic/unicode.kk b/samples/basic/unicode.kk
@@ -0,0 +1,4 @@
+import std/text/unicode
+
+// Prints the result of the `unicode/is-upper` check for the character 'a'.
+fun main()
+  'a'.unicode/is-upper.println
diff --git a/test/lib/unicode.kk b/test/lib/unicode.kk
@@ -0,0 +1,11 @@
+// https://github.com/koka-lang/koka/issues/457
+// https://github.com/koka-lang/koka/issues/458
+import std/text/unicode
+
+// Prints three lines for the recorded expected output: the characters of an emoji sequence,
+// the number of graphemes in that sequence, and the `width` of a single emoji.
+fun main()
+  //           heart, variation, zero width join, fire
+  // ['h','i','/u2764','/uFE0F','/u200D','/U01F525']
+  "hi❤️‍🔥".list.println
+  // 'h', 'i' and the joined emoji sequence: the expected output records 3
+  "hi❤️‍🔥".graphemes.length.println
+
+  // the expected output records a width of 2 for this emoji
+  println(width("👾"))
diff --git a/test/lib/unicode.kk.out b/test/lib/unicode.kk.out
@@ -0,0 +1,3 @@
+['h','i','/u2764','/uFE0F','/u200D','/U01F525']
+3
+2
diff --git a/util/update-unicode.kk b/util/update-unicode.kk
@@ -0,0 +1,241 @@
+import std/os/path
+import std/os/dir
+import std/os/file
+import std/os/process
+import std/os/env
+import std/os/flags
+import std/core/undiv
+
+// Command line options of this tool; each field selects one group of tables to print.
+struct iflags
+  // set through the `a` / `asian-wide` flag
+  asian-wide: bool = False
+  // set through the `d` / `derived-core-properties` flag
+  derived-core-properties: bool = False
+
+// First part of the usage text; `process-flags` passes it to `usage` together with the flag descriptions.
+val header = "usage:\n stack exec koka -- -e util/update-unicode [-- [options]]\n\noptions:"
+
+// Descriptions of the supported flags: each entry pairs a short name (`a`, `d`) and a long name
+// with a setter that copies the parsed boolean into the matching `iflags` field, plus a help text.
+val flag-descs = 
+  fun set-asian-wide( f : iflags, b : bool ) : iflags { f(asian-wide = b) }
+  fun set-derived-core-properties( f : iflags, b : bool ) : iflags { f(derived-core-properties = b) }
+  [ Flag( "a", ["asian-wide"], Bool(set-asian-wide), "print updated asian wide information" ), 
+    Flag( "d", ["derived-core-properties"], Bool(set-derived-core-properties), "print updated derived core property information")]
+
+// Parse the program arguments against `flag-descs`.
+// Returns `Just` the parsed flags when there are neither parse errors nor left-over
+// arguments; otherwise prints the errors followed by the usage text and returns `Nothing`.
+pub fun process-flags() : <ndet,console> maybe<iflags>
+  val (parsed, rest, problems) = parse( Iflags(), flag-descs, get-args() )
+  match (problems, rest)
+    (Nil, Nil) -> Just(parsed)
+    _ ->
+      val report = problems.join("\n") ++ "\n" ++ flag-descs.usage(header)
+      println(report)
+      Nothing
+
+// One entry parsed from a unicode data file: the first and last code point as written in the
+// file (`hex-ranges` stores the same text in both for a single code point) and the text found
+// between the first `;` and the `#` on that line.
+type range
+  Range(bottom: string, top: string, description: string)
+
+// Read the unicode data file at `file` and turn its data lines into `range` entries.
+// A line is kept only when it contains at least one `;` and the text after the first `;`
+// contains exactly one `#`; the trimmed text before that `#` becomes the description.
+// The part before the first `;` is either a single code point or `bottom..top`;
+// any other shape throws an exception.
+fun hex-ranges(file: path): <exn,fsys> list<range>
+  file.read-text-file().split("\n").filter-map fn(line)
+    val fields = line.split(";")
+    if fields.length < 2 then Nothing else
+      val codes = fields[0].unjust.split("..")
+      val info = fields.drop(1).join(";").trim().split("#")
+      if info.length != 2 then Nothing else
+        val desc = info[0].unjust.trim()
+        match codes
+          Cons(only, Nil) ->
+            // a single code point is stored as a range whose ends are equal
+            val point = only.trim()
+            Just(Range(point, point, desc))
+          Cons(first, Cons(last, Nil)) ->
+            Just(Range(first.trim(), last.trim(), desc))
+          _ ->
+            throw("Err unsupported range")
+
+
+// Keep only the ranges whose description is exactly `desc`.
+fun filter-desc-eq(ranges: list<range>, desc: string): list<range>
+  ranges.filter fn(entry)
+    entry.description == desc
+
+// Keep only the ranges whose description satisfies `pred`.
+fun filter-desc(ranges: list<range>, pred: (string) -> bool): list<range>
+  ranges.filter fn(entry)
+    pred(entry.description)
+
+// Split `l` into consecutive batches of `n` elements, keeping the elements in their original
+// order; the last batch holds the remainder and may be shorter than `n`.
+// An empty list gives no batches. For `n < 1` no batch ever fills up, so all elements
+// end up together in a single batch.
+fun batch(l: list<a>, n: int): list<list<a>>
+  // `accl` is the batch under construction and `acc` the list of finished batches; both are
+  // built back-to-front, `left` counts the elements still missing from the current batch.
+  fun recur(l1 : list<a>, accl : list<a>, acc: list<list<a>>, left : int): list<list<a>>
+    match l1
+      Nil ->
+        // a left-over partial batch must be reversed just like a full one,
+        // otherwise its elements come out in the opposite order
+        if accl.is-nil then acc.reverse
+        else Cons(accl.reverse, acc).reverse
+      Cons(x, rst) ->
+        if left == 1 then
+          recur(rst, [], Cons(Cons(x, accl).reverse, acc), n)
+        else
+          recur(rst, Cons(x, accl), acc, left - 1)
+  recur(l, [], [], n)
+
+// Print Koka source for one property table to the console:
+// first `doc` as `//` comment lines (one per line of `doc`), then a definition
+// `val <koka-name> : delayed<total,rtree>` whose body calls `build-rtree` on the ranges,
+// written eight per line with every line ending in a comma.
+fun property-ranges(doc: string, koka-name: string, ranges: list<range>)
+  println("// Doc from unicode data file")
+  doc.split("\n").foreach fn(doc-line)
+    println("// " ++ doc-line)
+  val opening = "val " ++ koka-name ++ r" : delayed<total,rtree> = delay{
+  build-rtree(["
+  println(opening)
+  ranges.batch(8).foreach fn(row)
+    val cells = row.map(show-range).join(",")
+    println("    " ++ cells ++ ",")
+  println("  ])\n}")
+
+// Render a range as a constructor call in the generated source:
+// `point(0x..)` when both ends are equal, `single(0x..,0x..)` otherwise.
+fun show-range(r: range)
+  val low = "0x" ++ r.bottom
+  if r.bottom != r.top then "single(" ++ low ++ ",0x" ++ r.top ++ ")"
+  else "point(" ++ low ++ ")"
+
+// Entry point: parse the command line flags and print the requested unicode tables as Koka source.
+// For every selected group the unicode data file is downloaded with `curl` into the current
+// directory, parsed with `hex-ranges`, and each property is printed with `property-ranges`.
+// The two flags are handled independently, so passing both prints both groups in one run.
+// Throws (through `unjust`) when the flags cannot be parsed; `process-flags` has then
+// already printed the usage text.
+fun main()
+  val flags = process-flags().unjust
+  if flags.asian-wide then
+    val status = run-system(r#"curl "https://www.unicode.org/Public/UNIDATA/EastAsianWidth.txt" > EastAsianWidth.txt"#)
+    // The shell redirect creates the file even when the download fails, so the `is-file` check
+    // below cannot detect a failed `curl`; report it as a comment line to keep the output valid Koka.
+    if status != 0 then println("// warning: curl exited with status " ++ status.show ++ ", EastAsianWidth.txt may be empty or incomplete")
+    if "EastAsianWidth.txt".path.is-file then
+      val ranges = hex-ranges("EastAsianWidth.txt".path)
+      property-ranges(r#"East_Asian_Width property, consisting of one of the following values:
+       "A", "F", "H", "N", "Na", "W"
+- All code points, assigned or unassigned, that are not listed
+    explicitly are given the value "N".
+- The unassigned code points in the following blocks default to "W":
+       CJK Unified Ideographs Extension A: U+3400..U+4DBF
+       CJK Unified Ideographs:             U+4E00..U+9FFF
+       CJK Compatibility Ideographs:       U+F900..U+FAFF
+- All undesignated code points in Planes 2 and 3, whether inside or
+    outside of allocated blocks, default to "W":
+       Plane 2:                            U+20000..U+2FFFD
+       Plane 3:                            U+30000..U+3FFFD"#, "asian-wide", ranges.filter-desc-eq("W"))
+    else
+      println("Could not find EastAsianWidth.txt\n\tPlease download from https://www.unicode.org/Public/UNIDATA/EastAsianWidth.txt to your current directory")
+  // a separate `if` (not `elif`) so that this group is also printed when both flags are given
+  if flags.derived-core-properties then
+    val status = run-system(r#"curl "https://www.unicode.org/Public/UNIDATA/DerivedCoreProperties.txt" > DerivedCoreProperties.txt"#)
+    // same caveat as above: the redirect creates the file even when `curl` fails
+    if status != 0 then println("// warning: curl exited with status " ++ status.show ++ ", DerivedCoreProperties.txt may be empty or incomplete")
+    if "DerivedCoreProperties.txt".path.is-file then
+      val ranges = hex-ranges("DerivedCoreProperties.txt".path)
+      property-ranges(r"
+Derived Property: Math
+ Generated from: Sm + Other_Math", "math", ranges.filter-desc-eq("Math"))
+      property-ranges(r"Derived Property: Alphabetic
+ Generated from: Uppercase + Lowercase + Lt + Lm + Lo + Nl + Other_Alphabetic", "alphabetic", ranges.filter-desc-eq("Alphabetic"))
+      property-ranges(r"Derived Property: Lowercase
+ Generated from: Ll + Other_Lowercase", "lower", ranges.filter-desc-eq("Lowercase"))
+      property-ranges(r"Derived Property: Uppercase
+ Generated from: Lu + Other_Uppercase", "upper", ranges.filter-desc-eq("Uppercase"))
+      property-ranges(r"Derived Property:   Cased (Cased)
+ As defined by Unicode Standard Definition D135
+ C has the Lowercase or Uppercase property or has a General_Category value of Titlecase_Letter.", "cased", ranges.filter-desc-eq("Cased"))
+      property-ranges(r"Derived Property:   Case_Ignorable (CI)
+ As defined by Unicode Standard Definition D136
+ C is defined to be case-ignorable if
+   Word_Break(C) = MidLetter or MidNumLet or Single_Quote, or
+   General_Category(C) = Nonspacing_Mark (Mn), Enclosing_Mark (Me), Format (Cf), Modifier_Letter (Lm), or Modifier_Symbol (Sk).",
+        "case-ignore", ranges.filter-desc-eq("Case_Ignorable"))
+      property-ranges(r#"Derived Property:   Changes_When_Lowercased (CWL)
+ Characters whose normalized forms are not stable under a toLowercase mapping.
+ For more information, see D139 in Section 3.13, "Default Case Algorithms".
+ Changes_When_Lowercased(X) is true when toLowercase(toNFD(X)) != toNFD(X)"#,
+        "lowercase-changes", ranges.filter-desc-eq("Changes_When_Lowercased"))
+      property-ranges(r#"Derived Property:   Changes_When_Uppercased (CWU)
+ Characters whose normalized forms are not stable under a toUppercase mapping.
+ For more information, see D140 in Section 3.13, "Default Case Algorithms".
+ Changes_When_Uppercased(X) is true when toUppercase(toNFD(X)) != toNFD(X)"#,
+        "uppercase-changes", ranges.filter-desc-eq("Changes_When_Uppercased"))
+      property-ranges(r#"Derived Property:   Changes_When_Titlecased (CWT)
+ Characters whose normalized forms are not stable under a toTitlecase mapping.
+ For more information, see D141 in Section 3.13, "Default Case Algorithms".
+ Changes_When_Titlecased(X) is true when toTitlecase(toNFD(X)) != toNFD(X)"#,
+        "titlecase-changes", ranges.filter-desc-eq("Changes_When_Titlecased"))
+      property-ranges(r#"Derived Property:   Changes_When_Casefolded (CWCF)
+ Characters whose normalized forms are not stable under case folding.
+ For more information, see D142 in Section 3.13, "Default Case Algorithms".
+ Changes_When_Casefolded(X) is true when toCasefold(toNFD(X)) != toNFD(X)"#,
+        "casefolded-changes", ranges.filter-desc-eq("Changes_When_Casefolded"))
+      property-ranges(r#"Derived Property:   Changes_When_Casemapped (CWCM)
+ Characters whose normalized forms are not stable under case mapping.
+ For more information, see D143 in Section 3.13, "Default Case Algorithms".
+ Changes_When_Casemapped(X) is true when CWL(X), or CWT(X), or CWU(X)"#,
+        "casemapped-changes", ranges.filter-desc-eq("Changes_When_Casemapped"))
+      property-ranges(r#"Derived Property: ID_Start
+ Characters that can start an identifier.
+ Generated from:
+     Lu + Ll + Lt + Lm + Lo + Nl
+   + Other_ID_Start
+   - Pattern_Syntax
+   - Pattern_White_Space
+ NOTE: See UAX #31 for more information"#,
+        "id-start", ranges.filter-desc-eq("ID_Start"))
+      property-ranges(r#"Derived Property: ID_Continue
+ Characters that can continue an identifier.
+ Generated from:
+     ID_Start
+   + Mn + Mc + Nd + Pc
+   + Other_ID_Continue
+   - Pattern_Syntax
+   - Pattern_White_Space
+ NOTE: See UAX #31 for more information"#,
+        "id-continue", ranges.filter-desc-eq("ID_Continue"))
+      property-ranges(r#"Derived Property: XID_Start
+ ID_Start modified for closure under NFKx
+ Modified as described in UAX #15
+ NOTE: Does NOT remove the non-NFKx characters.
+       Merely ensures that if isIdentifer(string) then isIdentifier(NFKx(string))
+ NOTE: See UAX #31 for more information"#,
+        "xid-start", ranges.filter-desc-eq("XID_Start"))
+      property-ranges(r#"Derived Property: XID_Continue
+ Mod_ID_Continue modified for closure under NFKx
+ Modified as described in UAX #15
+ NOTE: Does NOT remove the non-NFKx characters.
+       Merely ensures that if isIdentifer(string) then isIdentifier(NFKx(string))
+ NOTE: See UAX #31 for more information"#,
+        "xid-continue", ranges.filter-desc-eq("XID_Continue"))
+      property-ranges(r#"Derived Property: Default_Ignorable_Code_Point
+ Generated from
+   Other_Default_Ignorable_Code_Point
+ + Cf (Format characters)
+ + Variation_Selector
+ - White_Space
+ - FFF9..FFFB (Interlinear annotation format characters)
+ - 13430..13440 (Egyptian hieroglyph format characters)
+ - Prepended_Concatenation_Mark (Exceptional format characters that should be visible)
+#
+There are currently no stability guarantees for DICP. However, the
+values of DICP interact with the derivation of XID_Continue
+and NFKC_CF, for which there are stability guarantees.
+Maintainers of this property should note that in the
+unlikely case that the DICP value changes for an existing character
+which is also XID_Continue=Yes, then exceptions must be put
+in place to ensure that the NFKC_CF mapping value for that
+existing character does not change."#,
+        "default-ignorable", ranges.filter-desc-eq("Default_Ignorable_Code_Point"))
+
+      property-ranges(r#"Derived Property: Grapheme_Extend
+ Generated from: Me + Mn + Other_Grapheme_Extend
+ Note: depending on an application's interpretation of Co (private use),
+ they may be either in Grapheme_Base, or in Grapheme_Extend, or in neither."#,
+        "grapheme-extend", ranges.filter-desc-eq("Grapheme_Extend"))
+      property-ranges(r#"Derived Property: Grapheme_Base
+ Generated from: [0..10FFFF] - Cc - Cf - Cs - Co - Cn - Zl - Zp - Grapheme_Extend
+ Note: depending on an application's interpretation of Co (private use),
+ they may be either in Grapheme_Base, or in Grapheme_Extend, or in neither."#,
+        "grapheme-base", ranges.filter-desc-eq("Grapheme_Base"))
+      property-ranges(r#"Derived Property: Grapheme_Link (deprecated)
+ Generated from: Canonical_Combining_Class=Virama
+ Use Canonical_Combining_Class=Virama directly instead"#,
+        "grapheme-link", ranges.filter-desc-eq("Grapheme_Link"))
+      property-ranges(r#"Derived Property: Indic_Conjunct_Break
+ Generated from the Grapheme_Cluster_Break, Indic_Syllabic_Category,
+ Canonical_Combining_Class, and Script properties as described in UAX #44:
+ https://www.unicode.org/reports/tr44/.
+
+ All code points not explicitly listed for Indic_Conjunct_Break
+ have the value None.
+
+@missing: 0000..10FFFF; InCB; None
+
+================================================
+
+Indic_Conjunct_Break=Linker"#,
+        "indic-break-linker", ranges.filter-desc-eq("InCB; Linker"))
+      property-ranges(r#"Indic_Conjunct_Break=Consonant"#,
+        "indic-break-consonant", ranges.filter-desc-eq("InCB; Consonant"))
+      property-ranges(r#"Indic_Conjunct_Break=Extend"#,
+        "indic-break-extend", ranges.filter-desc-eq("InCB; Extend"))
+    else
+      println("Could not find DerivedCoreProperties.txt\n\tPlease download from https://www.unicode.org/Public/UNIDATA/DerivedCoreProperties.txt to your current directory")
Original file line number	Diff line number	Diff line change
		@@ -0,0 +1,3 @@
		['h','i','/u2764','/uFE0F','/u200D','/U01F525']
		3
		2