[spec/word-split] Add ash, yash

[doc/unicode] Update list of operations, including $IFS splitting, aka shSplit()
oils-for-unix · Jan 5, 2025 · 097f0d1 · 097f0d1
1 parent 10ec7ae
commit 097f0d1
Show file tree

Hide file tree

Showing 2 changed files with 58 additions and 33 deletions.
diff --git a/doc/unicode.md b/doc/unicode.md
@@ -19,10 +19,14 @@ UTF-8, while the latter use arrays of code points or UTF-16 code units.
 
 ## A Mental Model
 
-### Program Encoding
+### Program Encoding - OSH vs. YSH
 
-Shell **programs** should be encoded in UTF-8 (or its ASCII subset).  Unicode
-characters can be encoded directly in the source:
+- The source files of OSH programs may have arbitrary bytes, for backward
+  compatibility.
+- The source files of YSH programs should be encoded in UTF-8 (or its
+  ASCII subset).  TODO: Enforce this with `shopt --set utf8_source`
+
+Unicode characters can be encoded directly in the source:
 
 <pre>
 echo '&#x03bc;'
@@ -38,8 +42,8 @@ or denoted in ASCII with C-escaped strings:
 
 ### Data Encoding
 
-Strings in OSH are arbitrary sequences of **bytes**, which may be valid UTF-8.
-Details:
+Strings in OSH are arbitrary sequences of **bytes**, which may or may not be
+valid UTF-8.  Details:
 
 - When passed to external programs, strings are truncated at the first `NUL`
   (`'\0'`) byte.  This is a consequence of how Unix and C work.
@@ -51,20 +55,30 @@ Details:
 
 ### OSH / bash
 
-These operations are currently implemented in Python, in `osh/string_ops.py`:
+These operations are implemented in Python.
+
+In `osh/string_ops.py`:
 
 - `${#s}` -- length in code points (buggy in bash)
   - Note: YSH `len(s)` returns a number of bytes, not code points.
 - `${s:1:2}` -- index and length are a number of code points
 - `${x#glob?}` and `${x##glob?}` (see below)
 
+In `builtin/`:
+
+- `printf '%d' \'c` where `c` is an arbitrary character.  This is an obscure
+  syntax for `ord()`, i.e. getting an integer from an encoded character.
+
 More:
 
+- `$IFS` word splitting.  Affects `shSplit()` builtin
+  - Doesn't respect unicode in dash, ash, mksh.  But it does in bash, yash, and
+    zsh with `setopt SH_WORD_SPLIT`.
+  - TODO: Oils should probably respect it
 - `${foo,}` and `${foo^}` for lowercase / uppercase
+  - TODO: doesn't respect unicode
 - `[[ a < b ]]` and `[ a '<' b ]` for sorting
   - these can use libc `strcoll()`?
-- `printf '%d' \'c` where `c` is an arbitrary character.  This is an obscure
-  syntax for `ord()`, i.e. getting an integer from an encoded character.
 
 #### Globs
 
@@ -109,10 +123,11 @@ Other:
 - Eggex matching depends on ERE semantics.
   - `mystr ~ / [ \xff ] /` 
   - `case (x) { / dot / }`
-- `for offset, rune in (runes(mystr))` decodes UTF-8, like Go
 - `Str.{trim,trimLeft,trimRight}` respect unicode space, like JavaScript does
-- `Str.{upper,lower}` also need unicode case folding
-- `split()` respects unicode space?
+- TODO: `Str.{upper,lower}` also need unicode case folding
+- TODO: `s.split()` doesn't have a default "split by space", which should
+  probably respect unicode space, like `trim()` does
+- TODO: `for offset, rune in (runes(mystr))` decodes UTF-8, like Go
 
 Not unicode aware:
 
@@ -183,8 +198,6 @@ June 2024 notes:
 
 -->
 
-
-
 <!--
 
 What libraries are we using?

diff --git a/spec/word-split.test.sh b/spec/word-split.test.sh
@@ -1,5 +1,5 @@
-## compare_shells: bash dash mksh
-## oils_failures_allowed: 9
+## compare_shells: bash dash mksh ash yash
+## oils_failures_allowed: 10
 
 # NOTE on bash bug:  After setting IFS to array, it never splits anymore?  Even
 # if you assign IFS again.
@@ -79,6 +79,11 @@ argv.py $empty
 [' ']
 []
 ## END
+## BUG yash STDOUT:
+[]
+[' ']
+[]
+## END
 
 #### Leading/trailing word elision with non-whitespace IFS
 # This behavior is weird.
@@ -153,26 +158,19 @@ argv.py 1${undefined:-"2_3"x_x"4_5"}6
 
 #### IFS empty doesn't do splitting
 IFS=''
-x=$(echo -e ' a b\tc\n')
+x=$(python2 -c 'print(" a b\tc\n")')
 argv.py $x
 ## STDOUT:
 [' a b\tc']
 ## END
-## N-I dash STDOUT:
-['-e  a b\tc']
-## END
-
 
 #### IFS unset behaves like $' \t\n'
 unset IFS
-x=$(echo -e ' a b\tc\n')
+x=$(python2 -c 'print(" a b\tc\n")')
 argv.py $x
 ## STDOUT:
 ['a', 'b', 'c']
 ## END
-## N-I dash STDOUT:
-['-e', 'a', 'b', 'c']
-## END
 
 #### IFS='\'
 # NOTE: OSH fails this because of double backslash escaping issue!
@@ -285,7 +283,7 @@ argv.py "$s"
 ['x y z']
 ['x:y z']
 ## END
-## OK dash STDOUT:
+## BUG dash/ash/yash STDOUT:
 ['x:y z']
 ['x:y z']
 ['x:y z']
@@ -325,10 +323,6 @@ argv.py star $*
 ['at', 'a', 'b c']
 ['star', 'a', 'b c']
 ## END
-## BUG ash STDOUT:
-['at', 'ab c']
-['star', 'ab c']
-## END
 
 #### IFS='' with $@ and $* and printf (bug #627)
 set -- a 'b c'
@@ -341,10 +335,6 @@ printf '[%s]\n' $*
 [a]
 [b c]
 ## END
-## BUG ash STDOUT:
-[ab c]
-[ab c]
-## END
 
 #### IFS='' with ${a[@]} and ${a[*]} (bug #627)
 myarray=(a 'b c')
@@ -456,3 +446,25 @@ sum 12 30 # fails with "fatal: Undefined variable '2'" on res=$(($1 + $2))
 12 + 30 = 42
 12 + 30 = 42
 ## END
+
+#### Unicode in IFS
+
+# bash, zsh, and yash support unicode in IFS, but dash/mksh/ash don't.
+
+# for zsh, though we're not testing it here
+setopt SH_WORD_SPLIT
+
+x=çx IFS=ç
+printf "<%s>\n" $x
+
+## STDOUT:
+<>
+<x>
+## END
+
+## BUG dash/mksh/ash STDOUT:
+<>
+<>
+<x>
+## END
+