oils-for-unix · andychu · Aug 31, 2024 · Aug 20, 2024 · Aug 21, 2024 · Aug 21, 2024
diff --git a/builtin/method_str.py b/builtin/method_str.py
@@ -488,38 +488,98 @@ def __init__(self):
     def Call(self, rd):
         # type: (typed_args.Reader) -> value_t
         """
-        s.split(sep, count=-1)
+        s.split(string_sep, count=-1)
+        s.split(eggex_sep, count=-1)
 
         Count behaves like in replace() in that:
         - `count` <  0 -> ignore
         - `count` >= 0 -> there will be at most `count` splits
         """
         string = rd.PosStr()
-        sep = rd.PosStr()
+
+        # Exactly one of these is set by the tagswitch below (or it raises)
+        string_sep = None  # type: str
+        eggex_sep = None  # type: value.Eggex
+
+        sep = rd.PosValue()
+        with tagswitch(sep) as case:
+            if case(value_e.Eggex):
+                eggex_sep_ = cast(value.Eggex, sep)
+                eggex_sep = eggex_sep_
+
+            elif case(value_e.Str):
+                string_sep_ = cast(value.Str, sep)
+                string_sep = string_sep_.s
+
+            else:
+                raise error.TypeErr(sep, 'expected separator to be Eggex or Str',
+                                    rd.LeftParenToken())
+
         count = mops.BigTruncate(rd.NamedInt("count", -1))
         rd.Done()
 
-        if len(sep) == 0:
-            raise error.Structured(3, "sep must be non-empty", rd.LeftParenToken())
-
+        # An empty input yields an empty list, before the separator is checked
         if len(string) == 0:
             return value.List([])
 
-        cursor = 0
-        chunks = []  # type: List[value_t]
-        while cursor < len(string) and count != 0:
-            next = string.find(sep, cursor)
-            if next == -1:
-                break
+        if string_sep is not None:
+            if len(string_sep) == 0:
+                raise error.Structured(3, "separator must be non-empty",
+                                       rd.LeftParenToken())
 
-            chunks.append(value.Str(string[cursor:next]))
-            cursor = next + len(sep)
-            count -= 1
+            cursor = 0
+            chunks = []  # type: List[value_t]
+            # A negative count never hits 0, so it doesn't limit the splits
+            while cursor < len(string) and count != 0:
+                next = string.find(string_sep, cursor)
+                if next == -1:
+                    break
+
+                chunks.append(value.Str(string[cursor:next]))
+                cursor = next + len(string_sep)
+                count -= 1
 
-        if cursor == len(string):
-            # An instance of sep was against the end of the string
-            chunks.append(value.Str(""))
-        else:
+            # The rest of the string after the last split; may be empty
             chunks.append(value.Str(string[cursor:]))
 
-        return value.List(chunks)
+            return value.List(chunks)
+
+        if eggex_sep is not None:
+            # NUL bytes are rejected because matching uses the libc regex engine
+            if '\0' in string:
+                raise error.Structured(
+                    3, "cannot split a string with a NUL byte",
+                    rd.LeftParenToken())
+
+            regex = regex_translate.AsPosixEre(eggex_sep)
+            cflags = regex_translate.LibcFlags(eggex_sep.canonical_flags)
+
+            cursor = 0
+            chunks = []
+            while cursor < len(string) and count != 0:
+                # NOTE(review): the last two args look like (eflags, start pos);
+                # confirm against libc.regex_search
+                m = libc.regex_search(regex, cflags, string, 0, cursor)
+                if m is None:
+                    break
+
+                start = m[0]
+                end = m[1]
+                # Rejected because an empty match could make this loop run forever
+                if start == end:
+                    raise error.Structured(
+                        3,
+                        "eggex separators should never match the empty string",
+                        rd.LeftParenToken())
+
+                chunks.append(value.Str(string[cursor:start]))
+                cursor = end
+
+                count -= 1
+
+            chunks.append(value.Str(string[cursor:]))
+
+            return value.List(chunks)
+
+        # Unreachable: sep was a Str or an Eggex, otherwise TypeErr was raised
+        raise AssertionError()
diff --git a/demo/survey-str-api.sh b/demo/survey-str-api.sh
@@ -122,4 +122,70 @@ survey-trim() {
   nodejs -e 'var s = process.argv[1]; var t = s.trim(); console.log(`[${s}] [${t}]`);' "$str"
 }
 
+# Print how Python, Node, and YSH split the same inputs, by string and by
+# regex separators, so the three behaviors can be compared side by side.
+# The here-docs are quoted so the shell passes each program through verbatim.
+survey-split() {
+  echo '============== PYTHON'
+  echo
+
+  python3 << 'EOF'
+print('a,b,c'.split(','))
+print('aa'.split('a'))
+print('a<>b<>c<d'.split('<>'))
+print('a;b;;c'.split(';'))
+print(''.split('foo'))
+
+import re
+
+print(re.split(',|;', 'a,b;c'))
+print(re.split('.*', 'aa'))
+print(re.split('.', 'aa'))
+print(re.split('<>|@@', 'a<>b@@c<d'))
+print(re.split('\\s*', 'a b  cd'))
+print(re.split('\\s+', 'a b  cd'))
+print(re.split('.', ''))
+EOF
+
+  echo
+  echo '============== NODE'
+  echo
+
+  node << 'EOF'
+console.log('a,b,c'.split(','))
+console.log('aa'.split('a'))
+console.log('a<>b<>c<d'.split('<>'))
+console.log('a;b;;c'.split(';'))
+console.log(''.split('foo'))
+
+console.log('a,b;c'.split(/,|;/))
+console.log('aa'.split(/.*/))
+console.log('aa'.split(/./))
+console.log('a<>b@@c<d'.split(/<>|@@/))
+console.log('a b  cd'.split(/\s*/))
+console.log('a b  cd'.split(/\s+/))
+console.log(''.split(/./))
+EOF
+
+  echo
+  echo '============== YSH'
+  echo
+
+  bin/ysh << 'EOF'
+pp test_ ('a,b,c'.split(','))
+pp test_ ('aa'.split('a'))
+pp test_ ('a<>b<>c<d'.split('<>'))
+pp test_ ('a;b;;c'.split(';'))
+pp test_ (''.split('foo'))
+
+pp test_ ('a,b;c'.split(/ ',' | ';' /))
+pp test_ ('aa'.split(/ dot* /))
+pp test_ ('aa'.split(/ dot /))
+pp test_ ('a<>b@@c<d'.split(/ '<>' | '@@' /))
+pp test_ ('a b  cd'.split(/ space* /))
+pp test_ ('a b  cd'.split(/ space+ /))
+pp test_ (''.split(/ dot /))
+EOF
+}
+
 "$@"
diff --git a/doc/ref/chap-type-method.md b/doc/ref/chap-type-method.md
@@ -261,18 +261,30 @@ The `%start` or `^` metacharacter will only match when `pos` is zero.
 Split a string by a `Str` separator `sep` into a `List` of chunks.
 
     pp ('a;b;;c'.split(';'))       # => ["a", "b", "", "c"]
-    pp ('a<>b<>c<d'.split('<>'))   # => ["a","b","c<d"]
+    pp ('a<>b<>c<d'.split('<>'))   # => ["a", "b", "c<d"]
     pp ('🌞🌝🌞🌝🌞'.split('🌝'))  # => ["🌞", "🌞", "🌞"]
 
+Or split using an `Eggex`.
+
+    pp ('a b  cd'.split(/ space+ /))   # => ["a", "b", "cd"]
+    pp ('a,b;c'.split(/ ',' | ';' /))  # => ["a", "b", "c"]
+
 Optionally, provide a `count` to split on `sep` at most `count` times. A
 negative `count` will split on all occurrences of `sep`.
 
     pp ('a;b;;c'.split(';', count=2))   # => ["a", "b", ";c"]
     pp ('a;b;;c'.split(';', count=-1))  # => ["a", "b", "", "c"]
 
-Passing an empty `sep` will result in an error:
+Passing an empty `sep` will result in an error.
+
+    pp ('abc'.split(''))  # => Error: separator must be non-empty
+
+Splitting by an `Eggex` has some limitations:
 
-    pp test_ ('abc'.split(''))            # => Error: Sep cannot be ""
+- If the `Eggex` yields an empty (zero-length) match, e.g.
+  `'abc'.split(/ space* /)`, then we raise an error to avoid an infinite loop.
+- The string to split cannot contain NUL bytes because we use the libc regex
+  engine.
 
 ## List
 

diff --git a/spec/ysh-methods.test.sh b/spec/ysh-methods.test.sh
@@ -382,7 +382,7 @@ pp test_ (en2fr => keys())
 (List)   ["hello","friend","cat"]
 ## END
 
-#### Str => split(sep), non-empty sep
+#### Str => split(sep), non-empty str sep
 pp test_ ('a,b,c'.split(','))
 pp test_ ('aa'.split('a'))
 pp test_ ('a<>b<>c<d'.split('<>'))
@@ -396,7 +396,21 @@ pp test_ (''.split('foo'))
 (List)   []
 ## END
 
-#### Str => split(sep, count), non-empty sep
+#### Str => split(sep), eggex sep
+pp test_ ('a,b;c'.split(/ ',' | ';' /))
+pp test_ ('aa'.split(/ dot /))
+pp test_ ('a<>b@@c<d'.split(/ '<>' | '@@' /))
+pp test_ ('a b  cd'.split(/ space+ /))
+pp test_ (''.split(/ dot /))
+## STDOUT:
+(List)   ["a","b","c"]
+(List)   ["","",""]
+(List)   ["a","b","c<d"]
+(List)   ["a","b","cd"]
+(List)   []
+## END
+
+#### Str => split(sep, count), non-empty str sep
 pp test_ ('a,b,c'.split(',', count=-1))
 pp test_ ('a,b,c'.split(',', count=-2))  # Any negative count means "ignore count"
 pp test_ ('aa'.split('a', count=1))
@@ -416,20 +430,47 @@ pp test_ (''.split(',', count=0))
 (List)   []
 ## END
 
+#### Str => split(sep, count), eggex sep
+pp test_ ('a,b;c'.split(/ ',' | ';' /, count=-1))
+pp test_ ('aa'.split(/ dot /, count=1))
+pp test_ ('a<>b@@c<d'.split(/ '<>' | '@@' /, count=50))
+pp test_ ('a b  c'.split(/ space+ /, count=0))
+pp test_ (''.split(/ dot /, count=1))
+## STDOUT:
+(List)   ["a","b","c"]
+(List)   ["","a"]
+(List)   ["a","b","c<d"]
+(List)   ["a b  c"]
+(List)   []
+## END
+
 #### Str => split(), usage errors
-try { pp test_ ('abc'.split(''))           } # Sep cannot be ""
+try { pp test_ ('abc'.split(''))             } # Sep cannot be ""
+echo status=$[_error.code]
+try { pp test_ ('abc'.split())               } # Sep must be present
+echo status=$[_error.code]
+try { pp test_ (b'\y00a\y01'.split(/ 'a' /)) } # Cannot split by eggex when str has NUL-byte
 echo status=$[_error.code]
-try { pp test_ ('abc'.split())             } # Sep must be present
+try { pp test_ (b'abc'.split(/ space* /))    } # Eggex cannot accept empty string
+echo status=$[_error.code]
+try { pp test_ (b'abc'.split(/ dot* /))      } # But in some cases the input doesn't cause an
+                                               # infinite loop, so we actually allow it!
 echo status=$[_error.code]
 ## STDOUT:
 status=3
 status=3
+status=3
+status=3
+(List)   ["",""]
+status=0
 ## END
 
 #### Str => split(), non-ascii
 pp test_ ('🌞🌝🌞🌝🌞'.split('🌝'))
+pp test_ ('🌞🌝🌞🌝🌞'.split(/ '🌝' /))
 ## STDOUT:
 (List)   ["🌞","🌞","🌞"]
+(List)   ["🌞","🌞","🌞"]
 ## END
 
 #### Dict => values()