def Call(self, rd):
    # type: (typed_args.Reader) -> value_t
    """
    s.split(string_sep, count=-1)
    s.split(eggex_sep, count=-1)

    Split the receiver string into a List of Str chunks, using either a
    Str separator or an Eggex separator.

    Count behaves like in replace() in that:
    - `count` < 0 -> ignore
    - `count` >= 0 -> there will be at most `count` splits

    Raises:
      error.TypeErr if the separator is neither an Eggex nor a Str.
      error.Structured (code 3) if:
      - the Str separator is empty (checked even when the receiver is the
        empty string)
      - an Eggex separator is used and the receiver contains a NUL byte
      - the Eggex separator matches the empty string during the scan
    """
    string = rd.PosStr()

    string_sep = None  # type: str
    eggex_sep = None  # type: value.Eggex

    # Exactly one of string_sep / eggex_sep is set after this switch.
    sep = rd.PosValue()
    with tagswitch(sep) as case:
        if case(value_e.Eggex):
            eggex_sep = cast(value.Eggex, sep)

        elif case(value_e.Str):
            string_sep = cast(value.Str, sep).s

        else:
            raise error.TypeErr(sep, 'expected separator to be Eggex or Str',
                                rd.LeftParenToken())

    count = mops.BigTruncate(rd.NamedInt("count", -1))
    rd.Done()

    # Validate the separator BEFORE the empty-receiver shortcut below, so
    # that ''.split('') is a usage error just like 'abc'.split('').
    if string_sep is not None and len(string_sep) == 0:
        raise error.Structured(3, "separator must be non-empty",
                               rd.LeftParenToken())

    if len(string) == 0:
        return value.List([])

    # Shared scan state: `cursor` is the index where the next chunk begins.
    cursor = 0
    chunks = []  # type: List[value_t]

    if string_sep is not None:
        while cursor < len(string) and count != 0:
            found = string.find(string_sep, cursor)
            if found == -1:
                break

            chunks.append(value.Str(string[cursor:found]))
            cursor = found + len(string_sep)
            count -= 1

        # Whatever follows the last separator, possibly the empty string
        chunks.append(value.Str(string[cursor:]))

        return value.List(chunks)

    if eggex_sep is not None:
        if '\0' in string:
            raise error.Structured(
                3, "cannot split a string with a NUL byte",
                rd.LeftParenToken())

        regex = regex_translate.AsPosixEre(eggex_sep)
        cflags = regex_translate.LibcFlags(eggex_sep.canonical_flags)

        while cursor < len(string) and count != 0:
            # NOTE(review): the 4th argument (0) is presumably eflags.  It
            # stays 0 even when cursor > 0, so a start-anchored pattern may
            # match at every resume position -- confirm whether REG_NOTBOL
            # should be passed here.
            m = libc.regex_search(regex, cflags, string, 0, cursor)
            if m is None:
                break

            start = m[0]
            end = m[1]
            if start == end:
                # An empty match would never advance the cursor
                raise error.Structured(
                    3,
                    "eggex separators should never match the empty string",
                    rd.LeftParenToken())

            chunks.append(value.Str(string[cursor:start]))
            cursor = end

            count -= 1

        chunks.append(value.Str(string[cursor:]))

        return value.List(chunks)

    # Unreachable: the switch above sets one separator or raises
    raise AssertionError()
('a<>b@@c' | '@@' /)) +pp test_ ('a b cd'.split(/ space* /)) +pp test_ ('a b cd'.split(/ space+ /)) +pp test_ (''.split(/ dot /)) +EOF +} + "$@" diff --git a/doc/ref/chap-type-method.md b/doc/ref/chap-type-method.md index 55d9a56300..02ca37a9ee 100644 --- a/doc/ref/chap-type-method.md +++ b/doc/ref/chap-type-method.md @@ -261,18 +261,30 @@ The `%start` or `^` metacharacter will only match when `pos` is zero. Split a string by a `Str` separator `sep` into a `List` of chunks. pp ('a;b;;c'.split(';')) # => ["a", "b", "", "c"] - pp ('a<>b<>c')) # => ["a","b","cb<>c')) # => ["a", "b", "c ["🌞", "🌞", "🌞"] +Or split using an `Eggex`. + + pp ('a b cd'.split(/ space+ /)) # => ["a", "b", "cd"] + pp ('a,b;c'.split(/ ',' | ';' /)) # => ["a", "b", "c"] + Optionally, provide a `count` to split on `sep` at most `count` times. A negative `count` will split on all occurrences of `sep`. pp ('a;b;;c'.split(';', count=2)) # => ["a", "b", ";c"] pp ('a;b;;c'.split(';', count=-1)) # => ["a", "b", "", "c"] -Passing an empty `sep` will result in an error: +Passing an empty `sep` will result in an error. + + pp ('abc'.split('')) # => Error: Sep cannot be "" + +Splitting by an `Eggex` has some limitations: - pp test_ ('abc'.split('')) # => Error: Sep cannot be "" +- If a `search()` results in an empty string match, eg. + `'abc'.split(/ space* /)`, then we raise an error to avoid an infinite loop. +- The string to split cannot contain NUL bytes because we use the libc regex + engine. 
## List diff --git a/spec/ysh-methods.test.sh b/spec/ysh-methods.test.sh index 61e9d969a0..1744a1bb59 100644 --- a/spec/ysh-methods.test.sh +++ b/spec/ysh-methods.test.sh @@ -382,7 +382,7 @@ pp test_ (en2fr => keys()) (List) ["hello","friend","cat"] ## END -#### Str => split(sep), non-empty sep +#### Str => split(sep), non-empty str sep pp test_ ('a,b,c'.split(',')) pp test_ ('aa'.split('a')) pp test_ ('a<>b<>c')) @@ -396,7 +396,21 @@ pp test_ (''.split('foo')) (List) [] ## END -#### Str => split(sep, count), non-empty sep +#### Str => split(sep), eggex sep +pp test_ ('a,b;c'.split(/ ',' | ';' /)) +pp test_ ('aa'.split(/ dot /)) +pp test_ ('a<>b@@c' | '@@' /)) +pp test_ ('a b cd'.split(/ space+ /)) +pp test_ (''.split(/ dot /)) +## STDOUT: +(List) ["a","b","c"] +(List) ["","",""] +(List) ["a","b","c split(sep, count), non-empty str sep pp test_ ('a,b,c'.split(',', count=-1)) pp test_ ('a,b,c'.split(',', count=-2)) # Any negative count means "ignore count" pp test_ ('aa'.split('a', count=1)) @@ -416,20 +430,47 @@ pp test_ (''.split(',', count=0)) (List) [] ## END +#### Str => split(sep, count), eggex sep +pp test_ ('a,b;c'.split(/ ',' | ';' /, count=-1)) +pp test_ ('aa'.split(/ dot /, count=1)) +pp test_ ('a<>b@@c' | '@@' /, count=50)) +pp test_ ('a b c'.split(/ space+ /, count=0)) +pp test_ (''.split(/ dot /, count=1)) +## STDOUT: +(List) ["a","b","c"] +(List) ["","a"] +(List) ["a","b","c split(), usage errors -try { pp test_ ('abc'.split('')) } # Sep cannot be "" +try { pp test_ ('abc'.split('')) } # Sep cannot be "" +echo status=$[_error.code] +try { pp test_ ('abc'.split()) } # Sep must be present +echo status=$[_error.code] +try { pp test_ (b'\y00a\y01'.split(/ 'a' /)) } # Cannot split by eggex when str has NUL-byte echo status=$[_error.code] -try { pp test_ ('abc'.split()) } # Sep must be present +try { pp test_ (b'abc'.split(/ space* /)) } # Eggex cannot accept empty string +echo status=$[_error.code] +try { pp test_ (b'abc'.split(/ dot* /)) } # But in some 
cases the input doesn't cause an + # infinite loop, so we actually allow it! echo status=$[_error.code] ## STDOUT: status=3 status=3 +status=3 +status=3 +(List) ["",""] +status=0 ## END #### Str => split(), non-ascii pp test_ ('🌞🌝🌞🌝🌞'.split('🌝')) +pp test_ ('🌞🌝🌞🌝🌞'.split(/ '🌝' /)) ## STDOUT: (List) ["🌞","🌞","🌞"] +(List) ["🌞","🌞","🌞"] ## END #### Dict => values()