Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Support split by eggex in Str.split #2051

Merged
merged 17 commits into from
Aug 31, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
89 changes: 70 additions & 19 deletions builtin/method_str.py
Original file line number Diff line number Diff line change
Expand Up @@ -488,38 +488,89 @@ def __init__(self):
def Call(self, rd):
# type: (typed_args.Reader) -> value_t
"""
s.split(sep, count=-1)
s.split(string_sep, count=-1)
s.split(eggex_sep, count=-1)

Count behaves like in replace() in that:
- `count` < 0 -> ignore
- `count` >= 0 -> there will be at most `count` splits
"""
string = rd.PosStr()
sep = rd.PosStr()

string_sep = None # type: str
eggex_sep = None # type: value.Eggex

sep = rd.PosValue()
with tagswitch(sep) as case:
if case(value_e.Eggex):
eggex_sep_ = cast(value.Eggex, sep)
eggex_sep = eggex_sep_

elif case(value_e.Str):
string_sep_ = cast(value.Str, sep)
string_sep = string_sep_.s

else:
raise error.TypeErr(sep, 'expected separator to be Eggex or Str',
rd.LeftParenToken())

count = mops.BigTruncate(rd.NamedInt("count", -1))
rd.Done()

if len(sep) == 0:
raise error.Structured(3, "sep must be non-empty", rd.LeftParenToken())

if len(string) == 0:
return value.List([])

cursor = 0
chunks = [] # type: List[value_t]
while cursor < len(string) and count != 0:
next = string.find(sep, cursor)
if next == -1:
break
if string_sep is not None:
if len(string_sep) == 0:
raise error.Structured(3, "separator must be non-empty",
rd.LeftParenToken())

chunks.append(value.Str(string[cursor:next]))
cursor = next + len(sep)
count -= 1
cursor = 0
chunks = [] # type: List[value_t]
while cursor < len(string) and count != 0:
next = string.find(string_sep, cursor)
if next == -1:
break

chunks.append(value.Str(string[cursor:next]))
cursor = next + len(string_sep)
count -= 1

if cursor == len(string):
# An instance of sep was against the end of the string
chunks.append(value.Str(""))
else:
chunks.append(value.Str(string[cursor:]))

return value.List(chunks)
return value.List(chunks)

if eggex_sep is not None:
if '\0' in string:
raise error.Structured(
3, "cannot split a string with a NUL byte",
rd.LeftParenToken())

regex = regex_translate.AsPosixEre(eggex_sep)
cflags = regex_translate.LibcFlags(eggex_sep.canonical_flags)

cursor = 0
chunks = []
while cursor < len(string) and count != 0:
m = libc.regex_search(regex, cflags, string, 0, cursor)
if m is None:
break

start = m[0]
end = m[1]
if start == end:
raise error.Structured(
3,
"eggex separators should never match the empty string",
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This error reminds me that it would be nice if it were possible to take a STRING regex, in ERE syntax

The ~ operator in Oils can do that, e.g. if ('mystr' ~ 'myregex+')

But the signature doesn't really allow that


But I'm OK just leaving this out for now ... I think it is possible to just write your own ERE string split in pure YSH

It would be nice to handle this, but we can put it off until someone asks I think ...

(I want to get through Hay/Modules/pure functions before going too deep, and we definitely need this split(), and split() on whitespace)

rd.LeftParenToken())

chunks.append(value.Str(string[cursor:start]))
cursor = end

count -= 1

chunks.append(value.Str(string[cursor:]))

return value.List(chunks)

raise AssertionError()
63 changes: 63 additions & 0 deletions demo/survey-str-api.sh
Original file line number Diff line number Diff line change
Expand Up @@ -122,4 +122,67 @@ survey-trim() {
nodejs -e 'var s = process.argv[1]; var t = s.trim(); console.log(`[${s}] [${t}]`);' "$str"
}

# Print how Python 3, Node.js, and YSH each split the same sample strings, so
# the three behaviors can be compared side by side.
#
# Every section runs the literal-separator cases first and then the
# regex (eggex, in YSH) cases.
#
# NOTE: the heredocs below are unquoted, so the shell processes backslashes
# before the interpreter sees the text: '\\s' in the Python section reaches
# Python as '\s'.
survey-split() {
echo '============== PYTHON'
echo

python3 << EOF
print('a,b,c'.split(','))
print('aa'.split('a'))
print('a<>b<>c<d'.split('<>'))
print('a;b;;c'.split(';'))
print(''.split('foo'))

import re

print(re.split(',|;', 'a,b;c'))
print(re.split('.*', 'aa'))
print(re.split('.', 'aa'))
print(re.split('<>|@@', 'a<>b@@c<d'))
print(re.split('\\s*', 'a b cd'))
print(re.split('\\s+', 'a b cd'))
print(re.split('.', ''))
EOF

# Same inputs, using JavaScript's String.prototype.split with string and
# regex-literal separators.
echo
echo '============== NODE'
echo

node << EOF
console.log('a,b,c'.split(','))
console.log('aa'.split('a'))
console.log('a<>b<>c<d'.split('<>'))
console.log('a;b;;c'.split(';'))
console.log(''.split('foo'))

console.log('a,b;c'.split(/,|;/))
console.log('aa'.split(/.*/))
console.log('aa'.split(/./))
console.log('a<>b@@c<d'.split(/<>|@@/))
console.log('a b cd'.split(/\s*/))
console.log('a b cd'.split(/\s+/))
console.log(''.split(/./))
EOF

# Same inputs again, using the YSH split() method with Str and eggex
# separators; run from the repo's bin/ysh.
echo
echo '============== YSH'
echo

bin/ysh << EOF
pp test_ ('a,b,c'.split(','))
pp test_ ('aa'.split('a'))
pp test_ ('a<>b<>c<d'.split('<>'))
pp test_ ('a;b;;c'.split(';'))
pp test_ (''.split('foo'))

pp test_ ('a,b;c'.split(/ ',' | ';' /))
pp test_ ('aa'.split(/ dot* /))
pp test_ ('aa'.split(/ dot /))
pp test_ ('a<>b@@c<d'.split(/ '<>' | '@@' /))
pp test_ ('a b cd'.split(/ space* /))
pp test_ ('a b cd'.split(/ space+ /))
pp test_ (''.split(/ dot /))
EOF
}

"$@"
18 changes: 15 additions & 3 deletions doc/ref/chap-type-method.md
Original file line number Diff line number Diff line change
Expand Up @@ -261,18 +261,30 @@ The `%start` or `^` metacharacter will only match when `pos` is zero.
Split a string by a `Str` separator `sep` into a `List` of chunks.

pp ('a;b;;c'.split(';')) # => ["a", "b", "", "c"]
pp ('a<>b<>c<d'.split('<>')) # => ["a","b","c<d"]
pp ('a<>b<>c<d'.split('<>')) # => ["a", "b", "c<d"]
pp ('🌞🌝🌞🌝🌞'.split('🌝')) # => ["🌞", "🌞", "🌞"]

Or split using an `Eggex`.

pp ('a b cd'.split(/ space+ /)) # => ["a", "b", "cd"]
pp ('a,b;c'.split(/ ',' | ';' /)) # => ["a", "b", "c"]

Optionally, provide a `count` to split on `sep` at most `count` times. A
negative `count` will split on all occurrences of `sep`.

pp ('a;b;;c'.split(';', count=2)) # => ["a", "b", ";c"]
pp ('a;b;;c'.split(';', count=-1)) # => ["a", "b", "", "c"]

Passing an empty `sep` will result in an error:
Passing an empty `sep` will result in an error.

pp ('abc'.split('')) # => Error: Sep cannot be ""

Splitting by an `Eggex` has some limitations:

pp test_ ('abc'.split('')) # => Error: Sep cannot be ""
- If a `search()` results in an empty string match, e.g.
`'abc'.split(/ space* /)`, then we raise an error to avoid an infinite loop.
- The string to split cannot contain NUL bytes because we use the libc regex
engine.

## List

Expand Down
49 changes: 45 additions & 4 deletions spec/ysh-methods.test.sh
Original file line number Diff line number Diff line change
Expand Up @@ -382,7 +382,7 @@ pp test_ (en2fr => keys())
(List) ["hello","friend","cat"]
## END

#### Str => split(sep), non-empty sep
#### Str => split(sep), non-empty str sep
pp test_ ('a,b,c'.split(','))
pp test_ ('aa'.split('a'))
pp test_ ('a<>b<>c<d'.split('<>'))
Expand All @@ -396,7 +396,21 @@ pp test_ (''.split('foo'))
(List) []
## END

#### Str => split(sep, count), non-empty sep
#### Str => split(sep), eggex sep
pp test_ ('a,b;c'.split(/ ',' | ';' /))
pp test_ ('aa'.split(/ dot /))
pp test_ ('a<>b@@c<d'.split(/ '<>' | '@@' /))
pp test_ ('a b cd'.split(/ space+ /))
pp test_ (''.split(/ dot /))
## STDOUT:
(List) ["a","b","c"]
(List) ["","",""]
(List) ["a","b","c<d"]
(List) ["a","b","cd"]
(List) []
## END

#### Str => split(sep, count), non-empty str sep
pp test_ ('a,b,c'.split(',', count=-1))
pp test_ ('a,b,c'.split(',', count=-2)) # Any negative count means "ignore count"
pp test_ ('aa'.split('a', count=1))
Expand All @@ -416,20 +430,47 @@ pp test_ (''.split(',', count=0))
(List) []
## END

#### Str => split(sep, count), eggex sep
pp test_ ('a,b;c'.split(/ ',' | ';' /, count=-1))
pp test_ ('aa'.split(/ dot /, count=1))
pp test_ ('a<>b@@c<d'.split(/ '<>' | '@@' /, count=50))
pp test_ ('a b c'.split(/ space+ /, count=0))
pp test_ (''.split(/ dot /, count=1))
## STDOUT:
(List) ["a","b","c"]
(List) ["","a"]
(List) ["a","b","c<d"]
(List) ["a b c"]
(List) []
## END

#### Str => split(), usage errors
try { pp test_ ('abc'.split('')) } # Sep cannot be ""
try { pp test_ ('abc'.split('')) } # Sep cannot be ""
echo status=$[_error.code]
try { pp test_ ('abc'.split()) } # Sep must be present
echo status=$[_error.code]
try { pp test_ (b'\y00a\y01'.split(/ 'a' /)) } # Cannot split by eggex when str has NUL-byte
echo status=$[_error.code]
try { pp test_ ('abc'.split()) } # Sep must be present
try { pp test_ (b'abc'.split(/ space* /)) } # Eggex cannot accept empty string
echo status=$[_error.code]
try { pp test_ (b'abc'.split(/ dot* /)) } # But in some cases the input doesn't cause an
# infinite loop, so we actually allow it!
echo status=$[_error.code]
## STDOUT:
status=3
status=3
status=3
status=3
(List) ["",""]
status=0
## END

#### Str => split(), non-ascii
pp test_ ('🌞🌝🌞🌝🌞'.split('🌝'))
pp test_ ('🌞🌝🌞🌝🌞'.split(/ '🌝' /))
## STDOUT:
(List) ["🌞","🌞","🌞"]
(List) ["🌞","🌞","🌞"]
## END

#### Dict => values()
Expand Down