Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add re_contains/2 and re_replace/3 to match with a regex #894

Merged
merged 3 commits into from
Apr 14, 2024
Merged
Show file tree
Hide file tree
Changes from 1 commit
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
24 changes: 20 additions & 4 deletions lib/explorer/backend/lazy_series.ex
Original file line number Diff line number Diff line change
Expand Up @@ -119,7 +119,9 @@ defmodule Explorer.Backend.LazySeries do
row_index: 1,
# Strings
contains: 2,
re_contains: 2,
replace: 3,
re_replace: 3,
lstrip: 2,
rstrip: 2,
strip: 2,
Expand Down Expand Up @@ -993,8 +995,15 @@ defmodule Explorer.Backend.LazySeries do
end

@impl true
def contains(series, pattern) do
data = new(:contains, [lazy_series!(series), pattern], :boolean)
def contains(series, substring) do
data = new(:contains, [lazy_series!(series), substring], :boolean)

Backend.Series.new(data, :boolean)
end

@impl true
def re_contains(series, pattern) do
  # Wrap the `:re_contains` operation and its arguments with `new/3`, then
  # hand the result to `Backend.Series.new/2`; both are tagged `:boolean`.
  :re_contains
  |> new([lazy_series!(series), pattern], :boolean)
  |> Backend.Series.new(:boolean)
end
Expand All @@ -1014,8 +1023,15 @@ defmodule Explorer.Backend.LazySeries do
end

@impl true
def replace(series, pattern, replacement) do
data = new(:replace, [lazy_series!(series), pattern, replacement], :string)
def replace(series, substring, replacement) do
data = new(:replace, [lazy_series!(series), substring, replacement], :string)

Backend.Series.new(data, :string)
end

@impl true
def re_replace(series, pattern, replacement) do
  # Wrap the `:re_replace` operation and its arguments with `new/3`, then
  # hand the result to `Backend.Series.new/2`; both are tagged `:string`.
  :re_replace
  |> new([lazy_series!(series), pattern, replacement], :string)
  |> Backend.Series.new(:string)
end
Expand Down
4 changes: 4 additions & 0 deletions lib/explorer/backend/series.ex
Original file line number Diff line number Diff line change
Expand Up @@ -291,6 +291,10 @@ defmodule Explorer.Backend.Series do
@callback json_decode(s, dtype()) :: s
@callback json_path_match(s, String.t()) :: s

## String - Regular expression versions
# Both callbacks receive their non-series arguments as plain `String.t()`
# values (not `%Regex{}` structs) and return a series (`s`).
# NOTE(review): argument order is presumably (series, pattern) and
# (series, pattern, replacement) - confirm against the implementations.
@callback re_contains(s, String.t()) :: s
@callback re_replace(s, String.t(), String.t()) :: s

# Date / DateTime

@callback day_of_week(s) :: s
Expand Down
2 changes: 2 additions & 0 deletions lib/explorer/polars_backend/expression.ex
Original file line number Diff line number Diff line change
Expand Up @@ -128,7 +128,9 @@ defmodule Explorer.PolarsBackend.Expression do

# Strings
contains: 2,
re_contains: 2,
replace: 3,
re_replace: 3,
strip: 2,
lstrip: 2,
rstrip: 2,
Expand Down
4 changes: 2 additions & 2 deletions lib/explorer/polars_backend/native.ex
Original file line number Diff line number Diff line change
Expand Up @@ -284,7 +284,7 @@ defmodule Explorer.PolarsBackend.Native do
def s_categorise(_s, _s_categories), do: err()
def s_coalesce(_s, _other), do: err()
def s_concat(_series_list), do: err()
def s_contains(_s, _pattern), do: err()
def s_contains(_s, _pattern, _is_literal), do: err()
def s_cumulative_max(_s, _reverse), do: err()
def s_cumulative_min(_s, _reverse), do: err()
def s_cumulative_sum(_s, _reverse), do: err()
Expand Down Expand Up @@ -383,7 +383,7 @@ defmodule Explorer.PolarsBackend.Native do
def s_ceil(_s), do: err()
def s_rstrip(_s, _string), do: err()
def s_rank(_s, _method, _descending, _seed), do: err()
def s_replace(_s, _pattern, _replacement), do: err()
def s_replace(_s, _pattern, _replacement, _literal), do: err()
def s_sample_n(_s, _n, _replace, _shuffle, _seed), do: err()
def s_sample_frac(_s, _frac, _replace, _shuffle, _seed), do: err()
def s_series_equal(_s, _other, _null_equal), do: err()
Expand Down
14 changes: 11 additions & 3 deletions lib/explorer/polars_backend/series.ex
Original file line number Diff line number Diff line change
Expand Up @@ -659,8 +659,12 @@ defmodule Explorer.PolarsBackend.Series do
# Strings

@impl true
def contains(series, pattern),
do: Shared.apply_series(series, :s_contains, [pattern])
def contains(series, substring),
do: Shared.apply_series(series, :s_contains, [substring, true])

@impl true
def re_contains(series, pattern) do
  # The trailing `false` lines up with the `literal` argument of the
  # `s_contains` NIF, so `pattern` is matched as a regular expression.
  Shared.apply_series(series, :s_contains, [pattern, false])
end

@impl true
def upcase(series),
Expand All @@ -672,7 +676,11 @@ defmodule Explorer.PolarsBackend.Series do

@impl true
def replace(series, pattern, replacement),
do: Shared.apply_series(series, :s_replace, [pattern, replacement])
do: Shared.apply_series(series, :s_replace, [pattern, replacement, true])

@impl true
def re_replace(series, pattern, replacement) do
  # The trailing `false` lines up with the `literal` argument of the
  # `s_replace` NIF, so `pattern` is applied as a regular expression.
  Shared.apply_series(series, :s_replace, [pattern, replacement, false])
end

@impl true
def strip(series, str),
Expand Down
131 changes: 122 additions & 9 deletions lib/explorer/series.ex
Original file line number Diff line number Diff line change
Expand Up @@ -5344,6 +5344,10 @@ defmodule Explorer.Series do
@doc """
Detects whether a string contains a substring.

> ### Notice {: .warning}
>
> This function detects only literal strings. For regular expressions, see `re_contains/2`.

## Examples

iex> s = Explorer.Series.from_list(["abc", "def", "bcd"])
Expand All @@ -5355,12 +5359,46 @@ defmodule Explorer.Series do
"""
@doc type: :string_wise
@spec contains(Series.t(), String.t()) :: Series.t()
def contains(%Series{dtype: :string} = series, pattern)
when K.is_binary(pattern),
do: apply_series(series, :contains, [pattern])
def contains(%Series{dtype: :string} = series, substring)
when K.is_binary(substring),
do: apply_series(series, :contains, [substring])

def contains(%Series{dtype: dtype}, _), do: dtype_error("contains/2", dtype, [:string])

@doc """
Detects whether a string matches a pattern.

> ### Notice {: .warning}
>
> This function matches against a regular expression. It does not expect an Elixir regex, but
> an escaped string - you can use the `~S` sigil for escaping - that follows the [`regex`](https://docs.rs/regex/latest/regex/)
> Rust crate rules. This is because our backend, Polars, expects that format.
>
> To match literal strings, you can use `contains/2`.
philss marked this conversation as resolved.
Show resolved Hide resolved

## Examples

iex> s = Explorer.Series.from_list(["abc", "def", "bcd"])
iex> Explorer.Series.re_contains(s, ~S/(a|e)/)
#Explorer.Series<
Polars[3]
boolean [true, true, false]
>
"""
@doc type: :string_wise
@spec re_contains(Series.t(), String.t()) :: Series.t()
def re_contains(%Series{dtype: :string} = series, pattern)
    when K.is_binary(pattern),
    do: apply_series(series, :re_contains, [pattern])

# Elixir regexes are rejected explicitly so the caller gets a hint about how
# to pass the pattern as a string instead.
def re_contains(%Series{dtype: :string}, %Regex{}) do
  raise ArgumentError,
        "standard regexes cannot be used as pattern because it may be incompatible with the backend. " <>
          "Please use the `~S` sigil or extract the source from the regex with `Regex.source/1`"
end

# Any other non-string pattern on a string series. Without this clause the
# call falls through to the dtype error below, which would blame the (valid)
# `:string` dtype instead of the pattern. Mirrors the equivalent clause of
# `re_replace/3`.
def re_contains(%Series{dtype: :string}, _),
  do: raise(ArgumentError, "pattern in re_contains/2 needs to be a string")

# Series of any other dtype are not supported.
def re_contains(%Series{dtype: dtype}, _), do: dtype_error("re_contains/2", dtype, [:string])

@doc """
Converts all characters to uppercase.

Expand Down Expand Up @@ -5400,9 +5438,13 @@ defmodule Explorer.Series do
def downcase(%Series{dtype: dtype}), do: dtype_error("downcase/1", dtype, [:string])

@doc """
Replaces all occurrences of pattern with replacement in string series.
Replaces all occurrences of a substring with replacement in string series.

Both pattern and replacement must be of type string.
Both substring and replacement must be of type string.

> ### Notice {: .warning}
>
> This function replaces only literal strings. For regular expressions, see `re_replace/3`.

## Examples

Expand All @@ -5415,15 +5457,86 @@ defmodule Explorer.Series do
"""
@doc type: :string_wise
@spec replace(Series.t(), binary(), binary()) :: Series.t()
def replace(%Series{dtype: :string} = series, pattern, replacement)
when K.and(is_binary(pattern), is_binary(replacement)),
do: apply_series(series, :replace, [pattern, replacement])
def replace(%Series{dtype: :string} = series, substring, replacement)
when K.and(is_binary(substring), is_binary(replacement)),
do: apply_series(series, :replace, [substring, replacement])

def replace(%Series{dtype: :string}, _, _),
do: raise(ArgumentError, "pattern and replacement in replace/3 need to be a string")
do: raise(ArgumentError, "substring and replacement in replace/3 need to be a string")

def replace(%Series{dtype: dtype}, _, _), do: dtype_error("replace/3", dtype, [:string])

@doc """
Replaces all occurrences of a pattern with replacement in string series.

Both pattern and replacement must be of type string. The replacement
can refer to capture groups by using the `${x}` syntax, where `x` is a number starting from 1.
It can also refer to named groups using the same syntax.

> ### Notice {: .warning}
>
> This function matches against a regular expression. It does not expect an Elixir regex, but
> an escaped string - you can use the `~S` sigil for escaping - that follows the [`regex`](https://docs.rs/regex/latest/regex/)
> Rust crate rules. This is because our backend, Polars, expects that format.
>
> To replace by literal strings, you can use `replace/3`.

## Examples

iex> series = Explorer.Series.from_list(["1.200,45", "1.234.567,30", "asdf", nil])
iex> Explorer.Series.re_replace(series, ~S/[,.]/, "")
#Explorer.Series<
Polars[4]
string ["120045", "123456730", "asdf", nil]
>

iex> series = Explorer.Series.from_list(["hat", "hut"])
iex> Explorer.Series.re_replace(series, ~S/h(.)t/, "b${1}d")
#Explorer.Series<
Polars[2]
string ["bad", "bud"]
>

iex> series = Explorer.Series.from_list(["hat", "hut"])
iex> Explorer.Series.re_replace(series, ~S/h(?<vowel>.)t/, "b${vowel}d")
#Explorer.Series<
Polars[2]
string ["bad", "bud"]
>

Apply case-insensitive string replacement using the `(?i)` flag - remember, from the `regex` Rust crate.

iex> series = Explorer.Series.from_list(["Foggy", "Rainy", "Sunny"])
iex> Explorer.Series.re_replace(series, ~S/(?i)foggy|rainy/, "Sunny")
#Explorer.Series<
Polars[3]
string ["Sunny", "Sunny", "Sunny"]
>

With an Elixir regex it causes an error:

iex> series = Explorer.Series.from_list(["hat", "hut"])
iex> Explorer.Series.re_replace(series, ~r/h(.)t/, "b${1}d")
** (ArgumentError) standard regexes cannot be used as pattern because it may be incompatible with the backend. Please use the `~S` sigil or extract the source from the regex with `Regex.source/1`

"""
@doc type: :string_wise
@spec re_replace(Series.t(), binary(), binary()) :: Series.t()
def re_replace(%Series{dtype: :string} = series, pattern, replacement)
    when K.and(is_binary(pattern), is_binary(replacement)) do
  apply_series(series, :re_replace, [pattern, replacement])
end

# Elixir regexes are rejected explicitly, with a hint on how to pass the
# pattern as a string instead.
def re_replace(%Series{dtype: :string}, %Regex{}, _) do
  message =
    "standard regexes cannot be used as pattern because it may be incompatible with the backend. " <>
      "Please use the `~S` sigil or extract the source from the regex with `Regex.source/1`"

  raise ArgumentError, message
end

# String series, but pattern and/or replacement is not a binary.
def re_replace(%Series{dtype: :string}, _, _) do
  raise ArgumentError, "pattern and replacement in re_replace/3 need to be a string"
end

# Series of any other dtype are not supported.
def re_replace(%Series{dtype: dtype}, _, _) do
  dtype_error("re_replace/3", dtype, [:string])
end

@doc """
Returns a string series where all leading and trailing Unicode whitespaces
have been removed.
Expand Down
18 changes: 13 additions & 5 deletions native/explorer/src/expressions.rs
Original file line number Diff line number Diff line change
Expand Up @@ -835,6 +835,12 @@ pub fn expr_contains(expr: ExExpr, pattern: &str) -> ExExpr {
ExExpr::new(expr.str().contains_literal(pattern.lit()))
}

#[rustler::nif]
pub fn expr_re_contains(expr: ExExpr, pattern: &str) -> ExExpr {
    // Regex counterpart of `expr_contains`, which calls `contains_literal`.
    // NOTE(review): the `true` argument is presumably Polars' `strict` flag
    // (error on an invalid pattern) - confirm against the Polars docs.
    let matches = expr.clone_inner().str().contains(pattern.lit(), true);
    ExExpr::new(matches)
}

#[rustler::nif]
pub fn expr_upcase(expr: ExExpr) -> ExExpr {
let expr = expr.clone_inner();
Expand Down Expand Up @@ -896,11 +902,13 @@ pub fn expr_split(expr: ExExpr, substring: String) -> ExExpr {
#[rustler::nif]
pub fn expr_replace(expr: ExExpr, pat: String, value: String) -> ExExpr {
let expr = expr.clone_inner();
ExExpr::new(expr.str().replace_all(
Expr::Literal(LiteralValue::String(pat)),
Expr::Literal(LiteralValue::String(value)),
true,
))
ExExpr::new(expr.str().replace_all(pat.lit(), value.lit(), true))
}

#[rustler::nif]
pub fn expr_re_replace(expr: ExExpr, pat: String, value: String) -> ExExpr {
    // Same call as `expr_replace`, except the last argument is `false`
    // instead of `true`.
    // NOTE(review): that argument is presumably Polars' `literal` flag, so
    // `pat` is applied as a regular expression here - confirm.
    let replaced = expr
        .clone_inner()
        .str()
        .replace_all(pat.lit(), value.lit(), false);
    ExExpr::new(replaced)
}

#[rustler::nif]
Expand Down
2 changes: 2 additions & 0 deletions native/explorer/src/lib.rs
Original file line number Diff line number Diff line change
Expand Up @@ -252,6 +252,7 @@ rustler::init!(
expr_describe_filter_plan,
// string expressions
expr_contains,
expr_re_contains,
expr_upcase,
expr_downcase,
expr_strip,
Expand All @@ -260,6 +261,7 @@ rustler::init!(
expr_substring,
expr_split,
expr_replace,
expr_re_replace,
expr_json_path_match,
expr_split_into,
// float round expressions
Expand Down
19 changes: 14 additions & 5 deletions native/explorer/src/series.rs
Original file line number Diff line number Diff line change
Expand Up @@ -1468,8 +1468,13 @@ pub fn s_not(s1: ExSeries) -> Result<ExSeries, ExplorerError> {
}

#[rustler::nif(schedule = "DirtyCpu")]
pub fn s_contains(s1: ExSeries, pattern: &str) -> Result<ExSeries, ExplorerError> {
Ok(ExSeries::new(s1.str()?.contains_literal(pattern)?.into()))
pub fn s_contains(s1: ExSeries, pattern: &str, literal: bool) -> Result<ExSeries, ExplorerError> {
let chunked_array = if literal {
s1.str()?.contains_literal(pattern)?
} else {
s1.str()?.contains(pattern, true)?
};
Ok(ExSeries::new(chunked_array.into()))
}

#[rustler::nif(schedule = "DirtyCpu")]
Expand All @@ -1487,10 +1492,14 @@ pub fn s_replace(
s1: ExSeries,
pattern: &str,
replacement: &str,
literal: bool,
) -> Result<ExSeries, ExplorerError> {
Ok(ExSeries::new(
s1.str()?.replace_literal_all(pattern, replacement)?.into(),
))
let chunked_array = if literal {
s1.str()?.replace_literal_all(pattern, replacement)?
} else {
s1.str()?.replace_all(pattern, replacement)?
};
Ok(ExSeries::new(chunked_array.into()))
}

#[rustler::nif(schedule = "DirtyCpu")]
Expand Down
Loading
Loading