diff --git a/lib/explorer/backend/lazy_series.ex b/lib/explorer/backend/lazy_series.ex index 0c246fb46..5e062618d 100644 --- a/lib/explorer/backend/lazy_series.ex +++ b/lib/explorer/backend/lazy_series.ex @@ -119,7 +119,9 @@ defmodule Explorer.Backend.LazySeries do row_index: 1, # Strings contains: 2, + re_contains: 2, replace: 3, + re_replace: 3, lstrip: 2, rstrip: 2, strip: 2, @@ -993,8 +995,15 @@ defmodule Explorer.Backend.LazySeries do end @impl true - def contains(series, pattern) do - data = new(:contains, [lazy_series!(series), pattern], :boolean) + def contains(series, substring) do + data = new(:contains, [lazy_series!(series), substring], :boolean) + + Backend.Series.new(data, :boolean) + end + + @impl true + def re_contains(series, pattern) do + data = new(:re_contains, [lazy_series!(series), pattern], :boolean) Backend.Series.new(data, :boolean) end @@ -1014,8 +1023,15 @@ defmodule Explorer.Backend.LazySeries do end @impl true - def replace(series, pattern, replacement) do - data = new(:replace, [lazy_series!(series), pattern, replacement], :string) + def replace(series, substring, replacement) do + data = new(:replace, [lazy_series!(series), substring, replacement], :string) + + Backend.Series.new(data, :string) + end + + @impl true + def re_replace(series, pattern, replacement) do + data = new(:re_replace, [lazy_series!(series), pattern, replacement], :string) Backend.Series.new(data, :string) end diff --git a/lib/explorer/backend/series.ex b/lib/explorer/backend/series.ex index 459e9f631..5d96a9aaf 100644 --- a/lib/explorer/backend/series.ex +++ b/lib/explorer/backend/series.ex @@ -291,6 +291,10 @@ defmodule Explorer.Backend.Series do @callback json_decode(s, dtype()) :: s @callback json_path_match(s, String.t()) :: s + ## String - Regular expression versions + @callback re_contains(s, String.t()) :: s + @callback re_replace(s, String.t(), String.t()) :: s + # Date / DateTime @callback day_of_week(s) :: s diff --git 
a/lib/explorer/polars_backend/expression.ex b/lib/explorer/polars_backend/expression.ex index 68b42ff62..85efcb5a8 100644 --- a/lib/explorer/polars_backend/expression.ex +++ b/lib/explorer/polars_backend/expression.ex @@ -128,7 +128,9 @@ defmodule Explorer.PolarsBackend.Expression do # Strings contains: 2, + re_contains: 2, replace: 3, + re_replace: 3, strip: 2, lstrip: 2, rstrip: 2, diff --git a/lib/explorer/polars_backend/native.ex b/lib/explorer/polars_backend/native.ex index 016402274..d77e1ca10 100644 --- a/lib/explorer/polars_backend/native.ex +++ b/lib/explorer/polars_backend/native.ex @@ -284,7 +284,7 @@ defmodule Explorer.PolarsBackend.Native do def s_categorise(_s, _s_categories), do: err() def s_coalesce(_s, _other), do: err() def s_concat(_series_list), do: err() - def s_contains(_s, _pattern), do: err() + def s_contains(_s, _pattern, _is_literal), do: err() def s_cumulative_max(_s, _reverse), do: err() def s_cumulative_min(_s, _reverse), do: err() def s_cumulative_sum(_s, _reverse), do: err() @@ -383,7 +383,7 @@ defmodule Explorer.PolarsBackend.Native do def s_ceil(_s), do: err() def s_rstrip(_s, _string), do: err() def s_rank(_s, _method, _descending, _seed), do: err() - def s_replace(_s, _pattern, _replacement), do: err() + def s_replace(_s, _pattern, _replacement, _literal), do: err() def s_sample_n(_s, _n, _replace, _shuffle, _seed), do: err() def s_sample_frac(_s, _frac, _replace, _shuffle, _seed), do: err() def s_series_equal(_s, _other, _null_equal), do: err() diff --git a/lib/explorer/polars_backend/series.ex b/lib/explorer/polars_backend/series.ex index d3bccf2fe..849d425d1 100644 --- a/lib/explorer/polars_backend/series.ex +++ b/lib/explorer/polars_backend/series.ex @@ -659,8 +659,12 @@ defmodule Explorer.PolarsBackend.Series do # Strings @impl true - def contains(series, pattern), - do: Shared.apply_series(series, :s_contains, [pattern]) + def contains(series, substring), + do: Shared.apply_series(series, :s_contains, [substring, true]) + + 
@impl true + def re_contains(series, pattern), + do: Shared.apply_series(series, :s_contains, [pattern, false]) @impl true def upcase(series), @@ -672,7 +676,11 @@ defmodule Explorer.PolarsBackend.Series do @impl true def replace(series, pattern, replacement), - do: Shared.apply_series(series, :s_replace, [pattern, replacement]) + do: Shared.apply_series(series, :s_replace, [pattern, replacement, true]) + + @impl true + def re_replace(series, pattern, replacement), + do: Shared.apply_series(series, :s_replace, [pattern, replacement, false]) @impl true def strip(series, str), diff --git a/lib/explorer/series.ex b/lib/explorer/series.ex index 249a02968..95536070c 100644 --- a/lib/explorer/series.ex +++ b/lib/explorer/series.ex @@ -5344,6 +5344,10 @@ defmodule Explorer.Series do @doc """ Detects whether a string contains a substring. + > ### Notice {: .warning} + > + > This function detects only literal strings. For regular expressions, see `re_contains/2`. + ## Examples iex> s = Explorer.Series.from_list(["abc", "def", "bcd"]) @@ -5355,12 +5359,48 @@ defmodule Explorer.Series do """ @doc type: :string_wise @spec contains(Series.t(), String.t()) :: Series.t() - def contains(%Series{dtype: :string} = series, pattern) - when K.is_binary(pattern), - do: apply_series(series, :contains, [pattern]) + def contains(%Series{dtype: :string} = series, substring) + when K.is_binary(substring), + do: apply_series(series, :contains, [substring]) def contains(%Series{dtype: dtype}, _), do: dtype_error("contains/2", dtype, [:string]) + @doc """ + Detects whether a string matches a pattern. + + > ### Notice {: .warning} + > + > This function matches against a regular expression. It does not expect an Elixir regex, + > but an escaped string and you can use the `~S` sigil for escaping it. Since each Explorer + > backend may have its own regular expression rules, you must consult their underlying + > engine. 
For the default backend (Polars), the rules are outlined in the Rust crate named + > [`regex`](https://docs.rs/regex/latest/regex/). + > + > To match literal strings, you can use `contains/2`. + + ## Examples + + iex> s = Explorer.Series.from_list(["abc", "def", "bcd"]) + iex> Explorer.Series.re_contains(s, ~S/(a|e)/) + #Explorer.Series< + Polars[3] + boolean [true, true, false] + > + """ + @doc type: :string_wise + @spec re_contains(Series.t(), String.t()) :: Series.t() + def re_contains(%Series{dtype: :string} = series, pattern) + when K.is_binary(pattern), + do: apply_series(series, :re_contains, [pattern]) + + def re_contains(%Series{dtype: :string}, %Regex{}) do + raise ArgumentError, + "standard regexes cannot be used as pattern because it may be incompatible with the backend. " <> + "Please use the `~S` sigil or extract the source from the regex with `Regex.source/1`" + end + + def re_contains(%Series{dtype: dtype}, _), do: dtype_error("re_contains/2", dtype, [:string]) + @doc """ Converts all characters to uppercase. @@ -5400,9 +5440,13 @@ defmodule Explorer.Series do def downcase(%Series{dtype: dtype}), do: dtype_error("downcase/1", dtype, [:string]) @doc """ - Replaces all occurences of pattern with replacement in string series. + Replaces all occurrences of a substring with replacement in string series. - Both pattern and replacement must be of type string. + Both substring and replacement must be of type string. + + > ### Notice {: .warning} + > + > This function replaces only literal strings. For regular expressions, see `re_replace/3`. 
## Examples @@ -5415,15 +5459,88 @@ defmodule Explorer.Series do """ @doc type: :string_wise @spec replace(Series.t(), binary(), binary()) :: Series.t() - def replace(%Series{dtype: :string} = series, pattern, replacement) - when K.and(is_binary(pattern), is_binary(replacement)), - do: apply_series(series, :replace, [pattern, replacement]) + def replace(%Series{dtype: :string} = series, substring, replacement) + when K.and(is_binary(substring), is_binary(replacement)), + do: apply_series(series, :replace, [substring, replacement]) def replace(%Series{dtype: :string}, _, _), - do: raise(ArgumentError, "pattern and replacement in replace/3 need to be a string") + do: raise(ArgumentError, "substring and replacement in replace/3 need to be a string") def replace(%Series{dtype: dtype}, _, _), do: dtype_error("replace/3", dtype, [:string]) + @doc """ + Replaces all occurrences of a pattern with replacement in string series. + + Both pattern and replacement must be of type string. The replacement + can refer to capture groups using the `${x}` syntax, where `x` is a number starting from 1. + It can also refer to named groups using the same syntax. + + > ### Notice {: .warning} + > + > This function matches against a regular expression. It does not expect an Elixir regex, + > but an escaped string and you can use the `~S` sigil for escaping it. Since each Explorer + > backend may have its own regular expression rules, you must consult their underlying + > engine. For the default backend (Polars), the rules are outlined in the Rust crate named + > [`regex`](https://docs.rs/regex/latest/regex/). + > + > To replace by literal strings, you can use `replace/3`. 
+ + ## Examples + + iex> series = Explorer.Series.from_list(["1.200,45", "1.234.567,30", "asdf", nil]) + iex> Explorer.Series.re_replace(series, ~S/[,.]/, "") + #Explorer.Series< + Polars[4] + string ["120045", "123456730", "asdf", nil] + > + + iex> series = Explorer.Series.from_list(["hat", "hut"]) + iex> Explorer.Series.re_replace(series, ~S/h(.)t/, "b${1}d") + #Explorer.Series< + Polars[2] + string ["bad", "bud"] + > + + iex> series = Explorer.Series.from_list(["hat", "hut"]) + iex> Explorer.Series.re_replace(series, ~S/h(?<vowel>.)t/, "b${vowel}d") + #Explorer.Series< + Polars[2] + string ["bad", "bud"] + > + + Apply case-insensitive string replacement using the `(?i)` flag - remember, from the `regex` Rust crate. + + iex> series = Explorer.Series.from_list(["Foggy", "Rainy", "Sunny"]) + iex> Explorer.Series.re_replace(series, ~S/(?i)foggy|rainy/, "Sunny") + #Explorer.Series< + Polars[3] + string ["Sunny", "Sunny", "Sunny"] + > + + With an Elixir regex it causes an error: + + iex> series = Explorer.Series.from_list(["hat", "hut"]) + iex> Explorer.Series.re_replace(series, ~r/h(.)t/, "b${1}d") + ** (ArgumentError) standard regexes cannot be used as pattern because it may be incompatible with the backend. Please use the `~S` sigil or extract the source from the regex with `Regex.source/1` + + """ + @doc type: :string_wise + @spec re_replace(Series.t(), binary(), binary()) :: Series.t() + def re_replace(%Series{dtype: :string} = series, pattern, replacement) + when K.and(is_binary(pattern), is_binary(replacement)), + do: apply_series(series, :re_replace, [pattern, replacement]) + + def re_replace(%Series{dtype: :string}, %Regex{}, _) do + raise ArgumentError, + "standard regexes cannot be used as pattern because it may be incompatible with the backend. 
" <> + "Please use the `~S` sigil or extract the source from the regex with `Regex.source/1`" + end + + def re_replace(%Series{dtype: :string}, _, _), + do: raise(ArgumentError, "pattern and replacement in re_replace/3 need to be a string") + + def re_replace(%Series{dtype: dtype}, _, _), do: dtype_error("re_replace/3", dtype, [:string]) + @doc """ Returns a string series where all leading and trailing Unicode whitespaces have been removed. diff --git a/native/explorer/src/expressions.rs b/native/explorer/src/expressions.rs index b9eb6b1d0..700ef8bc9 100644 --- a/native/explorer/src/expressions.rs +++ b/native/explorer/src/expressions.rs @@ -835,6 +835,12 @@ pub fn expr_contains(expr: ExExpr, pattern: &str) -> ExExpr { ExExpr::new(expr.str().contains_literal(pattern.lit())) } +#[rustler::nif] +pub fn expr_re_contains(expr: ExExpr, pattern: &str) -> ExExpr { + let expr = expr.clone_inner(); + ExExpr::new(expr.str().contains(pattern.lit(), true)) +} + #[rustler::nif] pub fn expr_upcase(expr: ExExpr) -> ExExpr { let expr = expr.clone_inner(); @@ -896,11 +902,13 @@ pub fn expr_split(expr: ExExpr, substring: String) -> ExExpr { #[rustler::nif] pub fn expr_replace(expr: ExExpr, pat: String, value: String) -> ExExpr { let expr = expr.clone_inner(); - ExExpr::new(expr.str().replace_all( - Expr::Literal(LiteralValue::String(pat)), - Expr::Literal(LiteralValue::String(value)), - true, - )) + ExExpr::new(expr.str().replace_all(pat.lit(), value.lit(), true)) +} + +#[rustler::nif] +pub fn expr_re_replace(expr: ExExpr, pat: String, value: String) -> ExExpr { + let expr = expr.clone_inner(); + ExExpr::new(expr.str().replace_all(pat.lit(), value.lit(), false)) } #[rustler::nif] diff --git a/native/explorer/src/lib.rs b/native/explorer/src/lib.rs index b0bee4796..0b99abe95 100644 --- a/native/explorer/src/lib.rs +++ b/native/explorer/src/lib.rs @@ -252,6 +252,7 @@ rustler::init!( expr_describe_filter_plan, // string expressions expr_contains, + expr_re_contains, expr_upcase, 
expr_downcase, expr_strip, @@ -260,6 +261,7 @@ rustler::init!( expr_substring, expr_split, expr_replace, + expr_re_replace, expr_json_path_match, expr_split_into, // float round expressions diff --git a/native/explorer/src/series.rs b/native/explorer/src/series.rs index 4ec5641d0..f144d30de 100644 --- a/native/explorer/src/series.rs +++ b/native/explorer/src/series.rs @@ -1468,8 +1468,13 @@ pub fn s_not(s1: ExSeries) -> Result { } #[rustler::nif(schedule = "DirtyCpu")] -pub fn s_contains(s1: ExSeries, pattern: &str) -> Result { - Ok(ExSeries::new(s1.str()?.contains_literal(pattern)?.into())) +pub fn s_contains(s1: ExSeries, pattern: &str, literal: bool) -> Result { + let chunked_array = if literal { + s1.str()?.contains_literal(pattern)? + } else { + s1.str()?.contains(pattern, true)? + }; + Ok(ExSeries::new(chunked_array.into())) } #[rustler::nif(schedule = "DirtyCpu")] @@ -1487,10 +1492,14 @@ pub fn s_replace( s1: ExSeries, pattern: &str, replacement: &str, + literal: bool, ) -> Result { - Ok(ExSeries::new( - s1.str()?.replace_literal_all(pattern, replacement)?.into(), - )) + let chunked_array = if literal { + s1.str()?.replace_literal_all(pattern, replacement)? + } else { + s1.str()?.replace_all(pattern, replacement)? 
+ }; + Ok(ExSeries::new(chunked_array.into())) } #[rustler::nif(schedule = "DirtyCpu")] diff --git a/test/explorer/data_frame_test.exs b/test/explorer/data_frame_test.exs index 3851e1639..59c4dba73 100644 --- a/test/explorer/data_frame_test.exs +++ b/test/explorer/data_frame_test.exs @@ -622,6 +622,20 @@ defmodule Explorer.DataFrameTest do assert DF.to_columns(df1, atom_keys: true) == %{a: [1, 2, 3], b: [9, 8, 7]} end + + test "filter using contains/2" do + df = DF.new(a: [1, 2, 3, nil], b: ["abc", "bcd", "def", nil]) + + df1 = DF.filter(df, contains(b, "b")) + assert DF.to_columns(df1, atom_keys: true) == %{a: [1, 2], b: ["abc", "bcd"]} + end + + test "filter using re_contains/2" do + df = DF.new(a: [1, 2, 3, nil], b: ["abc", "bcd", "def", nil]) + + df1 = DF.filter(df, re_contains(b, ~S/^(b|d)/)) + assert DF.to_columns(df1, atom_keys: true) == %{a: [2, 3], b: ["bcd", "def"]} + end end describe "mutate_with/2" do @@ -1749,12 +1763,14 @@ defmodule Explorer.DataFrameTest do df1 = DF.mutate(df, - b: replace(a, ",", "") + b: replace(a, ",", ""), + c: re_replace(a, ~S/\d{3}$/, "999") ) assert DF.to_columns(df1, atom_keys: true) == %{ a: ["2,000", "2,000,000", ","], - b: ["2000", "2000000", ""] + b: ["2000", "2000000", ""], + c: ["2,999", "2,000,999", ","] } end diff --git a/test/explorer/series_test.exs b/test/explorer/series_test.exs index da4052b8c..ea1f62b59 100644 --- a/test/explorer/series_test.exs +++ b/test/explorer/series_test.exs @@ -5211,11 +5211,15 @@ defmodule Explorer.SeriesTest do end describe "replace/3" do - test "replaces all occurences of pattern in string by replacement string" do + test "replaces all occurences of a substring in string by replacement string" do series = Series.from_list(["1,200", "1,234,567", "asdf", nil]) assert Series.replace(series, ",", "") |> Series.to_list() == ["1200", "1234567", "asdf", nil] + end + + test "does not work with regex patterns" do + series = Series.from_list(["1,200", "1,234,567", "asdf", nil]) assert 
Series.replace(series, "[,]", "") |> Series.to_list() == ["1,200", "1,234,567", "asdf", nil] @@ -5229,11 +5233,11 @@ defmodule Explorer.SeriesTest do fn -> Series.replace(series, ",", "") end end - test "raises error if pattern is not string" do + test "raises error if substring is not string" do series = Series.from_list(["1,200", "1,234,567", "asdf", nil]) assert_raise ArgumentError, - "pattern and replacement in replace/3 need to be a string", + "substring and replacement in replace/3 need to be a string", fn -> Series.replace(series, 2, "") end end @@ -5241,11 +5245,91 @@ defmodule Explorer.SeriesTest do series = Series.from_list(["1,200", "1,234,567", "asdf", nil]) assert_raise ArgumentError, - "pattern and replacement in replace/3 need to be a string", + "substring and replacement in replace/3 need to be a string", fn -> Series.replace(series, ",", nil) end end end + describe "re_replace/3" do + test "replaces all occurences of pattern in string by replacement string" do + series = Series.from_list(["1,200.42", "1,234,567.54", "asdf", nil]) + + assert Series.re_replace(series, ~S/[^0-9]/, "") |> Series.to_list() == + ["120042", "123456754", "", nil] + end + + test "doesn't work with non string series" do + series = Series.from_list([1200, 1_234_567, nil]) + + assert_raise ArgumentError, + "Explorer.Series.re_replace/3 not implemented for dtype {:s, 64}. 
Valid dtype is :string", + fn -> Series.re_replace(series, ",", "") end + end + + test "raises error if pattern is not string" do + series = Series.from_list(["1,200", "1,234,567", "asdf", nil]) + + assert_raise ArgumentError, + "pattern and replacement in re_replace/3 need to be a string", + fn -> Series.re_replace(series, 2, "") end + end + + test "raises error if replacement is not string" do + series = Series.from_list(["1,200", "1,234,567", "asdf", nil]) + + assert_raise ArgumentError, + "pattern and replacement in re_replace/3 need to be a string", + fn -> Series.re_replace(series, ",", nil) end + end + + test "raises error if pattern is an Elixir regex" do + series = Series.from_list(["1,200.42", "1,234,567.54", "asdf", nil]) + + assert_raise ArgumentError, + "standard regexes cannot be used as pattern because it may be incompatible with the backend. " <> + "Please use the `~S` sigil or extract the source from the regex with `Regex.source/1`", + fn -> + Series.re_replace(series, ~r/[^0-9]/, "") + end + end + end + + describe "contains/2" do + test "check if a substring is inside the series" do + series = Series.from_list(["abc", "bcd", "def", nil]) + + assert Series.contains(series, "b") |> Series.to_list() == + [true, true, false, nil] + end + + test "does not work with regex patterns" do + series = Series.from_list(["abc", "bcd", "def", nil]) + + assert Series.contains(series, ~S/(b|d)/) |> Series.to_list() == + [false, false, false, nil] + end + end + + describe "re_contains/2" do + test "check if a pattern matches the contents of the series" do + series = Series.from_list(["abc", "bcd", "def", nil]) + + assert Series.re_contains(series, ~S/^(b|d)/) |> Series.to_list() == + [false, true, true, nil] + end + + test "raises error if pattern is an Elixir regex" do + series = Series.from_list(["abc", "bcd", "def", nil]) + + assert_raise ArgumentError, + "standard regexes cannot be used as pattern because it may be incompatible with the backend. 
" <> + "Please use the `~S` sigil or extract the source from the regex with `Regex.source/1`", + fn -> + Series.re_contains(series, ~r/^(b|d)/) + end + end + end + describe "strip, strip, lstrip, rstrip" do test "strip/1" do series = Series.from_list([" 123 ", " 2 ", " 20$ "])