Skip to content

Commit

Permalink
Implement substring/3 (#669)
Browse files Browse the repository at this point in the history
  • Loading branch information
DeemoONeill authored Jul 28, 2023
1 parent 74bdb47 commit 8e32a9b
Show file tree
Hide file tree
Showing 11 changed files with 161 additions and 1 deletion.
8 changes: 8 additions & 0 deletions lib/explorer/backend/lazy_series.ex
Original file line number Diff line number Diff line change
Expand Up @@ -117,6 +117,7 @@ defmodule Explorer.Backend.LazySeries do
trim: 2,
upcase: 1,
downcase: 1,
substring: 3,
# Float round
round: 2,
floor: 1,
Expand Down Expand Up @@ -892,6 +893,13 @@ defmodule Explorer.Backend.LazySeries do
Backend.Series.new(data, :string)
end

# Builds the lazy `:substring` operation from the series, offset and length,
# and wraps it as a backend series tagged with the `:string` dtype.
@impl true
def substring(series, offset, length) do
  :substring
  |> new([lazy_series!(series), offset, length])
  |> Backend.Series.new(:string)
end

@impl true
def round(series, decimals) when is_integer(decimals) and decimals >= 0 do
data = new(:round, [lazy_series!(series), decimals])
Expand Down
1 change: 1 addition & 0 deletions lib/explorer/backend/series.ex
Original file line number Diff line number Diff line change
Expand Up @@ -247,6 +247,7 @@ defmodule Explorer.Backend.Series do
@callback trim(s, String.t() | nil) :: s
@callback trim_leading(s) :: s
@callback trim_trailing(s) :: s
# Slices each string from an integer offset. The length is either a
# non-negative integer or `nil`; `Explorer.Series.substring/2` passes `nil`
# to slice through to the end of the string.
@callback substring(s, integer(), non_neg_integer() | nil) :: s

# Date / DateTime

Expand Down
3 changes: 2 additions & 1 deletion lib/explorer/polars_backend/expression.ex
Original file line number Diff line number Diff line change
Expand Up @@ -125,7 +125,8 @@ defmodule Explorer.PolarsBackend.Expression do
trim_leading: 1,
trim_trailing: 1,
downcase: 1,
upcase: 1
upcase: 1,
substring: 3
]

@custom_expressions [
Expand Down
1 change: 1 addition & 0 deletions lib/explorer/polars_backend/native.ex
Original file line number Diff line number Diff line change
Expand Up @@ -336,6 +336,7 @@ defmodule Explorer.PolarsBackend.Native do
def s_unordered_distinct(_s), do: err()
def s_frequencies(_s), do: err()
def s_cut(_s, _bins, _labels, _break_point_label, _category_label), do: err()
# Elixir-side stub for the `s_substring` NIF listed in `rustler::init!`
# (native/explorer/src/lib.rs); presumably only reached when the native
# library is not loaded -- confirm against `err/0`.
def s_substring(_s, _offset, _length), do: err()

def s_qcut(_s, _quantiles, _labels, _break_point_label, _category_label),
do: err()
Expand Down
4 changes: 4 additions & 0 deletions lib/explorer/polars_backend/series.ex
Original file line number Diff line number Diff line change
Expand Up @@ -631,6 +631,10 @@ defmodule Explorer.PolarsBackend.Series do
def trim_trailing(series),
do: Shared.apply_series(series, :s_trim_trailing)

# Delegates to the `:s_substring` native function, forwarding the offset and
# length as extra arguments.
@impl true
def substring(series, offset, length) do
  Shared.apply_series(series, :s_substring, [offset, length])
end

# Float round
@impl true
def round(series, decimals),
Expand Down
63 changes: 63 additions & 0 deletions lib/explorer/series.ex
Original file line number Diff line number Diff line change
Expand Up @@ -4395,6 +4395,69 @@ defmodule Explorer.Series do

def trim_trailing(%Series{dtype: dtype}), do: dtype_error("trim_trailing/1", dtype, [:string])

@doc """
Returns the substring of each string, starting at `offset` and running to the
end of the string.

A negative `offset` counts from the end of the string.

## Examples

    iex> s = Explorer.Series.from_list(["earth", "mars", "neptune"])
    iex> Explorer.Series.substring(s, -3)
    #Explorer.Series<
      Polars[3]
      string ["rth", "ars", "une"]
    >

    iex> s = Explorer.Series.from_list(["earth", "mars", "neptune"])
    iex> Explorer.Series.substring(s, 1)
    #Explorer.Series<
      Polars[3]
      string ["arth", "ars", "eptune"]
    >
"""
@doc type: :string_wise
@spec substring(Series.t(), integer()) :: Series.t()
# A `nil` length asks the backend to slice through to the end of the string.
def substring(%Series{dtype: :string} = series, offset) when is_integer(offset),
  do: apply_series(series, :substring, [offset, nil])

# Report unsupported dtypes through `dtype_error`, as the other string
# functions do, instead of failing with a bare FunctionClauseError.
def substring(%Series{dtype: dtype}, _offset),
  do: dtype_error("substring/2", dtype, [:string])

@doc """
Returns the substring of each string, starting at `offset` and spanning at
most `length` characters.

A negative `offset` counts from the end of the string. `length` must be a
non-negative integer.

## Examples

    iex> s = Explorer.Series.from_list(["earth", "mars", "neptune"])
    iex> Explorer.Series.substring(s, -3, 2)
    #Explorer.Series<
      Polars[3]
      string ["rt", "ar", "un"]
    >

    iex> s = Explorer.Series.from_list(["earth", "mars", "neptune"])
    iex> Explorer.Series.substring(s, 1, 5)
    #Explorer.Series<
      Polars[3]
      string ["arth", "ars", "eptun"]
    >

    iex> d = Explorer.Series.from_list(["こんにちは世界", "مرحبًا", "안녕하세요"])
    iex> Explorer.Series.substring(d, 1, 3)
    #Explorer.Series<
      Polars[3]
      string ["んにち", "رحب", "녕하세"]
    >
"""
@doc type: :string_wise
@spec substring(Series.t(), integer(), non_neg_integer()) :: Series.t()
# All conditions live in ONE guard joined with `and`. Chaining several `when`
# clauses ORs them, which would let a negative or non-integer `length` through
# whenever `offset` happens to be an integer.
def substring(%Series{dtype: :string} = series, offset, length)
    when is_integer(offset) and is_integer(length) and length >= 0,
    do: apply_series(series, :substring, [offset, length])

def substring(%Series{dtype: dtype}, _offset, _length),
  do: dtype_error("substring/3", dtype, [:string])

# Float

@doc """
Expand Down
6 changes: 6 additions & 0 deletions native/explorer/src/expressions.rs
Original file line number Diff line number Diff line change
Expand Up @@ -765,6 +765,12 @@ pub fn expr_trim_trailing(expr: ExExpr) -> ExExpr {
ExExpr::new(expr.str().rstrip(None))
}

/// Expression NIF: applies `str_slice(offset, length)` through the string
/// namespace of the wrapped expression and returns the resulting expression.
#[rustler::nif]
pub fn expr_substring(expr: ExExpr, offset: i64, length: Option<u64>) -> ExExpr {
    let inner = expr.clone_inner();
    let sliced = inner.str().str_slice(offset, length);
    ExExpr::new(sliced)
}

#[rustler::nif]
pub fn expr_round(expr: ExExpr, decimals: u32) -> ExExpr {
let expr = expr.clone_inner();
Expand Down
2 changes: 2 additions & 0 deletions native/explorer/src/lib.rs
Original file line number Diff line number Diff line change
Expand Up @@ -251,6 +251,7 @@ rustler::init!(
expr_trim,
expr_trim_leading,
expr_trim_trailing,
expr_substring,
// float round expressions
expr_round,
expr_floor,
Expand Down Expand Up @@ -403,6 +404,7 @@ rustler::init!(
s_standard_deviation,
s_tan,
s_trim,
s_substring,
s_subtract,
s_sum,
s_tail,
Expand Down
11 changes: 11 additions & 0 deletions native/explorer/src/series.rs
Original file line number Diff line number Diff line change
Expand Up @@ -1377,6 +1377,17 @@ pub fn s_trim_trailing(s1: ExSeries) -> Result<ExSeries, ExplorerError> {
Ok(ExSeries::new(s1.utf8()?.replace(r#"[ \s]+$"#, "")?.into()))
}

/// Series NIF: views the series as UTF-8, applies `str_slice(offset, length)`
/// and wraps the result in a new `ExSeries`. Errors from the UTF-8 cast or
/// from the slice are propagated to the caller.
#[rustler::nif(schedule = "DirtyCpu")]
pub fn s_substring(
    s1: ExSeries,
    offset: i64,
    length: Option<u64>,
) -> Result<ExSeries, ExplorerError> {
    let sliced = s1.utf8()?.str_slice(offset, length)?;
    Ok(ExSeries::new(sliced.into_series()))
}

#[rustler::nif(schedule = "DirtyCpu")]
pub fn s_round(s: ExSeries, decimals: u32) -> Result<ExSeries, ExplorerError> {
Ok(ExSeries::new(s.round(decimals)?.into_series()))
Expand Down
33 changes: 33 additions & 0 deletions test/explorer/data_frame_test.exs
Original file line number Diff line number Diff line change
Expand Up @@ -1517,6 +1517,39 @@ defmodule Explorer.DataFrameTest do
end
end

test "slice strings" do
  source =
    DF.new(
      a: ["_hello", "_world", "_foo", "_bar"],
      b: ["venus", "earth", "mars", "jupiter"],
      c: ["_foo", "_bar", "_baz", "_quox"],
      d: ["_foo", "_bar", "_baz", "_quox"],
      e: ["_foo", "_bar", "_baz", "_quox"]
    )

  # One new column per case: offset only, offset + length, negative offset,
  # offset past the end of every string, and negative offset past the start.
  sliced =
    DF.mutate(source,
      f: substring(a, 1),
      g: substring(b, 2, 5),
      h: substring(c, -3),
      i: substring(d, 6, 10),
      j: substring(e, -15, 2)
    )

  expected = %{
    a: ["_hello", "_world", "_foo", "_bar"],
    b: ["venus", "earth", "mars", "jupiter"],
    c: ["_foo", "_bar", "_baz", "_quox"],
    d: ["_foo", "_bar", "_baz", "_quox"],
    e: ["_foo", "_bar", "_baz", "_quox"],
    f: ["hello", "world", "foo", "bar"],
    g: ["nus", "rth", "rs", "piter"],
    h: ["foo", "bar", "baz", "uox"],
    i: ["", "", "", ""],
    j: ["_f", "_b", "_b", "_q"]
  }

  assert DF.to_columns(sliced, atom_keys: true) == expected
end

test "trim characters from string" do
df =
DF.new(
Expand Down
30 changes: 30 additions & 0 deletions test/explorer/series_test.exs
Original file line number Diff line number Diff line change
Expand Up @@ -3980,6 +3980,36 @@ defmodule Explorer.SeriesTest do
end
end

# Named after the functions under test (`substring/2` and `substring/3`); the
# earlier "string_slice" labels referred to a function that does not exist.
describe "substring/2 and substring/3" do
  test "substring/2 positive offset" do
    series = Series.from_list(["earth", "mars", "neptune"])

    assert Series.substring(series, 2) |> Series.to_list() == ["rth", "rs", "ptune"]
    # An offset past the end of every string yields empty strings.
    assert Series.substring(series, 20) |> Series.to_list() == ["", "", ""]
  end

  test "substring/2 negative offset" do
    series = Series.from_list(["earth", "mars", "neptune"])

    assert Series.substring(series, -3) |> Series.to_list() == ["rth", "ars", "une"]
    # A negative offset reaching before the start returns the whole string.
    assert Series.substring(series, -9) |> Series.to_list() == ["earth", "mars", "neptune"]
  end

  test "substring/3 positive offset" do
    series = Series.from_list(["earth", "mars", "neptune"])

    assert Series.substring(series, 2, 3) |> Series.to_list() == ["rth", "rs", "ptu"]
    assert Series.substring(series, 12, 13) |> Series.to_list() == ["", "", ""]
  end

  test "substring/3 negative offset" do
    series = Series.from_list(["earth", "mars", "neptune"])

    assert Series.substring(series, -4, 4) |> Series.to_list() == ["arth", "mars", "tune"]
    assert Series.substring(series, -20, 4) |> Series.to_list() == ["eart", "mars", "nept"]
  end
end

describe "strptime/2 and strftime/2" do
test "parse datetime from string" do
series = Series.from_list(["2023-01-05 12:34:56", "XYZ", nil])
Expand Down

0 comments on commit 8e32a9b

Please sign in to comment.