Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

De-functionalize query internals #989

Merged
merged 30 commits into from
Sep 30, 2024
Merged
Show file tree
Hide file tree
Changes from 19 commits
Commits
Show all changes
30 commits
Select commit Hold shift + click to select a range
0362cc2
Rename `traverse/2` as `traverse_root/2`
billylanchantin Sep 21, 2024
99ef296
Append `?` to `collect_pins_and_vars`
billylanchantin Sep 21, 2024
fff7b00
Alias `Kernel` as `K`
billylanchantin Sep 21, 2024
4011747
Slight format refactor
billylanchantin Sep 21, 2024
54cf52e
Change `query/1` to `query/2`
billylanchantin Sep 21, 2024
ffbb9cd
Add comment about non-obvious function wrap
billylanchantin Sep 21, 2024
6d82614
Try to fix specs
billylanchantin Sep 21, 2024
507354a
Comment out broken test
billylanchantin Sep 21, 2024
6dd174b
Revert style changes
billylanchantin Sep 22, 2024
8edd4de
Revert all changes to `query.ex`
billylanchantin Sep 23, 2024
fcdad36
Revert all changes to `data_frame.ex`
billylanchantin Sep 23, 2024
667c195
Revert changes to `series.ex`
billylanchantin Sep 23, 2024
e4f56c0
Add new/1 that returns a Backend.LazyFrame
billylanchantin Sep 23, 2024
a5edb77
Make is_column_pairs return false for structs
billylanchantin Sep 23, 2024
6f7e303
Add _with clauses
billylanchantin Sep 23, 2024
6709af4
Add Access behaviour
billylanchantin Sep 23, 2024
3925218
Make all impls specify behaviour
billylanchantin Sep 23, 2024
7ac1ada
Uncomment test
billylanchantin Sep 23, 2024
e728977
Add TODO for `is_non_struct_map`
billylanchantin Sep 25, 2024
54809e5
Change Backend.LazyFrame to Backend.QueryFrame
billylanchantin Sep 30, 2024
e1cc3b6
Rename files
billylanchantin Sep 30, 2024
9341e4b
Edit test internals for readability
billylanchantin Sep 30, 2024
89cb90f
Add docs
billylanchantin Sep 30, 2024
12c9319
Doc tweaks
billylanchantin Sep 30, 2024
f6c5c14
Revert 1st sentence
billylanchantin Sep 30, 2024
11e25f3
Revert "Implementation details"
billylanchantin Sep 30, 2024
372638e
Add smaller addendum to that section
billylanchantin Sep 30, 2024
3d72aad
Add example to new/1 docs
billylanchantin Sep 30, 2024
cb8aee2
Update lib/explorer/backend/query_frame.ex
billylanchantin Sep 30, 2024
291cab0
Fix punctuation
billylanchantin Sep 30, 2024
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
35 changes: 29 additions & 6 deletions lib/explorer/backend/lazy_frame.ex
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,8 @@ defmodule Explorer.Backend.LazyFrame do
names: Backend.DataFrame.column_name(),
resource: reference() | nil
}

@behaviour Access
@behaviour Backend.DataFrame

@doc false
Expand All @@ -40,16 +42,16 @@ defmodule Explorer.Backend.LazyFrame do
# cross node operations happen at the lazy frame level.
# Instead, we store the resource and we delegate them
# to the underlying lazy series.
@impl true
@impl Backend.DataFrame
def owner_reference(_), do: nil

@impl true
@impl Backend.DataFrame
def lazy, do: __MODULE__

@impl true
@impl Backend.DataFrame
def lazy(ldf), do: ldf

@impl true
@impl Backend.DataFrame
def inspect(ldf, opts) do
import Inspect.Algebra

Expand Down Expand Up @@ -86,7 +88,7 @@ defmodule Explorer.Backend.LazyFrame do

defp groups_algebra([], _), do: ""

@impl true
@impl Backend.DataFrame
def pull(%{data: data, dtypes: dtypes}, column) do
dtype_for_column = dtypes[column]

Expand All @@ -103,7 +105,7 @@ defmodule Explorer.Backend.LazyFrame do
for {fun, arity} <- funs do
args = Macro.generate_arguments(arity, __MODULE__)

@impl true
@impl Backend.DataFrame
def unquote(fun)(unquote_splicing(args)) do
raise """
cannot perform operation #{unquote(fun)} on Explorer.Backend.LazyFrame.
Expand All @@ -113,4 +115,25 @@ defmodule Explorer.Backend.LazyFrame do
"""
end
end

# Access callback: looks a column up by delegating to `pull/2`.
# Only a series whose data is an `Explorer.Backend.LazySeries` counts as a hit;
# any other value returned by `pull/2` is reported as a missing key.
@impl Access
def fetch(%__MODULE__{} = lazy_frame, name) do
  with %Explorer.Series{data: %Explorer.Backend.LazySeries{}} = series <-
         pull(lazy_frame, name) do
    {:ok, series}
  else
    _not_a_lazy_series -> :error
  end
end

# Access callback: updating through Access is not supported for this struct,
# so every call raises a RuntimeError regardless of key or callback.
@impl Access
def get_and_update(%__MODULE__{}, _key, _fun) do
  raise RuntimeError, "cannot update an `Explorer.Backend.LazyFrame`"
end

# Access callback: removing a key through Access is not supported for this
# struct, so every call raises a RuntimeError regardless of the key.
@impl Access
def pop(%__MODULE__{}, _key) do
  raise RuntimeError, "cannot delete from an `Explorer.Backend.LazyFrame`"
end
end
87 changes: 63 additions & 24 deletions lib/explorer/data_frame.ex
Original file line number Diff line number Diff line change
Expand Up @@ -270,7 +270,10 @@ defmodule Explorer.DataFrame do

defguardp is_column(column) when is_binary(column) or is_atom(column) or is_integer(column)
defguardp is_column_name(column) when is_binary(column) or is_atom(column)
defguardp is_column_pairs(columns) when is_list(columns) or is_map(columns)

# TODO: Use is_non_struct_map when we require Elixir v1.18+
defguardp is_column_pairs(columns)
billylanchantin marked this conversation as resolved.
Show resolved Hide resolved
when is_list(columns) or (is_map(columns) and not is_struct(columns))

# Normalize a column name to string
defp to_column_name(column) when is_binary(column), do: column
Expand Down Expand Up @@ -2606,12 +2609,22 @@ defmodule Explorer.DataFrame do
@doc type: :single
@spec filter_with(
df :: DataFrame.t(),
callback :: (Explorer.Backend.LazyFrame.t() -> Series.lazy_t() | [Series.lazy_t()])
callback_or_lazy_series_or_list ::
(Explorer.Backend.LazyFrame.t() -> Series.lazy_t() | [Series.lazy_t()])
| Series.lazy_t()
| [Series.lazy_t()]
) :: DataFrame.t()
def filter_with(df, fun) when is_function(fun, 1) do
ldf = Explorer.Backend.LazyFrame.new(df)
filter =
df
|> Explorer.Query.new()
|> fun.()

filter_with(df, filter)
end

case fun.(ldf) do
def filter_with(df, filter) do
case filter do
%Series{dtype: :boolean, data: %LazySeries{} = data} ->
Shared.apply_dataframe(df, :filter_with, [df, data])

Expand Down Expand Up @@ -2925,21 +2938,31 @@ defmodule Explorer.DataFrame do
@doc type: :single
@spec mutate_with(
df :: DataFrame.t(),
callback :: (Explorer.Backend.LazyFrame.t() -> column_pairs(Series.lazy_t())),
callback_or_column_pairs ::
(Explorer.Backend.LazyFrame.t() -> column_pairs(Series.lazy_t()))
| column_pairs(Series.lazy_t()),
opts :: keyword()
) :: DataFrame.t()
def mutate_with(%DataFrame{} = df, fun, opts \\ []) when is_function(fun) and is_list(opts) do
def mutate_with(df, query_or_fun, opts \\ [])

def mutate_with(%DataFrame{} = df, fun, opts) when is_function(fun, 1) and is_list(opts) do
  # Callback form: build the query frame for `df`, let the one-arity callback
  # produce its column pairs from it, then re-dispatch with that result so the
  # remaining work is done by the clause that takes the value directly.
  query_frame = Explorer.Query.new(df)
  mutate_with(df, fun.(query_frame), opts)
end

def mutate_with(%DataFrame{} = df, column_pairs, opts)
when is_column_pairs(column_pairs) and is_list(opts) do
keep = Keyword.get(opts, :keep, :all)

unless keep in [:all, :none] do
raise ArgumentError, "Invalid value for :keep option. Allowed values are :all or :none."
end

ldf = Explorer.Backend.LazyFrame.new(df)

result = fun.(ldf)

column_pairs = to_column_pairs(df, result, &query_to_series!/1)
column_pairs = to_column_pairs(df, column_pairs, &query_to_series!/1)

new_dtypes =
for {column_name, series} <- column_pairs, into: %{} do
Expand Down Expand Up @@ -3422,21 +3445,30 @@ defmodule Explorer.DataFrame do
>
"""
@doc type: :single
@type sort_callback_result ::
Series.lazy_t() | [Series.lazy_t()] | [{:asc | :desc, Series.lazy_t()}]
@spec sort_with(
df :: DataFrame.t(),
(Explorer.Backend.LazyFrame.t() ->
Series.lazy_t() | [Series.lazy_t()] | [{:asc | :desc, Series.lazy_t()}]),
callback_or_result ::
(Explorer.Backend.LazyFrame.t() -> sort_callback_result()) | sort_callback_result(),
opts :: [nils: :first | :last, stable: boolean()]
) :: DataFrame.t()
def sort_with(%DataFrame{} = df, fun, opts \\ []) when is_function(fun, 1) do
[_descending? | opts] = Shared.validate_sort_options!(opts)
def sort_with(df, fun, opts \\ [])

ldf = Explorer.Backend.LazyFrame.new(df)
def sort_with(%DataFrame{} = df, fun, opts) when is_function(fun, 1) do
  # Callback form: build the query frame for `df`, evaluate the one-arity
  # callback against it, then re-dispatch with whatever it returned so the
  # sorting itself is handled by the clause that takes the value directly.
  query_frame = Explorer.Query.new(df)
  sort_with(df, fun.(query_frame), opts)
end

result = fun.(ldf)
def sort_with(%DataFrame{} = df, sortable, opts) do
[_descending? | opts] = Shared.validate_sort_options!(opts)

dir_and_lazy_series_pairs =
result
sortable
|> List.wrap()
|> Enum.map(fn
{dir, %Series{data: %LazySeries{} = lazy_series}} when dir in [:asc, :desc] ->
Expand Down Expand Up @@ -5674,15 +5706,22 @@ defmodule Explorer.DataFrame do
@doc type: :single
@spec summarise_with(
df :: DataFrame.t(),
callback :: (Explorer.Backend.LazyFrame.t() -> column_pairs(Series.lazy_t()))
callback_or_column_pairs ::
(Explorer.Backend.LazyFrame.t() -> column_pairs(Series.lazy_t()))
| column_pairs(Series.lazy_t())
) :: DataFrame.t()
def summarise_with(%DataFrame{} = df, fun) when is_function(fun, 1) do
ldf = Explorer.Backend.LazyFrame.new(df)
column_pairs =
df
|> Explorer.Query.new()
|> fun.()

result = fun.(ldf)
summarise_with(df, column_pairs)
end

result =
Enum.map(result, fn
def summarise_with(%DataFrame{} = df, column_pairs) when is_column_pairs(column_pairs) do
column_pairs =
Enum.map(column_pairs, fn
{key, nil} ->
lazy_s = LazySeries.unbacked(:lazy, [nil], :null)
{key, Explorer.Backend.Series.new(lazy_s, :null)}
Expand All @@ -5692,7 +5731,7 @@ defmodule Explorer.DataFrame do
end)

column_pairs =
to_column_pairs(df, result, fn value ->
to_column_pairs(df, column_pairs, fn value ->
case value do
%Series{data: %LazySeries{op: :lazy, args: [nil], dtype: :null}} ->
value
Expand Down
7 changes: 7 additions & 0 deletions lib/explorer/query.ex
Original file line number Diff line number Diff line change
Expand Up @@ -292,6 +292,13 @@ defmodule Explorer.Query do

@kernel_only kernel_only -- kernel_only -- kernel_all

@doc """
Builds a frame that returns lazy series when accessed.

The given `Explorer.DataFrame` is handed to `Explorer.Backend.LazyFrame.new/1`
and the resulting frame is returned unchanged. Column access on that frame goes
through the `Access` behaviour implemented by `Explorer.Backend.LazyFrame`.

Only `Explorer.DataFrame` structs are accepted; any other argument fails to
match and raises a `FunctionClauseError`.
"""
@spec new(Explorer.DataFrame.t()) :: Explorer.Backend.LazyFrame.t()
def new(%Explorer.DataFrame{} = df) do
  Explorer.Backend.LazyFrame.new(df)
end

@doc """
Builds an anonymous function from a query.

Expand Down
Loading