From fe5235268d9e085cf35dc8f5be35c4ae52f88df5 Mon Sep 17 00:00:00 2001 From: Billy Lanchantin Date: Mon, 30 Sep 2024 16:30:19 -0400 Subject: [PATCH] De-functionalize query internals (#989) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * Rename `traverse/2` as `traverse_root/2` This just makes the intended use clearer. * Append `?` to `collect_pins_and_vars` This makes it clearer it's a boolean. * Alias `Kernel` as `K` This is in line with what we do elsewhere. * Slight format refactor * Change `query/1` to `query/2` This is the primary change. Now `query/2` takes the dataframe as an explicit argument instead of an implicit, unhygienized variable. * Add comment about non-obvious function wrap * Try to fix specs * Comment out broken test * Revert style changes * Revert all changes to `query.ex` * Revert all changes to `data_frame.ex` * Revert changes to `series.ex` * Add new/1 that returns a Backend.LazyFrame * Make is_column_pairs return false for structs * Add _with clauses * Add Access behaviour * Make all impls specify behaviour * Uncomment test * Add TODO for `is_non_struct_map` Co-authored-by: José Valim * Change Backend.LazyFrame to Backend.QueryFrame * Rename files * Edit test internals for readability * Add docs Re-write the "Implementation details" section to reference the new functionality.
* Doc tweaks * Remove an extra "the" * Reference the rewritten section in the `new/1` docs * Revert 1st sentence * Revert "Implementation details" * Add smaller addendum to that section * Add example to new/1 docs * Update lib/explorer/backend/query_frame.ex Co-authored-by: José Valim * Fix punctuation --------- Co-authored-by: José Valim --- lib/explorer/backend/data_frame.ex | 2 +- .../backend/{lazy_frame.ex => query_frame.ex} | 46 +++++++--- lib/explorer/data_frame.ex | 87 ++++++++++++++----- lib/explorer/query.ex | 71 +++++++++++++++ mix.exs | 2 +- ...zy_frame_test.exs => query_frame_test.exs} | 10 +-- 6 files changed, 176 insertions(+), 42 deletions(-) rename lib/explorer/backend/{lazy_frame.ex => query_frame.ex} (72%) rename test/explorer/backend/{lazy_frame_test.exs => query_frame_test.exs} (62%) diff --git a/lib/explorer/backend/data_frame.ex b/lib/explorer/backend/data_frame.ex index 8a0fece92..149ac6493 100644 --- a/lib/explorer/backend/data_frame.ex +++ b/lib/explorer/backend/data_frame.ex @@ -29,7 +29,7 @@ defmodule Explorer.Backend.DataFrame do | [basic_types()] | (df() -> series() | basic_types() | [basic_types()]) - @type lazy_frame :: Explorer.Backend.LazyFrame.t() + @type query_frame :: Explorer.Backend.QueryFrame.t() @type lazy_series :: Explorer.Backend.LazySeries.t() @type compression :: {algorithm :: option(atom()), level :: option(integer())} diff --git a/lib/explorer/backend/lazy_frame.ex b/lib/explorer/backend/query_frame.ex similarity index 72% rename from lib/explorer/backend/lazy_frame.ex rename to lib/explorer/backend/query_frame.ex index 06bb624c8..c79e78d59 100644 --- a/lib/explorer/backend/lazy_frame.ex +++ b/lib/explorer/backend/query_frame.ex @@ -1,8 +1,9 @@ -defmodule Explorer.Backend.LazyFrame do +defmodule Explorer.Backend.QueryFrame do @moduledoc """ Represents a lazy dataframe for building query expressions. 
- The LazyFrame is available inside `filter_with`, `mutate_with`, and + You may call `Explorer.Query.new` to create a query-backed dataframe. + The QueryFrame is available inside `filter_with`, `mutate_with`, and similar. You cannot perform any operation on them except accessing its underlying series. """ @@ -18,6 +19,8 @@ defmodule Explorer.Backend.LazyFrame do names: Backend.DataFrame.column_name(), resource: reference() | nil } + + @behaviour Access @behaviour Backend.DataFrame @doc false @@ -40,16 +43,16 @@ defmodule Explorer.Backend.LazyFrame do # cross node operations happen at the lazy frame level. # Instead, we store the resource and we delegate them # to the underlying lazy series. - @impl true + @impl Backend.DataFrame def owner_reference(_), do: nil - @impl true + @impl Backend.DataFrame def lazy, do: __MODULE__ - @impl true + @impl Backend.DataFrame def lazy(ldf), do: ldf - @impl true + @impl Backend.DataFrame def inspect(ldf, opts) do import Inspect.Algebra @@ -68,7 +71,7 @@ defmodule Explorer.Backend.LazyFrame do end concat([ - color("LazyFrame", :atom, opts), + color("QueryFrame", :atom, opts), open, "??? x #{length(cols_algebra)}", close, @@ -86,7 +89,7 @@ defmodule Explorer.Backend.LazyFrame do defp groups_algebra([], _), do: "" - @impl true + @impl Backend.DataFrame def pull(%{data: data, dtypes: dtypes}, column) do dtype_for_column = dtypes[column] @@ -103,14 +106,35 @@ defmodule Explorer.Backend.LazyFrame do for {fun, arity} <- funs do args = Macro.generate_arguments(arity, __MODULE__) - @impl true + @impl Backend.DataFrame def unquote(fun)(unquote_splicing(args)) do raise """ - cannot perform operation #{unquote(fun)} on Explorer.Backend.LazyFrame. + cannot perform operation #{unquote(fun)} on Explorer.Backend.QueryFrame. 
- The LazyFrame is available inside filter_with, mutate_with, and \ + The QueryFrame is available inside filter_with, mutate_with, and \ similar and they support only a limited subset of the Series API """ end end + + @impl Access + def fetch(%__MODULE__{} = lazy_frame, name) do + case pull(lazy_frame, name) do + %Explorer.Series{data: %Explorer.Backend.LazySeries{}} = lazy_series -> + {:ok, lazy_series} + + _other -> + :error + end + end + + @impl Access + def get_and_update(%__MODULE__{}, _name, _callback) do + raise "cannot update an `Explorer.Backend.QueryFrame`" + end + + @impl Access + def pop(%__MODULE__{}, _name) do + raise "cannot delete from an `Explorer.Backend.QueryFrame`" + end end diff --git a/lib/explorer/data_frame.ex b/lib/explorer/data_frame.ex index 9b744c525..fc9f22b0f 100644 --- a/lib/explorer/data_frame.ex +++ b/lib/explorer/data_frame.ex @@ -270,7 +270,10 @@ defmodule Explorer.DataFrame do defguardp is_column(column) when is_binary(column) or is_atom(column) or is_integer(column) defguardp is_column_name(column) when is_binary(column) or is_atom(column) - defguardp is_column_pairs(columns) when is_list(columns) or is_map(columns) + + # TODO: Use is_non_struct_map when we require Elixir v1.18+ + defguardp is_column_pairs(columns) + when is_list(columns) or (is_map(columns) and not is_struct(columns)) # Normalize a column name to string defp to_column_name(column) when is_binary(column), do: column @@ -2607,12 +2610,22 @@ defmodule Explorer.DataFrame do @doc type: :single @spec filter_with( df :: DataFrame.t(), - callback :: (Explorer.Backend.LazyFrame.t() -> Series.lazy_t() | [Series.lazy_t()]) + callback_or_lazy_series_or_list :: + (Explorer.Backend.QueryFrame.t() -> Series.lazy_t() | [Series.lazy_t()]) + | Series.lazy_t() + | [Series.lazy_t()] ) :: DataFrame.t() def filter_with(df, fun) when is_function(fun, 1) do - ldf = Explorer.Backend.LazyFrame.new(df) + filter = + df + |> Explorer.Query.new() + |> fun.() + + filter_with(df, filter) + 
end - case fun.(ldf) do + def filter_with(df, filter) do + case filter do %Series{dtype: :boolean, data: %LazySeries{} = data} -> Shared.apply_dataframe(df, :filter_with, [df, data]) @@ -2926,21 +2939,31 @@ defmodule Explorer.DataFrame do @doc type: :single @spec mutate_with( df :: DataFrame.t(), - callback :: (Explorer.Backend.LazyFrame.t() -> column_pairs(Series.lazy_t())), + callback_or_column_pairs :: + (Explorer.Backend.QueryFrame.t() -> column_pairs(Series.lazy_t())) + | column_pairs(Series.lazy_t()), opts :: keyword() ) :: DataFrame.t() - def mutate_with(%DataFrame{} = df, fun, opts \\ []) when is_function(fun) and is_list(opts) do + def mutate_with(df, query_or_fun, opts \\ []) + + def mutate_with(%DataFrame{} = df, fun, opts) when is_function(fun, 1) and is_list(opts) do + column_pairs = + df + |> Explorer.Query.new() + |> fun.() + + mutate_with(df, column_pairs, opts) + end + + def mutate_with(%DataFrame{} = df, column_pairs, opts) + when is_column_pairs(column_pairs) and is_list(opts) do keep = Keyword.get(opts, :keep, :all) unless keep in [:all, :none] do raise ArgumentError, "Invalid value for :keep option. Allowed values are :all or :none." 
end - ldf = Explorer.Backend.LazyFrame.new(df) - - result = fun.(ldf) - - column_pairs = to_column_pairs(df, result, &query_to_series!/1) + column_pairs = to_column_pairs(df, column_pairs, &query_to_series!/1) new_dtypes = for {column_name, series} <- column_pairs, into: %{} do @@ -3423,21 +3446,30 @@ defmodule Explorer.DataFrame do > """ @doc type: :single + @type sort_callback_result :: + Series.lazy_t() | [Series.lazy_t()] | [{:asc | :desc, Series.lazy_t()}] @spec sort_with( df :: DataFrame.t(), - (Explorer.Backend.LazyFrame.t() -> - Series.lazy_t() | [Series.lazy_t()] | [{:asc | :desc, Series.lazy_t()}]), + callback_or_result :: + (Explorer.Backend.QueryFrame.t() -> sort_callback_result()) | sort_callback_result(), opts :: [nils: :first | :last, stable: boolean()] ) :: DataFrame.t() - def sort_with(%DataFrame{} = df, fun, opts \\ []) when is_function(fun, 1) do - [_descending? | opts] = Shared.validate_sort_options!(opts) + def sort_with(df, fun, opts \\ []) - ldf = Explorer.Backend.LazyFrame.new(df) + def sort_with(%DataFrame{} = df, fun, opts) when is_function(fun, 1) do + sortable = + df + |> Explorer.Query.new() + |> fun.() + + sort_with(df, sortable, opts) + end - result = fun.(ldf) + def sort_with(%DataFrame{} = df, sortable, opts) do + [_descending? 
| opts] = Shared.validate_sort_options!(opts) dir_and_lazy_series_pairs = - result + sortable |> List.wrap() |> Enum.map(fn {dir, %Series{data: %LazySeries{} = lazy_series}} when dir in [:asc, :desc] -> @@ -5675,15 +5707,22 @@ defmodule Explorer.DataFrame do @doc type: :single @spec summarise_with( df :: DataFrame.t(), - callback :: (Explorer.Backend.LazyFrame.t() -> column_pairs(Series.lazy_t())) + callback_or_column_pairs :: + (Explorer.Backend.QueryFrame.t() -> column_pairs(Series.lazy_t())) + | column_pairs(Series.lazy_t()) ) :: DataFrame.t() def summarise_with(%DataFrame{} = df, fun) when is_function(fun, 1) do - ldf = Explorer.Backend.LazyFrame.new(df) + column_pairs = + df + |> Explorer.Query.new() + |> fun.() - result = fun.(ldf) + summarise_with(df, column_pairs) + end - result = - Enum.map(result, fn + def summarise_with(%DataFrame{} = df, column_pairs) when is_column_pairs(column_pairs) do + column_pairs = + Enum.map(column_pairs, fn {key, nil} -> lazy_s = LazySeries.unbacked(:lazy, [nil], :null) {key, Explorer.Backend.Series.new(lazy_s, :null)} @@ -5693,7 +5732,7 @@ defmodule Explorer.DataFrame do end) column_pairs = - to_column_pairs(df, result, fn value -> + to_column_pairs(df, column_pairs, fn value -> case value do %Series{data: %LazySeries{op: :lazy, args: [nil], dtype: :null}} -> value diff --git a/lib/explorer/query.ex b/lib/explorer/query.ex index 62d5e1171..7cccd63ab 100644 --- a/lib/explorer/query.ex +++ b/lib/explorer/query.ex @@ -268,6 +268,36 @@ defmodule Explorer.Query do This means that, whenever you want to generate queries programatically, you can fallback to the regular `_with` APIs. + + In the `_with` APIs, the callbacks receive an `Explorer.DataFrame` as an + input. That dataframe is backed by the special `Explorer.Backend.QueryFrame` + backend. + + Explorer.DataFrame.filter_with(df, fn query_backed_frame -> + IO.inspect(query_backed_frame) + ... + end) + # #Explorer.DataFrame< + # QueryFrame[??? x 1] + # ... 
+ # > + + A "query-backed" dataframe cannot be manipulated. You may only access its + series. And when you do, you get back "lazy-backed" versions of those series: + + Explorer.DataFrame.filter_with(df, fn query_backed_frame -> + IO.inspect(query_backed_frame["a"]) + ... + end) + # #Explorer.Series< + # LazySeries[???] + # s64 (column("a")) + # > + + "Lazy-backed" series are backed by the special `Explorer.Backend.LazySeries` + backend. All `Explorer.Series` functions work on lazy-backed series too. So + you can write your `_with` callbacks without ever referencing the fact that + the backend is the lazy one. """ kernel_all = Kernel.__info__(:functions) ++ Kernel.__info__(:macros) @@ -292,6 +322,47 @@ defmodule Explorer.Query do @kernel_only kernel_only -- kernel_only -- kernel_all + @doc """ + Returns a "query-backed" `Explorer.DataFrame` for use in queries. + + This function is mostly an implementation detail for the `*_with` callbacks. + See the "Implementation details" section of the `@moduledoc` for details. + + There are some limited instances where it's more convenient to work with + query-backed `DataFrame`s. For example, if you want to re-use a lazy series, + you can do so like this: + + alias Explorer.{DataFrame, Query, Series} + + df = DataFrame.new(a: [1, 2, 3]) + qf = Query.new(df) + + gt_1 = Series.greater(qf["a"], 1) + lt_3 = Series.less(qf["a"], 3) + + df + |> DataFrame.filter_with(gt_1) + |> DataFrame.to_columns(atom_keys: true) + #=> %{a: [2, 3]} + + df + |> DataFrame.filter_with(lt_3) + |> DataFrame.to_columns(atom_keys: true) + #=> %{a: [1, 2]} + + df + |> DataFrame.filter_with(Series.and(gt_1, lt_3)) + |> DataFrame.to_columns(atom_keys: true) + #=> %{a: [2]} + + However, if you think you need `new/1`, first check that you can't accomplish + the same thing with `across/0` inside a macro. The latter is usually easier to + work with. 
+ """ + def new(%Explorer.DataFrame{} = df) do + Explorer.Backend.QueryFrame.new(df) + end + @doc """ Builds an anonymous function from a query. diff --git a/mix.exs b/mix.exs index ab262e10d..af8850333 100644 --- a/mix.exs +++ b/mix.exs @@ -86,7 +86,7 @@ defmodule Explorer.MixProject do Explorer.Backend, Explorer.Backend.DataFrame, Explorer.Backend.Series, - Explorer.Backend.LazyFrame, + Explorer.Backend.QueryFrame, Explorer.Backend.LazySeries, Explorer.PolarsBackend ] diff --git a/test/explorer/backend/lazy_frame_test.exs b/test/explorer/backend/query_frame_test.exs similarity index 62% rename from test/explorer/backend/lazy_frame_test.exs rename to test/explorer/backend/query_frame_test.exs index eba49ed77..c81414847 100644 --- a/test/explorer/backend/lazy_frame_test.exs +++ b/test/explorer/backend/query_frame_test.exs @@ -1,15 +1,15 @@ -defmodule Explorer.Backend.LazyFrameTest do +defmodule Explorer.Backend.QueryFrameTest do use ExUnit.Case, async: true - alias Explorer.Backend.LazyFrame + alias Explorer.Backend.QueryFrame test "inspect/2 prints the columns without data" do df = Explorer.DataFrame.new(a: [1, 2], b: [3.1, 4.5]) - ldf = LazyFrame.new(df) + qf = QueryFrame.new(df) - assert inspect(ldf) == + assert inspect(qf) == """ #Explorer.DataFrame< - LazyFrame[??? x 2] + QueryFrame[??? x 2] a s64 b f64 >\