Skip to content

Commit

Permalink
Stop inference of type if dtype is given and manually decode terms wh…
Browse files Browse the repository at this point in the history
…ere needed (#928)

* Add manual decoding of terms in "from_list"

The idea is to avoid the need to cast from the Elixir side. This also
enables the use of mixed data when the dtype is provided (this becomes
true in the next commit).

* Do not infer if type has been given

* Remove implicit casting from "from_list/2" and update docs

* Move remaining implementations of "from_list" to its module

* Fix error messages and simplify things

---------

Co-authored-by: José Valim <[email protected]>
  • Loading branch information
philss and josevalim authored Jun 14, 2024
1 parent e49af86 commit d10d553
Show file tree
Hide file tree
Showing 8 changed files with 405 additions and 308 deletions.
13 changes: 7 additions & 6 deletions lib/explorer/polars_backend/shared.ex
Original file line number Diff line number Diff line change
Expand Up @@ -8,11 +8,12 @@ defmodule Explorer.PolarsBackend.Shared do
alias Explorer.PolarsBackend.Native
alias Explorer.PolarsBackend.Series, as: PolarsSeries
alias Explorer.Series, as: Series
import Kernel, except: [apply: 2]

@polars_df [PolarsDataFrame, PolarsLazyFrame]

def apply(fun, args \\ []) do
case apply(Native, fun, args) do
case Kernel.apply(Native, fun, args) do
{:ok, value} -> value
{:error, error} -> raise runtime_error(error)
end
Expand Down Expand Up @@ -185,11 +186,11 @@ defmodule Explorer.PolarsBackend.Shared do
:boolean -> Native.s_from_list_bool(name, list)
:string -> Native.s_from_list_str(name, list)
:category -> Native.s_from_list_categories(name, list)
:date -> Native.s_from_list_date(name, list)
:time -> Native.s_from_list_time(name, list)
{:naive_datetime, precision} -> Native.s_from_list_naive_datetime(name, list, precision)
{:datetime, precision, tz} -> Native.s_from_list_datetime(name, list, precision, tz)
{:duration, precision} -> Native.s_from_list_duration(name, list, precision)
:date -> apply(:s_from_list_date, [name, list])
:time -> apply(:s_from_list_time, [name, list])
{:naive_datetime, precision} -> apply(:s_from_list_naive_datetime, [name, list, precision])
{:datetime, precision, tz} -> apply(:s_from_list_datetime, [name, list, precision, tz])
{:duration, precision} -> apply(:s_from_list_duration, [name, list, precision])
:binary -> Native.s_from_list_binary(name, list)
:null -> Native.s_from_list_null(name, length(list))
end
Expand Down
33 changes: 19 additions & 14 deletions lib/explorer/series.ex
Original file line number Diff line number Diff line change
Expand Up @@ -296,7 +296,7 @@ defmodule Explorer.Series do
## Options
* `:backend` - The backend to allocate the series on.
* `:dtype` - Cast the series to a given `:dtype`. By default this is `nil`, which means
* `:dtype` - Create a series of a given `:dtype`. By default this is `nil`, which means
that Explorer will infer the type from the values in the list.
See the module docs for the list of valid dtypes and aliases.
Expand Down Expand Up @@ -387,12 +387,6 @@ defmodule Explorer.Series do
s64 [nil, nil]
>
iex> Explorer.Series.from_list([1, nil], dtype: :string)
#Explorer.Series<
Polars[2]
string ["1", nil]
>
iex> Explorer.Series.from_list([1, 2], dtype: :f32)
#Explorer.Series<
Polars[2]
Expand Down Expand Up @@ -431,6 +425,14 @@ defmodule Explorer.Series do
category ["EUA", "Brazil", "Poland"]
>
It is possible to create a series of `:date` from a list of days since Unix Epoch.
iex> Explorer.Series.from_list([1, nil], dtype: :date)
#Explorer.Series<
Polars[2]
date [1970-01-02, nil]
>
It is possible to create a series of `:naive_datetime` from a list of microseconds since Unix Epoch.
iex> Explorer.Series.from_list([1649883642 * 1_000 * 1_000], dtype: {:naive_datetime, :microsecond})
Expand All @@ -451,6 +453,15 @@ defmodule Explorer.Series do
iex> Explorer.Series.from_list([1, "a"])
** (ArgumentError) the value "a" does not match the inferred dtype {:s, 64}
However, mixing integers with values of the `:date`, `:datetime`, `:time`, or `:duration`
types works if the desired dtype is given:
iex> Explorer.Series.from_list([1, nil, ~D[2024-06-13]], dtype: :date)
#Explorer.Series<
Polars[3]
date [1970-01-02, nil, 2024-06-13]
>
"""
@doc type: :conversion
@spec from_list(list :: list(), opts :: Keyword.t()) :: Series.t()
Expand All @@ -462,13 +473,7 @@ defmodule Explorer.Series do

type = Shared.dtype_from_list!(list, normalised_dtype)

series = backend.from_list(list, type)

case normalised_dtype do
nil -> series
^type -> series
other -> cast(series, other)
end
backend.from_list(list, type)
end

defp from_same_value(%{data: %backend{}}, value) do
Expand Down
41 changes: 1 addition & 40 deletions lib/explorer/shared.ex
Original file line number Diff line number Diff line change
Expand Up @@ -295,15 +295,8 @@ defmodule Explorer.Shared do
If no preferred type is given (nil), then the inferred type is returned.
"""
def dtype_from_list!(_list, :null), do: :null

def dtype_from_list!(list, nil), do: dtype_from_list!(list)

def dtype_from_list!(list, preferred_type) do
list
|> dtype_from_list!()
|> merge_preferred(preferred_type)
end
def dtype_from_list!(_list, preferred_type), do: preferred_type

@non_finite [:nan, :infinity, :neg_infinity]

Expand Down Expand Up @@ -363,38 +356,6 @@ defmodule Explorer.Shared do
{:struct, Enum.sort(types)}
end

# Reconciles a dtype inferred from the values (first argument) with the dtype
# the caller asked for (second argument).
#
# The preferred dtype is returned only for the pairs listed explicitly below:
#   * identical dtypes;
#   * an inferred `:null`;
#   * an inferred `{:s, 64}` with any preferred `{:u, _}`, `{:s, _}` or `{:f, _}`;
#   * an inferred `{:f, 64}` with any preferred `{:f, _}`;
#   * an inferred `:string` with a preferred `:binary`, `:string` or `:category`.
#
# `{:list, _}` and `{:struct, _}` dtypes are merged recursively. Every other
# combination falls through to the last clause, which keeps the inferred dtype.
defp merge_preferred(type, type), do: type
defp merge_preferred(:null, type), do: type
defp merge_preferred({:s, 64}, {:u, _} = type), do: type
defp merge_preferred({:s, 64}, {:s, _} = type), do: type
defp merge_preferred({:s, 64}, {:f, _} = type), do: type
defp merge_preferred({:f, 64}, {:f, _} = type), do: type
defp merge_preferred(:string, type) when type in [:binary, :string, :category], do: type

defp merge_preferred({:list, inferred}, {:list, preferred}) do
  {:list, merge_preferred(inferred, preferred)}
end

defp merge_preferred({:struct, inferred}, {:struct, preferred}) do
  # Walk the preferred fields in order. A field that also exists in the
  # inferred dtype is merged recursively and removed from the running
  # remainder; a field that only exists in the preferred dtype is taken as-is.
  {remaining, all_merged} =
    Enum.reduce(preferred, {inferred, []}, fn {col, dtype}, {inferred_rest, merged} ->
      case List.keytake(inferred_rest, col, 0) do
        {{^col, inferred_dtype}, rest} ->
          solved = merge_preferred(inferred_dtype, dtype)
          {rest, List.keystore(merged, col, 0, {col, solved})}

        nil ->
          # Carry the running remainder forward, not the full inferred list:
          # otherwise fields consumed by earlier iterations would reappear in
          # `remaining` and be duplicated in the resulting struct dtype.
          {inferred_rest, List.keystore(merged, col, 0, {col, dtype})}
      end
    end)

  # Inferred fields never mentioned by the preferred dtype are appended last.
  {:struct, all_merged ++ remaining}
end

defp merge_preferred(inferred, _preferred) do
  inferred
end

@doc """
Returns the leaf dtype from a {:list, _} dtype, or itself.
"""
Expand Down
1 change: 1 addition & 0 deletions native/explorer/src/lib.rs
Original file line number Diff line number Diff line change
Expand Up @@ -35,6 +35,7 @@ pub use error::ExplorerError;
use expressions::*;
use lazyframe::io::*;
use lazyframe::*;
use series::from_list::*;
use series::log::*;
use series::*;

Expand Down
Loading

0 comments on commit d10d553

Please sign in to comment.