Skip to content

Commit

Permalink
Merge branch 'main' into trim_other/2
Browse files Browse the repository at this point in the history
  • Loading branch information
josevalim authored Jul 28, 2023
2 parents 63d3e96 + 8e32a9b commit 8445d93
Show file tree
Hide file tree
Showing 28 changed files with 899 additions and 179 deletions.
29 changes: 29 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -240,6 +240,35 @@ mix localstack.setup
mix test --only cloud_integration
```

## Precompilation

Explorer ships with the NIF code precompiled for the most popular architectures out there.
We support the following:

- `aarch64-apple-darwin` - MacOS running on ARM 64 bits CPUs.
- `aarch64-unknown-linux-gnu` - Linux running on ARM 64 bits CPUs, compiled with GCC.
- `aarch64-unknown-linux-musl` - Linux running on ARM 64 bits CPUs, compiled with Musl.
- `riscv64gc-unknown-linux-gnu` - Linux running on RISCV 64 bits CPUs, compiled with GCC.
- `x86_64-apple-darwin` - MacOS running on Intel/AMD 64 bits CPUs.
- `x86_64-pc-windows-msvc` - Windows running on Intel/AMD 64 bits CPUs, compiled with Visual C++.
- `x86_64-pc-windows-gnu` - Windows running on Intel/AMD 64 bits CPUs, compiled with GCC.
- `x86_64-unknown-linux-gnu` - Linux running on Intel/AMD 64 bits CPUs, compiled with GCC.
- `x86_64-unknown-linux-musl` - Linux running on Intel/AMD 64 bits CPUs, compiled with Musl.
- `x86_64-unknown-freebsd` - FreeBSD running on Intel/AMD 64 bits.

This means that the package is going to work without the need to compile it from source.
This currently **only works for Hex releases**. For more information on how it works, please
check the [RustlerPrecompiled project](https://hexdocs.pm/rustler_precompiled).

### Features disabled

Some of the features cannot be compiled for some targets, because one of the dependencies
doesn't work on them.

This is the case for the **NDJSON** reads and writes, which don't work for the RISCV target.
We also disable the AWS S3 reads and writes for the RISCV target, because one of the dependencies
of `ObjectStore` does not compile on it.

## Sponsors

<a href="https://amplified.ai"><img src="sponsors/amplified.png" width=100 alt="Amplified"></a>
4 changes: 2 additions & 2 deletions lib/explorer/backend/data_frame.ex
Original file line number Diff line number Diff line change
Expand Up @@ -43,7 +43,7 @@ defmodule Explorer.Backend.DataFrame do
entry :: fs_entry(),
dtypes,
delimiter :: String.t(),
null_character :: String.t(),
nil_values :: list(String.t()),
skip_rows :: integer(),
header? :: boolean(),
encoding :: String.t(),
Expand All @@ -61,7 +61,7 @@ defmodule Explorer.Backend.DataFrame do
contents :: String.t(),
dtypes,
delimiter :: String.t(),
null_character :: String.t(),
nil_values :: list(String.t()),
skip_rows :: integer(),
header? :: boolean(),
encoding :: String.t(),
Expand Down
10 changes: 10 additions & 0 deletions lib/explorer/backend/lazy_series.ex
Original file line number Diff line number Diff line change
Expand Up @@ -68,6 +68,7 @@ defmodule Explorer.Backend.LazySeries do
cumulative_product: 2,
window_max: 5,
window_mean: 5,
window_median: 5,
window_min: 5,
window_sum: 5,
window_standard_deviation: 5,
Expand Down Expand Up @@ -116,6 +117,7 @@ defmodule Explorer.Backend.LazySeries do
trim: 2,
upcase: 1,
downcase: 1,
substring: 3,
# Float round
round: 2,
floor: 1,
Expand Down Expand Up @@ -155,6 +157,7 @@ defmodule Explorer.Backend.LazySeries do
@window_fun_operations [
:window_max,
:window_mean,
:window_median,
:window_min,
:window_sum
]
Expand Down Expand Up @@ -890,6 +893,13 @@ defmodule Explorer.Backend.LazySeries do
Backend.Series.new(data, :string)
end

@impl true
def substring(series, offset, length) do
data = new(:substring, [lazy_series!(series), offset, length])

Backend.Series.new(data, :string)
end

@impl true
def round(series, decimals) when is_integer(decimals) and decimals >= 0 do
data = new(:round, [lazy_series!(series), decimals])
Expand Down
8 changes: 8 additions & 0 deletions lib/explorer/backend/series.ex
Original file line number Diff line number Diff line change
Expand Up @@ -199,6 +199,13 @@ defmodule Explorer.Backend.Series do
min_periods :: integer() | nil,
center :: boolean()
) :: s
@callback window_median(
s,
window_size :: integer(),
weights :: [float()] | nil,
min_periods :: integer() | nil,
center :: boolean()
) :: s
@callback window_standard_deviation(
s,
window_size :: integer(),
Expand Down Expand Up @@ -240,6 +247,7 @@ defmodule Explorer.Backend.Series do
@callback trim(s, String.t() | nil) :: s
@callback trim_leading(s, String.t() | nil) :: s
@callback trim_trailing(s, String.t() | nil) :: s
@callback substring(s, integer(), non_neg_integer() | nil) :: s

# Date / DateTime

Expand Down
14 changes: 7 additions & 7 deletions lib/explorer/data_frame.ex
Original file line number Diff line number Diff line change
Expand Up @@ -516,7 +516,7 @@ defmodule Explorer.DataFrame do
* `:max_rows` - Maximum number of lines to read. (default: `nil`)
* `:null_character` - The string that should be interpreted as a nil value. (default: `"NA"`)
* `:nil_values` - A list of strings that should be interpreted as nil values. (default: `[]`)
* `:skip_rows` - The number of lines to skip at the beginning of the file. (default: `0`)
Expand Down Expand Up @@ -553,7 +553,7 @@ defmodule Explorer.DataFrame do
encoding: "utf8",
header: true,
max_rows: nil,
null_character: "NA",
nil_values: [],
skip_rows: 0,
columns: nil,
infer_schema_length: @default_infer_schema_length,
Expand All @@ -568,7 +568,7 @@ defmodule Explorer.DataFrame do
entry,
check_dtypes!(opts[:dtypes]),
opts[:delimiter],
opts[:null_character],
opts[:nil_values],
opts[:skip_rows],
opts[:header],
opts[:encoding],
Expand Down Expand Up @@ -611,7 +611,7 @@ defmodule Explorer.DataFrame do
imputed from the first 1000 rows. (default: `[]`)
* `:header` - Does the file have a header of column names as the first row or not? (default: `true`)
* `:max_rows` - Maximum number of lines to read. (default: `nil`)
* `:null_character` - The string that should be interpreted as a nil value. (default: `"NA"`)
* `:nil_values` - A list of strings that should be interpreted as nil values. (default: `[]`)
* `:skip_rows` - The number of lines to skip at the beginning of the file. (default: `0`)
* `:columns` - A list of column names or indexes to keep. If present, only these columns are read into the dataframe. (default: `nil`)
* `:infer_schema_length` Maximum number of rows read for schema inference. Setting this to nil will do a full table scan and will be slow (default: `1000`).
Expand All @@ -633,7 +633,7 @@ defmodule Explorer.DataFrame do
encoding: "utf8",
header: true,
max_rows: nil,
null_character: "NA",
nil_values: [],
skip_rows: 0,
columns: nil,
infer_schema_length: @default_infer_schema_length,
Expand All @@ -647,7 +647,7 @@ defmodule Explorer.DataFrame do
contents,
check_dtypes!(opts[:dtypes]),
opts[:delimiter],
opts[:null_character],
opts[:nil_values],
opts[:skip_rows],
opts[:header],
opts[:encoding],
Expand Down Expand Up @@ -785,7 +785,7 @@ defmodule Explorer.DataFrame do
@doc type: :io
@spec to_parquet(df :: DataFrame.t(), filename :: String.t() | fs_entry(), opts :: Keyword.t()) ::
:ok | {:error, term()}
def to_parquet(df, filename, opts \\ []) do
def to_parquet(%DataFrame{} = df, filename, opts \\ []) do
opts = Keyword.validate!(opts, compression: nil, streaming: true, config: nil)
compression = parquet_compression(opts[:compression])

Expand Down
63 changes: 39 additions & 24 deletions lib/explorer/polars_backend/data_frame.ex
Original file line number Diff line number Diff line change
Expand Up @@ -56,7 +56,7 @@ defmodule Explorer.PolarsBackend.DataFrame do
%Local.Entry{} = entry,
dtypes,
<<delimiter::utf8>>,
null_character,
nil_values,
skip_rows,
header?,
encoding,
Expand Down Expand Up @@ -91,7 +91,7 @@ defmodule Explorer.PolarsBackend.DataFrame do
columns,
dtypes,
encoding,
null_character,
nil_values,
parse_dates,
char_byte(eol_delimiter)
)
Expand Down Expand Up @@ -130,14 +130,10 @@ defmodule Explorer.PolarsBackend.DataFrame do
end

@impl true
def to_csv(_df, %S3.Entry{}, _header?, _delimiter) do
raise "S3 is not supported yet"
end

def to_csv_writer_sample(%DataFrame{data: df}, path, header?, delimiter) do
def to_csv(%DataFrame{data: df}, %S3.Entry{} = entry, header?, delimiter) do
<<delimiter::utf8>> = delimiter

case Native.df_to_csv_writer_sample(df, path, header?, delimiter) do
case Native.df_to_csv_cloud(df, entry, header?, delimiter) do
{:ok, _} -> :ok
{:error, error} -> {:error, error}
end
Expand All @@ -153,7 +149,7 @@ defmodule Explorer.PolarsBackend.DataFrame do
contents,
dtypes,
<<delimiter::utf8>>,
null_character,
nil_values,
skip_rows,
header?,
encoding,
Expand Down Expand Up @@ -188,7 +184,7 @@ defmodule Explorer.PolarsBackend.DataFrame do
columns,
dtypes,
encoding,
null_character,
nil_values,
parse_dates,
char_byte(eol_delimiter)
)
Expand Down Expand Up @@ -221,8 +217,11 @@ defmodule Explorer.PolarsBackend.DataFrame do
end
end

def to_ndjson(_, %S3.Entry{}) do
raise "S3 is not supported yet"
@impl true
def to_ndjson(%DataFrame{data: df}, %S3.Entry{} = entry) do
with {:ok, _} <- Native.df_to_ndjson_cloud(df, entry) do
:ok
end
end

@impl true
Expand Down Expand Up @@ -280,12 +279,19 @@ defmodule Explorer.PolarsBackend.DataFrame do

@impl true
def to_parquet(
_df,
%S3.Entry{},
_compression,
%DataFrame{data: df},
%S3.Entry{} = entry,
{compression, compression_level},
_streaming
) do
raise "S3 is not supported yet"
case Native.df_to_parquet_cloud(
df,
entry,
parquet_compression(compression, compression_level)
) do
{:ok, _} -> :ok
{:error, error} -> {:error, error}
end
end

@impl true
Expand Down Expand Up @@ -326,20 +332,23 @@ defmodule Explorer.PolarsBackend.DataFrame do

@impl true
def to_ipc(%DataFrame{data: df}, %Local.Entry{} = entry, {compression, _level}, _streaming) do
case Native.df_to_ipc(df, entry.path, Atom.to_string(compression)) do
case Native.df_to_ipc(df, entry.path, maybe_atom_to_string(compression)) do
{:ok, _} -> :ok
{:error, error} -> {:error, error}
end
end

@impl true
def to_ipc(_df, %S3.Entry{}, _, _) do
raise "S3 is not supported yet"
def to_ipc(%DataFrame{data: df}, %S3.Entry{} = entry, {compression, _level}, _streaming) do
case Native.df_to_ipc_cloud(df, entry, maybe_atom_to_string(compression)) do
{:ok, _} -> :ok
{:error, error} -> {:error, error}
end
end

@impl true
def dump_ipc(%DataFrame{data: df}, {compression, _level}) do
Native.df_dump_ipc(df, Atom.to_string(compression))
Native.df_dump_ipc(df, maybe_atom_to_string(compression))
end

@impl true
Expand Down Expand Up @@ -369,20 +378,23 @@ defmodule Explorer.PolarsBackend.DataFrame do

@impl true
def to_ipc_stream(%DataFrame{data: df}, %Local.Entry{} = entry, {compression, _level}) do
case Native.df_to_ipc_stream(df, entry.path, Atom.to_string(compression)) do
case Native.df_to_ipc_stream(df, entry.path, maybe_atom_to_string(compression)) do
{:ok, _} -> :ok
{:error, error} -> {:error, error}
end
end

@impl true
def to_ipc_stream(_df, %S3.Entry{}, _compression) do
raise "S3 is not supported yet"
def to_ipc_stream(%DataFrame{data: df}, %S3.Entry{} = entry, {compression, _level}) do
case Native.df_to_ipc_stream_cloud(df, entry, maybe_atom_to_string(compression)) do
{:ok, _} -> :ok
{:error, error} -> {:error, error}
end
end

@impl true
def dump_ipc_stream(%DataFrame{data: df}, {compression, _level}) do
Native.df_dump_ipc_stream(df, Atom.to_string(compression))
Native.df_dump_ipc_stream(df, maybe_atom_to_string(compression))
end

@impl true
Expand All @@ -395,6 +407,9 @@ defmodule Explorer.PolarsBackend.DataFrame do
end
end

defp maybe_atom_to_string(nil), do: nil
defp maybe_atom_to_string(atom) when is_atom(atom), do: Atom.to_string(atom)

# Conversion

@impl true
Expand Down
4 changes: 3 additions & 1 deletion lib/explorer/polars_backend/expression.ex
Original file line number Diff line number Diff line change
Expand Up @@ -109,6 +109,7 @@ defmodule Explorer.PolarsBackend.Expression do
cumulative_product: 2,
window_max: 5,
window_mean: 5,
window_median: 5,
window_min: 5,
window_sum: 5,
window_standard_deviation: 5,
Expand All @@ -124,7 +125,8 @@ defmodule Explorer.PolarsBackend.Expression do
trim_leading: 2,
trim_trailing: 2,
downcase: 1,
upcase: 1
upcase: 1,
substring: 3
]

@custom_expressions [
Expand Down
Loading

0 comments on commit 8445d93

Please sign in to comment.