Skip to content

Commit

Permalink
Merge branch 'main' into trim_other/2
Browse files Browse the repository at this point in the history
  • Loading branch information
josevalim authored Jul 28, 2023
2 parents 63d3e96 + 8e32a9b commit 8445d93
Show file tree
Hide file tree
Showing 28 changed files with 899 additions and 179 deletions.
29 changes: 29 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -240,6 +240,35 @@ mix localstack.setup
mix test --only cloud_integration
```

## Precompilation

Explorer ships with the NIF code precompiled for the most popular architectures out there.
We support the following:

- `aarch64-apple-darwin` - MacOS running on ARM 64 bits CPUs.
- `aarch64-unknown-linux-gnu` - Linux running on ARM 64 bits CPUs, compiled with GCC.
- `aarch64-unknown-linux-musl` - Linux running on ARM 64 bits CPUs, compiled with Musl.
- `riscv64gc-unknown-linux-gnu` - Linux running on RISCV 64 bits CPUs, compiled with GCC.
- `x86_64-apple-darwin` - MacOS running on Intel/AMD 64 bits CPUs.
- `x86_64-pc-windows-msvc` - Windows running on Intel/AMD 64 bits CPUs, compiled with Visual C++.
- `x86_64-pc-windows-gnu` - Windows running on Intel/AMD 64 bits CPUs, compiled with GCC.
- `x86_64-unknown-linux-gnu` - Linux running on Intel/AMD 64 bits CPUs, compiled with GCC.
- `x86_64-unknown-linux-musl` - Linux running on Intel/AMD 64 bits CPUs, compiled with Musl.
- `x86_64-unknown-freebsd` - FreeBSD running on Intel/AMD 64 bits.

This means that the package is going to work without the need to compile it from source.
This currently **only works for Hex releases**. For more information on how it works, please
check the [RustlerPrecompiled project](https://hexdocs.pm/rustler_precompiled).

### Features disabled

Some of the features cannot be compiled for some targets, because one of the dependencies
doesn't work on them.

This is the case for the **NDJSON** reads and writes, which don't work for the RISCV target.
We also disable the AWS S3 reads and writes for the RISCV target, because one of the dependencies
of `ObjectStore` does not compile on it.

## Sponsors

<a href="https://amplified.ai"><img src="sponsors/amplified.png" width=100 alt="Amplified"></a>
4 changes: 2 additions & 2 deletions lib/explorer/backend/data_frame.ex
Original file line number Diff line number Diff line change
Expand Up @@ -43,7 +43,7 @@ defmodule Explorer.Backend.DataFrame do
entry :: fs_entry(),
dtypes,
delimiter :: String.t(),
null_character :: String.t(),
nil_values :: list(String.t()),
skip_rows :: integer(),
header? :: boolean(),
encoding :: String.t(),
Expand All @@ -61,7 +61,7 @@ defmodule Explorer.Backend.DataFrame do
contents :: String.t(),
dtypes,
delimiter :: String.t(),
null_character :: String.t(),
nil_values :: list(String.t()),
skip_rows :: integer(),
header? :: boolean(),
encoding :: String.t(),
Expand Down
10 changes: 10 additions & 0 deletions lib/explorer/backend/lazy_series.ex
Original file line number Diff line number Diff line change
Expand Up @@ -68,6 +68,7 @@ defmodule Explorer.Backend.LazySeries do
cumulative_product: 2,
window_max: 5,
window_mean: 5,
window_median: 5,
window_min: 5,
window_sum: 5,
window_standard_deviation: 5,
Expand Down Expand Up @@ -116,6 +117,7 @@ defmodule Explorer.Backend.LazySeries do
trim: 2,
upcase: 1,
downcase: 1,
substring: 3,
# Float round
round: 2,
floor: 1,
Expand Down Expand Up @@ -155,6 +157,7 @@ defmodule Explorer.Backend.LazySeries do
@window_fun_operations [
:window_max,
:window_mean,
:window_median,
:window_min,
:window_sum
]
Expand Down Expand Up @@ -890,6 +893,13 @@ defmodule Explorer.Backend.LazySeries do
Backend.Series.new(data, :string)
end

@impl true
def substring(series, offset, length) do
data = new(:substring, [lazy_series!(series), offset, length])

Backend.Series.new(data, :string)
end

@impl true
def round(series, decimals) when is_integer(decimals) and decimals >= 0 do
data = new(:round, [lazy_series!(series), decimals])
Expand Down
8 changes: 8 additions & 0 deletions lib/explorer/backend/series.ex
Original file line number Diff line number Diff line change
Expand Up @@ -199,6 +199,13 @@ defmodule Explorer.Backend.Series do
min_periods :: integer() | nil,
center :: boolean()
) :: s
@callback window_median(
s,
window_size :: integer(),
weights :: [float()] | nil,
min_periods :: integer() | nil,
center :: boolean()
) :: s
@callback window_standard_deviation(
s,
window_size :: integer(),
Expand Down Expand Up @@ -240,6 +247,7 @@ defmodule Explorer.Backend.Series do
@callback trim(s, String.t() | nil) :: s
@callback trim_leading(s, String.t() | nil) :: s
@callback trim_trailing(s, String.t() | nil) :: s
@callback substring(s, integer(), non_neg_integer() | nil) :: s

# Date / DateTime

Expand Down
14 changes: 7 additions & 7 deletions lib/explorer/data_frame.ex
Original file line number Diff line number Diff line change
Expand Up @@ -516,7 +516,7 @@ defmodule Explorer.DataFrame do
* `:max_rows` - Maximum number of lines to read. (default: `nil`)
* `:null_character` - The string that should be interpreted as a nil value. (default: `"NA"`)
* `:nil_values` - A list of strings that should be interpreted as nil values. (default: `[]`)
* `:skip_rows` - The number of lines to skip at the beginning of the file. (default: `0`)
Expand Down Expand Up @@ -553,7 +553,7 @@ defmodule Explorer.DataFrame do
encoding: "utf8",
header: true,
max_rows: nil,
null_character: "NA",
nil_values: [],
skip_rows: 0,
columns: nil,
infer_schema_length: @default_infer_schema_length,
Expand All @@ -568,7 +568,7 @@ defmodule Explorer.DataFrame do
entry,
check_dtypes!(opts[:dtypes]),
opts[:delimiter],
opts[:null_character],
opts[:nil_values],
opts[:skip_rows],
opts[:header],
opts[:encoding],
Expand Down Expand Up @@ -611,7 +611,7 @@ defmodule Explorer.DataFrame do
imputed from the first 1000 rows. (default: `[]`)
* `:header` - Does the file have a header of column names as the first row or not? (default: `true`)
* `:max_rows` - Maximum number of lines to read. (default: `nil`)
* `:null_character` - The string that should be interpreted as a nil value. (default: `"NA"`)
* `:nil_values` - A list of strings that should be interpreted as nil values. (default: `[]`)
* `:skip_rows` - The number of lines to skip at the beginning of the file. (default: `0`)
* `:columns` - A list of column names or indexes to keep. If present, only these columns are read into the dataframe. (default: `nil`)
* `:infer_schema_length` Maximum number of rows read for schema inference. Setting this to nil will do a full table scan and will be slow (default: `1000`).
Expand All @@ -633,7 +633,7 @@ defmodule Explorer.DataFrame do
encoding: "utf8",
header: true,
max_rows: nil,
null_character: "NA",
nil_values: [],
skip_rows: 0,
columns: nil,
infer_schema_length: @default_infer_schema_length,
Expand All @@ -647,7 +647,7 @@ defmodule Explorer.DataFrame do
contents,
check_dtypes!(opts[:dtypes]),
opts[:delimiter],
opts[:null_character],
opts[:nil_values],
opts[:skip_rows],
opts[:header],
opts[:encoding],
Expand Down Expand Up @@ -785,7 +785,7 @@ defmodule Explorer.DataFrame do
@doc type: :io
@spec to_parquet(df :: DataFrame.t(), filename :: String.t() | fs_entry(), opts :: Keyword.t()) ::
:ok | {:error, term()}
def to_parquet(df, filename, opts \\ []) do
def to_parquet(%DataFrame{} = df, filename, opts \\ []) do
opts = Keyword.validate!(opts, compression: nil, streaming: true, config: nil)
compression = parquet_compression(opts[:compression])

Expand Down
63 changes: 39 additions & 24 deletions lib/explorer/polars_backend/data_frame.ex
Original file line number Diff line number Diff line change
Expand Up @@ -56,7 +56,7 @@ defmodule Explorer.PolarsBackend.DataFrame do
%Local.Entry{} = entry,
dtypes,
<<delimiter::utf8>>,
null_character,
nil_values,
skip_rows,
header?,
encoding,
Expand Down Expand Up @@ -91,7 +91,7 @@ defmodule Explorer.PolarsBackend.DataFrame do
columns,
dtypes,
encoding,
null_character,
nil_values,
parse_dates,
char_byte(eol_delimiter)
)
Expand Down Expand Up @@ -130,14 +130,10 @@ defmodule Explorer.PolarsBackend.DataFrame do
end

@impl true
def to_csv(_df, %S3.Entry{}, _header?, _delimiter) do
raise "S3 is not supported yet"
end

def to_csv_writer_sample(%DataFrame{data: df}, path, header?, delimiter) do
def to_csv(%DataFrame{data: df}, %S3.Entry{} = entry, header?, delimiter) do
<<delimiter::utf8>> = delimiter

case Native.df_to_csv_writer_sample(df, path, header?, delimiter) do
case Native.df_to_csv_cloud(df, entry, header?, delimiter) do
{:ok, _} -> :ok
{:error, error} -> {:error, error}
end
Expand All @@ -153,7 +149,7 @@ defmodule Explorer.PolarsBackend.DataFrame do
contents,
dtypes,
<<delimiter::utf8>>,
null_character,
nil_values,
skip_rows,
header?,
encoding,
Expand Down Expand Up @@ -188,7 +184,7 @@ defmodule Explorer.PolarsBackend.DataFrame do
columns,
dtypes,
encoding,
null_character,
nil_values,
parse_dates,
char_byte(eol_delimiter)
)
Expand Down Expand Up @@ -221,8 +217,11 @@ defmodule Explorer.PolarsBackend.DataFrame do
end
end

def to_ndjson(_, %S3.Entry{}) do
raise "S3 is not supported yet"
@impl true
def to_ndjson(%DataFrame{data: df}, %S3.Entry{} = entry) do
with {:ok, _} <- Native.df_to_ndjson_cloud(df, entry) do
:ok
end
end

@impl true
Expand Down Expand Up @@ -280,12 +279,19 @@ defmodule Explorer.PolarsBackend.DataFrame do

@impl true
def to_parquet(
_df,
%S3.Entry{},
_compression,
%DataFrame{data: df},
%S3.Entry{} = entry,
{compression, compression_level},
_streaming
) do
raise "S3 is not supported yet"
case Native.df_to_parquet_cloud(
df,
entry,
parquet_compression(compression, compression_level)
) do
{:ok, _} -> :ok
{:error, error} -> {:error, error}
end
end

@impl true
Expand Down Expand Up @@ -326,20 +332,23 @@ defmodule Explorer.PolarsBackend.DataFrame do

@impl true
def to_ipc(%DataFrame{data: df}, %Local.Entry{} = entry, {compression, _level}, _streaming) do
case Native.df_to_ipc(df, entry.path, Atom.to_string(compression)) do
case Native.df_to_ipc(df, entry.path, maybe_atom_to_string(compression)) do
{:ok, _} -> :ok
{:error, error} -> {:error, error}
end
end

@impl true
def to_ipc(_df, %S3.Entry{}, _, _) do
raise "S3 is not supported yet"
def to_ipc(%DataFrame{data: df}, %S3.Entry{} = entry, {compression, _level}, _streaming) do
case Native.df_to_ipc_cloud(df, entry, maybe_atom_to_string(compression)) do
{:ok, _} -> :ok
{:error, error} -> {:error, error}
end
end

@impl true
def dump_ipc(%DataFrame{data: df}, {compression, _level}) do
Native.df_dump_ipc(df, Atom.to_string(compression))
Native.df_dump_ipc(df, maybe_atom_to_string(compression))
end

@impl true
Expand Down Expand Up @@ -369,20 +378,23 @@ defmodule Explorer.PolarsBackend.DataFrame do

@impl true
def to_ipc_stream(%DataFrame{data: df}, %Local.Entry{} = entry, {compression, _level}) do
case Native.df_to_ipc_stream(df, entry.path, Atom.to_string(compression)) do
case Native.df_to_ipc_stream(df, entry.path, maybe_atom_to_string(compression)) do
{:ok, _} -> :ok
{:error, error} -> {:error, error}
end
end

@impl true
def to_ipc_stream(_df, %S3.Entry{}, _compression) do
raise "S3 is not supported yet"
def to_ipc_stream(%DataFrame{data: df}, %S3.Entry{} = entry, {compression, _level}) do
case Native.df_to_ipc_stream_cloud(df, entry, maybe_atom_to_string(compression)) do
{:ok, _} -> :ok
{:error, error} -> {:error, error}
end
end

@impl true
def dump_ipc_stream(%DataFrame{data: df}, {compression, _level}) do
Native.df_dump_ipc_stream(df, Atom.to_string(compression))
Native.df_dump_ipc_stream(df, maybe_atom_to_string(compression))
end

@impl true
Expand All @@ -395,6 +407,9 @@ defmodule Explorer.PolarsBackend.DataFrame do
end
end

defp maybe_atom_to_string(nil), do: nil
defp maybe_atom_to_string(atom) when is_atom(atom), do: Atom.to_string(atom)

# Conversion

@impl true
Expand Down
4 changes: 3 additions & 1 deletion lib/explorer/polars_backend/expression.ex
Original file line number Diff line number Diff line change
Expand Up @@ -109,6 +109,7 @@ defmodule Explorer.PolarsBackend.Expression do
cumulative_product: 2,
window_max: 5,
window_mean: 5,
window_median: 5,
window_min: 5,
window_sum: 5,
window_standard_deviation: 5,
Expand All @@ -124,7 +125,8 @@ defmodule Explorer.PolarsBackend.Expression do
trim_leading: 2,
trim_trailing: 2,
downcase: 1,
upcase: 1
upcase: 1,
substring: 3
]

@custom_expressions [
Expand Down
Loading

0 comments on commit 8445d93

Please sign in to comment.