From dd10c6e34d5ca9ef37a436e693127ac02a37fa6b Mon Sep 17 00:00:00 2001 From: William Lanchantin Date: Thu, 14 Nov 2024 15:43:46 -0500 Subject: [PATCH 1/4] Restrict decimal generation Before we were generating values that the Rust side couldn't handle. --- test/support/generator.ex | 19 ++++++++++++++----- 1 file changed, 14 insertions(+), 5 deletions(-) diff --git a/test/support/generator.ex b/test/support/generator.ex index 881d55d45..172fdfcb1 100644 --- a/test/support/generator.ex +++ b/test/support/generator.ex @@ -295,8 +295,8 @@ defmodule Explorer.Generator do date: constant(:date), datetime: tuple({constant(:datetime), time_unit(), constant("Etc/UTC")}), decimal: - bind(integer(0..37), fn scale -> - bind(integer((scale + 1)..38), fn precision -> + bind(integer(0..19), fn scale -> + bind(integer((scale + 1)..20), fn precision -> tuple({constant(:decimal), constant(precision), constant(scale)}) end) end), @@ -397,14 +397,23 @@ defmodule Explorer.Generator do |> map(&elem(&1, 1)) end + @max_u64 2 ** 64 - 1 @spec datetime(pos_integer(), pos_integer()) :: gen(Decimal.t()) defp decimal(precision, scale) do + # Smallest integer with `(scale + 1)` digits. + min_scale = 10 ** scale + # Largest integer with `precision` digits. + max_precision = 10 ** precision - 1 + tuple({ one_of([constant("-"), constant("+")]), - string(?0..?9, min_length: precision - scale, max_length: precision - scale), - string(?0..?9, min_length: scale, max_length: scale) + integer(min_scale..min(max_precision, @max_u64)) }) - |> map(fn {sign, integer_part, fractional_part} -> + |> map(fn {sign, coef} -> + # By construction, we are guaranteed that precision > scale. + {integer_part, fractional_part} = + coef |> Integer.to_string() |> String.split_at(precision - scale) + Decimal.new(sign <> integer_part <> "." <> fractional_part) end) end From 489acab3661041782d7f76833367815ec5bc4d18 Mon Sep 17 00:00:00 2001 From: William Lanchantin Date: Thu, 14 Nov 2024 15:44:56 -0500 Subject: [PATCH 2/4] Add a decode failure hint Sometimes this decode failure happens because Elixir Decimals can hold a wider range of values than Rust (Arrow) Decimals. Here, we're giving a hint about why the decode may have failed. --- native/explorer/src/series/from_list.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/native/explorer/src/series/from_list.rs b/native/explorer/src/series/from_list.rs index 94c631f69..7df4c0672 100644 --- a/native/explorer/src/series/from_list.rs +++ b/native/explorer/src/series/from_list.rs @@ -242,7 +242,7 @@ pub fn s_from_list_decimal( .map(|ex_decimal| AnyValue::Decimal(ex_decimal.signed_coef(), ex_decimal.scale())) .map_err(|error| { ExplorerError::Other(format!( - "cannot decode a valid decimal from term. error: {error:?}" + "cannot decode a valid decimal from term; check that `coef` fits into an `i128`. error: {error:?}" )) }), TermType::Atom => Ok(AnyValue::Null), From 92e8a9f440c9130e00d23d2b4a77950f9a409609 Mon Sep 17 00:00:00 2001 From: William Lanchantin Date: Thu, 14 Nov 2024 15:47:03 -0500 Subject: [PATCH 3/4] Fix `{:list, {:decimal, ...}}` for empty lists The bug turned out to be that in the presence of empty lists, we weren't casting when we should have been. --- native/explorer/src/series/from_list.rs | 17 +++++++++++++---- test/explorer/series_test.exs | 4 ++++ 2 files changed, 17 insertions(+), 4 deletions(-) diff --git a/native/explorer/src/series/from_list.rs b/native/explorer/src/series/from_list.rs index 7df4c0672..f1287ddbf 100644 --- a/native/explorer/src/series/from_list.rs +++ b/native/explorer/src/series/from_list.rs @@ -264,13 +264,22 @@ pub fn s_from_list_decimal( let mut series = Series::from_any_values(name.into(), &values, true)?; - if let DataType::Decimal(result_precision, result_scale) = series.dtype() { - let p: Option = Some(precision.unwrap_or(result_precision.unwrap_or(38))); - let s: Option = Some(scale.unwrap_or(result_scale.unwrap_or(0))); + match series.dtype() { + DataType::Decimal(result_precision, result_scale) => { + let p: Option = Some(precision.unwrap_or(result_precision.unwrap_or(38))); + let s: Option = Some(scale.unwrap_or(result_scale.unwrap_or(0))); - if *result_precision != p || *result_scale != s { + if *result_precision != p || *result_scale != s { + series = series.cast(&DataType::Decimal(p, s))?; + } + } + // An empty list will result in the `Null` dtype. + DataType::Null => { + let p = Some(precision.unwrap_or(38)); + let s = Some(scale.unwrap_or(0)); series = series.cast(&DataType::Decimal(p, s))?; } + other_dtype => panic!("expected dtype to be Decimal. found: {other_dtype:?}"), } Ok(ExSeries::new(series)) diff --git a/test/explorer/series_test.exs b/test/explorer/series_test.exs index 0eef59ba5..df750af44 100644 --- a/test/explorer/series_test.exs +++ b/test/explorer/series_test.exs @@ -503,6 +503,10 @@ defmodule Explorer.SeriesTest do assert Series.dtype(s) == {:decimal, 38, 5} end + test "{:list, {:decimal, _, _}} works with empty lists" do + Series.from_list([[Decimal.new("3.21")], []], dtype: {:list, {:decimal, 3, 2}}) + end + test "mixing dates and integers with `:date` dtype" do s = Series.from_list([1, nil, ~D[2024-06-13]], dtype: :date) From ae566d14fe0e6c69e9d2f09eca7acb3990359aa9 Mon Sep 17 00:00:00 2001 From: William Lanchantin Date: Thu, 14 Nov 2024 15:49:09 -0500 Subject: [PATCH 4/4] Remove decimal-related TODOs After restricting the decimal generator such that it only produced valid Decimals and fixing the empty list bug, it now seems like it's safe to include decimals in the property tests. --- test/explorer/data_frame_test.exs | 24 ++++++------------------ 1 file changed, 6 insertions(+), 18 deletions(-) diff --git a/test/explorer/data_frame_test.exs b/test/explorer/data_frame_test.exs index d6360bebe..0b3fba4f0 100644 --- a/test/explorer/data_frame_test.exs +++ b/test/explorer/data_frame_test.exs @@ -4730,9 +4730,7 @@ defmodule Explorer.DataFrameTest do describe "properties" do property "should be able to create a DataFrame from valid rows" do check all( - # TODO: remove `exclude: :decimal` once we fix whatever bug(s) - # this is finding. - dtypes <- Explorer.Generator.dtypes(exclude: :decimal), + dtypes <- Explorer.Generator.dtypes(), rows <- Explorer.Generator.rows(dtypes), max_runs: 1_000 ) do @@ -4742,9 +4740,7 @@ defmodule Explorer.DataFrameTest do property "should be able to create a DataFrame from valid columns" do check all( - # TODO: remove `exclude: :decimal` once we fix whatever bug(s) - # this is finding. - dtypes <- Explorer.Generator.dtypes(exclude: :decimal), + dtypes <- Explorer.Generator.dtypes(), cols <- Explorer.Generator.columns(dtypes), max_runs: 1_000 ) do @@ -4770,9 +4766,7 @@ defmodule Explorer.DataFrameTest do @tag :skip property "can dump any DataFrame (without duration) to CSV" do check all( - # TODO: remove `:decimal` once we fix whatever bug(s) this is - # finding. - dtypes <- Explorer.Generator.dtypes(exclude: [:decimal, :duration]), + dtypes <- Explorer.Generator.dtypes(exclude: :duration), rows <- Explorer.Generator.rows(dtypes), max_runs: 1_000 ) do @@ -4785,9 +4779,7 @@ defmodule Explorer.DataFrameTest do @tag :skip property "can dump any DataFrame to IPC" do check all( - # TODO: remove `exclude: :decimal` once we fix whatever bug(s) - # this is finding. - dtypes <- Explorer.Generator.dtypes(exclude: :decimal), + dtypes <- Explorer.Generator.dtypes(), rows <- Explorer.Generator.rows(dtypes), max_runs: 1_000 ) do @@ -4800,9 +4792,7 @@ defmodule Explorer.DataFrameTest do @tag :skip property "can dump any DataFrame to NDJSON" do check all( - # TODO: remove `exclude: :decimal` once we fix whatever bug(s) - # this is finding. - dtypes <- Explorer.Generator.dtypes(exclude: :decimal), + dtypes <- Explorer.Generator.dtypes(), rows <- Explorer.Generator.rows(dtypes), max_runs: 1_000 ) do @@ -4815,9 +4805,7 @@ defmodule Explorer.DataFrameTest do @tag :skip property "can dump any DataFrame to PARQUET" do check all( - # TODO: remove `exclude: :decimal` once we fix whatever bug(s) - # this is finding. - dtypes <- Explorer.Generator.dtypes(exclude: :decimal), + dtypes <- Explorer.Generator.dtypes(), rows <- Explorer.Generator.rows(dtypes), max_runs: 1_000 ) do