diff --git a/cpp/src/arrow/array/array_union_test.cc b/cpp/src/arrow/array/array_union_test.cc index 545425c264619..77ba2477791bb 100644 --- a/cpp/src/arrow/array/array_union_test.cc +++ b/cpp/src/arrow/array/array_union_test.cc @@ -166,6 +166,36 @@ TEST(TestSparseUnionArray, Validate) { ASSERT_RAISES(Invalid, arr->ValidateFull()); } +TEST(TestSparseUnionArray, Comparison) { + auto ints1 = ArrayFromJSON(int32(), "[1, 2, 3, 4, 5, 6]"); + auto ints2 = ArrayFromJSON(int32(), "[1, 2, -3, 4, -5, 6]"); + auto strs1 = ArrayFromJSON(utf8(), R"(["a", "b", "c", "d", "e", "f"])"); + auto strs2 = ArrayFromJSON(utf8(), R"(["a", "*", "c", "d", "e", "*"])"); + std::vector type_codes{8, 42}; + + auto check_equality = [&](const std::string& type_ids_json1, + const std::string& type_ids_json2, bool expected_equals) { + auto type_ids1 = ArrayFromJSON(int8(), type_ids_json1); + auto type_ids2 = ArrayFromJSON(int8(), type_ids_json2); + ASSERT_OK_AND_ASSIGN(auto arr1, + SparseUnionArray::Make(*type_ids1, {ints1, strs1}, type_codes)); + ASSERT_OK_AND_ASSIGN(auto arr2, + SparseUnionArray::Make(*type_ids2, {ints2, strs2}, type_codes)); + ASSERT_EQ(arr1->Equals(arr2), expected_equals); + ASSERT_EQ(arr2->Equals(arr1), expected_equals); + }; + + // Same type ids + check_equality("[8, 8, 42, 42, 42, 8]", "[8, 8, 42, 42, 42, 8]", true); + check_equality("[8, 8, 42, 42, 42, 42]", "[8, 8, 42, 42, 42, 42]", false); + check_equality("[8, 8, 8, 42, 42, 8]", "[8, 8, 8, 42, 42, 8]", false); + check_equality("[8, 42, 42, 42, 42, 8]", "[8, 42, 42, 42, 42, 8]", false); + + // Different type ids + check_equality("[42, 8, 42, 42, 42, 8]", "[8, 8, 42, 42, 42, 8]", false); + check_equality("[8, 8, 42, 42, 42, 8]", "[8, 8, 42, 42, 42, 42]", false); +} + // ------------------------------------------------------------------------- // Tests for MakeDense and MakeSparse diff --git a/cpp/src/arrow/compare.cc b/cpp/src/arrow/compare.cc index 23a921cc5a0a4..e0e6d183393a7 100644 --- a/cpp/src/arrow/compare.cc +++ b/cpp/src/arrow/compare.cc @@ -381,21 +381,49 @@ class RangeDataEqualsImpl { const int8_t* right_codes = right_.GetValues(1); // Unions don't have a null bitmap + int64_t run_start = 0; // Start index of the current run + for (int64_t i = 0; i < range_length_; ++i) { - const auto type_id = left_codes[left_start_idx_ + i]; - if (type_id != right_codes[right_start_idx_ + i]) { + const auto current_type_id = left_codes[left_start_idx_ + i]; + + if (current_type_id != right_codes[right_start_idx_ + i]) { result_ = false; break; } - const auto child_num = child_ids[type_id]; - // XXX can we instead detect runs of same-child union values? + // Check if the current element breaks the run + if (i > 0 && current_type_id != left_codes[left_start_idx_ + i - 1]) { + // Compare the previous run + const auto previous_child_num = child_ids[left_codes[left_start_idx_ + i - 1]]; + int64_t run_length = i - run_start; + + RangeDataEqualsImpl impl( + options_, floating_approximate_, *left_.child_data[previous_child_num], + *right_.child_data[previous_child_num], + left_start_idx_ + left_.offset + run_start, + right_start_idx_ + right_.offset + run_start, run_length); + + if (!impl.Compare()) { + result_ = false; + break; + } + + // Start a new run + run_start = i; + } + } + + // Handle the final run + if (result_) { + const auto final_child_num = child_ids[left_codes[left_start_idx_ + run_start]]; + int64_t final_run_length = range_length_ - run_start; + RangeDataEqualsImpl impl( - options_, floating_approximate_, *left_.child_data[child_num], - *right_.child_data[child_num], left_start_idx_ + left_.offset + i, - right_start_idx_ + right_.offset + i, 1); + options_, floating_approximate_, *left_.child_data[final_child_num], + *right_.child_data[final_child_num], left_start_idx_ + left_.offset + run_start, + right_start_idx_ + right_.offset + run_start, final_run_length); + if (!impl.Compare()) { result_ = false; - break; } } return Status::OK();