diff --git a/.github/actions/install-deps/action.yaml b/.github/actions/install-deps/action.yaml
index f45a21e71c..a82826f69b 100644
--- a/.github/actions/install-deps/action.yaml
+++ b/.github/actions/install-deps/action.yaml
@@ -113,7 +113,7 @@ runs:
       id: vcpkg-step
       shell: pwsh
       run: |
-        vcpkg.exe install boost-thread boost-algorithm boost-filesystem boost-multi-index boost-multiprecision boost-program-options boost-system boost-unordered boost-uuid
+        vcpkg.exe install boost-headers boost-thread boost-algorithm boost-filesystem boost-multi-index boost-multiprecision boost-program-options boost-system boost-unordered boost-uuid
         vcpkg.exe integrate install
         echo "VCPKG_INSTALLATION_ROOT=${env:VCPKG_INSTALLATION_ROOT}"
         echo "VCPKG_INSTALLATION_ROOT=${env:VCPKG_INSTALLATION_ROOT}" >> $env:GITHUB_OUTPUT
diff --git a/cmake/arrow.txt.in b/cmake/arrow.txt.in
index f00fc68624..faba182b5b 100644
--- a/cmake/arrow.txt.in
+++ b/cmake/arrow.txt.in
@@ -10,7 +10,7 @@ project(arrow-download NONE)
 include(ExternalProject)
 ExternalProject_Add(apachearrow
     GIT_REPOSITORY https://github.com/apache/arrow.git
-    GIT_TAG apache-arrow-12.0.0
+    GIT_TAG apache-arrow-17.0.0
     SOURCE_DIR "${CMAKE_BINARY_DIR}/arrow-src"
     BINARY_DIR "${CMAKE_BINARY_DIR}/arrow-build"
     CONFIGURE_COMMAND ""
diff --git a/cmake/arrow/CMakeLists.txt b/cmake/arrow/CMakeLists.txt
deleted file mode 100644
index b4ebab4534..0000000000
--- a/cmake/arrow/CMakeLists.txt
+++ /dev/null
@@ -1,308 +0,0 @@
-include(CheckCCompilerFlag)
-
-set(CMAKE_SHARED_LIBRARY_SUFFIX .so)
-
-if(${CMAKE_SYSTEM_NAME} MATCHES "Darwin")
-    check_c_compiler_flag("-arch x86_64" x86_64Supported)
-    check_c_compiler_flag("-arch arm64" arm64Supported)
-
-    if(x86_64Supported AND arm64Supported)
-        set(CMAKE_OSX_ARCHITECTURES "x86_64;arm64" CACHE STRING "Build universal architecture for OSX" FORCE)
-    elseif(x86_64Supported)
-        set(CMAKE_REQUIRED_LINK_OPTIONS "-arch;x86_64")
-        set(CMAKE_OSX_ARCHITECTURES "x86_64" CACHE STRING "Build universal architecture for OSX" FORCE)
-    elseif(arm64Supported)
-        set(CMAKE_REQUIRED_LINK_OPTIONS "-arch;arm64")
-        set(CMAKE_OSX_ARCHITECTURES "arm64" CACHE STRING "Build universal architecture for OSX" FORCE)
-    endif()
-endif()
-
-set(ARROW_SRCS
-
-    # Base
-    ${CMAKE_BINARY_DIR}/arrow-src/cpp/src/arrow/array/array_base.cc
-    ${CMAKE_BINARY_DIR}/arrow-src/cpp/src/arrow/array/array_binary.cc
-    ${CMAKE_BINARY_DIR}/arrow-src/cpp/src/arrow/array/array_decimal.cc
-    ${CMAKE_BINARY_DIR}/arrow-src/cpp/src/arrow/array/array_dict.cc
-    ${CMAKE_BINARY_DIR}/arrow-src/cpp/src/arrow/array/array_nested.cc
-    ${CMAKE_BINARY_DIR}/arrow-src/cpp/src/arrow/array/array_primitive.cc
-    ${CMAKE_BINARY_DIR}/arrow-src/cpp/src/arrow/array/array_run_end.cc
-    ${CMAKE_BINARY_DIR}/arrow-src/cpp/src/arrow/array/builder_adaptive.cc
-    ${CMAKE_BINARY_DIR}/arrow-src/cpp/src/arrow/array/builder_base.cc
-    ${CMAKE_BINARY_DIR}/arrow-src/cpp/src/arrow/array/builder_binary.cc
-    ${CMAKE_BINARY_DIR}/arrow-src/cpp/src/arrow/array/builder_decimal.cc
-    ${CMAKE_BINARY_DIR}/arrow-src/cpp/src/arrow/array/builder_dict.cc
-    ${CMAKE_BINARY_DIR}/arrow-src/cpp/src/arrow/array/builder_nested.cc
-    ${CMAKE_BINARY_DIR}/arrow-src/cpp/src/arrow/array/builder_primitive.cc
-    ${CMAKE_BINARY_DIR}/arrow-src/cpp/src/arrow/array/builder_run_end.cc
-    ${CMAKE_BINARY_DIR}/arrow-src/cpp/src/arrow/array/builder_union.cc
-    ${CMAKE_BINARY_DIR}/arrow-src/cpp/src/arrow/array/concatenate.cc
-    ${CMAKE_BINARY_DIR}/arrow-src/cpp/src/arrow/array/data.cc
-    ${CMAKE_BINARY_DIR}/arrow-src/cpp/src/arrow/array/diff.cc
-    ${CMAKE_BINARY_DIR}/arrow-src/cpp/src/arrow/array/util.cc
-    ${CMAKE_BINARY_DIR}/arrow-src/cpp/src/arrow/array/validate.cc
-    ${CMAKE_BINARY_DIR}/arrow-src/cpp/src/arrow/builder.cc
-    ${CMAKE_BINARY_DIR}/arrow-src/cpp/src/arrow/buffer.cc
-    ${CMAKE_BINARY_DIR}/arrow-src/cpp/src/arrow/chunked_array.cc
-    ${CMAKE_BINARY_DIR}/arrow-src/cpp/src/arrow/chunk_resolver.cc
-    ${CMAKE_BINARY_DIR}/arrow-src/cpp/src/arrow/compare.cc
-    ${CMAKE_BINARY_DIR}/arrow-src/cpp/src/arrow/config.cc
-    ${CMAKE_BINARY_DIR}/arrow-src/cpp/src/arrow/datum.cc
-    ${CMAKE_BINARY_DIR}/arrow-src/cpp/src/arrow/device.cc
-    ${CMAKE_BINARY_DIR}/arrow-src/cpp/src/arrow/extension_type.cc
-    ${CMAKE_BINARY_DIR}/arrow-src/cpp/src/arrow/memory_pool.cc
-    ${CMAKE_BINARY_DIR}/arrow-src/cpp/src/arrow/pretty_print.cc
-    ${CMAKE_BINARY_DIR}/arrow-src/cpp/src/arrow/record_batch.cc
-    ${CMAKE_BINARY_DIR}/arrow-src/cpp/src/arrow/result.cc
-    ${CMAKE_BINARY_DIR}/arrow-src/cpp/src/arrow/scalar.cc
-    ${CMAKE_BINARY_DIR}/arrow-src/cpp/src/arrow/sparse_tensor.cc
-    ${CMAKE_BINARY_DIR}/arrow-src/cpp/src/arrow/status.cc
-    ${CMAKE_BINARY_DIR}/arrow-src/cpp/src/arrow/table.cc
-    ${CMAKE_BINARY_DIR}/arrow-src/cpp/src/arrow/table_builder.cc
-    ${CMAKE_BINARY_DIR}/arrow-src/cpp/src/arrow/tensor.cc
-
-    # ${CMAKE_BINARY_DIR}/arrow-src/cpp/src/arrow/tensor/coo_converter.cc
-    # ${CMAKE_BINARY_DIR}/arrow-src/cpp/src/arrow/tensor/csf_converter.cc
-    # ${CMAKE_BINARY_DIR}/arrow-src/cpp/src/arrow/tensor/csx_converter.cc
-    ${CMAKE_BINARY_DIR}/arrow-src/cpp/src/arrow/type.cc
-    ${CMAKE_BINARY_DIR}/arrow-src/cpp/src/arrow/visitor.cc
-    ${CMAKE_BINARY_DIR}/arrow-src/cpp/src/arrow/c/bridge.cc
-    ${CMAKE_BINARY_DIR}/arrow-src/cpp/src/arrow/io/buffered.cc
-    ${CMAKE_BINARY_DIR}/arrow-src/cpp/src/arrow/io/caching.cc
-    ${CMAKE_BINARY_DIR}/arrow-src/cpp/src/arrow/io/compressed.cc
-
-    # ${CMAKE_BINARY_DIR}/arrow-src/cpp/src/arrow/io/file.cc
-    ${CMAKE_BINARY_DIR}/arrow-src/cpp/src/arrow/io/interfaces.cc
-    ${CMAKE_BINARY_DIR}/arrow-src/cpp/src/arrow/io/memory.cc
-    ${CMAKE_BINARY_DIR}/arrow-src/cpp/src/arrow/io/slow.cc
-    ${CMAKE_BINARY_DIR}/arrow-src/cpp/src/arrow/io/stdio.cc
-    ${CMAKE_BINARY_DIR}/arrow-src/cpp/src/arrow/io/transform.cc
-    ${CMAKE_BINARY_DIR}/arrow-src/cpp/src/arrow/util/async_util.cc
-    ${CMAKE_BINARY_DIR}/arrow-src/cpp/src/arrow/util/atfork_internal.cc
-    ${CMAKE_BINARY_DIR}/arrow-src/cpp/src/arrow/util/basic_decimal.cc
-    ${CMAKE_BINARY_DIR}/arrow-src/cpp/src/arrow/util/bit_block_counter.cc
-    ${CMAKE_BINARY_DIR}/arrow-src/cpp/src/arrow/util/bit_run_reader.cc
-    ${CMAKE_BINARY_DIR}/arrow-src/cpp/src/arrow/util/bit_util.cc
-    ${CMAKE_BINARY_DIR}/arrow-src/cpp/src/arrow/util/bitmap.cc
-    ${CMAKE_BINARY_DIR}/arrow-src/cpp/src/arrow/util/bitmap_builders.cc
-    ${CMAKE_BINARY_DIR}/arrow-src/cpp/src/arrow/util/bitmap_ops.cc
-    ${CMAKE_BINARY_DIR}/arrow-src/cpp/src/arrow/util/bpacking.cc
-    ${CMAKE_BINARY_DIR}/arrow-src/cpp/src/arrow/util/byte_size.cc
-    ${CMAKE_BINARY_DIR}/arrow-src/cpp/src/arrow/util/cancel.cc
-    ${CMAKE_BINARY_DIR}/arrow-src/cpp/src/arrow/util/compression.cc
-    ${CMAKE_BINARY_DIR}/arrow-src/cpp/src/arrow/util/compression_zstd.cc
-    ${CMAKE_BINARY_DIR}/arrow-src/cpp/src/arrow/util/compression_lz4.cc
-    ${CMAKE_BINARY_DIR}/arrow-src/cpp/src/arrow/util/counting_semaphore.cc
-    ${CMAKE_BINARY_DIR}/arrow-src/cpp/src/arrow/util/debug.cc
-    ${CMAKE_BINARY_DIR}/arrow-src/cpp/src/arrow/util/decimal.cc
-    ${CMAKE_BINARY_DIR}/arrow-src/cpp/src/arrow/util/delimiting.cc
-    ${CMAKE_BINARY_DIR}/arrow-src/cpp/src/arrow/util/formatting.cc
-    ${CMAKE_BINARY_DIR}/arrow-src/cpp/src/arrow/util/future.cc
-    ${CMAKE_BINARY_DIR}/arrow-src/cpp/src/arrow/util/int_util.cc
-    ${CMAKE_BINARY_DIR}/arrow-src/cpp/src/arrow/util/io_util.cc
-    ${CMAKE_BINARY_DIR}/arrow-src/cpp/src/arrow/util/logging.cc
-    ${CMAKE_BINARY_DIR}/arrow-src/cpp/src/arrow/util/key_value_metadata.cc
-    ${CMAKE_BINARY_DIR}/arrow-src/cpp/src/arrow/util/memory.cc
-    ${CMAKE_BINARY_DIR}/arrow-src/cpp/src/arrow/util/mutex.cc
-    ${CMAKE_BINARY_DIR}/arrow-src/cpp/src/arrow/util/ree_util.cc
-    ${CMAKE_BINARY_DIR}/arrow-src/cpp/src/arrow/util/string.cc
-    ${CMAKE_BINARY_DIR}/arrow-src/cpp/src/arrow/util/string_builder.cc
-    ${CMAKE_BINARY_DIR}/arrow-src/cpp/src/arrow/util/task_group.cc
-    ${CMAKE_BINARY_DIR}/arrow-src/cpp/src/arrow/util/tdigest.cc
-    ${CMAKE_BINARY_DIR}/arrow-src/cpp/src/arrow/util/thread_pool.cc
-    ${CMAKE_BINARY_DIR}/arrow-src/cpp/src/arrow/util/time.cc
-    ${CMAKE_BINARY_DIR}/arrow-src/cpp/src/arrow/util/tracing.cc
-    ${CMAKE_BINARY_DIR}/arrow-src/cpp/src/arrow/util/trie.cc
-    ${CMAKE_BINARY_DIR}/arrow-src/cpp/src/arrow/util/unreachable.cc
-    ${CMAKE_BINARY_DIR}/arrow-src/cpp/src/arrow/util/uri.cc
-    ${CMAKE_BINARY_DIR}/arrow-src/cpp/src/arrow/util/utf8.cc
-    ${CMAKE_BINARY_DIR}/arrow-src/cpp/src/arrow/util/value_parsing.cc
-    ${CMAKE_BINARY_DIR}/arrow-src/cpp/src/arrow/vendored/base64.cpp
-    ${CMAKE_BINARY_DIR}/arrow-src/cpp/src/arrow/vendored/datetime/tz.cpp
-    ${CMAKE_BINARY_DIR}/arrow-src/cpp/src/arrow/vendored/double-conversion/bignum.cc
-    ${CMAKE_BINARY_DIR}/arrow-src/cpp/src/arrow/vendored/double-conversion/double-conversion.cc
-    ${CMAKE_BINARY_DIR}/arrow-src/cpp/src/arrow/vendored/double-conversion/bignum-dtoa.cc
-    ${CMAKE_BINARY_DIR}/arrow-src/cpp/src/arrow/vendored/double-conversion/fast-dtoa.cc
-    ${CMAKE_BINARY_DIR}/arrow-src/cpp/src/arrow/vendored/double-conversion/cached-powers.cc
-    ${CMAKE_BINARY_DIR}/arrow-src/cpp/src/arrow/vendored/double-conversion/fixed-dtoa.cc
-    ${CMAKE_BINARY_DIR}/arrow-src/cpp/src/arrow/vendored/double-conversion/diy-fp.cc
-    ${CMAKE_BINARY_DIR}/arrow-src/cpp/src/arrow/vendored/double-conversion/strtod.cc
-
-    # CSV
-    ${CMAKE_BINARY_DIR}/arrow-src/cpp/src/arrow/csv/converter.cc
-    ${CMAKE_BINARY_DIR}/arrow-src/cpp/src/arrow/csv/chunker.cc
-    ${CMAKE_BINARY_DIR}/arrow-src/cpp/src/arrow/csv/column_builder.cc
-    ${CMAKE_BINARY_DIR}/arrow-src/cpp/src/arrow/csv/column_decoder.cc
-    ${CMAKE_BINARY_DIR}/arrow-src/cpp/src/arrow/csv/options.cc
-    ${CMAKE_BINARY_DIR}/arrow-src/cpp/src/arrow/csv/parser.cc
-    # ${CMAKE_BINARY_DIR}/arrow-src/cpp/src/arrow/csv/reader.cc
-    ${CMAKE_BINARY_DIR}/arrow-src/cpp/src/arrow/csv/writer.cc
-
-    # IPC
-    ${CMAKE_BINARY_DIR}/arrow-src/cpp/src/arrow/ipc/dictionary.cc
-    ${CMAKE_BINARY_DIR}/arrow-src/cpp/src/arrow/ipc/feather.cc
-    ${CMAKE_BINARY_DIR}/arrow-src/cpp/src/arrow/ipc/message.cc
-    ${CMAKE_BINARY_DIR}/arrow-src/cpp/src/arrow/ipc/metadata_internal.cc
-    ${CMAKE_BINARY_DIR}/arrow-src/cpp/src/arrow/ipc/options.cc
-    ${CMAKE_BINARY_DIR}/arrow-src/cpp/src/arrow/ipc/writer.cc
-
-    # Compute
-    ${CMAKE_BINARY_DIR}/arrow-src/cpp/src/arrow/compute/api_aggregate.cc
-    ${CMAKE_BINARY_DIR}/arrow-src/cpp/src/arrow/compute/api_scalar.cc
-    ${CMAKE_BINARY_DIR}/arrow-src/cpp/src/arrow/compute/api_vector.cc
-    ${CMAKE_BINARY_DIR}/arrow-src/cpp/src/arrow/compute/cast.cc
-    ${CMAKE_BINARY_DIR}/arrow-src/cpp/src/arrow/compute/exec.cc
-    ${CMAKE_BINARY_DIR}/arrow-src/cpp/src/arrow/compute/expression.cc
-    ${CMAKE_BINARY_DIR}/arrow-src/cpp/src/arrow/compute/function.cc
-    ${CMAKE_BINARY_DIR}/arrow-src/cpp/src/arrow/compute/function_internal.cc
-    ${CMAKE_BINARY_DIR}/arrow-src/cpp/src/arrow/compute/kernel.cc
-    ${CMAKE_BINARY_DIR}/arrow-src/cpp/src/arrow/compute/ordering.cc
-
-
-    # ${CMAKE_BINARY_DIR}/arrow-src/cpp/src/arrow/compute/registry.cc
-    # ${CMAKE_BINARY_DIR}/arrow-src/cpp/src/arrow/compute/kernels/aggregate_basic.cc
-    # ${CMAKE_BINARY_DIR}/arrow-src/cpp/src/arrow/compute/kernels/aggregate_mode.cc
-    # ${CMAKE_BINARY_DIR}/arrow-src/cpp/src/arrow/compute/kernels/aggregate_quantile.cc
-    # ${CMAKE_BINARY_DIR}/arrow-src/cpp/src/arrow/compute/kernels/aggregate_tdigest.cc
-    # ${CMAKE_BINARY_DIR}/arrow-src/cpp/src/arrow/compute/kernels/aggregate_var_std.cc
-    ${CMAKE_BINARY_DIR}/arrow-src/cpp/src/arrow/compute/kernels/codegen_internal.cc
-
-    # ${CMAKE_BINARY_DIR}/arrow-src/cpp/src/arrow/compute/kernels/hash_aggregate.cc
-    # ${CMAKE_BINARY_DIR}/arrow-src/cpp/src/arrow/compute/kernels/scalar_arithmetic.cc
-    # ${CMAKE_BINARY_DIR}/arrow-src/cpp/src/arrow/compute/kernels/scalar_boolean.cc
-    ${CMAKE_BINARY_DIR}/arrow-src/cpp/src/arrow/compute/kernels/scalar_cast_boolean.cc
-    ${CMAKE_BINARY_DIR}/arrow-src/cpp/src/arrow/compute/kernels/scalar_cast_dictionary.cc
-    ${CMAKE_BINARY_DIR}/arrow-src/cpp/src/arrow/compute/kernels/scalar_cast_extension.cc
-    ${CMAKE_BINARY_DIR}/arrow-src/cpp/src/arrow/compute/kernels/scalar_cast_internal.cc
-    ${CMAKE_BINARY_DIR}/arrow-src/cpp/src/arrow/compute/kernels/scalar_cast_nested.cc
-    ${CMAKE_BINARY_DIR}/arrow-src/cpp/src/arrow/compute/kernels/scalar_cast_numeric.cc
-    ${CMAKE_BINARY_DIR}/arrow-src/cpp/src/arrow/compute/kernels/scalar_cast_string.cc
-    ${CMAKE_BINARY_DIR}/arrow-src/cpp/src/arrow/compute/kernels/scalar_cast_temporal.cc
-
-    # ${CMAKE_BINARY_DIR}/arrow-src/cpp/src/arrow/compute/kernels/scalar_compare.cc
-    # ${CMAKE_BINARY_DIR}/arrow-src/cpp/src/arrow/compute/kernels/scalar_nested.cc
-    # ${CMAKE_BINARY_DIR}/arrow-src/cpp/src/arrow/compute/kernels/scalar_set_lookup.cc
-    # ${CMAKE_BINARY_DIR}/arrow-src/cpp/src/arrow/compute/kernels/scalar_string.cc
-    # ${CMAKE_BINARY_DIR}/arrow-src/cpp/src/arrow/compute/kernels/scalar_temporal_binary.cc
-    # ${CMAKE_BINARY_DIR}/arrow-src/cpp/src/arrow/compute/kernels/scalar_temporal_unary.cc
-    # ${CMAKE_BINARY_DIR}/arrow-src/cpp/src/arrow/compute/kernels/scalar_validity.cc
-    # ${CMAKE_BINARY_DIR}/arrow-src/cpp/src/arrow/compute/kernels/scalar_if_else.cc
-    ${CMAKE_BINARY_DIR}/arrow-src/cpp/src/arrow/compute/kernels/util_internal.cc
-
-    # ${CMAKE_BINARY_DIR}/arrow-src/cpp/src/arrow/compute/kernels/vector_array_sort.cc
-    # ${CMAKE_BINARY_DIR}/arrow-src/cpp/src/arrow/compute/kernels/vector_hash.cc
-    # ${CMAKE_BINARY_DIR}/arrow-src/cpp/src/arrow/compute/kernels/vector_nested.cc
-    # ${CMAKE_BINARY_DIR}/arrow-src/cpp/src/arrow/compute/kernels/vector_replace.cc
-    ${CMAKE_BINARY_DIR}/arrow-src/cpp/src/arrow/compute/kernels/vector_selection.cc
-    ${PSP_CPP_SRC}/src/cpp/vendor/arrow_compute_registry.cpp
-
-    # ${CMAKE_BINARY_DIR}/arrow-src/cpp/src/arrow/compute/kernels/vector_sort.cc
-    # ${CMAKE_BINARY_DIR}/arrow-src/cpp/src/arrow/compute/kernels/row_encoder.cc
-    # ${CMAKE_BINARY_DIR}/arrow-src/cpp/src/arrow/compute/exec/union_node.cc
-    # ${CMAKE_BINARY_DIR}/arrow-src/cpp/src/arrow/compute/exec/key_hash.cc
-    # ${CMAKE_BINARY_DIR}/arrow-src/cpp/src/arrow/compute/exec/key_map.cc
-    # ${CMAKE_BINARY_DIR}/arrow-src/cpp/src/arrow/compute/exec/key_compare.cc
-    # ${CMAKE_BINARY_DIR}/arrow-src/cpp/src/arrow/compute/exec/key_encode.cc
-    # ${CMAKE_BINARY_DIR}/arrow-src/cpp/src/arrow/compute/exec/util.cc
-    # ${CMAKE_BINARY_DIR}/arrow-src/cpp/src/arrow/compute/exec/hash_join_dict.cc
-    # ${CMAKE_BINARY_DIR}/arrow-src/cpp/src/arrow/compute/exec/hash_join.cc
-    # ${CMAKE_BINARY_DIR}/arrow-src/cpp/src/arrow/compute/exec/hash_join_node.cc
-    # ${CMAKE_BINARY_DIR}/arrow-src/cpp/src/arrow/compute/exec/task_util.cc
-)
-
-if(PSP_PYTHON_BUILD AND NOT PSP_PYODIDE)
-    set(ARROW_SRCS
-        ${ARROW_SRCS}
-        # use standard reader in Python builds.
-        ${CMAKE_BINARY_DIR}/arrow-src/cpp/src/arrow/ipc/reader.cc
-        ${CMAKE_BINARY_DIR}/arrow-src/cpp/src/arrow/csv/reader.cc
-        ${CMAKE_BINARY_DIR}/arrow-src/cpp/src/arrow/io/file.cc
-        ${CMAKE_BINARY_DIR}/arrow-src/cpp/src/arrow/tensor/coo_converter.cc
-        ${CMAKE_BINARY_DIR}/arrow-src/cpp/src/arrow/tensor/csf_converter.cc
-        ${CMAKE_BINARY_DIR}/arrow-src/cpp/src/arrow/tensor/csx_converter.cc
-        ${CMAKE_BINARY_DIR}/arrow-src/cpp/src/arrow/util/cpu_info.cc
-        ${CMAKE_BINARY_DIR}/arrow-src/cpp/src/arrow/util/union_util.cc
-
-        # ${CMAKE_BINARY_DIR}/arrow-src/cpp/src/arrow/util/time.cc
-        # ${CMAKE_BINARY_DIR}/arrow-src/cpp/src/arrow/vendored/double-conversion/bignum-dtoa.cc
-        # ${CMAKE_BINARY_DIR}/arrow-src/cpp/src/arrow/vendored/double-conversion/fast-dtoa.cc
-        # ${CMAKE_BINARY_DIR}/arrow-src/cpp/src/arrow/vendored/double-conversion/fixed-dtoa.cc
-    )
-
-    if(WIN32)
-        set(ARROW_SRCS
-            ${ARROW_SRCS}
-            ${CMAKE_BINARY_DIR}/arrow-src/cpp/src/arrow/vendored/musl/strptime.c)
-    endif()
-else()
-    set(ARROW_SRCS
-        ${ARROW_SRCS}
-        # Use our vendored reader that does not use threads.
-        ${PSP_CPP_SRC}/src/cpp/vendor/single_threaded_reader.cpp
-        ${PSP_CPP_SRC}/src/cpp/vendor/arrow_single_threaded_reader.cpp)
-endif()
-
-set_property(SOURCE util/io_util.cc
-    APPEND_STRING
-    PROPERTY COMPILE_FLAGS " -Wno-unused-macros -stdlib=libc++")
-
-# # make clean will delete the generated file
-# set_source_files_properties(${CMAKE_BINARY_DIR}/arrow-src/cpp/src/arrow/ipc/Message_generated.h PROPERTIES GENERATED TRUE)
-# set_source_files_properties(${CMAKE_BINARY_DIR}/arrow-src/cpp/src/arrow/ipc/File_generated.h PROPERTIES GENERATED TRUE)
-# set_source_files_properties(${CMAKE_BINARY_DIR}/arrow-src/cpp/src/arrow/ipc/Schema_generated.h PROPERTIES GENERATED TRUE)
-# set_source_files_properties(${CMAKE_BINARY_DIR}/arrow-src/cpp/src/arrow/ipc/Tensor_generated.h PROPERTIES GENERATED TRUE)
-# set_source_files_properties(${CMAKE_BINARY_DIR}/arrow-src/cpp/src/arrow/ipc/SparseTensor_generated.h PROPERTIES GENERATED TRUE)
-# set_source_files_properties(${CMAKE_BINARY_DIR}/arrow-src/cpp/src/arrow/ipc/feather_generated.h PROPERTIES GENERATED TRUE)
-
-# set(FBS_OUTPUT_FILES
-#     ${CMAKE_BINARY_DIR}/arrow-src/cpp/src/arrow/ipc/Message_generated.h
-#     ${CMAKE_BINARY_DIR}/arrow-src/cpp/src/arrow/ipc/File_generated.h
-#     ${CMAKE_BINARY_DIR}/arrow-src/cpp/src/arrow/ipc/Schema_generated.h
-#     ${CMAKE_BINARY_DIR}/arrow-src/cpp/src/arrow/ipc/Tensor_generated.h
-#     ${CMAKE_BINARY_DIR}/arrow-src/cpp/src/arrow/ipc/SparseTensor_generated.h
-#     ${CMAKE_BINARY_DIR}/arrow-src/cpp/src/arrow/ipc/feather_generated.h)
-
-# set(FBS_SRC
-#     ${CMAKE_BINARY_DIR}/arrow-src/format/Message.fbs
-#     ${CMAKE_BINARY_DIR}/arrow-src/format/File.fbs
-#     ${CMAKE_BINARY_DIR}/arrow-src/format/Schema.fbs
-#     ${CMAKE_BINARY_DIR}/arrow-src/format/Tensor.fbs
-#     ${CMAKE_BINARY_DIR}/arrow-src/format/SparseTensor.fbs
-#     ${CMAKE_BINARY_DIR}/arrow-src/cpp/src/arrow/ipc/feather.fbs)
-include_directories(src)
-
-# Build Arrow as a static library
-set(ARROW_BUILD_STATIC ON)
-
-add_library(arrow STATIC ${ARROW_SRCS})
-
-target_compile_definitions(arrow PUBLIC ARROW_NO_DEPRECATED_API)
-target_compile_definitions(arrow PUBLIC ARROW_STATIC)
-target_compile_definitions(arrow PUBLIC ARROW_WITH_ZSTD=ON)
-target_compile_definitions(arrow PUBLIC ARROW_WITH_LZ4)
-
-target_include_directories(arrow SYSTEM PRIVATE ${zstd_SOURCE_DIR}/lib)
-
-# will need built boost filesystem and system .lib to work, even though
-# perspective itself does not use those dependencies
-target_link_libraries(arrow
-    ${double-conversion_LIBRARIES}
-    ${Boost_FILESYSTEM_LIBRARY}
-    ${Boost_SYSTEM_LIBRARY}
-    lz4_static
-    libzstd_static
-    ${ARROW_TEST_LINK_TOOLCHAIN})
-
-# find_package(Flatbuffers)
-
-# add_custom_command(OUTPUT ${FBS_OUTPUT_FILES}
-#     COMMAND ${FLATBUFFERS_COMPILER}
-#     -c
-#     -o
-#     ${CMAKE_BINARY_DIR}/arrow-src/cpp/src/arrow/ipc/
-#     ${FBS_SRC}
-#     DEPENDS ${FLATBUFFERS_COMPILER}
-#     COMMENT "Running flatc compiler on ${FBS_SRC}"
-#     VERBATIM)
-add_custom_target(arrow_fb_files DEPENDS ${FBS_OUTPUT_FILES})
-add_dependencies(arrow arrow_fb_files)
diff --git a/cmake/arrow/config.h b/cmake/arrow/config.h
deleted file mode 100644
index 281106c2af..0000000000
--- a/cmake/arrow/config.h
+++ /dev/null
@@ -1,53 +0,0 @@
-// Licensed to the Apache Software Foundation (ASF) under one
-// or more contributor license agreements. See the NOTICE file
-// distributed with this work for additional information
-// regarding copyright ownership. The ASF licenses this file
-// to you under the Apache License, Version 2.0 (the
-// "License"); you may not use this file except in compliance
-// with the License. You may obtain a copy of the License at
-//
-// http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing,
-// software distributed under the License is distributed on an
-// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-// KIND, either express or implied. See the License for the
-// specific language governing permissions and limitations
-// under the License.
-
-#define ARROW_VERSION_MAJOR 8
-#define ARROW_VERSION_MINOR 0
-#define ARROW_VERSION_PATCH 0
-#define ARROW_VERSION ((ARROW_VERSION_MAJOR * 1000) + ARROW_VERSION_MINOR) * 1000 + ARROW_VERSION_PATCH
-
-/* #undef DOUBLE_CONVERSION_HAS_CASE_INSENSIBILITY */
-/* #undef GRPCPP_PP_INCLUDE */
-
-#define ARROW_VERSION_STRING "8000000"
-
-#define ARROW_SO_VERSION ""
-#define ARROW_FULL_SO_VERSION ""
-
-#define ARROW_CXX_COMPILER_ID ""
-#define ARROW_CXX_COMPILER_VERSION ""
-#define ARROW_CXX_COMPILER_FLAGS ""
-
-#define ARROW_GIT_ID ""
-#define ARROW_GIT_DESCRIPTION ""
-
-#define ARROW_PACKAGE_KIND ""
-
-#define ARROW_BUILD_TYPE "release"
-
-// #cmakedefine ARROW_COMPUTE
-// #cmakedefine ARROW_CSV
-// #cmakedefine ARROW_DATASET
-// #cmakedefine ARROW_FILESYSTEM
-// #cmakedefine ARROW_FLIGHT
-// #cmakedefine ARROW_IPC
-// #cmakedefine ARROW_JSON
-
-// #cmakedefine ARROW_S3
-// #cmakedefine ARROW_USE_NATIVE_INT128
-
-// #cmakedefine GRPCPP_PP_INCLUDE
\ No newline at end of file
diff --git a/cmake/flatbuffers.txt.in b/cmake/flatbuffers.txt.in
index 3ea11e17e7..a6ed57b729 100644
--- a/cmake/flatbuffers.txt.in
+++ b/cmake/flatbuffers.txt.in
@@ -5,7 +5,7 @@ project(flatbuffers-download NONE)
 include(ExternalProject)
 ExternalProject_Add(flatbuffers
     GIT_REPOSITORY https://github.com/google/flatbuffers.git
-    GIT_TAG v2.0.5
+    GIT_TAG v23.5.26
     SOURCE_DIR "${CMAKE_BINARY_DIR}/flatbuffers-src"
     BINARY_DIR "${CMAKE_BINARY_DIR}/flatbuffers-build"
     CONFIGURE_COMMAND ""
diff --git a/cmake/modules/FindFlatbuffers.cmake b/cmake/modules/FindFlatbuffers.cmake
deleted file mode 100644
index f4d306b333..0000000000
--- a/cmake/modules/FindFlatbuffers.cmake
+++ /dev/null
@@ -1,64 +0,0 @@
-cmake_minimum_required(VERSION 3.7.2)
-
-##############################################################################
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements. See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership. The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License. You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied. See the License for the
-# specific language governing permissions and limitations
-# under the License.
-##############################################################################
-
-# Find FLATBUFFERS (flatbuffers/include, libflatbuffers.a, flatc)
-# This module defines:
-#  FLATBUFFERS_INCLUDE_DIR, directory containing headers
-#  FLATBUFFERS_STATIC_LIB, path to flatbuffers's static library
-#  FLATBUFFERS_COMPILER, path to flatc compiler
-#
-# TODO: [01-15-2021] now that we use Flatbuffers on all platforms, it might be
-# a good time to figure out how we can install Flatbuffers as a dependency
-# inside our CMakeLists (we would just need to build the flatc executable
-# before our Arrow build starts). Right now, I've put in some hacks to make
-# sure our Windows build works on Azure by pre-installing flatc (like we do on
-# all other platforms), and then pulling down the headers for Windows so they
-# can be included.
-
-# this might fail
-# https://gitlab.kitware.com/cmake/cmake/issues/19120
-#
-# find_path(FLATBUFFERS_INCLUDE_DIR flatbuffers/flatbuffers.h
-#     PATHS ${FLATBUFFERS_ROOT}/include
-#     HINTS /usr/local /usr/local/flatbuffers /usr/local/Homebrew /usr ~/homebrew/ /usr/local/include /usr/local/flatbuffers/include /usr/include ~/homebrew/include /opt/homebrew/include ${CMAKE_SOURCE_DIR}/../../vcpkg/installed/x64-windows/include
-#     NO_CMAKE_SYSTEM_PATH
-#     NO_SYSTEM_ENVIRONMENT_PATH)
-
-# find_program(FLATBUFFERS_COMPILER flatc
-#     PATHS ${FLATBUFFERS_ROOT}/bin
-#     HINTS /usr/local/bin /usr/bin /usr/local/Homebrew/bin ~/homebrew/bin /opt/homebrew/bin ${CMAKE_SOURCE_DIR}/../../vcpkg/installed/x64-windows/tools/flatbuffers
-#     NO_CMAKE_SYSTEM_PATH
-#     NO_SYSTEM_ENVIRONMENT_PATH)
-
-# if(NOT ${FLATBUFFERS_INCLUDE_DIR})
-#     # HACK
-#     set(FLATBUFFERS_INCLUDE_DIR /usr/local/include)
-# endif()
-
-# include(FindPackageHandleStandardArgs)
-
-# if (WIN32)
-#     find_package_handle_standard_args(Flatbuffers REQUIRED_VARS
-#         FLATBUFFERS_INCLUDE_DIR FLATBUFFERS_COMPILER)
-# else()
-#     find_package_handle_standard_args(FLATBUFFERS REQUIRED_VARS
-#         FLATBUFFERS_INCLUDE_DIR FLATBUFFERS_COMPILER)
-# endif()
\ No newline at end of file
diff --git a/cmake/modules/FindInstallDependency.cmake b/cmake/modules/FindInstallDependency.cmake
index df883bf020..96ac83bb8f 100644
--- a/cmake/modules/FindInstallDependency.cmake
+++ b/cmake/modules/FindInstallDependency.cmake
@@ -37,13 +37,26 @@ function(psp_build_dep name cmake_file)
     endif()

     if(${name} STREQUAL arrow)
-        # Overwrite arrow's CMakeLists with our custom, minimal CMakeLists.
-        configure_file(${PSP_CMAKE_MODULE_PATH}/${name}/CMakeLists.txt ${CMAKE_BINARY_DIR}/${name}-src/cpp/ COPYONLY)
-        configure_file(${PSP_CMAKE_MODULE_PATH}/${name}/config.h ${CMAKE_BINARY_DIR}/${name}-src/cpp/src/arrow/util/ COPYONLY)
+        set(ARROW_SIMD_LEVEL "NONE")
+        set(ARROW_DEFINE_OPTIONS ON)
+        set(ARROW_RUNTIME_SIMD_LEVEL "NONE")
+        set(ARROW_BUILD_SHARED OFF)
+        set(ARROW_BUILD_STATIC ON)
+        set(ARROW_JEMALLOC OFF)
+        set(ARROW_CSV ON)
+        set(ARROW_LZ4 ON)
+        set(ARROW_WITH_ZSTD ON)
+        set(ARROW_WITH_LZ4 ON)
+        set(ARROW_ENABLE_THREADING OFF)
+        set(ARROW_NO_EXPORT ON)
+        set(ARROW_BOOST_USE_SHARED OFF)
+
+        include_directories(SYSTEM ${CMAKE_BINARY_DIR}/${name}-build/src)

         add_subdirectory(${CMAKE_BINARY_DIR}/${name}-src/cpp/ ${CMAKE_BINARY_DIR}/${name}-build EXCLUDE_FROM_ALL)
+        set_property(DIRECTORY "${CMAKE_BINARY_DIR}/${name}-src/cpp/" APPEND PROPERTY COMPILE_DEFINITIONS "-DARROW_BUILD_STATIC=ON -DARROW_BUILD_SHARED=OFF -DARROW_NO_DEPRECATED_API")

         include_directories(SYSTEM ${CMAKE_BINARY_DIR}/${name}-src/cpp/src/)
     elseif(${name} STREQUAL exprtk)
         # no cmakelists - just include the header
@@ -78,7 +91,7 @@ function(psp_build_dep name cmake_file)
     endif()

     if(NOT PSP_WASM_BUILD AND (MACOS OR NOT MANYLINUX))
-        if(${name} STREQUAL arrow OR ${name} STREQUAL flatbuffers OR ${name} STREQUAL double-conversion OR ${name} STREQUAL re2)
+        if(${name} STREQUAL flatbuffers OR ${name} STREQUAL double-conversion OR ${name} STREQUAL re2)
             target_compile_options(${name} PRIVATE -fvisibility=hidden)
         endif()
     endif()
diff --git a/cpp/perspective/CMakeLists.txt b/cpp/perspective/CMakeLists.txt
index 69783da7da..9729af74a5 100644
--- a/cpp/perspective/CMakeLists.txt
+++ b/cpp/perspective/CMakeLists.txt
@@ -303,9 +303,9 @@ elseif(PSP_CPP_BUILD OR PSP_PYTHON_BUILD)
         -fexceptions \
         -g3 \
     ")
-    if (PSP_WASM_EXCEPTIONS)
-        set(OPT_FLAGS "${OPT_FLAGS} -fwasm-exceptions ")
-    endif()
+        if (PSP_WASM_EXCEPTIONS)
+            set(OPT_FLAGS "${OPT_FLAGS} -fwasm-exceptions ")
+        endif()
     endif ()
 else()
     set(OPT_FLAGS " \
@@ -406,10 +406,18 @@ set(RAPIDJSON_BUILD_TESTS OFF CACHE BOOL "Disable rapidjson tests")
 set(CMAKE_C_FLAGS " \
     -O3 \
 ")
+if (PSP_WASM_EXCEPTIONS)
+set(CMAKE_CXX_FLAGS " -fwasm-exceptions \
+-O3 \
+-g0 \
+")
+else()
 set(CMAKE_CXX_FLAGS " \
     -O3 \
 ")
+endif()
+
 if(PSP_PYODIDE)
     set(RELOCATABLE_FLAGS "-sRELOCATABLE=1 -sSIDE_MODULE=2 -sWASM_BIGINT=1")
@@ -640,7 +648,7 @@ if(PSP_WASM_BUILD AND NOT PSP_PYTHON_BUILD)
     add_library(psp ${WASM_SOURCE_FILES})
     target_compile_definitions(psp PRIVATE PSP_ENABLE_WASM=1)
     set_target_properties(psp PROPERTIES COMPILE_FLAGS "")
-    target_link_libraries(psp PRIVATE arrow re2 protos)
+    target_link_libraries(psp PRIVATE arrow_static re2 protos)

     add_executable(perspective_esm src/cpp/binding_api.cpp)
     target_link_libraries(perspective_esm psp protos)
@@ -695,34 +703,19 @@ elseif(PSP_CPP_BUILD OR PSP_PYTHON_BUILD)
         # .dll not importable
         # set_property(TARGET psppy PROPERTY SUFFIX .pyd)
     elseif(MACOS OR NOT MANYLINUX)
-        # target_compile_options(psppy PRIVATE -Wdeprecated-declarations)
         set_property(TARGET psp PROPERTY INSTALL_RPATH ${CMAKE_INSTALL_RPATH} ${module_origin_path})
-        # set_property(TARGET psppy PROPERTY INSTALL_RPATH ${CMAKE_INSTALL_RPATH} ${module_origin_path})
-
-        target_compile_options(psp PRIVATE -fvisibility=hidden)
-        # target_compile_options(psppy PRIVATE -fvisibility=hidden)
     elseif(MANYLINUX)
     else()
         target_compile_options(psp PRIVATE -fvisibility=hidden)
-        # target_compile_options(psppy PRIVATE -Wdeprecated-declarations)
     endif()

     # Link against minimal arrow static library
-    target_link_libraries(psp PRIVATE arrow re2 protos)
-    # target_link_libraries(psppy psp)
-
-    # The compiled libraries will be put in CMAKE_LIBRARY_OUTPUT_DIRECTORY by default. In the
-    # setup.py file, we designate this to be in the build/lib. directory. However,
-    # since we want to be able to test perspective in-source, we also copy the libraries into
-    # the source folder. These two commands do that.
-    # add_custom_command(TARGET psp POST_BUILD COMMAND ${CMAKE_COMMAND} -E copy $ ${PSP_PYTHON_SRC}/table/)
-    # add_custom_command(TARGET psppy POST_BUILD COMMAND ${CMAKE_COMMAND} -E copy $ ${PSP_PYTHON_SRC}/table/)
-
-    # #######################
+    target_link_libraries(psp PRIVATE arrow_static re2 protos lz4_static libzstd_static)
 else()
     add_library(psp STATIC ${WASM_SOURCE_FILES})
     target_compile_options(psp PRIVATE -fvisibility=hidden)
-    target_link_libraries(psp PRIVATE arrow re2 protos)
+    target_link_libraries(psp PRIVATE arrow_static re2 protos)
 endif()

 if(PSP_CPP_BUILD_STRICT AND NOT WIN32)
@@ -745,6 +738,6 @@ if(NOT DEFINED ENV{PSP_DISABLE_CLANGD})
     include(SetupClangd)
 endif()

-if(NOT WIN32)
-    include_directories(SYSTEM ${Boost_INCLUDE_DIRS})
-endif()
+# if(NOT WIN32)
+include_directories(SYSTEM ${Boost_INCLUDE_DIRS})
+# endif()
diff --git a/cpp/perspective/src/cpp/arrow_csv.cpp b/cpp/perspective/src/cpp/arrow_csv.cpp
index 81eeadae0c..b4a6017816 100644
--- a/cpp/perspective/src/cpp/arrow_csv.cpp
+++ b/cpp/perspective/src/cpp/arrow_csv.cpp
@@ -15,14 +15,7 @@
 #include
 #include
 #include
-
-#ifdef PSP_ENABLE_WASM
-// This causes build warnings
-// https://github.com/emscripten-core/emscripten/issues/8574
-#include
-#else
 #include
-#endif

 template
 static inline arrow::TimestampType::c_type
diff --git a/cpp/perspective/src/cpp/table.cpp b/cpp/perspective/src/cpp/table.cpp
index 103a7acf15..145f461a44 100644
--- a/cpp/perspective/src/cpp/table.cpp
+++ b/cpp/perspective/src/cpp/table.cpp
@@ -16,7 +16,6 @@
 #include "perspective/data_table.h"
 #include "perspective/raw_types.h"
 #include "perspective/schema.h"
-// #include "arrow/vendored/datetime/date.h"
 #include "rapidjson/document.h"
 #include
 #include
@@ -448,18 +447,6 @@ rapidjson_type_to_dtype(const rapidjson::Value& value) {
     std::chrono::system_clock::time_point tp;

     if (parse_all_date_time(tm, tp, str)) {
-        LOG_DEBUG(
-            "Parsed date: " << tm.tm_year + 1900 << "-" << tm.tm_mon + 1
-                            << "-" << tm.tm_mday << " " << tm.tm_hour
-                            << ":" << tm.tm_min << ":" << tm.tm_sec
-        );
-        auto tpm =
-            std::chrono::duration_cast(
-                tp.time_since_epoch()
-            )
-                .count();
-        LOG_DEBUG("TP: " << tpm << '\n');
-
         if (tm.tm_hour == 0 && tm.tm_min == 0 && tm.tm_sec == 0) {
             return t_dtype::DTYPE_DATE;
         }
diff --git a/cpp/perspective/src/cpp/vendor/arrow_compute_registry.cpp b/cpp/perspective/src/cpp/vendor/arrow_compute_registry.cpp
deleted file mode 100644
index ed07a907a8..0000000000
--- a/cpp/perspective/src/cpp/vendor/arrow_compute_registry.cpp
+++ /dev/null
@@ -1,235 +0,0 @@
-// Licensed to the Apache Software Foundation (ASF) under one
-// or more contributor license agreements. See the NOTICE file
-// distributed with this work for additional information
-// regarding copyright ownership. The ASF licenses this file
-// to you under the Apache License, Version 2.0 (the
-// "License"); you may not use this file except in compliance
-// with the License. You may obtain a copy of the License at
-//
-// http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing,
-// software distributed under the License is distributed on an
-// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-// KIND, either express or implied. See the License for the
-// specific language governing permissions and limitations
-// under the License.
- -#include "arrow/compute/registry.h" - -#include -#include -#include -#include -#include - -#include "arrow/compute/function.h" -#include "arrow/compute/function_internal.h" -#include "arrow/compute/registry_internal.h" -#include "arrow/status.h" -#include "arrow/util/logging.h" - -namespace arrow::compute { - -class FunctionRegistry::FunctionRegistryImpl { -public: - Status - AddFunction(std::shared_ptr function, bool allow_overwrite) { - RETURN_NOT_OK(function->Validate()); - - std::lock_guard mutation_guard(lock_); - - const std::string& name = function->name(); - auto it = name_to_function_.find(name); - if (it != name_to_function_.end() && !allow_overwrite) { - return Status::KeyError( - "Already have a function registered with name: ", name - ); - } - name_to_function_[name] = std::move(function); - return Status::OK(); - } - - Status - AddAlias(const std::string& target_name, const std::string& source_name) { - std::lock_guard mutation_guard(lock_); - - auto it = name_to_function_.find(source_name); - if (it == name_to_function_.end()) { - return Status::KeyError( - "No function registered with name: ", source_name - ); - } - name_to_function_[target_name] = it->second; - return Status::OK(); - } - - Status - AddFunctionOptionsType( - const FunctionOptionsType* options_type, bool allow_overwrite = false - ) { - std::lock_guard mutation_guard(lock_); - - const std::string name = options_type->type_name(); - auto it = name_to_options_type_.find(name); - if (it != name_to_options_type_.end() && !allow_overwrite) { - return Status::KeyError( - "Already have a function options type registered with name: ", - name - ); - } - name_to_options_type_[name] = options_type; - return Status::OK(); - } - - Result> - GetFunction(const std::string& name) const { - auto it = name_to_function_.find(name); - if (it == name_to_function_.end()) { - return Status::KeyError("No function registered with name: ", name); - } - return it->second; - } - - [[nodiscard]] - std::vector - GetFunctionNames() const { - std::vector results; - results.reserve(name_to_function_.size()); - for (const auto& it : name_to_function_) { - results.push_back(it.first); - } - std::sort(results.begin(), results.end()); - return results; - } - - Result - GetFunctionOptionsType(const std::string& name) const { - auto it = name_to_options_type_.find(name); - if (it == name_to_options_type_.end()) { - return Status::KeyError( - "No function options type registered with name: ", name - ); - } - return it->second; - } - - [[nodiscard]] - int - num_functions() const { - return static_cast(name_to_function_.size()); - } - -private: - std::mutex lock_; - std::unordered_map> - name_to_function_; - std::unordered_map - name_to_options_type_; -}; - -std::unique_ptr -FunctionRegistry::Make() { - return std::unique_ptr(new FunctionRegistry()); -} - -FunctionRegistry::FunctionRegistry() { - impl_ = std::make_unique(); -} - -FunctionRegistry::~FunctionRegistry() = default; - -Status -FunctionRegistry::AddFunction( - std::shared_ptr function, bool allow_overwrite -) { - return impl_->AddFunction(std::move(function), allow_overwrite); -} - -Status -FunctionRegistry::AddAlias( - const std::string& target_name, const std::string& source_name -) { - return impl_->AddAlias(target_name, source_name); -} - -Status -FunctionRegistry::AddFunctionOptionsType( - const FunctionOptionsType* options_type, bool allow_overwrite -) { - return impl_->AddFunctionOptionsType(options_type, allow_overwrite); -} - -Result> -FunctionRegistry::GetFunction(const 
std::string& name) const { - return impl_->GetFunction(name); -} - -std::vector -FunctionRegistry::GetFunctionNames() const { - return impl_->GetFunctionNames(); -} - -Result -FunctionRegistry::GetFunctionOptionsType(const std::string& name) const { - return impl_->GetFunctionOptionsType(name); -} - -int -FunctionRegistry::num_functions() const { - return impl_->num_functions(); -} - -namespace internal { - - static std::unique_ptr - CreateBuiltInRegistry() { - auto registry = FunctionRegistry::Make(); - - // Scalar functions - // RegisterScalarArithmetic(registry.get()); - // RegisterScalarBoolean(registry.get()); - RegisterScalarCast(registry.get()); - // RegisterScalarComparison(registry.get()); - // RegisterScalarIfElse(registry.get()); - // RegisterScalarNested(registry.get()); - // RegisterScalarSetLookup(registry.get()); - // RegisterScalarStringAscii(registry.get()); - // RegisterScalarTemporalBinary(registry.get()); - // RegisterScalarTemporalUnary(registry.get()); - // RegisterScalarValidity(registry.get()); - - // RegisterScalarOptions(registry.get()); - - // // Vector functions - // RegisterVectorArraySort(registry.get()); - // RegisterVectorHash(registry.get()); - // RegisterVectorNested(registry.get()); - // RegisterVectorReplace(registry.get()); - RegisterVectorSelection(registry.get()); - // RegisterVectorSort(registry.get()); - - // RegisterVectorOptions(registry.get()); - - // // Aggregate functions - // RegisterHashAggregateBasic(registry.get()); - // RegisterScalarAggregateBasic(registry.get()); - // RegisterScalarAggregateMode(registry.get()); - // RegisterScalarAggregateQuantile(registry.get()); - // RegisterScalarAggregateTDigest(registry.get()); - // RegisterScalarAggregateVariance(registry.get()); - - // RegisterAggregateOptions(registry.get()); - - return registry; - } - -} // namespace internal - -FunctionRegistry* -GetFunctionRegistry() { - static auto g_registry = internal::CreateBuiltInRegistry(); - return g_registry.get(); -} - -} // namespace arrow::compute diff --git a/cpp/perspective/src/cpp/vendor/arrow_single_threaded_reader.cpp b/cpp/perspective/src/cpp/vendor/arrow_single_threaded_reader.cpp deleted file mode 100644 index b9e80235ef..0000000000 --- a/cpp/perspective/src/cpp/vendor/arrow_single_threaded_reader.cpp +++ /dev/null @@ -1,1038 +0,0 @@ -/****************************************************************************** - * - * Copyright (c) 2019, the Perspective Authors. - * - * This file is part of the Perspective library, distributed under the terms of - * the Apache License 2.0. The full license can be found in the LICENSE file. - * - * Originally forked from - * https://github.com/apache/arrow/blob/apache-arrow-1.0.1/cpp/src/arrow/csv/reader.cc - * - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. 
See the License for the - * specific language governing permissions and limitations - * under the License. - */ - -/* * * * WARNING * * * - * - * This file and respective header is a fork of - * https://github.com/apache/arrow/blob/apache-arrow-1.0.1/cpp/src/arrow/csv/reader.cc - * which removes references to `std::thread` such that compilation under - * Emscripten is possible. It should not be modified directly. - * - * TODO Pending a better solution or upstream fix .. - * - */ - -#include - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#include "arrow/array.h" -#include "arrow/buffer.h" -#include "arrow/csv/chunker.h" -#include "arrow/csv/column_builder.h" -#include "arrow/csv/column_decoder.h" -#include "arrow/csv/options.h" -#include "arrow/csv/parser.h" -#include "arrow/io/interfaces.h" -#include "arrow/result.h" -#include "arrow/status.h" -#include "arrow/table.h" -#include "arrow/type.h" -#include "arrow/type_fwd.h" -// #include "arrow/util/async_generator.h" -// #include "arrow/util/future.h" -#include "arrow/util/iterator.h" -#include "arrow/util/logging.h" -#include "arrow/util/macros.h" -#include "arrow/util/task_group.h" -// #include "arrow/util/thread_pool.h" -#include "arrow/util/utf8_internal.h" -#include "arrow/util/vector.h" - -namespace arrow { - -using internal::Executor; -using internal::TaskGroup; -using internal::UnwrapOrRaise; - -namespace csv { - namespace { - - struct ConversionSchema { - struct Column { - std::string name; - // Physical column index in CSV file - int32_t index; - // If true, make a column of nulls - bool is_missing; - // If set, convert the CSV column to this type - // If unset (and is_missing is false), infer the type from the - // CSV column - std::shared_ptr type; - }; - - static Column - NullColumn(std::string col_name, std::shared_ptr type) { - return Column{std::move(col_name), -1, true, std::move(type)}; - } - - static Column - TypedColumn( - std::string col_name, - int32_t col_index, - std::shared_ptr type - ) { - return Column{ - std::move(col_name), col_index, false, std::move(type) - }; - } - - static Column - InferredColumn(std::string col_name, int32_t col_index) { - return Column{std::move(col_name), col_index, false, nullptr}; - } - - std::vector columns; - }; - - // An iterator of Buffers that makes sure there is no straddling CRLF - // sequence. 
- class CSVBufferIterator { - public: - static Iterator> - Make(Iterator> buffer_iterator) { - Transformer, std::shared_ptr> - fn = CSVBufferIterator(); - return MakeTransformedIterator(std::move(buffer_iterator), fn); - } - - // static AsyncGenerator> MakeAsync( - // AsyncGenerator> buffer_iterator) { - // Transformer, std::shared_ptr> - // fn = - // CSVBufferIterator(); - // return MakeTransformedGenerator(std::move(buffer_iterator), - // fn); - // } - - Result>> - operator()(std::shared_ptr buf) { - if (buf == nullptr) { - // EOF - return TransformFinish(); - } - - int64_t offset = 0; - if (first_buffer_) { - ARROW_ASSIGN_OR_RAISE( - auto data, util::SkipUTF8BOM(buf->data(), buf->size()) - ); - offset += data - buf->data(); - DCHECK_GE(offset, 0); - first_buffer_ = false; - } - - if (trailing_cr_ && buf->data()[offset] == '\n') { - // Skip '\r\n' line separator that started at the end of - // previous buffer - ++offset; - } - - trailing_cr_ = (buf->data()[buf->size() - 1] == '\r'); - buf = SliceBuffer(buf, offset); - if (buf->size() == 0) { - // EOF - return TransformFinish(); - } - return TransformYield(buf); - } - - protected: - bool first_buffer_ = true; - // Whether there was a trailing CR at the end of last received - // buffer - bool trailing_cr_ = false; - }; - - struct CSVBlock { - // (partial + completion + buffer) is an entire delimited CSV - // buffer. - std::shared_ptr partial; - std::shared_ptr completion; - std::shared_ptr buffer; - int64_t block_index; - bool is_final; - int64_t bytes_skipped; - std::function consume_bytes; - }; - - } // namespace -} // namespace csv - -template <> -struct IterationTraits { - static csv::CSVBlock - End() { - return csv::CSVBlock{{}, {}, {}, -1, true, 0, {}}; - } - static bool - IsEnd(const csv::CSVBlock& val) { - return val.block_index < 0; - } -}; - -namespace csv { - namespace { - - // This is a callable that can be used to transform an iterator. The - // source iterator will contain buffers of data and the output iterator - // will contain delimited CSV blocks. std::optional is used so that - // there is an end token (required by the iterator APIs (e.g. Visit)) - // even though an empty optional is never used in this code. - class BlockReader { - public: - BlockReader( - std::unique_ptr chunker, - std::shared_ptr first_buffer, - int64_t skip_rows - ) : - chunker_(std::move(chunker)), - partial_(std::make_shared("")), - buffer_(std::move(first_buffer)), - skip_rows_(skip_rows) {} - - protected: - std::unique_ptr chunker_; - std::shared_ptr partial_, buffer_; - int64_t skip_rows_; - int64_t block_index_ = 0; - // Whether there was a trailing CR at the end of last received - // buffer - bool trailing_cr_ = false; - }; - - // An object that reads delimited CSV blocks for serial use. - // The number of bytes consumed should be notified after each read, - // using CSVBlock::consume_bytes. 
- class SerialBlockReader : public BlockReader { - public: - using BlockReader::BlockReader; - - static Iterator - MakeIterator( - Iterator> buffer_iterator, - std::unique_ptr chunker, - const std::shared_ptr& first_buffer, - int64_t skip_rows - ) { - auto block_reader = std::make_shared( - std::move(chunker), first_buffer, skip_rows - ); - // Wrap shared pointer in callable - Transformer, CSVBlock> block_reader_fn = - [block_reader](const std::shared_ptr& buf) { - return (*block_reader)(buf); - }; - return MakeTransformedIterator( - std::move(buffer_iterator), block_reader_fn - ); - } - - // static AsyncGenerator MakeAsyncIterator( - // AsyncGenerator> buffer_generator, - // std::unique_ptr chunker, std::shared_ptr - // first_buffer, int64_t skip_rows) { - // auto block_reader = - // std::make_shared(std::move(chunker), - // first_buffer, skip_rows); - // // Wrap shared pointer in callable - // Transformer, CSVBlock> - // block_reader_fn = - // [block_reader](std::shared_ptr next) { - // return (*block_reader)(std::move(next)); - // }; - // return MakeTransformedGenerator(std::move(buffer_generator), - // block_reader_fn); - // } - - Result> - operator()(const std::shared_ptr& next_buffer) { - if (buffer_ == nullptr) { - return TransformFinish(); - } - - bool is_final = (next_buffer == nullptr); - int64_t bytes_skipped = 0; - - if (skip_rows_ != 0) { - bytes_skipped += partial_->size(); - auto orig_size = buffer_->size(); - RETURN_NOT_OK(chunker_->ProcessSkip( - partial_, buffer_, is_final, &skip_rows_, &buffer_ - )); - bytes_skipped += orig_size - buffer_->size(); - auto empty = std::make_shared(nullptr, 0); - if (skip_rows_ != 0) { - // Still have rows beyond this buffer to skip return - // empty block - partial_ = std::move(buffer_); - buffer_ = next_buffer; - return TransformYield(CSVBlock{ - empty, - empty, - empty, - block_index_++, - is_final, - bytes_skipped, - [](int64_t) { return Status::OK(); } - }); - } - partial_ = std::move(empty); - } - - std::shared_ptr completion; - - if (is_final) { - // End of file reached => compute completion from - // penultimate block - RETURN_NOT_OK(chunker_->ProcessFinal( - partial_, buffer_, &completion, &buffer_ - )); - } else { - // Get completion of partial from previous block. 
- RETURN_NOT_OK(chunker_->ProcessWithPartial( - partial_, buffer_, &completion, &buffer_ - )); - } - int64_t bytes_before_buffer = - partial_->size() + completion->size(); - - auto consume_bytes = [this, - bytes_before_buffer, - next_buffer](int64_t nbytes) -> Status { - DCHECK_GE(nbytes, 0); - auto offset = nbytes - bytes_before_buffer; - if (offset < 0) { - // Should not happen - return Status::Invalid( - "CSV parser got out of sync with chunker" - ); - } - partial_ = SliceBuffer(buffer_, offset); - buffer_ = next_buffer; - return Status::OK(); - }; - - return TransformYield(CSVBlock{ - partial_, - completion, - buffer_, - block_index_++, - is_final, - bytes_skipped, - std::move(consume_bytes) - }); - } - }; - - struct ParsedBlock { - std::shared_ptr parser; - int64_t block_index; - int64_t bytes_parsed_or_skipped; - }; - - struct DecodedBlock { - std::shared_ptr record_batch; - // Represents the number of input bytes represented by this batch - // This will include bytes skipped when skipping rows after the - // header - int64_t bytes_processed; - }; - - } // namespace - -} // namespace csv - -template <> -struct IterationTraits { - static csv::ParsedBlock - End() { - return csv::ParsedBlock{nullptr, -1, -1}; - } - static bool - IsEnd(const csv::ParsedBlock& val) { - return val.block_index < 0; - } -}; - -template <> -struct IterationTraits { - static csv::DecodedBlock - End() { - return csv::DecodedBlock{nullptr, -1}; - } - static bool - IsEnd(const csv::DecodedBlock& val) { - return val.bytes_processed < 0; - } -}; - -namespace csv { - namespace { - - // A function object that takes in a buffer of CSV data and returns a - // parsed batch of CSV data (CSVBlock -> ParsedBlock) for use with - // MakeMappedGenerator. The parsed batch contains a list of offsets for - // each of the columns so that columns can be individually scanned - // - // This operator is not re-entrant - class BlockParsingOperator { - public: - BlockParsingOperator( - io::IOContext io_context, - ParseOptions parse_options, - int num_csv_cols, - int64_t first_row - ) : - io_context_(std::move(std::move(io_context))), - parse_options_(std::move(std::move(parse_options))), - num_csv_cols_(num_csv_cols), - count_rows_(first_row >= 0), - num_rows_seen_(first_row) {} - - Result - operator()(const CSVBlock& block) { - constexpr int32_t max_num_rows = - std::numeric_limits::max(); - auto parser = std::make_shared( - io_context_.pool(), - parse_options_, - num_csv_cols_, - num_rows_seen_, - max_num_rows - ); - - std::shared_ptr straddling; - std::vector views; - if (block.partial->size() != 0 - || block.completion->size() != 0) { - if (block.partial->size() == 0) { - straddling = block.completion; - } else if (block.completion->size() == 0) { - straddling = block.partial; - } else { - ARROW_ASSIGN_OR_RAISE( - straddling, - ConcatenateBuffers( - {block.partial, block.completion}, - io_context_.pool() - ) - ); - } - views = { - std::string_view(*straddling), - std::string_view(*block.buffer) - }; - } else { - views = {std::string_view(*block.buffer)}; - } - uint32_t parsed_size; - if (block.is_final) { - RETURN_NOT_OK(parser->ParseFinal(views, &parsed_size)); - } else { - RETURN_NOT_OK(parser->Parse(views, &parsed_size)); - } - if (count_rows_) { - num_rows_seen_ += parser->total_num_rows(); - } - RETURN_NOT_OK(block.consume_bytes(parsed_size)); - return ParsedBlock{ - std::move(parser), - block.block_index, - static_cast(parsed_size) + block.bytes_skipped - }; - } - - private: - io::IOContext io_context_; - ParseOptions 
parse_options_; - int num_csv_cols_; - bool count_rows_; - int64_t num_rows_seen_; - }; - - ///////////////////////////////////////////////////////////////////////// - // Base class for common functionality - - class ReaderMixin { - public: - ReaderMixin( - io::IOContext io_context, - std::shared_ptr input, - ReadOptions read_options, - ParseOptions parse_options, - ConvertOptions convert_options, - bool count_rows - ) : - io_context_(std::move(io_context)), - read_options_(std::move(read_options)), - parse_options_(std::move(parse_options)), - convert_options_(std::move(convert_options)), - count_rows_(count_rows), - num_rows_seen_(count_rows_ ? 1 : -1), - input_(std::move(input)) {} - - protected: - // Read header and column names from buffer, create column builders - // Returns the # of bytes consumed - Result - ProcessHeader( - const std::shared_ptr& buf, - std::shared_ptr* rest - ) { - const uint8_t* data = buf->data(); - const auto* const data_end = data + buf->size(); - DCHECK_GT(data_end - data, 0); - - if (read_options_.skip_rows != 0) { - // Skip initial rows (potentially invalid CSV data) - auto num_skipped_rows = SkipRows( - data, - static_cast(data_end - data), - read_options_.skip_rows, - &data - ); - if (num_skipped_rows < read_options_.skip_rows) { - return Status::Invalid( - "Could not skip initial ", - read_options_.skip_rows, - " rows from CSV file, " - "either file is too short or header is larger than " - "block size" - ); - } - if (count_rows_) { - num_rows_seen_ += num_skipped_rows; - } - } - - if (read_options_.column_names.empty()) { - // Parse one row (either to read column names or to know the - // number of columns) - BlockParser parser( - io_context_.pool(), - parse_options_, - num_csv_cols_, - num_rows_seen_, - 1 - ); - uint32_t parsed_size = 0; - RETURN_NOT_OK(parser.Parse( - std::string_view( - reinterpret_cast(data), data_end - data - ), - &parsed_size - )); - if (parser.num_rows() != 1) { - return Status::Invalid( - "Could not read first row from CSV file, either " - "file is too short or header is larger than block " - "size" - ); - } - if (parser.num_cols() == 0) { - return Status::Invalid("No columns in CSV file"); - } - - if (read_options_.autogenerate_column_names) { - column_names_ = GenerateColumnNames(parser.num_cols()); - } else { - // Read column names from header row - auto visit = [&](const uint8_t* data, - uint32_t size, - bool quoted) -> Status { - column_names_.emplace_back( - reinterpret_cast(data), size - ); - return Status::OK(); - }; - RETURN_NOT_OK(parser.VisitLastRow(visit)); - DCHECK_EQ( - static_cast(parser.num_cols()), - column_names_.size() - ); - // Skip parsed header row - data += parsed_size; - if (count_rows_) { - ++num_rows_seen_; - } - } - } else { - column_names_ = read_options_.column_names; - } - - if (count_rows_) { - // increase rows seen to skip past rows which will be - // skipped - num_rows_seen_ += read_options_.skip_rows_after_names; - } - - auto bytes_consumed = data - buf->data(); - *rest = SliceBuffer(buf, bytes_consumed); - - num_csv_cols_ = static_cast(column_names_.size()); - DCHECK_GT(num_csv_cols_, 0); - - RETURN_NOT_OK(MakeConversionSchema()); - return bytes_consumed; - } - - std::vector - GenerateColumnNames(int32_t num_cols) { - std::vector res; - res.reserve(num_cols); - for (int32_t i = 0; i < num_cols; ++i) { - std::stringstream ss; - ss << "f" << i; - res.push_back(ss.str()); - } - return res; - } - - // Make conversion schema from options and parsed CSV header - Status - MakeConversionSchema() { - // 
Append a column converted from CSV data - auto append_csv_column = [&](std::string col_name, - int32_t col_index) { - // Does the named column have a fixed type? - auto it = convert_options_.column_types.find(col_name); - if (it == convert_options_.column_types.end()) { - conversion_schema_.columns.push_back( - ConversionSchema::InferredColumn( - std::move(col_name), col_index - ) - ); - } else { - conversion_schema_.columns.push_back( - ConversionSchema::TypedColumn( - std::move(col_name), col_index, it->second - ) - ); - } - }; - - // Append a column of nulls - auto append_null_column = [&](std::string col_name) { - // If the named column has a fixed type, use it, otherwise - // use null() - std::shared_ptr type; - auto it = convert_options_.column_types.find(col_name); - if (it == convert_options_.column_types.end()) { - type = null(); - } else { - type = it->second; - } - conversion_schema_.columns.push_back( - ConversionSchema::NullColumn( - std::move(col_name), std::move(type) - ) - ); - }; - - if (convert_options_.include_columns.empty()) { - // Include all columns in CSV file order - for (int32_t col_index = 0; col_index < num_csv_cols_; - ++col_index) { - append_csv_column(column_names_[col_index], col_index); - } - } else { - // Include columns from `include_columns` (in that order) - // Compute indices of columns in the CSV file - std::unordered_map col_indices; - col_indices.reserve(column_names_.size()); - for (int32_t i = 0; - i < static_cast(column_names_.size()); - ++i) { - col_indices.emplace(column_names_[i], i); - } - - for (const auto& col_name : - convert_options_.include_columns) { - auto it = col_indices.find(col_name); - if (it != col_indices.end()) { - append_csv_column(col_name, it->second); - } else if (convert_options_.include_missing_columns) { - append_null_column(col_name); - } else { - return Status::KeyError( - "Column '", - col_name, - "' in include_columns " - "does not exist in CSV file" - ); - } - } - } - return Status::OK(); - } - - struct ParseResult { - std::shared_ptr parser; - int64_t parsed_bytes; - }; - - Result - Parse( - const std::shared_ptr& partial, - const std::shared_ptr& completion, - const std::shared_ptr& block, - int64_t block_index, - bool is_final - ) { - static constexpr int32_t max_num_rows = - std::numeric_limits::max(); - auto parser = std::make_shared( - io_context_.pool(), - parse_options_, - num_csv_cols_, - num_rows_seen_, - max_num_rows - ); - - std::shared_ptr straddling; - std::vector views; - if (partial->size() != 0 || completion->size() != 0) { - if (partial->size() == 0) { - straddling = completion; - } else if (completion->size() == 0) { - straddling = partial; - } else { - ARROW_ASSIGN_OR_RAISE( - straddling, - ConcatenateBuffers( - {partial, completion}, io_context_.pool() - ) - ); - } - views = { - std::string_view(*straddling), std::string_view(*block) - }; - } else { - views = {std::string_view(*block)}; - } - uint32_t parsed_size; - if (is_final) { - RETURN_NOT_OK(parser->ParseFinal(views, &parsed_size)); - } else { - RETURN_NOT_OK(parser->Parse(views, &parsed_size)); - } - if (count_rows_) { - num_rows_seen_ += parser->total_num_rows(); - } - return ParseResult{ - std::move(parser), static_cast(parsed_size) - }; - } - - io::IOContext io_context_; - ReadOptions read_options_; - ParseOptions parse_options_; - ConvertOptions convert_options_; - - // Number of columns in the CSV file - int32_t num_csv_cols_ = -1; - // Whether num_rows_seen_ tracks the number of rows seen in the CSV - // being parsed - bool 
-
-     /////////////////////////////////////////////////////////////////////////
-     // Base class for one-shot table readers
-
-     class BaseTableReader : public ReaderMixin, public csv::TableReader {
-     public:
-         using ReaderMixin::ReaderMixin;
-
-         virtual Status Init() = 0;
-
-         // Future<std::shared_ptr<Table>> ReadAsync() override {
-         //     return Future<std::shared_ptr<Table>>::MakeFinished(Read());
-         // }
-
-     protected:
-         // Make column builders from conversion schema
-         Status
-         MakeColumnBuilders() {
-             for (const auto& column : conversion_schema_.columns) {
-                 std::shared_ptr<ColumnBuilder> builder;
-                 if (column.is_missing) {
-                     ARROW_ASSIGN_OR_RAISE(
-                         builder,
-                         ColumnBuilder::MakeNull(
-                             io_context_.pool(), column.type, task_group_
-                         )
-                     );
-                 } else if (column.type != nullptr) {
-                     ARROW_ASSIGN_OR_RAISE(
-                         builder,
-                         ColumnBuilder::Make(
-                             io_context_.pool(),
-                             column.type,
-                             column.index,
-                             convert_options_,
-                             task_group_
-                         )
-                     );
-                 } else {
-                     ARROW_ASSIGN_OR_RAISE(
-                         builder,
-                         ColumnBuilder::Make(
-                             io_context_.pool(),
-                             column.index,
-                             convert_options_,
-                             task_group_
-                         )
-                     );
-                 }
-                 column_builders_.push_back(std::move(builder));
-             }
-             return Status::OK();
-         }
-
-         Result<int64_t>
-         ParseAndInsert(
-             const std::shared_ptr<Buffer>& partial,
-             const std::shared_ptr<Buffer>& completion,
-             const std::shared_ptr<Buffer>& block,
-             int64_t block_index,
-             bool is_final
-         ) {
-             ARROW_ASSIGN_OR_RAISE(
-                 auto result,
-                 Parse(partial, completion, block, block_index, is_final)
-             );
-             RETURN_NOT_OK(ProcessData(result.parser, block_index));
-             return result.parsed_bytes;
-         }
-
-         // Trigger conversion of parsed block data
-         Status
-         ProcessData(
-             const std::shared_ptr<BlockParser>& parser, int64_t block_index
-         ) {
-             for (auto& builder : column_builders_) {
-                 builder->Insert(block_index, parser);
-             }
-             return Status::OK();
-         }
-
-         Result<std::shared_ptr<Table>>
-         MakeTable() {
-             DCHECK_EQ(
-                 column_builders_.size(), conversion_schema_.columns.size()
-             );
-
-             std::vector<std::shared_ptr<Field>> fields;
-             std::vector<std::shared_ptr<ChunkedArray>> columns;
-
-             for (int32_t i = 0;
-                  i < static_cast<int32_t>(column_builders_.size());
-                  ++i) {
-                 const auto& column = conversion_schema_.columns[i];
-                 ARROW_ASSIGN_OR_RAISE(
-                     auto array, column_builders_[i]->Finish()
-                 );
-                 fields.push_back(::arrow::field(column.name, array->type())
-                 );
-                 columns.emplace_back(std::move(array));
-             }
-             return Table::Make(
-                 schema(std::move(fields)), std::move(columns)
-             );
-         }
-
-         // Column builders for target Table (in ConversionSchema order)
-         std::vector<std::shared_ptr<ColumnBuilder>> column_builders_;
-     };
-
-     } // namespace
-
-     /////////////////////////////////////////////////////////////////////////
-     // Serial TableReader implementation
-
-     class SerialTableReader : public BaseTableReader {
-     public:
-         using BaseTableReader::BaseTableReader;
-
-         Status
-         Init() override {
-             ARROW_ASSIGN_OR_RAISE(
-                 auto istream_it,
-                 io::MakeInputStreamIterator(input_, read_options_.block_size)
-             );
-
-             // Since we're converting serially, no need to readahead more than
-             // one block int32_t block_queue_size = 1;
-             // ARROW_ASSIGN_OR_RAISE(auto rh_it,
-             // MakeReadaheadIterator(std::move(istream_it),
-             // block_queue_size));
-             buffer_iterator_ = CSVBufferIterator::Make(std::move(istream_it));
-             return Status::OK();
-         }
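// [editor's note: added commentary, not part of the original file]
// Contract of the block iterator consumed by Read() below, as implied by
// ParseAndInsert(): each item carries `partial` (bytes carried over from
// the previous block), `completion` (bytes of this block that finish that
// straddling row), and `buffer` (the remainder); after parsing,
// consume_bytes(parsed_bytes) reports how far parsing actually advanced so
// the straddle for the next block is computed correctly.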
-         Result<std::shared_ptr<Table>>
-         Read() override {
-             task_group_ = TaskGroup::MakeSerial(io_context_.stop_token());
-
-             // First block
-             ARROW_ASSIGN_OR_RAISE(auto first_buffer, buffer_iterator_.Next());
-             if (first_buffer == nullptr) {
-                 return Status::Invalid("Empty CSV file");
-             }
-             RETURN_NOT_OK(ProcessHeader(first_buffer, &first_buffer));
-             RETURN_NOT_OK(MakeColumnBuilders());
-
-             auto block_iterator = SerialBlockReader::MakeIterator(
-                 std::move(buffer_iterator_),
-                 MakeChunker(parse_options_),
-                 first_buffer,
-                 read_options_.skip_rows_after_names
-             );
-             while (true) {
-                 RETURN_NOT_OK(io_context_.stop_token().Poll());
-
-                 ARROW_ASSIGN_OR_RAISE(auto maybe_block, block_iterator.Next());
-                 if (IsIterationEnd(maybe_block)) {
-                     // EOF
-                     break;
-                 }
-                 ARROW_ASSIGN_OR_RAISE(
-                     int64_t parsed_bytes,
-                     ParseAndInsert(
-                         maybe_block.partial,
-                         maybe_block.completion,
-                         maybe_block.buffer,
-                         maybe_block.block_index,
-                         maybe_block.is_final
-                     )
-                 );
-                 RETURN_NOT_OK(maybe_block.consume_bytes(parsed_bytes));
-             }
-             // Finish conversion, create schema and table
-             RETURN_NOT_OK(task_group_->Finish());
-             return MakeTable();
-         }
-
-     protected:
-         Iterator<std::shared_ptr<Buffer>> buffer_iterator_;
-     };
-
-     Result<std::shared_ptr<TableReader>>
-     MakeTableReader(
-         MemoryPool* pool,
-         const io::IOContext& io_context,
-         const std::shared_ptr<io::InputStream>& input,
-         const ReadOptions& read_options,
-         const ParseOptions& parse_options,
-         const ConvertOptions& convert_options
-     ) {
-         RETURN_NOT_OK(parse_options.Validate());
-         RETURN_NOT_OK(read_options.Validate());
-         RETURN_NOT_OK(convert_options.Validate());
-         std::shared_ptr<BaseTableReader> reader;
-         // if (read_options.use_threads) {
-         //     auto cpu_executor = internal::GetCpuThreadPool();
-         //     reader = std::make_shared<AsyncThreadedTableReader>(
-         //         io_context, input, read_options, parse_options,
-         //         convert_options, cpu_executor);
-         // } else {
-         reader = std::make_shared<SerialTableReader>(
-             io_context,
-             input,
-             read_options,
-             parse_options,
-             convert_options,
-             /*count_rows=*/true
-         );
-         // }
-         RETURN_NOT_OK(reader->Init());
-         return reader;
-     }
-
-     /////////////////////////////////////////////////////////////////////////
-     // Factory functions
-
-     Result<std::shared_ptr<TableReader>>
-     TableReader::Make(
-         const io::IOContext io_context,
-         const std::shared_ptr<io::InputStream> input,
-         const ReadOptions& read_options,
-         const ParseOptions& parse_options,
-         const ConvertOptions& convert_options
-     ) {
-         return MakeTableReader(
-             io_context.pool(),
-             io_context,
-             input,
-             read_options,
-             parse_options,
-             convert_options
-         );
-     }
-
-     Result<std::shared_ptr<TableReader>>
-     TableReader::Make(
-         MemoryPool* pool,
-         const io::IOContext& io_context,
-         const std::shared_ptr<io::InputStream>& input,
-         const ReadOptions& read_options,
-         const ParseOptions& parse_options,
-         const ConvertOptions& convert_options
-     ) {
-         return MakeTableReader(
-             pool,
-             io_context,
-             input,
-             read_options,
-             parse_options,
-             convert_options
-         );
-     }
-
-} // namespace csv
-
-} // namespace arrow
diff --git a/cpp/perspective/src/cpp/vendor/single_threaded_reader.cpp b/cpp/perspective/src/cpp/vendor/single_threaded_reader.cpp
deleted file mode 100644
index 159a889bf9..0000000000
--- a/cpp/perspective/src/cpp/vendor/single_threaded_reader.cpp
+++ /dev/null
@@ -1,3632 +0,0 @@
-// Licensed to the Apache Software Foundation (ASF) under one
-// or more contributor license agreements. See the NOTICE file
-// distributed with this work for additional information
-// regarding copyright ownership. The ASF licenses this file
-// to you under the Apache License, Version 2.0 (the
-// "License"); you may not use this file except in compliance
-// with the License.
You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, -// software distributed under the License is distributed on an -// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, either express or implied. See the License for the -// specific language governing permissions and limitations -// under the License. - -// This file is a fork of `arrow/ipc/reader.cc` -// this file removes a usage of threading so it is compatible with WASM. - -#include "arrow/ipc/reader.h" - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#include // IWYU pragma: export - -#include "arrow/array.h" -#include "arrow/buffer.h" -#include "arrow/extension_type.h" -#include "arrow/io/caching.h" -#include "arrow/io/interfaces.h" -#include "arrow/io/memory.h" -#include "arrow/ipc/message.h" -#include "arrow/ipc/metadata_internal.h" -#include "arrow/ipc/reader_internal.h" -#include "arrow/ipc/writer.h" -#include "arrow/record_batch.h" -#include "arrow/sparse_tensor.h" -#include "arrow/status.h" -#include "arrow/type.h" -#include "arrow/type_traits.h" -#include "arrow/util/bit_util.h" -#include "arrow/util/bitmap_ops.h" -#include "arrow/util/checked_cast.h" -#include "arrow/util/compression.h" -#include "arrow/util/endian.h" -#include "arrow/util/key_value_metadata.h" -#include "arrow/util/logging.h" -#include "arrow/util/parallel.h" -#include "arrow/util/string.h" -#include "arrow/util/thread_pool.h" -#include "arrow/util/ubsan.h" -#include "arrow/util/vector.h" -#include "arrow/visit_type_inline.h" - -#include "generated/File_generated.h" // IWYU pragma: export -#include "generated/Message_generated.h" -#include "generated/Schema_generated.h" -#include "generated/SparseTensor_generated.h" - -namespace arrow { - -namespace flatbuf = org::apache::arrow::flatbuf; - -using internal::checked_cast; -using internal::checked_pointer_cast; - -namespace ipc { - - using internal::FileBlock; - using internal::kArrowMagicBytes; - - namespace { - - enum class DictionaryKind { New, Delta, Replacement }; - - Status - InvalidMessageType(MessageType expected, MessageType actual) { - return Status::IOError( - "Expected IPC message of type ", - FormatMessageType(expected), - " but got ", - FormatMessageType(actual) - ); - } - -#define CHECK_MESSAGE_TYPE(expected, actual) \ - do { \ - if ((actual) != (expected)) { \ - return InvalidMessageType((expected), (actual)); \ - } \ - } while (0) - -#define CHECK_HAS_BODY(message) \ - do { \ - if ((message).body() == nullptr) { \ - return Status::IOError( \ - "Expected body in IPC message of type ", \ - FormatMessageType((message).type()) \ - ); \ - } \ - } while (0) - -#define CHECK_HAS_NO_BODY(message) \ - do { \ - if ((message).body_length() != 0) { \ - return Status::IOError( \ - "Unexpected body in IPC message of type ", \ - FormatMessageType((message).type()) \ - ); \ - } \ - } while (0) - - } // namespace - - // ---------------------------------------------------------------------- - // Record batch read path - - /// \brief Structure to keep common arguments to be passed - struct IpcReadContext { - IpcReadContext( - DictionaryMemo* memo, - const IpcReadOptions& option, - bool swap, - MetadataVersion version = MetadataVersion::V5, - Compression::type kind = Compression::UNCOMPRESSED - ) : - dictionary_memo(memo), - options(option), - metadata_version(version), - compression(kind), - swap_endian(swap) {} - - 
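// [editor's note: added commentary, not part of the original file]
// IpcReadContext just bundles the per-read state threaded through the
// loaders below; metadata_version and compression start from defaults and
// are refined once a message header has been parsed (see
// ReadRecordBatchInternal). A hypothetical setup:
//
//   DictionaryMemo memo;
//   IpcReadContext context(&memo, IpcReadOptions::Defaults(), /*swap=*/false);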
DictionaryMemo* dictionary_memo; - - const IpcReadOptions& options; - - MetadataVersion metadata_version; - - Compression::type compression; - - /// \brief LoadRecordBatch() or LoadRecordBatchSubset() swaps endianness - /// of elements if this flag is true - const bool swap_endian; - }; - - /// A collection of ranges to read and pointers to set to those ranges when - /// they are available. This allows the ArrayLoader to utilize a two pass - /// cache-then-read strategy with a ReadRangeCache - class BatchDataReadRequest { - public: - [[nodiscard]] - const std::vector& - ranges_to_read() const { - return ranges_to_read_; - } - - void - RequestRange( - int64_t offset, int64_t length, std::shared_ptr* out - ) { - ranges_to_read_.push_back({offset, length}); - destinations_.push_back(out); - } - - void - FulfillRequest(const std::vector>& buffers) { - for (std::size_t i = 0; i < buffers.size(); i++) { - *destinations_[i] = buffers[i]; - } - } - - private: - std::vector ranges_to_read_; - std::vector*> destinations_; - }; - - /// The field_index and buffer_index are incremented based on how much of - /// the batch is "consumed" (through nested data reconstruction, for - /// example) - class ArrayLoader { - public: - explicit ArrayLoader( - const flatbuf::RecordBatch* metadata, - MetadataVersion metadata_version, - const IpcReadOptions& options, - io::RandomAccessFile* file - ) : - metadata_(metadata), - metadata_version_(metadata_version), - file_(file), - file_offset_(0), - max_recursion_depth_(options.max_recursion_depth) {} - - explicit ArrayLoader( - const flatbuf::RecordBatch* metadata, - MetadataVersion metadata_version, - const IpcReadOptions& options, - int64_t file_offset - ) : - metadata_(metadata), - metadata_version_(metadata_version), - file_(nullptr), - file_offset_(file_offset), - max_recursion_depth_(options.max_recursion_depth) {} - - Status - ReadBuffer( - int64_t offset, int64_t length, std::shared_ptr* out - ) { - if (skip_io_) { - return Status::OK(); - } - if (offset < 0) { - return Status::Invalid( - "Negative offset for reading buffer ", buffer_index_ - ); - } - if (length < 0) { - return Status::Invalid( - "Negative length for reading buffer ", buffer_index_ - ); - } - // This construct permits overriding GetBuffer at compile time - if (!bit_util::IsMultipleOf8(offset)) { - return Status::Invalid( - "Buffer ", - buffer_index_, - " did not start on 8-byte aligned offset: ", - offset - ); - } - if (file_ != nullptr) { - return file_->ReadAt(offset, length).Value(out); - } - read_request_.RequestRange(offset + file_offset_, length, out); - return Status::OK(); - } - - Status - LoadType(const DataType& type) { - return VisitTypeInline(type, this); - } - - Status - Load(const Field* field, ArrayData* out) { - if (max_recursion_depth_ <= 0) { - return Status::Invalid("Max recursion depth reached"); - } - - field_ = field; - out_ = out; - out_->type = field_->type(); - return LoadType(*field_->type()); - } - - Status - SkipField(const Field* field) { - ArrayData dummy; - skip_io_ = true; - Status status = Load(field, &dummy); - skip_io_ = false; - return status; - } - - Status - GetBuffer(int buffer_index, std::shared_ptr* out) { - const auto* buffers = metadata_->buffers(); - CHECK_FLATBUFFERS_NOT_NULL(buffers, "RecordBatch.buffers"); - if (buffer_index >= static_cast(buffers->size())) { - return Status::IOError("buffer_index out of range."); - } - const flatbuf::Buffer* buffer = buffers->Get(buffer_index); - if (buffer->length() == 0) { - // Should never return a null buffer 
here. - // (zero-sized buffer allocations are cheap) - return AllocateBuffer(0).Value(out); - } - return ReadBuffer(buffer->offset(), buffer->length(), out); - } - - Status - GetFieldMetadata(int field_index, ArrayData* out) { - const auto* nodes = metadata_->nodes(); - CHECK_FLATBUFFERS_NOT_NULL(nodes, "Table.nodes"); - // pop off a field - if (field_index >= static_cast(nodes->size())) { - return Status::Invalid( - "Ran out of field metadata, likely malformed" - ); - } - const flatbuf::FieldNode* node = nodes->Get(field_index); - - out->length = node->length(); - out->null_count = node->null_count(); - out->offset = 0; - return Status::OK(); - } - - Status - LoadCommon(Type::type type_id) { - // This only contains the length and null count, which we need to - // figure out what to do with the buffers. For example, if - // null_count == 0, then we can skip that buffer without reading - // from shared memory - RETURN_NOT_OK(GetFieldMetadata(field_index_++, out_)); - - if (internal::HasValidityBitmap(type_id, metadata_version_)) { - // Extract null_bitmap which is common to all arrays except for - // unions and nulls. - if (out_->null_count != 0) { - RETURN_NOT_OK(GetBuffer(buffer_index_, out_->buffers.data()) - ); - } - buffer_index_++; - } - return Status::OK(); - } - - template - Status - LoadPrimitive(Type::type type_id) { - out_->buffers.resize(2); - - RETURN_NOT_OK(LoadCommon(type_id)); - if (out_->length > 0) { - RETURN_NOT_OK(GetBuffer(buffer_index_++, &out_->buffers[1])); - } else { - buffer_index_++; - out_->buffers[1] = std::make_shared(nullptr, 0); - } - return Status::OK(); - } - - template - Status - LoadBinary(Type::type type_id) { - out_->buffers.resize(3); - - RETURN_NOT_OK(LoadCommon(type_id)); - RETURN_NOT_OK(GetBuffer(buffer_index_++, &out_->buffers[1])); - return GetBuffer(buffer_index_++, &out_->buffers[2]); - } - - template - Status - LoadList(const TYPE& type) { - out_->buffers.resize(2); - - RETURN_NOT_OK(LoadCommon(type.id())); - RETURN_NOT_OK(GetBuffer(buffer_index_++, &out_->buffers[1])); - - const int num_children = type.num_fields(); - if (num_children != 1) { - return Status::Invalid( - "Wrong number of children: ", num_children - ); - } - - return LoadChildren(type.fields()); - } - - Status - LoadChildren(const std::vector>& child_fields) { - ArrayData* parent = out_; - - parent->child_data.resize(child_fields.size()); - for (int i = 0; i < static_cast(child_fields.size()); ++i) { - parent->child_data[i] = std::make_shared(); - --max_recursion_depth_; - RETURN_NOT_OK( - Load(child_fields[i].get(), parent->child_data[i].get()) - ); - ++max_recursion_depth_; - } - out_ = parent; - return Status::OK(); - } - - Status - Visit(const NullType& type) { - out_->buffers.resize(1); - - // ARROW-6379: NullType has no buffers in the IPC payload - return GetFieldMetadata(field_index_++, out_); - } - - template - enable_if_t< - std::is_base_of_v - && !std::is_base_of_v - && !std::is_base_of_v, - Status> - Visit(const T& type) { - return LoadPrimitive(type.id()); - } - - template - enable_if_base_binary - Visit(const T& type) { - return LoadBinary(type.id()); - } - - Status - Visit(const FixedSizeBinaryType& type) { - out_->buffers.resize(2); - RETURN_NOT_OK(LoadCommon(type.id())); - return GetBuffer(buffer_index_++, &out_->buffers[1]); - } - - template - enable_if_var_size_list - Visit(const T& type) { - return LoadList(type); - } - - Status - Visit(const MapType& type) { - RETURN_NOT_OK(LoadList(type)); - return MapArray::ValidateChildData(out_->child_data); - } - - 
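// [editor's note: added commentary, not part of the original file]
// Buffer counts consumed per type by these Visit() overloads, following
// the Arrow columnar format:
//   null               -> no buffers (ARROW-6379)
//   primitive          -> validity + data            (LoadPrimitive)
//   binary / utf8      -> validity + offsets + data  (LoadBinary)
//   fixed-size list    -> validity only, then recurse into the child
//   struct             -> validity only, then recurse into children
//   union              -> type ids (+ value offsets when dense), no validity
// Each overload advances buffer_index_/field_index_ even when skipping, so
// SkipField() keeps the loader aligned with the flatbuffer metadata.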
Status - Visit(const FixedSizeListType& type) { - out_->buffers.resize(1); - - RETURN_NOT_OK(LoadCommon(type.id())); - - const int num_children = type.num_fields(); - if (num_children != 1) { - return Status::Invalid( - "Wrong number of children: ", num_children - ); - } - - return LoadChildren(type.fields()); - } - - Status - Visit(const StructType& type) { - out_->buffers.resize(1); - RETURN_NOT_OK(LoadCommon(type.id())); - return LoadChildren(type.fields()); - } - - Status - Visit(const UnionType& type) { - int n_buffers = type.mode() == UnionMode::SPARSE ? 2 : 3; - out_->buffers.resize(n_buffers); - - RETURN_NOT_OK(LoadCommon(type.id())); - - // With metadata V4, we can get a validity bitmap. - // Trying to fix up union data to do without the top-level validity - // bitmap is hairy: - // - type ids must be rewritten to all have valid values (even for - // former - // null slots) - // - sparse union children must have their validity bitmaps - // rewritten - // by ANDing the top-level validity bitmap - // - dense union children must be rewritten (at least one of them) - // to insert the required null slots that were formerly omitted - // So instead we bail out. - if (out_->null_count != 0 && out_->buffers[0] != nullptr) { - return Status::Invalid( - "Cannot read pre-1.0.0 Union array with " - "top-level validity bitmap" - ); - } - out_->buffers[0] = nullptr; - out_->null_count = 0; - - if (out_->length > 0) { - RETURN_NOT_OK(GetBuffer(buffer_index_, &out_->buffers[1])); - if (type.mode() == UnionMode::DENSE) { - RETURN_NOT_OK( - GetBuffer(buffer_index_ + 1, &out_->buffers[2]) - ); - } - } - buffer_index_ += n_buffers - 1; - return LoadChildren(type.fields()); - } - - Status - Visit(const DictionaryType& type) { - // out_->dictionary will be filled later in ResolveDictionaries() - return LoadType(*type.index_type()); - } - - Status - Visit(const RunEndEncodedType& type) { - out_->buffers.resize(1); - RETURN_NOT_OK(LoadCommon(type.id())); - return LoadChildren(type.fields()); - } - - Status - Visit(const ExtensionType& type) { - return LoadType(*type.storage_type()); - } - - BatchDataReadRequest& - read_request() { - return read_request_; - } - - private: - const flatbuf::RecordBatch* metadata_; - const MetadataVersion metadata_version_; - io::RandomAccessFile* file_; - int64_t file_offset_; - int max_recursion_depth_; - int buffer_index_ = 0; - int field_index_ = 0; - bool skip_io_ = false; - - BatchDataReadRequest read_request_; - const Field* field_ = nullptr; - ArrayData* out_ = nullptr; - }; - - Result> - DecompressBuffer( - const std::shared_ptr& buf, - const IpcReadOptions& options, - util::Codec* codec - ) { - if (buf == nullptr || buf->size() == 0) { - return buf; - } - - if (buf->size() < 8) { - return Status::Invalid( - "Likely corrupted message, compressed buffers " - "are larger than 8 bytes by construction" - ); - } - - const uint8_t* data = buf->data(); - int64_t compressed_size = buf->size() - sizeof(int64_t); - int64_t uncompressed_size = - bit_util::FromLittleEndian(util::SafeLoadAs(data)); - - if (uncompressed_size == -1) { - return SliceBuffer(buf, sizeof(int64_t), compressed_size); - } - - ARROW_ASSIGN_OR_RAISE( - auto uncompressed, - AllocateBuffer(uncompressed_size, options.memory_pool) - ); - - ARROW_ASSIGN_OR_RAISE( - int64_t actual_decompressed, - codec->Decompress( - compressed_size, - data + sizeof(int64_t), - uncompressed_size, - uncompressed->mutable_data() - ) - ); - if (actual_decompressed != uncompressed_size) { - return Status::Invalid( - "Failed to fully 
decompress buffer, expected ", - uncompressed_size, - " bytes but decompressed ", - actual_decompressed - ); - } - - return std::move(uncompressed); - } - - Status - DecompressBuffers( - Compression::type compression, - const IpcReadOptions& options, - ArrayDataVector* fields - ) { - struct BufferAccumulator { - using BufferPtrVector = std::vector*>; - - void - AppendFrom(const ArrayDataVector& fields) { - for (const auto& field : fields) { - for (auto& buffer : field->buffers) { - buffers_.push_back(&buffer); - } - AppendFrom(field->child_data); - } - } - - BufferPtrVector - Get(const ArrayDataVector& fields) && { - AppendFrom(fields); - return std::move(buffers_); - } - - BufferPtrVector buffers_; - }; - - // Flatten all buffers - auto buffers = BufferAccumulator{}.Get(*fields); - - std::unique_ptr codec; - ARROW_ASSIGN_OR_RAISE(codec, util::Codec::Create(compression)); - - // PSP_PARALLEL_FOR works in wasm. - // TODO: THIS ONE RIGHT HERE! - - for (auto& buffer : buffers) { - ARROW_ASSIGN_OR_RAISE( - *buffer, DecompressBuffer(*buffer, options, codec.get()) - ); - } - return Status::OK(); - } - - Result> - LoadRecordBatchSubset( - const flatbuf::RecordBatch* metadata, - const std::shared_ptr& schema, - const std::vector* inclusion_mask, - const IpcReadContext& context, - io::RandomAccessFile* file - ) { - ArrayLoader loader( - metadata, context.metadata_version, context.options, file - ); - - ArrayDataVector columns(schema->num_fields()); - ArrayDataVector filtered_columns; - FieldVector filtered_fields; - std::shared_ptr filtered_schema; - - for (int i = 0; i < schema->num_fields(); ++i) { - const Field& field = *schema->field(i); - if ((inclusion_mask == nullptr) || (*inclusion_mask)[i]) { - // Read field - auto column = std::make_shared(); - RETURN_NOT_OK(loader.Load(&field, column.get())); - if (metadata->length() != column->length) { - return Status::IOError( - "Array length did not match record batch length" - ); - } - columns[i] = std::move(column); - if (inclusion_mask != nullptr) { - filtered_columns.push_back(columns[i]); - filtered_fields.push_back(schema->field(i)); - } - } else { - // Skip field. This logic must be executed to advance the state - // of the loader to the next field - RETURN_NOT_OK(loader.SkipField(&field)); - } - } - - // Dictionary resolution needs to happen on the unfiltered columns, - // because fields are mapped structurally (by path in the original - // schema). 
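// [editor's note: added commentary, not part of the original file]
// That is: dictionary ids in the DictionaryMemo are associated with field
// positions in the *full* schema, so resolution must run before the
// filtered columns are split off; otherwise a projected read would attach
// dictionaries to the wrong columns.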
- RETURN_NOT_OK(ResolveDictionaries( - columns, *context.dictionary_memo, context.options.memory_pool - )); - - if (inclusion_mask != nullptr) { - filtered_schema = - ::arrow::schema(std::move(filtered_fields), schema->metadata()); - columns.clear(); - } else { - filtered_schema = schema; - filtered_columns = std::move(columns); - } - if (context.compression != Compression::UNCOMPRESSED) { - RETURN_NOT_OK(DecompressBuffers( - context.compression, context.options, &filtered_columns - )); - } - - // swap endian in a set of ArrayData if necessary (swap_endian == true) - if (context.swap_endian) { - for (auto& filtered_column : filtered_columns) { - ARROW_ASSIGN_OR_RAISE( - filtered_column, - arrow::internal::SwapEndianArrayData(filtered_column) - ); - } - } - return RecordBatch::Make( - std::move(filtered_schema), - metadata->length(), - std::move(filtered_columns) - ); - } - - Result> - LoadRecordBatch( - const flatbuf::RecordBatch* metadata, - const std::shared_ptr& schema, - const std::vector& inclusion_mask, - const IpcReadContext& context, - io::RandomAccessFile* file - ) { - if (!inclusion_mask.empty()) { - return LoadRecordBatchSubset( - metadata, schema, &inclusion_mask, context, file - ); - } - return LoadRecordBatchSubset( - metadata, schema, /*inclusion_mask=*/nullptr, context, file - ); - } - - // ---------------------------------------------------------------------- - // Array loading - - Status - GetCompression(const flatbuf::RecordBatch* batch, Compression::type* out) { - *out = Compression::UNCOMPRESSED; - const flatbuf::BodyCompression* compression = batch->compression(); - if (compression != nullptr) { - if (compression->method() - != flatbuf::BodyCompressionMethod::BUFFER) { - // Forward compatibility - return Status::Invalid( - "This library only supports BUFFER compression method" - ); - } - - if (compression->codec() == flatbuf::CompressionType::LZ4_FRAME) { - *out = Compression::LZ4_FRAME; - } else if (compression->codec() == flatbuf::CompressionType::ZSTD) { - *out = Compression::ZSTD; - } else { - return Status::Invalid( - "Unsupported codec in RecordBatch::compression metadata" - ); - } - return Status::OK(); - } - return Status::OK(); - } - - Status - GetCompressionExperimental( - const flatbuf::Message* message, Compression::type* out - ) { - *out = Compression::UNCOMPRESSED; - if (message->custom_metadata() != nullptr) { - // TODO: Ensure this deserialization only ever happens once - std::shared_ptr metadata; - RETURN_NOT_OK(internal::GetKeyValueMetadata( - message->custom_metadata(), &metadata - )); - int index = metadata->FindKey("ARROW:experimental_compression"); - if (index != -1) { - // Arrow 0.17 stored string in upper case, internal utils now - // require lower case - auto name = - arrow::internal::AsciiToLower(metadata->value(index)); - ARROW_ASSIGN_OR_RAISE( - *out, util::Codec::GetCompressionType(name) - ); - } - return internal::CheckCompressionSupported(*out); - } - return Status::OK(); - } - - static Status - ReadContiguousPayload( - io::InputStream* file, std::unique_ptr* message - ) { - ARROW_ASSIGN_OR_RAISE(*message, ReadMessage(file)); - if (*message == nullptr) { - return Status::Invalid("Unable to read metadata at offset"); - } - return Status::OK(); - } - - Result> - ReadRecordBatch( - const std::shared_ptr& schema, - const DictionaryMemo* dictionary_memo, - const IpcReadOptions& options, - io::InputStream* file - ) { - std::unique_ptr message; - RETURN_NOT_OK(ReadContiguousPayload(file, &message)); - CHECK_HAS_BODY(*message); - 
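// [editor's note: added commentary, not part of the original file]
// A contiguous IPC payload is laid out as <metadata flatbuffer><body>; the
// body buffer is wrapped in a reader below so the shared record-batch
// loading path can issue ReadAt() calls against it as if it were a file.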
ARROW_ASSIGN_OR_RAISE(auto reader, Buffer::GetReader(message->body())); - return ReadRecordBatch( - *message->metadata(), schema, dictionary_memo, options, reader.get() - ); - } - - Result> - ReadRecordBatch( - const Message& message, - const std::shared_ptr& schema, - const DictionaryMemo* dictionary_memo, - const IpcReadOptions& options - ) { - CHECK_MESSAGE_TYPE(MessageType::RECORD_BATCH, message.type()); - CHECK_HAS_BODY(message); - ARROW_ASSIGN_OR_RAISE(auto reader, Buffer::GetReader(message.body())); - return ReadRecordBatch( - *message.metadata(), schema, dictionary_memo, options, reader.get() - ); - } - - Result - ReadRecordBatchInternal( - const Buffer& metadata, - const std::shared_ptr& schema, - const std::vector& inclusion_mask, - IpcReadContext& context, - io::RandomAccessFile* file - ) { - const flatbuf::Message* message = nullptr; - RETURN_NOT_OK( - internal::VerifyMessage(metadata.data(), metadata.size(), &message) - ); - const auto* batch = message->header_as_RecordBatch(); - if (batch == nullptr) { - return Status::IOError( - "Header-type of flatbuffer-encoded Message " - "is not RecordBatch." - ); - } - - Compression::type compression; - RETURN_NOT_OK(GetCompression(batch, &compression)); - if (context.compression == Compression::UNCOMPRESSED - && message->version() == flatbuf::MetadataVersion::V4) { - // Possibly obtain codec information from experimental serialization - // format in 0.17.x - RETURN_NOT_OK(GetCompressionExperimental(message, &compression)); - } - context.compression = compression; - context.metadata_version = - internal::GetMetadataVersion(message->version()); - - std::shared_ptr custom_metadata; - if (message->custom_metadata() != nullptr) { - RETURN_NOT_OK(internal::GetKeyValueMetadata( - message->custom_metadata(), &custom_metadata - )); - } - ARROW_ASSIGN_OR_RAISE( - auto record_batch, - LoadRecordBatch(batch, schema, inclusion_mask, context, file) - ); - return RecordBatchWithMetadata{record_batch, custom_metadata}; - } - - // If we are selecting only certain fields, populate an inclusion mask for - // fast lookups. Additionally, drop deselected fields from the reader's - // schema. 
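// [editor's note: added commentary, not part of the original file]
// Worked example (hypothetical schema): with fields {a, b, c, d} and
// included_indices = {2, 0, 2}, the mask becomes {true, false, true, false}
// and the projected schema is {a, c} -- duplicates are ignored, selection
// is re-sorted into schema order, and out-of-range indices are an error.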
- Status - GetInclusionMaskAndOutSchema( - const std::shared_ptr& full_schema, - const std::vector& included_indices, - std::vector* inclusion_mask, - std::shared_ptr* out_schema - ) { - inclusion_mask->clear(); - if (included_indices.empty()) { - *out_schema = full_schema; - return Status::OK(); - } - - inclusion_mask->resize(full_schema->num_fields(), false); - - auto included_indices_sorted = included_indices; - std::sort( - included_indices_sorted.begin(), included_indices_sorted.end() - ); - - FieldVector included_fields; - for (int i : included_indices_sorted) { - // Ignore out of bounds indices - if (i < 0 || i >= full_schema->num_fields()) { - return Status::Invalid("Out of bounds field index: ", i); - } - - if (inclusion_mask->at(i)) { - continue; - } - - inclusion_mask->at(i) = true; - included_fields.push_back(full_schema->field(i)); - } - - *out_schema = schema( - std::move(included_fields), - full_schema->endianness(), - full_schema->metadata() - ); - return Status::OK(); - } - - Status - UnpackSchemaMessage( - const void* opaque_schema, - const IpcReadOptions& options, - DictionaryMemo* dictionary_memo, - std::shared_ptr* schema, - std::shared_ptr* out_schema, - std::vector* field_inclusion_mask, - bool* swap_endian - ) { - RETURN_NOT_OK( - internal::GetSchema(opaque_schema, dictionary_memo, schema) - ); - - // If we are selecting only certain fields, populate the inclusion mask - // now for fast lookups - RETURN_NOT_OK(GetInclusionMaskAndOutSchema( - *schema, options.included_fields, field_inclusion_mask, out_schema - )); - *swap_endian = options.ensure_native_endian - && !out_schema->get()->is_native_endian(); - if (*swap_endian) { - // create a new schema with native endianness before swapping endian - // in ArrayData - *schema = schema->get()->WithEndianness(Endianness::Native); - *out_schema = out_schema->get()->WithEndianness(Endianness::Native); - } - return Status::OK(); - } - - Status - UnpackSchemaMessage( - const Message& message, - const IpcReadOptions& options, - DictionaryMemo* dictionary_memo, - std::shared_ptr* schema, - std::shared_ptr* out_schema, - std::vector* field_inclusion_mask, - bool* swap_endian - ) { - CHECK_MESSAGE_TYPE(MessageType::SCHEMA, message.type()); - CHECK_HAS_NO_BODY(message); - - return UnpackSchemaMessage( - message.header(), - options, - dictionary_memo, - schema, - out_schema, - field_inclusion_mask, - swap_endian - ); - } - - Result> - ReadRecordBatch( - const Buffer& metadata, - const std::shared_ptr& schema, - const DictionaryMemo* dictionary_memo, - const IpcReadOptions& options, - io::RandomAccessFile* file - ) { - std::shared_ptr out_schema; - // Empty means do not use - std::vector inclusion_mask; - IpcReadContext context( - const_cast(dictionary_memo), options, false - ); - RETURN_NOT_OK(GetInclusionMaskAndOutSchema( - schema, - context.options.included_fields, - &inclusion_mask, - &out_schema - )); - ARROW_ASSIGN_OR_RAISE( - auto batch_and_custom_metadata, - ReadRecordBatchInternal( - metadata, schema, inclusion_mask, context, file - ) - ); - return batch_and_custom_metadata.batch; - } - - Status - ReadDictionary( - const Buffer& metadata, - const IpcReadContext& context, - DictionaryKind* kind, - io::RandomAccessFile* file - ) { - const flatbuf::Message* message = nullptr; - RETURN_NOT_OK( - internal::VerifyMessage(metadata.data(), metadata.size(), &message) - ); - const auto* const dictionary_batch = - message->header_as_DictionaryBatch(); - if (dictionary_batch == nullptr) { - return Status::IOError( - "Header-type of 
flatbuffer-encoded Message " - "is not DictionaryBatch." - ); - } - - // The dictionary is embedded in a record batch with a single column - const auto* const batch_meta = dictionary_batch->data(); - - CHECK_FLATBUFFERS_NOT_NULL(batch_meta, "DictionaryBatch.data"); - - Compression::type compression; - RETURN_NOT_OK(GetCompression(batch_meta, &compression)); - if (compression == Compression::UNCOMPRESSED - && message->version() == flatbuf::MetadataVersion::V4) { - // Possibly obtain codec information from experimental serialization - // format in 0.17.x - RETURN_NOT_OK(GetCompressionExperimental(message, &compression)); - } - - const int64_t id = dictionary_batch->id(); - - // Look up the dictionary value type, which must have been added to the - // DictionaryMemo already prior to invoking this function - ARROW_ASSIGN_OR_RAISE( - auto value_type, context.dictionary_memo->GetDictionaryType(id) - ); - - // Load the dictionary data from the dictionary batch - ArrayLoader loader( - batch_meta, - internal::GetMetadataVersion(message->version()), - context.options, - file - ); - auto dict_data = std::make_shared(); - const Field dummy_field("", value_type); - RETURN_NOT_OK(loader.Load(&dummy_field, dict_data.get())); - - if (compression != Compression::UNCOMPRESSED) { - ArrayDataVector dict_fields{dict_data}; - RETURN_NOT_OK( - DecompressBuffers(compression, context.options, &dict_fields) - ); - } - - // swap endian in dict_data if necessary (swap_endian == true) - if (context.swap_endian) { - ARROW_ASSIGN_OR_RAISE( - dict_data, ::arrow::internal::SwapEndianArrayData(dict_data) - ); - } - - if (dictionary_batch->isDelta()) { - if (kind != nullptr) { - *kind = DictionaryKind::Delta; - } - return context.dictionary_memo->AddDictionaryDelta(id, dict_data); - } - ARROW_ASSIGN_OR_RAISE( - bool inserted, - context.dictionary_memo->AddOrReplaceDictionary(id, dict_data) - ); - if (kind != nullptr) { - *kind = - inserted ? 
DictionaryKind::New : DictionaryKind::Replacement; - } - return Status::OK(); - } - - Status - ReadDictionary( - const Message& message, - const IpcReadContext& context, - DictionaryKind* kind - ) { - // Only invoke this method if we already know we have a dictionary - // message - DCHECK_EQ(message.type(), MessageType::DICTIONARY_BATCH); - CHECK_HAS_BODY(message); - ARROW_ASSIGN_OR_RAISE(auto reader, Buffer::GetReader(message.body())); - return ReadDictionary(*message.metadata(), context, kind, reader.get()); - } - - // ---------------------------------------------------------------------- - // RecordBatchStreamReader implementation - - class RecordBatchStreamReaderImpl : public RecordBatchStreamReader { - public: - Status - Open( - std::unique_ptr message_reader, - const IpcReadOptions& options - ) { - message_reader_ = std::move(message_reader); - options_ = options; - - // Read schema - ARROW_ASSIGN_OR_RAISE( - std::unique_ptr message, ReadNextMessage() - ); - if (!message) { - return Status::Invalid( - "Tried reading schema message, was null or length 0" - ); - } - - RETURN_NOT_OK(UnpackSchemaMessage( - *message, - options, - &dictionary_memo_, - &schema_, - &out_schema_, - &field_inclusion_mask_, - &swap_endian_ - )); - return Status::OK(); - } - - Status - ReadNext(std::shared_ptr* batch) override { - ARROW_ASSIGN_OR_RAISE(auto batch_with_metadata, ReadNext()); - *batch = std::move(batch_with_metadata.batch); - return Status::OK(); - } - - Result - ReadNext() override { - if (!have_read_initial_dictionaries_) { - RETURN_NOT_OK(ReadInitialDictionaries()); - } - - RecordBatchWithMetadata batch_with_metadata; - if (empty_stream_) { - // ARROW-6006: Degenerate case where stream contains no data, we - // do not bother trying to read a RecordBatch message from the - // stream - return batch_with_metadata; - } - - // Continue to read other dictionaries, if any - std::unique_ptr message; - ARROW_ASSIGN_OR_RAISE(message, ReadNextMessage()); - - while (message != nullptr - && message->type() == MessageType::DICTIONARY_BATCH) { - RETURN_NOT_OK(ReadDictionary(*message)); - ARROW_ASSIGN_OR_RAISE(message, ReadNextMessage()); - } - - if (message == nullptr) { - // End of stream - return batch_with_metadata; - } - - CHECK_HAS_BODY(*message); - ARROW_ASSIGN_OR_RAISE( - auto reader, Buffer::GetReader(message->body()) - ); - IpcReadContext context(&dictionary_memo_, options_, swap_endian_); - return ReadRecordBatchInternal( - *message->metadata(), - schema_, - field_inclusion_mask_, - context, - reader.get() - ); - } - - [[nodiscard]] - std::shared_ptr - schema() const override { - return out_schema_; - } - - [[nodiscard]] - ReadStats - stats() const override { - return stats_; - } - - private: - Result> - ReadNextMessage() { - ARROW_ASSIGN_OR_RAISE( - auto message, message_reader_->ReadNextMessage() - ); - if (message) { - ++stats_.num_messages; - switch (message->type()) { - case MessageType::RECORD_BATCH: - ++stats_.num_record_batches; - break; - case MessageType::DICTIONARY_BATCH: - ++stats_.num_dictionary_batches; - break; - default: - break; - } - } - return std::move(message); - } - - // Read dictionary from dictionary batch - Status - ReadDictionary(const Message& message) { - DictionaryKind kind; - IpcReadContext context(&dictionary_memo_, options_, swap_endian_); - RETURN_NOT_OK(::arrow::ipc::ReadDictionary(message, context, &kind) - ); - switch (kind) { - case DictionaryKind::New: - break; - case DictionaryKind::Delta: - ++stats_.num_dictionary_deltas; - break; - case 
DictionaryKind::Replacement: - ++stats_.num_replaced_dictionaries; - break; - } - return Status::OK(); - } - - Status - ReadInitialDictionaries() { - // We must receive all dictionaries before reconstructing the - // first record batch. Subsequent dictionary deltas modify the memo - std::unique_ptr message; - - // TODO(wesm): In future, we may want to reconcile the ids in the - // stream with those found in the schema - const auto num_dicts = dictionary_memo_.fields().num_dicts(); - for (int i = 0; i < num_dicts; ++i) { - ARROW_ASSIGN_OR_RAISE(message, ReadNextMessage()); - if (!message) { - if (i == 0) { - /// ARROW-6006: If we fail to find any dictionaries in - /// the stream, then it may be that the stream has a - /// schema but no actual data. In such case we - /// communicate that we were unable to find the - /// dictionaries (but there was no failure otherwise), - /// so the caller can decide what to do - empty_stream_ = true; - break; - } // ARROW-6126, the stream terminated before receiving - // the expected number of dictionaries - return Status::Invalid( - "IPC stream ended without " - "reading the expected number (", - num_dicts, - ") of dictionaries" - ); - } - - if (message->type() != MessageType::DICTIONARY_BATCH) { - return Status::Invalid( - "IPC stream did not have the expected number (", - num_dicts, - ") of dictionaries at the start of the stream" - ); - } - RETURN_NOT_OK(ReadDictionary(*message)); - } - - have_read_initial_dictionaries_ = true; - return Status::OK(); - } - - std::unique_ptr message_reader_; - IpcReadOptions options_; - std::vector field_inclusion_mask_; - - bool have_read_initial_dictionaries_ = false; - - // Flag to set in case where we fail to observe all dictionaries in a - // stream, and so the reader should not attempt to parse any messages - bool empty_stream_ = false; - - ReadStats stats_; - - DictionaryMemo dictionary_memo_; - std::shared_ptr schema_, out_schema_; - - bool swap_endian_; - }; - - // ---------------------------------------------------------------------- - // Stream reader constructors - - Result> - RecordBatchStreamReader::Open( - std::unique_ptr message_reader, - const IpcReadOptions& options - ) { - // Private ctor - auto result = std::make_shared(); - RETURN_NOT_OK(result->Open(std::move(message_reader), options)); - return result; - } - - Result> - RecordBatchStreamReader::Open( - io::InputStream* stream, const IpcReadOptions& options - ) { - return Open(MessageReader::Open(stream), options); - } - - Result> - RecordBatchStreamReader::Open( - const std::shared_ptr& stream, - const IpcReadOptions& options - ) { - return Open(MessageReader::Open(stream), options); - } - - // ---------------------------------------------------------------------- - // Reader implementation - - // Common functions used in both the random-access file reader and the - // asynchronous generator - static inline FileBlock - FileBlockFromFlatbuffer(const flatbuf::Block* block) { - return FileBlock{ - block->offset(), block->metaDataLength(), block->bodyLength() - }; - } - - Status - CheckAligned(const FileBlock& block) { - if (!bit_util::IsMultipleOf8(block.offset) - || !bit_util::IsMultipleOf8(block.metadata_length) - || !bit_util::IsMultipleOf8(block.body_length)) { - return Status::Invalid("Unaligned block in IPC file"); - } - return Status::OK(); - } - - static Result> - ReadMessageFromBlock( - const FileBlock& block, - io::RandomAccessFile* file, - const FieldsLoaderFunction& fields_loader - ) { - RETURN_NOT_OK(CheckAligned(block)); - // TODO(wesm): 
this breaks integration tests, see ARROW-3256 - // DCHECK_EQ((*out)->body_length(), block.body_length); - - ARROW_ASSIGN_OR_RAISE( - auto message, - ReadMessage( - block.offset, block.metadata_length, file, fields_loader - ) - ); - return std::move(message); - } - - static Future> - ReadMessageFromBlockAsync( - const FileBlock& block, - io::RandomAccessFile* file, - const io::IOContext& io_context - ) { - if (!bit_util::IsMultipleOf8(block.offset) - || !bit_util::IsMultipleOf8(block.metadata_length) - || !bit_util::IsMultipleOf8(block.body_length)) { - return Status::Invalid("Unaligned block in IPC file"); - } - - // TODO(wesm): this breaks integration tests, see ARROW-3256 - // DCHECK_EQ((*out)->body_length(), block.body_length); - - return ReadMessageAsync( - block.offset, - block.metadata_length, - block.body_length, - file, - io_context - ); - } - - class RecordBatchFileReaderImpl; - - /// A generator of record batches. - /// - /// All batches are yielded in order. - class ARROW_EXPORT WholeIpcFileRecordBatchGenerator { - public: - using Item = std::shared_ptr; - - explicit WholeIpcFileRecordBatchGenerator( - std::shared_ptr state, - std::shared_ptr cached_source, - io::IOContext io_context, - arrow::internal::Executor* executor - ) : - state_(std::move(state)), - cached_source_(std::move(cached_source)), - io_context_(std::move(io_context)), - executor_(executor) {} - - Future operator()(); - Future> ReadBlock(const FileBlock& block); - - static Status ReadDictionaries( - RecordBatchFileReaderImpl* state, - const std::vector>& dictionary_messages - ); - static Result> - ReadRecordBatch(RecordBatchFileReaderImpl* state, Message* message); - - private: - std::shared_ptr state_; - std::shared_ptr cached_source_; - io::IOContext io_context_; - arrow::internal::Executor* executor_; - int index_{0}; - // Odd Future type, but this lets us use All() easily - Future<> read_dictionaries_; - }; - - /// A generator of record batches for use when reading - /// a subset of columns from the file. - /// - /// All batches are yielded in order. - class ARROW_EXPORT SelectiveIpcFileRecordBatchGenerator { - public: - using Item = std::shared_ptr; - - explicit SelectiveIpcFileRecordBatchGenerator( - std::shared_ptr state - ) : - state_(std::move(state)) {} - - Future operator()(); - - private: - std::shared_ptr state_; - int index_{0}; - }; - - class RecordBatchFileReaderImpl : public RecordBatchFileReader { - public: - RecordBatchFileReaderImpl() = default; - - int - num_record_batches() const override { - return static_cast( - internal::FlatBuffersVectorSize(footer_->recordBatches()) - ); - } - - MetadataVersion - version() const override { - return internal::GetMetadataVersion(footer_->version()); - } - - static Status - LoadFieldsSubset( - const flatbuf::RecordBatch* metadata, - const IpcReadOptions& options, - io::RandomAccessFile* file, - const std::shared_ptr& schema, - const std::vector* inclusion_mask, - MetadataVersion metadata_version = MetadataVersion::V5 - ) { - ArrayLoader loader(metadata, metadata_version, options, file); - for (int i = 0; i < schema->num_fields(); ++i) { - const Field& field = *schema->field(i); - if ((inclusion_mask == nullptr) || (*inclusion_mask)[i]) { - // Read field - ArrayData column; - RETURN_NOT_OK(loader.Load(&field, &column)); - if (metadata->length() != column.length) { - return Status::IOError( - "Array length did not match record batch length" - ); - } - } else { - // Skip field. 
This logic must be executed to advance the - // state of the loader to the next field - RETURN_NOT_OK(loader.SkipField(&field)); - } - } - return Status::OK(); - } - - Future> - ReadRecordBatchAsync(int i) { - DCHECK_GE(i, 0); - DCHECK_LT(i, num_record_batches()); - - auto cached_metadata = cached_metadata_.find(i); - if (cached_metadata != cached_metadata_.end()) { - return ReadCachedRecordBatch(i, cached_metadata->second); - } - - return Status::Invalid( - "Asynchronous record batch reading is only " - "supported after a call to " - "PreBufferMetadata or PreBufferBatches" - ); - } - - Result> - ReadRecordBatch(int i) override { - ARROW_ASSIGN_OR_RAISE( - auto batch_with_metadata, ReadRecordBatchWithCustomMetadata(i) - ); - return batch_with_metadata.batch; - } - - Result - ReadRecordBatchWithCustomMetadata(int i) override { - DCHECK_GE(i, 0); - DCHECK_LT(i, num_record_batches()); - - auto cached_metadata = cached_metadata_.find(i); - if (cached_metadata != cached_metadata_.end()) { - auto result = - ReadCachedRecordBatch(i, cached_metadata->second).result(); - ARROW_ASSIGN_OR_RAISE(auto batch, result); - ARROW_ASSIGN_OR_RAISE( - auto message_obj, cached_metadata->second.result() - ); - ARROW_ASSIGN_OR_RAISE( - auto message, GetFlatbufMessage(message_obj) - ); - std::shared_ptr custom_metadata; - if (message->custom_metadata() != nullptr) { - RETURN_NOT_OK(internal::GetKeyValueMetadata( - message->custom_metadata(), &custom_metadata - )); - } - return RecordBatchWithMetadata{ - std::move(batch), std::move(custom_metadata) - }; - } - - RETURN_NOT_OK(WaitForDictionaryReadFinished()); - - FieldsLoaderFunction fields_loader = {}; - if (!field_inclusion_mask_.empty()) { - auto& schema = schema_; - auto& inclusion_mask = field_inclusion_mask_; - auto& read_options = options_; - fields_loader = [schema, inclusion_mask, read_options]( - const void* metadata, - io::RandomAccessFile* file - ) { - return LoadFieldsSubset( - static_cast(metadata), - read_options, - file, - schema, - &inclusion_mask - ); - }; - } - ARROW_ASSIGN_OR_RAISE( - auto message, - ReadMessageFromBlock(GetRecordBatchBlock(i), fields_loader) - ); - - CHECK_HAS_BODY(*message); - ARROW_ASSIGN_OR_RAISE( - auto reader, Buffer::GetReader(message->body()) - ); - IpcReadContext context(&dictionary_memo_, options_, swap_endian_); - ARROW_ASSIGN_OR_RAISE( - auto batch_with_metadata, - ReadRecordBatchInternal( - *message->metadata(), - schema_, - field_inclusion_mask_, - context, - reader.get() - ) - ); - stats_.num_record_batches.fetch_add(1, std::memory_order_relaxed); - return batch_with_metadata; - } - - Result - CountRows() override { - int64_t total = 0; - for (int i = 0; i < num_record_batches(); i++) { - ARROW_ASSIGN_OR_RAISE( - auto outer_message, - ReadMessageFromBlock(GetRecordBatchBlock(i)) - ); - auto metadata = outer_message->metadata(); - const flatbuf::Message* message = nullptr; - RETURN_NOT_OK(internal::VerifyMessage( - metadata->data(), metadata->size(), &message - )); - const auto* batch = message->header_as_RecordBatch(); - if (batch == nullptr) { - return Status::IOError( - "Header-type of flatbuffer-encoded " - "Message is not RecordBatch." 
- ); - } - total += batch->length(); - } - return total; - } - - Status - Open( - const std::shared_ptr& file, - int64_t footer_offset, - const IpcReadOptions& options - ) { - owned_file_ = file; - metadata_cache_ = std::make_shared( - file, file->io_context(), options.pre_buffer_cache_options - ); - return Open(file.get(), footer_offset, options); - } - - Status - Open( - io::RandomAccessFile* file, - int64_t footer_offset, - const IpcReadOptions& options - ) { - // The metadata_cache_ may have already been constructed with an - // owned file in the owning overload of Open - if (!metadata_cache_) { - metadata_cache_ = - std::make_shared( - file, - file->io_context(), - options.pre_buffer_cache_options - ); - } - file_ = file; - options_ = options; - footer_offset_ = footer_offset; - RETURN_NOT_OK(ReadFooter()); - - // Get the schema and record any observed dictionaries - RETURN_NOT_OK(UnpackSchemaMessage( - footer_->schema(), - options, - &dictionary_memo_, - &schema_, - &out_schema_, - &field_inclusion_mask_, - &swap_endian_ - )); - stats_.num_messages.fetch_add(1, std::memory_order_relaxed); - return Status::OK(); - } - - Future<> - OpenAsync( - const std::shared_ptr& file, - int64_t footer_offset, - const IpcReadOptions& options - ) { - owned_file_ = file; - metadata_cache_ = std::make_shared( - file, file->io_context(), options.pre_buffer_cache_options - ); - return OpenAsync(file.get(), footer_offset, options); - } - - Future<> - OpenAsync( - io::RandomAccessFile* file, - int64_t footer_offset, - const IpcReadOptions& options - ) { - // The metadata_cache_ may have already been constructed with an - // owned file in the owning overload of OpenAsync - if (!metadata_cache_) { - metadata_cache_ = - std::make_shared( - file, - file->io_context(), - options.pre_buffer_cache_options - ); - } - file_ = file; - options_ = options; - footer_offset_ = footer_offset; - auto* cpu_executor = ::arrow::internal::GetCpuThreadPool(); - auto self = std::dynamic_pointer_cast( - shared_from_this() - ); - return ReadFooterAsync(cpu_executor) - .Then([self, options]() -> Status { - // Get the schema and record any observed dictionaries - RETURN_NOT_OK(UnpackSchemaMessage( - self->footer_->schema(), - options, - &self->dictionary_memo_, - &self->schema_, - &self->out_schema_, - &self->field_inclusion_mask_, - &self->swap_endian_ - )); - self->stats_.num_messages.fetch_add( - 1, std::memory_order_relaxed - ); - return Status::OK(); - }); - } - - std::shared_ptr - schema() const override { - return out_schema_; - } - - std::shared_ptr - metadata() const override { - return metadata_; - } - - ReadStats - stats() const override { - return stats_.poll(); - } - - Result>> - GetRecordBatchGenerator( - const bool coalesce, - const io::IOContext& io_context, - const io::CacheOptions cache_options, - arrow::internal::Executor* executor - ) override { - auto state = std::dynamic_pointer_cast( - shared_from_this() - ); - // Prebuffering causes us to use a lot of futures which, at the - // moment, can only slow things down when we are doing zero-copy - // in-memory reads. 
- // - // Prebuffering's read patterns are also slightly worse than the - // alternative when doing whole-file reads because the logic is not - // in place to recognize we can just read the entire file up-front - if (!options_.included_fields.empty() - && options_.included_fields.size() != schema_->fields().size() - && !file_->supports_zero_copy()) { - RETURN_NOT_OK(state->PreBufferMetadata({})); - return SelectiveIpcFileRecordBatchGenerator(std::move(state)); - } - - std::shared_ptr cached_source; - if (coalesce && !file_->supports_zero_copy()) { - if (!owned_file_) { - return Status::Invalid( - "Cannot coalesce without an owned file" - ); - } - // Since the user is asking for all fields then we can cache the - // entire file (up to the footer) - cached_source = std::make_shared( - file_, io_context, cache_options - ); - RETURN_NOT_OK(cached_source->Cache({{0, footer_offset_}})); - } - return WholeIpcFileRecordBatchGenerator( - std::move(state), std::move(cached_source), io_context, executor - ); - } - - Status - DoPreBufferMetadata(const std::vector& indices) { - RETURN_NOT_OK(CacheMetadata(indices)); - EnsureDictionaryReadStarted(); - Future<> all_metadata_ready = WaitForMetadatas(indices); - for (int index : indices) { - Future> metadata_loaded = - all_metadata_ready.Then( - [this, index]() -> Result> { - stats_.num_messages.fetch_add( - 1, std::memory_order_relaxed - ); - FileBlock block = GetRecordBatchBlock(index); - ARROW_ASSIGN_OR_RAISE( - std::shared_ptr metadata, - metadata_cache_->Read( - {block.offset, block.metadata_length} - ) - ); - return ReadMessage(std::move(metadata), nullptr); - } - ); - cached_metadata_.emplace(index, metadata_loaded); - } - return Status::OK(); - } - - std::vector - AllIndices() const { - std::vector all_indices(num_record_batches()); - std::iota(all_indices.begin(), all_indices.end(), 0); - return all_indices; - } - - Status - PreBufferMetadata(const std::vector& indices) override { - if (indices.empty()) { - return DoPreBufferMetadata(AllIndices()); - } - return DoPreBufferMetadata(indices); - } - - private: - friend class WholeIpcFileRecordBatchGenerator; - - struct AtomicReadStats { - std::atomic num_messages{0}; - std::atomic num_record_batches{0}; - std::atomic num_dictionary_batches{0}; - std::atomic num_dictionary_deltas{0}; - std::atomic num_replaced_dictionaries{0}; - - /// \brief Capture a copy of the current counters - ReadStats - poll() const { - ReadStats stats; - stats.num_messages = - num_messages.load(std::memory_order_relaxed); - stats.num_record_batches = - num_record_batches.load(std::memory_order_relaxed); - stats.num_dictionary_batches = - num_dictionary_batches.load(std::memory_order_relaxed); - stats.num_dictionary_deltas = - num_dictionary_deltas.load(std::memory_order_relaxed); - stats.num_replaced_dictionaries = - num_replaced_dictionaries.load(std::memory_order_relaxed); - return stats; - } - }; - - FileBlock - GetRecordBatchBlock(int i) const { - return FileBlockFromFlatbuffer(footer_->recordBatches()->Get(i)); - } - - FileBlock - GetDictionaryBlock(int i) const { - return FileBlockFromFlatbuffer(footer_->dictionaries()->Get(i)); - } - - Result> - ReadMessageFromBlock( - const FileBlock& block, - const FieldsLoaderFunction& fields_loader = {} - ) { - ARROW_ASSIGN_OR_RAISE( - auto message, - arrow::ipc::ReadMessageFromBlock(block, file_, fields_loader) - ); - stats_.num_messages.fetch_add(1, std::memory_order_relaxed); - return std::move(message); - } - - Status - ReadDictionaries() { - // Read all the dictionaries - 
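// [editor's note: added commentary, not part of the original file]
// In the file format, every dictionary batch is listed in the footer, so
// they can all be loaded eagerly here (unlike the stream reader, which
// interleaves them with record batches). ReadOneDictionary() below rejects
// DictionaryKind::Replacement because a file may carry at most one base
// dictionary (plus deltas) per dictionary id.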
IpcReadContext context(&dictionary_memo_, options_, swap_endian_); - for (int i = 0; i < num_dictionaries(); ++i) { - ARROW_ASSIGN_OR_RAISE( - auto message, ReadMessageFromBlock(GetDictionaryBlock(i)) - ); - RETURN_NOT_OK(ReadOneDictionary(message.get(), context)); - stats_.num_dictionary_batches.fetch_add( - 1, std::memory_order_relaxed - ); - } - return Status::OK(); - } - - Status - ReadOneDictionary(Message* message, const IpcReadContext& context) { - CHECK_HAS_BODY(*message); - ARROW_ASSIGN_OR_RAISE( - auto reader, Buffer::GetReader(message->body()) - ); - DictionaryKind kind; - RETURN_NOT_OK(ReadDictionary( - *message->metadata(), context, &kind, reader.get() - )); - if (kind == DictionaryKind::Replacement) { - return Status::Invalid( - "Unsupported dictionary replacement in IPC file" - ); - } - if (kind == DictionaryKind::Delta) { - stats_.num_dictionary_deltas.fetch_add( - 1, std::memory_order_relaxed - ); - } - return Status::OK(); - } - - void - AddDictionaryRanges(std::vector* ranges) const { - // Adds all dictionaries to the range cache - for (int i = 0; i < num_dictionaries(); ++i) { - FileBlock block = GetDictionaryBlock(i); - ranges->push_back( - {block.offset, block.metadata_length + block.body_length} - ); - } - } - - void - AddMetadataRanges( - const std::vector& indices, std::vector* ranges - ) { - for (int index : indices) { - FileBlock block = GetRecordBatchBlock(static_cast(index)); - ranges->push_back({block.offset, block.metadata_length}); - } - } - - Status - CacheMetadata(const std::vector& indices) { - std::vector ranges; - if (!read_dictionaries_) { - AddDictionaryRanges(&ranges); - } - AddMetadataRanges(indices, &ranges); - return metadata_cache_->Cache(std::move(ranges)); - } - - void - EnsureDictionaryReadStarted() { - if (!dictionary_load_finished_.is_valid()) { - read_dictionaries_ = true; - std::vector ranges; - AddDictionaryRanges(&ranges); - dictionary_load_finished_ = - metadata_cache_->WaitFor(std::move(ranges)).Then([this] { - return ReadDictionaries(); - }); - } - } - - Status - WaitForDictionaryReadFinished() { - if (!read_dictionaries_) { - RETURN_NOT_OK(ReadDictionaries()); - read_dictionaries_ = true; - return Status::OK(); - } - if (dictionary_load_finished_.is_valid()) { - return dictionary_load_finished_.status(); - } - // Dictionaries were previously loaded synchronously - return Status::OK(); - } - - Future<> - WaitForMetadatas(const std::vector& indices) { - std::vector ranges; - AddMetadataRanges(indices, &ranges); - return metadata_cache_->WaitFor(std::move(ranges)); - } - - Result - GetIpcReadContext( - const flatbuf::Message* message, const flatbuf::RecordBatch* batch - ) { - IpcReadContext context(&dictionary_memo_, options_, swap_endian_); - Compression::type compression; - RETURN_NOT_OK(GetCompression(batch, &compression)); - if (context.compression == Compression::UNCOMPRESSED - && message->version() == flatbuf::MetadataVersion::V4) { - // Possibly obtain codec information from experimental - // serialization format in 0.17.x - RETURN_NOT_OK(GetCompressionExperimental(message, &compression) - ); - } - context.compression = compression; - context.metadata_version = - internal::GetMetadataVersion(message->version()); - return std::move(context); - } - - Result - GetBatchFromMessage(const flatbuf::Message* message) { - const auto* batch = message->header_as_RecordBatch(); - if (batch == nullptr) { - return Status::IOError( - "Header-type of flatbuffer-encoded " - "Message is not RecordBatch." 
- ); - } - return batch; - } - - Result - GetFlatbufMessage(const std::shared_ptr& message) { - const Buffer& metadata = *message->metadata(); - const flatbuf::Message* flatbuf_message = nullptr; - RETURN_NOT_OK(internal::VerifyMessage( - metadata.data(), metadata.size(), &flatbuf_message - )); - return flatbuf_message; - } - - struct CachedRecordBatchReadContext { - CachedRecordBatchReadContext( - std::shared_ptr sch, - const flatbuf::RecordBatch* batch, - IpcReadContext context, - io::RandomAccessFile* file, - std::shared_ptr owned_file, - int64_t block_data_offset - ) : - schema(std::move(sch)), - context(context), - file(file), - owned_file(std::move(owned_file)), - loader( - batch, - context.metadata_version, - context.options, - block_data_offset - ), - columns(schema->num_fields()), - cache( - file, file->io_context(), io::CacheOptions::LazyDefaults() - ), - length(batch->length()) {} - - Status - CalculateLoadRequest() { - std::shared_ptr out_schema; - RETURN_NOT_OK(GetInclusionMaskAndOutSchema( - schema, - context.options.included_fields, - &inclusion_mask, - &out_schema - )); - - for (int i = 0; i < schema->num_fields(); ++i) { - const Field& field = *schema->field(i); - if (inclusion_mask.empty() || inclusion_mask[i]) { - // Read field - auto column = std::make_shared(); - RETURN_NOT_OK(loader.Load(&field, column.get())); - if (length != column->length) { - return Status::IOError( - "Array length did not match " - "record batch length" - ); - } - columns[i] = std::move(column); - if (!inclusion_mask.empty()) { - filtered_columns.push_back(columns[i]); - filtered_fields.push_back(schema->field(i)); - } - } else { - // Skip field. This logic must be executed to advance - // the state of the loader to the next field - RETURN_NOT_OK(loader.SkipField(&field)); - } - } - if (!inclusion_mask.empty()) { - filtered_schema = ::arrow::schema( - std::move(filtered_fields), schema->metadata() - ); - } else { - filtered_schema = schema; - } - return Status::OK(); - } - - Future<> - ReadAsync() { - RETURN_NOT_OK(cache.Cache(loader.read_request().ranges_to_read() - )); - return cache.WaitFor(loader.read_request().ranges_to_read()); - } - - Result> - CreateRecordBatch() { - std::vector> buffers; - for (const auto& range_to_read : - loader.read_request().ranges_to_read()) { - ARROW_ASSIGN_OR_RAISE( - auto buffer, cache.Read(range_to_read) - ); - buffers.push_back(std::move(buffer)); - } - loader.read_request().FulfillRequest(buffers); - - // Dictionary resolution needs to happen on the unfiltered - // columns, because fields are mapped structurally (by path in - // the original schema). 
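The deleted CachedRecordBatchReadContext above implements column projection: included_fields builds an inclusion mask, skipped fields still advance the ArrayLoader, and dictionaries are resolved against the unfiltered columns. In the upstream (non-vendored) Arrow API the same projection is exposed as IpcReadOptions::included_fields; a hedged sketch, with the file path and field indices as illustrative assumptions:

```cpp
// Sketch: column projection at IPC read time via the upstream Arrow API.
// The path "data.arrow" and the indices {0, 2} are illustrative.
#include <arrow/io/file.h>
#include <arrow/ipc/options.h>
#include <arrow/ipc/reader.h>

arrow::Status ReadProjected(const std::string& path) {
    ARROW_ASSIGN_OR_RAISE(auto file, arrow::io::ReadableFile::Open(path));
    auto options = arrow::ipc::IpcReadOptions::Defaults();
    // Top-level field indices to materialize; other columns are skipped,
    // though dictionaries are still resolved against the full schema.
    options.included_fields = {0, 2};
    ARROW_ASSIGN_OR_RAISE(
        auto reader, arrow::ipc::RecordBatchFileReader::Open(file, options));
    for (int i = 0; i < reader->num_record_batches(); ++i) {
        ARROW_ASSIGN_OR_RAISE(auto batch, reader->ReadRecordBatch(i));
        // batch->schema() now contains only the selected fields.
    }
    return arrow::Status::OK();
}
```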
- RETURN_NOT_OK(ResolveDictionaries( - columns, - *context.dictionary_memo, - context.options.memory_pool - )); - if (!inclusion_mask.empty()) { - columns.clear(); - } else { - filtered_columns = std::move(columns); - } - - if (context.compression != Compression::UNCOMPRESSED) { - RETURN_NOT_OK(DecompressBuffers( - context.compression, context.options, &filtered_columns - )); - } - - // swap endian in a set of ArrayData if necessary (swap_endian - // == true) - if (context.swap_endian) { - for (auto& filtered_column : filtered_columns) { - ARROW_ASSIGN_OR_RAISE( - filtered_column, - arrow::internal::SwapEndianArrayData(filtered_column - ) - ); - } - } - return RecordBatch::Make( - std::move(filtered_schema), - length, - std::move(filtered_columns) - ); - } - - std::shared_ptr schema; - IpcReadContext context; - io::RandomAccessFile* file; - std::shared_ptr owned_file; - - ArrayLoader loader; - ArrayDataVector columns; - io::internal::ReadRangeCache cache; - int64_t length; - ArrayDataVector filtered_columns; - FieldVector filtered_fields; - std::shared_ptr filtered_schema; - std::vector inclusion_mask; - }; - - Future> - ReadCachedRecordBatch( - int index, const Future>& message_fut - ) { - stats_.num_record_batches.fetch_add(1, std::memory_order_relaxed); - return dictionary_load_finished_ - .Then([message_fut] { return message_fut; }) - .Then( - [this, index](const std::shared_ptr& message_obj - ) -> Future> { - FileBlock block = GetRecordBatchBlock(index); - ARROW_ASSIGN_OR_RAISE( - auto message, GetFlatbufMessage(message_obj) - ); - ARROW_ASSIGN_OR_RAISE( - auto batch, GetBatchFromMessage(message) - ); - ARROW_ASSIGN_OR_RAISE( - auto context, GetIpcReadContext(message, batch) - ); - - auto read_context = - std::make_shared( - schema_, - batch, - context, - file_, - owned_file_, - block.offset - + static_cast(block.metadata_length - ) - ); - RETURN_NOT_OK(read_context->CalculateLoadRequest()); - return read_context->ReadAsync().Then([read_context] { - return read_context->CreateRecordBatch(); - }); - } - ); - } - - Status - ReadFooter() { - auto fut = ReadFooterAsync(/*executor=*/nullptr); - return fut.status(); - } - - Future<> - ReadFooterAsync(arrow::internal::Executor* executor) { - const auto magic_size = static_cast(strlen(kArrowMagicBytes)); - - if (footer_offset_ <= magic_size * 2 + 4) { - return Status::Invalid("File is too small: ", footer_offset_); - } - - int file_end_size = static_cast(magic_size + sizeof(int32_t)); - auto self = std::dynamic_pointer_cast( - shared_from_this() - ); - auto read_magic = - file_->ReadAsync(footer_offset_ - file_end_size, file_end_size); - if (executor != nullptr) { - read_magic = executor->Transfer(std::move(read_magic)); - } - return read_magic - .Then( - [=](const std::shared_ptr& buffer - ) -> Future> { - const int64_t expected_footer_size = - magic_size + sizeof(int32_t); - if (buffer->size() < expected_footer_size) { - return Status::Invalid( - "Unable to read ", - expected_footer_size, - "from end of file" - ); - } - - if (static_cast( - memcmp( - buffer->data() + sizeof(int32_t), - kArrowMagicBytes, - magic_size - ) - != 0 - ) - != 0) { - return Status::Invalid("Not an Arrow file"); - } - - int32_t footer_length = bit_util::FromLittleEndian( - *reinterpret_cast(buffer->data()) - ); - - if (footer_length <= 0 - || footer_length - > self->footer_offset_ - magic_size * 2 - 4) { - return Status::Invalid( - "File is smaller than indicated metadata size" - ); - } - - // Now read the footer - auto read_footer = self->file_->ReadAsync( - 
self->footer_offset_ - footer_length - - file_end_size, - footer_length - ); - if (executor != nullptr) { - read_footer = - executor->Transfer(std::move(read_footer)); - } - return read_footer; - } - ) - .Then([=](const std::shared_ptr& buffer) -> Status { - self->footer_buffer_ = buffer; - const auto* const data = self->footer_buffer_->data(); - const auto size = self->footer_buffer_->size(); - if (!internal::VerifyFlatbuffers( - data, size - )) { - return Status::IOError( - "Verification of flatbuffer-encoded Footer " - "failed." - ); - } - self->footer_ = flatbuf::GetFooter(data); - - const auto* fb_metadata = self->footer_->custom_metadata(); - if (fb_metadata != nullptr) { - std::shared_ptr md; - RETURN_NOT_OK( - internal::GetKeyValueMetadata(fb_metadata, &md) - ); - self->metadata_ = std::move(md); // const-ify - } - return Status::OK(); - }); - } - - int - num_dictionaries() const { - return static_cast( - internal::FlatBuffersVectorSize(footer_->dictionaries()) - ); - } - - io::RandomAccessFile* file_{NULLPTR}; - IpcReadOptions options_; - std::vector field_inclusion_mask_; - - std::shared_ptr owned_file_; - - // The location where the Arrow file layout ends. May be the end of the - // file or some other location if embedded in a larger file. - int64_t footer_offset_{0}; - - // Footer metadata - std::shared_ptr footer_buffer_; - const flatbuf::Footer* footer_{NULLPTR}; - std::shared_ptr metadata_; - - bool read_dictionaries_ = false; - DictionaryMemo dictionary_memo_; - - // Reconstructed schema, including any read dictionaries - std::shared_ptr schema_; - // Schema with deselected fields dropped - std::shared_ptr out_schema_; - - AtomicReadStats stats_; - std::shared_ptr metadata_cache_; - std::unordered_set cached_data_blocks_; - Future<> dictionary_load_finished_; - std::unordered_map>> - cached_metadata_; - std::unordered_map> cached_data_requests_; - - bool swap_endian_; - }; - - Result> - RecordBatchFileReader::Open( - io::RandomAccessFile* file, const IpcReadOptions& options - ) { - ARROW_ASSIGN_OR_RAISE(int64_t footer_offset, file->GetSize()); - return Open(file, footer_offset, options); - } - - Result> - RecordBatchFileReader::Open( - io::RandomAccessFile* file, - int64_t footer_offset, - const IpcReadOptions& options - ) { - auto result = std::make_shared(); - RETURN_NOT_OK(result->Open(file, footer_offset, options)); - return result; - } - - Result> - RecordBatchFileReader::Open( - const std::shared_ptr& file, - const IpcReadOptions& options - ) { - ARROW_ASSIGN_OR_RAISE(int64_t footer_offset, file->GetSize()); - return Open(file, footer_offset, options); - } - - Result> - RecordBatchFileReader::Open( - const std::shared_ptr& file, - int64_t footer_offset, - const IpcReadOptions& options - ) { - auto result = std::make_shared(); - RETURN_NOT_OK(result->Open(file, footer_offset, options)); - return result; - } - - Future> - RecordBatchFileReader::OpenAsync( - const std::shared_ptr& file, - const IpcReadOptions& options - ) { - ARROW_ASSIGN_OR_RAISE(int64_t footer_offset, file->GetSize()); - return OpenAsync(file, footer_offset, options); - } - - Future> - RecordBatchFileReader::OpenAsync( - io::RandomAccessFile* file, const IpcReadOptions& options - ) { - ARROW_ASSIGN_OR_RAISE(int64_t footer_offset, file->GetSize()); - return OpenAsync(file, footer_offset, options); - } - - Future> - RecordBatchFileReader::OpenAsync( - const std::shared_ptr& file, - int64_t footer_offset, - const IpcReadOptions& options - ) { - auto result = std::make_shared(); - return 
result->OpenAsync(file, footer_offset, options) - .Then([=]() -> Result> { - return result; - }); - } - - Future> - RecordBatchFileReader::OpenAsync( - io::RandomAccessFile* file, - int64_t footer_offset, - const IpcReadOptions& options - ) { - auto result = std::make_shared(); - return result->OpenAsync(file, footer_offset, options) - .Then([=]() -> Result> { - return result; - }); - } - - Future - SelectiveIpcFileRecordBatchGenerator::operator()() { - int index = index_++; - if (index >= state_->num_record_batches()) { - return IterationEnd(); - } - return state_->ReadRecordBatchAsync(index); - } - - Future - WholeIpcFileRecordBatchGenerator::operator()() { - auto state = state_; - if (!read_dictionaries_.is_valid()) { - std::vector>> messages( - state->num_dictionaries() - ); - for (int i = 0; i < state->num_dictionaries(); i++) { - auto block = FileBlockFromFlatbuffer( - state->footer_->dictionaries()->Get(i) - ); - messages[i] = ReadBlock(block); - } - auto read_messages = All(std::move(messages)); - if (executor_ != nullptr) { - read_messages = executor_->Transfer(read_messages); - } - read_dictionaries_ = read_messages.Then( - [=](const std::vector>>& - maybe_messages) -> Status { - ARROW_ASSIGN_OR_RAISE( - auto messages, - arrow::internal::UnwrapOrRaise(maybe_messages) - ); - return ReadDictionaries(state.get(), messages); - } - ); - } - if (index_ >= state_->num_record_batches()) { - return Future::MakeFinished(IterationTraits::End()); - } - auto block = FileBlockFromFlatbuffer( - state->footer_->recordBatches()->Get(index_++) - ); - auto read_message = ReadBlock(block); - auto read_messages = - read_dictionaries_.Then([read_message]() { return read_message; }); - // Force transfer. This may be wasteful in some cases, but ensures we - // get off the I/O threads as soon as possible, and ensures we don't - // decode record batches synchronously in the case that the message read - // has already finished. 
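As the comment above notes, the generator forces a transfer so record batches are never decoded synchronously on an I/O thread. In upstream Arrow the equivalent entry point is RecordBatchFileReader::GetRecordBatchGenerator, whose defaulted parameters (including an optional executor for exactly this off-I/O-thread decoding) vary by Arrow version; treat the call below as a sketch, not the vendored method:

```cpp
// Sketch: draining an IPC file through the async generator (upstream
// Arrow API). Only the leading `coalesce` argument is passed; the
// remaining defaulted parameters, including the optional executor,
// differ across Arrow versions and are assumptions here.
#include <arrow/io/file.h>
#include <arrow/ipc/reader.h>
#include <arrow/util/async_generator.h>

arrow::Status CollectBatches(const std::string& path) {
    ARROW_ASSIGN_OR_RAISE(auto file, arrow::io::ReadableFile::Open(path));
    ARROW_ASSIGN_OR_RAISE(auto reader,
                          arrow::ipc::RecordBatchFileReader::Open(file));
    ARROW_ASSIGN_OR_RAISE(auto gen,
                          reader->GetRecordBatchGenerator(/*coalesce=*/true));
    // Collect every batch; result() blocks until the future completes.
    auto fut = arrow::CollectAsyncGenerator(std::move(gen));
    ARROW_ASSIGN_OR_RAISE(auto batches, fut.result());
    return arrow::Status::OK();
}
```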
- if (executor_ != nullptr) { - auto* executor = executor_; - return read_messages.Then( - [=](const std::shared_ptr& message) -> Future { - return DeferNotOk(executor->Submit([=]() { - return ReadRecordBatch(state.get(), message.get()); - })); - } - ); - } - return read_messages.Then( - [=](const std::shared_ptr& message) -> Result { - return ReadRecordBatch(state.get(), message.get()); - } - ); - } - - Future> - WholeIpcFileRecordBatchGenerator::ReadBlock(const FileBlock& block) { - if (cached_source_) { - auto cached_source = cached_source_; - io::ReadRange range{ - block.offset, block.metadata_length + block.body_length - }; - auto* pool = state_->options_.memory_pool; - return cached_source->WaitFor({range}).Then( - [cached_source, pool, range]( - ) -> Result> { - ARROW_ASSIGN_OR_RAISE( - auto buffer, cached_source->Read(range) - ); - io::BufferReader stream(std::move(buffer)); - return ReadMessage(&stream, pool); - } - ); - } - return ReadMessageFromBlockAsync(block, state_->file_, io_context_); - } - - Status - WholeIpcFileRecordBatchGenerator::ReadDictionaries( - RecordBatchFileReaderImpl* state, - const std::vector>& dictionary_messages - ) { - IpcReadContext context( - &state->dictionary_memo_, state->options_, state->swap_endian_ - ); - for (const auto& message : dictionary_messages) { - RETURN_NOT_OK(state->ReadOneDictionary(message.get(), context)); - } - return Status::OK(); - } - - Result> - WholeIpcFileRecordBatchGenerator::ReadRecordBatch( - RecordBatchFileReaderImpl* state, Message* message - ) { - CHECK_HAS_BODY(*message); - ARROW_ASSIGN_OR_RAISE(auto reader, Buffer::GetReader(message->body())); - IpcReadContext context( - &state->dictionary_memo_, state->options_, state->swap_endian_ - ); - ARROW_ASSIGN_OR_RAISE( - auto batch_with_metadata, - ReadRecordBatchInternal( - *message->metadata(), - state->schema_, - state->field_inclusion_mask_, - context, - reader.get() - ) - ); - return batch_with_metadata.batch; - } - - Status - Listener::OnEOS() { - return Status::OK(); - } - - Status - Listener::OnSchemaDecoded(std::shared_ptr schema) { - return Status::OK(); - } - - Status - Listener::OnRecordBatchDecoded(std::shared_ptr record_batch) { - return Status::NotImplemented( - "OnRecordBatchDecoded() callback isn't implemented" - ); - } - - class StreamDecoder::StreamDecoderImpl : public MessageDecoderListener { - private: - enum State { - SCHEMA, - INITIAL_DICTIONARIES, - RECORD_BATCHES, - EOS, - }; - - public: - explicit StreamDecoderImpl( - std::shared_ptr listener, IpcReadOptions options - ) : - listener_(std::move(listener)), - options_(std::move(options)), - message_decoder_( - std::shared_ptr(this, [](void*) {}), - options_.memory_pool - ) {} - - Status - OnMessageDecoded(std::unique_ptr message) override { - ++stats_.num_messages; - switch (state_) { - case State::SCHEMA: - ARROW_RETURN_NOT_OK(OnSchemaMessageDecoded(std::move(message - ))); - break; - case State::INITIAL_DICTIONARIES: - ARROW_RETURN_NOT_OK( - OnInitialDictionaryMessageDecoded(std::move(message)) - ); - break; - case State::RECORD_BATCHES: - ARROW_RETURN_NOT_OK( - OnRecordBatchMessageDecoded(std::move(message)) - ); - break; - case State::EOS: - break; - } - return Status::OK(); - } - - Status - OnEOS() override { - state_ = State::EOS; - return listener_->OnEOS(); - } - - Status - Consume(const uint8_t* data, int64_t size) { - return message_decoder_.Consume(data, size); - } - - Status - Consume(std::shared_ptr buffer) { - return message_decoder_.Consume(std::move(buffer)); - } - - [[nodiscard]] - 
std::shared_ptr - schema() const { - return out_schema_; - } - - [[nodiscard]] - int64_t - next_required_size() const { - return message_decoder_.next_required_size(); - } - - [[nodiscard]] - ReadStats - stats() const { - return stats_; - } - - private: - Status - OnSchemaMessageDecoded(std::unique_ptr message) { - RETURN_NOT_OK(UnpackSchemaMessage( - *message, - options_, - &dictionary_memo_, - &schema_, - &out_schema_, - &field_inclusion_mask_, - &swap_endian_ - )); - - n_required_dictionaries_ = dictionary_memo_.fields().num_fields(); - if (n_required_dictionaries_ == 0) { - state_ = State::RECORD_BATCHES; - RETURN_NOT_OK(listener_->OnSchemaDecoded(schema_)); - } else { - state_ = State::INITIAL_DICTIONARIES; - } - return Status::OK(); - } - - Status - OnInitialDictionaryMessageDecoded(std::unique_ptr message) { - if (message->type() != MessageType::DICTIONARY_BATCH) { - return Status::Invalid( - "IPC stream did not have the expected number (", - dictionary_memo_.fields().num_fields(), - ") of dictionaries at the start of the stream" - ); - } - RETURN_NOT_OK(ReadDictionary(*message)); - n_required_dictionaries_--; - if (n_required_dictionaries_ == 0) { - state_ = State::RECORD_BATCHES; - ARROW_RETURN_NOT_OK(listener_->OnSchemaDecoded(schema_)); - } - return Status::OK(); - } - - Status - OnRecordBatchMessageDecoded(std::unique_ptr message) { - if (message->type() == MessageType::DICTIONARY_BATCH) { - return ReadDictionary(*message); - } - CHECK_HAS_BODY(*message); - ARROW_ASSIGN_OR_RAISE( - auto reader, Buffer::GetReader(message->body()) - ); - IpcReadContext context(&dictionary_memo_, options_, swap_endian_); - ARROW_ASSIGN_OR_RAISE( - auto batch_with_metadata, - ReadRecordBatchInternal( - *message->metadata(), - schema_, - field_inclusion_mask_, - context, - reader.get() - ) - ); - ++stats_.num_record_batches; - return listener_->OnRecordBatchDecoded( - std::move(batch_with_metadata.batch) - ); - } - - // Read dictionary from dictionary batch - Status - ReadDictionary(const Message& message) { - DictionaryKind kind; - IpcReadContext context(&dictionary_memo_, options_, swap_endian_); - RETURN_NOT_OK(::arrow::ipc::ReadDictionary(message, context, &kind) - ); - ++stats_.num_dictionary_batches; - switch (kind) { - case DictionaryKind::New: - break; - case DictionaryKind::Delta: - ++stats_.num_dictionary_deltas; - break; - case DictionaryKind::Replacement: - ++stats_.num_replaced_dictionaries; - break; - } - return Status::OK(); - } - - std::shared_ptr listener_; - const IpcReadOptions options_; - State state_{State::SCHEMA}; - MessageDecoder message_decoder_; - std::vector field_inclusion_mask_; - int n_required_dictionaries_{0}; - DictionaryMemo dictionary_memo_; - std::shared_ptr schema_, out_schema_; - ReadStats stats_; - bool swap_endian_; - }; - - StreamDecoder::StreamDecoder( - std::shared_ptr listener, IpcReadOptions options - ) { - impl_ = std::make_unique( - std::move(listener), std::move(options) - ); - } - - StreamDecoder::~StreamDecoder() = default; - - Status - StreamDecoder::Consume(const uint8_t* data, int64_t size) { - return impl_->Consume(data, size); - } - Status - StreamDecoder::Consume(std::shared_ptr buffer) { - return impl_->Consume(std::move(buffer)); - } - - std::shared_ptr - StreamDecoder::schema() const { - return impl_->schema(); - } - - int64_t - StreamDecoder::next_required_size() const { - return impl_->next_required_size(); - } - - ReadStats - StreamDecoder::stats() const { - return impl_->stats(); - } - - Result> - ReadSchema(io::InputStream* stream, 
DictionaryMemo* dictionary_memo) { - std::unique_ptr reader = MessageReader::Open(stream); - ARROW_ASSIGN_OR_RAISE( - std::unique_ptr message, reader->ReadNextMessage() - ); - if (!message) { - return Status::Invalid( - "Tried reading schema message, was null or length 0" - ); - } - CHECK_MESSAGE_TYPE(MessageType::SCHEMA, message->type()); - return ReadSchema(*message, dictionary_memo); - } - - Result> - ReadSchema(const Message& message, DictionaryMemo* dictionary_memo) { - std::shared_ptr result; - RETURN_NOT_OK( - internal::GetSchema(message.header(), dictionary_memo, &result) - ); - return result; - } - - Result> - ReadTensor(io::InputStream* file) { - std::unique_ptr message; - RETURN_NOT_OK(ReadContiguousPayload(file, &message)); - return ReadTensor(*message); - } - - Result> - ReadTensor(const Message& message) { - std::shared_ptr type; - std::vector shape; - std::vector strides; - std::vector dim_names; - CHECK_HAS_BODY(message); - RETURN_NOT_OK(internal::GetTensorMetadata( - *message.metadata(), &type, &shape, &strides, &dim_names - )); - return Tensor::Make(type, message.body(), shape, strides, dim_names); - } - - namespace { - - Result> - ReadSparseCOOIndex( - const flatbuf::SparseTensor* sparse_tensor, - const std::vector& shape, - int64_t non_zero_length, - io::RandomAccessFile* file - ) { - const auto* sparse_index = - sparse_tensor->sparseIndex_as_SparseTensorIndexCOO(); - const auto ndim = static_cast(shape.size()); - - std::shared_ptr indices_type; - RETURN_NOT_OK( - internal::GetSparseCOOIndexMetadata(sparse_index, &indices_type) - ); - const int64_t indices_elsize = indices_type->byte_width(); - - const auto* indices_buffer = sparse_index->indicesBuffer(); - ARROW_ASSIGN_OR_RAISE( - auto indices_data, - file->ReadAt(indices_buffer->offset(), indices_buffer->length()) - ); - std::vector indices_shape({non_zero_length, ndim}); - const auto* indices_strides = sparse_index->indicesStrides(); - std::vector strides(2); - if ((indices_strides != nullptr) && indices_strides->size() > 0) { - if (indices_strides->size() != 2) { - return Status::Invalid( - "Wrong size for indicesStrides in SparseCOOIndex" - ); - } - strides[0] = indices_strides->Get(0); - strides[1] = indices_strides->Get(1); - } else { - // Row-major by default - strides[0] = indices_elsize * ndim; - strides[1] = indices_elsize; - } - return SparseCOOIndex::Make( - std::make_shared( - indices_type, indices_data, indices_shape, strides - ), - sparse_index->isCanonical() - ); - } - - Result> - ReadSparseCSXIndex( - const flatbuf::SparseTensor* sparse_tensor, - const std::vector& shape, - int64_t non_zero_length, - io::RandomAccessFile* file - ) { - if (shape.size() != 2) { - return Status::Invalid( - "Invalid shape length for a sparse matrix" - ); - } - - const auto* sparse_index = - sparse_tensor->sparseIndex_as_SparseMatrixIndexCSX(); - - std::shared_ptr indptr_type; - std::shared_ptr indices_type; - RETURN_NOT_OK(internal::GetSparseCSXIndexMetadata( - sparse_index, &indptr_type, &indices_type - )); - const int indptr_byte_width = indptr_type->byte_width(); - - const auto* indptr_buffer = sparse_index->indptrBuffer(); - ARROW_ASSIGN_OR_RAISE( - auto indptr_data, - file->ReadAt(indptr_buffer->offset(), indptr_buffer->length()) - ); - - const auto* indices_buffer = sparse_index->indicesBuffer(); - ARROW_ASSIGN_OR_RAISE( - auto indices_data, - file->ReadAt(indices_buffer->offset(), indices_buffer->length()) - ); - - std::vector indices_shape({non_zero_length}); - const auto indices_minimum_bytes = - indices_shape[0] 
* indices_type->byte_width(); - if (indices_minimum_bytes > indices_buffer->length()) { - return Status::Invalid( - "shape is inconsistent to the size of indices buffer" - ); - } - - switch (sparse_index->compressedAxis()) { - case flatbuf::SparseMatrixCompressedAxis::Row: { - std::vector indptr_shape({shape[0] + 1}); - const int64_t indptr_minimum_bytes = - indptr_shape[0] * indptr_byte_width; - if (indptr_minimum_bytes > indptr_buffer->length()) { - return Status::Invalid( - "shape is inconsistent to the " - "size of indptr buffer" - ); - } - return std::make_shared( - std::make_shared( - indptr_type, indptr_data, indptr_shape - ), - std::make_shared( - indices_type, indices_data, indices_shape - ) - ); - } - case flatbuf::SparseMatrixCompressedAxis::Column: { - std::vector indptr_shape({shape[1] + 1}); - const int64_t indptr_minimum_bytes = - indptr_shape[0] * indptr_byte_width; - if (indptr_minimum_bytes > indptr_buffer->length()) { - return Status::Invalid( - "shape is inconsistent to the " - "size of indptr buffer" - ); - } - return std::make_shared( - std::make_shared( - indptr_type, indptr_data, indptr_shape - ), - std::make_shared( - indices_type, indices_data, indices_shape - ) - ); - } - default: - return Status::Invalid( - "Invalid value of SparseMatrixCompressedAxis" - ); - } - } - - Result> - ReadSparseCSFIndex( - const flatbuf::SparseTensor* sparse_tensor, - const std::vector& shape, - io::RandomAccessFile* file - ) { - const auto* sparse_index = - sparse_tensor->sparseIndex_as_SparseTensorIndexCSF(); - const auto ndim = static_cast(shape.size()); - const auto* indptr_buffers = sparse_index->indptrBuffers(); - const auto* indices_buffers = sparse_index->indicesBuffers(); - std::vector> indptr_data(ndim - 1); - std::vector> indices_data(ndim); - - std::shared_ptr indptr_type; - std::shared_ptr indices_type; - std::vector axis_order; - std::vector indices_size; - - RETURN_NOT_OK(internal::GetSparseCSFIndexMetadata( - sparse_index, - &axis_order, - &indices_size, - &indptr_type, - &indices_type - )); - for (int i = 0; i < static_cast(indptr_buffers->size()); ++i) { - ARROW_ASSIGN_OR_RAISE( - indptr_data[i], - file->ReadAt( - indptr_buffers->Get(i)->offset(), - indptr_buffers->Get(i)->length() - ) - ); - } - for (int i = 0; i < static_cast(indices_buffers->size()); - ++i) { - ARROW_ASSIGN_OR_RAISE( - indices_data[i], - file->ReadAt( - indices_buffers->Get(i)->offset(), - indices_buffers->Get(i)->length() - ) - ); - } - - return SparseCSFIndex::Make( - indptr_type, - indices_type, - indices_size, - axis_order, - indptr_data, - indices_data - ); - } - - Result> - MakeSparseTensorWithSparseCOOIndex( - const std::shared_ptr& type, - const std::vector& shape, - const std::vector& dim_names, - const std::shared_ptr& sparse_index, - int64_t non_zero_length, - const std::shared_ptr& data - ) { - return SparseCOOTensor::Make( - sparse_index, type, data, shape, dim_names - ); - } - - Result> - MakeSparseTensorWithSparseCSRIndex( - const std::shared_ptr& type, - const std::vector& shape, - const std::vector& dim_names, - const std::shared_ptr& sparse_index, - int64_t non_zero_length, - const std::shared_ptr& data - ) { - return SparseCSRMatrix::Make( - sparse_index, type, data, shape, dim_names - ); - } - - Result> - MakeSparseTensorWithSparseCSCIndex( - const std::shared_ptr& type, - const std::vector& shape, - const std::vector& dim_names, - const std::shared_ptr& sparse_index, - int64_t non_zero_length, - const std::shared_ptr& data - ) { - return SparseCSCMatrix::Make( - sparse_index, 
type, data, shape, dim_names - ); - } - - Result> - MakeSparseTensorWithSparseCSFIndex( - const std::shared_ptr& type, - const std::vector& shape, - const std::vector& dim_names, - const std::shared_ptr& sparse_index, - const std::shared_ptr& data - ) { - return SparseCSFTensor::Make( - sparse_index, type, data, shape, dim_names - ); - } - - Status - ReadSparseTensorMetadata( - const Buffer& metadata, - std::shared_ptr* out_type, - std::vector* out_shape, - std::vector* out_dim_names, - int64_t* out_non_zero_length, - SparseTensorFormat::type* out_format_id, - const flatbuf::SparseTensor** out_fb_sparse_tensor, - const flatbuf::Buffer** out_buffer - ) { - RETURN_NOT_OK(internal::GetSparseTensorMetadata( - metadata, - out_type, - out_shape, - out_dim_names, - out_non_zero_length, - out_format_id - )); - - const flatbuf::Message* message = nullptr; - RETURN_NOT_OK(internal::VerifyMessage( - metadata.data(), metadata.size(), &message - )); - - const auto* sparse_tensor = message->header_as_SparseTensor(); - if (sparse_tensor == nullptr) { - return Status::IOError( - "Header-type of flatbuffer-encoded " - "Message is not SparseTensor." - ); - } - *out_fb_sparse_tensor = sparse_tensor; - - const auto* buffer = sparse_tensor->data(); - if (!bit_util::IsMultipleOf8(buffer->offset())) { - return Status::Invalid( - "Buffer of sparse index data did not " - "start on 8-byte aligned offset: ", - buffer->offset() - ); - } - *out_buffer = buffer; - - return Status::OK(); - } - - } // namespace - - namespace internal { - - namespace { - - Result - GetSparseTensorBodyBufferCount( - SparseTensorFormat::type format_id, const size_t ndim - ) { - switch (format_id) { - case SparseTensorFormat::COO: - return 2; - - case SparseTensorFormat::CSR: - return 3; - - case SparseTensorFormat::CSC: - return 3; - - case SparseTensorFormat::CSF: - return 2 * ndim; - - default: - return Status::Invalid( - "Unrecognized sparse tensor format" - ); - } - } - - Status - CheckSparseTensorBodyBufferCount( - const IpcPayload& payload, - SparseTensorFormat::type sparse_tensor_format_id, - const size_t ndim - ) { - size_t expected_body_buffer_count = 0; - ARROW_ASSIGN_OR_RAISE( - expected_body_buffer_count, - GetSparseTensorBodyBufferCount( - sparse_tensor_format_id, ndim - ) - ); - if (payload.body_buffers.size() != expected_body_buffer_count) { - return Status::Invalid( - "Invalid body buffer count for a sparse tensor" - ); - } - - return Status::OK(); - } - - } // namespace - - Result - ReadSparseTensorBodyBufferCount(const Buffer& metadata) { - SparseTensorFormat::type format_id{}; - std::vector shape; - - RETURN_NOT_OK(internal::GetSparseTensorMetadata( - metadata, nullptr, &shape, nullptr, nullptr, &format_id - )); - - return GetSparseTensorBodyBufferCount( - format_id, static_cast(shape.size()) - ); - } - - Result> - ReadSparseTensorPayload(const IpcPayload& payload) { - std::shared_ptr type; - std::vector shape; - std::vector dim_names; - int64_t non_zero_length; - SparseTensorFormat::type sparse_tensor_format_id; - const flatbuf::SparseTensor* sparse_tensor; - const flatbuf::Buffer* buffer; - - RETURN_NOT_OK(ReadSparseTensorMetadata( - *payload.metadata, - &type, - &shape, - &dim_names, - &non_zero_length, - &sparse_tensor_format_id, - &sparse_tensor, - &buffer - )); - - RETURN_NOT_OK(CheckSparseTensorBodyBufferCount( - payload, - sparse_tensor_format_id, - static_cast(shape.size()) - )); - - switch (sparse_tensor_format_id) { - case SparseTensorFormat::COO: { - std::shared_ptr sparse_index; - std::shared_ptr 
indices_type; - RETURN_NOT_OK(internal::GetSparseCOOIndexMetadata( - sparse_tensor->sparseIndex_as_SparseTensorIndexCOO(), - &indices_type - )); - ARROW_ASSIGN_OR_RAISE( - sparse_index, - SparseCOOIndex::Make( - indices_type, - shape, - non_zero_length, - payload.body_buffers[0] - ) - ); - return MakeSparseTensorWithSparseCOOIndex( - type, - shape, - dim_names, - sparse_index, - non_zero_length, - payload.body_buffers[1] - ); - } - case SparseTensorFormat::CSR: { - std::shared_ptr sparse_index; - std::shared_ptr indptr_type; - std::shared_ptr indices_type; - RETURN_NOT_OK(internal::GetSparseCSXIndexMetadata( - sparse_tensor->sparseIndex_as_SparseMatrixIndexCSX(), - &indptr_type, - &indices_type - )); - ARROW_CHECK_EQ(indptr_type, indices_type); - ARROW_ASSIGN_OR_RAISE( - sparse_index, - SparseCSRIndex::Make( - indices_type, - shape, - non_zero_length, - payload.body_buffers[0], - payload.body_buffers[1] - ) - ); - return MakeSparseTensorWithSparseCSRIndex( - type, - shape, - dim_names, - sparse_index, - non_zero_length, - payload.body_buffers[2] - ); - } - case SparseTensorFormat::CSC: { - std::shared_ptr sparse_index; - std::shared_ptr indptr_type; - std::shared_ptr indices_type; - RETURN_NOT_OK(internal::GetSparseCSXIndexMetadata( - sparse_tensor->sparseIndex_as_SparseMatrixIndexCSX(), - &indptr_type, - &indices_type - )); - ARROW_CHECK_EQ(indptr_type, indices_type); - ARROW_ASSIGN_OR_RAISE( - sparse_index, - SparseCSCIndex::Make( - indices_type, - shape, - non_zero_length, - payload.body_buffers[0], - payload.body_buffers[1] - ) - ); - return MakeSparseTensorWithSparseCSCIndex( - type, - shape, - dim_names, - sparse_index, - non_zero_length, - payload.body_buffers[2] - ); - } - case SparseTensorFormat::CSF: { - std::shared_ptr sparse_index; - std::shared_ptr indptr_type; - std::shared_ptr indices_type; - std::vector axis_order; - std::vector indices_size; - - RETURN_NOT_OK(internal::GetSparseCSFIndexMetadata( - sparse_tensor->sparseIndex_as_SparseTensorIndexCSF(), - &axis_order, - &indices_size, - &indptr_type, - &indices_type - )); - ARROW_CHECK_EQ(indptr_type, indices_type); - - const int64_t ndim = shape.size(); - std::vector> indptr_data(ndim - 1); - std::vector> indices_data(ndim); - - for (int64_t i = 0; i < ndim - 1; ++i) { - indptr_data[i] = payload.body_buffers[i]; - } - for (int64_t i = 0; i < ndim; ++i) { - indices_data[i] = payload.body_buffers[i + ndim - 1]; - } - - ARROW_ASSIGN_OR_RAISE( - sparse_index, - SparseCSFIndex::Make( - indptr_type, - indices_type, - indices_size, - axis_order, - indptr_data, - indices_data - ) - ); - return MakeSparseTensorWithSparseCSFIndex( - type, - shape, - dim_names, - sparse_index, - payload.body_buffers[2 * ndim - 1] - ); - } - default: - return Status::Invalid("Unsupported sparse index format"); - } - } - - } // namespace internal - - Result> - ReadSparseTensor(const Buffer& metadata, io::RandomAccessFile* file) { - std::shared_ptr type; - std::vector shape; - std::vector dim_names; - int64_t non_zero_length; - SparseTensorFormat::type sparse_tensor_format_id; - const flatbuf::SparseTensor* sparse_tensor; - const flatbuf::Buffer* buffer; - - RETURN_NOT_OK(ReadSparseTensorMetadata( - metadata, - &type, - &shape, - &dim_names, - &non_zero_length, - &sparse_tensor_format_id, - &sparse_tensor, - &buffer - )); - - ARROW_ASSIGN_OR_RAISE( - auto data, file->ReadAt(buffer->offset(), buffer->length()) - ); - - std::shared_ptr sparse_index; - switch (sparse_tensor_format_id) { - case SparseTensorFormat::COO: { - ARROW_ASSIGN_OR_RAISE( - 
sparse_index, - ReadSparseCOOIndex( - sparse_tensor, shape, non_zero_length, file - ) - ); - return MakeSparseTensorWithSparseCOOIndex( - type, - shape, - dim_names, - checked_pointer_cast(sparse_index), - non_zero_length, - data - ); - } - case SparseTensorFormat::CSR: { - ARROW_ASSIGN_OR_RAISE( - sparse_index, - ReadSparseCSXIndex( - sparse_tensor, shape, non_zero_length, file - ) - ); - return MakeSparseTensorWithSparseCSRIndex( - type, - shape, - dim_names, - checked_pointer_cast(sparse_index), - non_zero_length, - data - ); - } - case SparseTensorFormat::CSC: { - ARROW_ASSIGN_OR_RAISE( - sparse_index, - ReadSparseCSXIndex( - sparse_tensor, shape, non_zero_length, file - ) - ); - return MakeSparseTensorWithSparseCSCIndex( - type, - shape, - dim_names, - checked_pointer_cast(sparse_index), - non_zero_length, - data - ); - } - case SparseTensorFormat::CSF: { - ARROW_ASSIGN_OR_RAISE( - sparse_index, ReadSparseCSFIndex(sparse_tensor, shape, file) - ); - return MakeSparseTensorWithSparseCSFIndex( - type, - shape, - dim_names, - checked_pointer_cast(sparse_index), - data - ); - } - default: - return Status::Invalid("Unsupported sparse index format"); - } - } - - Result> - ReadSparseTensor(const Message& message) { - CHECK_HAS_BODY(message); - ARROW_ASSIGN_OR_RAISE(auto reader, Buffer::GetReader(message.body())); - return ReadSparseTensor(*message.metadata(), reader.get()); - } - - Result> - ReadSparseTensor(io::InputStream* file) { - std::unique_ptr message; - RETURN_NOT_OK(ReadContiguousPayload(file, &message)); - CHECK_MESSAGE_TYPE(MessageType::SPARSE_TENSOR, message->type()); - CHECK_HAS_BODY(*message); - ARROW_ASSIGN_OR_RAISE(auto reader, Buffer::GetReader(message->body())); - return ReadSparseTensor(*message->metadata(), reader.get()); - } - - /////////////////////////////////////////////////////////////////////////// - // Helpers for fuzzing - - namespace internal { - namespace { - - Status - ValidateFuzzBatch(const RecordBatch& batch) { - auto st = batch.ValidateFull(); - if (st.ok()) { - // If the batch is valid, printing should succeed - batch.ToString(); - } - return st; - } - - } // namespace - - Status - FuzzIpcStream(const uint8_t* data, int64_t size) { - auto buffer = std::make_shared(data, size); - io::BufferReader buffer_reader(buffer); - - std::shared_ptr batch_reader; - ARROW_ASSIGN_OR_RAISE( - batch_reader, RecordBatchStreamReader::Open(&buffer_reader) - ); - Status st; - - while (true) { - std::shared_ptr batch; - RETURN_NOT_OK(batch_reader->ReadNext(&batch)); - if (batch == nullptr) { - break; - } - st &= ValidateFuzzBatch(*batch); - } - - return st; - } - - Status - FuzzIpcFile(const uint8_t* data, int64_t size) { - auto buffer = std::make_shared(data, size); - io::BufferReader buffer_reader(buffer); - - std::shared_ptr batch_reader; - ARROW_ASSIGN_OR_RAISE( - batch_reader, RecordBatchFileReader::Open(&buffer_reader) - ); - Status st; - - const int n_batches = batch_reader->num_record_batches(); - for (int i = 0; i < n_batches; ++i) { - ARROW_ASSIGN_OR_RAISE( - auto batch, batch_reader->ReadRecordBatch(i) - ); - st &= ValidateFuzzBatch(*batch); - } - - return st; - } - - Status - FuzzIpcTensorStream(const uint8_t* data, int64_t size) { - auto buffer = std::make_shared(data, size); - io::BufferReader buffer_reader(buffer); - - std::shared_ptr tensor; - - while (true) { - ARROW_ASSIGN_OR_RAISE(tensor, ReadTensor(&buffer_reader)); - if (tensor == nullptr) { - break; - } - RETURN_NOT_OK(tensor->Validate()); - } - - return Status::OK(); - } - - Result - 
IoRecordedRandomAccessFile::GetSize() { - return file_size_; - } - - Result<int64_t> - IoRecordedRandomAccessFile::ReadAt( - int64_t position, int64_t nbytes, void* out - ) { - auto num_bytes_read = - std::min(file_size_, position + nbytes) - position; - - if (!read_ranges_.empty() - && position - == read_ranges_.back().offset - + read_ranges_.back().length) { - // merge continuous IOs into one if possible - read_ranges_.back().length += num_bytes_read; - } else { - // no real IO is performed, it is only saved into a vector for - // replaying later - read_ranges_.emplace_back( - io::ReadRange{position, num_bytes_read} - ); - } - return num_bytes_read; - } - - Result<std::shared_ptr<Buffer>> - IoRecordedRandomAccessFile::ReadAt(int64_t position, int64_t nbytes) { - std::shared_ptr<Buffer> out; - auto result = ReadAt(position, nbytes, &out); - return out; - } - - Status - IoRecordedRandomAccessFile::Close() { - closed_ = true; - return Status::OK(); - } - - Status - IoRecordedRandomAccessFile::Abort() { - return Status::OK(); - } - - Result<int64_t> - IoRecordedRandomAccessFile::Tell() const { - return position_; - } - - bool - IoRecordedRandomAccessFile::closed() const { - return closed_; - } - - Status - IoRecordedRandomAccessFile::Seek(int64_t position) { - position_ = position; - return Status::OK(); - } - - Result<int64_t> - IoRecordedRandomAccessFile::Read(int64_t nbytes, void* out) { - ARROW_ASSIGN_OR_RAISE( - int64_t bytes_read, ReadAt(position_, nbytes, out) - ); - position_ += bytes_read; - return bytes_read; - } - - Result<std::shared_ptr<Buffer>> - IoRecordedRandomAccessFile::Read(int64_t nbytes) { - ARROW_ASSIGN_OR_RAISE( - std::shared_ptr<Buffer> buffer, ReadAt(position_, nbytes) - ); - auto num_bytes_read = - std::min(file_size_, position_ + nbytes) - position_; - position_ += num_bytes_read; - return std::move(buffer); - } - - const io::IOContext& - IoRecordedRandomAccessFile::io_context() const { - return io_context_; - } - - const std::vector<io::ReadRange>& - IoRecordedRandomAccessFile::GetReadRanges() const { - return read_ranges_; - } - - } // namespace internal -} // namespace ipc -} // namespace arrow diff --git a/cpp/perspective/src/include/perspective/vendor/arrow_compute_registry.h b/cpp/perspective/src/include/perspective/vendor/arrow_compute_registry.h deleted file mode 100644 index abeba8de39..0000000000 --- a/cpp/perspective/src/include/perspective/vendor/arrow_compute_registry.h +++ /dev/null @@ -1,102 +0,0 @@ -// Licensed to the Apache Software Foundation (ASF) under one -// or more contributor license agreements. See the NOTICE file -// distributed with this work for additional information -// regarding copyright ownership. The ASF licenses this file -// to you under the Apache License, Version 2.0 (the -// "License"); you may not use this file except in compliance -// with the License. You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, -// software distributed under the License is distributed on an -// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, either express or implied. See the License for the -// specific language governing permissions and limitations -// under the License.
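IoRecordedRandomAccessFile, deleted above, performs no real I/O: it records the ranges a read plan would touch, and a read that starts exactly where the previous one ended extends that range instead of appending a new one. The coalescing trick as a standalone sketch, with illustrative types:

```cpp
// Sketch of the read-range coalescing used by IoRecordedRandomAccessFile.
// ReadRange is a stand-in for arrow::io::ReadRange; nothing is read here,
// the ranges are only recorded for later replay.
#include <cstdint>
#include <vector>

struct ReadRange {
    int64_t offset;
    int64_t length;
};

void RecordRead(std::vector<ReadRange>& ranges, int64_t position,
                int64_t nbytes) {
    if (!ranges.empty() &&
        position == ranges.back().offset + ranges.back().length) {
        // Merge continuous I/Os into one request.
        ranges.back().length += nbytes;
    } else {
        // Start a new recorded range.
        ranges.push_back({position, nbytes});
    }
}
```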
- -// NOTE: API is EXPERIMENTAL and will change without going through a -// deprecation cycle - -#pragma once - -#include <memory> -#include <string> -#include <vector> - -#include "arrow/result.h" -#include "arrow/status.h" -#include "arrow/util/visibility.h" - -namespace arrow { -namespace compute { - - class Function; - class FunctionOptionsType; - - /// \brief A mutable central function registry for built-in functions as - /// well as user-defined functions. Functions are implementations of - /// arrow::compute::Function. - /// - /// Generally, each function contains kernels which are implementations of a - /// function for a specific argument signature. After looking up a function - /// in the registry, one can either execute it eagerly with - /// Function::Execute or use one of the function's dispatch methods to pick - /// a suitable kernel for lower-level function execution. - class ARROW_EXPORT FunctionRegistry { - public: - ~FunctionRegistry(); - - /// \brief Construct a new registry. Most users only need to use the - /// global registry - static std::unique_ptr<FunctionRegistry> Make(); - - /// \brief Add a new function to the registry. Returns Status::KeyError - /// if a function with the same name is already registered - Status AddFunction( - std::shared_ptr<Function> function, bool allow_overwrite = false - ); - - /// \brief Add aliases for the given function name. Returns - /// Status::KeyError if the function with the given name is not - /// registered - Status AddAlias( - const std::string& target_name, const std::string& source_name - ); - - /// \brief Add a new function options type to the registry. Returns - /// Status::KeyError if a function options type with the same name is - /// already registered - Status AddFunctionOptionsType( - const FunctionOptionsType* options_type, - bool allow_overwrite = false - ); - - /// \brief Retrieve a function by name from the registry - Result<std::shared_ptr<Function>> GetFunction(const std::string& name - ) const; - - /// \brief Return vector of all entry names in the registry. Helpful for - /// displaying a manifest of available functions - std::vector<std::string> GetFunctionNames() const; - - /// \brief Retrieve a function options type by name from the registry - Result<const FunctionOptionsType*> - GetFunctionOptionsType(const std::string& name) const; - - /// \brief The number of currently registered functions - int num_functions() const; - - private: - FunctionRegistry(); - - // Use PIMPL pattern to not have std::unordered_map here - class FunctionRegistryImpl; - std::unique_ptr<FunctionRegistryImpl> impl_; - }; - - /// \brief Return the process-global function registry - ARROW_EXPORT FunctionRegistry* GetFunctionRegistry(); - -} // namespace compute -} // namespace arrow diff --git a/cpp/perspective/src/include/perspective/vendor/arrow_single_threaded_reader.h b/cpp/perspective/src/include/perspective/vendor/arrow_single_threaded_reader.h deleted file mode 100644 index ed668b6a70..0000000000 --- a/cpp/perspective/src/include/perspective/vendor/arrow_single_threaded_reader.h +++ /dev/null @@ -1,72 +0,0 @@ -/****************************************************************************** - * - * Copyright (c) 2019, the Perspective Authors. - * - * This file is part of the Perspective library, distributed under the terms of - * the Apache License 2.0. The full license can be found in the LICENSE file. - * - * Originally forked from - * https://github.com/apache/arrow/blob/apache-arrow-1.0.1/cpp/src/arrow/csv/reader.h - * - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements.
See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. - */ - -#pragma once - -#include - -#include "arrow/csv/options.h" // IWYU pragma: keep -#include "arrow/io/interfaces.h" -#include "arrow/record_batch.h" -#include "arrow/result.h" -#include "arrow/type.h" -#include "arrow/type_fwd.h" -// #include "arrow/util/future.h" -// #include "arrow/util/thread_pool.h" -#include "arrow/util/visibility.h" - -namespace arrow { -namespace io { - class InputStream; -} // namespace io - -namespace csv { - - /// A class that reads an entire CSV file into a Arrow Table - class ARROW_EXPORT TableReader { - public: - virtual ~TableReader() = default; - - /// Read the entire CSV file and convert it to a Arrow Table - virtual Result> Read() = 0; - /// Read the entire CSV file and convert it to a Arrow Table - // virtual Future> ReadAsync() = 0; - - /// Create a TableReader instance - static Result> - Make(io::IOContext io_context, std::shared_ptr input, const ReadOptions&, const ParseOptions&, const ConvertOptions&); - - ARROW_DEPRECATED( - "Use MemoryPool-less variant (the IOContext holds a pool already)" - ) - static Result> - Make(MemoryPool* pool, const io::IOContext& io_context, const std::shared_ptr& input, const ReadOptions&, const ParseOptions&, const ConvertOptions&); - }; - -} // namespace csv -} // namespace arrow diff --git a/pnpm-lock.yaml b/pnpm-lock.yaml index 7d29ba7c33..dc43110eb1 100644 --- a/pnpm-lock.yaml +++ b/pnpm-lock.yaml @@ -831,6 +831,8 @@ importers: specifier: ^0.1.16 version: 0.1.16 + rust/perspective: {} + rust/perspective-js: dependencies: stoppable: @@ -986,6 +988,9 @@ importers: perspective-2-9-0: specifier: npm:@finos/perspective@2.9.0 version: '@finos/perspective@2.9.0' + perspective-3-0-0: + specifier: npm:@finos/perspective@3.0.0 + version: '@finos/perspective@3.0.0' devDependencies: '@finos/perspective': specifier: workspace:^ @@ -2285,6 +2290,9 @@ packages: resolution: {integrity: sha512-O40EzleYRpmX2k8xwbcJfWlgi4QDJ2cUJlmEA0wG1LLVmSOQD7vFeQK9H2v6xQBwn6+/andpDLglrzDYCEBOHg==} engines: {node: '>=14'} + '@finos/perspective@3.0.0': + resolution: {integrity: sha512-myWGlfxN97vV3qvv34QAxCH389Oo2SH0EGN4eyrGdZSb+UlFjv9HR1TYM18GRhdM1ASg7jFwnIvGDcw7oM+qNA==} + '@fontsource/roboto-mono@4.5.10': resolution: {integrity: sha512-KrJdmkqz6DszT2wV/bbhXef4r0hV3B0vw2mAqei8A2kRnvq+gcJLmmIeQ94vu9VEXrUQzos5M9lH1TAAXpRphw==} @@ -11361,6 +11369,14 @@ snapshots: - bufferutil - utf-8-validate + '@finos/perspective@3.0.0': + dependencies: + stoppable: 1.1.0 + ws: 8.17.0 + transitivePeerDependencies: + - bufferutil + - utf-8-validate + '@fontsource/roboto-mono@4.5.10': {} '@fortawesome/fontawesome-free@5.15.4': {} diff --git a/rust/perspective-js/test/js/constructors.spec.js b/rust/perspective-js/test/js/constructors.spec.js index 06b3918db6..b4b578b76f 100644 --- a/rust/perspective-js/test/js/constructors.spec.js +++ 
b/rust/perspective-js/test/js/constructors.spec.js @@ -149,7 +149,7 @@ let arrow_lists_data = { ], float_arr: [ "[]", - "[3.14,3.141592653589793,6.283185307179586,1.4142135623730952]", + "[3.14,3.141592653589793,6.283185307179586,1.4142135623730951]", "[12.00001]", "[100000000.0,100000000.0,100000000.0,100000000.0]", ], diff --git a/rust/perspective-python/build.mjs b/rust/perspective-python/build.mjs index 16a17cb7bd..d52ff2ddfe 100644 --- a/rust/perspective-python/build.mjs +++ b/rust/perspective-python/build.mjs @@ -11,11 +11,12 @@ // ┗━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┛ import * as fs from "node:fs"; -import pkg from "./package.json" assert { type: "json" }; +import * as path from "node:path"; import sh from "../../tools/perspective-scripts/sh.mjs"; import * as url from "url"; const __dirname = url.fileURLToPath(new URL(".", import.meta.url)).slice(0, -1); +const pkg = JSON.parse(fs.readFileSync(path.join(__dirname, "./package.json"))); let flags = "--release"; if (!!process.env.PSP_DEBUG) { diff --git a/tools/perspective-bench/package.json b/tools/perspective-bench/package.json index a55715f06d..a81e1cfe7a 100644 --- a/tools/perspective-bench/package.json +++ b/tools/perspective-bench/package.json @@ -26,6 +26,7 @@ "express-ws": "^5.0.2" }, "dependencies": { + "perspective-3-0-0": "npm:@finos/perspective@3.0.0", "perspective-2-10-0": "npm:@finos/perspective@2.10.0", "perspective-2-9-0": "npm:@finos/perspective@2.9.0", "perspective-2-8-0": "npm:@finos/perspective@2.8.0", diff --git a/tools/perspective-scripts/lint_cpp.mjs b/tools/perspective-scripts/lint_cpp.mjs index bec8199c41..f63a1298de 100644 --- a/tools/perspective-scripts/lint_cpp.mjs +++ b/tools/perspective-scripts/lint_cpp.mjs @@ -89,33 +89,12 @@ function clangFormatFix(dir) { export function checkFormatting() { formatLint(sh.path`./cpp/perspective/src/cpp/*.cpp`); - formatLint(sh.path`./cpp/perspective/src/cpp/vendor/*.cpp`); formatLint(sh.path`./cpp/perspective/src/include/perspective/*.h`); - formatLint(sh.path`./cpp/perspective/src/include/perspective/vendor/*.h`); - // formatLint(sh.path`./python/perspective/perspective/src/*.cpp`); - // formatLint( - // sh.path`./python/perspective/perspective/include/perspective/*.h` - // ); - // formatLint( - // sh.path`./python/perspective/perspective/include/perspective/python/*.h` - // ); - // tidyLint(); } export function fixFormatting() { // tidyLint("-fix"); clangFormatFix(sh.path`./cpp/perspective/src/cpp/*.cpp`); - clangFormatFix(sh.path`./cpp/perspective/src/cpp/vendor/*.cpp`); clangFormatFix(sh.path`./cpp/perspective/src/include/perspective/*.h`); - clangFormatFix( - sh.path`./cpp/perspective/src/include/perspective/vendor/*.h` - ); - // clangFormatFix(sh.path`./python/perspective/perspective/src/*.cpp`); - // clangFormatFix( - // sh.path`./python/perspective/perspective/include/perspective/*.h` - // ); - // clangFormatFix( - // sh.path`./python/perspective/perspective/include/perspective/python/*.h` - // ); } diff --git a/tools/perspective-scripts/lint_headers.mjs b/tools/perspective-scripts/lint_headers.mjs index 177b3da5e2..beca49e9a5 100644 --- a/tools/perspective-scripts/lint_headers.mjs +++ b/tools/perspective-scripts/lint_headers.mjs @@ -20,14 +20,7 @@ const IGNORE_PATHS = fs_sync .split("\n") .map((x) => x.trim()) .filter((x) => x.length > 0 && !x.startsWith("#")) - .concat([ - "llvm/*", - "cmake/*", - "pnpm-lock.yaml", - "pnpm-workspace.yaml", - "cpp/perspective/src/cpp/vendor", - 
"cpp/perspective/src/include/perspective/vendor", - ]); + .concat(["llvm/*", "cmake/*", "pnpm-lock.yaml", "pnpm-workspace.yaml"]); const FIX_PATHS = [ ["**/*.rs", ["//", "/*", " *", " */"]], diff --git a/tools/perspective-scripts/test_js.mjs b/tools/perspective-scripts/test_js.mjs index ef4a475523..313fe50124 100644 --- a/tools/perspective-scripts/test_js.mjs +++ b/tools/perspective-scripts/test_js.mjs @@ -30,6 +30,7 @@ const IS_PLAYWRIGHT = process.env.PACKAGE.split(",").reduce( [ "perspective-cli", "perspective-js", + "perspective", "perspective-viewer", "perspective-viewer-datagrid", "perspective-viewer-d3fc",