diff --git a/.gitmodules b/.gitmodules new file mode 100644 index 0000000000000..71722b21777e6 --- /dev/null +++ b/.gitmodules @@ -0,0 +1,3 @@ +[submodule "cpp/submodules/parquet-testing"] + path = cpp/submodules/parquet-testing + url = https://github.com/apache/parquet-testing.git diff --git a/.travis.yml b/.travis.yml index f77e22da56ef9..d919baff6d386 100644 --- a/.travis.yml +++ b/.travis.yml @@ -60,6 +60,7 @@ matrix: - ARROW_TRAVIS_ORC=1 - ARROW_TRAVIS_CLANG_FORMAT=1 - ARROW_TRAVIS_COVERAGE=1 + - ARROW_TRAVIS_PARQUET=1 - ARROW_TRAVIS_PYTHON_DOCS=1 - ARROW_BUILD_WARNING_LEVEL=CHECKIN - ARROW_TRAVIS_PYTHON_JVM=1 @@ -77,11 +78,11 @@ matrix: - $TRAVIS_BUILD_DIR/ci/travis_install_clang_tools.sh - $TRAVIS_BUILD_DIR/ci/travis_lint.sh # If either C++ or Python changed, we must install the C++ libraries + - git submodule update --init - $TRAVIS_BUILD_DIR/ci/travis_before_script_cpp.sh script: # All test steps are required for accurate C++ coverage info - $TRAVIS_BUILD_DIR/ci/travis_script_cpp.sh - - $TRAVIS_BUILD_DIR/ci/travis_build_parquet_cpp.sh # Build Arrow Java to test the pyarrow<->JVM in-process bridge - $TRAVIS_BUILD_DIR/ci/travis_script_java.sh # Only run Plasma tests with valgrind in one of the Python builds because @@ -102,14 +103,15 @@ matrix: - ARROW_TRAVIS_USE_TOOLCHAIN=1 - ARROW_TRAVIS_PLASMA=1 - ARROW_TRAVIS_ORC=1 + - ARROW_TRAVIS_PARQUET=1 - ARROW_BUILD_WARNING_LEVEL=CHECKIN before_script: - if [ $ARROW_CI_PYTHON_AFFECTED != "1" ]; then exit; fi # If either C++ or Python changed, we must install the C++ libraries + - git submodule update --init - $TRAVIS_BUILD_DIR/ci/travis_before_script_cpp.sh script: - if [ $ARROW_CI_CPP_AFFECTED == "1" ]; then $TRAVIS_BUILD_DIR/ci/travis_script_cpp.sh; fi - - $TRAVIS_BUILD_DIR/ci/travis_build_parquet_cpp.sh - $TRAVIS_BUILD_DIR/ci/travis_script_python.sh 2.7 - $TRAVIS_BUILD_DIR/ci/travis_script_python.sh 3.6 # [manylinux1] Python diff --git a/appveyor.yml b/appveyor.yml index e03bfdfc9df2c..1bbdf65bd2ede 100644 --- a/appveyor.yml +++ b/appveyor.yml @@ -48,7 +48,6 @@ environment: GENERATOR: Ninja CONFIGURATION: "Release" APPVEYOR_BUILD_WORKER_IMAGE: Visual Studio 2017 - BOOST_ROOT: C:\Libraries\boost_1_64_0 - JOB: "Toolchain" GENERATOR: Visual Studio 14 2015 Win64 CONFIGURATION: "Release" @@ -67,8 +66,8 @@ environment: USE_CLCACHE: false MSVC_DEFAULT_OPTIONS: ON - BOOST_ROOT: C:\Libraries\boost_1_63_0 - BOOST_LIBRARYDIR: C:\Libraries\boost_1_63_0\lib64-msvc-14.0 + BOOST_ROOT: C:\Libraries\boost_1_67_0 + BOOST_LIBRARYDIR: C:\Libraries\boost_1_67_0\lib64-msvc-14.0 APPVEYOR_SAVE_CACHE_ON_ERROR: true install: diff --git a/ci/cpp-python-msvc-build.bat b/ci/cpp-python-msvc-build.bat index a0f5fbe8b36e6..522843dccb0c6 100644 --- a/ci/cpp-python-msvc-build.bat +++ b/ci/cpp-python-msvc-build.bat @@ -68,16 +68,20 @@ if "%JOB%" == "Build_Debug" ( exit /B 0 ) -conda create -n arrow -q -y python=%PYTHON% ^ +conda create -n arrow -q -y -c conda-forge ^ + python=%PYTHON% ^ six pytest setuptools numpy pandas cython ^ - thrift-cpp=0.11.0 + thrift-cpp=0.11.0 boost-cpp call activate arrow +@rem Use Boost from conda-forge +set BOOST_ROOT=%CONDA_PREFIX%\Library +set BOOST_LIBRARYDIR=%CONDA_PREFIX%\Library\lib + if "%JOB%" == "Toolchain" ( @rem Install pre-built "toolchain" packages for faster builds conda install -q -y -c conda-forge ^ - boost-cpp ^ brotli ^ cmake ^ flatbuffers ^ @@ -94,6 +98,10 @@ if "%JOB%" == "Toolchain" ( set ARROW_HOME=%CONDA_PREFIX%\Library +@rem Retrieve git submodules, configure env var for Parquet unit tests +git submodule update --init +set PARQUET_TEST_DATA=%CD%\cpp\submodules\parquet-testing\data + @rem Build and test Arrow C++ libraries mkdir cpp\build @@ -104,6 +112,7 @@ cmake -G "%GENERATOR%" ^ -DARROW_BOOST_USE_SHARED=OFF ^ -DCMAKE_BUILD_TYPE=%CONFIGURATION% ^ -DARROW_CXXFLAGS="/WX /MP" ^ + -DARROW_PARQUET=ON ^ -DARROW_PYTHON=ON ^ .. || exit /B cmake --build . --target install --config %CONFIGURATION% || exit /B @@ -117,26 +126,7 @@ ctest -VV || exit /B set PYTHONHOME=%OLD_PYTHONHOME% popd -@rem Build parquet-cpp - -git clone https://github.com/apache/parquet-cpp.git || exit /B -mkdir parquet-cpp\build -pushd parquet-cpp\build - -set PARQUET_BUILD_TOOLCHAIN=%CONDA_PREFIX%\Library -set PARQUET_HOME=%CONDA_PREFIX%\Library -cmake -G "%GENERATOR%" ^ - -DCMAKE_INSTALL_PREFIX=%PARQUET_HOME% ^ - -DCMAKE_BUILD_TYPE=%CONFIGURATION% ^ - -DPARQUET_BOOST_USE_SHARED=OFF ^ - -DPARQUET_BUILD_TESTS=OFF ^ - .. || exit /B -cmake --build . --target install --config %CONFIGURATION% || exit /B -popd - @rem Build and install pyarrow -@rem parquet-cpp has some additional runtime dependencies that we need to figure out -@rem see PARQUET-1018 pushd python @@ -149,6 +139,9 @@ set PYARROW_BUNDLE_BOOST=OFF set PYARROW_WITH_STATIC_BOOST=ON set PYARROW_WITH_PARQUET=ON +@rem ARROW-3075; pkgconfig is broken for Parquet for now +set PARQUET_HOME=%CONDA_PREFIX%\Library + python setup.py build_ext ^ install -q --single-version-externally-managed --record=record.text ^ bdist_wheel -q || exit /B diff --git a/ci/travis_before_script_cpp.sh b/ci/travis_before_script_cpp.sh index 541d5fd937d92..e1c231ce4d89e 100755 --- a/ci/travis_before_script_cpp.sh +++ b/ci/travis_before_script_cpp.sh @@ -82,6 +82,10 @@ if [ $ARROW_TRAVIS_ORC == "1" ]; then CMAKE_COMMON_FLAGS="$CMAKE_COMMON_FLAGS -DARROW_ORC=ON" fi +if [ $ARROW_TRAVIS_PARQUET == "1" ]; then + CMAKE_COMMON_FLAGS="$CMAKE_COMMON_FLAGS -DARROW_PARQUET=ON" +fi + if [ $ARROW_TRAVIS_VALGRIND == "1" ]; then CMAKE_COMMON_FLAGS="$CMAKE_COMMON_FLAGS -DARROW_TEST_MEMCHECK=ON" fi diff --git a/ci/travis_build_parquet_cpp.sh b/ci/travis_build_parquet_cpp.sh deleted file mode 100755 index f64a85d621dcc..0000000000000 --- a/ci/travis_build_parquet_cpp.sh +++ /dev/null @@ -1,50 +0,0 @@ -#!/usr/bin/env bash -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. - -set -e - -source $TRAVIS_BUILD_DIR/ci/travis_env_common.sh - -source $TRAVIS_BUILD_DIR/ci/travis_install_toolchain.sh - -export PARQUET_BUILD_TOOLCHAIN=$CPP_TOOLCHAIN -export ARROW_HOME=$ARROW_CPP_INSTALL - -PARQUET_DIR=$TRAVIS_BUILD_DIR/parquet -mkdir -p $PARQUET_DIR - -git clone -q https://github.com/apache/parquet-cpp.git $PARQUET_DIR - -pushd $PARQUET_DIR -mkdir build-dir -cd build-dir - -cmake \ - -GNinja \ - -DCMAKE_BUILD_TYPE=debug \ - -DCMAKE_INSTALL_PREFIX=$ARROW_PYTHON_PARQUET_HOME \ - -DPARQUET_BOOST_USE_SHARED=on \ - -DPARQUET_BUILD_BENCHMARKS=off \ - -DPARQUET_BUILD_EXECUTABLES=off \ - -DPARQUET_BUILD_TESTS=off \ - .. - -ninja -ninja install - -popd diff --git a/ci/travis_env_common.sh b/ci/travis_env_common.sh index 622a7959f4c9f..009dff19ecc3a 100755 --- a/ci/travis_env_common.sh +++ b/ci/travis_env_common.sh @@ -44,8 +44,6 @@ export ARROW_CPP_BUILD_DIR=$TRAVIS_BUILD_DIR/cpp-build export ARROW_C_GLIB_INSTALL_AUTOTOOLS=$TRAVIS_BUILD_DIR/c-glib-install-autotools export ARROW_C_GLIB_INSTALL_MESON=$TRAVIS_BUILD_DIR/c-glib-install-meson -export ARROW_PYTHON_PARQUET_HOME=$TRAVIS_BUILD_DIR/parquet-env - export CMAKE_EXPORT_COMPILE_COMMANDS=1 export ARROW_BUILD_TYPE=${ARROW_BUILD_TYPE:=debug} @@ -70,3 +68,5 @@ fi if [ $TRAVIS_OS_NAME == "osx" ]; then export GOPATH=$TRAVIS_BUILD_DIR/gopath fi + +export PARQUET_TEST_DATA=$TRAVIS_BUILD_DIR/cpp/submodules/parquet-testing/data diff --git a/ci/travis_install_toolchain.sh b/ci/travis_install_toolchain.sh index 924f75214fa35..f6b5cf88c7c84 100755 --- a/ci/travis_install_toolchain.sh +++ b/ci/travis_install_toolchain.sh @@ -40,8 +40,4 @@ if [ ! -e $CPP_TOOLCHAIN ]; then thrift-cpp=0.11.0 \ zlib \ zstd - - # HACK(wesm): We started experiencing OpenSSL failures when Miniconda was - # updated sometime on October 2 or October 3 -# conda update -y -q -p $CPP_TOOLCHAIN ca-certificates -c defaults fi diff --git a/ci/travis_script_python.sh b/ci/travis_script_python.sh index 1f774dad5a667..e12a37391c7a6 100755 --- a/ci/travis_script_python.sh +++ b/ci/travis_script_python.sh @@ -23,8 +23,8 @@ source $TRAVIS_BUILD_DIR/ci/travis_env_common.sh source $TRAVIS_BUILD_DIR/ci/travis_install_conda.sh export ARROW_HOME=$ARROW_CPP_INSTALL -export PARQUET_HOME=$ARROW_PYTHON_PARQUET_HOME -export LD_LIBRARY_PATH=$ARROW_HOME/lib:$PARQUET_HOME/lib:$LD_LIBRARY_PATH +export PARQUET_HOME=$ARROW_CPP_INSTALL +export LD_LIBRARY_PATH=$ARROW_HOME/lib:$LD_LIBRARY_PATH export PYARROW_CXXFLAGS="-Werror" PYARROW_PYTEST_FLAGS=" -r sxX --durations=15 --parquet" diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt index 9f41142492a92..462581c3f4738 100644 --- a/cpp/CMakeLists.txt +++ b/cpp/CMakeLists.txt @@ -275,6 +275,20 @@ Pass multiple labels by dividing with semicolons") "Build Arrow with statically linked CRT" OFF) endif() + + # Parquet-related build options + option(ARROW_PARQUET + "Build the Parquet libraries" + OFF) + + option(PARQUET_MINIMAL_DEPENDENCY + "Depend only on Thirdparty headers to build libparquet. \ +Always OFF if building binaries" + OFF) + + set(PARQUET_ARROW_LINKAGE "shared" CACHE STRING + "How to link Arrow libraries with libparquet.so. static|shared (default shared)") + endif() if(ARROW_BUILD_TESTS OR ARROW_BUILD_BENCHMARKS) @@ -584,6 +598,8 @@ endif(UNIX) ############################################################ set(ARROW_LINK_LIBS) + +# Libraries to link statically with libarrow.so set(ARROW_STATIC_LINK_LIBS) if (ARROW_WITH_BROTLI) @@ -643,11 +659,8 @@ set(ARROW_BENCHMARK_LINK_LIBS gtest ${ARROW_STATIC_LINK_LIBS}) -set(ARROW_LINK_LIBS - ${ARROW_STATIC_LINK_LIBS} - ${ARROW_LINK_LIBS}) - set(ARROW_SHARED_PRIVATE_LINK_LIBS + ${ARROW_STATIC_LINK_LIBS} ${BOOST_SYSTEM_LIBRARY} ${BOOST_FILESYSTEM_LIBRARY} ${BOOST_REGEX_LIBRARY}) @@ -729,3 +742,7 @@ endif() if(ARROW_HIVESERVER2) add_subdirectory(src/arrow/dbi/hiveserver2) endif() + +if(ARROW_PARQUET) + add_subdirectory(src/parquet) +endif() diff --git a/cpp/build-support/lint_cpp_cli.py b/cpp/build-support/lint_cpp_cli.py index d6b0c2b6e9d6e..d330d8be50dbb 100644 --- a/cpp/build-support/lint_cpp_cli.py +++ b/cpp/build-support/lint_cpp_cli.py @@ -19,6 +19,8 @@ import argparse import re import os +import sys +import traceback parser = argparse.ArgumentParser( description="Check for illegal headers for C++/CLI applications") @@ -59,23 +61,29 @@ def lint_file(path): 'arrow/util/macros.h', 'arrow/python/iterators.h', 'arrow/util/parallel.h', - 'arrow/io/hdfs-internal.h' + 'arrow/io/hdfs-internal.h', + 'parquet/arrow/test-util.h', + 'parquet/encoding-internal.h', + 'parquet/test-util.h' ] - -for dirpath, _, filenames in os.walk(arguments.source_path): - for filename in filenames: - full_path = os.path.join(dirpath, filename) - - exclude = False - for exclusion in EXCLUSIONS: - if exclusion in full_path: - exclude = True - break - - if exclude: - continue - - # Only run on header files - if filename.endswith('.h'): - lint_file(full_path) +try: + for dirpath, _, filenames in os.walk(arguments.source_path): + for filename in filenames: + full_path = os.path.join(dirpath, filename) + + exclude = False + for exclusion in EXCLUSIONS: + if exclusion in full_path: + exclude = True + break + + if exclude: + continue + + # Only run on header files + if filename.endswith('.h'): + lint_file(full_path) +except Exception: + traceback.print_exc() + sys.exit(1) diff --git a/cpp/cmake_modules/BuildUtils.cmake b/cpp/cmake_modules/BuildUtils.cmake index 95eadad4c93c6..e0564175e3fb1 100644 --- a/cpp/cmake_modules/BuildUtils.cmake +++ b/cpp/cmake_modules/BuildUtils.cmake @@ -89,15 +89,20 @@ function(ADD_THIRDPARTY_LIB LIB_NAME) endif() endfunction() +# \arg OUTPUTS list to append built targets to function(ADD_ARROW_LIB LIB_NAME) set(options) set(one_value_args SHARED_LINK_FLAGS) - set(multi_value_args SOURCES STATIC_LINK_LIBS STATIC_PRIVATE_LINK_LIBS SHARED_LINK_LIBS SHARED_PRIVATE_LINK_LIBS EXTRA_INCLUDES DEPENDENCIES) + set(multi_value_args SOURCES OUTPUTS STATIC_LINK_LIBS STATIC_PRIVATE_LINK_LIBS SHARED_LINK_LIBS SHARED_PRIVATE_LINK_LIBS EXTRA_INCLUDES DEPENDENCIES) cmake_parse_arguments(ARG "${options}" "${one_value_args}" "${multi_value_args}" ${ARGN}) if(ARG_UNPARSED_ARGUMENTS) message(SEND_ERROR "Error: unrecognized arguments: ${ARG_UNPARSED_ARGUMENTS}") endif() + if (ARG_OUTPUTS) + set(${ARG_OUTPUTS}) + endif() + if(MSVC) set(LIB_DEPS ${ARG_SOURCES}) set(EXTRA_DEPS ${ARG_DEPENDENCIES}) @@ -117,6 +122,10 @@ function(ADD_ARROW_LIB LIB_NAME) set(LIB_INCLUDES) set(EXTRA_DEPS) + if (ARG_OUTPUTS) + list(APPEND ${ARG_OUTPUTS} ${LIB_NAME}_objlib) + endif() + if (ARG_EXTRA_INCLUDES) target_include_directories(${LIB_NAME}_objlib SYSTEM PUBLIC ${ARG_EXTRA_INCLUDES} @@ -132,6 +141,10 @@ function(ADD_ARROW_LIB LIB_NAME) add_dependencies(${LIB_NAME}_shared ${EXTRA_DEPS}) endif() + if (ARG_OUTPUTS) + list(APPEND ${ARG_OUTPUTS} ${LIB_NAME}_shared) + endif() + if (LIB_INCLUDES) target_include_directories(${LIB_NAME}_shared SYSTEM PUBLIC ${ARG_EXTRA_INCLUDES} @@ -195,6 +208,10 @@ function(ADD_ARROW_LIB LIB_NAME) add_dependencies(${LIB_NAME}_static ${EXTRA_DEPS}) endif() + if (ARG_OUTPUTS) + list(APPEND ${ARG_OUTPUTS} ${LIB_NAME}_static) + endif() + if (LIB_INCLUDES) target_include_directories(${LIB_NAME}_static SYSTEM PUBLIC ${ARG_EXTRA_INCLUDES} @@ -203,7 +220,6 @@ function(ADD_ARROW_LIB LIB_NAME) if (MSVC) set(LIB_NAME_STATIC ${LIB_NAME}_static) - target_compile_definitions(${LIB_NAME}_static PUBLIC ARROW_STATIC) else() set(LIB_NAME_STATIC ${LIB_NAME}) endif() @@ -223,6 +239,10 @@ function(ADD_ARROW_LIB LIB_NAME) ARCHIVE DESTINATION ${CMAKE_INSTALL_LIBDIR}) endif() + # Modify variable in calling scope + if (ARG_OUTPUTS) + set(${ARG_OUTPUTS} ${${ARG_OUTPUTS}} PARENT_SCOPE) + endif() endfunction() @@ -305,10 +325,20 @@ endfunction() # ctest. # # Arguments after the test name will be passed to set_tests_properties(). +# +# \arg PREFIX a string to append to the name of the test executable. For +# example, if you have src/arrow/foo/bar-test.cc, then PREFIX "foo" will create +# test executable foo-bar-test +# \arg LABELS the unit test label or labels to assign the unit tests +# to. By default, unit tests will go in the "unittest" group, but if we have +# multiple unit tests in some subgroup, you can assign a test to multiple +# groups using the syntax unittest;GROUP2;GROUP3. Custom targets for the group +# names must exist function(ADD_ARROW_TEST REL_TEST_NAME) set(options NO_VALGRIND) set(one_value_args) - set(multi_value_args STATIC_LINK_LIBS EXTRA_LINK_LIBS EXTRA_INCLUDES EXTRA_DEPENDENCIES LABELS) + set(multi_value_args STATIC_LINK_LIBS EXTRA_LINK_LIBS EXTRA_INCLUDES EXTRA_DEPENDENCIES + LABELS PREFIX) cmake_parse_arguments(ARG "${options}" "${one_value_args}" "${multi_value_args}" ${ARGN}) if(ARG_UNPARSED_ARGUMENTS) message(SEND_ERROR "Error: unrecognized arguments: ${ARG_UNPARSED_ARGUMENTS}") @@ -331,6 +361,16 @@ function(ADD_ARROW_TEST REL_TEST_NAME) endif() get_filename_component(TEST_NAME ${REL_TEST_NAME} NAME_WE) + if(ARG_PREFIX) + set(TEST_NAME "${ARG_PREFIX}-${TEST_NAME}") + endif() + + if (ARG_LABELS) + set(ARG_LABELS "unittest;${ARG_LABELS}") + else() + set(ARG_LABELS unittest) + endif() + if(EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/${REL_TEST_NAME}.cc) # This test has a corresponding .cc file, set it up as an executable. set(TEST_PATH "${EXECUTABLE_OUTPUT_PATH}/${TEST_NAME}") @@ -357,7 +397,9 @@ function(ADD_ARROW_TEST REL_TEST_NAME) add_dependencies(${TEST_NAME} ${ARG_EXTRA_DEPENDENCIES}) endif() - add_dependencies(unittest ${TEST_NAME}) + foreach (TEST_LABEL ${ARG_LABELS}) + add_dependencies(${TEST_LABEL} ${TEST_NAME}) + endforeach() else() # No executable, just invoke the test (probably a script) directly. set(TEST_PATH ${CMAKE_CURRENT_SOURCE_DIR}/${REL_TEST_NAME}) @@ -380,13 +422,7 @@ function(ADD_ARROW_TEST REL_TEST_NAME) set_property(TEST ${TEST_NAME} APPEND PROPERTY - LABELS "unittest") - - if (ARG_LABELS) - set_property(TEST ${TEST_NAME} - APPEND PROPERTY - LABELS ${ARG_LABELS}) - endif() + LABELS ${ARG_LABELS}) endfunction() # A wrapper for add_dependencies() that is compatible with NO_TESTS. diff --git a/cpp/cmake_modules/SetupCxxFlags.cmake b/cpp/cmake_modules/SetupCxxFlags.cmake index 59792e80103a8..87714b349d6d0 100644 --- a/cpp/cmake_modules/SetupCxxFlags.cmake +++ b/cpp/cmake_modules/SetupCxxFlags.cmake @@ -30,10 +30,6 @@ if (MSVC) # insecure, like std::getenv add_definitions(-D_CRT_SECURE_NO_WARNINGS) - # Use __declspec(dllexport) during library build, other users of the Arrow - # headers will see dllimport - add_definitions(-DARROW_EXPORTING) - # ARROW-1931 See https://github.com/google/googletest/issues/1318 # # This is added to CMAKE_CXX_FLAGS instead of CXX_COMMON_FLAGS since only the @@ -106,7 +102,7 @@ if ("${UPPERCASE_BUILD_WARNING_LEVEL}" STREQUAL "CHECKIN") -Wno-cast-align -Wno-vla-extension -Wno-shift-sign-overflow \ -Wno-used-but-marked-unused -Wno-missing-variable-declarations \ -Wno-gnu-zero-variadic-macro-arguments -Wconversion -Wno-sign-conversion \ --Wno-disabled-macro-expansion") +-Wno-disabled-macro-expansion -Wno-format-nonliteral -Wno-missing-noreturn") # Version numbers where warnings are introduced if ("${COMPILER_VERSION}" VERSION_GREATER "3.3") diff --git a/cpp/cmake_modules/ThirdpartyToolchain.cmake b/cpp/cmake_modules/ThirdpartyToolchain.cmake index a7c4b20c886fd..36dcf3017d70e 100644 --- a/cpp/cmake_modules/ThirdpartyToolchain.cmake +++ b/cpp/cmake_modules/ThirdpartyToolchain.cmake @@ -106,6 +106,18 @@ if (DEFINED ENV{ZSTD_HOME}) set(ZSTD_HOME "$ENV{ZSTD_HOME}") endif() +# ---------------------------------------------------------------------- +# Some EP's require other EP's + +if (ARROW_THRIFT OR ARROW_WITH_ZLIB) + set(ARROW_WITH_ZLIB ON) +endif() + +if (ARROW_HIVESERVER2 OR ARROW_PARQUET) + set(ARROW_WITH_THRIFT ON) +else() + set(ARROW_WITH_THRIFT OFF) +endif() # ---------------------------------------------------------------------- # Versions and URLs for toolchain builds, which also can be used to configure @@ -295,8 +307,11 @@ if (ARROW_BOOST_VENDORED) "${BOOST_LIB_DIR}/${CMAKE_STATIC_LIBRARY_PREFIX}boost_system${CMAKE_STATIC_LIBRARY_SUFFIX}") set(BOOST_STATIC_FILESYSTEM_LIBRARY "${BOOST_LIB_DIR}/${CMAKE_STATIC_LIBRARY_PREFIX}boost_filesystem${CMAKE_STATIC_LIBRARY_SUFFIX}") + set(BOOST_STATIC_REGEX_LIBRARY + "${BOOST_LIB_DIR}/${CMAKE_STATIC_LIBRARY_PREFIX}boost_regex${CMAKE_STATIC_LIBRARY_SUFFIX}") set(BOOST_SYSTEM_LIBRARY "${BOOST_STATIC_SYSTEM_LIBRARY}") set(BOOST_FILESYSTEM_LIBRARY "${BOOST_STATIC_FILESYSTEM_LIBRARY}") + set(BOOST_REGEX_LIBRARY "${BOOST_STATIC_REGEX_LIBRARY}") if (ARROW_BOOST_HEADER_ONLY) set(BOOST_BUILD_PRODUCTS) set(BOOST_CONFIGURE_COMMAND "") @@ -304,11 +319,12 @@ if (ARROW_BOOST_VENDORED) else() set(BOOST_BUILD_PRODUCTS ${BOOST_SYSTEM_LIBRARY} - ${BOOST_FILESYSTEM_LIBRARY}) + ${BOOST_FILESYSTEM_LIBRARY} + ${BOOST_REGEX_LIBRARY}) set(BOOST_CONFIGURE_COMMAND "./bootstrap.sh" "--prefix=${BOOST_PREFIX}" - "--with-libraries=filesystem,system") + "--with-libraries=filesystem,regex,system") if ("${CMAKE_BUILD_TYPE}" STREQUAL "DEBUG") set(BOOST_BUILD_VARIANT "debug") else() @@ -348,16 +364,19 @@ else() if (ARROW_BOOST_HEADER_ONLY) find_package(Boost REQUIRED) else() - find_package(Boost COMPONENTS system filesystem REQUIRED) + find_package(Boost COMPONENTS regex system filesystem REQUIRED) if ("${CMAKE_BUILD_TYPE}" STREQUAL "DEBUG") set(BOOST_SHARED_SYSTEM_LIBRARY ${Boost_SYSTEM_LIBRARY_DEBUG}) set(BOOST_SHARED_FILESYSTEM_LIBRARY ${Boost_FILESYSTEM_LIBRARY_DEBUG}) + set(BOOST_SHARED_REGEX_LIBRARY ${Boost_REGEX_LIBRARY_DEBUG}) else() set(BOOST_SHARED_SYSTEM_LIBRARY ${Boost_SYSTEM_LIBRARY_RELEASE}) set(BOOST_SHARED_FILESYSTEM_LIBRARY ${Boost_FILESYSTEM_LIBRARY_RELEASE}) + set(BOOST_SHARED_REGEX_LIBRARY ${Boost_REGEX_LIBRARY_RELEASE}) endif() set(BOOST_SYSTEM_LIBRARY boost_system_shared) set(BOOST_FILESYSTEM_LIBRARY boost_filesystem_shared) + set(BOOST_REGEX_LIBRARY boost_regex_shared) endif() else() # Find static boost headers and libs @@ -366,16 +385,19 @@ else() if (ARROW_BOOST_HEADER_ONLY) find_package(Boost REQUIRED) else() - find_package(Boost COMPONENTS system filesystem REQUIRED) + find_package(Boost COMPONENTS regex system filesystem REQUIRED) if ("${CMAKE_BUILD_TYPE}" STREQUAL "DEBUG") set(BOOST_STATIC_SYSTEM_LIBRARY ${Boost_SYSTEM_LIBRARY_DEBUG}) set(BOOST_STATIC_FILESYSTEM_LIBRARY ${Boost_FILESYSTEM_LIBRARY_DEBUG}) + set(BOOST_STATIC_REGEX_LIBRARY ${Boost_REGEX_LIBRARY_DEBUG}) else() set(BOOST_STATIC_SYSTEM_LIBRARY ${Boost_SYSTEM_LIBRARY_RELEASE}) set(BOOST_STATIC_FILESYSTEM_LIBRARY ${Boost_FILESYSTEM_LIBRARY_RELEASE}) + set(BOOST_STATIC_REGEX_LIBRARY ${Boost_REGEX_LIBRARY_RELEASE}) endif() set(BOOST_SYSTEM_LIBRARY boost_system_static) set(BOOST_FILESYSTEM_LIBRARY boost_filesystem_static) + set(BOOST_REGEX_LIBRARY boost_regex_static) endif() endif() endif() @@ -392,6 +414,10 @@ if (NOT ARROW_BOOST_HEADER_ONLY) STATIC_LIB "${BOOST_STATIC_FILESYSTEM_LIBRARY}" SHARED_LIB "${BOOST_SHARED_FILESYSTEM_LIBRARY}") + ADD_THIRDPARTY_LIB(boost_regex + STATIC_LIB "${BOOST_STATIC_REGEX_LIBRARY}" + SHARED_LIB "${BOOST_SHARED_REGEX_LIBRARY}") + SET(ARROW_BOOST_LIBS boost_system boost_filesystem) endif() @@ -1122,36 +1148,12 @@ endif() # ---------------------------------------------------------------------- # Thrift -if (ARROW_HIVESERVER2) +if (ARROW_WITH_THRIFT) # find thrift headers and libs find_package(Thrift) if (NOT THRIFT_FOUND) - set(ZLIB_PREFIX "${CMAKE_CURRENT_BINARY_DIR}/zlib_ep/src/zlib_ep-install") - set(ZLIB_HOME "${ZLIB_PREFIX}") - set(ZLIB_INCLUDE_DIR "${ZLIB_PREFIX}/include") - if (MSVC) - if (${UPPERCASE_BUILD_TYPE} STREQUAL "DEBUG") - set(ZLIB_STATIC_LIB_NAME zlibstaticd.lib) - else() - set(ZLIB_STATIC_LIB_NAME zlibstatic.lib) - endif() - else() - set(ZLIB_STATIC_LIB_NAME libz.a) - endif() - set(ZLIB_STATIC_LIB "${ZLIB_PREFIX}/lib/${ZLIB_STATIC_LIB_NAME}") - set(ZLIB_CMAKE_ARGS -DCMAKE_BUILD_TYPE=${CMAKE_BUILD_TYPE} - -DCMAKE_INSTALL_PREFIX=${ZLIB_PREFIX} - -DCMAKE_C_FLAGS=${EP_C_FLAGS} - -DBUILD_SHARED_LIBS=OFF) - ExternalProject_Add(zlib_ep - URL "http://zlib.net/fossils/zlib-1.2.8.tar.gz" - BUILD_BYPRODUCTS "${ZLIB_STATIC_LIB}" - ${ZLIB_BUILD_BYPRODUCTS} - ${EP_LOG_OPTIONS} - CMAKE_ARGS ${ZLIB_CMAKE_ARGS}) - set(THRIFT_PREFIX "${CMAKE_CURRENT_BINARY_DIR}/thrift_ep/src/thrift_ep-install") set(THRIFT_HOME "${THRIFT_PREFIX}") set(THRIFT_INCLUDE_DIR "${THRIFT_PREFIX}/include") diff --git a/cpp/src/arrow/CMakeLists.txt b/cpp/src/arrow/CMakeLists.txt index 1f0a62530232a..b46f35c2c0a69 100644 --- a/cpp/src/arrow/CMakeLists.txt +++ b/cpp/src/arrow/CMakeLists.txt @@ -145,6 +145,7 @@ set(ARROW_ALL_SRCS ADD_ARROW_LIB(arrow SOURCES ${ARROW_ALL_SRCS} + OUTPUTS ARROW_LIBRARIES DEPENDENCIES arrow_dependencies SHARED_LINK_FLAGS ${ARROW_SHARED_LINK_FLAGS} SHARED_LINK_LIBS ${ARROW_LINK_LIBS} @@ -153,6 +154,15 @@ ADD_ARROW_LIB(arrow STATIC_PRIVATE_LINK_LIBS ${ARROW_STATIC_PRIVATE_LINK_LIBS} ) +foreach(LIB_TARGET ${ARROW_LIBRARIES}) + target_compile_definitions(${LIB_TARGET} + PRIVATE ARROW_EXPORTING) +endforeach() + +if (ARROW_BUILD_STATIC AND MSVC) + target_compile_definitions(arrow_static PUBLIC ARROW_STATIC) +endif() + # Headers: top level install(FILES allocator.h diff --git a/cpp/src/arrow/dbi/hiveserver2/CMakeLists.txt b/cpp/src/arrow/dbi/hiveserver2/CMakeLists.txt index 60d88daad99d5..ddeb26f4dff42 100644 --- a/cpp/src/arrow/dbi/hiveserver2/CMakeLists.txt +++ b/cpp/src/arrow/dbi/hiveserver2/CMakeLists.txt @@ -15,6 +15,8 @@ # specific language governing permissions and limitations # under the License. +add_custom_target(arrow_hiveserver2) + # Headers: top level install(FILES api.h @@ -85,11 +87,17 @@ set_target_properties(arrow_hiveserver2_thrift ADD_ARROW_LIB(arrow_hiveserver2 SOURCES ${ARROW_HIVESERVER2_SRCS} + OUTPUTS ARROW_HIVESERVER2_LIBRARIES DEPENDENCIES arrow_hiveserver2_thrift SHARED_LINK_FLAGS "" SHARED_LINK_LIBS ${ARROW_PYTHON_SHARED_LINK_LIBS} ) +foreach(LIB_TARGET ${ARROW_HIVESERVER2_LIBRARIES}) + target_compile_definitions(${LIB_TARGET} + PRIVATE ARROW_EXPORTING) +endforeach() + set_property(SOURCE ${ARROW_HIVESERVER2_SRCS} APPEND_STRING PROPERTY COMPILE_FLAGS " -Wno-shadow-field") @@ -102,7 +110,7 @@ set(ARROW_HIVESERVER2_TEST_LINK_LIBS ADD_ARROW_TEST(hiveserver2-test STATIC_LINK_LIBS "${ARROW_HIVESERVER2_TEST_LINK_LIBS}" - LABELS "hiveserver2" + LABELS "arrow_hiveserver2" ) set_property(TARGET hiveserver2-test diff --git a/cpp/src/arrow/dbi/hiveserver2/sample-usage.cc b/cpp/src/arrow/dbi/hiveserver2/sample-usage.cc index f16a81b07b65c..e2c6079c65a26 100644 --- a/cpp/src/arrow/dbi/hiveserver2/sample-usage.cc +++ b/cpp/src/arrow/dbi/hiveserver2/sample-usage.cc @@ -81,7 +81,7 @@ int main(int argc, char** argv) { unique_ptr execute_results; bool has_more_rows = true; - int total_retrieved = 0; + int64_t total_retrieved = 0; std::cout << "Contents of test:\n"; while (has_more_rows) { status = execute_op->Fetch(&execute_results, &has_more_rows); diff --git a/cpp/src/arrow/gpu/CMakeLists.txt b/cpp/src/arrow/gpu/CMakeLists.txt index 2235915c4ac26..a5b11e5a917ac 100644 --- a/cpp/src/arrow/gpu/CMakeLists.txt +++ b/cpp/src/arrow/gpu/CMakeLists.txt @@ -41,6 +41,7 @@ set(ARROW_GPU_SHARED_LINK_LIBS ADD_ARROW_LIB(arrow_gpu SOURCES ${ARROW_GPU_SRCS} + OUTPUTS ARROW_GPU_LIBRARIES DEPENDENCIES metadata_fbs SHARED_LINK_FLAGS "" SHARED_LINK_LIBS arrow_shared ${ARROW_GPU_SHARED_LINK_LIBS} @@ -48,6 +49,11 @@ ADD_ARROW_LIB(arrow_gpu STATIC_LINK_LIBS ${ARROW_GPU_SHARED_LINK_LIBS} ) +foreach(LIB_TARGET ${ARROW_GPU_LIBRARIES}) + target_compile_definitions(${LIB_TARGET} + PRIVATE ARROW_EXPORTING) +endforeach() + # CUDA build version configure_file(cuda_version.h.in "${CMAKE_CURRENT_BINARY_DIR}/cuda_version.h" diff --git a/cpp/src/arrow/io/file.cc b/cpp/src/arrow/io/file.cc index 6cd1546f7c0a9..92ecb74e4c09f 100644 --- a/cpp/src/arrow/io/file.cc +++ b/cpp/src/arrow/io/file.cc @@ -15,7 +15,7 @@ // specific language governing permissions and limitations // under the License. -#include "arrow/io/windows_compatibility.h" +#include "arrow/util/windows_compatibility.h" // sys/mman.h not present in Visual Studio or Cygwin #ifdef _WIN32 diff --git a/cpp/src/arrow/io/hdfs-internal.h b/cpp/src/arrow/io/hdfs-internal.h index 9321b211988c8..3912f2f1144f7 100644 --- a/cpp/src/arrow/io/hdfs-internal.h +++ b/cpp/src/arrow/io/hdfs-internal.h @@ -23,8 +23,8 @@ #include -#include "arrow/io/windows_compatibility.h" // IWYU pragma: keep #include "arrow/util/visibility.h" +#include "arrow/util/windows_compatibility.h" // IWYU pragma: keep using std::size_t; diff --git a/cpp/src/arrow/python/CMakeLists.txt b/cpp/src/arrow/python/CMakeLists.txt index d6dcf2f32dbb1..798217d58624a 100644 --- a/cpp/src/arrow/python/CMakeLists.txt +++ b/cpp/src/arrow/python/CMakeLists.txt @@ -22,6 +22,8 @@ find_package(PythonLibsNew REQUIRED) find_package(NumPy REQUIRED) +add_custom_target(arrow_python) + set(ARROW_PYTHON_SRCS arrow_to_pandas.cc benchmark.cc @@ -65,12 +67,22 @@ set(ARROW_PYTHON_INCLUDES ADD_ARROW_LIB(arrow_python SOURCES ${ARROW_PYTHON_SRCS} + OUTPUTS ARROW_PYTHON_LIBRARIES SHARED_LINK_FLAGS "" SHARED_LINK_LIBS ${ARROW_PYTHON_SHARED_LINK_LIBS} STATIC_LINK_LIBS "${PYTHON_OTHER_LIBS}" EXTRA_INCLUDES "${ARROW_PYTHON_INCLUDES}" ) +foreach(LIB_TARGET ${ARROW_PYTHON_LIBRARIES}) + target_compile_definitions(${LIB_TARGET} + PRIVATE ARROW_EXPORTING) +endforeach() + +if (ARROW_BUILD_STATIC AND MSVC) + target_compile_definitions(arrow_python_static PUBLIC ARROW_STATIC) +endif() + if ("${COMPILER_FAMILY}" STREQUAL "clang") # Clang, be quiet. Python C API has lots of macros set_property(SOURCE ${ARROW_PYTHON_SRCS} @@ -143,6 +155,6 @@ if (ARROW_BUILD_TESTS) STATIC_LINK_LIBS "${ARROW_PYTHON_TEST_LINK_LIBS}" EXTRA_LINK_LIBS ${PYTHON_LIBRARIES} EXTRA_INCLUDES "${ARROW_PYTHON_INCLUDES}" - LABELS "python" + LABELS "arrow_python" NO_VALGRIND) endif() diff --git a/cpp/src/arrow/type.h b/cpp/src/arrow/type.h index 8e128809e26b2..178df73ad5d9b 100644 --- a/cpp/src/arrow/type.h +++ b/cpp/src/arrow/type.h @@ -178,9 +178,6 @@ inline std::ostream& operator<<(std::ostream& os, const DataType& type) { return os; } -// TODO(wesm): Remove this from parquet-cpp -using TypePtr = std::shared_ptr; - class ARROW_EXPORT FixedWidthType : public DataType { public: using DataType::DataType; diff --git a/cpp/src/arrow/util/io-util.cc b/cpp/src/arrow/util/io-util.cc index c191e52c1fe0a..8e578e4f5394c 100644 --- a/cpp/src/arrow/util/io-util.cc +++ b/cpp/src/arrow/util/io-util.cc @@ -22,7 +22,7 @@ #define _FILE_OFFSET_BITS 64 -#include "arrow/io/windows_compatibility.h" +#include "arrow/util/windows_compatibility.h" #include #include diff --git a/cpp/src/arrow/util/visibility.h b/cpp/src/arrow/util/visibility.h index 119c55df3b619..34aa752fd2153 100644 --- a/cpp/src/arrow/util/visibility.h +++ b/cpp/src/arrow/util/visibility.h @@ -55,8 +55,10 @@ // This is a complicated topic, some reading on it: // http://www.codesynthesis.com/~boris/blog/2010/01/18/dll-export-cxx-templates/ #if defined(_MSC_VER) || defined(__clang__) +#define ARROW_TEMPLATE_CLASS_EXPORT #define ARROW_TEMPLATE_EXPORT ARROW_EXPORT #else +#define ARROW_TEMPLATE_CLASS_EXPORT ARROW_EXPORT #define ARROW_TEMPLATE_EXPORT #endif diff --git a/cpp/src/arrow/io/windows_compatibility.h b/cpp/src/arrow/util/windows_compatibility.h similarity index 89% rename from cpp/src/arrow/io/windows_compatibility.h rename to cpp/src/arrow/util/windows_compatibility.h index ac8f6aeeb5cac..aeef810cf85fe 100644 --- a/cpp/src/arrow/io/windows_compatibility.h +++ b/cpp/src/arrow/util/windows_compatibility.h @@ -15,8 +15,7 @@ // specific language governing permissions and limitations // under the License. -#ifndef ARROW_IO_WINDOWS_COMPATIBILITY -#define ARROW_IO_WINDOWS_COMPATIBILITY +#pragma once #ifdef _WIN32 @@ -32,5 +31,3 @@ // #include #endif // _WIN32 - -#endif // ARROW_IO_WINDOWS_COMPATIBILITY diff --git a/cpp/src/parquet/.parquetcppversion b/cpp/src/parquet/.parquetcppversion new file mode 100644 index 0000000000000..d65937f100bb5 --- /dev/null +++ b/cpp/src/parquet/.parquetcppversion @@ -0,0 +1 @@ +1.4.1-SNAPSHOT diff --git a/cpp/src/parquet/CMakeLists.txt b/cpp/src/parquet/CMakeLists.txt index 93a242c68bfee..f73117955e6ea 100644 --- a/cpp/src/parquet/CMakeLists.txt +++ b/cpp/src/parquet/CMakeLists.txt @@ -15,6 +15,214 @@ # specific language governing permissions and limitations # under the License. +file(READ "${CMAKE_CURRENT_SOURCE_DIR}/.parquetcppversion" PARQUET_VERSION) +string(REPLACE "\n" "" PARQUET_VERSION "${PARQUET_VERSION}") +string(REGEX MATCH "^([0-9]+\.[0-9]+\.[0-9]+(\.[0-9]+)?)" VERSION ${PARQUET_VERSION}) +if(NOT VERSION) + message(FATAL_ERROR "invalid .parquetcppversion") +endif() + +# For "make parquet" to build everything Parquet-related +add_custom_target(parquet) + +function(ADD_PARQUET_TEST REL_TEST_NAME) + set(options NO_VALGRIND) + set(one_value_args) + set(multi_value_args EXTRA_DEPENDENCIES LABELS) + cmake_parse_arguments(ARG "${options}" "${one_value_args}" "${multi_value_args}" ${ARGN}) + if(ARG_UNPARSED_ARGUMENTS) + message(SEND_ERROR "Error: unrecognized arguments: ${ARG_UNPARSED_ARGUMENTS}") + endif() + + ADD_ARROW_TEST(${REL_TEST_NAME} + STATIC_LINK_LIBS ${PARQUET_TEST_LINK_LIBS} + PREFIX "parquet" + LABELS "parquet") +endfunction() + +# ---------------------------------------------------------------------- +# Link libraries setup + +# TODO(wesm): Handling of ABI/SO version + +if ("${PARQUET_ARROW_LINKAGE}" STREQUAL "shared") + set(PARQUET_ARROW_LINK_LIBS + arrow_shared) +else() + set(PARQUET_ARROW_LINK_LIBS + arrow_static + ${ARROW_STATIC_LINK_LIBS}) +endif() + +if (ARROW_BOOST_USE_SHARED) + set(PARQUET_BOOST_LINK_LIBS + boost_regex_shared) + if(MSVC) + set(PARQUET_BOOST_LINK_LIBS ${PARQUET_BOOST_LINK_LIBS} + boost_system_shared) + endif() +else() + set(PARQUET_BOOST_LINK_LIBS + boost_regex_static) + if(MSVC) + set(PARQUET_BOOST_LINK_LIBS ${PARQUET_BOOST_LINK_LIBS} + boost_system_static boost_filesystem_static) + endif() +endif() + +set(PARQUET_MIN_TEST_LIBS + gtest_main + gtest) + +if (APPLE) + set(PARQUET_MIN_TEST_LIBS + ${PARQUET_MIN_TEST_LIBS} + ${CMAKE_DL_LIBS}) +elseif(NOT MSVC) + set(PARQUET_MIN_TEST_LIBS + ${PARQUET_MIN_TEST_LIBS} + pthread + ${CMAKE_DL_LIBS}) +endif() + +set(PARQUET_TEST_LINK_LIBS + ${PARQUET_MIN_TEST_LIBS} + ${PARQUET_ARROW_LINK_LIBS} + parquet_static + thriftstatic) + +set(PARQUET_TEST_SHARED_LINK_LIBS + ${PARQUET_MIN_TEST_LIBS} + parquet_shared) + +############################################################# +# Benchmark linking + +if (PARQUET_BUILD_STATIC) + set(PARQUET_BENCHMARK_LINK_LIBS + parquet_benchmark_main + parquet_static) +else() + set(PARQUET_BENCHMARK_LINK_LIBS + parquet_benchmark_main + parquet_shared) +endif() + +############################################################ +# Generated Thrift sources + +if (NOT MSVC) + set_source_files_properties(src/parquet/parquet_types.cpp PROPERTIES + COMPILE_FLAGS -Wno-unused-variable) +endif() + +# List of thrift output targets +set(THRIFT_OUTPUT_DIR ${CMAKE_CURRENT_BINARY_DIR}) +set(THRIFT_OUTPUT_FILES "${THRIFT_OUTPUT_DIR}/parquet_types.cpp") +set(THRIFT_OUTPUT_FILES ${THRIFT_OUTPUT_FILES} "${THRIFT_OUTPUT_DIR}/parquet_types.h") +set(THRIFT_OUTPUT_FILES ${THRIFT_OUTPUT_FILES} "${THRIFT_OUTPUT_DIR}/parquet_constants.cpp") +set(THRIFT_OUTPUT_FILES ${THRIFT_OUTPUT_FILES} "${THRIFT_OUTPUT_DIR}/parquet_constants.h") + +set_source_files_properties(${THRIFT_OUTPUT_FILES} PROPERTIES GENERATED TRUE) + +get_filename_component(ABS_PARQUET_THRIFT parquet.thrift ABSOLUTE) + +add_custom_command( + OUTPUT ${THRIFT_OUTPUT_FILES} + COMMAND ${THRIFT_COMPILER} --gen cpp -out ${THRIFT_OUTPUT_DIR} ${CMAKE_CURRENT_SOURCE_DIR}/parquet.thrift + DEPENDS ${ABS_PARQUET_THRIFT} thriftstatic + COMMENT "Running thrift compiler on parquet.thrift" + VERBATIM +) + +############################################################ +# Library config + +set(PARQUET_SRCS + arrow/reader.cc + arrow/record_reader.cc + arrow/schema.cc + arrow/writer.cc + bloom_filter.cc + column_reader.cc + column_scanner.cc + column_writer.cc + file_reader.cc + file_writer.cc + metadata.cc + murmur3.cc + parquet_constants.cpp + parquet_types.cpp + printer.cc + schema.cc + statistics.cc + types.cc + util/comparison.cc + util/memory.cc +) + +# # Ensure that thrift compilation is done before using its generated headers +# # in parquet code. +add_custom_target(parquet-thrift-deps ALL + DEPENDS ${THRIFT_OUTPUT_FILES}) +set(PARQUET_DEPENDENCIES ${PARQUET_DEPENDENCIES} parquet-thrift-deps) + +if (NOT PARQUET_MINIMAL_DEPENDENCY) +# These are libraries that we will link privately with parquet_shared (as they +# do not need to be linked transitively by other linkers), but publicly with +# parquet_static (because internal users need to transitively link all +# dependencies) + set(PARQUET_INTERFACE_LINK_LIBS + ${PARQUET_ARROW_LINK_LIBS} + ${PARQUET_BOOST_LINK_LIBS} + thriftstatic + ) +# Although we don't link parquet_objlib against anything, we need it to depend +# on these libs as we may generate their headers via ExternalProject_Add + set(PARQUET_DEPENDENCIES ${PARQUET_DEPENDENCIES} ${PARQUET_INTERFACE_LINK_LIBS}) +endif() + +if(NOT APPLE AND NOT MSVC) + # Localize thirdparty symbols using a linker version script. This hides them + # from the client application. The OS X linker does not support the + # version-script option. + set(SHARED_LINK_FLAGS "-Wl,--version-script=${CMAKE_CURRENT_SOURCE_DIR}/src/parquet/symbols.map") +endif() + +ADD_ARROW_LIB(parquet + SOURCES ${PARQUET_SRCS} + OUTPUTS PARQUET_LIBRARIES + DEPENDENCIES ${PARQUET_DEPENDENCIES} + SHARED_LINK_FLAGS ${PARQUET_SHARED_LINK_FLAGS} + SHARED_PRIVATE_LINK_LIBS ${PARQUET_INTERFACE_LINK_LIBS} + STATIC_LINK_LIBS ${PARQUET_INTERFACE_LINK_LIBS} +) + +# Thrift requires these definitions for some types that we use +foreach(LIB_TARGET ${PARQUET_LIBRARIES}) + if ("${PARQUET_ARROW_LINKAGE}" STREQUAL "static") + target_compile_definitions(${LIB_TARGET} + PRIVATE ARROW_STATIC) + endif() + target_compile_definitions(${LIB_TARGET} + PRIVATE PARQUET_EXPORTING + PRIVATE HAVE_INTTYPES_H + PRIVATE HAVE_NETDB_H) + if (MSVC) + target_compile_definitions(${LIB_TARGET} + PRIVATE NOMINMAX) + else() + target_compile_definitions(${LIB_TARGET} + PRIVATE HAVE_NETINET_IN_H) + endif() +endforeach() + +add_dependencies(parquet ${PARQUET_LIBRARIES}) + +add_subdirectory(api) +add_subdirectory(arrow) +add_subdirectory(util) + # Headers: top level install(FILES bloom_filter.h @@ -37,11 +245,11 @@ install(FILES DESTINATION "${CMAKE_INSTALL_INCLUDEDIR}/parquet") configure_file(parquet_version.h.in - "${CMAKE_CURRENT_SOURCE_DIR}/parquet_version.h" + "${CMAKE_CURRENT_BINARY_DIR}/parquet_version.h" @ONLY) install(FILES - "${CMAKE_CURRENT_SOURCE_DIR}/parquet_version.h" + "${CMAKE_CURRENT_BINARY_DIR}/parquet_version.h" DESTINATION "${CMAKE_INSTALL_INCLUDEDIR}/parquet") # pkg-config support @@ -68,5 +276,16 @@ ADD_PARQUET_TEST(types-test) ADD_PARQUET_TEST(reader-test) ADD_PARQUET_TEST(schema-test) -ADD_PARQUET_BENCHMARK(column-io-benchmark) -ADD_PARQUET_BENCHMARK(encoding-benchmark) +set(PARQUET_BENCHMARK_LINK_LIBRARIES + arrow_benchmark_main + parquet_static + arrow_static + thriftstatic) + +ADD_ARROW_BENCHMARK(column-io-benchmark) +ARROW_BENCHMARK_LINK_LIBRARIES(column-io-benchmark + ${PARQUET_BENCHMARK_LINK_LIBRARIES}) + +ADD_ARROW_BENCHMARK(encoding-benchmark) +ARROW_BENCHMARK_LINK_LIBRARIES(encoding-benchmark + ${PARQUET_BENCHMARK_LINK_LIBRARIES}) diff --git a/cpp/src/parquet/arrow/CMakeLists.txt b/cpp/src/parquet/arrow/CMakeLists.txt index 616555fae87c1..aa58a5d0d552e 100644 --- a/cpp/src/parquet/arrow/CMakeLists.txt +++ b/cpp/src/parquet/arrow/CMakeLists.txt @@ -19,7 +19,9 @@ ADD_PARQUET_TEST(arrow-schema-test) ADD_PARQUET_TEST(arrow-reader-writer-test) if(PARQUET_BUILD_BENCHMARKS) - ADD_PARQUET_BENCHMARK(arrow-reader-writer-benchmark) + ADD_ARROW_BENCHMARK(arrow-reader-writer-benchmark) + ARROW_BENCHMARK_LINK_LIBRARIES(arrow-reader-writer-benchmark + ${PARQUET_BENCHMARK_LINK_LIBRARIES}) endif() # Headers: top level diff --git a/cpp/src/parquet/arrow/arrow-reader-writer-benchmark.cc b/cpp/src/parquet/arrow/arrow-reader-writer-benchmark.cc index 41cb88d6c424d..775c1028bb43f 100644 --- a/cpp/src/parquet/arrow/arrow-reader-writer-benchmark.cc +++ b/cpp/src/parquet/arrow/arrow-reader-writer-benchmark.cc @@ -98,7 +98,7 @@ void SetBytesProcessed(::benchmark::State& state) { template std::shared_ptr<::arrow::Table> TableFromVector( const std::vector& vec, bool nullable) { - ::arrow::TypePtr type = std::make_shared>(); + std::shared_ptr<::arrow::DataType> type = std::make_shared>(); NumericBuilder> builder; if (nullable) { std::vector valid_bytes(BENCHMARK_SIZE, 0); diff --git a/cpp/src/parquet/arrow/reader.cc b/cpp/src/parquet/arrow/reader.cc index 2e4dc8155787d..11fb20cd1cd8b 100644 --- a/cpp/src/parquet/arrow/reader.cc +++ b/cpp/src/parquet/arrow/reader.cc @@ -25,6 +25,7 @@ #include #include #include +#include #include #include "arrow/api.h" diff --git a/cpp/src/parquet/arrow/reader.h b/cpp/src/parquet/arrow/reader.h index db135da53e603..6eee0f6e254ce 100644 --- a/cpp/src/parquet/arrow/reader.h +++ b/cpp/src/parquet/arrow/reader.h @@ -204,6 +204,7 @@ class PARQUET_EXPORT FileReader { /// Set the number of threads to use during reads of multiple columns. By /// default only 1 thread is used /// \deprecated Use set_use_threads instead. + ARROW_DEPRECATED("Use set_use_threads instead") void set_num_threads(int num_threads); /// Set whether to use multiple threads during reads of multiple columns. diff --git a/cpp/src/parquet/arrow/record_reader.cc b/cpp/src/parquet/arrow/record_reader.cc index 781e1ba436d39..3fbdfd5861ad3 100644 --- a/cpp/src/parquet/arrow/record_reader.cc +++ b/cpp/src/parquet/arrow/record_reader.cc @@ -21,6 +21,7 @@ #include #include #include +#include #include #include @@ -532,7 +533,7 @@ inline void TypedRecordReader::ReadValuesDense(int64_t values_to_ auto builder = static_cast<::arrow::BinaryBuilder*>(builder_.get()); for (int64_t i = 0; i < num_decoded; i++) { PARQUET_THROW_NOT_OK( - builder->Append(values[i].ptr, static_cast(values[i].len))); + builder->Append(values[i].ptr, static_cast(values[i].len))); } ResetValues(); } @@ -568,7 +569,7 @@ inline void TypedRecordReader::ReadValuesSpaced(int64_t values_to for (int64_t i = 0; i < num_decoded; i++) { if (::arrow::BitUtil::GetBit(valid_bits, valid_bits_offset + i)) { PARQUET_THROW_NOT_OK( - builder->Append(values[i].ptr, static_cast(values[i].len))); + builder->Append(values[i].ptr, static_cast(values[i].len))); } else { PARQUET_THROW_NOT_OK(builder->AppendNull()); } diff --git a/cpp/src/parquet/arrow/record_reader.h b/cpp/src/parquet/arrow/record_reader.h index f02bf0539850d..4935713a24ec1 100644 --- a/cpp/src/parquet/arrow/record_reader.h +++ b/cpp/src/parquet/arrow/record_reader.h @@ -32,7 +32,7 @@ #include "parquet/column_reader.h" #include "parquet/schema.h" -#include "parquet/util/visibility.h" +#include "parquet/util/macros.h" namespace parquet { namespace internal { diff --git a/cpp/src/parquet/arrow/schema.cc b/cpp/src/parquet/arrow/schema.cc index 48b1181cb0c5d..d0014a6f3aa2a 100644 --- a/cpp/src/parquet/arrow/schema.cc +++ b/cpp/src/parquet/arrow/schema.cc @@ -29,9 +29,9 @@ using arrow::Field; using arrow::Status; -using arrow::TypePtr; -using ArrowType = arrow::Type; +using ArrowType = arrow::DataType; +using ArrowTypeId = arrow::Type; using parquet::Repetition; using parquet::schema::GroupNode; @@ -50,12 +50,12 @@ const auto TIMESTAMP_MS = ::arrow::timestamp(::arrow::TimeUnit::MILLI); const auto TIMESTAMP_US = ::arrow::timestamp(::arrow::TimeUnit::MICRO); const auto TIMESTAMP_NS = ::arrow::timestamp(::arrow::TimeUnit::NANO); -TypePtr MakeDecimal128Type(const PrimitiveNode& node) { +std::shared_ptr MakeDecimal128Type(const PrimitiveNode& node) { const auto& metadata = node.decimal_metadata(); return ::arrow::decimal(metadata.precision, metadata.scale); } -static Status FromByteArray(const PrimitiveNode& node, TypePtr* out) { +static Status FromByteArray(const PrimitiveNode& node, std::shared_ptr* out) { switch (node.logical_type()) { case LogicalType::UTF8: *out = ::arrow::utf8(); @@ -71,7 +71,7 @@ static Status FromByteArray(const PrimitiveNode& node, TypePtr* out) { return Status::OK(); } -static Status FromFLBA(const PrimitiveNode& node, TypePtr* out) { +static Status FromFLBA(const PrimitiveNode& node, std::shared_ptr* out) { switch (node.logical_type()) { case LogicalType::NONE: *out = ::arrow::fixed_size_binary(node.type_length()); @@ -89,7 +89,7 @@ static Status FromFLBA(const PrimitiveNode& node, TypePtr* out) { return Status::OK(); } -static Status FromInt32(const PrimitiveNode& node, TypePtr* out) { +static Status FromInt32(const PrimitiveNode& node, std::shared_ptr* out) { switch (node.logical_type()) { case LogicalType::NONE: *out = ::arrow::int32(); @@ -130,7 +130,7 @@ static Status FromInt32(const PrimitiveNode& node, TypePtr* out) { return Status::OK(); } -static Status FromInt64(const PrimitiveNode& node, TypePtr* out) { +static Status FromInt64(const PrimitiveNode& node, std::shared_ptr* out) { switch (node.logical_type()) { case LogicalType::NONE: *out = ::arrow::int64(); @@ -162,7 +162,7 @@ static Status FromInt64(const PrimitiveNode& node, TypePtr* out) { return Status::OK(); } -Status FromPrimitive(const PrimitiveNode& primitive, TypePtr* out) { +Status FromPrimitive(const PrimitiveNode& primitive, std::shared_ptr* out) { if (primitive.logical_type() == LogicalType::NA) { *out = ::arrow::null(); return Status::OK(); @@ -217,7 +217,7 @@ inline bool IsIncludedLeaf(const Node& node, Status StructFromGroup(const GroupNode& group, const std::unordered_set* included_leaf_nodes, - TypePtr* out) { + std::shared_ptr* out) { std::vector> fields; std::shared_ptr field; @@ -237,7 +237,7 @@ Status StructFromGroup(const GroupNode& group, Status NodeToList(const GroupNode& group, const std::unordered_set* included_leaf_nodes, - TypePtr* out) { + std::shared_ptr* out) { *out = nullptr; if (group.field_count() == 1) { // This attempts to resolve the preferred 3-level list encoding. @@ -258,7 +258,7 @@ Status NodeToList(const GroupNode& group, } } else { // List of struct - std::shared_ptr<::arrow::DataType> inner_type; + std::shared_ptr inner_type; RETURN_NOT_OK(StructFromGroup(list_group, included_leaf_nodes, &inner_type)); if (inner_type != nullptr) { auto item_field = std::make_shared(list_node.name(), inner_type, false); @@ -267,7 +267,7 @@ Status NodeToList(const GroupNode& group, } } else if (list_node.is_repeated()) { // repeated primitive node - std::shared_ptr<::arrow::DataType> inner_type; + std::shared_ptr inner_type; if (IsIncludedLeaf(static_cast(list_node), included_leaf_nodes)) { RETURN_NOT_OK( FromPrimitive(static_cast(list_node), &inner_type)); @@ -292,14 +292,14 @@ Status NodeToField(const Node& node, std::shared_ptr* out) { Status NodeToFieldInternal(const Node& node, const std::unordered_set* included_leaf_nodes, std::shared_ptr* out) { - std::shared_ptr<::arrow::DataType> type = nullptr; + std::shared_ptr type = nullptr; bool nullable = !node.is_required(); *out = nullptr; if (node.is_repeated()) { // 1-level LIST encoding fields are required - std::shared_ptr<::arrow::DataType> inner_type; + std::shared_ptr inner_type; if (node.is_group()) { RETURN_NOT_OK(StructFromGroup(static_cast(node), included_leaf_nodes, &inner_type)); @@ -476,30 +476,30 @@ Status FieldToNode(const std::shared_ptr& field, int scale = -1; switch (field->type()->id()) { - case ArrowType::NA: + case ArrowTypeId::NA: type = ParquetType::INT32; logical_type = LogicalType::NA; break; - case ArrowType::BOOL: + case ArrowTypeId::BOOL: type = ParquetType::BOOLEAN; break; - case ArrowType::UINT8: + case ArrowTypeId::UINT8: type = ParquetType::INT32; logical_type = LogicalType::UINT_8; break; - case ArrowType::INT8: + case ArrowTypeId::INT8: type = ParquetType::INT32; logical_type = LogicalType::INT_8; break; - case ArrowType::UINT16: + case ArrowTypeId::UINT16: type = ParquetType::INT32; logical_type = LogicalType::UINT_16; break; - case ArrowType::INT16: + case ArrowTypeId::INT16: type = ParquetType::INT32; logical_type = LogicalType::INT_16; break; - case ArrowType::UINT32: + case ArrowTypeId::UINT32: if (properties.version() == ::parquet::ParquetVersion::PARQUET_1_0) { type = ParquetType::INT64; } else { @@ -507,36 +507,36 @@ Status FieldToNode(const std::shared_ptr& field, logical_type = LogicalType::UINT_32; } break; - case ArrowType::INT32: + case ArrowTypeId::INT32: type = ParquetType::INT32; break; - case ArrowType::UINT64: + case ArrowTypeId::UINT64: type = ParquetType::INT64; logical_type = LogicalType::UINT_64; break; - case ArrowType::INT64: + case ArrowTypeId::INT64: type = ParquetType::INT64; break; - case ArrowType::FLOAT: + case ArrowTypeId::FLOAT: type = ParquetType::FLOAT; break; - case ArrowType::DOUBLE: + case ArrowTypeId::DOUBLE: type = ParquetType::DOUBLE; break; - case ArrowType::STRING: + case ArrowTypeId::STRING: type = ParquetType::BYTE_ARRAY; logical_type = LogicalType::UTF8; break; - case ArrowType::BINARY: + case ArrowTypeId::BINARY: type = ParquetType::BYTE_ARRAY; break; - case ArrowType::FIXED_SIZE_BINARY: { + case ArrowTypeId::FIXED_SIZE_BINARY: { type = ParquetType::FIXED_LEN_BYTE_ARRAY; const auto& fixed_size_binary_type = static_cast(*field->type()); length = fixed_size_binary_type.byte_width(); } break; - case ArrowType::DECIMAL: { + case ArrowTypeId::DECIMAL: { type = ParquetType::FIXED_LEN_BYTE_ARRAY; logical_type = LogicalType::DECIMAL; const auto& decimal_type = @@ -545,24 +545,24 @@ Status FieldToNode(const std::shared_ptr& field, scale = decimal_type.scale(); length = DecimalSize(precision); } break; - case ArrowType::DATE32: + case ArrowTypeId::DATE32: type = ParquetType::INT32; logical_type = LogicalType::DATE; break; - case ArrowType::DATE64: + case ArrowTypeId::DATE64: type = ParquetType::INT32; logical_type = LogicalType::DATE; break; - case ArrowType::TIMESTAMP: + case ArrowTypeId::TIMESTAMP: RETURN_NOT_OK( GetTimestampMetadata(static_cast<::arrow::TimestampType&>(*field->type()), arrow_properties, &type, &logical_type)); break; - case ArrowType::TIME32: + case ArrowTypeId::TIME32: type = ParquetType::INT32; logical_type = LogicalType::TIME_MILLIS; break; - case ArrowType::TIME64: { + case ArrowTypeId::TIME64: { auto time_type = static_cast<::arrow::Time64Type*>(field->type().get()); if (time_type->unit() == ::arrow::TimeUnit::NANO) { return Status::NotImplemented("Nanosecond time not supported in Parquet."); @@ -570,17 +570,17 @@ Status FieldToNode(const std::shared_ptr& field, type = ParquetType::INT64; logical_type = LogicalType::TIME_MICROS; } break; - case ArrowType::STRUCT: { + case ArrowTypeId::STRUCT: { auto struct_type = std::static_pointer_cast<::arrow::StructType>(field->type()); return StructToNode(struct_type, field->name(), field->nullable(), properties, arrow_properties, out); } - case ArrowType::LIST: { + case ArrowTypeId::LIST: { auto list_type = std::static_pointer_cast<::arrow::ListType>(field->type()); return ListToNode(list_type, field->name(), field->nullable(), properties, arrow_properties, out); } - case ArrowType::DICTIONARY: { + case ArrowTypeId::DICTIONARY: { // Parquet has no Dictionary type, dictionary-encoded is handled on // the encoding, not the schema level. const ::arrow::DictionaryType& dict_type = diff --git a/cpp/src/parquet/arrow/test-util.h b/cpp/src/parquet/arrow/test-util.h index 631bb7103a6c5..d425cb0db7e48 100644 --- a/cpp/src/parquet/arrow/test-util.h +++ b/cpp/src/parquet/arrow/test-util.h @@ -16,7 +16,9 @@ // under the License. #include +#include #include +#include #include #include "arrow/api.h" diff --git a/cpp/src/parquet/arrow/writer.cc b/cpp/src/parquet/arrow/writer.cc index b7d139e09eabf..9247b84cf37e5 100644 --- a/cpp/src/parquet/arrow/writer.cc +++ b/cpp/src/parquet/arrow/writer.cc @@ -19,6 +19,7 @@ #include #include +#include #include #include "arrow/api.h" @@ -26,8 +27,8 @@ #include "arrow/util/bit-util.h" #include "arrow/visitor_inline.h" +#include "arrow/util/logging.h" #include "parquet/arrow/schema.h" -#include "parquet/util/logging.h" using arrow::Array; using arrow::BinaryArray; diff --git a/cpp/src/parquet/bloom_filter.cc b/cpp/src/parquet/bloom_filter.cc index faa344cb6147f..31a33fa782a7b 100644 --- a/cpp/src/parquet/bloom_filter.cc +++ b/cpp/src/parquet/bloom_filter.cc @@ -21,11 +21,11 @@ #include "arrow/status.h" #include "arrow/util/bit-util.h" +#include "arrow/util/logging.h" #include "parquet/bloom_filter.h" #include "parquet/exception.h" #include "parquet/murmur3.h" #include "parquet/types.h" -#include "parquet/util/logging.h" namespace parquet { constexpr uint32_t BlockSplitBloomFilter::SALT[kBitsSetPerBlock]; diff --git a/cpp/src/parquet/bloom_filter.h b/cpp/src/parquet/bloom_filter.h index e39370a44c907..876f70382a165 100644 --- a/cpp/src/parquet/bloom_filter.h +++ b/cpp/src/parquet/bloom_filter.h @@ -19,11 +19,12 @@ #define PARQUET_BLOOM_FILTER_H #include +#include +#include "arrow/util/logging.h" #include "parquet/exception.h" #include "parquet/hasher.h" #include "parquet/types.h" -#include "parquet/util/logging.h" #include "parquet/util/memory.h" namespace parquet { diff --git a/cpp/src/parquet/column-io-benchmark.cc b/cpp/src/parquet/column-io-benchmark.cc index a9a7530eb3662..8f286f4910000 100644 --- a/cpp/src/parquet/column-io-benchmark.cc +++ b/cpp/src/parquet/column-io-benchmark.cc @@ -20,7 +20,7 @@ #include "parquet/column_reader.h" #include "parquet/column_writer.h" #include "parquet/file_reader.h" -#include "parquet/parquet_types.h" +#include "parquet/thrift.h" #include "parquet/util/memory.h" namespace parquet { @@ -223,7 +223,7 @@ static void BM_RleDecoding(::benchmark::State& state) { LevelDecoder level_decoder; level_decoder.SetData(Encoding::RLE, max_level, static_cast(levels.size()), buffer_rle->data()); - level_decoder.Decode(state.range(0), levels.data()); + level_decoder.Decode(static_cast(state.range(0)), levels.data()); } state.SetBytesProcessed(state.iterations() * state.range(0) * sizeof(int16_t)); diff --git a/cpp/src/parquet/column_reader.cc b/cpp/src/parquet/column_reader.cc index 28d0dcb6d32fa..173292ecddbd9 100644 --- a/cpp/src/parquet/column_reader.cc +++ b/cpp/src/parquet/column_reader.cc @@ -20,6 +20,7 @@ #include #include #include +#include #include #include @@ -29,7 +30,6 @@ #include "parquet/column_page.h" #include "parquet/encoding-internal.h" -#include "parquet/parquet_types.h" #include "parquet/properties.h" #include "parquet/thrift.h" diff --git a/cpp/src/parquet/column_reader.h b/cpp/src/parquet/column_reader.h index 71346320e3727..d1b4d2ef50081 100644 --- a/cpp/src/parquet/column_reader.h +++ b/cpp/src/parquet/column_reader.h @@ -25,6 +25,7 @@ #include #include #include +#include #include #include @@ -37,6 +38,7 @@ #include "parquet/exception.h" #include "parquet/schema.h" #include "parquet/types.h" +#include "parquet/util/macros.h" #include "parquet/util/memory.h" #include "parquet/util/visibility.h" @@ -209,13 +211,13 @@ static inline void DefinitionLevelsToBitmap( // API to read values from a single column. This is a main client facing API. template -class PARQUET_EXPORT TypedColumnReader : public ColumnReader { +class PARQUET_TEMPLATE_CLASS_EXPORT TypedColumnReader : public ColumnReader { public: typedef typename DType::c_type T; TypedColumnReader(const ColumnDescriptor* schema, std::unique_ptr pager, ::arrow::MemoryPool* pool = ::arrow::default_memory_pool()) - : ColumnReader(schema, std::move(pager), pool), current_decoder_(nullptr) {} + : ColumnReader(schema, std::move(pager), pool), current_decoder_(NULLPTR) {} // Read a batch of repetition levels, definition levels, and values from the // column. @@ -521,14 +523,14 @@ typedef TypedColumnReader DoubleReader; typedef TypedColumnReader ByteArrayReader; typedef TypedColumnReader FixedLenByteArrayReader; -extern template class PARQUET_EXPORT TypedColumnReader; -extern template class PARQUET_EXPORT TypedColumnReader; -extern template class PARQUET_EXPORT TypedColumnReader; -extern template class PARQUET_EXPORT TypedColumnReader; -extern template class PARQUET_EXPORT TypedColumnReader; -extern template class PARQUET_EXPORT TypedColumnReader; -extern template class PARQUET_EXPORT TypedColumnReader; -extern template class PARQUET_EXPORT TypedColumnReader; +PARQUET_EXTERN_TEMPLATE TypedColumnReader; +PARQUET_EXTERN_TEMPLATE TypedColumnReader; +PARQUET_EXTERN_TEMPLATE TypedColumnReader; +PARQUET_EXTERN_TEMPLATE TypedColumnReader; +PARQUET_EXTERN_TEMPLATE TypedColumnReader; +PARQUET_EXTERN_TEMPLATE TypedColumnReader; +PARQUET_EXTERN_TEMPLATE TypedColumnReader; +PARQUET_EXTERN_TEMPLATE TypedColumnReader; } // namespace parquet diff --git a/cpp/src/parquet/column_scanner.h b/cpp/src/parquet/column_scanner.h index 0a866eee1d505..f23c86173cb32 100644 --- a/cpp/src/parquet/column_scanner.h +++ b/cpp/src/parquet/column_scanner.h @@ -29,6 +29,7 @@ #include "parquet/exception.h" #include "parquet/schema.h" #include "parquet/types.h" +#include "parquet/util/macros.h" #include "parquet/util/memory.h" #include "parquet/util/visibility.h" diff --git a/cpp/src/parquet/column_writer-test.cc b/cpp/src/parquet/column_writer-test.cc index e87d549be2102..8c20a6e4dab00 100644 --- a/cpp/src/parquet/column_writer-test.cc +++ b/cpp/src/parquet/column_writer-test.cc @@ -19,9 +19,9 @@ #include "parquet/column_reader.h" #include "parquet/column_writer.h" -#include "parquet/parquet_types.h" #include "parquet/test-specialization.h" #include "parquet/test-util.h" +#include "parquet/thrift.h" #include "parquet/types.h" #include "parquet/util/comparison.h" #include "parquet/util/memory.h" diff --git a/cpp/src/parquet/column_writer.cc b/cpp/src/parquet/column_writer.cc index a65bda85d79a0..9c7a39bfed2b1 100644 --- a/cpp/src/parquet/column_writer.cc +++ b/cpp/src/parquet/column_writer.cc @@ -19,16 +19,17 @@ #include #include +#include #include "arrow/util/bit-util.h" #include "arrow/util/compression.h" +#include "arrow/util/logging.h" #include "arrow/util/rle-encoding.h" #include "parquet/encoding-internal.h" #include "parquet/properties.h" #include "parquet/statistics.h" #include "parquet/thrift.h" -#include "parquet/util/logging.h" #include "parquet/util/memory.h" namespace parquet { diff --git a/cpp/src/parquet/column_writer.h b/cpp/src/parquet/column_writer.h index 1ba428a9a8116..e3bfcf0ae185b 100644 --- a/cpp/src/parquet/column_writer.h +++ b/cpp/src/parquet/column_writer.h @@ -18,6 +18,7 @@ #ifndef PARQUET_COLUMN_WRITER_H #define PARQUET_COLUMN_WRITER_H +#include #include #include "parquet/column_page.h" @@ -27,6 +28,7 @@ #include "parquet/schema.h" #include "parquet/statistics.h" #include "parquet/types.h" +#include "parquet/util/macros.h" #include "parquet/util/memory.h" #include "parquet/util/visibility.h" @@ -225,7 +227,7 @@ class PARQUET_EXPORT ColumnWriter { // API to write values to a single column. This is the main client facing API. template -class PARQUET_EXPORT TypedColumnWriter : public ColumnWriter { +class PARQUET_TEMPLATE_CLASS_EXPORT TypedColumnWriter : public ColumnWriter { public: typedef typename DType::c_type T; @@ -314,14 +316,14 @@ typedef TypedColumnWriter DoubleWriter; typedef TypedColumnWriter ByteArrayWriter; typedef TypedColumnWriter FixedLenByteArrayWriter; -extern template class PARQUET_EXPORT TypedColumnWriter; -extern template class PARQUET_EXPORT TypedColumnWriter; -extern template class PARQUET_EXPORT TypedColumnWriter; -extern template class PARQUET_EXPORT TypedColumnWriter; -extern template class PARQUET_EXPORT TypedColumnWriter; -extern template class PARQUET_EXPORT TypedColumnWriter; -extern template class PARQUET_EXPORT TypedColumnWriter; -extern template class PARQUET_EXPORT TypedColumnWriter; +PARQUET_EXTERN_TEMPLATE TypedColumnWriter; +PARQUET_EXTERN_TEMPLATE TypedColumnWriter; +PARQUET_EXTERN_TEMPLATE TypedColumnWriter; +PARQUET_EXTERN_TEMPLATE TypedColumnWriter; +PARQUET_EXTERN_TEMPLATE TypedColumnWriter; +PARQUET_EXTERN_TEMPLATE TypedColumnWriter; +PARQUET_EXTERN_TEMPLATE TypedColumnWriter; +PARQUET_EXTERN_TEMPLATE TypedColumnWriter; } // namespace parquet diff --git a/cpp/src/parquet/exception.cc b/cpp/src/parquet/exception.cc deleted file mode 100644 index 5f5525cc308ff..0000000000000 --- a/cpp/src/parquet/exception.cc +++ /dev/null @@ -1,57 +0,0 @@ -// Licensed to the Apache Software Foundation (ASF) under one -// or more contributor license agreements. See the NOTICE file -// distributed with this work for additional information -// regarding copyright ownership. The ASF licenses this file -// to you under the Apache License, Version 2.0 (the -// "License"); you may not use this file except in compliance -// with the License. You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, -// software distributed under the License is distributed on an -// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, either express or implied. See the License for the -// specific language governing permissions and limitations -// under the License. - -#include "parquet/exception.h" - -#include -#include -#include - -#include "parquet/util/macros.h" - -namespace parquet { - -PARQUET_NORETURN void ParquetException::EofException(const std::string& msg) { - std::stringstream ss; - ss << "Unexpected end of stream"; - if (!msg.empty()) { - ss << ": " << msg; - } - throw ParquetException(ss.str()); -} - -PARQUET_NORETURN void ParquetException::NYI(const std::string& msg) { - std::stringstream ss; - ss << "Not yet implemented: " << msg << "."; - throw ParquetException(ss.str()); -} - -PARQUET_NORETURN void ParquetException::Throw(const std::string& msg) { - throw ParquetException(msg); -} - -ParquetException::ParquetException(const char* msg) : msg_(msg) {} - -ParquetException::ParquetException(const std::string& msg) : msg_(msg) {} - -ParquetException::ParquetException(const char* msg, std::exception& e) : msg_(msg) {} - -ParquetException::~ParquetException() throw() {} - -const char* ParquetException::what() const throw() { return msg_.c_str(); } - -} // namespace parquet diff --git a/cpp/src/parquet/exception.h b/cpp/src/parquet/exception.h index 08629beed4800..65e12af47a7c2 100644 --- a/cpp/src/parquet/exception.h +++ b/cpp/src/parquet/exception.h @@ -25,7 +25,6 @@ #include "arrow/status.h" #include "parquet/util/macros.h" -#include "parquet/util/visibility.h" // PARQUET-1085 #if !defined(ARROW_UNUSED) @@ -45,30 +44,44 @@ ARROW_UNUSED(_s); \ } while (0) -#define PARQUET_THROW_NOT_OK(s) \ - do { \ - ::arrow::Status _s = (s); \ - if (!_s.ok()) { \ - std::stringstream ss; \ - ss << "Arrow error: " << _s.ToString(); \ - ::parquet::ParquetException::Throw(ss.str()); \ - } \ +#define PARQUET_THROW_NOT_OK(s) \ + do { \ + ::arrow::Status _s = (s); \ + if (!_s.ok()) { \ + std::stringstream ss; \ + ss << "Arrow error: " << _s.ToString(); \ + throw ::parquet::ParquetException(ss.str()); \ + } \ } while (0) namespace parquet { -class PARQUET_EXPORT ParquetException : public std::exception { +class ParquetException : public std::exception { public: - PARQUET_NORETURN static void EofException(const std::string& msg = ""); - PARQUET_NORETURN static void NYI(const std::string& msg); - PARQUET_NORETURN static void Throw(const std::string& msg); + PARQUET_NORETURN static void EofException(const std::string& msg = "") { + std::stringstream ss; + ss << "Unexpected end of stream"; + if (!msg.empty()) { + ss << ": " << msg; + } + throw ParquetException(ss.str()); + } + + PARQUET_NORETURN static void NYI(const std::string& msg = "") { + std::stringstream ss; + ss << "Not yet implemented: " << msg << "."; + throw ParquetException(ss.str()); + } + + explicit ParquetException(const char* msg) : msg_(msg) {} + + explicit ParquetException(const std::string& msg) : msg_(msg) {} + + explicit ParquetException(const char* msg, std::exception& e) : msg_(msg) {} - explicit ParquetException(const char* msg); - explicit ParquetException(const std::string& msg); - explicit ParquetException(const char* msg, exception& e); + ~ParquetException() throw() override {} - virtual ~ParquetException() throw(); - virtual const char* what() const throw(); + const char* what() const throw() override { return msg_.c_str(); } private: std::string msg_; diff --git a/cpp/src/parquet/file-deserialize-test.cc b/cpp/src/parquet/file-deserialize-test.cc index 6b01ac21bc767..fb95534ae2c22 100644 --- a/cpp/src/parquet/file-deserialize-test.cc +++ b/cpp/src/parquet/file-deserialize-test.cc @@ -29,12 +29,13 @@ #include "parquet/column_reader.h" #include "parquet/exception.h" #include "parquet/file_reader.h" -#include "parquet/parquet_types.h" #include "parquet/thrift.h" #include "parquet/types.h" #include "parquet/util/memory.h" #include "parquet/util/test-common.h" +#include "arrow/util/compression.h" + namespace parquet { #define ASSERT_OK(expr) \ diff --git a/cpp/src/parquet/file_reader.cc b/cpp/src/parquet/file_reader.cc index c5a0f342b4b52..ea518fd988f01 100644 --- a/cpp/src/parquet/file_reader.cc +++ b/cpp/src/parquet/file_reader.cc @@ -27,16 +27,16 @@ #include #include "arrow/io/file.h" +#include "arrow/util/logging.h" #include "parquet/column_page.h" #include "parquet/column_reader.h" #include "parquet/column_scanner.h" #include "parquet/exception.h" #include "parquet/metadata.h" -#include "parquet/parquet_types.h" #include "parquet/properties.h" +#include "parquet/thrift.h" #include "parquet/types.h" -#include "parquet/util/logging.h" #include "parquet/util/memory.h" using std::string; diff --git a/cpp/src/parquet/file_reader.h b/cpp/src/parquet/file_reader.h index f751e9b3501ff..6836bb1a205f6 100644 --- a/cpp/src/parquet/file_reader.h +++ b/cpp/src/parquet/file_reader.h @@ -30,6 +30,7 @@ #include "parquet/properties.h" #include "parquet/schema.h" #include "parquet/statistics.h" +#include "parquet/util/macros.h" #include "parquet/util/memory.h" #include "parquet/util/visibility.h" @@ -74,7 +75,7 @@ class PARQUET_EXPORT ParquetFileReader { static std::unique_ptr Open( std::unique_ptr source, const ReaderProperties& props = default_reader_properties(), - const std::shared_ptr& metadata = nullptr); + const std::shared_ptr& metadata = NULLPTR); virtual ~Contents() {} // Perform any cleanup associated with the file contents @@ -94,21 +95,21 @@ class PARQUET_EXPORT ParquetFileReader { static std::unique_ptr Open( std::unique_ptr source, const ReaderProperties& props = default_reader_properties(), - const std::shared_ptr& metadata = nullptr); + const std::shared_ptr& metadata = NULLPTR); // Create a file reader instance from an Arrow file object. Thread-safety is // the responsibility of the file implementation static std::unique_ptr Open( const std::shared_ptr<::arrow::io::ReadableFileInterface>& source, const ReaderProperties& props = default_reader_properties(), - const std::shared_ptr& metadata = nullptr); + const std::shared_ptr& metadata = NULLPTR); // API Convenience to open a serialized Parquet file on disk, using Arrow IO // interfaces. static std::unique_ptr OpenFile( const std::string& path, bool memory_map = true, const ReaderProperties& props = default_reader_properties(), - const std::shared_ptr& metadata = nullptr); + const std::shared_ptr& metadata = NULLPTR); void Open(std::unique_ptr contents); void Close(); diff --git a/cpp/src/parquet/file_writer.cc b/cpp/src/parquet/file_writer.cc index 30673c59d2843..01fa112fe37ef 100644 --- a/cpp/src/parquet/file_writer.cc +++ b/cpp/src/parquet/file_writer.cc @@ -17,6 +17,7 @@ #include "parquet/file_writer.h" +#include #include #include "parquet/column_writer.h" diff --git a/cpp/src/parquet/file_writer.h b/cpp/src/parquet/file_writer.h index cdfe06cd0677c..82703f82dc899 100644 --- a/cpp/src/parquet/file_writer.h +++ b/cpp/src/parquet/file_writer.h @@ -24,6 +24,7 @@ #include "parquet/metadata.h" #include "parquet/properties.h" #include "parquet/schema.h" +#include "parquet/util/macros.h" #include "parquet/util/memory.h" #include "parquet/util/visibility.h" @@ -150,13 +151,13 @@ class PARQUET_EXPORT ParquetFileWriter { const std::shared_ptr<::arrow::io::OutputStream>& sink, const std::shared_ptr& schema, const std::shared_ptr& properties = default_writer_properties(), - const std::shared_ptr& key_value_metadata = nullptr); + const std::shared_ptr& key_value_metadata = NULLPTR); static std::unique_ptr Open( const std::shared_ptr& sink, const std::shared_ptr& schema, const std::shared_ptr& properties = default_writer_properties(), - const std::shared_ptr& key_value_metadata = nullptr); + const std::shared_ptr& key_value_metadata = NULLPTR); void Open(std::unique_ptr contents); void Close(); diff --git a/cpp/src/parquet/metadata.cc b/cpp/src/parquet/metadata.cc index 1cab51f071d0f..9c66c7aab1d86 100644 --- a/cpp/src/parquet/metadata.cc +++ b/cpp/src/parquet/metadata.cc @@ -17,6 +17,7 @@ #include #include +#include #include #include "parquet/exception.h" diff --git a/cpp/src/parquet/metadata.h b/cpp/src/parquet/metadata.h index 5d51e3d292c73..79f4fdb35032c 100644 --- a/cpp/src/parquet/metadata.h +++ b/cpp/src/parquet/metadata.h @@ -18,6 +18,7 @@ #ifndef PARQUET_FILE_METADATA_H #define PARQUET_FILE_METADATA_H +#include #include #include #include @@ -28,6 +29,7 @@ #include "parquet/schema.h" #include "parquet/statistics.h" #include "parquet/types.h" +#include "parquet/util/macros.h" #include "parquet/util/memory.h" #include "parquet/util/visibility.h" @@ -92,7 +94,7 @@ class PARQUET_EXPORT ColumnChunkMetaData { // API convenience to get a MetaData accessor static std::unique_ptr Make( const uint8_t* metadata, const ColumnDescriptor* descr, - const ApplicationVersion* writer_version = nullptr); + const ApplicationVersion* writer_version = NULLPTR); ~ColumnChunkMetaData(); @@ -118,7 +120,7 @@ class PARQUET_EXPORT ColumnChunkMetaData { private: explicit ColumnChunkMetaData(const uint8_t* metadata, const ColumnDescriptor* descr, - const ApplicationVersion* writer_version = nullptr); + const ApplicationVersion* writer_version = NULLPTR); // PIMPL Idiom class ColumnChunkMetaDataImpl; std::unique_ptr impl_; @@ -129,7 +131,7 @@ class PARQUET_EXPORT RowGroupMetaData { // API convenience to get a MetaData accessor static std::unique_ptr Make( const uint8_t* metadata, const SchemaDescriptor* schema, - const ApplicationVersion* writer_version = nullptr); + const ApplicationVersion* writer_version = NULLPTR); ~RowGroupMetaData(); @@ -143,7 +145,7 @@ class PARQUET_EXPORT RowGroupMetaData { private: explicit RowGroupMetaData(const uint8_t* metadata, const SchemaDescriptor* schema, - const ApplicationVersion* writer_version = nullptr); + const ApplicationVersion* writer_version = NULLPTR); // PIMPL Idiom class RowGroupMetaDataImpl; std::unique_ptr impl_; @@ -254,7 +256,7 @@ class PARQUET_EXPORT FileMetaDataBuilder { // API convenience to get a MetaData reader static std::unique_ptr Make( const SchemaDescriptor* schema, const std::shared_ptr& props, - const std::shared_ptr& key_value_metadata = nullptr); + const std::shared_ptr& key_value_metadata = NULLPTR); ~FileMetaDataBuilder(); @@ -266,7 +268,7 @@ class PARQUET_EXPORT FileMetaDataBuilder { private: explicit FileMetaDataBuilder( const SchemaDescriptor* schema, const std::shared_ptr& props, - const std::shared_ptr& key_value_metadata = nullptr); + const std::shared_ptr& key_value_metadata = NULLPTR); // PIMPL Idiom class FileMetaDataBuilderImpl; std::unique_ptr impl_; diff --git a/cpp/src/parquet/parquet.pc.in b/cpp/src/parquet/parquet.pc.in index 9313d1fc45a38..d2d98a53e3e8e 100644 --- a/cpp/src/parquet/parquet.pc.in +++ b/cpp/src/parquet/parquet.pc.in @@ -19,8 +19,9 @@ prefix=@CMAKE_INSTALL_PREFIX@ libdir=${prefix}/@CMAKE_INSTALL_LIBDIR@ includedir=${prefix}/@CMAKE_INSTALL_INCLUDEDIR@ -so_version=@PARQUET_SO_VERSION@ -abi_version=@PARQUET_ABI_VERSION@ +so_version=@ARROW_SO_VERSION@ +abi_version=@ARROW_SO_VERSION@ +full_so_version=@ARROW_FULL_SO_VERSION@ Name: Apache Parquet Description: Apache Parquet is a columnar storage format. diff --git a/cpp/src/parquet/parquet.thrift b/cpp/src/parquet/parquet.thrift index aec8e5421f36f..2a4efb9c371a3 100644 --- a/cpp/src/parquet/parquet.thrift +++ b/cpp/src/parquet/parquet.thrift @@ -667,7 +667,7 @@ struct EncryptionWithFooterKey { struct EncryptionWithColumnKey { /** Column path in schema **/ 1: required list path_in_schema - + /** Retrieval metadata of the column-specific key **/ 2: optional binary column_key_metadata } @@ -703,7 +703,7 @@ struct ColumnChunk { /** Size of ColumnChunk's ColumnIndex, in bytes **/ 7: optional i32 column_index_length - + /** Crypto metadata of encrypted columns **/ 8: optional ColumnCryptoMetaData crypto_meta_data } @@ -905,20 +905,19 @@ union EncryptionAlgorithm { struct FileCryptoMetaData { 1: required EncryptionAlgorithm encryption_algorithm - + /** Parquet footer can be encrypted, or left as plaintext **/ 2: required bool encrypted_footer - - /** Retrieval metadata of key used for encryption of footer, + + /** Retrieval metadata of key used for encryption of footer, * and (possibly) columns **/ 3: optional binary footer_key_metadata /** Offset of Parquet footer (encrypted, or plaintext) **/ 4: required i64 footer_offset - + /** If file IVs are comprised of a fixed part, * and variable parts (random or counter), keep the fixed * part here **/ 5: optional binary iv_prefix } - diff --git a/cpp/src/parquet/properties.h b/cpp/src/parquet/properties.h index 83dc20574b9f0..0e856edf8ae13 100644 --- a/cpp/src/parquet/properties.h +++ b/cpp/src/parquet/properties.h @@ -26,6 +26,7 @@ #include "parquet/parquet_version.h" #include "parquet/schema.h" #include "parquet/types.h" +#include "parquet/util/macros.h" #include "parquet/util/memory.h" #include "parquet/util/visibility.h" diff --git a/cpp/src/parquet/schema-internal.h b/cpp/src/parquet/schema-internal.h index 53472abf13fba..0be5c2cf7f760 100644 --- a/cpp/src/parquet/schema-internal.h +++ b/cpp/src/parquet/schema-internal.h @@ -25,11 +25,10 @@ #include #include -#include "parquet/parquet_types.h" #include "parquet/schema.h" +#include "parquet/thrift.h" #include "parquet/types.h" #include "parquet/util/macros.h" -#include "parquet/util/visibility.h" namespace parquet { namespace schema { diff --git a/cpp/src/parquet/schema-test.cc b/cpp/src/parquet/schema-test.cc index a734a926b4854..5d2af285a415b 100644 --- a/cpp/src/parquet/schema-test.cc +++ b/cpp/src/parquet/schema-test.cc @@ -24,9 +24,9 @@ #include #include "parquet/exception.h" -#include "parquet/parquet_types.h" #include "parquet/schema-internal.h" #include "parquet/schema.h" +#include "parquet/thrift.h" #include "parquet/types.h" using std::string; @@ -363,8 +363,8 @@ TEST_F(TestGroupNode, FieldIndex) { // Test a non field node auto non_field_alien = Int32("alien", Repetition::REQUIRED); // other name auto non_field_familiar = Int32("one", Repetition::REPEATED); // other node - ASSERT_TRUE(group.FieldIndex(*non_field_alien) < 0); - ASSERT_TRUE(group.FieldIndex(*non_field_familiar) < 0); + ASSERT_LT(group.FieldIndex(*non_field_alien), 0); + ASSERT_LT(group.FieldIndex(*non_field_familiar), 0); } TEST_F(TestGroupNode, FieldIndexDuplicateName) { @@ -703,8 +703,8 @@ TEST_F(TestSchemaDescriptor, BuildTree) { // Test non-column nodes find NodePtr non_column_alien = Int32("alien", Repetition::REQUIRED); // other path NodePtr non_column_familiar = Int32("a", Repetition::REPEATED); // other node - ASSERT_TRUE(descr_.ColumnIndex(*non_column_alien) < 0); - ASSERT_TRUE(descr_.ColumnIndex(*non_column_familiar) < 0); + ASSERT_LT(descr_.ColumnIndex(*non_column_alien), 0); + ASSERT_LT(descr_.ColumnIndex(*non_column_familiar), 0); ASSERT_EQ(inta.get(), descr_.GetColumnRoot(0)); ASSERT_EQ(bag.get(), descr_.GetColumnRoot(3)); diff --git a/cpp/src/parquet/schema.cc b/cpp/src/parquet/schema.cc index 77187819e25ae..da004344f2016 100644 --- a/cpp/src/parquet/schema.cc +++ b/cpp/src/parquet/schema.cc @@ -22,9 +22,9 @@ #include #include #include +#include #include "parquet/exception.h" -#include "parquet/parquet_types.h" #include "parquet/thrift.h" using parquet::format::SchemaElement; diff --git a/cpp/src/parquet/schema.h b/cpp/src/parquet/schema.h index 1a94ed1fd4893..add2f6dbab013 100644 --- a/cpp/src/parquet/schema.h +++ b/cpp/src/parquet/schema.h @@ -114,7 +114,7 @@ class PARQUET_EXPORT Node { repetition_(repetition), logical_type_(logical_type), id_(id), - parent_(nullptr) {} + parent_(NULLPTR) {} virtual ~Node() {} @@ -180,7 +180,7 @@ class PARQUET_EXPORT Node { void SetParent(const Node* p_parent); private: - DISALLOW_COPY_AND_ASSIGN(Node); + PARQUET_DISALLOW_COPY_AND_ASSIGN(Node); }; // Save our breath all over the place with these typedefs @@ -333,7 +333,7 @@ class PARQUET_EXPORT ColumnDescriptor { public: ColumnDescriptor(const schema::NodePtr& node, int16_t max_definition_level, int16_t max_repetition_level, - const SchemaDescriptor* schema_descr = nullptr); + const SchemaDescriptor* schema_descr = NULLPTR); bool Equals(const ColumnDescriptor& other) const; diff --git a/cpp/src/parquet/statistics.cc b/cpp/src/parquet/statistics.cc index ea7f783b1423d..ed4e8d05592e4 100644 --- a/cpp/src/parquet/statistics.cc +++ b/cpp/src/parquet/statistics.cc @@ -161,8 +161,8 @@ void SetNaN(double* value) { template void TypedRowGroupStatistics::Update(const T* values, int64_t num_not_null, int64_t num_null) { - DCHECK(num_not_null >= 0); - DCHECK(num_null >= 0); + DCHECK_GE(num_not_null, 0); + DCHECK_GE(num_null, 0); IncrementNullCount(num_null); IncrementNumValues(num_not_null); @@ -200,8 +200,8 @@ void TypedRowGroupStatistics::UpdateSpaced(const T* values, int64_t valid_bits_offset, int64_t num_not_null, int64_t num_null) { - DCHECK(num_not_null >= 0); - DCHECK(num_null >= 0); + DCHECK_GE(num_not_null, 0); + DCHECK_GE(num_null, 0); IncrementNullCount(num_null); IncrementNumValues(num_not_null); diff --git a/cpp/src/parquet/statistics.h b/cpp/src/parquet/statistics.h index d1c4d16fbdb43..530bf85b73e47 100644 --- a/cpp/src/parquet/statistics.h +++ b/cpp/src/parquet/statistics.h @@ -26,6 +26,7 @@ #include "parquet/schema.h" #include "parquet/types.h" #include "parquet/util/comparison.h" +#include "parquet/util/macros.h" #include "parquet/util/memory.h" #include "parquet/util/visibility.h" @@ -82,7 +83,7 @@ class PARQUET_EXPORT EncodedStatistics { }; template -class PARQUET_EXPORT TypedRowGroupStatistics; +class PARQUET_TEMPLATE_CLASS_EXPORT TypedRowGroupStatistics; class PARQUET_EXPORT RowGroupStatistics : public std::enable_shared_from_this { @@ -134,13 +135,13 @@ class PARQUET_EXPORT RowGroupStatistics this->num_values_ = 0; } - const ColumnDescriptor* descr_ = nullptr; + const ColumnDescriptor* descr_ = NULLPTR; int64_t num_values_ = 0; EncodedStatistics statistics_; }; template -class TypedRowGroupStatistics : public RowGroupStatistics { +class PARQUET_TEMPLATE_CLASS_EXPORT TypedRowGroupStatistics : public RowGroupStatistics { public: using T = typename DType::c_type; @@ -226,11 +227,6 @@ typedef TypedRowGroupStatistics DoubleStatistics; typedef TypedRowGroupStatistics ByteArrayStatistics; typedef TypedRowGroupStatistics FLBAStatistics; -#if defined(__GNUC__) && !defined(__clang__) -#pragma GCC diagnostic push -#pragma GCC diagnostic ignored "-Wattributes" -#endif - PARQUET_EXTERN_TEMPLATE TypedRowGroupStatistics; PARQUET_EXTERN_TEMPLATE TypedRowGroupStatistics; PARQUET_EXTERN_TEMPLATE TypedRowGroupStatistics; @@ -240,10 +236,6 @@ PARQUET_EXTERN_TEMPLATE TypedRowGroupStatistics; PARQUET_EXTERN_TEMPLATE TypedRowGroupStatistics; PARQUET_EXTERN_TEMPLATE TypedRowGroupStatistics; -#if defined(__GNUC__) && !defined(__clang__) -#pragma GCC diagnostic pop -#endif - } // namespace parquet #endif // PARQUET_COLUMN_STATISTICS_H diff --git a/cpp/src/parquet/thrift.h b/cpp/src/parquet/thrift.h index ec7ac906bf731..217cc76c0463b 100644 --- a/cpp/src/parquet/thrift.h +++ b/cpp/src/parquet/thrift.h @@ -18,6 +18,8 @@ #ifndef PARQUET_THRIFT_UTIL_H #define PARQUET_THRIFT_UTIL_H +#include "arrow/util/windows_compatibility.h" + #include // Check if thrift version < 0.11.0 // or if FORCE_BOOST_SMART_PTR is defined. Ref: https://thrift.apache.org/lib/cpp @@ -38,11 +40,12 @@ #include #include +#include "arrow/util/logging.h" #include "parquet/exception.h" -#include "parquet/parquet_types.h" -#include "parquet/util/logging.h" #include "parquet/util/memory.h" +#include "parquet/parquet_types.h" + namespace parquet { // Check if thrift version < 0.11.0 diff --git a/cpp/src/parquet/types-test.cc b/cpp/src/parquet/types-test.cc index 6b184e382a973..6fb0066a21205 100644 --- a/cpp/src/parquet/types-test.cc +++ b/cpp/src/parquet/types-test.cc @@ -61,11 +61,15 @@ TEST(TestLogicalTypeToString, LogicalTypes) { ASSERT_STREQ("INTERVAL", LogicalTypeToString(LogicalType::INTERVAL).c_str()); } -TEST(TypePrinter, StatisticsTypes) { #if !(defined(_WIN32) || defined(__CYGWIN__)) #pragma GCC diagnostic push #pragma GCC diagnostic ignored "-Wdeprecated-declarations" +#elif _MSC_VER +#pragma warning(push) +#pragma warning(disable : 4996) #endif + +TEST(TypePrinter, StatisticsTypes) { std::string smin; std::string smax; int32_t int_min = 1024; @@ -134,9 +138,12 @@ TEST(TypePrinter, StatisticsTypes) { ASSERT_STREQ("ijklmnop", FormatStatValue(Type::FIXED_LEN_BYTE_ARRAY, smax).c_str()); ASSERT_STREQ("ijklmnop", FormatStatValue(Type::FIXED_LEN_BYTE_ARRAY, smax.c_str()).c_str()); +} + #if !(defined(_WIN32) || defined(__CYGWIN__)) #pragma GCC diagnostic pop +#elif _MSC_VER +#pragma warning(pop) #endif -} } // namespace parquet diff --git a/cpp/src/parquet/types.h b/cpp/src/parquet/types.h index 10789cbf5452d..016ac7c3de326 100644 --- a/cpp/src/parquet/types.h +++ b/cpp/src/parquet/types.h @@ -153,7 +153,7 @@ class ColumnOrder { // ---------------------------------------------------------------------- struct ByteArray { - ByteArray() : len(0), ptr(nullptr) {} + ByteArray() : len(0), ptr(NULLPTR) {} ByteArray(uint32_t len, const uint8_t* ptr) : len(len), ptr(ptr) {} uint32_t len; const uint8_t* ptr; @@ -168,7 +168,7 @@ inline bool operator!=(const ByteArray& left, const ByteArray& right) { } struct FixedLenByteArray { - FixedLenByteArray() : ptr(nullptr) {} + FixedLenByteArray() : ptr(NULLPTR) {} explicit FixedLenByteArray(const uint8_t* ptr) : ptr(ptr) {} const uint8_t* ptr; }; @@ -301,7 +301,7 @@ PARQUET_EXPORT std::string FormatStatValue(Type::type parquet_type, const std::string& val); /// \deprecated Since 1.5.0 -PARQUET_DEPRECATED("Use std::string instead of char* as input") +ARROW_DEPRECATED("Use std::string instead of char* as input") PARQUET_EXPORT std::string FormatStatValue(Type::type parquet_type, const char* val); PARQUET_EXPORT int GetTypeByteSize(Type::type t); diff --git a/cpp/src/parquet/util/CMakeLists.txt b/cpp/src/parquet/util/CMakeLists.txt index 7b138868e5996..debf4b3a88cd5 100644 --- a/cpp/src/parquet/util/CMakeLists.txt +++ b/cpp/src/parquet/util/CMakeLists.txt @@ -17,33 +17,12 @@ # Headers: util install(FILES - buffer-builder.h comparison.h - logging.h macros.h memory.h stopwatch.h visibility.h DESTINATION "${CMAKE_INSTALL_INCLUDEDIR}/parquet/util") -if (PARQUET_BUILD_BENCHMARKS) - add_library(parquet_benchmark_main benchmark_main.cc) - if (APPLE) - target_link_libraries(parquet_benchmark_main - gbenchmark - ) - elseif(WIN32) - target_link_libraries(parquet_benchmark_main - gbenchmark - shlwapi.lib # workaround for bug(?) in gbenchmark: unresolved external symbol __imp_SHGetValueA - ) - else() - target_link_libraries(parquet_benchmark_main - gbenchmark - pthread - ) - endif() -endif() - ADD_PARQUET_TEST(comparison-test) ADD_PARQUET_TEST(memory-test) diff --git a/cpp/src/parquet/util/benchmark_main.cc b/cpp/src/parquet/util/benchmark_main.cc deleted file mode 100644 index c9739af03fb53..0000000000000 --- a/cpp/src/parquet/util/benchmark_main.cc +++ /dev/null @@ -1,24 +0,0 @@ -// Licensed to the Apache Software Foundation (ASF) under one -// or more contributor license agreements. See the NOTICE file -// distributed with this work for additional information -// regarding copyright ownership. The ASF licenses this file -// to you under the Apache License, Version 2.0 (the -// "License"); you may not use this file except in compliance -// with the License. You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, -// software distributed under the License is distributed on an -// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, either express or implied. See the License for the -// specific language governing permissions and limitations -// under the License. - -#include "benchmark/benchmark.h" - -int main(int argc, char** argv) { - benchmark::Initialize(&argc, argv); - benchmark::RunSpecifiedBenchmarks(); - return 0; -} diff --git a/cpp/src/parquet/util/buffer-builder.h b/cpp/src/parquet/util/buffer-builder.h deleted file mode 100644 index 26f134ee94bc3..0000000000000 --- a/cpp/src/parquet/util/buffer-builder.h +++ /dev/null @@ -1,58 +0,0 @@ -// Licensed to the Apache Software Foundation (ASF) under one -// or more contributor license agreements. See the NOTICE file -// distributed with this work for additional information -// regarding copyright ownership. The ASF licenses this file -// to you under the Apache License, Version 2.0 (the -// "License"); you may not use this file except in compliance -// with the License. You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, -// software distributed under the License is distributed on an -// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, either express or implied. See the License for the -// specific language governing permissions and limitations -// under the License. - -// Initially imported from Impala on 2016-02-23 - -#ifndef PARQUET_UTIL_BUFFER_BUILDER_H -#define PARQUET_UTIL_BUFFER_BUILDER_H - -#include -#include - -namespace parquet { - -/// Utility class to build an in-memory buffer. -class BufferBuilder { - public: - BufferBuilder(uint8_t* dst_buffer, int dst_len) - : buffer_(dst_buffer), capacity_(dst_len), size_(0) {} - - BufferBuilder(char* dst_buffer, int dst_len) - : buffer_(reinterpret_cast(dst_buffer)), capacity_(dst_len), size_(0) {} - - inline void Append(const void* buffer, int len) { - memcpy(buffer_ + size_, buffer, len); - size_ += len; - } - - template - inline void Append(const T& v) { - Append(&v, sizeof(T)); - } - - int capacity() const { return capacity_; } - int size() const { return size_; } - - private: - uint8_t* buffer_; - int capacity_; - int size_; -}; - -} // namespace parquet - -#endif // PARQUET_UTIL_BUFFER_BUILDER_H diff --git a/cpp/src/parquet/util/comparison.cc b/cpp/src/parquet/util/comparison.cc index a0768b31c270e..99733fc338e97 100644 --- a/cpp/src/parquet/util/comparison.cc +++ b/cpp/src/parquet/util/comparison.cc @@ -16,6 +16,7 @@ // under the License. #include +#include #include "parquet/exception.h" #include "parquet/schema.h" @@ -72,4 +73,40 @@ template class PARQUET_TEMPLATE_EXPORT CompareDefault; template class PARQUET_TEMPLATE_EXPORT CompareDefault; template class PARQUET_TEMPLATE_EXPORT CompareDefault; +bool CompareUnsignedInt32::operator()(const int32_t& a, const int32_t& b) { + const uint32_t ua = a; + const uint32_t ub = b; + return (ua < ub); +} + +bool CompareUnsignedInt64::operator()(const int64_t& a, const int64_t& b) { + const uint64_t ua = a; + const uint64_t ub = b; + return (ua < ub); +} + +bool CompareUnsignedInt96::operator()(const Int96& a, const Int96& b) { + if (a.value[2] != b.value[2]) { + return (a.value[2] < b.value[2]); + } else if (a.value[1] != b.value[1]) { + return (a.value[1] < b.value[1]); + } + return (a.value[0] < b.value[0]); +} + +bool CompareUnsignedByteArray::operator()(const ByteArray& a, const ByteArray& b) { + const uint8_t* aptr = reinterpret_cast(a.ptr); + const uint8_t* bptr = reinterpret_cast(b.ptr); + return std::lexicographical_compare(aptr, aptr + a.len, bptr, bptr + b.len); +} + +CompareUnsignedFLBA::CompareUnsignedFLBA(int length) : CompareDefaultFLBA(length) {} + +bool CompareUnsignedFLBA::operator()(const FLBA& a, const FLBA& b) { + const uint8_t* aptr = reinterpret_cast(a.ptr); + const uint8_t* bptr = reinterpret_cast(b.ptr); + return std::lexicographical_compare(aptr, aptr + type_length_, bptr, + bptr + type_length_); +} + } // namespace parquet diff --git a/cpp/src/parquet/util/comparison.h b/cpp/src/parquet/util/comparison.h index 7070a0f3a4c34..4daa4df0c9c7d 100644 --- a/cpp/src/parquet/util/comparison.h +++ b/cpp/src/parquet/util/comparison.h @@ -19,10 +19,12 @@ #define PARQUET_UTIL_COMPARISON_H #include +#include #include "parquet/exception.h" #include "parquet/schema.h" #include "parquet/types.h" +#include "parquet/util/visibility.h" namespace parquet { @@ -34,7 +36,7 @@ class PARQUET_EXPORT Comparator { // The default comparison is SIGNED template -class PARQUET_EXPORT CompareDefault : public Comparator { +class PARQUET_TEMPLATE_CLASS_EXPORT CompareDefault : public Comparator { public: typedef typename DType::c_type T; CompareDefault() {} @@ -42,7 +44,7 @@ class PARQUET_EXPORT CompareDefault : public Comparator { }; template <> -class PARQUET_EXPORT CompareDefault : public Comparator { +class PARQUET_TEMPLATE_CLASS_EXPORT CompareDefault : public Comparator { public: CompareDefault() {} virtual bool operator()(const Int96& a, const Int96& b) { @@ -60,7 +62,7 @@ class PARQUET_EXPORT CompareDefault : public Comparator { }; template <> -class PARQUET_EXPORT CompareDefault : public Comparator { +class PARQUET_TEMPLATE_CLASS_EXPORT CompareDefault : public Comparator { public: CompareDefault() {} virtual bool operator()(const ByteArray& a, const ByteArray& b) { @@ -71,7 +73,7 @@ class PARQUET_EXPORT CompareDefault : public Comparator { }; template <> -class PARQUET_EXPORT CompareDefault : public Comparator { +class PARQUET_TEMPLATE_CLASS_EXPORT CompareDefault : public Comparator { public: explicit CompareDefault(int length) : type_length_(length) {} virtual bool operator()(const FLBA& a, const FLBA& b) { @@ -92,75 +94,42 @@ typedef CompareDefault CompareDefaultDouble; typedef CompareDefault CompareDefaultByteArray; typedef CompareDefault CompareDefaultFLBA; -#if defined(__GNUC__) && !defined(__clang__) -#pragma GCC diagnostic push -#pragma GCC diagnostic ignored "-Wattributes" -#endif - -PARQUET_EXTERN_TEMPLATE CompareDefault; -PARQUET_EXTERN_TEMPLATE CompareDefault; -PARQUET_EXTERN_TEMPLATE CompareDefault; -PARQUET_EXTERN_TEMPLATE CompareDefault; -PARQUET_EXTERN_TEMPLATE CompareDefault; -PARQUET_EXTERN_TEMPLATE CompareDefault; -PARQUET_EXTERN_TEMPLATE CompareDefault; -PARQUET_EXTERN_TEMPLATE CompareDefault; - -#if defined(__GNUC__) && !defined(__clang__) -#pragma GCC diagnostic pop -#endif - // Define Unsigned Comparators class PARQUET_EXPORT CompareUnsignedInt32 : public CompareDefaultInt32 { public: - bool operator()(const int32_t& a, const int32_t& b) override { - const uint32_t ua = a; - const uint32_t ub = b; - return (ua < ub); - } + bool operator()(const int32_t& a, const int32_t& b) override; }; class PARQUET_EXPORT CompareUnsignedInt64 : public CompareDefaultInt64 { public: - bool operator()(const int64_t& a, const int64_t& b) override { - const uint64_t ua = a; - const uint64_t ub = b; - return (ua < ub); - } + bool operator()(const int64_t& a, const int64_t& b) override; }; class PARQUET_EXPORT CompareUnsignedInt96 : public CompareDefaultInt96 { public: - bool operator()(const Int96& a, const Int96& b) override { - if (a.value[2] != b.value[2]) { - return (a.value[2] < b.value[2]); - } else if (a.value[1] != b.value[1]) { - return (a.value[1] < b.value[1]); - } - return (a.value[0] < b.value[0]); - } + bool operator()(const Int96& a, const Int96& b) override; }; class PARQUET_EXPORT CompareUnsignedByteArray : public CompareDefaultByteArray { public: - bool operator()(const ByteArray& a, const ByteArray& b) override { - const uint8_t* aptr = reinterpret_cast(a.ptr); - const uint8_t* bptr = reinterpret_cast(b.ptr); - return std::lexicographical_compare(aptr, aptr + a.len, bptr, bptr + b.len); - } + bool operator()(const ByteArray& a, const ByteArray& b) override; }; class PARQUET_EXPORT CompareUnsignedFLBA : public CompareDefaultFLBA { public: - explicit CompareUnsignedFLBA(int length) : CompareDefaultFLBA(length) {} - bool operator()(const FLBA& a, const FLBA& b) override { - const uint8_t* aptr = reinterpret_cast(a.ptr); - const uint8_t* bptr = reinterpret_cast(b.ptr); - return std::lexicographical_compare(aptr, aptr + type_length_, bptr, - bptr + type_length_); - } + explicit CompareUnsignedFLBA(int length); + bool operator()(const FLBA& a, const FLBA& b) override; }; +PARQUET_EXTERN_TEMPLATE CompareDefault; +PARQUET_EXTERN_TEMPLATE CompareDefault; +PARQUET_EXTERN_TEMPLATE CompareDefault; +PARQUET_EXTERN_TEMPLATE CompareDefault; +PARQUET_EXTERN_TEMPLATE CompareDefault; +PARQUET_EXTERN_TEMPLATE CompareDefault; +PARQUET_EXTERN_TEMPLATE CompareDefault; +PARQUET_EXTERN_TEMPLATE CompareDefault; + } // namespace parquet #endif // PARQUET_UTIL_COMPARISON_H diff --git a/cpp/src/parquet/util/crypto.h b/cpp/src/parquet/util/crypto.h index d3beb105e30e3..3c063706b1e9a 100644 --- a/cpp/src/parquet/util/crypto.h +++ b/cpp/src/parquet/util/crypto.h @@ -18,6 +18,8 @@ #ifndef PARQUET_UTIL_CRYPTO_H #define PARQUET_UTIL_CRYPTO_H +#include + #include "parquet/properties.h" #include "parquet/types.h" diff --git a/cpp/src/parquet/util/logging.h b/cpp/src/parquet/util/logging.h deleted file mode 100644 index e2c7abb1cc05d..0000000000000 --- a/cpp/src/parquet/util/logging.h +++ /dev/null @@ -1,23 +0,0 @@ -// Licensed to the Apache Software Foundation (ASF) under one -// or more contributor license agreements. See the NOTICE file -// distributed with this work for additional information -// regarding copyright ownership. The ASF licenses this file -// to you under the Apache License, Version 2.0 (the -// "License"); you may not use this file except in compliance -// with the License. You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, -// software distributed under the License is distributed on an -// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, either express or implied. See the License for the -// specific language governing permissions and limitations -// under the License. - -#ifndef PARQUET_UTIL_LOGGING_H -#define PARQUET_UTIL_LOGGING_H - -#include "arrow/util/logging.h" - -#endif // PARQUET_UTIL_LOGGING_H diff --git a/cpp/src/parquet/util/macros.h b/cpp/src/parquet/util/macros.h index c28b2fa68c209..1a1f9544ee36f 100644 --- a/cpp/src/parquet/util/macros.h +++ b/cpp/src/parquet/util/macros.h @@ -18,69 +18,17 @@ #ifndef PARQUET_UTIL_MACROS_H #define PARQUET_UTIL_MACROS_H -// Useful macros from elsewhere +#include "arrow/util/macros.h" -// From Google gutil -#ifndef DISALLOW_COPY_AND_ASSIGN -#define DISALLOW_COPY_AND_ASSIGN(TypeName) \ - TypeName(const TypeName&) = delete; \ - void operator=(const TypeName&) = delete -#endif - -#if defined(__GNUC__) -#define PARQUET_PREDICT_FALSE(x) (__builtin_expect(x, 0)) -#define PARQUET_PREDICT_TRUE(x) (__builtin_expect(!!(x), 1)) -#define PARQUET_NORETURN __attribute__((noreturn)) -#define PARQUET_PREFETCH(addr) __builtin_prefetch(addr) -#elif defined(_MSC_VER) -#define PARQUET_NORETURN __declspec(noreturn) -#define PARQUET_PREDICT_FALSE(x) x -#define PARQUET_PREDICT_TRUE(x) x -#define PARQUET_PREFETCH(addr) -#else -#define PARQUET_NORETURN -#define PARQUET_PREDICT_FALSE(x) x -#define PARQUET_PREDICT_TRUE(x) x -#define PARQUET_PREFETCH(addr) -#endif - -// ---------------------------------------------------------------------- -// From googletest - -// When you need to test the private or protected members of a class, -// use the FRIEND_TEST macro to declare your tests as friends of the -// class. For example: -// -// class MyClass { -// private: -// void MyMethod(); -// FRIEND_TEST(MyClassTest, MyMethod); -// }; -// -// class MyClassTest : public testing::Test { -// // ... -// }; -// -// TEST_F(MyClassTest, MyMethod) { -// // Can call MyClass::MyMethod() here. -// } +#define PARQUET_DISALLOW_COPY_AND_ASSIGN ARROW_DISALLOW_COPY_AND_ASSIGN -#define FRIEND_TEST(test_case_name, test_name) \ - friend class test_case_name##_##test_name##_Test +#define PARQUET_NORETURN ARROW_NORETURN +#define PARQUET_DEPRECATED ARROW_DEPRECATED -// clang-format off -// [[deprecated]] is only available in C++14, use this for the time being -// This macro takes an optional deprecation message -#if __cplusplus <= 201103L -# ifdef __GNUC__ -# define PARQUET_DEPRECATED(...) __attribute__((deprecated(__VA_ARGS__))) -# elif defined(_MSC_VER) -# define PARQUET_DEPRECATED(...) __declspec(deprecated(__VA_ARGS__)) -# else -# define PARQUET_DEPRECATED(...) -# endif -#else -# define PARQUET_DEPRECATED(...) [[deprecated(__VA_ARGS__)]] +// If ARROW_VALGRIND set when compiling unit tests, also define +// PARQUET_VALGRIND +#ifdef ARROW_VALGRIND +#define PARQUET_VALGRIND #endif #endif // PARQUET_UTIL_MACROS_H diff --git a/cpp/src/parquet/util/memory.cc b/cpp/src/parquet/util/memory.cc index d9caf6e304190..5c76cd8a67659 100644 --- a/cpp/src/parquet/util/memory.cc +++ b/cpp/src/parquet/util/memory.cc @@ -26,14 +26,44 @@ #include "arrow/status.h" #include "arrow/util/bit-util.h" +#include "arrow/util/compression.h" +#include "arrow/util/logging.h" #include "parquet/exception.h" #include "parquet/types.h" -#include "parquet/util/logging.h" using arrow::MemoryPool; namespace parquet { +std::unique_ptr<::arrow::Codec> GetCodecFromArrow(Compression::type codec) { + std::unique_ptr<::arrow::Codec> result; + switch (codec) { + case Compression::UNCOMPRESSED: + break; + case Compression::SNAPPY: + PARQUET_THROW_NOT_OK(::arrow::Codec::Create(::arrow::Compression::SNAPPY, &result)); + break; + case Compression::GZIP: + PARQUET_THROW_NOT_OK(::arrow::Codec::Create(::arrow::Compression::GZIP, &result)); + break; + case Compression::LZO: + PARQUET_THROW_NOT_OK(::arrow::Codec::Create(::arrow::Compression::LZO, &result)); + break; + case Compression::BROTLI: + PARQUET_THROW_NOT_OK(::arrow::Codec::Create(::arrow::Compression::BROTLI, &result)); + break; + case Compression::LZ4: + PARQUET_THROW_NOT_OK(::arrow::Codec::Create(::arrow::Compression::LZ4, &result)); + break; + case Compression::ZSTD: + PARQUET_THROW_NOT_OK(::arrow::Codec::Create(::arrow::Compression::ZSTD, &result)); + break; + default: + break; + } + return result; +} + template Vector::Vector(int64_t size, MemoryPool* pool) : buffer_(AllocateBuffer(pool, size * sizeof(T))), size_(size), capacity_(size) { @@ -110,7 +140,7 @@ ChunkedAllocator::~ChunkedAllocator() { void ChunkedAllocator::ReturnPartialAllocation(int byte_size) { DCHECK_GE(byte_size, 0); - DCHECK(current_chunk_idx_ != -1); + DCHECK_NE(current_chunk_idx_, -1); ChunkInfo& info = chunks_[current_chunk_idx_]; DCHECK_GE(info.allocated_bytes, byte_size); info.allocated_bytes -= byte_size; diff --git a/cpp/src/parquet/util/memory.h b/cpp/src/parquet/util/memory.h index 088f86feddd29..2eadb33268ff6 100644 --- a/cpp/src/parquet/util/memory.h +++ b/cpp/src/parquet/util/memory.h @@ -30,44 +30,22 @@ #include "arrow/io/interfaces.h" #include "arrow/io/memory.h" #include "arrow/memory_pool.h" -#include "arrow/status.h" -#include "arrow/util/compression.h" #include "parquet/exception.h" #include "parquet/types.h" #include "parquet/util/macros.h" #include "parquet/util/visibility.h" +namespace arrow { + +class Codec; + +} // namespace arrow + namespace parquet { -static inline std::unique_ptr<::arrow::Codec> GetCodecFromArrow(Compression::type codec) { - std::unique_ptr<::arrow::Codec> result; - switch (codec) { - case Compression::UNCOMPRESSED: - break; - case Compression::SNAPPY: - PARQUET_THROW_NOT_OK(::arrow::Codec::Create(::arrow::Compression::SNAPPY, &result)); - break; - case Compression::GZIP: - PARQUET_THROW_NOT_OK(::arrow::Codec::Create(::arrow::Compression::GZIP, &result)); - break; - case Compression::LZO: - PARQUET_THROW_NOT_OK(::arrow::Codec::Create(::arrow::Compression::LZO, &result)); - break; - case Compression::BROTLI: - PARQUET_THROW_NOT_OK(::arrow::Codec::Create(::arrow::Compression::BROTLI, &result)); - break; - case Compression::LZ4: - PARQUET_THROW_NOT_OK(::arrow::Codec::Create(::arrow::Compression::LZ4, &result)); - break; - case Compression::ZSTD: - PARQUET_THROW_NOT_OK(::arrow::Codec::Create(::arrow::Compression::ZSTD, &result)); - break; - default: - break; - } - return result; -} +PARQUET_EXPORT +std::unique_ptr<::arrow::Codec> GetCodecFromArrow(Compression::type codec); static constexpr int64_t kInMemoryDefaultCapacity = 1024; @@ -94,7 +72,7 @@ class PARQUET_EXPORT Vector { int64_t capacity_; T* data_; - DISALLOW_COPY_AND_ASSIGN(Vector); + PARQUET_DISALLOW_COPY_AND_ASSIGN(Vector); }; /// A ChunkedAllocator maintains a list of memory chunks from which it @@ -194,7 +172,7 @@ class PARQUET_EXPORT ChunkedAllocator { explicit ChunkInfo(int64_t size, uint8_t* buf); - ChunkInfo() : data(nullptr), size(0), allocated_bytes(0) {} + ChunkInfo() : data(NULLPTR), size(0), allocated_bytes(0) {} }; /// chunk from which we served the last Allocate() call; @@ -291,6 +269,12 @@ class PARQUET_EXPORT ArrowFileMethods : virtual public FileInterface { virtual ::arrow::io::FileInterface* file_interface() = 0; }; +// Suppress C4250 warning caused by diamond inheritance +#ifdef _MSC_VER +#pragma warning(push) +#pragma warning(disable : 4250) +#endif + /// This interface depends on the threadsafety of the underlying Arrow file interface class PARQUET_EXPORT ArrowInputFile : public ArrowFileMethods, public RandomAccessSource { public: @@ -338,6 +322,11 @@ class PARQUET_EXPORT ArrowOutputStream : public ArrowFileMethods, public OutputS std::shared_ptr<::arrow::io::OutputStream> file_; }; +// Pop C4250 pragma +#ifdef _MSC_VER +#pragma warning(pop) +#endif + class PARQUET_EXPORT InMemoryOutputStream : public OutputStream { public: explicit InMemoryOutputStream( @@ -370,7 +359,7 @@ class PARQUET_EXPORT InMemoryOutputStream : public OutputStream { int64_t size_; int64_t capacity_; - DISALLOW_COPY_AND_ASSIGN(InMemoryOutputStream); + PARQUET_DISALLOW_COPY_AND_ASSIGN(InMemoryOutputStream); }; // ---------------------------------------------------------------------- @@ -379,7 +368,7 @@ class PARQUET_EXPORT InMemoryOutputStream : public OutputStream { // Interface for the column reader to get the bytes. The interface is a stream // interface, meaning the bytes in order and once a byte is read, it does not // need to be read again. -class InputStream { +class PARQUET_EXPORT InputStream { public: // Returns the next 'num_to_peek' without advancing the current position. // *num_bytes will contain the number of bytes returned which can only be diff --git a/cpp/src/parquet/util/visibility.h b/cpp/src/parquet/util/visibility.h index 984fac29abc72..929d3b22c8851 100644 --- a/cpp/src/parquet/util/visibility.h +++ b/cpp/src/parquet/util/visibility.h @@ -53,9 +53,11 @@ // This is a complicated topic, some reading on it: // http://www.codesynthesis.com/~boris/blog/2010/01/18/dll-export-cxx-templates/ -#ifdef _MSC_VER +#if defined(_MSC_VER) || defined(__clang__) +#define PARQUET_TEMPLATE_CLASS_EXPORT #define PARQUET_TEMPLATE_EXPORT PARQUET_EXPORT #else +#define PARQUET_TEMPLATE_CLASS_EXPORT PARQUET_EXPORT #define PARQUET_TEMPLATE_EXPORT #endif diff --git a/cpp/src/parquet/util/windows_compatibility.h b/cpp/src/parquet/util/windows_compatibility.h index 899590ac07e93..31ca04c8b660b 100644 --- a/cpp/src/parquet/util/windows_compatibility.h +++ b/cpp/src/parquet/util/windows_compatibility.h @@ -15,23 +15,16 @@ // specific language governing permissions and limitations // under the License. -#ifndef PARQUET_UTIL_WINDOWS_COMPATIBILITY -#define PARQUET_UTIL_WINDOWS_COMPATIBILITY +#pragma once -#ifdef _WIN32 - -// Windows defines min and max macros that mess up std::min/max -#ifndef NOMINMAX -#define NOMINMAX -#endif +#include "arrow/util/windows_compatibility.h" -#include -#include +#ifdef _WIN32 +// parquet.thrift's OPTIONAL RepetitionType conflicts with a #define from +// above, so we undefine it #ifdef OPTIONAL #undef OPTIONAL #endif -#endif // _WIN32 - -#endif // PARQUET_UTIL_WINDOWS_COMPATIBILITY +#endif diff --git a/cpp/src/plasma/CMakeLists.txt b/cpp/src/plasma/CMakeLists.txt index 88644b77d6fe6..116d534769160 100644 --- a/cpp/src/plasma/CMakeLists.txt +++ b/cpp/src/plasma/CMakeLists.txt @@ -88,14 +88,18 @@ if (ARROW_GPU) add_definitions(-DPLASMA_GPU) endif() - - ADD_ARROW_LIB(plasma SOURCES ${PLASMA_SRCS} + OUTPUTS PLASMA_LIBRARIES DEPENDENCIES gen_plasma_fbs SHARED_LINK_LIBS ${FLATBUFFERS_STATIC_LIB} ${CMAKE_THREAD_LIBS_INIT} ${PLASMA_LINK_LIBS} STATIC_LINK_LIBS ${FLATBUFFERS_STATIC_LIB} ${CMAKE_THREAD_LIBS_INIT} ${PLASMA_LINK_LIBS}) +foreach(LIB_TARGET ${PLASMA_LIBRARIES}) + target_compile_definitions(${LIB_TARGET} + PRIVATE ARROW_EXPORTING) +endforeach() + # The optimization flag -O3 is suggested by dlmalloc.c, which is #included in # malloc.cc; we set it here regardless of whether we do a debug or release build. set_source_files_properties(malloc.cc PROPERTIES diff --git a/cpp/submodules/parquet-testing b/cpp/submodules/parquet-testing new file mode 160000 index 0000000000000..48a657ca05eb3 --- /dev/null +++ b/cpp/submodules/parquet-testing @@ -0,0 +1 @@ +Subproject commit 48a657ca05eb308539f3f00c698e8bb5185d9b38 diff --git a/dev/release/rat_exclude_files.txt b/dev/release/rat_exclude_files.txt index 0f358ac6c4cbd..47f8d84054267 100644 --- a/dev/release/rat_exclude_files.txt +++ b/dev/release/rat_exclude_files.txt @@ -1,5 +1,6 @@ *.npmrc *.gitignore +.gitmodules *_generated.h *_generated.js *_generated.ts @@ -27,6 +28,7 @@ cpp/cmake_modules/SetupCxxFlags.cmake cpp/cmake_modules/SnappyCMakeLists.txt cpp/cmake_modules/SnappyConfig.h cpp/cmake_modules/CompilerInfo.cmake +cpp/src/parquet/.parquetcppversion cpp/src/plasma/thirdparty/ae/ae.c cpp/src/plasma/thirdparty/ae/ae.h cpp/src/plasma/thirdparty/ae/ae_epoll.c diff --git a/python/README.md b/python/README.md index e2ed9db6f50f8..c732e3b68b3e9 100644 --- a/python/README.md +++ b/python/README.md @@ -63,6 +63,5 @@ pip install -r doc/requirements.txt python setup.py build_sphinx -s doc/source ``` -[1]: https://github.com/apache/parquet-cpp [2]: https://github.com/apache/arrow/blob/master/python/doc/source/development.rst [3]: https://github.com/pandas-dev/pandas diff --git a/python/doc/source/development.rst b/python/doc/source/development.rst index 7ff16f3c6ff59..eefd9761ba4e1 100644 --- a/python/doc/source/development.rst +++ b/python/doc/source/development.rst @@ -54,14 +54,13 @@ Finally, set gcc 4.9 as the active compiler using: Environment Setup and Build --------------------------- -First, let's clone the Arrow and Parquet git repositories: +First, let's clone the Arrow git repository: .. code-block:: shell mkdir repos cd repos git clone https://github.com/apache/arrow.git - git clone https://github.com/apache/parquet-cpp.git You should now see @@ -70,7 +69,6 @@ You should now see $ ls -l total 8 drwxrwxr-x 12 wesm wesm 4096 Apr 15 19:19 arrow/ - drwxrwxr-x 12 wesm wesm 4096 Apr 15 19:19 parquet-cpp/ Using Conda ~~~~~~~~~~~ @@ -94,7 +92,6 @@ about our build toolchain: export ARROW_BUILD_TYPE=release export ARROW_BUILD_TOOLCHAIN=$CONDA_PREFIX - export PARQUET_BUILD_TOOLCHAIN=$CONDA_PREFIX export ARROW_HOME=$CONDA_PREFIX export PARQUET_HOME=$CONDA_PREFIX @@ -135,7 +132,7 @@ folder as the repositories and a target installation folder: source ./pyarrow/bin/activate pip install six numpy pandas cython pytest - # This is the folder where we will install Arrow and Parquet to during + # This is the folder where we will install the Arrow libraries during # development mkdir dist @@ -165,6 +162,7 @@ Now build and install the Arrow C++ libraries: cmake -DCMAKE_BUILD_TYPE=$ARROW_BUILD_TYPE \ -DCMAKE_INSTALL_PREFIX=$ARROW_HOME \ + -DARROW_PARQUET=on \ -DARROW_PYTHON=on \ -DARROW_PLASMA=on \ -DARROW_BUILD_TESTS=OFF \ @@ -176,25 +174,6 @@ Now build and install the Arrow C++ libraries: If you don't want to build and install the Plasma in-memory object store, you can omit the ``-DARROW_PLASMA=on`` flag. -Now, optionally build and install the Apache Parquet libraries in your -toolchain: - -.. code-block:: shell - - mkdir parquet-cpp/build - pushd parquet-cpp/build - - cmake -DCMAKE_BUILD_TYPE=$ARROW_BUILD_TYPE \ - -DCMAKE_INSTALL_PREFIX=$PARQUET_HOME \ - -DPARQUET_BUILD_BENCHMARKS=off \ - -DPARQUET_BUILD_EXECUTABLES=off \ - -DPARQUET_BUILD_TESTS=off \ - .. - - make -j4 - make install - popd - Now, build pyarrow: .. code-block:: shell @@ -203,8 +182,7 @@ Now, build pyarrow: python setup.py build_ext --build-type=$ARROW_BUILD_TYPE \ --with-parquet --with-plasma --inplace -If you did not build parquet-cpp, you can omit ``--with-parquet`` and if -you did not build with plasma, you can omit ``--with-plasma``. +If you did not build with plasma, you can omit ``--with-plasma``. You should be able to run the unit tests with: @@ -225,16 +203,15 @@ You should be able to run the unit tests with: ====== 1000 passed, 56 skipped, 6 xfailed, 19 warnings in 26.52 seconds ======= -To build a self-contained wheel (including Arrow C++ and Parquet C++), one -can set ``--bundle-arrow-cpp``: +To build a self-contained wheel (including the Arrow and Parquet C++ +libraries), one can set ``--bundle-arrow-cpp``: .. code-block:: shell python setup.py build_ext --build-type=$ARROW_BUILD_TYPE \ --with-parquet --with-plasma --bundle-arrow-cpp bdist_wheel -Again, if you did not build parquet-cpp, you should omit ``--with-parquet`` and -if you did not build with plasma, you should omit ``--with-plasma``. +Again, if you did not build with plasma, you should omit ``--with-plasma``. Building with optional ORC integration -------------------------------------- @@ -283,12 +260,11 @@ First, we bootstrap a conda environment similar to the `C++ build instructions `_. This includes all the dependencies for Arrow and the Apache Parquet C++ libraries. -First, starting from fresh clones of Apache Arrow and parquet-cpp: +First, starting from fresh clones of Apache Arrow: .. code-block:: shell git clone https://github.com/apache/arrow.git - git clone https://github.com/apache/parquet-cpp.git .. code-block:: shell @@ -311,25 +287,11 @@ Now, we build and install Arrow C++ libraries -DCMAKE_BUILD_TYPE=Release ^ -DARROW_BUILD_TESTS=on ^ -DARROW_CXXFLAGS="/WX /MP" ^ + -DARROW_PARQUET=on ^ -DARROW_PYTHON=on .. cmake --build . --target INSTALL --config Release cd ..\.. -Now, we build parquet-cpp and install the result in the same place: - -.. code-block:: shell - - mkdir ..\parquet-cpp\build - pushd ..\parquet-cpp\build - set PARQUET_BUILD_TOOLCHAIN=%CONDA_PREFIX%\Library - set PARQUET_HOME=C:\thirdparty - cmake -G "Visual Studio 14 2015 Win64" ^ - -DCMAKE_INSTALL_PREFIX=%PARQUET_HOME% ^ - -DCMAKE_BUILD_TYPE=Release ^ - -DPARQUET_BUILD_TESTS=off .. - cmake --build . --target INSTALL --config Release - popd - After that, we must put the install directory's bin path in our ``%PATH%``: .. code-block:: shell @@ -360,75 +322,3 @@ Getting ``python-test.exe`` to run is a bit tricky because your set PYTHONHOME=%CONDA_PREFIX% Now ``python-test.exe`` or simply ``ctest`` (to run all tests) should work. - -Nightly Builds of ``arrow-cpp``, ``parquet-cpp``, and ``pyarrow`` for Linux ---------------------------------------------------------------------------- - -Nightly builds of Linux conda packages for ``arrow-cpp``, ``parquet-cpp``, and -``pyarrow`` can be automated using an open source tool called `scourge -`_. - -``scourge`` is new, so please report any feature requests or bugs to the -`scourge issue tracker `_. - -To get scourge you need to clone the source and install it in development mode. - -To setup your own nightly builds: - -#. Clone and install scourge -#. Create a script that calls scourge -#. Run that script as a cronjob once per day - -First, clone and install scourge (you also need to `install docker -`_): - -.. code:: sh - - git clone https://github.com/cpcloud/scourge - cd scourge - python setup.py develop - which scourge - -Second, create a shell script that calls scourge: - -.. code:: sh - - function build() { - # make sure we got a working directory - workingdir="${1}" - [ -z "${workingdir}" ] && echo "Must provide a working directory" && exit 1 - scourge="/path/to/scourge" - - # get the hash of master for building parquet - PARQUET_ARROW_VERSION="$("${scourge}" sha apache/arrow master)" - - # setup the build for each package - "${scourge}" init arrow-cpp@master parquet-cpp@master pyarrow@master - - # build the packages with some constraints (the -c arguments) - # -e sets environment variables on a per package basis - "${scourge}" build \ - -e parquet-cpp:PARQUET_ARROW_VERSION="${PARQUET_ARROW_VERSION}" \ - -c "python >=2.7,<3|>=3.5" \ - -c "numpy >= 1.11" \ - -c "r-base >=3.3.2" - } - - workingdir="$(date +'%Y%m%d_%H_%M_%S')" - mkdir -p "${workingdir}" - build "${workingdir}" > "${workingdir}"/scourge.log 2>&1 - -Third, run that script as a cronjob once per day: - -.. code:: sh - - crontab -e - -then in the scratch file that's opened: - -.. code:: sh - - @daily /path/to/the/above/script.sh - -The build artifacts (conda packages) will be located in -``${workingdir}/artifacts/linux-64``. diff --git a/python/manylinux1/Dockerfile-x86_64 b/python/manylinux1/Dockerfile-x86_64 index 35be879cad9bc..04566ece29ddf 100644 --- a/python/manylinux1/Dockerfile-x86_64 +++ b/python/manylinux1/Dockerfile-x86_64 @@ -20,14 +20,19 @@ ADD arrow /arrow WORKDIR /arrow/cpp RUN mkdir build-plain WORKDIR /arrow/cpp/build-plain -RUN cmake -GNinja -DCMAKE_BUILD_TYPE=Release -DCMAKE_INSTALL_PREFIX=/arrow-dist -DARROW_BUILD_TESTS=OFF -DARROW_BUILD_SHARED=ON -DARROW_BOOST_USE_SHARED=ON -DARROW_JEMALLOC=ON -DARROW_RPATH_ORIGIN=ON -DARROW_JEMALLOC_USE_SHARED=OFF -DBoost_NAMESPACE=arrow_boost -DBOOST_ROOT=/arrow_boost_dist .. +RUN cmake -GNinja -DCMAKE_BUILD_TYPE=Release \ + -DCMAKE_INSTALL_PREFIX=/arrow-dist \ + -DCMAKE_INSTALL_LIBDIR=lib \ + -DARROW_BUILD_TESTS=OFF \ + -DARROW_BUILD_SHARED=ON \ + -DARROW_BOOST_USE_SHARED=ON \ + -DARROW_JEMALLOC=ON \ + -DARROW_PARQUET=ON \ + -DARROW_RPATH_ORIGIN=ON \ + -DARROW_JEMALLOC_USE_SHARED=OFF \ + -DBoost_NAMESPACE=arrow_boost \ + -DBOOST_ROOT=/arrow_boost_dist .. RUN ninja install ADD scripts/check_arrow_visibility.sh / RUN /check_arrow_visibility.sh - -WORKDIR / -RUN git clone https://github.com/apache/parquet-cpp.git -WORKDIR /parquet-cpp -RUN ARROW_HOME=/arrow-dist cmake -DCMAKE_BUILD_TYPE=Release -DCMAKE_INSTALL_PREFIX=/arrow-dist -DPARQUET_BUILD_TESTS=OFF -DPARQUET_BUILD_SHARED=ON -DPARQUET_BUILD_STATIC=OFF -DPARQUET_BOOST_USE_SHARED=ON -DBoost_NAMESPACE=arrow_boost -DBOOST_ROOT=/arrow_boost_dist -DPARQUET_RPATH_ORIGIN=ON -GNinja . -RUN ninja install diff --git a/python/manylinux1/build_arrow.sh b/python/manylinux1/build_arrow.sh index 8e609f5bb0746..13138424271b5 100755 --- a/python/manylinux1/build_arrow.sh +++ b/python/manylinux1/build_arrow.sh @@ -19,8 +19,6 @@ # # Usage: # docker run --rm -v $PWD:/io arrow-base-x86_64 /io/build_arrow.sh -# or with Parquet support -# docker run --rm -v $PWD:/io parquet_arrow-base-x86_64 /io/build_arrow.sh # Build upon the scripts in https://github.com/matthew-brett/manylinux-builds # * Copyright (c) 2013-2016, Matt Terry and Matthew Brett (BSD 2-clause) @@ -43,7 +41,8 @@ export PYARROW_WITH_PLASMA=1 export PYARROW_BUNDLE_ARROW_CPP=1 export PYARROW_BUNDLE_BOOST=1 export PYARROW_BOOST_NAMESPACE=arrow_boost -export PKG_CONFIG_PATH=/arrow-dist/lib64/pkgconfig +export PKG_CONFIG_PATH=/arrow-dist/lib/pkgconfig + export PYARROW_CMAKE_OPTIONS='-DTHRIFT_HOME=/usr -DBoost_NAMESPACE=arrow_boost -DBOOST_ROOT=/arrow_boost_dist' # Ensure the target directory exists mkdir -p /io/dist @@ -69,7 +68,22 @@ for PYTHON_TUPLE in ${PYTHON_VERSIONS}; do ARROW_BUILD_DIR=/arrow/cpp/build-PY${PYTHON}-${U_WIDTH} mkdir -p "${ARROW_BUILD_DIR}" pushd "${ARROW_BUILD_DIR}" - PATH="${CPYTHON_PATH}/bin:$PATH" cmake -DCMAKE_BUILD_TYPE=Release -DCMAKE_INSTALL_PREFIX=/arrow-dist -DARROW_BUILD_TESTS=OFF -DARROW_BUILD_SHARED=ON -DARROW_BOOST_USE_SHARED=ON -DARROW_JEMALLOC=ON -DARROW_RPATH_ORIGIN=ON -DARROW_PYTHON=ON -DPythonInterp_FIND_VERSION=${PYTHON} -DARROW_PLASMA=ON -DARROW_TENSORFLOW=ON -DARROW_ORC=ON -DBoost_NAMESPACE=arrow_boost -DBOOST_ROOT=/arrow_boost_dist -GNinja .. + PATH="${CPYTHON_PATH}/bin:$PATH" cmake -DCMAKE_BUILD_TYPE=Release \ + -DCMAKE_INSTALL_PREFIX=/arrow-dist \ + -DCMAKE_INSTALL_LIBDIR=lib \ + -DARROW_BUILD_TESTS=OFF \ + -DARROW_BUILD_SHARED=ON \ + -DARROW_BOOST_USE_SHARED=ON \ + -DARROW_JEMALLOC=ON \ + -DARROW_RPATH_ORIGIN=ON \ + -DARROW_PYTHON=ON \ + -DPythonInterp_FIND_VERSION=${PYTHON} \ + -DARROW_PLASMA=ON \ + -DARROW_TENSORFLOW=ON \ + -DARROW_ORC=ON \ + -DBoost_NAMESPACE=arrow_boost \ + -DBOOST_ROOT=/arrow_boost_dist \ + -GNinja .. ninja install popd diff --git a/python/manylinux1/scripts/check_arrow_visibility.sh b/python/manylinux1/scripts/check_arrow_visibility.sh index bed357edf664a..477da658d9180 100755 --- a/python/manylinux1/scripts/check_arrow_visibility.sh +++ b/python/manylinux1/scripts/check_arrow_visibility.sh @@ -16,7 +16,7 @@ # specific language governing permissions and limitations # under the License. -nm -D -C /arrow-dist/lib64/libarrow.so > nm_arrow.log +nm -D -C /arrow-dist/lib/libarrow.so > nm_arrow.log grep ' T ' nm_arrow.log | grep -v arrow > visible_symbols.log if [[ `cat visible_symbols.log | wc -l` -eq 2 ]] diff --git a/python/pyarrow/_parquet.pxd b/python/pyarrow/_parquet.pxd index 564391e9ca5d8..b2f0a461ca61d 100644 --- a/python/pyarrow/_parquet.pxd +++ b/python/pyarrow/_parquet.pxd @@ -288,7 +288,7 @@ cdef extern from "parquet/arrow/reader.h" namespace "parquet::arrow" nogil: const ParquetFileReader* parquet_reader() - void set_num_threads(int num_threads) + void set_use_threads(c_bool use_threads) cdef extern from "parquet/arrow/schema.h" namespace "parquet::arrow" nogil: diff --git a/python/pyarrow/_parquet.pyx b/python/pyarrow/_parquet.pyx index 7b97d06e7a1a7..937e70a5b4971 100644 --- a/python/pyarrow/_parquet.pyx +++ b/python/pyarrow/_parquet.pyx @@ -674,16 +674,17 @@ cdef class ParquetReader: def num_row_groups(self): return self.reader.get().num_row_groups() - def set_num_threads(self, int nthreads): - self.reader.get().set_num_threads(nthreads) + def set_use_threads(self, bint use_threads): + self.reader.get().set_use_threads(use_threads) - def read_row_group(self, int i, column_indices=None, nthreads=None): + def read_row_group(self, int i, column_indices=None, + bint use_threads=True): cdef: shared_ptr[CTable] ctable vector[int] c_column_indices - if nthreads: - self.set_num_threads(nthreads) + if use_threads: + self.set_use_threads(use_threads) if column_indices is not None: for index in column_indices: @@ -699,13 +700,13 @@ cdef class ParquetReader: .ReadRowGroup(i, &ctable)) return pyarrow_wrap_table(ctable) - def read_all(self, column_indices=None, nthreads=None): + def read_all(self, column_indices=None, bint use_threads=True): cdef: shared_ptr[CTable] ctable vector[int] c_column_indices - if nthreads: - self.set_num_threads(nthreads) + if use_threads: + self.set_use_threads(use_threads) if column_indices is not None: for index in column_indices: diff --git a/python/pyarrow/filesystem.py b/python/pyarrow/filesystem.py index 7dd94a8c08807..f1d0eec3f8df5 100644 --- a/python/pyarrow/filesystem.py +++ b/python/pyarrow/filesystem.py @@ -148,7 +148,8 @@ def _isfilestore(self): raise NotImplementedError def read_parquet(self, path, columns=None, metadata=None, schema=None, - nthreads=1, use_pandas_metadata=False): + use_threads=True, nthreads=None, + use_pandas_metadata=False): """ Read Parquet data from path in file system. Can read from a single file or a directory of files @@ -164,9 +165,8 @@ def read_parquet(self, path, columns=None, metadata=None, schema=None, schema : pyarrow.parquet.Schema Known schema to validate files against. Alternative to metadata argument - nthreads : int, default 1 - Number of columns to read in parallel. If > 1, requires that the - underlying file source is threadsafe + use_threads : boolean, default True + Perform multi-threaded column reads use_pandas_metadata : boolean, default False If True and file has custom pandas schema metadata, ensure that index columns are also loaded @@ -176,9 +176,11 @@ def read_parquet(self, path, columns=None, metadata=None, schema=None, table : pyarrow.Table """ from pyarrow.parquet import ParquetDataset + from pyarrow.util import _deprecate_nthreads + use_threads = _deprecate_nthreads(use_threads, nthreads) dataset = ParquetDataset(path, schema=schema, metadata=metadata, filesystem=self) - return dataset.read(columns=columns, nthreads=nthreads, + return dataset.read(columns=columns, use_threads=use_threads, use_pandas_metadata=use_pandas_metadata) def open(self, path, mode='rb'): diff --git a/python/pyarrow/parquet.py b/python/pyarrow/parquet.py index d56a67fd6876c..6c2539ccce6aa 100644 --- a/python/pyarrow/parquet.py +++ b/python/pyarrow/parquet.py @@ -32,8 +32,7 @@ ParquetSchema, ColumnSchema) from pyarrow.filesystem import (LocalFileSystem, _ensure_filesystem, _get_fs_from_path) -from pyarrow.util import _is_path_like, _stringify_path - +from pyarrow.util import _is_path_like, _stringify_path, _deprecate_nthreads # ---------------------------------------------------------------------- # Reading a single Parquet file @@ -89,8 +88,8 @@ def schema(self): def num_row_groups(self): return self.reader.num_row_groups - def read_row_group(self, i, columns=None, nthreads=1, - use_pandas_metadata=False): + def read_row_group(self, i, columns=None, nthreads=None, + use_threads=True, use_pandas_metadata=False): """ Read a single row group from a Parquet file @@ -100,9 +99,8 @@ def read_row_group(self, i, columns=None, nthreads=1, If not None, only these columns will be read from the row group. A column name may be a prefix of a nested field, e.g. 'a' will select 'a.b', 'a.c', and 'a.d.e' - nthreads : int, default 1 - Number of columns to read in parallel. If > 1, requires that the - underlying file source is threadsafe + use_threads : boolean, default True + Perform multi-threaded column reads use_pandas_metadata : boolean, default False If True and file has custom pandas schema metadata, ensure that index columns are also loaded @@ -112,12 +110,13 @@ def read_row_group(self, i, columns=None, nthreads=1, pyarrow.table.Table Content of the row group as a table (of columns) """ + use_threads = _deprecate_nthreads(use_threads, nthreads) column_indices = self._get_column_indices( columns, use_pandas_metadata=use_pandas_metadata) return self.reader.read_row_group(i, column_indices=column_indices, - nthreads=nthreads) + use_threads=use_threads) - def read(self, columns=None, nthreads=1, use_pandas_metadata=False): + def read(self, columns=None, use_threads=True, use_pandas_metadata=False): """ Read a Table from Parquet format @@ -127,9 +126,8 @@ def read(self, columns=None, nthreads=1, use_pandas_metadata=False): If not None, only these columns will be read from the file. A column name may be a prefix of a nested field, e.g. 'a' will select 'a.b', 'a.c', and 'a.d.e' - nthreads : int, default 1 - Number of columns to read in parallel. If > 1, requires that the - underlying file source is threadsafe + use_threads : boolean, default True + Perform multi-threaded column reads use_pandas_metadata : boolean, default False If True and file has custom pandas schema metadata, ensure that index columns are also loaded @@ -142,7 +140,7 @@ def read(self, columns=None, nthreads=1, use_pandas_metadata=False): column_indices = self._get_column_indices( columns, use_pandas_metadata=use_pandas_metadata) return self.reader.read_all(column_indices=column_indices, - nthreads=nthreads) + use_threads=use_threads) def scan_contents(self, columns=None, batch_size=65536): """ @@ -415,7 +413,7 @@ def _open(self, open_file_func=None): reader = ParquetFile(reader) return reader - def read(self, columns=None, nthreads=1, partitions=None, + def read(self, columns=None, use_threads=True, partitions=None, open_file_func=None, file=None, use_pandas_metadata=False): """ Read this piece as a pyarrow.Table @@ -423,8 +421,8 @@ def read(self, columns=None, nthreads=1, partitions=None, Parameters ---------- columns : list of column names, default None - nthreads : int, default 1 - For multithreaded file reads + use_threads : boolean, default True + Perform multi-threaded column reads partitions : ParquetPartitions, default None open_file_func : function, default None A function that knows how to construct a ParquetFile object given @@ -445,7 +443,7 @@ def read(self, columns=None, nthreads=1, partitions=None, reader = ParquetFile(self.path) options = dict(columns=columns, - nthreads=nthreads, + use_threads=use_threads, use_pandas_metadata=use_pandas_metadata) if self.row_group is not None: @@ -857,7 +855,7 @@ def validate_schemas(self): .format(piece, file_schema, dataset_schema)) - def read(self, columns=None, nthreads=1, use_pandas_metadata=False): + def read(self, columns=None, use_threads=True, use_pandas_metadata=False): """ Read multiple Parquet files as a single pyarrow.Table @@ -865,9 +863,8 @@ def read(self, columns=None, nthreads=1, use_pandas_metadata=False): ---------- columns : List[str] Names of columns to read from the file - nthreads : int, default 1 - Number of columns to read in parallel. Requires that the underlying - file source is threadsafe + use_threads : boolean, default True + Perform multi-threaded column reads use_pandas_metadata : bool, default False Passed through to each dataset piece @@ -880,7 +877,7 @@ def read(self, columns=None, nthreads=1, use_pandas_metadata=False): tables = [] for piece in self.pieces: - table = piece.read(columns=columns, nthreads=nthreads, + table = piece.read(columns=columns, use_threads=use_threads, partitions=self.partitions, open_file_func=open_file, use_pandas_metadata=use_pandas_metadata) @@ -994,9 +991,8 @@ def _make_manifest(path_or_paths, fs, pathsep='/', metadata_nthreads=1): If not None, only these columns will be read from the file. A column name may be a prefix of a nested field, e.g. 'a' will select 'a.b', 'a.c', and 'a.d.e' -nthreads : int, default 1 - Number of columns to read in parallel. Requires that the underlying - file source is threadsafe +use_threads : boolean, default True + Perform multi-threaded column reads metadata : FileMetaData If separately computed {1} @@ -1007,15 +1003,17 @@ def _make_manifest(path_or_paths, fs, pathsep='/', metadata_nthreads=1): """ -def read_table(source, columns=None, nthreads=1, metadata=None, - use_pandas_metadata=False): +def read_table(source, columns=None, use_threads=True, metadata=None, + use_pandas_metadata=False, nthreads=None): + use_threads = _deprecate_nthreads(use_threads, nthreads) if _is_path_like(source): fs = _get_fs_from_path(source) - return fs.read_parquet(source, columns=columns, metadata=metadata, + return fs.read_parquet(source, columns=columns, + use_threads=use_threads, metadata=metadata, use_pandas_metadata=use_pandas_metadata) pf = ParquetFile(source, metadata=metadata) - return pf.read(columns=columns, nthreads=nthreads, + return pf.read(columns=columns, use_threads=use_threads, use_pandas_metadata=use_pandas_metadata) @@ -1028,8 +1026,10 @@ def read_table(source, columns=None, nthreads=1, metadata=None, Content of the file as a table (of columns)""") -def read_pandas(source, columns=None, nthreads=1, metadata=None): - return read_table(source, columns=columns, nthreads=nthreads, +def read_pandas(source, columns=None, use_threads=True, + nthreads=None, metadata=None): + return read_table(source, columns=columns, + use_threads=use_threads, metadata=metadata, use_pandas_metadata=True) diff --git a/python/pyarrow/tests/test_parquet.py b/python/pyarrow/tests/test_parquet.py index b40294a35584c..556b1558a51c2 100644 --- a/python/pyarrow/tests/test_parquet.py +++ b/python/pyarrow/tests/test_parquet.py @@ -921,10 +921,10 @@ def test_multithreaded_read(): _write_table(table, buf, compression='SNAPPY', version='2.0') buf.seek(0) - table1 = _read_table(buf, nthreads=4) + table1 = _read_table(buf, use_threads=True) buf.seek(0) - table2 = _read_table(buf, nthreads=1) + table2 = _read_table(buf, use_threads=False) assert table1.equals(table2) @@ -1556,9 +1556,9 @@ def test_read_multiple_files(tempdir): # Write a _SUCCESS.crc file (dirpath / '_SUCCESS.crc').touch() - def read_multiple_files(paths, columns=None, nthreads=None, **kwargs): + def read_multiple_files(paths, columns=None, use_threads=True, **kwargs): dataset = pq.ParquetDataset(paths, **kwargs) - return dataset.read(columns=columns, nthreads=nthreads) + return dataset.read(columns=columns, use_threads=use_threads) result = read_multiple_files(paths) expected = pa.concat_tables(test_data) @@ -1583,7 +1583,7 @@ def read_multiple_files(paths, columns=None, nthreads=None, **kwargs): assert result.equals(expected) # Read with multiple threads - pa.localfs.read_parquet(dirpath, nthreads=2) + pa.localfs.read_parquet(dirpath, use_threads=True) # Test failure modes with non-uniform metadata bad_apple = _test_dataframe(size, seed=i).iloc[:, :4] diff --git a/python/pyarrow/util.py b/python/pyarrow/util.py index 12064e6232b05..1c26ee5e22f73 100644 --- a/python/pyarrow/util.py +++ b/python/pyarrow/util.py @@ -72,3 +72,13 @@ def _stringify_path(path): return str(path) raise TypeError("not a path-like object") + + +def _deprecate_nthreads(use_threads, nthreads): + if nthreads is not None: + warnings.warn("`nthreads` argument is deprecated, " + "pass `use_threads` instead", FutureWarning, + stacklevel=3) + if nthreads > 1: + use_threads = True + return use_threads diff --git a/python/testing/functions.sh b/python/testing/functions.sh index 3bfb5b8a53f3e..983f490331ff8 100644 --- a/python/testing/functions.sh +++ b/python/testing/functions.sh @@ -63,6 +63,7 @@ build_arrow() { -DCMAKE_BUILD_TYPE=$BUILD_TYPE \ -DCMAKE_INSTALL_PREFIX=$ARROW_HOME \ -DARROW_NO_DEPRECATED_API=ON \ + -DARROW_PARQUET=ON \ -DARROW_PYTHON=ON \ -DARROW_PLASMA=ON \ -DARROW_BOOST_USE_SHARED=off \ @@ -72,29 +73,3 @@ build_arrow() { ninja install popd } - -build_parquet() { - PARQUET_DIR=$BUILD_DIR/parquet - mkdir -p $PARQUET_DIR - - git clone https://github.com/apache/parquet-cpp.git $PARQUET_DIR - - pushd $PARQUET_DIR - mkdir build-dir - cd build-dir - - cmake \ - -GNinja \ - -DCMAKE_BUILD_TYPE=$BUILD_TYPE \ - -DCMAKE_INSTALL_PREFIX=$PARQUET_HOME \ - -DPARQUET_BOOST_USE_SHARED=off \ - -DPARQUET_BUILD_BENCHMARKS=off \ - -DPARQUET_BUILD_EXECUTABLES=off \ - -DPARQUET_BUILD_TESTS=off \ - .. - - ninja - ninja install - - popd -}