From 525f8b4b80146de1db3192e1e9ca581ce5a8c6b7 Mon Sep 17 00:00:00 2001 From: James Lamb Date: Fri, 12 Jul 2024 09:31:34 -0700 Subject: [PATCH] [c++] remove HDFS support (fixes #6436) (#6534) --- CMakeLists.txt | 20 ------ build-python.sh | 5 -- docs/Installation-Guide.rst | 35 ---------- docs/_static/js/script.js | 1 - python-package/README.rst | 19 ------ src/io/file_io.cpp | 129 ------------------------------------ 6 files changed, 209 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 09eaaa214261..c287b6b31039 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -2,7 +2,6 @@ option(USE_MPI "Enable MPI-based distributed learning" OFF) option(USE_OPENMP "Enable OpenMP" ON) option(USE_GPU "Enable GPU-accelerated training" OFF) option(USE_SWIG "Enable SWIG to generate Java API" OFF) -option(USE_HDFS "Enable HDFS support (EXPERIMENTAL)" OFF) option(USE_TIMETAG "Set to ON to output time costs" OFF) option(USE_CUDA "Enable CUDA-accelerated training " OFF) option(USE_DEBUG "Set to ON for Debug mode" OFF) @@ -294,21 +293,6 @@ if(USE_CUDA) endforeach() endif() -if(USE_HDFS) - message( - DEPRECATION - "HDFS support in LightGBM is deprecated, and will be removed in a future release.\ - See https://github.com/microsoft/LightGBM/issues/6436. - " - ) - find_package(JNI REQUIRED) - find_path(HDFS_INCLUDE_DIR hdfs.h REQUIRED) - find_library(HDFS_LIB NAMES hdfs REQUIRED) - include_directories(${HDFS_INCLUDE_DIR}) - add_definitions(-DUSE_HDFS) - set(HDFS_CXX_LIBRARIES ${HDFS_LIB} ${JAVA_JVM_LIBRARY}) -endif() - include(CheckCXXSourceCompiles) check_cxx_source_compiles(" #include @@ -647,10 +631,6 @@ if(USE_CUDA) target_link_libraries(_lightgbm PRIVATE ${histograms}) endif() -if(USE_HDFS) - target_link_libraries(lightgbm_objs PUBLIC ${HDFS_CXX_LIBRARIES}) -endif() - if(WIN32) if(MINGW OR CYGWIN) target_link_libraries(lightgbm_objs PUBLIC ws2_32 iphlpapi) diff --git a/build-python.sh b/build-python.sh index 01c3cf7c3e02..afb4667acf97 100755 --- a/build-python.sh +++ b/build-python.sh @@ -40,8 +40,6 @@ # Compile CUDA version. # --gpu # Compile GPU version. -# --hdfs -# Compile HDFS version. # --integrated-opencl # Compile integrated OpenCL version. # --mingw @@ -148,9 +146,6 @@ while [ $# -gt 0 ]; do --gpu) BUILD_ARGS="${BUILD_ARGS} --config-setting=cmake.define.USE_GPU=ON" ;; - --hdfs) - BUILD_ARGS="${BUILD_ARGS} --config-setting=cmake.define.USE_HDFS=ON" - ;; --integrated-opencl) BUILD_ARGS="${BUILD_ARGS} --config-setting=cmake.define.__INTEGRATE_OPENCL=ON" ;; diff --git a/docs/Installation-Guide.rst b/docs/Installation-Guide.rst index c59898032b70..427e11feb65e 100644 --- a/docs/Installation-Guide.rst +++ b/docs/Installation-Guide.rst @@ -628,41 +628,6 @@ Windows The CUDA version is not supported on Windows. Use the GPU version (``device_type=gpu``) for GPU acceleration on Windows. -Build HDFS Version -~~~~~~~~~~~~~~~~~~ - -.. warning:: - HDFS support in LightGBM is deprecated, and will be removed in a future release. - See https://github.com/microsoft/LightGBM/issues/6436. - -The HDFS version of LightGBM was tested on CDH-5.14.4 cluster. - -Linux -^^^^^ - -On Linux a HDFS version of LightGBM can be built using **CMake** and **gcc**. - -1. Install `CMake`_. - -2. Run the following commands: - - .. code:: sh - - git clone --recursive https://github.com/microsoft/LightGBM - cd LightGBM - cmake -B build -S . -DUSE_HDFS=ON - # if you have installed HDFS to a customized location, you should specify paths to HDFS headers (hdfs.h) and library (libhdfs.so) like the following: - # cmake \ - # -DUSE_HDFS=ON \ - # -DHDFS_LIB="/opt/cloudera/parcels/CDH-5.14.4-1.cdh5.14.4.p0.3/lib64/libhdfs.so" \ - # -DHDFS_INCLUDE_DIR="/opt/cloudera/parcels/CDH-5.14.4-1.cdh5.14.4.p0.3/include/" \ - # .. - cmake --build build -j4 - -**Note**: glibc >= 2.14 is required. - -**Note**: In some rare cases you may need to install OpenMP runtime library separately (use your package manager and search for ``lib[g|i]omp`` for doing this). - Build Java Wrapper ~~~~~~~~~~~~~~~~~~ diff --git a/docs/_static/js/script.js b/docs/_static/js/script.js index 89d14d14aaf0..107a6a4969a3 100644 --- a/docs/_static/js/script.js +++ b/docs/_static/js/script.js @@ -29,7 +29,6 @@ $(function() { '#build-mpi-version', '#build-gpu-version', '#build-cuda-version', - '#build-hdfs-version', '#build-java-wrapper', '#build-c-unit-tests' ]; diff --git a/python-package/README.rst b/python-package/README.rst index 86d5da32b755..ffbe76123776 100644 --- a/python-package/README.rst +++ b/python-package/README.rst @@ -155,23 +155,6 @@ All requirements from `Build from Sources section <#build-from-sources>`__ apply To use the CUDA version within Python, pass ``{"device": "cuda"}`` respectively in parameters. -Build HDFS Version -~~~~~~~~~~~~~~~~~~ - -.. warning:: - HDFS support in LightGBM is deprecated, and will be removed in a future release. - See https://github.com/microsoft/LightGBM/issues/6436. - -.. code:: sh - - pip install lightgbm --config-settings=cmake.define.USE_HDFS=ON - -All requirements from `Build from Sources section <#build-from-sources>`__ apply for this installation option as well. - -**HDFS** library is needed: details for installation can be found in `Installation Guide `__. - -Note that the installation process of HDFS version was tested only on **Linux**. - Build with MinGW-w64 on Windows ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ @@ -247,8 +230,6 @@ Run ``sh ./build-python.sh install --gpu`` to enable GPU support. All requiremen Run ``sh ./build-python.sh install --cuda`` to enable CUDA support. All requirements from `Build CUDA Version section <#build-cuda-version>`__ apply for this installation option as well. -Run ``sh ./build-python.sh install --hdfs`` to enable HDFS support. All requirements from `Build HDFS Version section <#build-hdfs-version>`__ apply for this installation option as well. - Run ``sh ./build-python.sh install --bit32``, if you want to use 32-bit version. All requirements from `Build 32-bit Version with 32-bit Python section <#build-32-bit-version-with-32-bit-python>`__ apply for this installation option as well. Run ``sh ./build-python.sh install --time-costs``, if you want to output time costs for different internal routines. All requirements from `Build with Time Costs Output section <#build-with-time-costs-output>`__ apply for this installation option as well. diff --git a/src/io/file_io.cpp b/src/io/file_io.cpp index a2721e96c2dd..0dda86620e87 100644 --- a/src/io/file_io.cpp +++ b/src/io/file_io.cpp @@ -11,10 +11,6 @@ #include #include -#ifdef USE_HDFS -#include -#endif - namespace LightGBM { struct LocalFile : VirtualFileReader, VirtualFileWriter { @@ -56,142 +52,17 @@ struct LocalFile : VirtualFileReader, VirtualFileWriter { const std::string mode_; }; -const char* kHdfsProto = "hdfs://"; - -#ifdef USE_HDFS -const size_t kHdfsProtoLength = static_cast(strlen(kHdfsProto)); - -struct HDFSFile : VirtualFileReader, VirtualFileWriter { - HDFSFile(const std::string& filename, int flags) - : filename_(filename), flags_(flags) {} - ~HDFSFile() { - if (file_ != NULL) { - hdfsCloseFile(fs_, file_); - } - } - - bool Init() { - if (file_ == NULL) { - if (fs_ == NULL) { - fs_ = GetHDFSFileSystem(filename_); - } - if (fs_ != NULL && - (flags_ == O_WRONLY || 0 == hdfsExists(fs_, filename_.c_str()))) { - file_ = hdfsOpenFile(fs_, filename_.c_str(), flags_, 0, 0, 0); - } - } - return file_ != NULL; - } - - bool Exists() const { - if (fs_ == NULL) { - fs_ = GetHDFSFileSystem(filename_); - } - return fs_ != NULL && 0 == hdfsExists(fs_, filename_.c_str()); - } - - size_t Read(void* data, size_t bytes) const { - return FileOperation(data, bytes, &hdfsRead); - } - - size_t Write(const void* data, size_t bytes) const { - return FileOperation(data, bytes, &hdfsWrite); - } - - private: - template - using fileOp = tSize (*)(hdfsFS, hdfsFile, BufferType, tSize); - - template - inline size_t FileOperation(BufferType data, size_t bytes, - fileOp op) const { - char* buffer = const_cast(static_cast(data)); - size_t remain = bytes; - while (remain != 0) { - size_t nmax = static_cast(std::numeric_limits::max()); - tSize ret = op(fs_, file_, buffer, std::min(nmax, remain)); - if (ret > 0) { - size_t n = static_cast(ret); - remain -= n; - buffer += n; - } else if (ret == 0) { - break; - } else if (errno != EINTR) { - Log::Fatal("Failed HDFS file operation [%s]", strerror(errno)); - } - } - return bytes - remain; - } - - static hdfsFS GetHDFSFileSystem(const std::string& uri) { - size_t end = uri.find("/", kHdfsProtoLength); - if (uri.find(kHdfsProto) != 0 || end == std::string::npos) { - Log::Warning("Bad HDFS uri, no namenode found [%s]", uri.c_str()); - return NULL; - } - std::string hostport = uri.substr(kHdfsProtoLength, end - kHdfsProtoLength); - if (fs_cache_.count(hostport) == 0) { - fs_cache_[hostport] = MakeHDFSFileSystem(hostport); - } - return fs_cache_[hostport]; - } - - static hdfsFS MakeHDFSFileSystem(const std::string& hostport) { - std::istringstream iss(hostport); - std::string host; - tPort port = 0; - std::getline(iss, host, ':'); - iss >> port; - hdfsFS fs = iss.eof() ? hdfsConnect(host.c_str(), port) : NULL; - if (fs == NULL) { - Log::Warning("Could not connect to HDFS namenode [%s]", hostport.c_str()); - } - return fs; - } - - mutable hdfsFS fs_ = NULL; - hdfsFile file_ = NULL; - const std::string filename_; - const int flags_; - static std::unordered_map fs_cache_; -}; - -std::unordered_map HDFSFile::fs_cache_ = - std::unordered_map(); - -#define WITH_HDFS(x) x -#else -#define WITH_HDFS(x) Log::Fatal("HDFS support is not enabled") -#endif // USE_HDFS - std::unique_ptr VirtualFileReader::Make( const std::string& filename) { -#ifdef USE_HDFS - if (0 == filename.find(kHdfsProto)) { - WITH_HDFS(return std::unique_ptr( - new HDFSFile(filename, O_RDONLY))); - } -#endif return std::unique_ptr(new LocalFile(filename, "rb")); } std::unique_ptr VirtualFileWriter::Make( const std::string& filename) { -#ifdef USE_HDFS - if (0 == filename.find(kHdfsProto)) { - WITH_HDFS(return std::unique_ptr( - new HDFSFile(filename, O_WRONLY))); - } -#endif return std::unique_ptr(new LocalFile(filename, "wb")); } bool VirtualFileWriter::Exists(const std::string& filename) { -#ifdef USE_HDFS - if (0 == filename.find(kHdfsProto)) { - WITH_HDFS(HDFSFile file(filename, O_RDONLY); return file.Exists()); - } -#endif LocalFile file(filename, "rb"); return file.Exists(); }