diff --git a/src/common/compressed_iterator.h b/src/common/compressed_iterator.h index 71d2d520264e..ab5815557f66 100644 --- a/src/common/compressed_iterator.h +++ b/src/common/compressed_iterator.h @@ -1,12 +1,11 @@ /** - * Copyright 2017-2023 by XGBoost Contributors + * Copyright 2017-2024, XGBoost Contributors * \file compressed_iterator.h */ #pragma once #include -#include -#include +#include // for ceil, log2 #include // for size_t #include "common.h" @@ -15,9 +14,7 @@ #include "device_helpers.cuh" #endif // __CUDACC__ -namespace xgboost { -namespace common { - +namespace xgboost::common { using CompressedByteT = unsigned char; namespace detail { @@ -87,13 +84,12 @@ class CompressedBufferWriter { template void WriteSymbol(CompressedByteT *buffer, T symbol, size_t offset) { - const int bits_per_byte = 8; + constexpr std::int32_t kBitsPerByte = 8; for (size_t i = 0; i < symbol_bits_; i++) { - size_t byte_idx = ((offset + 1) * symbol_bits_ - (i + 1)) / bits_per_byte; + size_t byte_idx = ((offset + 1) * symbol_bits_ - (i + 1)) / kBitsPerByte; byte_idx += detail::kPadding; - size_t bit_idx = - ((bits_per_byte + i) - ((offset + 1) * symbol_bits_)) % bits_per_byte; + size_t bit_idx = ((kBitsPerByte + i) - ((offset + 1) * symbol_bits_)) % kBitsPerByte; if (detail::CheckBit(symbol, i)) { detail::SetBit(&buffer[byte_idx], bit_idx); @@ -181,16 +177,14 @@ class CompressedIterator { typedef value_type reference; // NOLINT private: - const CompressedByteT *buffer_ {nullptr}; - size_t symbol_bits_ {0}; + CompressedByteT const *buffer_{nullptr}; + bst_idx_t const symbol_bits_{0}; size_t offset_ {0}; public: CompressedIterator() = default; - CompressedIterator(const CompressedByteT *buffer, size_t num_symbols) - : buffer_(buffer) { - symbol_bits_ = detail::SymbolBits(num_symbols); - } + CompressedIterator(CompressedByteT const *buffer, bst_idx_t num_symbols) + : buffer_{buffer}, symbol_bits_{detail::SymbolBits(num_symbols)} {} XGBOOST_DEVICE reference operator*() const { const int bits_per_byte = 8; @@ -218,5 +212,4 @@ class CompressedIterator { return *offset; } }; -} // namespace common -} // namespace xgboost +} // namespace xgboost::common diff --git a/src/common/hist_util.cuh b/src/common/hist_util.cuh index 66463ef2fe86..ffdafa29205c 100644 --- a/src/common/hist_util.cuh +++ b/src/common/hist_util.cuh @@ -24,29 +24,6 @@ #include "xgboost/span.h" // for IterSpan namespace xgboost::common { -namespace cuda { -/** - * copy and paste of the host version, we can't make it a __host__ __device__ function as - * the fn might be a host only or device only callable object, which is not allowed by nvcc. 
- */ -template -auto __device__ DispatchBinType(BinTypeSize type, Fn&& fn) { - switch (type) { - case kUint8BinsTypeSize: { - return fn(uint8_t{}); - } - case kUint16BinsTypeSize: { - return fn(uint16_t{}); - } - case kUint32BinsTypeSize: { - return fn(uint32_t{}); - } - } - SPAN_CHECK(false); - return fn(uint32_t{}); -} -} // namespace cuda - namespace detail { struct EntryCompareOp { __device__ bool operator()(const Entry& a, const Entry& b) { diff --git a/src/common/hist_util.h b/src/common/hist_util.h index 559093bb5a3f..ab2a64786827 100644 --- a/src/common/hist_util.h +++ b/src/common/hist_util.h @@ -84,6 +84,10 @@ class HistogramCuts { [[nodiscard]] bst_bin_t FeatureBins(bst_feature_t feature) const { return cut_ptrs_.ConstHostVector().at(feature + 1) - cut_ptrs_.ConstHostVector()[feature]; } + [[nodiscard]] bst_feature_t NumFeatures() const { + CHECK_EQ(this->min_vals_.Size(), this->cut_ptrs_.Size() - 1); + return this->min_vals_.Size(); + } std::vector const& Ptrs() const { return cut_ptrs_.ConstHostVector(); } std::vector const& Values() const { return cut_values_.ConstHostVector(); } @@ -101,8 +105,10 @@ class HistogramCuts { has_categorical_ = has_cat; max_cat_ = max_cat; } - - [[nodiscard]] bst_bin_t TotalBins() const { return cut_ptrs_.ConstHostVector().back(); } + /** + * @brief The total number of histogram bins (excluding min values.) + */ + [[nodiscard]] bst_bin_t TotalBins() const { return this->cut_values_.Size(); } // Return the index of a cut point that is strictly greater than the input // value, or the last available index if none exists diff --git a/src/data/ellpack_page.cc b/src/data/ellpack_page.cc index 4d918358d34c..1d99380c4e7d 100644 --- a/src/data/ellpack_page.cc +++ b/src/data/ellpack_page.cc @@ -53,12 +53,6 @@ bst_idx_t EllpackPage::Size() const { "EllpackPage is required"; return 0; } - -[[nodiscard]] bool EllpackPage::IsDense() const { - LOG(FATAL) << "Internal Error: XGBoost is not compiled with CUDA but " - "EllpackPage is required"; - return false; -} } // namespace xgboost #endif // XGBOOST_USE_CUDA diff --git a/src/data/ellpack_page.cu b/src/data/ellpack_page.cu index f0c155701ead..ef7f7975baf0 100644 --- a/src/data/ellpack_page.cu +++ b/src/data/ellpack_page.cu @@ -1,30 +1,32 @@ /** * Copyright 2019-2024, XGBoost contributors */ -#include // for proclaim_return_type -#include -#include - -#include // for copy -#include // for move -#include // for vector - -#include "../common/algorithm.cuh" // for InclusiveScan -#include "../common/categorical.h" -#include "../common/cuda_context.cuh" +#include // for lower_bound, upper_bound +#include // for max_element +#include // for make_counting_iterator +#include // for transform_output_iterator + +#include // for copy +#include // for numeric_limits +#include // for move +#include // for vector + +#include "../common/algorithm.cuh" // for InclusiveScan +#include "../common/categorical.h" // for IsCat +#include "../common/cuda_context.cuh" // for CUDAContext #include "../common/cuda_rt_utils.h" // for SetDevice #include "../common/hist_util.cuh" // for HistogramCuts #include "../common/ref_resource_view.cuh" // for MakeFixedVecWithCudaMalloc #include "../common/transform_iterator.h" // for MakeIndexTransformIter #include "device_adapter.cuh" // for NoInfInData -#include "ellpack_page.cuh" -#include "ellpack_page.h" -#include "gradient_index.h" -#include "xgboost/data.h" +#include "ellpack_page.cuh" // for EllpackPageImpl +#include "ellpack_page.h" // for EllpackPage +#include "gradient_index.h" // for 
GHistIndexMatrix +#include "xgboost/context.h" // for Context +#include "xgboost/data.h" // for DMatrix namespace xgboost { - -EllpackPage::EllpackPage() : impl_{new EllpackPageImpl()} {} +EllpackPage::EllpackPage() : impl_{new EllpackPageImpl{}} {} EllpackPage::EllpackPage(Context const* ctx, DMatrix* dmat, const BatchParam& param) : impl_{new EllpackPageImpl{ctx, dmat, param}} {} @@ -43,10 +45,9 @@ void EllpackPage::SetBaseRowId(std::size_t row_id) { impl_->SetBaseRowId(row_id) } [[nodiscard]] bst_idx_t EllpackPage::BaseRowId() const { return this->Impl()->base_rowid; } -[[nodiscard]] bool EllpackPage::IsDense() const { return this->Impl()->IsDense(); } // Bin each input data entry, store the bin indices in compressed form. -template +template __global__ void CompressBinEllpackKernel( common::CompressedBufferWriter wr, common::CompressedByteT* __restrict__ buffer, // gidx_buffer @@ -57,60 +58,98 @@ __global__ void CompressBinEllpackKernel( common::Span feature_types, size_t base_row, // batch_row_begin size_t n_rows, size_t row_stride, std::uint32_t null_gidx_value) { - size_t irow = threadIdx.x + blockIdx.x * blockDim.x; - int ifeature = threadIdx.y + blockIdx.y * blockDim.y; - if (irow >= n_rows || ifeature >= row_stride) { + auto irow = threadIdx.x + blockIdx.x * blockDim.x; + auto cpr_fidx = threadIdx.y + blockIdx.y * blockDim.y; // compressed fidx + if (irow >= n_rows || cpr_fidx >= row_stride) { return; } - int row_length = static_cast(row_ptrs[irow + 1] - row_ptrs[irow]); + auto row_length = static_cast(row_ptrs[irow + 1] - row_ptrs[irow]); std::uint32_t bin = null_gidx_value; - if (ifeature < row_length) { - Entry entry = entries[row_ptrs[irow] - row_ptrs[0] + ifeature]; - int feature = entry.index; + + // When treating a sparse matrix as dense, we need to write null values in between valid + // values. But we don't know where to write if the feature index is not recorded for a + // missing value. Here we use binary search to ensure `cpr_fidx` is the same as `fidx`. + if (kDenseCompressed && !HasNoMissing) { + auto row_beg = entries + row_ptrs[irow] - row_ptrs[0]; + auto row_end = entries + row_ptrs[irow + 1] - row_ptrs[0]; + auto it = thrust::make_transform_iterator(thrust::make_counting_iterator(0ul), + [=](std::size_t i) { return row_beg[i].index; }); + auto it_end = it + thrust::distance(row_beg, row_end); + auto res_it = thrust::lower_bound(thrust::seq, it, it_end, cpr_fidx); + if (res_it == it_end || cpr_fidx != *res_it) { + wr.AtomicWriteSymbol(buffer, bin, (irow + base_row) * row_stride + cpr_fidx); + return; + } + cpr_fidx = thrust::distance(it, res_it); + SPAN_CHECK(cpr_fidx < row_length); + } + + if (cpr_fidx < row_length) { + // We are using sub-batch of a SparsePage, need to account for the first offset within + // the sub-batch. + // + // The block.y idx is calculated using row_stride, which is the longest row. We can + // use `compressed_fidx` to fully index the sparse page row. + Entry entry = entries[row_ptrs[irow] - row_ptrs[0] + cpr_fidx]; + + bst_feature_t fidx = entry.index; float fvalue = entry.fvalue; - // {feature_cuts, ncuts} forms the array of cuts of `feature'. - const float* feature_cuts = &cuts[cut_ptrs[feature]]; - int ncuts = cut_ptrs[feature + 1] - cut_ptrs[feature]; - bool is_cat = common::IsCat(feature_types, ifeature); + // {feature_cuts, n_cuts} forms the array of cuts of the current `feature'. 
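For reference, a minimal host-side sketch of the numeric bin lookup this kernel performs with thrust::upper_bound (not part of the patch; the cut values and the SearchBin helper are made up for illustration):

    #include <algorithm>
    #include <cstdint>
    #include <iostream>
    #include <vector>

    // Index of the first cut strictly greater than fvalue, clamped to the last bin,
    // i.e. the smallest bin with fvalue < feature_cuts[bin].
    std::int32_t SearchBin(std::vector<float> const& feature_cuts, float fvalue) {
      auto it = std::upper_bound(feature_cuts.cbegin(), feature_cuts.cend(), fvalue);
      auto bin = static_cast<std::int32_t>(it - feature_cuts.cbegin());
      auto n_cuts = static_cast<std::int32_t>(feature_cuts.size());
      return bin >= n_cuts ? n_cuts - 1 : bin;
    }

    int main() {
      std::vector<float> feature_cuts{0.5f, 1.5f, 3.0f};    // cuts of a single feature
      std::cout << SearchBin(feature_cuts, 1.0f) << "\n";   // 1
      std::cout << SearchBin(feature_cuts, 42.0f) << "\n";  // 2, clamped to the last bin
      return 0;
    }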
+ float const* feature_cuts = &cuts[cut_ptrs[fidx]]; + auto n_cuts = cut_ptrs[fidx + 1] - cut_ptrs[fidx]; + + bool is_cat = common::IsCat(feature_types, fidx); // Assigning the bin in current entry. // S.t.: fvalue < feature_cuts[bin] + bin = std::numeric_limits::max(); if (is_cat) { auto it = dh::MakeTransformIterator(feature_cuts, [](float v) { return common::AsCat(v); }); - bin = thrust::lower_bound(thrust::seq, it, it + ncuts, common::AsCat(fvalue)) - it; + bin = thrust::lower_bound(thrust::seq, it, it + n_cuts, common::AsCat(fvalue)) - it; } else { - bin = thrust::upper_bound(thrust::seq, feature_cuts, feature_cuts + ncuts, fvalue) - + bin = thrust::upper_bound(thrust::seq, feature_cuts, feature_cuts + n_cuts, fvalue) - feature_cuts; } - if (bin >= ncuts) { - bin = ncuts - 1; + if (bin >= n_cuts) { + bin = n_cuts - 1; } - // Add the number of bins in previous features. - if (!kIsDense) { - bin += cut_ptrs[feature]; + if (!kDenseCompressed) { + // Sparse data, use the compressed fidx. Add the number of bins in previous + // features since we can't compresse it based on feature-local index. + bin += cut_ptrs[fidx]; + } else { + // Write to the actual fidx for dense data. + cpr_fidx = fidx; } } - // Write to gidx buffer. - wr.AtomicWriteSymbol(buffer, bin, (irow + base_row) * row_stride + ifeature); + // Write to the gidx buffer for non-missing values. + wr.AtomicWriteSymbol(buffer, bin, (irow + base_row) * row_stride + cpr_fidx); } namespace { // Calculate the number of symbols for the compressed ellpack. Similar to what the CPU // implementation does, we compress the dense data by subtracting the bin values with the -// starting bin of its feature. -[[nodiscard]] std::size_t CalcNumSymbols(Context const* ctx, bool is_dense, - std::shared_ptr cuts) { - // Cut values can be empty when the input data is empty. - if (!is_dense || cuts->cut_values_.Empty()) { - // Return the total number of symbols (total number of bins plus 1 for not found) - return cuts->cut_values_.Size() + 1; +// starting bin of its feature if it's dense. In addition, we treat the data as dense if +// there's no compression to be made by using ellpack. +[[nodiscard]] EllpackPageImpl::Info CalcNumSymbols( + Context const* ctx, bst_idx_t row_stride, bool is_dense, + std::shared_ptr cuts) { + // Return the total number of symbols (total number of bins plus 1 for missing) + // The null value equals the total number of bins. + bst_idx_t n_symbols = cuts->TotalBins() + 1; + if (n_symbols == 1) { // Empty DMatrix + return {static_cast(0), n_symbols}; } + bst_idx_t n_features = cuts->NumFeatures(); cuts->cut_ptrs_.SetDevice(ctx->Device()); common::Span dptrs = cuts->cut_ptrs_.ConstDeviceSpan(); - auto cuctx = ctx->CUDACtx(); using PtrT = typename decltype(dptrs)::value_type; + + // Calculate the number of required symbols if we treat the data as dense. 
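For context, a host-side sketch of how the two symbol counts relate to the cut pointers (not part of the patch; the cut_ptrs values are made up): the widest feature bounds a feature-local (dense-compressed) bin index, while the sparse layout needs every bin plus one null symbol.

    #include <algorithm>
    #include <cstddef>
    #include <cstdint>
    #include <iostream>
    #include <vector>

    int main() {
      // cut_ptrs[f] .. cut_ptrs[f + 1] delimit the bins of feature f.
      std::vector<std::uint32_t> cut_ptrs{0, 4, 7, 13};
      std::uint32_t n_symbols_dense = 0;  // widest feature
      for (std::size_t f = 1; f < cut_ptrs.size(); ++f) {
        n_symbols_dense = std::max(n_symbols_dense, cut_ptrs[f] - cut_ptrs[f - 1]);
      }
      std::uint32_t n_symbols_sparse = cut_ptrs.back() + 1;  // all bins + 1 for missing
      std::cout << n_symbols_dense << " " << n_symbols_sparse << "\n";  // 6 14
      return 0;
    }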
+ PtrT n_symbols_dense{0}; + CUDAContext const* cuctx = ctx->CUDACtx(); auto it = dh::MakeTransformIterator( thrust::make_counting_iterator(1ul), [=] XGBOOST_DEVICE(std::size_t i) { return dptrs[i] - dptrs[i - 1]; }); @@ -119,13 +158,23 @@ namespace { dh::CachingDeviceUVector max_element(1); auto d_me = max_element.data(); dh::LaunchN(1, cuctx->Stream(), [=] XGBOOST_DEVICE(std::size_t i) { d_me[i] = *max_it; }); - PtrT h_me{0}; - dh::safe_cuda( - cudaMemcpyAsync(&h_me, d_me, sizeof(PtrT), cudaMemcpyDeviceToHost, cuctx->Stream())); + dh::safe_cuda(cudaMemcpyAsync(&n_symbols_dense, d_me, sizeof(PtrT), cudaMemcpyDeviceToHost, + cuctx->Stream())); cuctx->Stream().Sync(); - // No missing, hence no null value, hence no + 1 symbol. - // FIXME(jiamingy): When we extend this to use a sparsity threshold, +1 is needed back. - return h_me; + // Decide the type of the data. + CHECK_LE(row_stride, n_features); + if (is_dense) { + // No missing, hence no null value, hence no + 1 symbol. + LOG(INFO) << "Ellpack is dense."; + return {n_features, n_symbols_dense}; + } else if (n_features == row_stride) { + // Treat the ellpack as dense if we can save memory. + LOG(INFO) << "Ellpack is relatively dense."; + return {n_features, n_symbols_dense + 1}; // +1 for missing value (null in ellpack) + } else { + LOG(INFO) << "Ellpack is sparse."; + return {row_stride, n_symbols}; + } } } // namespace @@ -134,10 +183,9 @@ EllpackPageImpl::EllpackPageImpl(Context const* ctx, std::shared_ptr cuts, bool is_dense, bst_idx_t row_stride, bst_idx_t n_rows) : is_dense{is_dense}, - cuts_{std::move(cuts)}, - row_stride{row_stride}, n_rows{n_rows}, - n_symbols_{CalcNumSymbols(ctx, this->is_dense, this->cuts_)} { + cuts_{std::move(cuts)}, + info{CalcNumSymbols(ctx, row_stride, is_dense, this->cuts_)} { monitor_.Init("ellpack_page"); curt::SetDevice(ctx->Ordinal()); @@ -148,11 +196,10 @@ EllpackPageImpl::EllpackPageImpl(Context const* ctx, std::shared_ptr cuts, const SparsePage& page, bool is_dense, size_t row_stride, common::Span feature_types) - : cuts_{std::move(cuts)}, - is_dense{is_dense}, + : is_dense{is_dense}, n_rows{page.Size()}, - row_stride{row_stride}, - n_symbols_{CalcNumSymbols(ctx, this->is_dense, this->cuts_)} { + cuts_{std::move(cuts)}, + info{CalcNumSymbols(ctx, row_stride, is_dense, this->cuts_)} { monitor_.Init("ellpack_page"); curt::SetDevice(ctx->Ordinal()); @@ -164,14 +211,13 @@ EllpackPageImpl::EllpackPageImpl(Context const* ctx, EllpackPageImpl::EllpackPageImpl(Context const* ctx, DMatrix* p_fmat, const BatchParam& param) : is_dense{p_fmat->IsDense()}, n_rows{p_fmat->Info().num_row_}, - row_stride{GetRowStride(p_fmat)}, // Create the quantile sketches for the dmatrix and initialize HistogramCuts. cuts_{param.hess.empty() ? 
std::make_shared( common::DeviceSketch(ctx, p_fmat, param.max_bin)) : std::make_shared( common::DeviceSketchWithHessian(ctx, p_fmat, param.max_bin, param.hess))}, - n_symbols_{CalcNumSymbols(ctx, this->is_dense, this->cuts_)} { + info{CalcNumSymbols(ctx, GetRowStride(p_fmat), p_fmat->IsDense(), this->cuts_)} { monitor_.Init("ellpack_page"); curt::SetDevice(ctx->Ordinal()); @@ -179,15 +225,13 @@ EllpackPageImpl::EllpackPageImpl(Context const* ctx, DMatrix* p_fmat, const Batc p_fmat->Info().feature_types.SetDevice(ctx->Device()); auto ft = p_fmat->Info().feature_types.ConstDeviceSpan(); - monitor_.Start("BinningCompression"); CHECK(p_fmat->SingleColBlock()); for (auto const& page : p_fmat->GetBatches()) { this->CreateHistIndices(ctx, page, ft); } - monitor_.Stop("BinningCompression"); } -template +template struct WriteCompressedEllpackFunctor { WriteCompressedEllpackFunctor(common::CompressedByteT* buffer, const common::CompressedBufferWriter& writer, AdapterBatchT batch, @@ -208,22 +252,40 @@ struct WriteCompressedEllpackFunctor { common::Span feature_types; data::IsValidFunctor is_valid; - using Tuple = thrust::tuple; - __device__ size_t operator()(Tuple out) { + // Tuple[0] = The row index of the input, used as a key to define segments + // Tuple[1] = Scanned flags of valid elements for each row + // Tuple[2] = The index in the input data + using Tuple = thrust::tuple; + + template + __device__ void Write(data::COOTuple const& e, bst_idx_t out_position) { + bst_bin_t bin_idx = 0; + if (common::IsCat(feature_types, e.column_idx)) { + bin_idx = accessor.SearchBin(e.value, e.column_idx); + } else { + bin_idx = accessor.SearchBin(e.value, e.column_idx); + } + if constexpr (kIsDenseCompressed) { + bin_idx -= accessor.feature_segments[e.column_idx]; + } + writer.AtomicWriteSymbol(d_buffer, bin_idx, out_position); + } + // Used for dense or as dense data. + __device__ void operator()(bst_idx_t i) { + auto e = batch.GetElement(i); + if (is_valid(e)) { + this->Write(e, i); + } else { + writer.AtomicWriteSymbol(d_buffer, accessor.NullValue(), i); + } + } + // Used for sparse data. + __device__ size_t operator()(Tuple const& out) { auto e = batch.GetElement(thrust::get<2>(out)); if (is_valid(e)) { // -1 because the scan is inclusive size_t output_position = accessor.row_stride * e.row_idx + thrust::get<1>(out) - 1; - uint32_t bin_idx = 0; - if (common::IsCat(feature_types, e.column_idx)) { - bin_idx = accessor.SearchBin(e.value, e.column_idx); - } else { - bin_idx = accessor.SearchBin(e.value, e.column_idx); - } - if (kIsDense) { - bin_idx -= accessor.feature_segments[e.column_idx]; - } - writer.AtomicWriteSymbol(d_buffer, bin_idx, output_position); + this->Write(e, output_position); } return 0; } @@ -244,90 +306,83 @@ struct TupleScanOp { // Here the data is already correctly ordered and simply needs to be compacted // to remove missing data -template +template void CopyDataToEllpack(Context const* ctx, const AdapterBatchT& batch, common::Span feature_types, EllpackPageImpl* dst, float missing) { - // Some witchcraft happens here - // The goal is to copy valid elements out of the input to an ELLPACK matrix - // with a given row stride, using no extra working memory Standard stream - // compaction needs to be modified to do this, so we manually define a - // segmented stream compaction via operators on an inclusive scan. 
The output - // of this inclusive scan is fed to a custom function which works out the - // correct output position - auto counting = thrust::make_counting_iterator(0llu); data::IsValidFunctor is_valid(missing); bool valid = data::NoInfInData(batch, is_valid); CHECK(valid) << error::InfInData(); - auto key_iter = dh::MakeTransformIterator( - counting, - [=] __device__(size_t idx) { - return batch.GetElement(idx).row_idx; - }); - auto value_iter = dh::MakeTransformIterator( - counting, - [=] __device__(size_t idx) -> size_t { - return is_valid(batch.GetElement(idx)); - }); - - auto key_value_index_iter = - thrust::make_zip_iterator(thrust::make_tuple(key_iter, value_iter, counting)); - - // Tuple[0] = The row index of the input, used as a key to define segments - // Tuple[1] = Scanned flags of valid elements for each row - // Tuple[2] = The index in the input data - using Tuple = thrust::tuple; - - auto device_accessor = dst->GetDeviceAccessor(ctx); + auto cnt = thrust::make_counting_iterator(0llu); auto n_symbols = dst->NumSymbols(); - common::CompressedBufferWriter writer{n_symbols}; auto d_compressed_buffer = dst->gidx_buffer.data(); // We redirect the scan output into this functor to do the actual writing + using Tuple = typename WriteCompressedEllpackFunctor::Tuple; dh::TypedDiscard discard; - WriteCompressedEllpackFunctor functor{ + auto device_accessor = dst->GetDeviceAccessor(ctx); + WriteCompressedEllpackFunctor functor{ d_compressed_buffer, writer, batch, device_accessor, feature_types, is_valid}; - thrust::transform_output_iterator out(discard, functor); + // For dense compressed data, we can simply copy the data with the input position. + if (kIsDenseCompressed) { + CHECK(batch.NumRows() == 0 || batch.NumCols() == dst->info.row_stride); + thrust::for_each_n(ctx->CUDACtx()->CTP(), cnt, dst->Size() * dst->info.row_stride, functor); + return; + } + + // Some witchcraft happens here. + // + // The goal is to copy valid elements out of the input to an ELLPACK matrix with a given + // row stride, using no extra working memory Standard stream compaction needs to be + // modified to do this, so we manually define a segmented stream compaction via + // operators on an inclusive scan. 
The output of this inclusive scan is fed to a custom + // function which works out the correct output position + auto key_iter = dh::MakeTransformIterator( + cnt, [=] __device__(size_t idx) { return batch.GetElement(idx).row_idx; }); + auto value_iter = dh::MakeTransformIterator( + cnt, [=] __device__(size_t idx) -> size_t { return is_valid(batch.GetElement(idx)); }); + + auto key_value_index_iter = + thrust::make_zip_iterator(thrust::make_tuple(key_iter, value_iter, cnt)); + thrust::transform_output_iterator out(discard, functor); common::InclusiveScan(ctx, key_value_index_iter, out, TupleScanOp{}, batch.Size()); } void WriteNullValues(Context const* ctx, EllpackPageImpl* dst, common::Span row_counts) { // Write the null values - auto device_accessor = dst->GetDeviceAccessor(ctx); + auto null = dst->GetDeviceAccessor(ctx).NullValue(); common::CompressedBufferWriter writer(dst->NumSymbols()); auto d_compressed_buffer = dst->gidx_buffer.data(); - auto row_stride = dst->row_stride; - dh::LaunchN(row_stride * dst->n_rows, ctx->CUDACtx()->Stream(), [=] __device__(bst_idx_t idx) { - // For some reason this variable got captured as const - auto writer_non_const = writer; + auto row_stride = dst->info.row_stride; + auto n = row_stride * dst->n_rows; + dh::LaunchN(n, ctx->CUDACtx()->Stream(), [=] __device__(bst_idx_t idx) mutable { size_t row_idx = idx / row_stride; size_t row_offset = idx % row_stride; if (row_offset >= row_counts[row_idx]) { - writer_non_const.AtomicWriteSymbol(d_compressed_buffer, device_accessor.NullValue(), idx); + writer.AtomicWriteSymbol(d_compressed_buffer, null, idx); } }); } template EllpackPageImpl::EllpackPageImpl(Context const* ctx, AdapterBatch batch, float missing, - bool is_dense, common::Span row_counts_span, + bool is_dense, common::Span row_counts, common::Span feature_types, size_t row_stride, bst_idx_t n_rows, std::shared_ptr cuts) : EllpackPageImpl{ctx, cuts, is_dense, row_stride, n_rows} { curt::SetDevice(ctx->Ordinal()); - if (this->IsDense()) { + if (this->IsDenseCompressed()) { CopyDataToEllpack(ctx, batch, feature_types, this, missing); } else { CopyDataToEllpack(ctx, batch, feature_types, this, missing); + WriteNullValues(ctx, this, row_counts); } - - WriteNullValues(ctx, this, row_counts_span); } #define ELLPACK_BATCH_SPECIALIZE(__BATCH_T) \ @@ -339,10 +394,14 @@ EllpackPageImpl::EllpackPageImpl(Context const* ctx, AdapterBatch batch, float m ELLPACK_BATCH_SPECIALIZE(data::CudfAdapterBatch) ELLPACK_BATCH_SPECIALIZE(data::CupyAdapterBatch) +#undef ELLPACK_BATCH_SPECIALIZE + namespace { +template void CopyGHistToEllpack(Context const* ctx, GHistIndexMatrix const& page, common::Span d_row_ptr, bst_idx_t row_stride, bst_bin_t null, bst_idx_t n_symbols, + common::Span d_cut_ptrs, common::CompressedByteT* d_compressed_buffer) { dh::device_vector data(page.index.begin(), page.index.end()); auto d_data = dh::ToSpan(data); @@ -350,43 +409,59 @@ void CopyGHistToEllpack(Context const* ctx, GHistIndexMatrix const& page, // GPU employs the same dense compression as CPU, no need to handle page.index.Offset() auto bin_type = page.index.GetBinTypeSize(); common::CompressedBufferWriter writer{n_symbols}; - auto cuctx = ctx->CUDACtx(); - dh::LaunchN(row_stride * page.Size(), cuctx->Stream(), [=] __device__(bst_idx_t idx) mutable { - auto ridx = idx / row_stride; - auto ifeature = idx % row_stride; + bool dense_compress = row_stride == page.Features() && !page.IsDense(); + auto n_samples = page.Size(); + auto cnt = thrust::make_counting_iterator(0ul); + auto ptr = 
reinterpret_cast(d_data.data()); + auto fn = [=] __device__(std::size_t i) mutable { + auto [ridx, fidx] = linalg::UnravelIndex(i, n_samples, row_stride); auto r_begin = d_row_ptr[ridx]; auto r_end = d_row_ptr[ridx + 1]; auto r_size = r_end - r_begin; - if (ifeature >= r_size) { - writer.AtomicWriteSymbol(d_compressed_buffer, null, idx); - return; + bst_bin_t bin_idx; + if (dense_compress) { + auto f_begin = d_cut_ptrs[fidx]; + auto f_end = d_cut_ptrs[fidx + 1]; + // CPU gidx is not compressed, can be used for binary search. + bin_idx = common::BinarySearchBin(r_begin, r_end, ptr, f_begin, f_end); + if (bin_idx == -1) { + bin_idx = null; + } else { + bin_idx -= d_cut_ptrs[fidx]; + } + } else if (fidx >= r_size) { + bin_idx = null; + } else { + bin_idx = ptr[r_begin + fidx]; } - common::cuda::DispatchBinType(bin_type, [&](auto t) { - using T = decltype(t); - auto ptr = reinterpret_cast(d_data.data()); - auto bin_idx = ptr[r_begin + ifeature]; - writer.AtomicWriteSymbol(d_compressed_buffer, bin_idx, idx); - }); - }); + writer.AtomicWriteSymbol(d_compressed_buffer, bin_idx, i); + }; + thrust::for_each_n(cuctx->CTP(), cnt, row_stride * page.Size(), fn); } } // anonymous namespace EllpackPageImpl::EllpackPageImpl(Context const* ctx, GHistIndexMatrix const& page, common::Span ft) : is_dense{page.IsDense()}, - row_stride{[&] { - auto it = common::MakeIndexTransformIter( - [&](bst_idx_t i) { return page.row_ptr[i + 1] - page.row_ptr[i]; }); - return *std::max_element(it, it + page.Size()); - }()}, base_rowid{page.base_rowid}, n_rows{page.Size()}, - cuts_{std::make_shared(page.cut)}, - n_symbols_{CalcNumSymbols(ctx, page.IsDense(), cuts_)} { + cuts_{[&] { + auto cuts = std::make_shared(page.cut); + cuts->SetDevice(ctx->Device()); + return cuts; + }()}, + info{CalcNumSymbols( + ctx, + [&] { + auto it = common::MakeIndexTransformIter( + [&](bst_idx_t i) { return page.row_ptr[i + 1] - page.row_ptr[i]; }); + return *std::max_element(it, it + page.Size()); + }(), + page.IsDense(), cuts_)} { this->monitor_.Init("ellpack_page"); CHECK(ctx->IsCUDA()); this->InitCompressedData(ctx); @@ -400,8 +475,12 @@ EllpackPageImpl::EllpackPageImpl(Context const* ctx, GHistIndexMatrix const& pag auto accessor = this->GetDeviceAccessor(ctx, ft); this->monitor_.Start("CopyGHistToEllpack"); - CopyGHistToEllpack(ctx, page, d_row_ptr, row_stride, accessor.NullValue(), this->NumSymbols(), - d_compressed_buffer); + common::DispatchBinType(page.index.GetBinTypeSize(), [&](auto t) { + using T = decltype(t); + CopyGHistToEllpack(ctx, page, d_row_ptr, this->info.row_stride, accessor.NullValue(), + this->NumSymbols(), this->cuts_->cut_ptrs_.ConstDeviceSpan(), + d_compressed_buffer); + }); this->monitor_.Stop("CopyGHistToEllpack"); } @@ -432,13 +511,13 @@ struct CopyPage { // Copy the data from the given EllpackPage to the current page. 
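To summarize the layout built above, a small round-trip sketch of the feature-local ("dense compressed") encoding and its decoding (not part of the patch; the numbers are made up):

    #include <cstdint>
    #include <iostream>
    #include <vector>

    int main() {
      std::vector<std::uint32_t> cut_ptrs{0, 4, 7, 13};    // per-feature bin boundaries
      std::uint32_t global_bin = 9;                        // a bin belonging to feature 2
      std::uint32_t fidx = 2;
      std::uint32_t stored = global_bin - cut_ptrs[fidx];  // symbol written to the gidx buffer
      std::uint32_t decoded = stored + cut_ptrs[fidx];     // what the accessor reconstructs
      std::cout << stored << " -> " << decoded << "\n";    // 2 -> 9
      return 0;
    }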
bst_idx_t EllpackPageImpl::Copy(Context const* ctx, EllpackPageImpl const* page, bst_idx_t offset) { monitor_.Start(__func__); - bst_idx_t num_elements = page->n_rows * page->row_stride; - CHECK_EQ(this->row_stride, page->row_stride); + bst_idx_t num_elements = page->n_rows * page->info.row_stride; + CHECK_EQ(this->info.row_stride, page->info.row_stride); CHECK_EQ(NumSymbols(), page->NumSymbols()); - CHECK_GE(this->n_rows * this->row_stride, offset + num_elements); + CHECK_GE(this->n_rows * this->info.row_stride, offset + num_elements); if (page == this) { LOG(FATAL) << "Concatenating the same Ellpack."; - return this->n_rows * this->row_stride; + return this->n_rows * this->info.row_stride; } dh::LaunchN(num_elements, ctx->CUDACtx()->Stream(), CopyPage{this, page, offset}); monitor_.Stop(__func__); @@ -471,7 +550,7 @@ struct CompactPage { src_iterator_d{src->gidx_buffer.data(), src->NumSymbols()}, row_indexes(row_indexes), base_rowid{src->base_rowid}, - row_stride{src->row_stride} {} + row_stride{src->info.row_stride} {} __device__ void operator()(bst_idx_t row_id) { size_t src_row = base_rowid + row_id; @@ -491,8 +570,8 @@ struct CompactPage { void EllpackPageImpl::Compact(Context const* ctx, EllpackPageImpl const* page, common::Span row_indexes) { monitor_.Start(__func__); - CHECK_EQ(row_stride, page->row_stride); - CHECK_EQ(NumSymbols(), page->NumSymbols()); + CHECK_EQ(this->info.row_stride, page->info.row_stride); + CHECK_EQ(this->NumSymbols(), page->NumSymbols()); CHECK_LE(page->base_rowid + page->n_rows, row_indexes.size()); auto cuctx = ctx->CUDACtx(); dh::LaunchN(page->n_rows, cuctx->Stream(), CompactPage{this, page, row_indexes}); @@ -508,35 +587,37 @@ void EllpackPageImpl::InitCompressedData(Context const* ctx) { monitor_.Start(__func__); auto num_symbols = this->NumSymbols(); // Required buffer size for storing data matrix in ELLPack format. - std::size_t compressed_size_bytes = - common::CompressedBufferWriter::CalculateBufferSize(row_stride * n_rows, num_symbols); + std::size_t compressed_size_bytes = common::CompressedBufferWriter::CalculateBufferSize( + this->info.row_stride * this->n_rows, num_symbols); auto init = static_cast(0); gidx_buffer = common::MakeFixedVecWithCudaMalloc(ctx, compressed_size_bytes, init); monitor_.Stop(__func__); } // Compress a CSR page into ELLPACK. 
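As a rough guide to what InitCompressedData above allocates: the bit-packed buffer stores about ceil(log2(n_symbols)) bits per slot. A sketch with made-up sizes (the real CalculateBufferSize also reserves a small fixed padding, ignored here):

    #include <cmath>
    #include <cstddef>
    #include <iostream>

    int main() {
      std::size_t n_rows = 1000, row_stride = 32;
      std::size_t n_symbols = 257;  // e.g. 256 bins plus the null value
      auto symbol_bits = static_cast<std::size_t>(std::ceil(std::log2(n_symbols)));
      std::size_t total_bits = n_rows * row_stride * symbol_bits;
      std::size_t n_bytes = (total_bits + 7) / 8;
      std::cout << symbol_bits << " bits per symbol, ~" << n_bytes << " bytes\n";  // 9, 36000
      return 0;
    }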
-void EllpackPageImpl::CreateHistIndices(Context const* ctx, - const SparsePage& row_batch, +void EllpackPageImpl::CreateHistIndices(Context const* ctx, const SparsePage& row_batch, common::Span feature_types) { if (row_batch.Size() == 0) { return; } + + this->monitor_.Start(__func__); auto null_gidx_value = this->GetDeviceAccessor(ctx, feature_types).NullValue(); auto const& offset_vec = row_batch.offset.ConstHostVector(); // bin and compress entries in batches of rows size_t gpu_batch_nrows = - std::min(dh::TotalMemory(ctx->Ordinal()) / (16 * row_stride * sizeof(Entry)), + std::min(dh::TotalMemory(ctx->Ordinal()) / (16 * this->info.row_stride * sizeof(Entry)), static_cast(row_batch.Size())); size_t gpu_nbatches = common::DivRoundUp(row_batch.Size(), gpu_batch_nrows); + auto writer = common::CompressedBufferWriter{this->NumSymbols()}; + auto gidx_buffer_data = gidx_buffer.data(); for (size_t gpu_batch = 0; gpu_batch < gpu_nbatches; ++gpu_batch) { size_t batch_row_begin = gpu_batch * gpu_batch_nrows; - size_t batch_row_end = - std::min((gpu_batch + 1) * gpu_batch_nrows, row_batch.Size()); + size_t batch_row_end = std::min((gpu_batch + 1) * gpu_batch_nrows, row_batch.Size()); size_t batch_nrows = batch_row_end - batch_row_begin; const auto ent_cnt_begin = offset_vec[batch_row_begin]; @@ -569,40 +650,45 @@ void EllpackPageImpl::CreateHistIndices(Context const* ctx, const dim3 block3(32, 8, 1); // 256 threads const dim3 grid3(common::DivRoundUp(batch_nrows, block3.x), - common::DivRoundUp(row_stride, block3.y), 1); + common::DivRoundUp(this->info.row_stride, block3.y), 1); auto device_accessor = this->GetDeviceAccessor(ctx); auto launcher = [&](auto kernel) { dh::LaunchKernel{grid3, block3, 0, ctx->CUDACtx()->Stream()}( // NOLINT - kernel, common::CompressedBufferWriter(this->NumSymbols()), gidx_buffer.data(), - row_ptrs.data(), entries_d.data(), device_accessor.gidx_fvalue_map.data(), - device_accessor.feature_segments.data(), feature_types, batch_row_begin, batch_nrows, - row_stride, null_gidx_value); + kernel, writer, gidx_buffer_data, row_ptrs.data(), entries_d.data(), + device_accessor.gidx_fvalue_map.data(), device_accessor.feature_segments, feature_types, + batch_row_begin, batch_nrows, this->info.row_stride, null_gidx_value); }; if (this->IsDense()) { - launcher(CompressBinEllpackKernel); + launcher(CompressBinEllpackKernel); } else { - launcher(CompressBinEllpackKernel); + if (this->IsDenseCompressed()) { + launcher(CompressBinEllpackKernel); + } else { + launcher(CompressBinEllpackKernel); + } } } + this->monitor_.Stop(__func__); } // Return the number of rows contained in this page. [[nodiscard]] bst_idx_t EllpackPageImpl::Size() const { return n_rows; } std::size_t EllpackPageImpl::MemCostBytes() const { - return this->gidx_buffer.size_bytes() + sizeof(this->n_rows) + sizeof(this->is_dense) + - sizeof(this->row_stride) + sizeof(this->base_rowid) + sizeof(this->n_symbols_); + return this->gidx_buffer.size_bytes() + sizeof(this->is_dense) + sizeof(this->n_rows) + + sizeof(this->base_rowid) + sizeof(this->info); } EllpackDeviceAccessor EllpackPageImpl::GetDeviceAccessor( Context const* ctx, common::Span feature_types) const { + auto null = this->IsDense() ? 
this->NumSymbols() : this->NumSymbols() - 1; return {ctx, - cuts_, - is_dense, - row_stride, - base_rowid, - n_rows, - common::CompressedIterator(gidx_buffer.data(), this->NumSymbols()), + this->cuts_, + this->info.row_stride, + this->base_rowid, + this->n_rows, + common::CompressedIterator{gidx_buffer.data(), this->NumSymbols()}, + null, feature_types}; } @@ -615,26 +701,28 @@ EllpackDeviceAccessor EllpackPageImpl::GetHostAccessor( dh::safe_cuda(cudaMemcpyAsync(h_gidx_buffer->data(), gidx_buffer.data(), gidx_buffer.size_bytes(), cudaMemcpyDefault, ctx->CUDACtx()->Stream())); Context cpu_ctx; + auto null = this->IsDense() ? this->NumSymbols() : this->NumSymbols() - 1; return {ctx->IsCPU() ? ctx : &cpu_ctx, - cuts_, - is_dense, - row_stride, - base_rowid, - n_rows, - common::CompressedIterator(h_gidx_buffer->data(), this->NumSymbols()), + this->cuts_, + this->info.row_stride, + this->base_rowid, + this->n_rows, + common::CompressedIterator{h_gidx_buffer->data(), this->NumSymbols()}, + null, feature_types}; } [[nodiscard]] bst_idx_t EllpackPageImpl::NumNonMissing( Context const* ctx, common::Span feature_types) const { + if (this->IsDense()) { + return this->n_rows * this->info.row_stride; + } auto d_acc = this->GetDeviceAccessor(ctx, feature_types); using T = typename decltype(d_acc.gidx_iter)::value_type; auto it = thrust::make_transform_iterator( thrust::make_counting_iterator(0ull), - cuda::proclaim_return_type([=] __device__(std::size_t i) { return d_acc.gidx_iter[i]; })); - auto nnz = thrust::count_if(ctx->CUDACtx()->CTP(), it, it + d_acc.row_stride * d_acc.n_rows, - cuda::proclaim_return_type( - [=] __device__(T gidx) { return gidx != d_acc.NullValue(); })); - return nnz; + [=] XGBOOST_DEVICE(std::size_t i) { return d_acc.gidx_iter[i]; }); + return thrust::count_if(ctx->CUDACtx()->CTP(), it, it + d_acc.row_stride * d_acc.n_rows, + [=] XGBOOST_DEVICE(T gidx) -> bool { return gidx != d_acc.NullValue(); }); } } // namespace xgboost diff --git a/src/data/ellpack_page.cuh b/src/data/ellpack_page.cuh index 78641c5ac9c7..85e412f86398 100644 --- a/src/data/ellpack_page.cuh +++ b/src/data/ellpack_page.cuh @@ -24,7 +24,7 @@ namespace xgboost { */ struct EllpackDeviceAccessor { /** @brief Whether or not if the matrix is dense. */ - bool is_dense; + bst_idx_t null_value; /** @brief Row length for ELLPACK, equal to number of features when the data is dense. */ bst_idx_t row_stride; /** @brief Starting index of the rows. Used for external memory. */ @@ -36,7 +36,7 @@ struct EllpackDeviceAccessor { /** @brief Minimum value for each feature. Size equals to number of features. */ common::Span min_fvalue; /** @brief Histogram cut pointers. Size equals to (number of features + 1). */ - common::Span feature_segments; + std::uint32_t const* feature_segments; /** @brief Histogram cut values. Size equals to (bins per feature * number of features). */ common::Span gidx_fvalue_map; /** @brief Type of each feature, categorical or numerical. 
*/ @@ -44,10 +44,10 @@ struct EllpackDeviceAccessor { EllpackDeviceAccessor() = delete; EllpackDeviceAccessor(Context const* ctx, std::shared_ptr cuts, - bool is_dense, bst_idx_t row_stride, bst_idx_t base_rowid, bst_idx_t n_rows, - common::CompressedIterator gidx_iter, + bst_idx_t row_stride, bst_idx_t base_rowid, bst_idx_t n_rows, + common::CompressedIterator gidx_iter, bst_idx_t null_value, common::Span feature_types) - : is_dense{is_dense}, + : null_value{null_value}, row_stride{row_stride}, base_rowid{base_rowid}, n_rows{n_rows}, @@ -58,62 +58,72 @@ struct EllpackDeviceAccessor { cuts->cut_ptrs_.SetDevice(ctx->Device()); cuts->min_vals_.SetDevice(ctx->Device()); gidx_fvalue_map = cuts->cut_values_.ConstDeviceSpan(); - feature_segments = cuts->cut_ptrs_.ConstDeviceSpan(); + feature_segments = cuts->cut_ptrs_.ConstDevicePointer(); min_fvalue = cuts->min_vals_.ConstDeviceSpan(); } else { gidx_fvalue_map = cuts->cut_values_.ConstHostSpan(); - feature_segments = cuts->cut_ptrs_.ConstHostSpan(); + feature_segments = cuts->cut_ptrs_.ConstHostPointer(); min_fvalue = cuts->min_vals_.ConstHostSpan(); } } + [[nodiscard]] XGBOOST_HOST_DEV_INLINE bool IsDenseCompressed() const { + return this->row_stride == this->NumFeatures(); + } /** - * @brief Given a row index and a feature index, returns the corresponding cut value. + * @brief Given a row index and a feature index, returns the corresponding bin index. * - * Uses binary search for look up. Returns NaN if missing. + * Uses binary search for look up. * * @tparam global_ridx Whether the row index is global to all ellpack batches or it's * local to the current batch. + * + * @return -1 if it's a missing value. */ template - [[nodiscard]] __device__ bst_bin_t GetBinIndex(bst_idx_t ridx, size_t fidx) const { + [[nodiscard]] __device__ bst_bin_t GetBinIndex(bst_idx_t ridx, std::size_t fidx) const { if (global_ridx) { ridx -= base_rowid; } auto row_begin = row_stride * ridx; - auto row_end = row_begin + row_stride; - bst_bin_t gidx = -1; - if (is_dense) { - gidx = gidx_iter[row_begin + fidx]; - gidx += this->feature_segments[fidx]; - } else { - gidx = common::BinarySearchBin(row_begin, row_end, gidx_iter, feature_segments[fidx], - feature_segments[fidx + 1]); + if (!this->IsDenseCompressed()) { + // binary search returns -1 if it's missing + auto row_end = row_begin + row_stride; + bst_bin_t gidx = common::BinarySearchBin(row_begin, row_end, gidx_iter, + feature_segments[fidx], feature_segments[fidx + 1]); + return gidx; + } + bst_bin_t gidx = gidx_iter[row_begin + fidx]; + if (gidx == this->NullValue()) { + // Missing value in a dense ellpack + return -1; } + // Dense ellpack + gidx += this->feature_segments[fidx]; return gidx; } - + /** + * @brief Find a bin to place the value in. Used during construction of the Ellpack. 
+ */ template - [[nodiscard]] __device__ uint32_t SearchBin(float value, size_t column_id) const { - auto beg = feature_segments[column_id]; - auto end = feature_segments[column_id + 1]; - uint32_t idx = 0; + [[nodiscard]] __device__ bst_bin_t SearchBin(float value, std::size_t fidx) const { + auto beg = feature_segments[fidx]; + auto end = feature_segments[fidx + 1]; + bst_bin_t gidx = 0; if (is_cat) { - auto it = dh::MakeTransformIterator( - gidx_fvalue_map.cbegin(), [](float v) { return common::AsCat(v); }); - idx = thrust::lower_bound(thrust::seq, it + beg, it + end, - common::AsCat(value)) - - it; + auto it = dh::MakeTransformIterator(gidx_fvalue_map.cbegin(), + [](float v) { return common::AsCat(v); }); + gidx = thrust::lower_bound(thrust::seq, it + beg, it + end, common::AsCat(value)) - it; } else { auto it = thrust::upper_bound(thrust::seq, gidx_fvalue_map.cbegin() + beg, gidx_fvalue_map.cbegin() + end, value); - idx = it - gidx_fvalue_map.cbegin(); + gidx = it - gidx_fvalue_map.cbegin(); } - if (idx == end) { - idx -= 1; + if (gidx == end) { + gidx -= 1; } - return idx; + return gidx; } [[nodiscard]] __device__ float GetFvalue(bst_idx_t ridx, size_t fidx) const { @@ -123,22 +133,21 @@ struct EllpackDeviceAccessor { } return gidx_fvalue_map[gidx]; } - - // Check if the row id is withing range of the current batch. - [[nodiscard]] __device__ bool IsInRange(size_t row_id) const { - return row_id >= base_rowid && row_id < base_rowid + n_rows; - } - - [[nodiscard]] XGBOOST_DEVICE size_t NullValue() const { return this->NumBins(); } - - [[nodiscard]] XGBOOST_DEVICE size_t NumBins() const { return gidx_fvalue_map.size(); } - - [[nodiscard]] XGBOOST_DEVICE size_t NumFeatures() const { return min_fvalue.size(); } + [[nodiscard]] XGBOOST_HOST_DEV_INLINE bst_idx_t NullValue() const { return this->null_value; } + [[nodiscard]] XGBOOST_HOST_DEV_INLINE bst_idx_t NumBins() const { return gidx_fvalue_map.size(); } + [[nodiscard]] XGBOOST_HOST_DEV_INLINE size_t NumFeatures() const { return min_fvalue.size(); } }; class GHistIndexMatrix; +/** + * @brief This is either an Ellpack format matrix or a dense matrix. + * + * When there's no compression can be made by using ellpack, we use this structure as a + * simple dense matrix. For dense matrix, we can provide extra compression by counting the + * histogram bin for each feature instead of for the entire dataset. + */ class EllpackPageImpl { public: /** @@ -150,7 +159,7 @@ class EllpackPageImpl { EllpackPageImpl() = default; /** - * @brief Constructor from an existing EllpackInfo. + * @brief Constructor from existing ellpack matrics. * * This is used in the sampling case. The ELLPACK page is constructed from an existing * Ellpack page and the given number of rows. @@ -223,8 +232,17 @@ class EllpackPageImpl { [[nodiscard]] common::HistogramCuts const& Cuts() const { return *cuts_; } [[nodiscard]] std::shared_ptr CutsShared() const { return cuts_; } void SetCuts(std::shared_ptr cuts); + /** + * @brief Fully dense, there's not a single missing value. + */ + [[nodiscard]] bool IsDense() const { return this->is_dense; } + /** + * @brief Stored as a dense matrix, but there might be missing values. + */ + [[nodiscard]] bool IsDenseCompressed() const { + return this->cuts_->NumFeatures() == this->info.row_stride; + } - [[nodiscard]] bool IsDense() const { return is_dense; } /** @return Estimation of memory cost of this page. 
*/ std::size_t MemCostBytes() const; @@ -232,9 +250,8 @@ class EllpackPageImpl { * @brief Return the total number of symbols (total number of bins plus 1 for not * found). */ - [[nodiscard]] std::size_t NumSymbols() const { return this->n_symbols_; } - void SetNumSymbols(bst_idx_t n_symbols) { this->n_symbols_ = n_symbols; } - + [[nodiscard]] auto NumSymbols() const { return this->info.n_symbols; } + void SetNumSymbols(bst_idx_t n_symbols) { this->info.n_symbols = n_symbols; } /** * @brief Get an accessor that can be passed into CUDA kernels. */ @@ -265,11 +282,11 @@ class EllpackPageImpl { */ void InitCompressedData(Context const* ctx); + std::shared_ptr cuts_; + public: - /** @brief Whether or not if the matrix is dense. */ - bool is_dense; - /** @brief Row length for ELLPACK. */ - bst_idx_t row_stride; + bool is_dense{false}; + bst_idx_t base_rowid{0}; bst_idx_t n_rows{0}; /** @@ -278,22 +295,28 @@ class EllpackPageImpl { * This can be backed by various storage types. */ common::RefResourceView gidx_buffer; + /** + * @brief Compression infomation. + */ + struct Info { + /** @brief Row length for ELLPACK. */ + bst_idx_t row_stride{0}; + /** @brief The number of unique bins including missing. */ + bst_idx_t n_symbols{0}; + } info; private: - std::shared_ptr cuts_; - bst_idx_t n_symbols_{0}; common::Monitor monitor_; }; -inline size_t GetRowStride(DMatrix* dmat) { +[[nodiscard]] inline bst_idx_t GetRowStride(DMatrix* dmat) { if (dmat->IsDense()) return dmat->Info().num_col_; size_t row_stride = 0; for (const auto& batch : dmat->GetBatches()) { const auto& row_offset = batch.offset.ConstHostVector(); for (auto i = 1ull; i < row_offset.size(); i++) { - row_stride = std::max( - row_stride, static_cast(row_offset[i] - row_offset[i - 1])); + row_stride = std::max(row_stride, static_cast(row_offset[i] - row_offset[i - 1])); } } return row_stride; diff --git a/src/data/ellpack_page.h b/src/data/ellpack_page.h index fa312f6e7887..8e54855049a2 100644 --- a/src/data/ellpack_page.h +++ b/src/data/ellpack_page.h @@ -17,6 +17,8 @@ class EllpackPageImpl; * * This class uses the PImpl idiom (https://en.cppreference.com/w/cpp/language/pimpl) to avoid * including CUDA-specific implementation details in the header. + * + * See @ref EllpackPageImpl . */ class EllpackPage { public: @@ -42,7 +44,6 @@ class EllpackPage { /*! \return Number of instances in the page. */ [[nodiscard]] bst_idx_t Size() const; - [[nodiscard]] bool IsDense() const; /*! \brief Set the base row id for this page. 
*/ void SetBaseRowId(std::size_t row_id); diff --git a/src/data/ellpack_page_raw_format.cu b/src/data/ellpack_page_raw_format.cu index 839966b08151..262b9c8d3796 100644 --- a/src/data/ellpack_page_raw_format.cu +++ b/src/data/ellpack_page_raw_format.cu @@ -57,7 +57,7 @@ template RET_IF_NOT(fi->Read(&impl->n_rows)); RET_IF_NOT(fi->Read(&impl->is_dense)); - RET_IF_NOT(fi->Read(&impl->row_stride)); + RET_IF_NOT(fi->Read(&impl->info.row_stride)); if (this->param_.prefetch_copy || !has_hmm_ats_) { RET_IF_NOT(ReadDeviceVec(fi, &impl->gidx_buffer)); @@ -83,7 +83,7 @@ template auto* impl = page.Impl(); bytes += fo->Write(impl->n_rows); bytes += fo->Write(impl->is_dense); - bytes += fo->Write(impl->row_stride); + bytes += fo->Write(impl->info.row_stride); std::vector h_gidx_buffer; Context ctx = Context{}.MakeCUDA(curt::CurrentDevice()); [[maybe_unused]] auto h_accessor = impl->GetHostAccessor(&ctx, &h_gidx_buffer); diff --git a/src/data/ellpack_page_source.cu b/src/data/ellpack_page_source.cu index 588ddccec32b..09fc0da68847 100644 --- a/src/data/ellpack_page_source.cu +++ b/src/data/ellpack_page_source.cu @@ -80,7 +80,7 @@ class EllpackHostCacheStreamImpl { common::MakeFixedVecWithPinnedMalloc(impl->gidx_buffer.size()); new_impl->n_rows = impl->Size(); new_impl->is_dense = impl->IsDense(); - new_impl->row_stride = impl->row_stride; + new_impl->info.row_stride = impl->info.row_stride; new_impl->base_rowid = impl->base_rowid; new_impl->SetNumSymbols(impl->NumSymbols()); @@ -108,7 +108,7 @@ class EllpackHostCacheStreamImpl { impl->n_rows = page->Size(); impl->is_dense = page->IsDense(); - impl->row_stride = page->row_stride; + impl->info.row_stride = page->info.row_stride; impl->base_rowid = page->base_rowid; impl->SetNumSymbols(page->NumSymbols()); } diff --git a/src/data/ellpack_page_source.h b/src/data/ellpack_page_source.h index 40f29b6b93b2..2d5d5b8f1ce8 100644 --- a/src/data/ellpack_page_source.h +++ b/src/data/ellpack_page_source.h @@ -80,6 +80,12 @@ class EllpackFormatPolicy { if (!GlobalConfigThreadLocalStore::Get()->use_rmm) { LOG(WARNING) << "`use_rmm` is set to false." << msg; } + std::int32_t major{0}, minor{0}; + curt::DrVersion(&major, &minor); + if (!(major >= 12 && minor >= 7) && curt::SupportsAts()) { + // Use ATS, but with an old kernel driver. + LOG(WARNING) << "Using an old kernel driver with supported CTK<12.7." << msg; + } } // For testing with the HMM flag. 
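A minimal sketch of the driver-version gate in the warning above (not part of the patch; the 12.7 threshold is taken from the surrounding code, and the lexicographic tuple comparison shown here also accepts e.g. 13.0):

    #include <cstdint>
    #include <iostream>
    #include <tuple>

    int main() {
      std::int32_t major = 12, minor = 5;  // pretend values from the driver query
      bool new_enough = std::tie(major, minor) >= std::make_tuple(12, 7);
      std::cout << (new_enough ? "recent driver" : "warn: old kernel driver") << "\n";
      return 0;
    }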
explicit EllpackFormatPolicy(bool has_hmm) : has_hmm_{has_hmm} {} diff --git a/src/data/extmem_quantile_dmatrix.cu b/src/data/extmem_quantile_dmatrix.cu index 3fb1557e9993..119f9298c501 100644 --- a/src/data/extmem_quantile_dmatrix.cu +++ b/src/data/extmem_quantile_dmatrix.cu @@ -56,7 +56,7 @@ void ExtMemQuantileDMatrix::InitFromCUDA( for (auto const &page : this->GetEllpackPageImpl()) { n_total_samples += page.Size(); CHECK_EQ(page.Impl()->base_rowid, ext_info.base_rows[k]); - CHECK_EQ(page.Impl()->row_stride, ext_info.row_stride); + CHECK_EQ(page.Impl()->info.row_stride, ext_info.row_stride); ++k, ++batch_cnt; } CHECK_EQ(batch_cnt, ext_info.n_batches); diff --git a/src/data/gradient_index.cu b/src/data/gradient_index.cu index ebdc99051924..5e15ff5f0fa2 100644 --- a/src/data/gradient_index.cu +++ b/src/data/gradient_index.cu @@ -1,7 +1,9 @@ /** - * Copyright 2022-2023, XGBoost Contributors + * Copyright 2022-2024, XGBoost Contributors */ -#include // std::unique_ptr +#include // for size_t +#include // for unique_ptr +#include // for vector #include "../common/column_matrix.h" #include "../common/hist_util.h" // Index @@ -20,23 +22,34 @@ void SetIndexData(Context const* ctx, EllpackPageImpl const* page, auto accessor = page->GetHostAccessor(ctx, &h_gidx_buffer); auto const kNull = static_cast(accessor.NullValue()); - common::Span index_data_span = {out->index.data(), out->index.Size()}; + auto index_data_span = common::Span{out->index.data(), out->index.Size()}; auto n_bins_total = page->Cuts().TotalBins(); auto& hit_count_tloc = *p_hit_count_tloc; hit_count_tloc.clear(); hit_count_tloc.resize(ctx->Threads() * n_bins_total, 0); - - common::ParallelFor(page->Size(), ctx->Threads(), [&](auto i) { + bool dense_compressed = page->IsDenseCompressed() && !page->IsDense(); + common::ParallelFor(page->Size(), ctx->Threads(), [&](auto ridx) { auto tid = omp_get_thread_num(); - size_t in_rbegin = page->row_stride * i; - size_t out_rbegin = out->row_ptr[i]; - auto r_size = out->row_ptr[i + 1] - out->row_ptr[i]; - for (size_t j = 0; j < r_size; ++j) { - auto bin_idx = accessor.gidx_iter[in_rbegin + j]; - assert(bin_idx != kNull); - index_data_span[out_rbegin + j] = bin_idx; - ++hit_count_tloc[tid * n_bins_total + get_offset(bin_idx, j)]; + size_t in_rbegin = page->info.row_stride * ridx; + size_t out_rbegin = out->row_ptr[ridx]; + if (dense_compressed) { + for (std::size_t j = 0, k = 0; j < page->info.row_stride; ++j) { + bst_bin_t bin_idx = accessor.gidx_iter[in_rbegin + j]; + if (XGBOOST_EXPECT((bin_idx != kNull), true)) { // relatively dense + bin_idx = get_offset(bin_idx, j); + index_data_span[out_rbegin + k++] = bin_idx; + ++hit_count_tloc[tid * n_bins_total + bin_idx]; + } + } + } else { + auto r_size = out->row_ptr[ridx + 1] - out->row_ptr[ridx]; + for (size_t j = 0; j < r_size; ++j) { + bst_bin_t bin_idx = accessor.gidx_iter[in_rbegin + j]; + assert(bin_idx != kNull); + index_data_span[out_rbegin + j] = bin_idx; + ++hit_count_tloc[tid * n_bins_total + get_offset(bin_idx, j)]; + } } }); } @@ -45,16 +58,16 @@ void GetRowPtrFromEllpack(Context const* ctx, EllpackPageImpl const* page, common::RefResourceView* p_out) { auto& row_ptr = *p_out; row_ptr = common::MakeFixedVecWithMalloc(page->Size() + 1, std::size_t{0}); - if (page->is_dense) { - std::fill(row_ptr.begin() + 1, row_ptr.end(), page->row_stride); + if (page->IsDense()) { + std::fill(row_ptr.begin() + 1, row_ptr.end(), page->info.row_stride); } else { std::vector h_gidx_buffer; auto accessor = page->GetHostAccessor(ctx, &h_gidx_buffer); 
auto const kNull = static_cast(accessor.NullValue()); common::ParallelFor(page->Size(), ctx->Threads(), [&](auto i) { - size_t ibegin = page->row_stride * i; - for (size_t j = 0; j < page->row_stride; ++j) { + size_t ibegin = page->info.row_stride * i; + for (size_t j = 0; j < page->info.row_stride; ++j) { bst_bin_t bin_idx = accessor.gidx_iter[ibegin + j]; if (bin_idx != kNull) { row_ptr[i + 1]++; @@ -67,27 +80,27 @@ void GetRowPtrFromEllpack(Context const* ctx, EllpackPageImpl const* page, GHistIndexMatrix::GHistIndexMatrix(Context const* ctx, MetaInfo const& info, EllpackPage const& in_page, BatchParam const& p) - : max_numeric_bins_per_feat{p.max_bin} { + : cut{in_page.Cuts()}, + max_numeric_bins_per_feat{p.max_bin}, + isDense_{in_page.Impl()->IsDense()}, + base_rowid{in_page.BaseRowId()} { auto page = in_page.Impl(); - isDense_ = page->is_dense; - CHECK_EQ(info.num_row_, in_page.Size()); - this->cut = page->Cuts(); // pull to host early, prevent race condition this->cut.Ptrs(); this->cut.Values(); this->cut.MinValues(); - this->ResizeIndex(info.num_nonzero_, page->is_dense); - if (page->is_dense) { + this->ResizeIndex(info.num_nonzero_, page->IsDense()); + if (page->IsDense()) { this->index.SetBinOffset(page->Cuts().Ptrs()); } + auto offset = page->Cuts().cut_ptrs_.ConstHostSpan(); auto n_bins_total = page->Cuts().TotalBins(); GetRowPtrFromEllpack(ctx, page, &this->row_ptr); - if (page->IsDense()) { - auto offset = index.Offset(); + if (page->IsDenseCompressed()) { common::DispatchBinType(this->index.GetBinTypeSize(), [&](auto dtype) { using T = decltype(dtype); ::xgboost::SetIndexData( @@ -104,7 +117,7 @@ GHistIndexMatrix::GHistIndexMatrix(Context const* ctx, MetaInfo const& info, this->GatherHitCount(ctx->Threads(), n_bins_total); // sanity checks - CHECK_EQ(this->Features(), info.num_col_); + CHECK_EQ(this->Features(), in_page.Cuts().NumFeatures()); CHECK_EQ(this->Size(), info.num_row_); CHECK(this->cut.cut_ptrs_.HostCanRead()); CHECK(this->cut.cut_values_.HostCanRead()); diff --git a/src/data/iterative_dmatrix.cc b/src/data/iterative_dmatrix.cc index 29d38976361b..c9830fa093e8 100644 --- a/src/data/iterative_dmatrix.cc +++ b/src/data/iterative_dmatrix.cc @@ -49,6 +49,9 @@ IterativeDMatrix::IterativeDMatrix(DataIterHandle iter_handle, DMatrixHandle pro this->fmat_ctx_ = ctx; this->batch_ = p; + + LOG(INFO) << "Finished constructing the `IterativeDMatrix`: (" << this->Info().num_row_ << ", " + << this->Info().num_col_ << ", " << this->Info().num_nonzero_ << ")."; } void IterativeDMatrix::InitFromCPU(Context const* ctx, BatchParam const& p, diff --git a/src/tree/gpu_hist/feature_groups.cu b/src/tree/gpu_hist/feature_groups.cu index c6c6619852ca..0a1272a10aff 100644 --- a/src/tree/gpu_hist/feature_groups.cu +++ b/src/tree/gpu_hist/feature_groups.cu @@ -2,17 +2,17 @@ * Copyright 2020-2024, XGBoost Contributors */ -#include -#include -#include +#include // for max +#include // for size_t +#include // for uint32_t +#include // for vector +#include "../../common/hist_util.h" // for HistogramCuts #include "feature_groups.cuh" -#include "../../common/hist_util.h" - namespace xgboost::tree { -FeatureGroups::FeatureGroups(const common::HistogramCuts& cuts, bool is_dense, - size_t shm_size, size_t bin_size) { +FeatureGroups::FeatureGroups(common::HistogramCuts const& cuts, bool is_dense, size_t shm_size) + : max_group_bins{0} { // Only use a single feature group for sparse matrices. 
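For the dense case handled below, features are packed greedily into groups whose combined bin count fits the shared-memory budget; a standalone host sketch of that packing (bin counts and budget are made up):

    #include <cstddef>
    #include <iostream>
    #include <vector>

    int main() {
      // One boundary per feature; feature f owns bins [cut_ptrs[f], cut_ptrs[f + 1]).
      std::vector<std::size_t> cut_ptrs{0, 256, 512, 768, 1024, 1280};
      std::size_t max_shmem_bins = 768;              // pretend shared-memory budget, in bins
      std::vector<std::size_t> feature_segments{0};  // first feature of each group
      std::vector<std::size_t> bin_segments{0};      // first bin of each group
      for (std::size_t f = 2; f < cut_ptrs.size(); ++f) {
        if (cut_ptrs[f] - bin_segments.back() > max_shmem_bins) {
          feature_segments.push_back(f - 1);
          bin_segments.push_back(cut_ptrs[f - 1]);
        }
      }
      feature_segments.push_back(cut_ptrs.size() - 1);
      bin_segments.push_back(cut_ptrs.back());
      std::cout << feature_segments.size() - 1 << " groups\n";  // 2 groups
      return 0;
    }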
bool single_group = !is_dense; if (single_group) { @@ -20,14 +20,14 @@ FeatureGroups::FeatureGroups(const common::HistogramCuts& cuts, bool is_dense, return; } - std::vector& feature_segments_h = feature_segments.HostVector(); - std::vector& bin_segments_h = bin_segments.HostVector(); + auto& feature_segments_h = feature_segments.HostVector(); + auto& bin_segments_h = bin_segments.HostVector(); feature_segments_h.push_back(0); bin_segments_h.push_back(0); - const std::vector& cut_ptrs = cuts.Ptrs(); - size_t max_shmem_bins = shm_size / bin_size; - max_group_bins = 0; + std::vector const& cut_ptrs = cuts.Ptrs(); + // Maximum number of bins that can be placed into shared memory. + std::size_t max_shmem_bins = shm_size / sizeof(GradientPairInt64); for (size_t i = 2; i < cut_ptrs.size(); ++i) { int last_start = bin_segments_h.back(); @@ -41,17 +41,16 @@ FeatureGroups::FeatureGroups(const common::HistogramCuts& cuts, bool is_dense, } feature_segments_h.push_back(cut_ptrs.size() - 1); bin_segments_h.push_back(cut_ptrs.back()); - max_group_bins = std::max(max_group_bins, - bin_segments_h.back() - - bin_segments_h[bin_segments_h.size() - 2]); + max_group_bins = + std::max(max_group_bins, bin_segments_h.back() - bin_segments_h[bin_segments_h.size() - 2]); } -void FeatureGroups::InitSingle(const common::HistogramCuts& cuts) { - std::vector& feature_segments_h = feature_segments.HostVector(); +void FeatureGroups::InitSingle(common::HistogramCuts const& cuts) { + auto& feature_segments_h = feature_segments.HostVector(); feature_segments_h.push_back(0); feature_segments_h.push_back(cuts.Ptrs().size() - 1); - std::vector& bin_segments_h = bin_segments.HostVector(); + auto& bin_segments_h = bin_segments.HostVector(); bin_segments_h.push_back(0); bin_segments_h.push_back(cuts.TotalBins()); diff --git a/src/tree/gpu_hist/feature_groups.cuh b/src/tree/gpu_hist/feature_groups.cuh index 82df69796ebd..37d87a9f577a 100644 --- a/src/tree/gpu_hist/feature_groups.cuh +++ b/src/tree/gpu_hist/feature_groups.cuh @@ -16,92 +16,97 @@ class HistogramCuts; namespace tree { -/** \brief FeatureGroup is a feature group. It is defined by a range of - consecutive feature indices, and also contains a range of all bin indices - associated with those features. */ +/** + * @brief FeatureGroup is a single group of features. + * + * It is defined by a range of consecutive feature indices, and also contains a range of + * all bin indices associated with those features. + */ struct FeatureGroup { - __host__ __device__ FeatureGroup(int start_feature_, int num_features_, - int start_bin_, int num_bins_) : - start_feature(start_feature_), num_features(num_features_), - start_bin(start_bin_), num_bins(num_bins_) {} + XGBOOST_DEVICE FeatureGroup(bst_feature_t start_feature, bst_feature_t n_features, + bst_bin_t start_bin, bst_bin_t num_bins) + : start_feature{start_feature}, + num_features{n_features}, + start_bin{start_bin}, + num_bins{num_bins} {} /** The first feature of the group. */ - int start_feature; + bst_feature_t start_feature; /** The number of features in the group. */ - int num_features; + bst_feature_t num_features; /** The first bin in the group. */ bst_bin_t start_bin; /** The number of bins in the group. */ bst_bin_t num_bins; }; -/** \brief FeatureGroupsAccessor is a non-owning accessor for FeatureGroups. */ +/** @brief FeatureGroupsAccessor is a non-owning accessor for FeatureGroups. 
*/ struct FeatureGroupsAccessor { - FeatureGroupsAccessor(common::Span feature_segments_, - common::Span bin_segments_, int max_group_bins_) - : feature_segments(feature_segments_), - bin_segments(bin_segments_), - max_group_bins(max_group_bins_) {} + FeatureGroupsAccessor(common::Span feature_segments, + common::Span bin_segments, bst_bin_t max_group_bins) + : feature_segments{feature_segments}, + bin_segments{bin_segments}, + max_group_bins{max_group_bins} {} - common::Span feature_segments; + common::Span feature_segments; common::Span bin_segments; - int max_group_bins; + bst_bin_t max_group_bins; - /** \brief Gets the number of feature groups. */ - __host__ __device__ int NumGroups() const { - return feature_segments.size() - 1; - } + /** @brief Gets the number of feature groups. */ + XGBOOST_DEVICE int NumGroups() const { return feature_segments.size() - 1; } - /** \brief Gets the information about a feature group with index i. */ - __host__ __device__ FeatureGroup operator[](int i) const { - return {feature_segments[i], feature_segments[i + 1] - feature_segments[i], - bin_segments[i], bin_segments[i + 1] - bin_segments[i]}; + /** @brief Gets the information about a feature group with index i. */ + XGBOOST_DEVICE FeatureGroup operator[](bst_feature_t i) const { + return {feature_segments[i], feature_segments[i + 1] - feature_segments[i], bin_segments[i], + bin_segments[i + 1] - bin_segments[i]}; } }; -/** \brief FeatureGroups contains information that defines a split of features - into groups. Bins of a single feature group typically fit into shared - memory, so the histogram for the features of a single group can be computed - faster. - - \notes Known limitations: - - - splitting features into groups currently works only for dense matrices, - where it is easy to get a feature value in a row by its index; for sparse - matrices, the structure contains only a single group containing all - features; - - - if a single feature requires more bins than fit into shared memory, the - histogram is computed in global memory even if there are multiple feature - groups; note that this is unlikely to occur in practice, as the default - number of bins per feature is 256, whereas a thread block with 48 KiB - shared memory can contain 3072 bins if each gradient sum component is a - 64-bit floating-point value (double) +/** + * @brief FeatureGroups contains information that defines a split of features + * into groups. Bins of a single feature group typically fit into shared + * memory, so the histogram for the features of a single group can be computed + * faster. + * + * @note Known limitations: + * + * - splitting features into groups currently works only for dense matrices, + * where it is easy to get a feature value in a row by its index; for sparse + * matrices, the structure contains only a single group containing all + * features; + * + * - if a single feature requires more bins than fit into shared memory, the + * histogram is computed in global memory even if there are multiple feature + * groups; note that this is unlikely to occur in practice, as the default + * number of bins per feature is 256, whereas a thread block with 48 KiB + * shared memory can contain 3072 bins if each gradient sum component is a + * 64-bit floating-point value (double) */ struct FeatureGroups { /** Group cuts for features. Size equals to (number of groups + 1). */ - HostDeviceVector feature_segments; + HostDeviceVector feature_segments; /** Group cuts for bins. 
Size equals to (number of groups + 1) */ HostDeviceVector bin_segments; /** Maximum number of bins in a group. Useful to compute the amount of dynamic shared memory when launching a kernel. */ int max_group_bins; - /** Creates feature groups by splitting features into groups. - \param cuts Histogram cuts that given the number of bins per feature. - \param is_dense Whether the data matrix is dense. - \param shm_size Available size of shared memory per thread block (in - bytes) used to compute feature groups. - \param bin_size Size of a single bin of the histogram. */ - FeatureGroups(const common::HistogramCuts& cuts, bool is_dense, - size_t shm_size, size_t bin_size); - - /** Creates a single feature group containing all features and bins. - \notes This is used as a fallback for sparse matrices, and is also useful - for testing. + /** + * @brief Creates feature groups by splitting features into groups. + * + * @param cuts Histogram cuts that give the number of bins per feature. + * @param is_dense Whether the data matrix is dense. + * @param shm_size Available size of shared memory per thread block (in bytes) used to + * compute feature groups. */ - explicit FeatureGroups(const common::HistogramCuts& cuts) { - InitSingle(cuts); - } + FeatureGroups(common::HistogramCuts const& cuts, bool is_dense, size_t shm_size); + + /** + * @brief Creates a single feature group containing all features and bins. + * + * @note This is used as a fallback for sparse matrices, and is also useful for + * testing. + */ + explicit FeatureGroups(const common::HistogramCuts& cuts) { this->InitSingle(cuts); } [[nodiscard]] FeatureGroupsAccessor DeviceAccessor(DeviceOrd device) const { feature_segments.SetDevice(device); diff --git a/src/tree/gpu_hist/gradient_based_sampler.cu b/src/tree/gpu_hist/gradient_based_sampler.cu index 46a52a8ea5d7..c9048e68ac77 100644 --- a/src/tree/gpu_hist/gradient_based_sampler.cu +++ b/src/tree/gpu_hist/gradient_based_sampler.cu @@ -205,8 +205,8 @@ GradientBasedSample ExternalMemoryUniformSampling::Sample(Context const* ctx, auto batch_iterator = dmat->GetBatches(ctx, batch_param_); auto first_page = (*batch_iterator.begin()).Impl(); // Create a new ELLPACK page with empty rows. - *page = EllpackPageImpl{ctx, first_page->CutsShared(), first_page->is_dense, - first_page->row_stride, sample_rows}; + *page = EllpackPageImpl{ctx, first_page->CutsShared(), first_page->IsDense(), + first_page->info.row_stride, sample_rows}; // Compact the ELLPACK pages into the single sample page. thrust::fill(cuctx->CTP(), page->gidx_buffer.begin(), page->gidx_buffer.end(), 0); @@ -290,8 +290,8 @@ GradientBasedSample ExternalMemoryGradientBasedSampling::Sample(Context const* c auto first_page = (*batch_iterator.begin()).Impl(); // Create a new ELLPACK page with empty rows. - *page = EllpackPageImpl{ctx, first_page->CutsShared(), dmat->IsDense(), first_page->row_stride, - sample_rows}; + *page = EllpackPageImpl{ctx, first_page->CutsShared(), dmat->IsDense(), + first_page->info.row_stride, sample_rows}; // Compact the ELLPACK pages into the single sample page.
thrust::fill(cuctx->CTP(), page->gidx_buffer.begin(), page->gidx_buffer.end(), 0); for (auto& batch : batch_iterator) { diff --git a/src/tree/gpu_hist/histogram.cu b/src/tree/gpu_hist/histogram.cu index d50f7284e9ad..102b1113be6b 100644 --- a/src/tree/gpu_hist/histogram.cu +++ b/src/tree/gpu_hist/histogram.cu @@ -138,19 +138,20 @@ XGBOOST_DEV_INLINE void AtomicAddGpairGlobal(xgboost::GradientPairInt64* dest, *reinterpret_cast(&h)); } -template +template class HistogramAgent { + int constexpr static kItemsPerTile = kBlockThreads * kItemsPerThread; + GradientPairInt64* smem_arr_; GradientPairInt64* d_node_hist_; - using Idx = RowPartitioner::RowIndexT; + using Idx = cuda_impl::RowIndexT; dh::LDGIterator d_ridx_; const GradientPair* d_gpair_; const FeatureGroup group_; const EllpackDeviceAccessor& matrix_; const int feature_stride_; - const std::size_t n_elements_; + const bst_idx_t n_elements_; const GradientQuantiser& rounding_; public: @@ -158,34 +159,33 @@ class HistogramAgent { GradientPairInt64* __restrict__ d_node_hist, const FeatureGroup& group, const EllpackDeviceAccessor& matrix, common::Span d_ridx, const GradientQuantiser& rounding, const GradientPair* d_gpair) - : smem_arr_(smem_arr), - d_node_hist_(d_node_hist), + : smem_arr_{smem_arr}, + d_node_hist_{d_node_hist}, d_ridx_(d_ridx.data()), - group_(group), + group_{group}, matrix_(matrix), - feature_stride_(kIsDense ? group.num_features : matrix.row_stride), - n_elements_(feature_stride_ * d_ridx.size()), - rounding_(rounding), - d_gpair_(d_gpair) {} + feature_stride_(kCompressed ? group.num_features : matrix.row_stride), + n_elements_{feature_stride_ * d_ridx.size()}, + rounding_{rounding}, + d_gpair_{d_gpair} {} __device__ void ProcessPartialTileShared(std::size_t offset) { - for (std::size_t idx = offset + threadIdx.x; - idx < std::min(offset + kBlockThreads * kItemsPerTile, n_elements_); - idx += kBlockThreads) { + for (std::size_t idx = offset + threadIdx.x, + n = std::min(offset + kBlockThreads * kItemsPerTile, n_elements_); + idx < n; idx += kBlockThreads) { Idx ridx = d_ridx_[idx / feature_stride_]; auto fidx = FeatIdx(group_, idx, feature_stride_); bst_bin_t compressed_bin = matrix_.gidx_iter[IterIdx(matrix_, ridx, fidx)]; - if (kIsDense || compressed_bin != matrix_.NullValue()) { + if (compressed_bin != matrix_.NullValue()) { + // The matrix is compressed with feature-local bins. + if (kCompressed) { + compressed_bin += this->matrix_.feature_segments[fidx]; + } + // Avoid atomic add if it's a null value. auto adjusted = rounding_.ToFixedPoint(d_gpair_[ridx]); // Subtract start_bin to write to group-local histogram. If this is not a dense // matrix, then start_bin is 0 since featuregrouping doesn't support sparse data. 
- if (kIsDense) { - AtomicAddGpairShared( - smem_arr_ + compressed_bin + this->matrix_.feature_segments[fidx] - group_.start_bin, - adjusted); - } else { - AtomicAddGpairShared(smem_arr_ + compressed_bin - group_.start_bin, adjusted); - } + AtomicAddGpairShared(smem_arr_ + compressed_bin - group_.start_bin, adjusted); } } } @@ -210,16 +210,19 @@ class HistogramAgent { for (int i = 0; i < kItemsPerThread; i++) { gpair[i] = d_gpair_[ridx[i]]; auto fidx = FeatIdx(group_, idx[i], feature_stride_); - if (kIsDense) { - gidx[i] = - matrix_.gidx_iter[IterIdx(matrix_, ridx[i], fidx)] + matrix_.feature_segments[fidx]; + gidx[i] = matrix_.gidx_iter[IterIdx(matrix_, ridx[i], fidx)]; + if (gidx[i] != matrix_.NullValue()) { + if (kCompressed) { + gidx[i] += matrix_.feature_segments[fidx]; + } } else { - gidx[i] = matrix_.gidx_iter[IterIdx(matrix_, ridx[i], fidx)]; + gidx[i] = -1; // missing } } #pragma unroll for (int i = 0; i < kItemsPerThread; i++) { - if ((kIsDense || gidx[i] != matrix_.NullValue())) { + // Avoid atomic add if it's a null value. + if (gidx[i] != -1) { auto adjusted = rounding_.ToFixedPoint(gpair[i]); AtomicAddGpairShared(smem_arr_ + gidx[i] - group_.start_bin, adjusted); } @@ -248,14 +251,12 @@ class HistogramAgent { Idx ridx = d_ridx_[idx / feature_stride_]; auto fidx = FeatIdx(group_, idx, feature_stride_); bst_bin_t compressed_bin = matrix_.gidx_iter[IterIdx(matrix_, ridx, fidx)]; - if (kIsDense || compressed_bin != matrix_.NullValue()) { - auto adjusted = rounding_.ToFixedPoint(d_gpair_[ridx]); - if (kIsDense) { - auto start_bin = this->matrix_.feature_segments[fidx]; - AtomicAddGpairGlobal(d_node_hist_ + compressed_bin + start_bin, adjusted); - } else { - AtomicAddGpairGlobal(d_node_hist_ + compressed_bin, adjusted); + if (compressed_bin != matrix_.NullValue()) { + if (kCompressed) { + compressed_bin += this->matrix_.feature_segments[fidx]; } + auto adjusted = rounding_.ToFixedPoint(d_gpair_[ridx]); + AtomicAddGpairGlobal(d_node_hist_ + compressed_bin, adjusted); } } } @@ -382,14 +383,14 @@ class DeviceHistogramBuilderImpl { if (!this->kernel_->shared) { CHECK_EQ(this->kernel_->smem_size, 0); - if (matrix.is_dense) { + if (matrix.IsDenseCompressed()) { launcher(this->kernel_->global_dense_kernel); } else { launcher(this->kernel_->global_kernel); } } else { CHECK_NE(this->kernel_->smem_size, 0); - if (matrix.is_dense) { + if (matrix.IsDenseCompressed()) { launcher(this->kernel_->shared_dense_kernel); } else { launcher(this->kernel_->shared_kernel); diff --git a/src/tree/updater_gpu_hist.cu b/src/tree/updater_gpu_hist.cu index 31b8d34964b2..002f3e9a6ff1 100644 --- a/src/tree/updater_gpu_hist.cu +++ b/src/tree/updater_gpu_hist.cu @@ -104,6 +104,7 @@ struct GPUHistMakerDevice { dh::device_vector positions_; HistMakerTrainParam const* hist_param_; std::shared_ptr const cuts_; + std::unique_ptr feature_groups_; auto CreatePartitionNodes(RegTree const* p_tree, std::vector const& candidates) { std::vector nidx(candidates.size()); @@ -143,19 +144,20 @@ struct GPUHistMakerDevice { std::unique_ptr sampler; - std::unique_ptr feature_groups; common::Monitor monitor; GPUHistMakerDevice(Context const* ctx, TrainParam _param, HistMakerTrainParam const* hist_param, std::shared_ptr column_sampler, BatchParam batch_param, MetaInfo const& info, std::vector batch_ptr, - std::shared_ptr cuts) + std::shared_ptr cuts, bool dense_compressed) : evaluator_{_param, static_cast(info.num_col_), ctx->Device()}, ctx_{ctx}, column_sampler_{std::move(column_sampler)}, batch_ptr_{std::move(batch_ptr)}, 
hist_param_{hist_param}, cuts_{std::move(cuts)}, + feature_groups_{std::make_unique(*cuts_, dense_compressed, + dh::MaxSharedMemoryOptin(ctx_->Ordinal()))}, param{std::move(_param)}, interaction_constraints(param, static_cast(info.num_col_)), sampler{std::make_unique( @@ -172,15 +174,6 @@ struct GPUHistMakerDevice { ~GPUHistMakerDevice() = default; - void InitFeatureGroupsOnce(MetaInfo const& info) { - if (!feature_groups) { - CHECK(cuts_); - feature_groups = std::make_unique(*cuts_, info.IsDense(), - dh::MaxSharedMemoryOptin(ctx_->Ordinal()), - sizeof(GradientPairInt64)); - } - } - // Reset values for each update iteration [[nodiscard]] DMatrix* Reset(HostDeviceVector const* dh_gpair, DMatrix* p_fmat) { this->monitor.Start(__func__); @@ -240,10 +233,8 @@ struct GPUHistMakerDevice { */ this->quantiser = std::make_unique(ctx_, this->gpair, p_fmat->Info()); - this->InitFeatureGroupsOnce(info); - this->histogram_.Reset(ctx_, this->hist_param_->MaxCachedHistNodes(ctx_->Device()), - feature_groups->DeviceAccessor(ctx_->Device()), cuts_->TotalBins(), + feature_groups_->DeviceAccessor(ctx_->Device()), cuts_->TotalBins(), false); this->monitor.Stop(__func__); return p_fmat; @@ -334,7 +325,7 @@ struct GPUHistMakerDevice { auto d_ridx = partitioners_.at(k)->GetRows(nidx); this->histogram_.BuildHistogram(ctx_->CUDACtx(), acc, - feature_groups->DeviceAccessor(ctx_->Device()), this->gpair, + feature_groups_->DeviceAccessor(ctx_->Device()), this->gpair, d_ridx, d_node_hist, *quantiser); monitor.Stop(__func__); } @@ -762,22 +753,27 @@ struct GPUHistMakerDevice { } }; -std::shared_ptr InitBatchCuts(Context const* ctx, DMatrix* p_fmat, - BatchParam batch, - std::vector* p_batch_ptr) { +std::pair, bool> InitBatchCuts( + Context const* ctx, DMatrix* p_fmat, BatchParam const& batch, + std::vector* p_batch_ptr) { std::vector& batch_ptr = *p_batch_ptr; batch_ptr = {0}; std::shared_ptr cuts; + std::int32_t dense_compressed = -1; for (auto const& page : p_fmat->GetBatches(ctx, batch)) { batch_ptr.push_back(page.Size()); cuts = page.Impl()->CutsShared(); CHECK(cuts->cut_values_.DeviceCanRead()); + if (dense_compressed != -1) { + CHECK_EQ(page.Impl()->IsDenseCompressed(), static_cast(dense_compressed)); + } + dense_compressed = page.Impl()->IsDenseCompressed(); } CHECK(cuts); CHECK_EQ(p_fmat->NumBatches(), batch_ptr.size() - 1); std::partial_sum(batch_ptr.cbegin(), batch_ptr.cend(), batch_ptr.begin()); - return cuts; + return {cuts, static_cast(dense_compressed)}; } class GPUHistMaker : public TreeUpdater { @@ -840,10 +836,11 @@ class GPUHistMaker : public TreeUpdater { std::vector batch_ptr; auto batch = HistBatch(*param); - auto cuts = InitBatchCuts(ctx_, p_fmat, batch, &batch_ptr); + auto [cuts, dense_compressed] = InitBatchCuts(ctx_, p_fmat, batch, &batch_ptr); - this->maker = std::make_unique( - ctx_, *param, &hist_maker_param_, column_sampler_, batch, p_fmat->Info(), batch_ptr, cuts); + this->maker = std::make_unique(ctx_, *param, &hist_maker_param_, + column_sampler_, batch, p_fmat->Info(), + batch_ptr, cuts, dense_compressed); p_last_fmat_ = p_fmat; initialised_ = true; @@ -947,11 +944,12 @@ class GPUGlobalApproxMaker : public TreeUpdater { std::vector batch_ptr; auto batch = ApproxBatch(*param, hess, *task_); - auto cuts = InitBatchCuts(ctx_, p_fmat, batch, &batch_ptr); + auto [cuts, dense_compressed] = InitBatchCuts(ctx_, p_fmat, batch, &batch_ptr); batch.regen = false; // Regen only at the beginning of the iteration. 
- this->maker_ = std::make_unique( - ctx_, *param, &hist_maker_param_, column_sampler_, batch, p_fmat->Info(), batch_ptr, cuts); + this->maker_ = std::make_unique(ctx_, *param, &hist_maker_param_, + column_sampler_, batch, p_fmat->Info(), + batch_ptr, cuts, dense_compressed); std::size_t t_idx{0}; for (xgboost::RegTree* tree : trees) { diff --git a/tests/ci_build/lint_python.py b/tests/ci_build/lint_python.py index 91302c1ed563..c8d0f47709ab 100644 --- a/tests/ci_build/lint_python.py +++ b/tests/ci_build/lint_python.py @@ -105,6 +105,7 @@ class LintersPaths: "tests/test_distributed/test_with_spark/test_data.py", "tests/test_distributed/test_gpu_with_spark/test_data.py", "tests/test_distributed/test_gpu_with_dask/test_gpu_with_dask.py", + "tests/test_distributed/test_gpu_with_dask/test_gpu_external_memory.py", # demo "demo/dask/", "demo/json-model/json_parser.py", diff --git a/tests/cpp/common/test_compressed_iterator.cc b/tests/cpp/common/test_compressed_iterator.cc index 93243c0b336e..41280dde7fa7 100644 --- a/tests/cpp/common/test_compressed_iterator.cc +++ b/tests/cpp/common/test_compressed_iterator.cc @@ -1,9 +1,25 @@ +/** + * Copyright 2017-2024, XGBoost contributors + */ #include "../../../src/common/compressed_iterator.h" #include "gtest/gtest.h" #include -namespace xgboost { -namespace common { +namespace xgboost::common { +TEST(CompressedIterator, Size) { + bst_idx_t n = 2048; + { + bst_idx_t n_symbols = 256; + auto n_bytes = CompressedBufferWriter::CalculateBufferSize(n, n_symbols); + ASSERT_EQ(n_bytes, 2052); + } + { + bst_idx_t n_symbols = 64; + auto n_bytes = CompressedBufferWriter::CalculateBufferSize(n, n_symbols); + ASSERT_EQ(n_bytes, 1540); + } +} + TEST(CompressedIterator, Test) { ASSERT_TRUE(detail::SymbolBits(256) == 8); ASSERT_TRUE(detail::SymbolBits(150) == 8); @@ -50,6 +66,4 @@ TEST(CompressedIterator, Test) { } } } - -} // namespace common -} // namespace xgboost +} // namespace xgboost::common diff --git a/tests/cpp/data/test_ellpack_page.cu b/tests/cpp/data/test_ellpack_page.cu index 55375a5a7ffa..0ccb79def2ec 100644 --- a/tests/cpp/data/test_ellpack_page.cu +++ b/tests/cpp/data/test_ellpack_page.cu @@ -7,9 +7,11 @@ #include "../../../src/common/categorical.h" #include "../../../src/common/hist_util.h" +#include "../../../src/data/device_adapter.cuh" // for CupyAdapter #include "../../../src/data/ellpack_page.cuh" #include "../../../src/data/ellpack_page.h" -#include "../../../src/tree/param.h" // TrainParam +#include "../../../src/data/gradient_index.h" // for GHistIndexMatrix +#include "../../../src/tree/param.h" // for TrainParam #include "../helpers.h" #include "../histogram_helpers.h" #include "gtest/gtest.h" @@ -24,7 +26,7 @@ TEST(EllpackPage, EmptyDMatrix) { &ctx, BatchParam{kMaxBin, tree::TrainParam::DftSparseThreshold()}) .begin(); auto impl = page.Impl(); - ASSERT_EQ(impl->row_stride, 0); + ASSERT_EQ(impl->info.row_stride, 0); ASSERT_EQ(impl->Cuts().TotalBins(), 0); ASSERT_EQ(impl->gidx_buffer.size(), 4); } @@ -36,7 +38,7 @@ TEST(EllpackPage, BuildGidxDense) { std::vector h_gidx_buffer; auto h_accessor = page->GetHostAccessor(&ctx, &h_gidx_buffer); - ASSERT_EQ(page->row_stride, n_features); + ASSERT_EQ(page->info.row_stride, n_features); std::vector solution = { 0, 3, 8, 9, 14, 17, 20, 21, @@ -60,6 +62,9 @@ TEST(EllpackPage, BuildGidxDense) { auto fidx = i % n_features; ASSERT_EQ(solution[i], h_accessor.gidx_iter[i] + h_accessor.feature_segments[fidx]); } + ASSERT_EQ(page->NumSymbols(), 3); + ASSERT_EQ(page->NumNonMissing(&ctx, {}), n_samples * 
n_features); + ASSERT_EQ(page->NumSymbols(), h_accessor.NullValue()); } TEST(EllpackPage, BuildGidxSparse) { @@ -68,9 +73,9 @@ TEST(EllpackPage, BuildGidxSparse) { auto page = BuildEllpackPage(&ctx, kNRows, kNCols, 0.9f); std::vector h_gidx_buffer; - auto h_accessor = page->GetHostAccessor(&ctx, &h_gidx_buffer); + auto h_acc = page->GetHostAccessor(&ctx, &h_gidx_buffer); - ASSERT_LE(page->row_stride, 3); + ASSERT_EQ(page->info.row_stride, 3); // row_stride = 3, 16 rows, 48 entries for ELLPack std::vector solution = { @@ -78,8 +83,8 @@ TEST(EllpackPage, BuildGidxSparse) { 24, 24, 24, 24, 24, 5, 24, 24, 0, 16, 24, 15, 24, 24, 24, 24, 24, 7, 14, 16, 4, 24, 24, 24, 24, 24, 9, 24, 24, 1, 24, 24 }; - for (size_t i = 0; i < kNRows * page->row_stride; ++i) { - ASSERT_EQ(solution[i], h_accessor.gidx_iter[i]); + for (size_t i = 0; i < kNRows * page->info.row_stride; ++i) { + ASSERT_EQ(solution[i], h_acc.gidx_iter[i]); } } @@ -103,8 +108,9 @@ TEST(EllpackPage, FromCategoricalBasic) { auto n_uniques = std::unique(x_copy.begin(), x_copy.end()) - x_copy.begin(); ASSERT_EQ(n_uniques, kCats); - std::vector h_cuts_ptr(accessor.feature_segments.size()); - dh::CopyDeviceSpanToVector(&h_cuts_ptr, accessor.feature_segments); + std::vector h_cuts_ptr(accessor.NumFeatures() + 1); + dh::safe_cuda(cudaMemcpyAsync(h_cuts_ptr.data(), accessor.feature_segments, + sizeof(bst_feature_t) * h_cuts_ptr.size(), cudaMemcpyDefault)); std::vector h_cuts_values(accessor.gidx_fvalue_map.size()); dh::CopyDeviceSpanToVector(&h_cuts_values, accessor.gidx_fvalue_map); @@ -149,7 +155,7 @@ TEST(EllpackPage, Copy) { auto page = (*dmat->GetBatches(&ctx, param).begin()).Impl(); // Create an empty result page. - EllpackPageImpl result(&ctx, page->CutsShared(), page->is_dense, page->row_stride, kRows); + EllpackPageImpl result(&ctx, page->CutsShared(), page->is_dense, page->info.row_stride, kRows); // Copy batch pages into the result page. size_t offset = 0; @@ -195,7 +201,7 @@ TEST(EllpackPage, Compact) { auto page = (*dmat->GetBatches(&ctx, param).begin()).Impl(); // Create an empty result page. - EllpackPageImpl result(&ctx, page->CutsShared(), page->is_dense, page->row_stride, + EllpackPageImpl result(&ctx, page->CutsShared(), page->is_dense, page->info.row_stride, kCompactedRows); // Compact batch pages into the result page. @@ -240,7 +246,141 @@ TEST(EllpackPage, Compact) { } namespace { -class EllpackPageTest : public testing::TestWithParam { +// Test for treating sparse ellpack as a dense +class CompressedDense : public ::testing::TestWithParam { + auto InitSparsePage(std::size_t null_column) const { + bst_idx_t n_samples = 16, n_features = 8; + std::vector data(n_samples * n_features); + + std::iota(data.begin(), data.end(), 0.0f); + for (std::size_t i = 0; i < data.size(); i += n_features) { + data[i + null_column] = std::numeric_limits::quiet_NaN(); + } + data[null_column] = null_column; // keep the first sample full. 
+ auto p_fmat = GetDMatrixFromData(data, n_samples, n_features); + return p_fmat; + } + + void CheckBasic(Context const* ctx, BatchParam batch, std::size_t null_column, + EllpackPageImpl const& impl) { + ASSERT_FALSE(impl.IsDense()); + ASSERT_TRUE(impl.IsDenseCompressed()); + ASSERT_EQ(impl.NumSymbols(), batch.max_bin + 1); + + std::vector h_gidx; + auto h_acc = impl.GetHostAccessor(ctx, &h_gidx); + ASSERT_EQ(h_acc.row_stride, h_acc.NumFeatures()); + ASSERT_EQ(h_acc.NullValue(), batch.max_bin); + for (std::size_t i = 0; i < h_acc.row_stride * h_acc.n_rows; ++i) { + auto [m, n] = linalg::UnravelIndex(i, h_acc.n_rows, h_acc.row_stride); + if (n == null_column && m != 0) { + ASSERT_EQ(static_cast(h_acc.gidx_iter[i]), h_acc.NullValue()); + } else { + ASSERT_EQ(static_cast(h_acc.gidx_iter[i]), m); + } + } + } + + public: + void CheckFromSparsePage(std::size_t null_column) { + auto p_fmat = this->InitSparsePage(null_column); + auto ctx = MakeCUDACtx(0); + auto batch = BatchParam{static_cast(p_fmat->Info().num_row_), + std::numeric_limits::quiet_NaN()}; + + for (auto const& ellpack : p_fmat->GetBatches(&ctx, batch)) { + auto impl = ellpack.Impl(); + this->CheckBasic(&ctx, batch, null_column, *impl); + } + } + + void CheckFromAdapter(std::size_t null_column) { + bst_idx_t n_samples = 16, n_features = 8; + + auto ctx = MakeCUDACtx(0); + HostDeviceVector data(n_samples * n_features, 0.0f, ctx.Device()); + auto& h_data = data.HostVector(); + std::iota(h_data.begin(), h_data.end(), 0.0f); + for (std::size_t i = 0; i < h_data.size(); i += n_features) { + h_data[i + null_column] = std::numeric_limits::quiet_NaN(); + } + h_data[null_column] = null_column; // Keep the first sample full. + auto p_fmat = GetDMatrixFromData(h_data, n_samples, n_features); + + data.ConstDeviceSpan(); // Pull to device + auto arri = GetArrayInterface(&data, n_samples, n_features); + auto sarri = Json::Dump(arri); + data::CupyAdapter adapter{StringView{sarri}}; + + Context cpu_ctx; + auto batch = BatchParam{static_cast(p_fmat->Info().num_row_), 0.8}; + + std::shared_ptr cuts; + for (auto const& page : p_fmat->GetBatches(&cpu_ctx, batch)) { + cuts = std::make_shared(page.Cuts()); + } + dh::device_vector row_counts(n_samples, n_features - 1); + row_counts[0] = n_features; + auto d_row_counts = dh::ToSpan(row_counts); + ASSERT_EQ(adapter.NumColumns(), n_features); + auto impl = + EllpackPageImpl{&ctx, adapter.Value(), std::numeric_limits::quiet_NaN(), + false, d_row_counts, {}, + n_features, n_samples, cuts}; + this->CheckBasic(&ctx, batch, null_column, impl); + dh::DefaultStream().Sync(); + } + + void CheckFromToGHist(std::size_t null_column) { + Context cpu_ctx; + auto ctx = MakeCUDACtx(0); + std::vector orig; + { + // Test from GHist + auto p_fmat = this->InitSparsePage(null_column); + auto batch = BatchParam{static_cast(p_fmat->Info().num_row_), 0.8}; + for (auto const& page : p_fmat->GetBatches(&cpu_ctx, batch)) { + orig = {page.data.cbegin(), page.data.cend()}; + auto impl = EllpackPageImpl{&ctx, page, {}}; + this->CheckBasic(&ctx, batch, null_column, impl); + } + } + + { + // Test to GHist + auto p_fmat = this->InitSparsePage(null_column); + auto batch = BatchParam{static_cast(p_fmat->Info().num_row_), 0.8}; + for (auto const& page : p_fmat->GetBatches(&ctx, batch)) { + auto gidx = GHistIndexMatrix{&ctx, p_fmat->Info(), page, batch}; + ASSERT_EQ(gidx.Size(), p_fmat->Info().num_row_); + for (std::size_t ridx = 0; ridx < gidx.Size(); ++ridx) { + auto rbegin = gidx.row_ptr[ridx]; + auto rend = gidx.row_ptr[ridx + 1]; + if 
(ridx == 0) { + ASSERT_EQ(rend - rbegin, p_fmat->Info().num_col_); + } else { + ASSERT_EQ(rend - rbegin, p_fmat->Info().num_col_ - 1); + } + } + // GHist can't compress a dataset with missing values + ASSERT_FALSE(gidx.index.Offset()); + ASSERT_TRUE(std::equal(gidx.data.cbegin(), gidx.data.cend(), orig.cbegin())); + } + } + } +}; + +TEST_P(CompressedDense, FromSparsePage) { this->CheckFromSparsePage(this->GetParam()); } + +TEST_P(CompressedDense, FromAdapter) { this->CheckFromAdapter(this->GetParam()); } + +TEST_P(CompressedDense, FromToGHist) { this->CheckFromToGHist(this->GetParam()); } +} // anonymous namespace + +INSTANTIATE_TEST_SUITE_P(EllpackPage, CompressedDense, testing::Values(0ul, 1ul, 7ul)); + +namespace { +class SparseEllpack : public testing::TestWithParam { protected: void TestFromGHistIndex(float sparsity) const { // Only testing with small sample size as the cuts might be different between host and @@ -268,7 +408,7 @@ class EllpackPageTest : public testing::TestWithParam { std::vector h_gidx_from_sparse, h_gidx_from_ghist; auto from_ghist_acc = from_ghist->GetHostAccessor(&gpu_ctx, &h_gidx_from_ghist); auto from_sparse_acc = from_sparse_page->GetHostAccessor(&gpu_ctx, &h_gidx_from_sparse); - for (size_t i = 0; i < from_ghist->n_rows * from_ghist->row_stride; ++i) { + for (size_t i = 0; i < from_ghist->n_rows * from_ghist->info.row_stride; ++i) { ASSERT_EQ(from_ghist_acc.gidx_iter[i], from_sparse_acc.gidx_iter[i]); } } @@ -289,9 +429,9 @@ class EllpackPageTest : public testing::TestWithParam { }; } // namespace -TEST_P(EllpackPageTest, FromGHistIndex) { this->TestFromGHistIndex(GetParam()); } +TEST_P(SparseEllpack, FromGHistIndex) { this->TestFromGHistIndex(GetParam()); } -TEST_P(EllpackPageTest, NumNonMissing) { this->TestNumNonMissing(this->GetParam()); } +TEST_P(SparseEllpack, NumNonMissing) { this->TestNumNonMissing(this->GetParam()); } -INSTANTIATE_TEST_SUITE_P(EllpackPage, EllpackPageTest, testing::Values(.0f, .2f, .4f, .8f)); +INSTANTIATE_TEST_SUITE_P(EllpackPage, SparseEllpack, testing::Values(.0f, .2f, .4f, .8f)); } // namespace xgboost diff --git a/tests/cpp/data/test_ellpack_page_raw_format.cu b/tests/cpp/data/test_ellpack_page_raw_format.cu index a26aaedb5e07..87fd6db5fa05 100644 --- a/tests/cpp/data/test_ellpack_page_raw_format.cu +++ b/tests/cpp/data/test_ellpack_page_raw_format.cu @@ -56,7 +56,7 @@ class TestEllpackPageRawFormat : public ::testing::TestWithParam { ASSERT_EQ(loaded->Cuts().MinValues(), orig->Cuts().MinValues()); ASSERT_EQ(loaded->Cuts().Values(), orig->Cuts().Values()); ASSERT_EQ(loaded->base_rowid, orig->base_rowid); - ASSERT_EQ(loaded->row_stride, orig->row_stride); + ASSERT_EQ(loaded->info.row_stride, orig->info.row_stride); std::vector h_loaded, h_orig; [[maybe_unused]] auto h_loaded_acc = loaded->GetHostAccessor(&ctx, &h_loaded); [[maybe_unused]] auto h_orig_acc = orig->GetHostAccessor(&ctx, &h_orig); @@ -125,7 +125,8 @@ TEST_P(TestEllpackPageRawFormat, HostIO) { ASSERT_EQ(h_acc_orig.row_stride, h_acc.row_stride); ASSERT_EQ(h_acc_orig.n_rows, h_acc.n_rows); ASSERT_EQ(h_acc_orig.base_rowid, h_acc.base_rowid); - ASSERT_EQ(h_acc_orig.is_dense, h_acc.is_dense); + ASSERT_EQ(h_acc_orig.IsDenseCompressed(), h_acc.IsDenseCompressed()); + ASSERT_EQ(h_acc_orig.NullValue(), h_acc.NullValue()); } } } diff --git a/tests/cpp/data/test_extmem_quantile_dmatrix.cc b/tests/cpp/data/test_extmem_quantile_dmatrix.cc index 623637ea4ec2..691bcf3369f8 100644 --- a/tests/cpp/data/test_extmem_quantile_dmatrix.cc +++ b/tests/cpp/data/test_extmem_quantile_dmatrix.cc 
@@ -45,7 +45,8 @@ class ExtMemQuantileDMatrixCpu : public ::testing::TestWithParam { }; Context ctx; - TestExtMemQdmBasic(&ctx, false, sparsity, equal); + TestExtMemQdmBasic( + &ctx, false, sparsity, equal, [](GHistIndexMatrix const& page) { return page.IsDense(); }); } }; } // anonymous namespace diff --git a/tests/cpp/data/test_extmem_quantile_dmatrix.cu b/tests/cpp/data/test_extmem_quantile_dmatrix.cu index 3b65dffa19a1..0c0be6be8c40 100644 --- a/tests/cpp/data/test_extmem_quantile_dmatrix.cu +++ b/tests/cpp/data/test_extmem_quantile_dmatrix.cu @@ -29,10 +29,13 @@ class ExtMemQuantileDMatrixGpu : public ::testing::TestWithParam { auto equal = std::equal(h_orig.cbegin(), h_orig.cend(), h_sparse.cbegin()); ASSERT_TRUE(equal); }; + auto no_missing = [](EllpackPage const& page) { + return page.Impl()->IsDense(); + }; auto ctx = MakeCUDACtx(0); - TestExtMemQdmBasic(&ctx, true, sparsity, equal); - TestExtMemQdmBasic(&ctx, false, sparsity, equal); + TestExtMemQdmBasic(&ctx, true, sparsity, equal, no_missing); + TestExtMemQdmBasic(&ctx, false, sparsity, equal, no_missing); } }; diff --git a/tests/cpp/data/test_extmem_quantile_dmatrix.h b/tests/cpp/data/test_extmem_quantile_dmatrix.h index 25f2e06542c8..2d4958010b03 100644 --- a/tests/cpp/data/test_extmem_quantile_dmatrix.h +++ b/tests/cpp/data/test_extmem_quantile_dmatrix.h @@ -8,8 +8,9 @@ #include "../helpers.h" // for RandomDataGenerator namespace xgboost::data { -template -void TestExtMemQdmBasic(Context const* ctx, bool on_host, float sparsity, Equal&& check_equal) { +template +void TestExtMemQdmBasic(Context const* ctx, bool on_host, float sparsity, Equal&& check_equal, + NoMissing&& no_missing) { bst_idx_t n_samples = 256, n_features = 16, n_batches = 4; bst_bin_t max_bin = 64; bst_target_t n_targets = 3; @@ -31,7 +32,7 @@ void TestExtMemQdmBasic(Context const* ctx, bool on_host, float sparsity, Equal& ++batch_cnt; base_cnt += n_samples / n_batches; row_cnt += page.Size(); - ASSERT_EQ((sparsity == 0.0f), page.IsDense()); + ASSERT_EQ((sparsity == 0.0f), no_missing(page)); } ASSERT_EQ(n_batches, batch_cnt); ASSERT_EQ(p_fmat->Info().num_row_, n_samples); diff --git a/tests/cpp/data/test_iterative_dmatrix.cu b/tests/cpp/data/test_iterative_dmatrix.cu index 8d2e837ff38c..eebc38fbeee2 100644 --- a/tests/cpp/data/test_iterative_dmatrix.cu +++ b/tests/cpp/data/test_iterative_dmatrix.cu @@ -20,14 +20,15 @@ void TestEquivalent(float sparsity) { std::numeric_limits::quiet_NaN(), 0, 256); std::size_t offset = 0; auto first = (*m.GetEllpackBatches(&ctx, {}).begin()).Impl(); - std::unique_ptr page_concatenated{new EllpackPageImpl( - &ctx, first->CutsShared(), first->is_dense, first->row_stride, 1000 * 100)}; + std::unique_ptr page_concatenated{new EllpackPageImpl{ + &ctx, first->CutsShared(), first->is_dense, first->info.row_stride, 1000 * 100}}; for (auto& batch : m.GetBatches(&ctx, {})) { auto page = batch.Impl(); size_t num_elements = page_concatenated->Copy(&ctx, page, offset); offset += num_elements; } - auto from_iter = page_concatenated->GetDeviceAccessor(&ctx); + std::vector h_iter_buffer; + auto from_iter = page_concatenated->GetHostAccessor(&ctx, &h_iter_buffer); ASSERT_EQ(m.Info().num_col_, CudaArrayIterForTest::Cols()); ASSERT_EQ(m.Info().num_row_, CudaArrayIterForTest::Rows()); @@ -37,33 +38,20 @@ void TestEquivalent(float sparsity) { DMatrix::Create(&adapter, std::numeric_limits::quiet_NaN(), 0)}; auto bp = BatchParam{256, tree::TrainParam::DftSparseThreshold()}; for (auto& ellpack : dm->GetBatches(&ctx, bp)) { - auto from_data = 
ellpack.Impl()->GetDeviceAccessor(&ctx); + std::vector h_data_buffer; + auto from_data = ellpack.Impl()->GetHostAccessor(&ctx, &h_data_buffer); - std::vector cuts_from_iter(from_iter.gidx_fvalue_map.size()); - std::vector min_fvalues_iter(from_iter.min_fvalue.size()); - std::vector cut_ptrs_iter(from_iter.feature_segments.size()); - dh::CopyDeviceSpanToVector(&cuts_from_iter, from_iter.gidx_fvalue_map); - dh::CopyDeviceSpanToVector(&min_fvalues_iter, from_iter.min_fvalue); - dh::CopyDeviceSpanToVector(&cut_ptrs_iter, from_iter.feature_segments); - - std::vector cuts_from_data(from_data.gidx_fvalue_map.size()); - std::vector min_fvalues_data(from_data.min_fvalue.size()); - std::vector cut_ptrs_data(from_data.feature_segments.size()); - dh::CopyDeviceSpanToVector(&cuts_from_data, from_data.gidx_fvalue_map); - dh::CopyDeviceSpanToVector(&min_fvalues_data, from_data.min_fvalue); - dh::CopyDeviceSpanToVector(&cut_ptrs_data, from_data.feature_segments); - - ASSERT_EQ(cuts_from_iter.size(), cuts_from_data.size()); - for (size_t i = 0; i < cuts_from_iter.size(); ++i) { - EXPECT_NEAR(cuts_from_iter[i], cuts_from_data[i], kRtEps); + ASSERT_EQ(from_iter.gidx_fvalue_map.size(), from_data.gidx_fvalue_map.size()); + for (size_t i = 0; i < from_iter.gidx_fvalue_map.size(); ++i) { + EXPECT_NEAR(from_iter.gidx_fvalue_map[i], from_data.gidx_fvalue_map[i], kRtEps); } - ASSERT_EQ(min_fvalues_iter.size(), min_fvalues_data.size()); - for (size_t i = 0; i < min_fvalues_iter.size(); ++i) { - ASSERT_NEAR(min_fvalues_iter[i], min_fvalues_data[i], kRtEps); + ASSERT_EQ(from_iter.min_fvalue.size(), from_data.min_fvalue.size()); + for (size_t i = 0; i < from_iter.min_fvalue.size(); ++i) { + ASSERT_NEAR(from_iter.min_fvalue[i], from_data.min_fvalue[i], kRtEps); } - ASSERT_EQ(cut_ptrs_iter.size(), cut_ptrs_data.size()); - for (size_t i = 0; i < cut_ptrs_iter.size(); ++i) { - ASSERT_EQ(cut_ptrs_iter[i], cut_ptrs_data[i]); + ASSERT_EQ(from_iter.NumFeatures(), from_data.NumFeatures()); + for (size_t i = 0; i < from_iter.NumFeatures() + 1; ++i) { + ASSERT_EQ(from_iter.feature_segments[i], from_data.feature_segments[i]); } std::vector buffer_from_iter, buffer_from_data; @@ -122,39 +110,43 @@ TEST(IterativeDeviceDMatrix, RowMajor) { TEST(IterativeDeviceDMatrix, RowMajorMissing) { const float kMissing = std::numeric_limits::quiet_NaN(); - size_t rows = 10; - size_t cols = 2; - CudaArrayIterForTest iter(0.0f, rows, cols, 2); + bst_idx_t rows = 4; + size_t cols = 3; + CudaArrayIterForTest iter{0.0f, rows, cols, 2}; std::string interface_str = iter.AsArray(); - auto j_interface = - Json::Load({interface_str.c_str(), interface_str.size()}); - ArrayInterface<2> loaded {get(j_interface)}; + auto j_interface = Json::Load({interface_str.c_str(), interface_str.size()}); + ArrayInterface<2> loaded{get(j_interface)}; std::vector h_data(cols * rows); common::Span s_data{static_cast(loaded.data), cols * rows}; dh::CopyDeviceSpanToVector(&h_data, s_data); h_data[1] = kMissing; h_data[5] = kMissing; h_data[6] = kMissing; - auto ptr = thrust::device_ptr( - reinterpret_cast(get(j_interface["data"][0]))); + h_data[9] = kMissing; // idx = (2, 0) + h_data[10] = kMissing; // idx = (2, 1) + auto ptr = + thrust::device_ptr(reinterpret_cast(get(j_interface["data"][0]))); thrust::copy(h_data.cbegin(), h_data.cend(), ptr); - - IterativeDMatrix m(&iter, iter.Proxy(), nullptr, Reset, Next, - std::numeric_limits::quiet_NaN(), 0, 256); + IterativeDMatrix m{ + &iter, iter.Proxy(), nullptr, Reset, Next, std::numeric_limits::quiet_NaN(), 0, 256}; auto ctx = 
MakeCUDACtx(0); auto& ellpack = *m.GetBatches(&ctx, BatchParam{256, tree::TrainParam::DftSparseThreshold()}) .begin(); auto impl = ellpack.Impl(); std::vector h_gidx; - auto h_accessor = impl->GetHostAccessor(&ctx, &h_gidx); - EXPECT_EQ(h_accessor.gidx_iter[1], impl->GetDeviceAccessor(&ctx).NullValue()); - EXPECT_EQ(h_accessor.gidx_iter[5], impl->GetDeviceAccessor(&ctx).NullValue()); + auto h_acc = impl->GetHostAccessor(&ctx, &h_gidx); // null values get placed after valid values in a row - EXPECT_EQ(h_accessor.gidx_iter[7], impl->GetDeviceAccessor(&ctx).NullValue()); + ASSERT_FALSE(h_acc.IsDenseCompressed()); + ASSERT_EQ(h_acc.row_stride, cols - 1); + ASSERT_EQ(h_acc.gidx_iter[7], impl->GetDeviceAccessor(&ctx).NullValue()); + for (std::size_t i = 0; i < 7; ++i) { + ASSERT_NE(h_acc.gidx_iter[i], impl->GetDeviceAccessor(&ctx).NullValue()); + } + EXPECT_EQ(m.Info().num_col_, cols); EXPECT_EQ(m.Info().num_row_, rows); - EXPECT_EQ(m.Info().num_nonzero_, rows* cols - 3); + EXPECT_EQ(m.Info().num_nonzero_, rows * cols - 5); } TEST(IterativeDeviceDMatrix, IsDense) { diff --git a/tests/cpp/data/test_sparse_page_dmatrix.cu b/tests/cpp/data/test_sparse_page_dmatrix.cu index ff65b6ae59b3..dd26eca609fc 100644 --- a/tests/cpp/data/test_sparse_page_dmatrix.cu +++ b/tests/cpp/data/test_sparse_page_dmatrix.cu @@ -203,8 +203,8 @@ class TestEllpackPageExt : public ::testing::TestWithParamGetBatches(&ctx, param).begin()).Impl(); ASSERT_EQ(impl->base_rowid, 0); ASSERT_EQ(impl->n_rows, kRows); - ASSERT_EQ(impl->is_dense, is_dense); - ASSERT_EQ(impl->row_stride, 2); + ASSERT_EQ(impl->IsDense(), is_dense); + ASSERT_EQ(impl->info.row_stride, 2); ASSERT_EQ(impl->Cuts().TotalBins(), 4); std::unique_ptr impl_ext; @@ -213,15 +213,15 @@ class TestEllpackPageExt : public ::testing::TestWithParam(&ctx, batch.Impl()->CutsShared(), batch.Impl()->is_dense, - batch.Impl()->row_stride, kRows); + batch.Impl()->info.row_stride, kRows); } auto n_elems = impl_ext->Copy(&ctx, batch.Impl(), offset); offset += n_elems; } ASSERT_EQ(impl_ext->base_rowid, 0); ASSERT_EQ(impl_ext->n_rows, kRows); - ASSERT_EQ(impl_ext->is_dense, is_dense); - ASSERT_EQ(impl_ext->row_stride, 2); + ASSERT_EQ(impl_ext->IsDense(), is_dense); + ASSERT_EQ(impl_ext->info.row_stride, 2); ASSERT_EQ(impl_ext->Cuts().TotalBins(), 4); std::vector buffer; diff --git a/tests/cpp/test_context.cu b/tests/cpp/test_context.cu index a2322d23b1f4..5d8a67c22b05 100644 --- a/tests/cpp/test_context.cu +++ b/tests/cpp/test_context.cu @@ -28,7 +28,7 @@ void TestCUDA(Context const& ctx, bst_d_ordinal_t ord) { } } // namespace -TEST(Context, DeviceOrdinal) { +TEST(Context, MGPUDeviceOrdinal) { Context ctx; auto n_vis = curt::AllVisibleGPUs(); auto ord = n_vis - 1; @@ -77,7 +77,7 @@ TEST(Context, DeviceOrdinal) { TestCUDA(ctx, 0); } -TEST(Context, GPUId) { +TEST(Context, MGPUId) { Context ctx; ctx.UpdateAllowUnknown(Args{{"gpu_id", "0"}}); TestCUDA(ctx, 0); diff --git a/tests/cpp/tree/gpu_hist/test_histogram.cu b/tests/cpp/tree/gpu_hist/test_histogram.cu index e26c8b980649..84be979d85ad 100644 --- a/tests/cpp/tree/gpu_hist/test_histogram.cu +++ b/tests/cpp/tree/gpu_hist/test_histogram.cu @@ -61,8 +61,7 @@ TEST(Histogram, SubtractionTrack) { auto page = BuildEllpackPage(&ctx, 64, 4); auto cuts = page->CutsShared(); - FeatureGroups fg{*cuts, true, std::numeric_limits::max(), - sizeof(GradientPairPrecise)}; + FeatureGroups fg{*cuts, true, std::numeric_limits::max()}; auto fg_acc = fg.DeviceAccessor(ctx.Device()); auto n_total_bins = cuts->TotalBins(); @@ -102,14 +101,7 @@ std::vector 
GetHostHistGpair() { void TestBuildHist(bool use_shared_memory_histograms) { int const kNRows = 16, kNCols = 8; - Context ctx{MakeCUDACtx(0)}; - - TrainParam param; - Args args{ - {"max_depth", "6"}, - {"max_leaves", "0"}, - }; - param.Init(args); + auto ctx = MakeCUDACtx(0); auto page = BuildEllpackPage(&ctx, kNRows, kNCols); BatchParam batch_param{}; @@ -129,7 +121,7 @@ void TestBuildHist(bool use_shared_memory_histograms) { auto quantiser = std::make_unique(&ctx, gpair.ConstDeviceSpan(), MetaInfo()); auto shm_size = use_shared_memory_histograms ? dh::MaxSharedMemoryOptin(ctx.Ordinal()) : 0; - FeatureGroups feature_groups(page->Cuts(), page->is_dense, shm_size, sizeof(GradientPairInt64)); + FeatureGroups feature_groups(page->Cuts(), page->IsDenseCompressed(), shm_size); DeviceHistogramBuilder builder; builder.Reset(&ctx, HistMakerTrainParam::CudaDefaultNodes(), @@ -161,7 +153,8 @@ TEST(Histogram, BuildHistSharedMem) { TestBuildHist(true); } -void TestDeterministicHistogram(bool is_dense, int shm_size, bool force_global) { +namespace { +void TestDeterministicHistogram(bool is_dense, std::size_t shm_size, bool force_global) { Context ctx = MakeCUDACtx(0); size_t constexpr kBins = 256, kCols = 120, kRows = 16384, kRounds = 16; float constexpr kLower = -1e-2, kUpper = 1e2; @@ -183,7 +176,7 @@ void TestDeterministicHistogram(bool is_dense, int shm_size, bool force_global) auto gpair = GenerateRandomGradients(kRows, kLower, kUpper); gpair.SetDevice(ctx.Device()); - FeatureGroups feature_groups(page->Cuts(), page->is_dense, shm_size, sizeof(GradientPairInt64)); + FeatureGroups feature_groups{page->Cuts(), page->IsDenseCompressed(), shm_size}; auto quantiser = GradientQuantiser(&ctx, gpair.DeviceSpan(), MetaInfo()); DeviceHistogramBuilder builder; @@ -211,8 +204,7 @@ void TestDeterministicHistogram(bool is_dense, int shm_size, bool force_global) std::vector new_histogram_h(num_bins); dh::safe_cuda(cudaMemcpy(new_histogram_h.data(), d_new_histogram.data(), - num_bins * sizeof(GradientPairInt64), - cudaMemcpyDeviceToHost)); + num_bins * sizeof(GradientPairInt64), cudaMemcpyDeviceToHost)); for (size_t j = 0; j < new_histogram_h.size(); ++j) { ASSERT_EQ(new_histogram_h[j].GetQuantisedGrad(), histogram_h[j].GetQuantisedGrad()); ASSERT_EQ(new_histogram_h[j].GetQuantisedHess(), histogram_h[j].GetQuantisedHess()); @@ -236,28 +228,30 @@ void TestDeterministicHistogram(bool is_dense, int shm_size, bool force_global) std::vector baseline_h(num_bins); dh::safe_cuda(cudaMemcpy(baseline_h.data(), baseline.data().get(), - num_bins * sizeof(GradientPairInt64), - cudaMemcpyDeviceToHost)); + num_bins * sizeof(GradientPairInt64), cudaMemcpyDeviceToHost)); for (size_t i = 0; i < baseline.size(); ++i) { - EXPECT_NEAR(baseline_h[i].GetQuantisedGrad(), histogram_h[i].GetQuantisedGrad(), + ASSERT_NEAR(baseline_h[i].GetQuantisedGrad(), histogram_h[i].GetQuantisedGrad(), baseline_h[i].GetQuantisedGrad() * 1e-3); } } } } - -TEST(Histogram, GPUDeterministic) { - std::vector is_dense_array{false, true}; - std::vector shm_sizes{48 * 1024, 64 * 1024, 160 * 1024}; - for (bool is_dense : is_dense_array) { - for (int shm_size : shm_sizes) { - for (bool force_global : {true, false}) { - TestDeterministicHistogram(is_dense, shm_size, force_global); - } - } +class TestGPUDeterministic : public ::testing::TestWithParam> { + protected: + void Run() { + auto [is_dense, shm_size, force_global] = this->GetParam(); + TestDeterministicHistogram(is_dense, shm_size, force_global); } -} +}; +} // anonymous namespace + 
+TEST_P(TestGPUDeterministic, Histogram) { this->Run(); } + +INSTANTIATE_TEST_SUITE_P(Histogram, TestGPUDeterministic, + ::testing::Combine(::testing::Bool(), + ::testing::Values(48 * 1024, 64 * 1024, 160 * 1024), + ::testing::Bool())); void ValidateCategoricalHistogram(size_t n_categories, common::Span onehot, common::Span cat) { @@ -513,13 +507,7 @@ TEST_P(HistogramExternalMemoryTest, ExternalMemory) { std::apply(&HistogramExternalMemoryTest::Run, std::tuple_cat(std::make_tuple(this), GetParam())); } -INSTANTIATE_TEST_SUITE_P(Histogram, HistogramExternalMemoryTest, ::testing::ValuesIn([]() { - std::vector> params; - for (auto global : {true, false}) { - for (auto sparsity : {0.0f, 0.2f, 0.8f}) { - params.emplace_back(sparsity, global); - } - } - return params; - }())); +INSTANTIATE_TEST_SUITE_P(Histogram, HistogramExternalMemoryTest, + ::testing::Combine(::testing::Values(0.0f, 0.2f, 0.8f), + ::testing::Bool())); } // namespace xgboost::tree diff --git a/tests/cpp/tree/test_gpu_approx.cu b/tests/cpp/tree/test_gpu_approx.cu new file mode 100644 index 000000000000..7df60b8cbcd2 --- /dev/null +++ b/tests/cpp/tree/test_gpu_approx.cu @@ -0,0 +1,65 @@ +/** + * Copyright 2024, XGBoost contributors + */ +#include +#include // for Json +#include // for ObjInfo +#include // for RegTree +#include // for TreeUpdater + +#include "../../../src/tree/param.h" // for TrainParam +#include "../collective/test_worker.h" // for BaseMGPUTest +#include "../helpers.h" // for GenerateRandomGradients + +namespace xgboost::tree { +namespace { +RegTree GetApproxTree(Context const* ctx, DMatrix* dmat) { + ObjInfo task{ObjInfo::kRegression}; + std::unique_ptr approx_maker{TreeUpdater::Create("grow_gpu_approx", ctx, &task)}; + approx_maker->Configure(Args{}); + + TrainParam param; + param.UpdateAllowUnknown(Args{}); + + linalg::Matrix gpair({dmat->Info().num_row_}, ctx->Device()); + gpair.Data()->Copy(GenerateRandomGradients(dmat->Info().num_row_)); + + std::vector> position(1); + RegTree tree; + approx_maker->Update(¶m, &gpair, dmat, common::Span>{position}, + {&tree}); + return tree; +} + +void VerifyApproxColumnSplit(bst_idx_t rows, bst_feature_t cols, RegTree const& expected_tree) { + auto ctx = MakeCUDACtx(DistGpuIdx()); + + auto Xy = RandomDataGenerator{rows, cols, 0}.GenerateDMatrix(true); + auto const world_size = collective::GetWorldSize(); + auto const rank = collective::GetRank(); + std::unique_ptr sliced{Xy->SliceCol(world_size, rank)}; + + RegTree tree = GetApproxTree(&ctx, sliced.get()); + + Json json{Object{}}; + tree.SaveModel(&json); + Json expected_json{Object{}}; + expected_tree.SaveModel(&expected_json); + ASSERT_EQ(json, expected_json); +} +} // anonymous namespace + +class MGPUApproxTest : public collective::BaseMGPUTest {}; + +TEST_F(MGPUApproxTest, GPUApproxColumnSplit) { + auto constexpr kRows = 32; + auto constexpr kCols = 16; + + Context ctx(MakeCUDACtx(0)); + auto dmat = RandomDataGenerator{kRows, kCols, 0}.GenerateDMatrix(true); + RegTree expected_tree = GetApproxTree(&ctx, dmat.get()); + + this->DoTest([&] { VerifyApproxColumnSplit(kRows, kCols, expected_tree); }, true); + this->DoTest([&] { VerifyApproxColumnSplit(kRows, kCols, expected_tree); }, false); +} +} // namespace xgboost::tree diff --git a/tests/test_distributed/test_gpu_with_dask/test_gpu_external_memory.py b/tests/test_distributed/test_gpu_with_dask/test_gpu_external_memory.py new file mode 100644 index 000000000000..4e0f0bcc2f89 --- /dev/null +++ b/tests/test_distributed/test_gpu_with_dask/test_gpu_external_memory.py @@ -0,0 
+1,31 @@ +"""Copyright 2024, XGBoost contributors""" + +import pytest +from dask_cuda import LocalCUDACluster +from distributed import Client + +import xgboost as xgb +from xgboost.testing.dask import check_external_memory + + +@pytest.mark.parametrize("is_qdm", [True, False]) +def test_external_memory(is_qdm: bool) -> None: + n_workers = 2 + with LocalCUDACluster(n_workers=2) as cluster: + with Client(cluster) as client: + args = client.sync( + xgb.dask._get_rabit_args, + 2, + None, + client, + ) + + futs = client.map( + check_external_memory, + range(n_workers), + n_workers=n_workers, + device="cuda", + comm_args=args, + is_qdm=is_qdm, + ) + client.gather(futs)
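
A minimal standalone C++ sketch (not part of the patch) of the shared-memory bin budget that the updated FeatureGroups constructor now derives internally from sizeof(GradientPairInt64) instead of taking a separate bin_size argument. The struct below is a hypothetical stand-in for XGBoost's GradientPairInt64; the arithmetic matches the 48 KiB / 3072-bin figure quoted in the FeatureGroups documentation.

#include <cstddef>   // for size_t
#include <cstdint>   // for int64_t
#include <iostream>  // for cout

// Hypothetical stand-in for GradientPairInt64: two 64-bit integer gradient
// sums, i.e. 16 bytes per histogram bin.
struct GradientPairInt64Sketch {
  std::int64_t grad;
  std::int64_t hess;
};

int main() {
  // 48 KiB of shared memory per thread block, as quoted in the FeatureGroups docs.
  std::size_t shm_size = 48 * 1024;
  // Same derivation as the new FeatureGroups constructor: the number of bins
  // that fit into shared memory when each bin holds one quantised gradient pair.
  std::size_t max_shmem_bins = shm_size / sizeof(GradientPairInt64Sketch);
  std::cout << "max bins per feature group: " << max_shmem_bins << "\n";  // prints 3072
  return 0;
}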