From 48e90911e0ae865bd530a3ecb5f00c495c86ae3f Mon Sep 17 00:00:00 2001 From: Jiaming Yuan Date: Tue, 24 Sep 2024 19:45:55 +0800 Subject: [PATCH 1/6] Work on compressing sparse data. Compression info. Row stride. Add test. cleanup. Add tests. name. Cleanup. comment. used n_symbols. cleanup. Histogram. Split up tests. Write compressed. Comment. from ghist. Work on ellpack -> ghist. look into adapter case. from data. cleanup. start working on tests. check. From adapter. to ghist. dense. empty. feature groups. feature group. histogram. rename. CPU. cleanup. Enable get cut. Fix test. --- src/common/compressed_iterator.h | 29 +- src/common/hist_util.cuh | 23 - src/common/hist_util.h | 10 +- src/data/ellpack_page.cc | 6 - src/data/ellpack_page.cu | 480 +++++++++++------- src/data/ellpack_page.cuh | 128 +++-- src/data/ellpack_page.h | 1 - src/data/ellpack_page_raw_format.cu | 4 +- src/data/ellpack_page_source.cu | 4 +- src/data/ellpack_page_source.h | 6 + src/data/extmem_quantile_dmatrix.cu | 2 +- src/data/gradient_index.cu | 65 ++- src/data/iterative_dmatrix.cc | 3 + src/tree/gpu_hist/feature_groups.cu | 35 +- src/tree/gpu_hist/feature_groups.cuh | 123 ++--- src/tree/gpu_hist/gradient_based_sampler.cu | 8 +- src/tree/gpu_hist/histogram.cu | 73 +-- src/tree/updater_gpu_hist.cu | 47 +- tests/ci_build/lint_python.py | 1 + tests/cpp/data/test_ellpack_page.cu | 157 +++++- .../cpp/data/test_ellpack_page_raw_format.cu | 5 +- .../cpp/data/test_extmem_quantile_dmatrix.cc | 3 +- .../cpp/data/test_extmem_quantile_dmatrix.cu | 7 +- tests/cpp/data/test_extmem_quantile_dmatrix.h | 7 +- tests/cpp/data/test_iterative_dmatrix.cu | 42 +- tests/cpp/data/test_sparse_page_dmatrix.cu | 10 +- tests/cpp/test_context.cu | 4 +- tests/cpp/tree/gpu_hist/test_histogram.cu | 64 +-- tests/cpp/tree/test_gpu_approx.cu | 65 +++ .../test_gpu_external_memory.py | 31 ++ 30 files changed, 878 insertions(+), 565 deletions(-) create mode 100644 tests/cpp/tree/test_gpu_approx.cu create mode 100644 tests/test_distributed/test_gpu_with_dask/test_gpu_external_memory.py diff --git a/src/common/compressed_iterator.h b/src/common/compressed_iterator.h index 71d2d520264e..ab5815557f66 100644 --- a/src/common/compressed_iterator.h +++ b/src/common/compressed_iterator.h @@ -1,12 +1,11 @@ /** - * Copyright 2017-2023 by XGBoost Contributors + * Copyright 2017-2024, XGBoost Contributors * \file compressed_iterator.h */ #pragma once #include -#include -#include +#include // for ceil, log2 #include // for size_t #include "common.h" @@ -15,9 +14,7 @@ #include "device_helpers.cuh" #endif // __CUDACC__ -namespace xgboost { -namespace common { - +namespace xgboost::common { using CompressedByteT = unsigned char; namespace detail { @@ -87,13 +84,12 @@ class CompressedBufferWriter { template void WriteSymbol(CompressedByteT *buffer, T symbol, size_t offset) { - const int bits_per_byte = 8; + constexpr std::int32_t kBitsPerByte = 8; for (size_t i = 0; i < symbol_bits_; i++) { - size_t byte_idx = ((offset + 1) * symbol_bits_ - (i + 1)) / bits_per_byte; + size_t byte_idx = ((offset + 1) * symbol_bits_ - (i + 1)) / kBitsPerByte; byte_idx += detail::kPadding; - size_t bit_idx = - ((bits_per_byte + i) - ((offset + 1) * symbol_bits_)) % bits_per_byte; + size_t bit_idx = ((kBitsPerByte + i) - ((offset + 1) * symbol_bits_)) % kBitsPerByte; if (detail::CheckBit(symbol, i)) { detail::SetBit(&buffer[byte_idx], bit_idx); @@ -181,16 +177,14 @@ class CompressedIterator { typedef value_type reference; // NOLINT private: - const CompressedByteT *buffer_ {nullptr}; - size_t 
symbol_bits_ {0}; + CompressedByteT const *buffer_{nullptr}; + bst_idx_t const symbol_bits_{0}; size_t offset_ {0}; public: CompressedIterator() = default; - CompressedIterator(const CompressedByteT *buffer, size_t num_symbols) - : buffer_(buffer) { - symbol_bits_ = detail::SymbolBits(num_symbols); - } + CompressedIterator(CompressedByteT const *buffer, bst_idx_t num_symbols) + : buffer_{buffer}, symbol_bits_{detail::SymbolBits(num_symbols)} {} XGBOOST_DEVICE reference operator*() const { const int bits_per_byte = 8; @@ -218,5 +212,4 @@ class CompressedIterator { return *offset; } }; -} // namespace common -} // namespace xgboost +} // namespace xgboost::common diff --git a/src/common/hist_util.cuh b/src/common/hist_util.cuh index 66463ef2fe86..ffdafa29205c 100644 --- a/src/common/hist_util.cuh +++ b/src/common/hist_util.cuh @@ -24,29 +24,6 @@ #include "xgboost/span.h" // for IterSpan namespace xgboost::common { -namespace cuda { -/** - * copy and paste of the host version, we can't make it a __host__ __device__ function as - * the fn might be a host only or device only callable object, which is not allowed by nvcc. - */ -template -auto __device__ DispatchBinType(BinTypeSize type, Fn&& fn) { - switch (type) { - case kUint8BinsTypeSize: { - return fn(uint8_t{}); - } - case kUint16BinsTypeSize: { - return fn(uint16_t{}); - } - case kUint32BinsTypeSize: { - return fn(uint32_t{}); - } - } - SPAN_CHECK(false); - return fn(uint32_t{}); -} -} // namespace cuda - namespace detail { struct EntryCompareOp { __device__ bool operator()(const Entry& a, const Entry& b) { diff --git a/src/common/hist_util.h b/src/common/hist_util.h index 559093bb5a3f..ab2a64786827 100644 --- a/src/common/hist_util.h +++ b/src/common/hist_util.h @@ -84,6 +84,10 @@ class HistogramCuts { [[nodiscard]] bst_bin_t FeatureBins(bst_feature_t feature) const { return cut_ptrs_.ConstHostVector().at(feature + 1) - cut_ptrs_.ConstHostVector()[feature]; } + [[nodiscard]] bst_feature_t NumFeatures() const { + CHECK_EQ(this->min_vals_.Size(), this->cut_ptrs_.Size() - 1); + return this->min_vals_.Size(); + } std::vector const& Ptrs() const { return cut_ptrs_.ConstHostVector(); } std::vector const& Values() const { return cut_values_.ConstHostVector(); } @@ -101,8 +105,10 @@ class HistogramCuts { has_categorical_ = has_cat; max_cat_ = max_cat; } - - [[nodiscard]] bst_bin_t TotalBins() const { return cut_ptrs_.ConstHostVector().back(); } + /** + * @brief The total number of histogram bins (excluding min values.) 
+ */ + [[nodiscard]] bst_bin_t TotalBins() const { return this->cut_values_.Size(); } // Return the index of a cut point that is strictly greater than the input // value, or the last available index if none exists diff --git a/src/data/ellpack_page.cc b/src/data/ellpack_page.cc index 4d918358d34c..1d99380c4e7d 100644 --- a/src/data/ellpack_page.cc +++ b/src/data/ellpack_page.cc @@ -53,12 +53,6 @@ bst_idx_t EllpackPage::Size() const { "EllpackPage is required"; return 0; } - -[[nodiscard]] bool EllpackPage::IsDense() const { - LOG(FATAL) << "Internal Error: XGBoost is not compiled with CUDA but " - "EllpackPage is required"; - return false; -} } // namespace xgboost #endif // XGBOOST_USE_CUDA diff --git a/src/data/ellpack_page.cu b/src/data/ellpack_page.cu index f0c155701ead..57779aa55c91 100644 --- a/src/data/ellpack_page.cu +++ b/src/data/ellpack_page.cu @@ -1,30 +1,32 @@ /** * Copyright 2019-2024, XGBoost contributors */ -#include // for proclaim_return_type -#include -#include - -#include // for copy -#include // for move -#include // for vector - -#include "../common/algorithm.cuh" // for InclusiveScan -#include "../common/categorical.h" -#include "../common/cuda_context.cuh" +#include // for lower_bound, upper_bound +#include // for max_element +#include // for make_counting_iterator +#include // for transform_output_iterator + +#include // for copy +#include // for numeric_limits +#include // for move +#include // for vector + +#include "../common/algorithm.cuh" // for InclusiveScan +#include "../common/categorical.h" // for IsCat +#include "../common/cuda_context.cuh" // for CUDAContext #include "../common/cuda_rt_utils.h" // for SetDevice #include "../common/hist_util.cuh" // for HistogramCuts #include "../common/ref_resource_view.cuh" // for MakeFixedVecWithCudaMalloc #include "../common/transform_iterator.h" // for MakeIndexTransformIter #include "device_adapter.cuh" // for NoInfInData -#include "ellpack_page.cuh" -#include "ellpack_page.h" -#include "gradient_index.h" -#include "xgboost/data.h" +#include "ellpack_page.cuh" // for EllpackPageImpl +#include "ellpack_page.h" // for EllpackPage +#include "gradient_index.h" // for GHistIndexMatrix +#include "xgboost/context.h" // for Context +#include "xgboost/data.h" // for DMatrix namespace xgboost { - -EllpackPage::EllpackPage() : impl_{new EllpackPageImpl()} {} +EllpackPage::EllpackPage() : impl_{new EllpackPageImpl{}} {} EllpackPage::EllpackPage(Context const* ctx, DMatrix* dmat, const BatchParam& param) : impl_{new EllpackPageImpl{ctx, dmat, param}} {} @@ -43,10 +45,9 @@ void EllpackPage::SetBaseRowId(std::size_t row_id) { impl_->SetBaseRowId(row_id) } [[nodiscard]] bst_idx_t EllpackPage::BaseRowId() const { return this->Impl()->base_rowid; } -[[nodiscard]] bool EllpackPage::IsDense() const { return this->Impl()->IsDense(); } // Bin each input data entry, store the bin indices in compressed form. 
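// Editorial sketch, not part of the patch: how the CompressedBufferWriter and
// CompressedIterator from compressed_iterator.h round-trip bin indices on the
// host. The function name, symbol count, and values are made up for
// illustration; includes (<vector>, <cstdint>) are assumed from the enclosing
// file. The kernel below performs the same packing with AtomicWriteSymbol so
// that concurrent threads can safely share one buffer.
inline void CompressedRoundTripSketch() {
  std::size_t n_symbols = 256 + 1;  // e.g. 256 bins plus one null value
  std::size_t n_slots = 8;          // row_stride * n_rows for a tiny page
  std::vector<common::CompressedByteT> buf(
      common::CompressedBufferWriter::CalculateBufferSize(n_slots, n_symbols), 0);
  common::CompressedBufferWriter writer{n_symbols};
  for (std::size_t i = 0; i < n_slots; ++i) {
    // Each symbol occupies ceil(log2(n_symbols)) bits inside the byte buffer.
    writer.WriteSymbol(buf.data(), static_cast<std::uint32_t>(i * 3), i);
  }
  common::CompressedIterator<std::uint32_t> it{buf.data(), n_symbols};
  (void)it;  // it[i] now reads back i * 3
}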
-template +template __global__ void CompressBinEllpackKernel( common::CompressedBufferWriter wr, common::CompressedByteT* __restrict__ buffer, // gidx_buffer @@ -57,75 +58,134 @@ __global__ void CompressBinEllpackKernel( common::Span feature_types, size_t base_row, // batch_row_begin size_t n_rows, size_t row_stride, std::uint32_t null_gidx_value) { - size_t irow = threadIdx.x + blockIdx.x * blockDim.x; - int ifeature = threadIdx.y + blockIdx.y * blockDim.y; - if (irow >= n_rows || ifeature >= row_stride) { + auto irow = threadIdx.x + blockIdx.x * blockDim.x; + auto cpr_fidx = threadIdx.y + blockIdx.y * blockDim.y; // compressed fidx + if (irow >= n_rows || cpr_fidx >= row_stride) { return; } - int row_length = static_cast(row_ptrs[irow + 1] - row_ptrs[irow]); + auto row_length = static_cast(row_ptrs[irow + 1] - row_ptrs[irow]); std::uint32_t bin = null_gidx_value; - if (ifeature < row_length) { - Entry entry = entries[row_ptrs[irow] - row_ptrs[0] + ifeature]; - int feature = entry.index; + + // When treating a sparse matrix as dense, we need to write null values in between valid + // values. But we don't know where to write if the feature index is not recorded for a + // missing value. Here we use binary search to ensure `cpr_fidx` is the same as `fidx`. + if (kDenseCompressed && !HasNoMissing) { + auto row_beg = entries + row_ptrs[irow] - row_ptrs[0]; + auto row_end = entries + row_ptrs[irow + 1] - row_ptrs[0]; + auto it = thrust::make_transform_iterator(thrust::make_counting_iterator(0ul), + [=](std::size_t i) { return row_beg[i].index; }); + auto it_end = it + thrust::distance(row_beg, row_end); + auto res_it = thrust::lower_bound(thrust::seq, it, it_end, cpr_fidx); + if (res_it == it_end || cpr_fidx != *res_it) { + wr.AtomicWriteSymbol(buffer, bin, (irow + base_row) * row_stride + cpr_fidx); + return; + } + cpr_fidx = thrust::distance(it, res_it); + SPAN_CHECK(cpr_fidx < row_length); + } + + if (cpr_fidx < row_length) { + // We are using sub-batch of a SparsePage, need to account for the first offset within + // the sub-batch. + // + // The block.y idx is calculated using row_stride, which is the longest row. We can + // use `compressed_fidx` to fully index the sparse page row. + Entry entry = entries[row_ptrs[irow] - row_ptrs[0] + cpr_fidx]; + + bst_feature_t fidx = entry.index; float fvalue = entry.fvalue; - // {feature_cuts, ncuts} forms the array of cuts of `feature'. - const float* feature_cuts = &cuts[cut_ptrs[feature]]; - int ncuts = cut_ptrs[feature + 1] - cut_ptrs[feature]; - bool is_cat = common::IsCat(feature_types, ifeature); + // {feature_cuts, n_cuts} forms the array of cuts of the current `feature'. + float const* feature_cuts = &cuts[cut_ptrs[fidx]]; + auto n_cuts = cut_ptrs[fidx + 1] - cut_ptrs[fidx]; + + bool is_cat = common::IsCat(feature_types, fidx); // Assigning the bin in current entry. // S.t.: fvalue < feature_cuts[bin] + bin = std::numeric_limits::max(); if (is_cat) { auto it = dh::MakeTransformIterator(feature_cuts, [](float v) { return common::AsCat(v); }); - bin = thrust::lower_bound(thrust::seq, it, it + ncuts, common::AsCat(fvalue)) - it; + bin = thrust::lower_bound(thrust::seq, it, it + n_cuts, common::AsCat(fvalue)) - it; } else { - bin = thrust::upper_bound(thrust::seq, feature_cuts, feature_cuts + ncuts, fvalue) - + bin = thrust::upper_bound(thrust::seq, feature_cuts, feature_cuts + n_cuts, fvalue) - feature_cuts; } - if (bin >= ncuts) { - bin = ncuts - 1; + if (bin >= n_cuts) { + bin = n_cuts - 1; } - // Add the number of bins in previous features. 
- if (!kIsDense) { - bin += cut_ptrs[feature]; + if (!kDenseCompressed) { + // Sparse data, use the compressed fidx. Add the number of bins in previous + // features since we can't compresse it based on feature-local index. + bin += cut_ptrs[fidx]; + } else { + // Write to the actual fidx for dense data. + cpr_fidx = fidx; } } - // Write to gidx buffer. - wr.AtomicWriteSymbol(buffer, bin, (irow + base_row) * row_stride + ifeature); + // Write to the gidx buffer for non-missing values. + wr.AtomicWriteSymbol(buffer, bin, (irow + base_row) * row_stride + cpr_fidx); } namespace { // Calculate the number of symbols for the compressed ellpack. Similar to what the CPU // implementation does, we compress the dense data by subtracting the bin values with the -// starting bin of its feature. -[[nodiscard]] std::size_t CalcNumSymbols(Context const* ctx, bool is_dense, - std::shared_ptr cuts) { - // Cut values can be empty when the input data is empty. - if (!is_dense || cuts->cut_values_.Empty()) { - // Return the total number of symbols (total number of bins plus 1 for not found) - return cuts->cut_values_.Size() + 1; +// starting bin of its feature if it's dense. +[[nodiscard]] EllpackPageImpl::Info CalcNumSymbols( + Context const* ctx, bst_idx_t n_samples, bst_idx_t row_stride, bool is_dense, + std::shared_ptr cuts) { + // Return the total number of symbols (total number of bins plus 1 for missing) + // The null value equals the total number of bins. + bst_idx_t n_symbols = cuts->TotalBins() + 1; + if (n_symbols == 1) { // Empty DMatrix + return {static_cast(0), n_symbols}; } + bst_idx_t n_features = cuts->NumFeatures(); cuts->cut_ptrs_.SetDevice(ctx->Device()); common::Span dptrs = cuts->cut_ptrs_.ConstDeviceSpan(); - auto cuctx = ctx->CUDACtx(); using PtrT = typename decltype(dptrs)::value_type; - auto it = dh::MakeTransformIterator( - thrust::make_counting_iterator(1ul), - [=] XGBOOST_DEVICE(std::size_t i) { return dptrs[i] - dptrs[i - 1]; }); - CHECK_GE(dptrs.size(), 2); - auto max_it = thrust::max_element(cuctx->CTP(), it, it + dptrs.size() - 1); - dh::CachingDeviceUVector max_element(1); - auto d_me = max_element.data(); - dh::LaunchN(1, cuctx->Stream(), [=] XGBOOST_DEVICE(std::size_t i) { d_me[i] = *max_it; }); - PtrT h_me{0}; - dh::safe_cuda( - cudaMemcpyAsync(&h_me, d_me, sizeof(PtrT), cudaMemcpyDeviceToHost, cuctx->Stream())); - cuctx->Stream().Sync(); - // No missing, hence no null value, hence no + 1 symbol. - // FIXME(jiamingy): When we extend this to use a sparsity threshold, +1 is needed back. - return h_me; + + // Calculate the number of required symbols if we treat the data as dense. + PtrT n_symbols_dense{0}; + { + CUDAContext const* cuctx = ctx->CUDACtx(); + auto it = dh::MakeTransformIterator( + thrust::make_counting_iterator(1ul), + [=] XGBOOST_DEVICE(std::size_t i) { return dptrs[i] - dptrs[i - 1]; }); + CHECK_GE(dptrs.size(), 2); + auto max_it = thrust::max_element(cuctx->CTP(), it, it + dptrs.size() - 1); + dh::CachingDeviceUVector max_element(1); + auto d_me = max_element.data(); + dh::LaunchN(1, cuctx->Stream(), [=] XGBOOST_DEVICE(std::size_t i) { d_me[i] = *max_it; }); + + dh::safe_cuda(cudaMemcpyAsync(&n_symbols_dense, d_me, sizeof(PtrT), cudaMemcpyDeviceToHost, + cuctx->Stream())); + cuctx->Stream().Sync(); + } + + CHECK_LE(row_stride, n_features); + // Treat the ellpack as dense if we can save memory. 
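  // Editorial worked example, not part of the patch: with 100 features of 255
  // bins each, 1e6 samples, and row_stride = 90, the sparse layout needs
  // ceil(log2(100 * 255 + 1)) = 15 bits per slot over 90e6 slots (~169 MB),
  // while the dense-compressed layout needs ceil(log2(255 + 1)) = 8 bits per
  // slot over 100e6 slots (~100 MB), so storing explicit nulls is still the
  // cheaper option in that case.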
+ auto calc_n_bytes = [&](bst_idx_t row_stride, auto n_symbols) { + return common::CompressedBufferWriter::CalculateBufferSize(row_stride * n_samples, n_symbols); + }; + auto as_dense = [&] { + bool no_compression = n_features == row_stride; + bool less = calc_n_bytes(n_features, n_symbols_dense + 1) < calc_n_bytes(row_stride, n_symbols); + return no_compression || less; + }; + + if (is_dense) { + // No missing, hence no null value, hence no + 1 symbol. + LOG(INFO) << "Ellpack is dense."; + return {n_features, n_symbols_dense}; + } else if (as_dense()) { + LOG(INFO) << "Ellpack is relatively dense."; + return {n_features, n_symbols_dense + 1}; // +1 for missing value (null in ellpack) + } else { + LOG(INFO) << "Ellpack is sparse."; + return {row_stride, n_symbols}; + } } } // namespace @@ -134,10 +194,9 @@ EllpackPageImpl::EllpackPageImpl(Context const* ctx, std::shared_ptr cuts, bool is_dense, bst_idx_t row_stride, bst_idx_t n_rows) : is_dense{is_dense}, - cuts_{std::move(cuts)}, - row_stride{row_stride}, n_rows{n_rows}, - n_symbols_{CalcNumSymbols(ctx, this->is_dense, this->cuts_)} { + cuts_{std::move(cuts)}, + info{CalcNumSymbols(ctx, n_rows, row_stride, is_dense, this->cuts_)} { monitor_.Init("ellpack_page"); curt::SetDevice(ctx->Ordinal()); @@ -148,11 +207,10 @@ EllpackPageImpl::EllpackPageImpl(Context const* ctx, std::shared_ptr cuts, const SparsePage& page, bool is_dense, size_t row_stride, common::Span feature_types) - : cuts_{std::move(cuts)}, - is_dense{is_dense}, + : is_dense{is_dense}, n_rows{page.Size()}, - row_stride{row_stride}, - n_symbols_{CalcNumSymbols(ctx, this->is_dense, this->cuts_)} { + cuts_{std::move(cuts)}, + info{CalcNumSymbols(ctx, n_rows, row_stride, is_dense, this->cuts_)} { monitor_.Init("ellpack_page"); curt::SetDevice(ctx->Ordinal()); @@ -164,14 +222,13 @@ EllpackPageImpl::EllpackPageImpl(Context const* ctx, EllpackPageImpl::EllpackPageImpl(Context const* ctx, DMatrix* p_fmat, const BatchParam& param) : is_dense{p_fmat->IsDense()}, n_rows{p_fmat->Info().num_row_}, - row_stride{GetRowStride(p_fmat)}, // Create the quantile sketches for the dmatrix and initialize HistogramCuts. cuts_{param.hess.empty() ? 
std::make_shared( common::DeviceSketch(ctx, p_fmat, param.max_bin)) : std::make_shared( common::DeviceSketchWithHessian(ctx, p_fmat, param.max_bin, param.hess))}, - n_symbols_{CalcNumSymbols(ctx, this->is_dense, this->cuts_)} { + info{CalcNumSymbols(ctx, n_rows, GetRowStride(p_fmat), p_fmat->IsDense(), this->cuts_)} { monitor_.Init("ellpack_page"); curt::SetDevice(ctx->Ordinal()); @@ -179,15 +236,13 @@ EllpackPageImpl::EllpackPageImpl(Context const* ctx, DMatrix* p_fmat, const Batc p_fmat->Info().feature_types.SetDevice(ctx->Device()); auto ft = p_fmat->Info().feature_types.ConstDeviceSpan(); - monitor_.Start("BinningCompression"); CHECK(p_fmat->SingleColBlock()); for (auto const& page : p_fmat->GetBatches()) { this->CreateHistIndices(ctx, page, ft); } - monitor_.Stop("BinningCompression"); } -template +template struct WriteCompressedEllpackFunctor { WriteCompressedEllpackFunctor(common::CompressedByteT* buffer, const common::CompressedBufferWriter& writer, AdapterBatchT batch, @@ -208,22 +263,40 @@ struct WriteCompressedEllpackFunctor { common::Span feature_types; data::IsValidFunctor is_valid; - using Tuple = thrust::tuple; - __device__ size_t operator()(Tuple out) { + // Tuple[0] = The row index of the input, used as a key to define segments + // Tuple[1] = Scanned flags of valid elements for each row + // Tuple[2] = The index in the input data + using Tuple = thrust::tuple; + + template + __device__ void Write(data::COOTuple const& e, bst_idx_t out_position) { + bst_bin_t bin_idx = 0; + if (common::IsCat(feature_types, e.column_idx)) { + bin_idx = accessor.SearchBin(e.value, e.column_idx); + } else { + bin_idx = accessor.SearchBin(e.value, e.column_idx); + } + if constexpr (kIsDense) { + bin_idx -= accessor.feature_segments[e.column_idx]; + } + writer.AtomicWriteSymbol(d_buffer, bin_idx, out_position); + } + // Used for dense or as dense data. + __device__ void operator()(bst_idx_t i) { + auto e = batch.GetElement(i); + if (is_valid(e)) { + this->Write(e, i); + } else { + writer.AtomicWriteSymbol(d_buffer, accessor.NullValue(), i); + } + } + // Used for sparse data. + __device__ size_t operator()(Tuple const& out) { auto e = batch.GetElement(thrust::get<2>(out)); if (is_valid(e)) { // -1 because the scan is inclusive size_t output_position = accessor.row_stride * e.row_idx + thrust::get<1>(out) - 1; - uint32_t bin_idx = 0; - if (common::IsCat(feature_types, e.column_idx)) { - bin_idx = accessor.SearchBin(e.value, e.column_idx); - } else { - bin_idx = accessor.SearchBin(e.value, e.column_idx); - } - if (kIsDense) { - bin_idx -= accessor.feature_segments[e.column_idx]; - } - writer.AtomicWriteSymbol(d_buffer, bin_idx, output_position); + this->Write(e, output_position); } return 0; } @@ -248,86 +321,79 @@ template void CopyDataToEllpack(Context const* ctx, const AdapterBatchT& batch, common::Span feature_types, EllpackPageImpl* dst, float missing) { - // Some witchcraft happens here - // The goal is to copy valid elements out of the input to an ELLPACK matrix - // with a given row stride, using no extra working memory Standard stream - // compaction needs to be modified to do this, so we manually define a - // segmented stream compaction via operators on an inclusive scan. 
The output - // of this inclusive scan is fed to a custom function which works out the - // correct output position - auto counting = thrust::make_counting_iterator(0llu); data::IsValidFunctor is_valid(missing); bool valid = data::NoInfInData(batch, is_valid); CHECK(valid) << error::InfInData(); - auto key_iter = dh::MakeTransformIterator( - counting, - [=] __device__(size_t idx) { - return batch.GetElement(idx).row_idx; - }); - auto value_iter = dh::MakeTransformIterator( - counting, - [=] __device__(size_t idx) -> size_t { - return is_valid(batch.GetElement(idx)); - }); - - auto key_value_index_iter = - thrust::make_zip_iterator(thrust::make_tuple(key_iter, value_iter, counting)); - - // Tuple[0] = The row index of the input, used as a key to define segments - // Tuple[1] = Scanned flags of valid elements for each row - // Tuple[2] = The index in the input data - using Tuple = thrust::tuple; - - auto device_accessor = dst->GetDeviceAccessor(ctx); + auto cnt = thrust::make_counting_iterator(0llu); auto n_symbols = dst->NumSymbols(); - common::CompressedBufferWriter writer{n_symbols}; auto d_compressed_buffer = dst->gidx_buffer.data(); // We redirect the scan output into this functor to do the actual writing + using Tuple = typename WriteCompressedEllpackFunctor::Tuple; dh::TypedDiscard discard; - WriteCompressedEllpackFunctor functor{ + auto device_accessor = dst->GetDeviceAccessor(ctx); + WriteCompressedEllpackFunctor functor{ d_compressed_buffer, writer, batch, device_accessor, feature_types, is_valid}; - thrust::transform_output_iterator out(discard, functor); + // For dense compressed data, we can simply copy the data with the input position. + if (kIsDense) { + CHECK_EQ(batch.NumCols(), dst->info.row_stride); + thrust::for_each_n(ctx->CUDACtx()->CTP(), cnt, dst->Size() * dst->info.row_stride, functor); + return; + } + + // Some witchcraft happens here. + // + // The goal is to copy valid elements out of the input to an ELLPACK matrix with a given + // row stride, using no extra working memory Standard stream compaction needs to be + // modified to do this, so we manually define a segmented stream compaction via + // operators on an inclusive scan. 
The output of this inclusive scan is fed to a custom + // function which works out the correct output position + auto key_iter = dh::MakeTransformIterator( + cnt, [=] __device__(size_t idx) { return batch.GetElement(idx).row_idx; }); + auto value_iter = dh::MakeTransformIterator( + cnt, [=] __device__(size_t idx) -> size_t { return is_valid(batch.GetElement(idx)); }); + + auto key_value_index_iter = + thrust::make_zip_iterator(thrust::make_tuple(key_iter, value_iter, cnt)); + thrust::transform_output_iterator out(discard, functor); common::InclusiveScan(ctx, key_value_index_iter, out, TupleScanOp{}, batch.Size()); } void WriteNullValues(Context const* ctx, EllpackPageImpl* dst, common::Span row_counts) { // Write the null values - auto device_accessor = dst->GetDeviceAccessor(ctx); + auto null = dst->GetDeviceAccessor(ctx).NullValue(); common::CompressedBufferWriter writer(dst->NumSymbols()); auto d_compressed_buffer = dst->gidx_buffer.data(); - auto row_stride = dst->row_stride; - dh::LaunchN(row_stride * dst->n_rows, ctx->CUDACtx()->Stream(), [=] __device__(bst_idx_t idx) { - // For some reason this variable got captured as const - auto writer_non_const = writer; + auto row_stride = dst->info.row_stride; + auto n = row_stride * dst->n_rows; + dh::LaunchN(n, ctx->CUDACtx()->Stream(), [=] __device__(bst_idx_t idx) mutable { size_t row_idx = idx / row_stride; size_t row_offset = idx % row_stride; if (row_offset >= row_counts[row_idx]) { - writer_non_const.AtomicWriteSymbol(d_compressed_buffer, device_accessor.NullValue(), idx); + writer.AtomicWriteSymbol(d_compressed_buffer, null, idx); } }); } template EllpackPageImpl::EllpackPageImpl(Context const* ctx, AdapterBatch batch, float missing, - bool is_dense, common::Span row_counts_span, + bool is_dense, common::Span row_counts, common::Span feature_types, size_t row_stride, bst_idx_t n_rows, std::shared_ptr cuts) : EllpackPageImpl{ctx, cuts, is_dense, row_stride, n_rows} { curt::SetDevice(ctx->Ordinal()); - if (this->IsDense()) { + if (this->IsDenseCompressed()) { CopyDataToEllpack(ctx, batch, feature_types, this, missing); } else { CopyDataToEllpack(ctx, batch, feature_types, this, missing); + WriteNullValues(ctx, this, row_counts); } - - WriteNullValues(ctx, this, row_counts_span); } #define ELLPACK_BATCH_SPECIALIZE(__BATCH_T) \ @@ -339,10 +405,14 @@ EllpackPageImpl::EllpackPageImpl(Context const* ctx, AdapterBatch batch, float m ELLPACK_BATCH_SPECIALIZE(data::CudfAdapterBatch) ELLPACK_BATCH_SPECIALIZE(data::CupyAdapterBatch) +#undef ELLPACK_BATCH_SPECIALIZE + namespace { +template void CopyGHistToEllpack(Context const* ctx, GHistIndexMatrix const& page, common::Span d_row_ptr, bst_idx_t row_stride, bst_bin_t null, bst_idx_t n_symbols, + common::Span d_cut_ptrs, common::CompressedByteT* d_compressed_buffer) { dh::device_vector data(page.index.begin(), page.index.end()); auto d_data = dh::ToSpan(data); @@ -350,43 +420,59 @@ void CopyGHistToEllpack(Context const* ctx, GHistIndexMatrix const& page, // GPU employs the same dense compression as CPU, no need to handle page.index.Offset() auto bin_type = page.index.GetBinTypeSize(); common::CompressedBufferWriter writer{n_symbols}; - auto cuctx = ctx->CUDACtx(); - dh::LaunchN(row_stride * page.Size(), cuctx->Stream(), [=] __device__(bst_idx_t idx) mutable { - auto ridx = idx / row_stride; - auto ifeature = idx % row_stride; + bool dense_compress = row_stride == page.Features() && !page.IsDense(); + auto n_samples = page.Size(); + auto cnt = thrust::make_counting_iterator(0ul); + auto ptr = 
reinterpret_cast(d_data.data()); + auto fn = [=] __device__(std::size_t i) mutable { + auto [ridx, fidx] = linalg::UnravelIndex(i, n_samples, row_stride); auto r_begin = d_row_ptr[ridx]; auto r_end = d_row_ptr[ridx + 1]; auto r_size = r_end - r_begin; - if (ifeature >= r_size) { - writer.AtomicWriteSymbol(d_compressed_buffer, null, idx); - return; + bst_bin_t bin_idx; + if (dense_compress) { + auto f_begin = d_cut_ptrs[fidx]; + auto f_end = d_cut_ptrs[fidx + 1]; + // CPU gidx is not compressed, can be used for binary search. + bin_idx = common::BinarySearchBin(r_begin, r_end, ptr, f_begin, f_end); + if (bin_idx == -1) { + bin_idx = null; + } else { + bin_idx -= d_cut_ptrs[fidx]; + } + } else if (fidx >= r_size) { + bin_idx = null; + } else { + bin_idx = ptr[r_begin + fidx]; } - common::cuda::DispatchBinType(bin_type, [&](auto t) { - using T = decltype(t); - auto ptr = reinterpret_cast(d_data.data()); - auto bin_idx = ptr[r_begin + ifeature]; - writer.AtomicWriteSymbol(d_compressed_buffer, bin_idx, idx); - }); - }); + writer.AtomicWriteSymbol(d_compressed_buffer, bin_idx, i); + }; + thrust::for_each_n(cuctx->CTP(), cnt, row_stride * page.Size(), fn); } } // anonymous namespace EllpackPageImpl::EllpackPageImpl(Context const* ctx, GHistIndexMatrix const& page, common::Span ft) : is_dense{page.IsDense()}, - row_stride{[&] { - auto it = common::MakeIndexTransformIter( - [&](bst_idx_t i) { return page.row_ptr[i + 1] - page.row_ptr[i]; }); - return *std::max_element(it, it + page.Size()); - }()}, base_rowid{page.base_rowid}, n_rows{page.Size()}, - cuts_{std::make_shared(page.cut)}, - n_symbols_{CalcNumSymbols(ctx, page.IsDense(), cuts_)} { + cuts_{[&] { + auto cuts = std::make_shared(page.cut); + cuts->SetDevice(ctx->Device()); + return cuts; + }()}, + info{CalcNumSymbols( + ctx, n_rows, + [&] { + auto it = common::MakeIndexTransformIter( + [&](bst_idx_t i) { return page.row_ptr[i + 1] - page.row_ptr[i]; }); + return *std::max_element(it, it + page.Size()); + }(), + page.IsDense(), cuts_)} { this->monitor_.Init("ellpack_page"); CHECK(ctx->IsCUDA()); this->InitCompressedData(ctx); @@ -400,8 +486,12 @@ EllpackPageImpl::EllpackPageImpl(Context const* ctx, GHistIndexMatrix const& pag auto accessor = this->GetDeviceAccessor(ctx, ft); this->monitor_.Start("CopyGHistToEllpack"); - CopyGHistToEllpack(ctx, page, d_row_ptr, row_stride, accessor.NullValue(), this->NumSymbols(), - d_compressed_buffer); + common::DispatchBinType(page.index.GetBinTypeSize(), [&](auto t) { + using T = decltype(t); + CopyGHistToEllpack(ctx, page, d_row_ptr, this->info.row_stride, accessor.NullValue(), + this->NumSymbols(), this->cuts_->cut_ptrs_.ConstDeviceSpan(), + d_compressed_buffer); + }); this->monitor_.Stop("CopyGHistToEllpack"); } @@ -432,13 +522,14 @@ struct CopyPage { // Copy the data from the given EllpackPage to the current page. 
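// Editorial sketch, not part of the patch: recovering a global bin index from
// a dense-compressed ELLPACK slot. Values are stored feature-locally (the
// feature's starting cut is subtracted on write, as in CopyGHistToEllpack
// above), and the null symbol marks a missing value, so the decode mirrors
// EllpackDeviceAccessor::GetBinIndex. The function name and parameter names
// are illustrative, not part of the code base.
inline bst_bin_t DecodeDenseCompressedSketch(bst_bin_t stored, bst_feature_t fidx,
                                             common::Span<std::uint32_t const> feature_segments,
                                             bst_bin_t null_value) {
  if (stored == null_value) {
    return -1;  // missing entry in an otherwise dense row
  }
  // Add back the feature's starting cut to obtain the global bin index.
  return stored + static_cast<bst_bin_t>(feature_segments[fidx]);
}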
bst_idx_t EllpackPageImpl::Copy(Context const* ctx, EllpackPageImpl const* page, bst_idx_t offset) { monitor_.Start(__func__); - bst_idx_t num_elements = page->n_rows * page->row_stride; - CHECK_EQ(this->row_stride, page->row_stride); + bst_idx_t num_elements = page->n_rows * page->info.row_stride; + CHECK_EQ(this->info.row_stride, + page->info.row_stride); // fixme: make sure row strides are correctly handled CHECK_EQ(NumSymbols(), page->NumSymbols()); - CHECK_GE(this->n_rows * this->row_stride, offset + num_elements); + CHECK_GE(this->n_rows * this->info.row_stride, offset + num_elements); if (page == this) { LOG(FATAL) << "Concatenating the same Ellpack."; - return this->n_rows * this->row_stride; + return this->n_rows * this->info.row_stride; } dh::LaunchN(num_elements, ctx->CUDACtx()->Stream(), CopyPage{this, page, offset}); monitor_.Stop(__func__); @@ -471,7 +562,7 @@ struct CompactPage { src_iterator_d{src->gidx_buffer.data(), src->NumSymbols()}, row_indexes(row_indexes), base_rowid{src->base_rowid}, - row_stride{src->row_stride} {} + row_stride{src->info.row_stride} {} __device__ void operator()(bst_idx_t row_id) { size_t src_row = base_rowid + row_id; @@ -491,8 +582,8 @@ struct CompactPage { void EllpackPageImpl::Compact(Context const* ctx, EllpackPageImpl const* page, common::Span row_indexes) { monitor_.Start(__func__); - CHECK_EQ(row_stride, page->row_stride); - CHECK_EQ(NumSymbols(), page->NumSymbols()); + CHECK_EQ(this->info.row_stride, page->info.row_stride); // fixme: check in iterative dmatrix + CHECK_EQ(this->NumSymbols(), page->NumSymbols()); CHECK_LE(page->base_rowid + page->n_rows, row_indexes.size()); auto cuctx = ctx->CUDACtx(); dh::LaunchN(page->n_rows, cuctx->Stream(), CompactPage{this, page, row_indexes}); @@ -508,35 +599,37 @@ void EllpackPageImpl::InitCompressedData(Context const* ctx) { monitor_.Start(__func__); auto num_symbols = this->NumSymbols(); // Required buffer size for storing data matrix in ELLPack format. - std::size_t compressed_size_bytes = - common::CompressedBufferWriter::CalculateBufferSize(row_stride * n_rows, num_symbols); + std::size_t compressed_size_bytes = common::CompressedBufferWriter::CalculateBufferSize( + this->info.row_stride * this->n_rows, num_symbols); auto init = static_cast(0); gidx_buffer = common::MakeFixedVecWithCudaMalloc(ctx, compressed_size_bytes, init); monitor_.Stop(__func__); } // Compress a CSR page into ELLPACK. 
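// Editorial worked example, not part of the patch: the batching heuristic in
// CreateHistIndices below caps each compression pass at roughly
// TotalMemory / (16 * row_stride * sizeof(Entry)) rows. With 16 GB of device
// memory, row_stride = 100, and sizeof(Entry) = 8 bytes, that is about
// 16e9 / (16 * 100 * 8) = 1.25e6 rows per batch, so very large pages are
// compressed over several kernel launches instead of one.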
-void EllpackPageImpl::CreateHistIndices(Context const* ctx, - const SparsePage& row_batch, +void EllpackPageImpl::CreateHistIndices(Context const* ctx, const SparsePage& row_batch, common::Span feature_types) { if (row_batch.Size() == 0) { return; } + + this->monitor_.Start(__func__); auto null_gidx_value = this->GetDeviceAccessor(ctx, feature_types).NullValue(); auto const& offset_vec = row_batch.offset.ConstHostVector(); // bin and compress entries in batches of rows size_t gpu_batch_nrows = - std::min(dh::TotalMemory(ctx->Ordinal()) / (16 * row_stride * sizeof(Entry)), + std::min(dh::TotalMemory(ctx->Ordinal()) / (16 * this->info.row_stride * sizeof(Entry)), static_cast(row_batch.Size())); size_t gpu_nbatches = common::DivRoundUp(row_batch.Size(), gpu_batch_nrows); + auto writer = common::CompressedBufferWriter{this->NumSymbols()}; + auto gidx_buffer_data = gidx_buffer.data(); for (size_t gpu_batch = 0; gpu_batch < gpu_nbatches; ++gpu_batch) { size_t batch_row_begin = gpu_batch * gpu_batch_nrows; - size_t batch_row_end = - std::min((gpu_batch + 1) * gpu_batch_nrows, row_batch.Size()); + size_t batch_row_end = std::min((gpu_batch + 1) * gpu_batch_nrows, row_batch.Size()); size_t batch_nrows = batch_row_end - batch_row_begin; const auto ent_cnt_begin = offset_vec[batch_row_begin]; @@ -569,40 +662,45 @@ void EllpackPageImpl::CreateHistIndices(Context const* ctx, const dim3 block3(32, 8, 1); // 256 threads const dim3 grid3(common::DivRoundUp(batch_nrows, block3.x), - common::DivRoundUp(row_stride, block3.y), 1); + common::DivRoundUp(this->info.row_stride, block3.y), 1); auto device_accessor = this->GetDeviceAccessor(ctx); auto launcher = [&](auto kernel) { dh::LaunchKernel{grid3, block3, 0, ctx->CUDACtx()->Stream()}( // NOLINT - kernel, common::CompressedBufferWriter(this->NumSymbols()), gidx_buffer.data(), - row_ptrs.data(), entries_d.data(), device_accessor.gidx_fvalue_map.data(), - device_accessor.feature_segments.data(), feature_types, batch_row_begin, batch_nrows, - row_stride, null_gidx_value); + kernel, writer, gidx_buffer_data, row_ptrs.data(), entries_d.data(), + device_accessor.gidx_fvalue_map.data(), device_accessor.feature_segments, feature_types, + batch_row_begin, batch_nrows, this->info.row_stride, null_gidx_value); }; if (this->IsDense()) { - launcher(CompressBinEllpackKernel); + launcher(CompressBinEllpackKernel); } else { - launcher(CompressBinEllpackKernel); + if (this->IsDenseCompressed()) { + launcher(CompressBinEllpackKernel); + } else { + launcher(CompressBinEllpackKernel); + } } } + this->monitor_.Stop(__func__); } // Return the number of rows contained in this page. [[nodiscard]] bst_idx_t EllpackPageImpl::Size() const { return n_rows; } std::size_t EllpackPageImpl::MemCostBytes() const { - return this->gidx_buffer.size_bytes() + sizeof(this->n_rows) + sizeof(this->is_dense) + - sizeof(this->row_stride) + sizeof(this->base_rowid) + sizeof(this->n_symbols_); + return this->gidx_buffer.size_bytes() + sizeof(this->is_dense) + sizeof(this->n_rows) + + sizeof(this->base_rowid) + sizeof(this->info); } EllpackDeviceAccessor EllpackPageImpl::GetDeviceAccessor( Context const* ctx, common::Span feature_types) const { + auto null = this->IsDense() ? 
this->NumSymbols() : this->NumSymbols() - 1; return {ctx, - cuts_, - is_dense, - row_stride, - base_rowid, - n_rows, - common::CompressedIterator(gidx_buffer.data(), this->NumSymbols()), + this->cuts_, + this->info.row_stride, + this->base_rowid, + this->n_rows, + common::CompressedIterator{gidx_buffer.data(), this->NumSymbols()}, + null, feature_types}; } @@ -615,26 +713,28 @@ EllpackDeviceAccessor EllpackPageImpl::GetHostAccessor( dh::safe_cuda(cudaMemcpyAsync(h_gidx_buffer->data(), gidx_buffer.data(), gidx_buffer.size_bytes(), cudaMemcpyDefault, ctx->CUDACtx()->Stream())); Context cpu_ctx; + auto null = this->IsDense() ? this->NumSymbols() : this->NumSymbols() - 1; return {ctx->IsCPU() ? ctx : &cpu_ctx, - cuts_, - is_dense, - row_stride, - base_rowid, - n_rows, - common::CompressedIterator(h_gidx_buffer->data(), this->NumSymbols()), + this->cuts_, + this->info.row_stride, + this->base_rowid, + this->n_rows, + common::CompressedIterator{h_gidx_buffer->data(), this->NumSymbols()}, + null, feature_types}; } [[nodiscard]] bst_idx_t EllpackPageImpl::NumNonMissing( Context const* ctx, common::Span feature_types) const { + if (this->IsDense()) { + return this->n_rows * this->info.row_stride; + } auto d_acc = this->GetDeviceAccessor(ctx, feature_types); using T = typename decltype(d_acc.gidx_iter)::value_type; auto it = thrust::make_transform_iterator( thrust::make_counting_iterator(0ull), - cuda::proclaim_return_type([=] __device__(std::size_t i) { return d_acc.gidx_iter[i]; })); - auto nnz = thrust::count_if(ctx->CUDACtx()->CTP(), it, it + d_acc.row_stride * d_acc.n_rows, - cuda::proclaim_return_type( - [=] __device__(T gidx) { return gidx != d_acc.NullValue(); })); - return nnz; + [=] XGBOOST_DEVICE(std::size_t i) { return d_acc.gidx_iter[i]; }); + return thrust::count_if(ctx->CUDACtx()->CTP(), it, it + d_acc.row_stride * d_acc.n_rows, + [=] XGBOOST_DEVICE(T gidx) -> bool { return gidx != d_acc.NullValue(); }); } } // namespace xgboost diff --git a/src/data/ellpack_page.cuh b/src/data/ellpack_page.cuh index 78641c5ac9c7..7a2ce40add9d 100644 --- a/src/data/ellpack_page.cuh +++ b/src/data/ellpack_page.cuh @@ -24,7 +24,7 @@ namespace xgboost { */ struct EllpackDeviceAccessor { /** @brief Whether or not if the matrix is dense. */ - bool is_dense; + bst_idx_t null_value; /** @brief Row length for ELLPACK, equal to number of features when the data is dense. */ bst_idx_t row_stride; /** @brief Starting index of the rows. Used for external memory. */ @@ -36,7 +36,7 @@ struct EllpackDeviceAccessor { /** @brief Minimum value for each feature. Size equals to number of features. */ common::Span min_fvalue; /** @brief Histogram cut pointers. Size equals to (number of features + 1). */ - common::Span feature_segments; + std::uint32_t const* feature_segments; /** @brief Histogram cut values. Size equals to (bins per feature * number of features). */ common::Span gidx_fvalue_map; /** @brief Type of each feature, categorical or numerical. 
*/ @@ -44,10 +44,10 @@ struct EllpackDeviceAccessor { EllpackDeviceAccessor() = delete; EllpackDeviceAccessor(Context const* ctx, std::shared_ptr cuts, - bool is_dense, bst_idx_t row_stride, bst_idx_t base_rowid, bst_idx_t n_rows, - common::CompressedIterator gidx_iter, + bst_idx_t row_stride, bst_idx_t base_rowid, bst_idx_t n_rows, + common::CompressedIterator gidx_iter, bst_idx_t null_value, common::Span feature_types) - : is_dense{is_dense}, + : null_value{null_value}, row_stride{row_stride}, base_rowid{base_rowid}, n_rows{n_rows}, @@ -58,62 +58,72 @@ struct EllpackDeviceAccessor { cuts->cut_ptrs_.SetDevice(ctx->Device()); cuts->min_vals_.SetDevice(ctx->Device()); gidx_fvalue_map = cuts->cut_values_.ConstDeviceSpan(); - feature_segments = cuts->cut_ptrs_.ConstDeviceSpan(); + feature_segments = cuts->cut_ptrs_.ConstDevicePointer(); min_fvalue = cuts->min_vals_.ConstDeviceSpan(); } else { gidx_fvalue_map = cuts->cut_values_.ConstHostSpan(); - feature_segments = cuts->cut_ptrs_.ConstHostSpan(); + feature_segments = cuts->cut_ptrs_.ConstHostPointer(); min_fvalue = cuts->min_vals_.ConstHostSpan(); } } + [[nodiscard]] XGBOOST_HOST_DEV_INLINE bool IsDenseCompressed() const { + return this->row_stride == this->NumFeatures(); + } /** - * @brief Given a row index and a feature index, returns the corresponding cut value. + * @brief Given a row index and a feature index, returns the corresponding bin index. * - * Uses binary search for look up. Returns NaN if missing. + * Uses binary search for look up. * * @tparam global_ridx Whether the row index is global to all ellpack batches or it's * local to the current batch. + * + * @return -1 if it's a missing value. */ template - [[nodiscard]] __device__ bst_bin_t GetBinIndex(bst_idx_t ridx, size_t fidx) const { + [[nodiscard]] __device__ bst_bin_t GetBinIndex(bst_idx_t ridx, std::size_t fidx) const { if (global_ridx) { ridx -= base_rowid; } auto row_begin = row_stride * ridx; - auto row_end = row_begin + row_stride; - bst_bin_t gidx = -1; - if (is_dense) { - gidx = gidx_iter[row_begin + fidx]; - gidx += this->feature_segments[fidx]; - } else { - gidx = common::BinarySearchBin(row_begin, row_end, gidx_iter, feature_segments[fidx], - feature_segments[fidx + 1]); + if (!this->IsDenseCompressed()) { + // binary search returns -1 if it's missing + auto row_end = row_begin + row_stride; + bst_bin_t gidx = common::BinarySearchBin(row_begin, row_end, gidx_iter, + feature_segments[fidx], feature_segments[fidx + 1]); + return gidx; + } + bst_bin_t gidx = gidx_iter[row_begin + fidx]; + if (gidx == this->NullValue()) { + // Missing value in a dense ellpack + return -1; } + // Dense ellpack + gidx += this->feature_segments[fidx]; return gidx; } - + /** + * @brief Find a bin to place the value in. Used during construction of the Ellpack. 
+ */ template - [[nodiscard]] __device__ uint32_t SearchBin(float value, size_t column_id) const { - auto beg = feature_segments[column_id]; - auto end = feature_segments[column_id + 1]; - uint32_t idx = 0; + [[nodiscard]] __device__ bst_bin_t SearchBin(float value, std::size_t fidx) const { + auto beg = feature_segments[fidx]; + auto end = feature_segments[fidx + 1]; + bst_bin_t gidx = 0; if (is_cat) { - auto it = dh::MakeTransformIterator( - gidx_fvalue_map.cbegin(), [](float v) { return common::AsCat(v); }); - idx = thrust::lower_bound(thrust::seq, it + beg, it + end, - common::AsCat(value)) - - it; + auto it = dh::MakeTransformIterator(gidx_fvalue_map.cbegin(), + [](float v) { return common::AsCat(v); }); + gidx = thrust::lower_bound(thrust::seq, it + beg, it + end, common::AsCat(value)) - it; } else { auto it = thrust::upper_bound(thrust::seq, gidx_fvalue_map.cbegin() + beg, gidx_fvalue_map.cbegin() + end, value); - idx = it - gidx_fvalue_map.cbegin(); + gidx = it - gidx_fvalue_map.cbegin(); } - if (idx == end) { - idx -= 1; + if (gidx == end) { + gidx -= 1; } - return idx; + return gidx; } [[nodiscard]] __device__ float GetFvalue(bst_idx_t ridx, size_t fidx) const { @@ -123,17 +133,9 @@ struct EllpackDeviceAccessor { } return gidx_fvalue_map[gidx]; } - - // Check if the row id is withing range of the current batch. - [[nodiscard]] __device__ bool IsInRange(size_t row_id) const { - return row_id >= base_rowid && row_id < base_rowid + n_rows; - } - - [[nodiscard]] XGBOOST_DEVICE size_t NullValue() const { return this->NumBins(); } - - [[nodiscard]] XGBOOST_DEVICE size_t NumBins() const { return gidx_fvalue_map.size(); } - - [[nodiscard]] XGBOOST_DEVICE size_t NumFeatures() const { return min_fvalue.size(); } + [[nodiscard]] XGBOOST_HOST_DEV_INLINE bst_idx_t NullValue() const { return this->null_value; } + [[nodiscard]] XGBOOST_HOST_DEV_INLINE bst_idx_t NumBins() const { return gidx_fvalue_map.size(); } + [[nodiscard]] XGBOOST_HOST_DEV_INLINE size_t NumFeatures() const { return min_fvalue.size(); } }; @@ -223,8 +225,17 @@ class EllpackPageImpl { [[nodiscard]] common::HistogramCuts const& Cuts() const { return *cuts_; } [[nodiscard]] std::shared_ptr CutsShared() const { return cuts_; } void SetCuts(std::shared_ptr cuts); + /** + * @brief Fully dense, there's not a single missing value. + */ + [[nodiscard]] bool IsDense() const { return this->is_dense; } + /** + * @brief Stored as a dense matrix, but there are might be missing values. + */ + [[nodiscard]] bool IsDenseCompressed() const { + return this->cuts_->NumFeatures() == this->info.row_stride; + } - [[nodiscard]] bool IsDense() const { return is_dense; } /** @return Estimation of memory cost of this page. */ std::size_t MemCostBytes() const; @@ -232,9 +243,8 @@ class EllpackPageImpl { * @brief Return the total number of symbols (total number of bins plus 1 for not * found). */ - [[nodiscard]] std::size_t NumSymbols() const { return this->n_symbols_; } - void SetNumSymbols(bst_idx_t n_symbols) { this->n_symbols_ = n_symbols; } - + [[nodiscard]] auto NumSymbols() const { return this->info.n_symbols; } + void SetNumSymbols(bst_idx_t n_symbols) { this->info.n_symbols = n_symbols; } /** * @brief Get an accessor that can be passed into CUDA kernels. */ @@ -265,11 +275,11 @@ class EllpackPageImpl { */ void InitCompressedData(Context const* ctx); + std::shared_ptr cuts_; + public: - /** @brief Whether or not if the matrix is dense. */ - bool is_dense; - /** @brief Row length for ELLPACK. 
*/ - bst_idx_t row_stride; + bool is_dense{false}; + bst_idx_t base_rowid{0}; bst_idx_t n_rows{0}; /** @@ -278,22 +288,28 @@ class EllpackPageImpl { * This can be backed by various storage types. */ common::RefResourceView gidx_buffer; + /** + * @brief Compression infomation. + */ + struct Info { + /** @brief Row length for ELLPACK. */ + bst_idx_t row_stride{0}; + /** @brief The number of unique bins including missing. */ + bst_idx_t n_symbols{0}; + } info; private: - std::shared_ptr cuts_; - bst_idx_t n_symbols_{0}; common::Monitor monitor_; }; -inline size_t GetRowStride(DMatrix* dmat) { +[[nodiscard]] inline bst_idx_t GetRowStride(DMatrix* dmat) { if (dmat->IsDense()) return dmat->Info().num_col_; size_t row_stride = 0; for (const auto& batch : dmat->GetBatches()) { const auto& row_offset = batch.offset.ConstHostVector(); for (auto i = 1ull; i < row_offset.size(); i++) { - row_stride = std::max( - row_stride, static_cast(row_offset[i] - row_offset[i - 1])); + row_stride = std::max(row_stride, static_cast(row_offset[i] - row_offset[i - 1])); } } return row_stride; diff --git a/src/data/ellpack_page.h b/src/data/ellpack_page.h index fa312f6e7887..bbd6db0eef7f 100644 --- a/src/data/ellpack_page.h +++ b/src/data/ellpack_page.h @@ -42,7 +42,6 @@ class EllpackPage { /*! \return Number of instances in the page. */ [[nodiscard]] bst_idx_t Size() const; - [[nodiscard]] bool IsDense() const; /*! \brief Set the base row id for this page. */ void SetBaseRowId(std::size_t row_id); diff --git a/src/data/ellpack_page_raw_format.cu b/src/data/ellpack_page_raw_format.cu index 839966b08151..262b9c8d3796 100644 --- a/src/data/ellpack_page_raw_format.cu +++ b/src/data/ellpack_page_raw_format.cu @@ -57,7 +57,7 @@ template RET_IF_NOT(fi->Read(&impl->n_rows)); RET_IF_NOT(fi->Read(&impl->is_dense)); - RET_IF_NOT(fi->Read(&impl->row_stride)); + RET_IF_NOT(fi->Read(&impl->info.row_stride)); if (this->param_.prefetch_copy || !has_hmm_ats_) { RET_IF_NOT(ReadDeviceVec(fi, &impl->gidx_buffer)); @@ -83,7 +83,7 @@ template auto* impl = page.Impl(); bytes += fo->Write(impl->n_rows); bytes += fo->Write(impl->is_dense); - bytes += fo->Write(impl->row_stride); + bytes += fo->Write(impl->info.row_stride); std::vector h_gidx_buffer; Context ctx = Context{}.MakeCUDA(curt::CurrentDevice()); [[maybe_unused]] auto h_accessor = impl->GetHostAccessor(&ctx, &h_gidx_buffer); diff --git a/src/data/ellpack_page_source.cu b/src/data/ellpack_page_source.cu index 588ddccec32b..09fc0da68847 100644 --- a/src/data/ellpack_page_source.cu +++ b/src/data/ellpack_page_source.cu @@ -80,7 +80,7 @@ class EllpackHostCacheStreamImpl { common::MakeFixedVecWithPinnedMalloc(impl->gidx_buffer.size()); new_impl->n_rows = impl->Size(); new_impl->is_dense = impl->IsDense(); - new_impl->row_stride = impl->row_stride; + new_impl->info.row_stride = impl->info.row_stride; new_impl->base_rowid = impl->base_rowid; new_impl->SetNumSymbols(impl->NumSymbols()); @@ -108,7 +108,7 @@ class EllpackHostCacheStreamImpl { impl->n_rows = page->Size(); impl->is_dense = page->IsDense(); - impl->row_stride = page->row_stride; + impl->info.row_stride = page->info.row_stride; impl->base_rowid = page->base_rowid; impl->SetNumSymbols(page->NumSymbols()); } diff --git a/src/data/ellpack_page_source.h b/src/data/ellpack_page_source.h index 40f29b6b93b2..2d5d5b8f1ce8 100644 --- a/src/data/ellpack_page_source.h +++ b/src/data/ellpack_page_source.h @@ -80,6 +80,12 @@ class EllpackFormatPolicy { if (!GlobalConfigThreadLocalStore::Get()->use_rmm) { LOG(WARNING) << "`use_rmm` is set to 
false." << msg; } + std::int32_t major{0}, minor{0}; + curt::DrVersion(&major, &minor); + if (!(major >= 12 && minor >= 7) && curt::SupportsAts()) { + // Use ATS, but with an old kernel driver. + LOG(WARNING) << "Using an old kernel driver with supported CTK<12.7." << msg; + } } // For testing with the HMM flag. explicit EllpackFormatPolicy(bool has_hmm) : has_hmm_{has_hmm} {} diff --git a/src/data/extmem_quantile_dmatrix.cu b/src/data/extmem_quantile_dmatrix.cu index 3fb1557e9993..119f9298c501 100644 --- a/src/data/extmem_quantile_dmatrix.cu +++ b/src/data/extmem_quantile_dmatrix.cu @@ -56,7 +56,7 @@ void ExtMemQuantileDMatrix::InitFromCUDA( for (auto const &page : this->GetEllpackPageImpl()) { n_total_samples += page.Size(); CHECK_EQ(page.Impl()->base_rowid, ext_info.base_rows[k]); - CHECK_EQ(page.Impl()->row_stride, ext_info.row_stride); + CHECK_EQ(page.Impl()->info.row_stride, ext_info.row_stride); ++k, ++batch_cnt; } CHECK_EQ(batch_cnt, ext_info.n_batches); diff --git a/src/data/gradient_index.cu b/src/data/gradient_index.cu index ebdc99051924..5e15ff5f0fa2 100644 --- a/src/data/gradient_index.cu +++ b/src/data/gradient_index.cu @@ -1,7 +1,9 @@ /** - * Copyright 2022-2023, XGBoost Contributors + * Copyright 2022-2024, XGBoost Contributors */ -#include // std::unique_ptr +#include // for size_t +#include // for unique_ptr +#include // for vector #include "../common/column_matrix.h" #include "../common/hist_util.h" // Index @@ -20,23 +22,34 @@ void SetIndexData(Context const* ctx, EllpackPageImpl const* page, auto accessor = page->GetHostAccessor(ctx, &h_gidx_buffer); auto const kNull = static_cast(accessor.NullValue()); - common::Span index_data_span = {out->index.data(), out->index.Size()}; + auto index_data_span = common::Span{out->index.data(), out->index.Size()}; auto n_bins_total = page->Cuts().TotalBins(); auto& hit_count_tloc = *p_hit_count_tloc; hit_count_tloc.clear(); hit_count_tloc.resize(ctx->Threads() * n_bins_total, 0); - - common::ParallelFor(page->Size(), ctx->Threads(), [&](auto i) { + bool dense_compressed = page->IsDenseCompressed() && !page->IsDense(); + common::ParallelFor(page->Size(), ctx->Threads(), [&](auto ridx) { auto tid = omp_get_thread_num(); - size_t in_rbegin = page->row_stride * i; - size_t out_rbegin = out->row_ptr[i]; - auto r_size = out->row_ptr[i + 1] - out->row_ptr[i]; - for (size_t j = 0; j < r_size; ++j) { - auto bin_idx = accessor.gidx_iter[in_rbegin + j]; - assert(bin_idx != kNull); - index_data_span[out_rbegin + j] = bin_idx; - ++hit_count_tloc[tid * n_bins_total + get_offset(bin_idx, j)]; + size_t in_rbegin = page->info.row_stride * ridx; + size_t out_rbegin = out->row_ptr[ridx]; + if (dense_compressed) { + for (std::size_t j = 0, k = 0; j < page->info.row_stride; ++j) { + bst_bin_t bin_idx = accessor.gidx_iter[in_rbegin + j]; + if (XGBOOST_EXPECT((bin_idx != kNull), true)) { // relatively dense + bin_idx = get_offset(bin_idx, j); + index_data_span[out_rbegin + k++] = bin_idx; + ++hit_count_tloc[tid * n_bins_total + bin_idx]; + } + } + } else { + auto r_size = out->row_ptr[ridx + 1] - out->row_ptr[ridx]; + for (size_t j = 0; j < r_size; ++j) { + bst_bin_t bin_idx = accessor.gidx_iter[in_rbegin + j]; + assert(bin_idx != kNull); + index_data_span[out_rbegin + j] = bin_idx; + ++hit_count_tloc[tid * n_bins_total + get_offset(bin_idx, j)]; + } } }); } @@ -45,16 +58,16 @@ void GetRowPtrFromEllpack(Context const* ctx, EllpackPageImpl const* page, common::RefResourceView* p_out) { auto& row_ptr = *p_out; row_ptr = 
common::MakeFixedVecWithMalloc(page->Size() + 1, std::size_t{0}); - if (page->is_dense) { - std::fill(row_ptr.begin() + 1, row_ptr.end(), page->row_stride); + if (page->IsDense()) { + std::fill(row_ptr.begin() + 1, row_ptr.end(), page->info.row_stride); } else { std::vector h_gidx_buffer; auto accessor = page->GetHostAccessor(ctx, &h_gidx_buffer); auto const kNull = static_cast(accessor.NullValue()); common::ParallelFor(page->Size(), ctx->Threads(), [&](auto i) { - size_t ibegin = page->row_stride * i; - for (size_t j = 0; j < page->row_stride; ++j) { + size_t ibegin = page->info.row_stride * i; + for (size_t j = 0; j < page->info.row_stride; ++j) { bst_bin_t bin_idx = accessor.gidx_iter[ibegin + j]; if (bin_idx != kNull) { row_ptr[i + 1]++; @@ -67,27 +80,27 @@ void GetRowPtrFromEllpack(Context const* ctx, EllpackPageImpl const* page, GHistIndexMatrix::GHistIndexMatrix(Context const* ctx, MetaInfo const& info, EllpackPage const& in_page, BatchParam const& p) - : max_numeric_bins_per_feat{p.max_bin} { + : cut{in_page.Cuts()}, + max_numeric_bins_per_feat{p.max_bin}, + isDense_{in_page.Impl()->IsDense()}, + base_rowid{in_page.BaseRowId()} { auto page = in_page.Impl(); - isDense_ = page->is_dense; - CHECK_EQ(info.num_row_, in_page.Size()); - this->cut = page->Cuts(); // pull to host early, prevent race condition this->cut.Ptrs(); this->cut.Values(); this->cut.MinValues(); - this->ResizeIndex(info.num_nonzero_, page->is_dense); - if (page->is_dense) { + this->ResizeIndex(info.num_nonzero_, page->IsDense()); + if (page->IsDense()) { this->index.SetBinOffset(page->Cuts().Ptrs()); } + auto offset = page->Cuts().cut_ptrs_.ConstHostSpan(); auto n_bins_total = page->Cuts().TotalBins(); GetRowPtrFromEllpack(ctx, page, &this->row_ptr); - if (page->IsDense()) { - auto offset = index.Offset(); + if (page->IsDenseCompressed()) { common::DispatchBinType(this->index.GetBinTypeSize(), [&](auto dtype) { using T = decltype(dtype); ::xgboost::SetIndexData( @@ -104,7 +117,7 @@ GHistIndexMatrix::GHistIndexMatrix(Context const* ctx, MetaInfo const& info, this->GatherHitCount(ctx->Threads(), n_bins_total); // sanity checks - CHECK_EQ(this->Features(), info.num_col_); + CHECK_EQ(this->Features(), in_page.Cuts().NumFeatures()); CHECK_EQ(this->Size(), info.num_row_); CHECK(this->cut.cut_ptrs_.HostCanRead()); CHECK(this->cut.cut_values_.HostCanRead()); diff --git a/src/data/iterative_dmatrix.cc b/src/data/iterative_dmatrix.cc index 29d38976361b..c9830fa093e8 100644 --- a/src/data/iterative_dmatrix.cc +++ b/src/data/iterative_dmatrix.cc @@ -49,6 +49,9 @@ IterativeDMatrix::IterativeDMatrix(DataIterHandle iter_handle, DMatrixHandle pro this->fmat_ctx_ = ctx; this->batch_ = p; + + LOG(INFO) << "Finished constructing the `IterativeDMatrix`: (" << this->Info().num_row_ << ", " + << this->Info().num_col_ << ", " << this->Info().num_nonzero_ << ")."; } void IterativeDMatrix::InitFromCPU(Context const* ctx, BatchParam const& p, diff --git a/src/tree/gpu_hist/feature_groups.cu b/src/tree/gpu_hist/feature_groups.cu index c6c6619852ca..0a1272a10aff 100644 --- a/src/tree/gpu_hist/feature_groups.cu +++ b/src/tree/gpu_hist/feature_groups.cu @@ -2,17 +2,17 @@ * Copyright 2020-2024, XGBoost Contributors */ -#include -#include -#include +#include // for max +#include // for size_t +#include // for uint32_t +#include // for vector +#include "../../common/hist_util.h" // for HistogramCuts #include "feature_groups.cuh" -#include "../../common/hist_util.h" - namespace xgboost::tree { -FeatureGroups::FeatureGroups(const 
common::HistogramCuts& cuts, bool is_dense, - size_t shm_size, size_t bin_size) { +FeatureGroups::FeatureGroups(common::HistogramCuts const& cuts, bool is_dense, size_t shm_size) + : max_group_bins{0} { // Only use a single feature group for sparse matrices. bool single_group = !is_dense; if (single_group) { @@ -20,14 +20,14 @@ FeatureGroups::FeatureGroups(const common::HistogramCuts& cuts, bool is_dense, return; } - std::vector& feature_segments_h = feature_segments.HostVector(); - std::vector& bin_segments_h = bin_segments.HostVector(); + auto& feature_segments_h = feature_segments.HostVector(); + auto& bin_segments_h = bin_segments.HostVector(); feature_segments_h.push_back(0); bin_segments_h.push_back(0); - const std::vector& cut_ptrs = cuts.Ptrs(); - size_t max_shmem_bins = shm_size / bin_size; - max_group_bins = 0; + std::vector const& cut_ptrs = cuts.Ptrs(); + // Maximum number of bins that can be placed into shared memory. + std::size_t max_shmem_bins = shm_size / sizeof(GradientPairInt64); for (size_t i = 2; i < cut_ptrs.size(); ++i) { int last_start = bin_segments_h.back(); @@ -41,17 +41,16 @@ FeatureGroups::FeatureGroups(const common::HistogramCuts& cuts, bool is_dense, } feature_segments_h.push_back(cut_ptrs.size() - 1); bin_segments_h.push_back(cut_ptrs.back()); - max_group_bins = std::max(max_group_bins, - bin_segments_h.back() - - bin_segments_h[bin_segments_h.size() - 2]); + max_group_bins = + std::max(max_group_bins, bin_segments_h.back() - bin_segments_h[bin_segments_h.size() - 2]); } -void FeatureGroups::InitSingle(const common::HistogramCuts& cuts) { - std::vector& feature_segments_h = feature_segments.HostVector(); +void FeatureGroups::InitSingle(common::HistogramCuts const& cuts) { + auto& feature_segments_h = feature_segments.HostVector(); feature_segments_h.push_back(0); feature_segments_h.push_back(cuts.Ptrs().size() - 1); - std::vector& bin_segments_h = bin_segments.HostVector(); + auto& bin_segments_h = bin_segments.HostVector(); bin_segments_h.push_back(0); bin_segments_h.push_back(cuts.TotalBins()); diff --git a/src/tree/gpu_hist/feature_groups.cuh b/src/tree/gpu_hist/feature_groups.cuh index 82df69796ebd..37d87a9f577a 100644 --- a/src/tree/gpu_hist/feature_groups.cuh +++ b/src/tree/gpu_hist/feature_groups.cuh @@ -16,92 +16,97 @@ class HistogramCuts; namespace tree { -/** \brief FeatureGroup is a feature group. It is defined by a range of - consecutive feature indices, and also contains a range of all bin indices - associated with those features. */ +/** + * @brief FeatureGroup is a single group of features. + * + * It is defined by a range of consecutive feature indices, and also contains a range of + * all bin indices associated with those features. + */ struct FeatureGroup { - __host__ __device__ FeatureGroup(int start_feature_, int num_features_, - int start_bin_, int num_bins_) : - start_feature(start_feature_), num_features(num_features_), - start_bin(start_bin_), num_bins(num_bins_) {} + XGBOOST_DEVICE FeatureGroup(bst_feature_t start_feature, bst_feature_t n_features, + bst_bin_t start_bin, bst_bin_t num_bins) + : start_feature{start_feature}, + num_features{n_features}, + start_bin{start_bin}, + num_bins{num_bins} {} /** The first feature of the group. */ - int start_feature; + bst_feature_t start_feature; /** The number of features in the group. */ - int num_features; + bst_feature_t num_features; /** The first bin in the group. */ bst_bin_t start_bin; /** The number of bins in the group. 
*/ bst_bin_t num_bins; }; -/** \brief FeatureGroupsAccessor is a non-owning accessor for FeatureGroups. */ +/** @brief FeatureGroupsAccessor is a non-owning accessor for FeatureGroups. */ struct FeatureGroupsAccessor { - FeatureGroupsAccessor(common::Span feature_segments_, - common::Span bin_segments_, int max_group_bins_) - : feature_segments(feature_segments_), - bin_segments(bin_segments_), - max_group_bins(max_group_bins_) {} + FeatureGroupsAccessor(common::Span feature_segments, + common::Span bin_segments, bst_bin_t max_group_bins) + : feature_segments{feature_segments}, + bin_segments{bin_segments}, + max_group_bins{max_group_bins} {} - common::Span feature_segments; + common::Span feature_segments; common::Span bin_segments; - int max_group_bins; + bst_bin_t max_group_bins; - /** \brief Gets the number of feature groups. */ - __host__ __device__ int NumGroups() const { - return feature_segments.size() - 1; - } + /** @brief Gets the number of feature groups. */ + XGBOOST_DEVICE int NumGroups() const { return feature_segments.size() - 1; } - /** \brief Gets the information about a feature group with index i. */ - __host__ __device__ FeatureGroup operator[](int i) const { - return {feature_segments[i], feature_segments[i + 1] - feature_segments[i], - bin_segments[i], bin_segments[i + 1] - bin_segments[i]}; + /** @brief Gets the information about a feature group with index i. */ + XGBOOST_DEVICE FeatureGroup operator[](bst_feature_t i) const { + return {feature_segments[i], feature_segments[i + 1] - feature_segments[i], bin_segments[i], + bin_segments[i + 1] - bin_segments[i]}; } }; -/** \brief FeatureGroups contains information that defines a split of features - into groups. Bins of a single feature group typically fit into shared - memory, so the histogram for the features of a single group can be computed - faster. - - \notes Known limitations: - - - splitting features into groups currently works only for dense matrices, - where it is easy to get a feature value in a row by its index; for sparse - matrices, the structure contains only a single group containing all - features; - - - if a single feature requires more bins than fit into shared memory, the - histogram is computed in global memory even if there are multiple feature - groups; note that this is unlikely to occur in practice, as the default - number of bins per feature is 256, whereas a thread block with 48 KiB - shared memory can contain 3072 bins if each gradient sum component is a - 64-bit floating-point value (double) +/** + * @brief FeatureGroups contains information that defines a split of features + * into groups. Bins of a single feature group typically fit into shared + * memory, so the histogram for the features of a single group can be computed + * faster. + * + * @note Known limitations: + * + * - splitting features into groups currently works only for dense matrices, + * where it is easy to get a feature value in a row by its index; for sparse + * matrices, the structure contains only a single group containing all + * features; + * + * - if a single feature requires more bins than fit into shared memory, the + * histogram is computed in global memory even if there are multiple feature + * groups; note that this is unlikely to occur in practice, as the default + * number of bins per feature is 256, whereas a thread block with 48 KiB + * shared memory can contain 3072 bins if each gradient sum component is a + * 64-bit floating-point value (double) */ struct FeatureGroups { /** Group cuts for features. 
Size equals to (number of groups + 1). */ - HostDeviceVector feature_segments; + HostDeviceVector feature_segments; /** Group cuts for bins. Size equals to (number of groups + 1) */ HostDeviceVector bin_segments; /** Maximum number of bins in a group. Useful to compute the amount of dynamic shared memory when launching a kernel. */ int max_group_bins; - /** Creates feature groups by splitting features into groups. - \param cuts Histogram cuts that given the number of bins per feature. - \param is_dense Whether the data matrix is dense. - \param shm_size Available size of shared memory per thread block (in - bytes) used to compute feature groups. - \param bin_size Size of a single bin of the histogram. */ - FeatureGroups(const common::HistogramCuts& cuts, bool is_dense, - size_t shm_size, size_t bin_size); - - /** Creates a single feature group containing all features and bins. - \notes This is used as a fallback for sparse matrices, and is also useful - for testing. + /** + * @brief Creates feature groups by splitting features into groups. + * + * @param cuts Histogram cuts that given the number of bins per feature. + * @param is_dense Whether the data matrix is dense. + * @param shm_size Available size of shared memory per thread block (in bytes) used to + * compute feature groups. */ - explicit FeatureGroups(const common::HistogramCuts& cuts) { - InitSingle(cuts); - } + FeatureGroups(common::HistogramCuts const& cuts, bool is_dense, size_t shm_size); + + /** + * @brief Creates a single feature group containing all features and bins. + * + * @notes This is used as a fallback for sparse matrices, and is also useful for + * testing. + */ + explicit FeatureGroups(const common::HistogramCuts& cuts) { this->InitSingle(cuts); } [[nodiscard]] FeatureGroupsAccessor DeviceAccessor(DeviceOrd device) const { feature_segments.SetDevice(device); diff --git a/src/tree/gpu_hist/gradient_based_sampler.cu b/src/tree/gpu_hist/gradient_based_sampler.cu index 46a52a8ea5d7..77a720fcc709 100644 --- a/src/tree/gpu_hist/gradient_based_sampler.cu +++ b/src/tree/gpu_hist/gradient_based_sampler.cu @@ -205,8 +205,8 @@ GradientBasedSample ExternalMemoryUniformSampling::Sample(Context const* ctx, auto batch_iterator = dmat->GetBatches(ctx, batch_param_); auto first_page = (*batch_iterator.begin()).Impl(); // Create a new ELLPACK page with empty rows. - *page = EllpackPageImpl{ctx, first_page->CutsShared(), first_page->is_dense, - first_page->row_stride, sample_rows}; + *page = EllpackPageImpl{ctx, first_page->CutsShared(), first_page->IsDense(), + first_page->info.row_stride, sample_rows}; // fixme: row_stride // Compact the ELLPACK pages into the single sample page. thrust::fill(cuctx->CTP(), page->gidx_buffer.begin(), page->gidx_buffer.end(), 0); @@ -290,8 +290,8 @@ GradientBasedSample ExternalMemoryGradientBasedSampling::Sample(Context const* c auto first_page = (*batch_iterator.begin()).Impl(); // Create a new ELLPACK page with empty rows. - *page = EllpackPageImpl{ctx, first_page->CutsShared(), dmat->IsDense(), first_page->row_stride, - sample_rows}; + *page = EllpackPageImpl{ctx, first_page->CutsShared(), dmat->IsDense(), + first_page->info.row_stride, sample_rows}; // fixme: row_stride // Compact the ELLPACK pages into the single sample page. 
thrust::fill(cuctx->CTP(), page->gidx_buffer.begin(), page->gidx_buffer.end(), 0); for (auto& batch : batch_iterator) { diff --git a/src/tree/gpu_hist/histogram.cu b/src/tree/gpu_hist/histogram.cu index d50f7284e9ad..102b1113be6b 100644 --- a/src/tree/gpu_hist/histogram.cu +++ b/src/tree/gpu_hist/histogram.cu @@ -138,19 +138,20 @@ XGBOOST_DEV_INLINE void AtomicAddGpairGlobal(xgboost::GradientPairInt64* dest, *reinterpret_cast(&h)); } -template +template class HistogramAgent { + int constexpr static kItemsPerTile = kBlockThreads * kItemsPerThread; + GradientPairInt64* smem_arr_; GradientPairInt64* d_node_hist_; - using Idx = RowPartitioner::RowIndexT; + using Idx = cuda_impl::RowIndexT; dh::LDGIterator d_ridx_; const GradientPair* d_gpair_; const FeatureGroup group_; const EllpackDeviceAccessor& matrix_; const int feature_stride_; - const std::size_t n_elements_; + const bst_idx_t n_elements_; const GradientQuantiser& rounding_; public: @@ -158,34 +159,33 @@ class HistogramAgent { GradientPairInt64* __restrict__ d_node_hist, const FeatureGroup& group, const EllpackDeviceAccessor& matrix, common::Span d_ridx, const GradientQuantiser& rounding, const GradientPair* d_gpair) - : smem_arr_(smem_arr), - d_node_hist_(d_node_hist), + : smem_arr_{smem_arr}, + d_node_hist_{d_node_hist}, d_ridx_(d_ridx.data()), - group_(group), + group_{group}, matrix_(matrix), - feature_stride_(kIsDense ? group.num_features : matrix.row_stride), - n_elements_(feature_stride_ * d_ridx.size()), - rounding_(rounding), - d_gpair_(d_gpair) {} + feature_stride_(kCompressed ? group.num_features : matrix.row_stride), + n_elements_{feature_stride_ * d_ridx.size()}, + rounding_{rounding}, + d_gpair_{d_gpair} {} __device__ void ProcessPartialTileShared(std::size_t offset) { - for (std::size_t idx = offset + threadIdx.x; - idx < std::min(offset + kBlockThreads * kItemsPerTile, n_elements_); - idx += kBlockThreads) { + for (std::size_t idx = offset + threadIdx.x, + n = std::min(offset + kBlockThreads * kItemsPerTile, n_elements_); + idx < n; idx += kBlockThreads) { Idx ridx = d_ridx_[idx / feature_stride_]; auto fidx = FeatIdx(group_, idx, feature_stride_); bst_bin_t compressed_bin = matrix_.gidx_iter[IterIdx(matrix_, ridx, fidx)]; - if (kIsDense || compressed_bin != matrix_.NullValue()) { + if (compressed_bin != matrix_.NullValue()) { + // The matrix is compressed with feature-local bins. + if (kCompressed) { + compressed_bin += this->matrix_.feature_segments[fidx]; + } + // Avoid atomic add if it's a null value. auto adjusted = rounding_.ToFixedPoint(d_gpair_[ridx]); // Subtract start_bin to write to group-local histogram. If this is not a dense // matrix, then start_bin is 0 since featuregrouping doesn't support sparse data. 
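// [Editor's sketch, not part of the patch] The HistogramAgent refactor above folds the dense and
// sparse branches into one path: with a feature-locally compressed ("dense compressed") ELLPACK,
// the stored symbol is the bin index minus the first bin of its feature, so the kernel first
// recovers the global bin and then subtracts the group's start bin to get the shared-memory slot.
// A minimal host-side model of that arithmetic; all names here are hypothetical, not the kernel's.
#include <cstddef>
#include <cstdint>
#include <vector>

// feature_segments[f] is the first global bin of feature f; null_value marks a missing entry.
inline std::int64_t SharedHistSlot(std::int32_t compressed_bin, std::size_t fidx,
                                   std::vector<std::uint32_t> const& feature_segments,
                                   std::int32_t null_value, std::int32_t group_start_bin,
                                   bool dense_compressed) {
  if (compressed_bin == null_value) {
    return -1;  // missing value: skip the atomic add entirely
  }
  std::int64_t global_bin = compressed_bin;
  if (dense_compressed) {
    global_bin += feature_segments[fidx];  // undo the feature-local compression
  }
  return global_bin - group_start_bin;  // slot in the group-local shared-memory histogram
}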
- if (kIsDense) { - AtomicAddGpairShared( - smem_arr_ + compressed_bin + this->matrix_.feature_segments[fidx] - group_.start_bin, - adjusted); - } else { - AtomicAddGpairShared(smem_arr_ + compressed_bin - group_.start_bin, adjusted); - } + AtomicAddGpairShared(smem_arr_ + compressed_bin - group_.start_bin, adjusted); } } } @@ -210,16 +210,19 @@ class HistogramAgent { for (int i = 0; i < kItemsPerThread; i++) { gpair[i] = d_gpair_[ridx[i]]; auto fidx = FeatIdx(group_, idx[i], feature_stride_); - if (kIsDense) { - gidx[i] = - matrix_.gidx_iter[IterIdx(matrix_, ridx[i], fidx)] + matrix_.feature_segments[fidx]; + gidx[i] = matrix_.gidx_iter[IterIdx(matrix_, ridx[i], fidx)]; + if (gidx[i] != matrix_.NullValue()) { + if (kCompressed) { + gidx[i] += matrix_.feature_segments[fidx]; + } } else { - gidx[i] = matrix_.gidx_iter[IterIdx(matrix_, ridx[i], fidx)]; + gidx[i] = -1; // missing } } #pragma unroll for (int i = 0; i < kItemsPerThread; i++) { - if ((kIsDense || gidx[i] != matrix_.NullValue())) { + // Avoid atomic add if it's a null value. + if (gidx[i] != -1) { auto adjusted = rounding_.ToFixedPoint(gpair[i]); AtomicAddGpairShared(smem_arr_ + gidx[i] - group_.start_bin, adjusted); } @@ -248,14 +251,12 @@ class HistogramAgent { Idx ridx = d_ridx_[idx / feature_stride_]; auto fidx = FeatIdx(group_, idx, feature_stride_); bst_bin_t compressed_bin = matrix_.gidx_iter[IterIdx(matrix_, ridx, fidx)]; - if (kIsDense || compressed_bin != matrix_.NullValue()) { - auto adjusted = rounding_.ToFixedPoint(d_gpair_[ridx]); - if (kIsDense) { - auto start_bin = this->matrix_.feature_segments[fidx]; - AtomicAddGpairGlobal(d_node_hist_ + compressed_bin + start_bin, adjusted); - } else { - AtomicAddGpairGlobal(d_node_hist_ + compressed_bin, adjusted); + if (compressed_bin != matrix_.NullValue()) { + if (kCompressed) { + compressed_bin += this->matrix_.feature_segments[fidx]; } + auto adjusted = rounding_.ToFixedPoint(d_gpair_[ridx]); + AtomicAddGpairGlobal(d_node_hist_ + compressed_bin, adjusted); } } } @@ -382,14 +383,14 @@ class DeviceHistogramBuilderImpl { if (!this->kernel_->shared) { CHECK_EQ(this->kernel_->smem_size, 0); - if (matrix.is_dense) { + if (matrix.IsDenseCompressed()) { launcher(this->kernel_->global_dense_kernel); } else { launcher(this->kernel_->global_kernel); } } else { CHECK_NE(this->kernel_->smem_size, 0); - if (matrix.is_dense) { + if (matrix.IsDenseCompressed()) { launcher(this->kernel_->shared_dense_kernel); } else { launcher(this->kernel_->shared_kernel); diff --git a/src/tree/updater_gpu_hist.cu b/src/tree/updater_gpu_hist.cu index 31b8d34964b2..1096518685a3 100644 --- a/src/tree/updater_gpu_hist.cu +++ b/src/tree/updater_gpu_hist.cu @@ -104,6 +104,7 @@ struct GPUHistMakerDevice { dh::device_vector positions_; HistMakerTrainParam const* hist_param_; std::shared_ptr const cuts_; + std::unique_ptr feature_groups_; auto CreatePartitionNodes(RegTree const* p_tree, std::vector const& candidates) { std::vector nidx(candidates.size()); @@ -143,19 +144,20 @@ struct GPUHistMakerDevice { std::unique_ptr sampler; - std::unique_ptr feature_groups; common::Monitor monitor; GPUHistMakerDevice(Context const* ctx, TrainParam _param, HistMakerTrainParam const* hist_param, std::shared_ptr column_sampler, BatchParam batch_param, MetaInfo const& info, std::vector batch_ptr, - std::shared_ptr cuts) + std::shared_ptr cuts, bool dense_compressed) : evaluator_{_param, static_cast(info.num_col_), ctx->Device()}, ctx_{ctx}, column_sampler_{std::move(column_sampler)}, batch_ptr_{std::move(batch_ptr)}, 
hist_param_{hist_param}, cuts_{std::move(cuts)}, + feature_groups_{std::make_unique(*cuts_, dense_compressed, + dh::MaxSharedMemoryOptin(ctx_->Ordinal()))}, param{std::move(_param)}, interaction_constraints(param, static_cast(info.num_col_)), sampler{std::make_unique( @@ -172,15 +174,6 @@ struct GPUHistMakerDevice { ~GPUHistMakerDevice() = default; - void InitFeatureGroupsOnce(MetaInfo const& info) { - if (!feature_groups) { - CHECK(cuts_); - feature_groups = std::make_unique(*cuts_, info.IsDense(), - dh::MaxSharedMemoryOptin(ctx_->Ordinal()), - sizeof(GradientPairInt64)); - } - } - // Reset values for each update iteration [[nodiscard]] DMatrix* Reset(HostDeviceVector const* dh_gpair, DMatrix* p_fmat) { this->monitor.Start(__func__); @@ -240,10 +233,8 @@ struct GPUHistMakerDevice { */ this->quantiser = std::make_unique(ctx_, this->gpair, p_fmat->Info()); - this->InitFeatureGroupsOnce(info); - this->histogram_.Reset(ctx_, this->hist_param_->MaxCachedHistNodes(ctx_->Device()), - feature_groups->DeviceAccessor(ctx_->Device()), cuts_->TotalBins(), + feature_groups_->DeviceAccessor(ctx_->Device()), cuts_->TotalBins(), false); this->monitor.Stop(__func__); return p_fmat; @@ -334,7 +325,7 @@ struct GPUHistMakerDevice { auto d_ridx = partitioners_.at(k)->GetRows(nidx); this->histogram_.BuildHistogram(ctx_->CUDACtx(), acc, - feature_groups->DeviceAccessor(ctx_->Device()), this->gpair, + feature_groups_->DeviceAccessor(ctx_->Device()), this->gpair, d_ridx, d_node_hist, *quantiser); monitor.Stop(__func__); } @@ -762,22 +753,26 @@ struct GPUHistMakerDevice { } }; -std::shared_ptr InitBatchCuts(Context const* ctx, DMatrix* p_fmat, - BatchParam batch, - std::vector* p_batch_ptr) { +std::pair, bool> InitBatchCuts( + Context const* ctx, DMatrix* p_fmat, BatchParam batch, std::vector* p_batch_ptr) { std::vector& batch_ptr = *p_batch_ptr; batch_ptr = {0}; std::shared_ptr cuts; + std::int32_t dense_compressed = -1; for (auto const& page : p_fmat->GetBatches(ctx, batch)) { batch_ptr.push_back(page.Size()); cuts = page.Impl()->CutsShared(); CHECK(cuts->cut_values_.DeviceCanRead()); + if (dense_compressed != -1) { + CHECK_EQ(page.Impl()->IsDenseCompressed(), static_cast(dense_compressed)); + } + dense_compressed = page.Impl()->IsDenseCompressed(); } CHECK(cuts); CHECK_EQ(p_fmat->NumBatches(), batch_ptr.size() - 1); std::partial_sum(batch_ptr.cbegin(), batch_ptr.cend(), batch_ptr.begin()); - return cuts; + return {cuts, dense_compressed}; } class GPUHistMaker : public TreeUpdater { @@ -840,10 +835,11 @@ class GPUHistMaker : public TreeUpdater { std::vector batch_ptr; auto batch = HistBatch(*param); - auto cuts = InitBatchCuts(ctx_, p_fmat, batch, &batch_ptr); + auto [cuts, dense_compressed] = InitBatchCuts(ctx_, p_fmat, batch, &batch_ptr); - this->maker = std::make_unique( - ctx_, *param, &hist_maker_param_, column_sampler_, batch, p_fmat->Info(), batch_ptr, cuts); + this->maker = std::make_unique(ctx_, *param, &hist_maker_param_, + column_sampler_, batch, p_fmat->Info(), + batch_ptr, cuts, dense_compressed); p_last_fmat_ = p_fmat; initialised_ = true; @@ -947,11 +943,12 @@ class GPUGlobalApproxMaker : public TreeUpdater { std::vector batch_ptr; auto batch = ApproxBatch(*param, hess, *task_); - auto cuts = InitBatchCuts(ctx_, p_fmat, batch, &batch_ptr); + auto [cuts, dense_compressed] = InitBatchCuts(ctx_, p_fmat, batch, &batch_ptr); batch.regen = false; // Regen only at the beginning of the iteration. 
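// [Editor's sketch, not part of the patch] The constructor above now builds feature_groups_ once
// from the per-block shared-memory budget, and the separate bin_size argument is gone because the
// group builder divides by sizeof(GradientPairInt64) itself. A rough model of that budget, under
// the assumption that each bin stores two 64-bit fixed-point sums (hypothetical stand-in type):
#include <cstddef>
#include <cstdint>

struct GradientPairInt64Model {
  std::int64_t grad;
  std::int64_t hess;
};

// Maximum number of histogram bins a group may occupy in one thread block's shared memory.
constexpr std::size_t MaxSharedBins(std::size_t shm_bytes) {
  return shm_bytes / sizeof(GradientPairInt64Model);
}
static_assert(MaxSharedBins(48 * 1024) == 3072,
              "48 KiB of shared memory holds 3072 gradient pairs, matching the note in feature_groups.cuh");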
- this->maker_ = std::make_unique( - ctx_, *param, &hist_maker_param_, column_sampler_, batch, p_fmat->Info(), batch_ptr, cuts); + this->maker_ = std::make_unique(ctx_, *param, &hist_maker_param_, + column_sampler_, batch, p_fmat->Info(), + batch_ptr, cuts, dense_compressed); std::size_t t_idx{0}; for (xgboost::RegTree* tree : trees) { diff --git a/tests/ci_build/lint_python.py b/tests/ci_build/lint_python.py index 91302c1ed563..c8d0f47709ab 100644 --- a/tests/ci_build/lint_python.py +++ b/tests/ci_build/lint_python.py @@ -105,6 +105,7 @@ class LintersPaths: "tests/test_distributed/test_with_spark/test_data.py", "tests/test_distributed/test_gpu_with_spark/test_data.py", "tests/test_distributed/test_gpu_with_dask/test_gpu_with_dask.py", + "tests/test_distributed/test_gpu_with_dask/test_gpu_external_memory.py", # demo "demo/dask/", "demo/json-model/json_parser.py", diff --git a/tests/cpp/data/test_ellpack_page.cu b/tests/cpp/data/test_ellpack_page.cu index 55375a5a7ffa..799e08d4a741 100644 --- a/tests/cpp/data/test_ellpack_page.cu +++ b/tests/cpp/data/test_ellpack_page.cu @@ -7,9 +7,11 @@ #include "../../../src/common/categorical.h" #include "../../../src/common/hist_util.h" +#include "../../../src/data/device_adapter.cuh" // for CupyAdapter #include "../../../src/data/ellpack_page.cuh" #include "../../../src/data/ellpack_page.h" -#include "../../../src/tree/param.h" // TrainParam +#include "../../../src/data/gradient_index.h" // for GHistIndexMatrix +#include "../../../src/tree/param.h" // for TrainParam #include "../helpers.h" #include "../histogram_helpers.h" #include "gtest/gtest.h" @@ -24,7 +26,7 @@ TEST(EllpackPage, EmptyDMatrix) { &ctx, BatchParam{kMaxBin, tree::TrainParam::DftSparseThreshold()}) .begin(); auto impl = page.Impl(); - ASSERT_EQ(impl->row_stride, 0); + ASSERT_EQ(impl->info.row_stride, 0); ASSERT_EQ(impl->Cuts().TotalBins(), 0); ASSERT_EQ(impl->gidx_buffer.size(), 4); } @@ -36,7 +38,7 @@ TEST(EllpackPage, BuildGidxDense) { std::vector h_gidx_buffer; auto h_accessor = page->GetHostAccessor(&ctx, &h_gidx_buffer); - ASSERT_EQ(page->row_stride, n_features); + ASSERT_EQ(page->info.row_stride, n_features); std::vector solution = { 0, 3, 8, 9, 14, 17, 20, 21, @@ -68,9 +70,9 @@ TEST(EllpackPage, BuildGidxSparse) { auto page = BuildEllpackPage(&ctx, kNRows, kNCols, 0.9f); std::vector h_gidx_buffer; - auto h_accessor = page->GetHostAccessor(&ctx, &h_gidx_buffer); + auto h_acc = page->GetHostAccessor(&ctx, &h_gidx_buffer); - ASSERT_LE(page->row_stride, 3); + ASSERT_EQ(page->info.row_stride, 3); // row_stride = 3, 16 rows, 48 entries for ELLPack std::vector solution = { @@ -78,8 +80,8 @@ TEST(EllpackPage, BuildGidxSparse) { 24, 24, 24, 24, 24, 5, 24, 24, 0, 16, 24, 15, 24, 24, 24, 24, 24, 7, 14, 16, 4, 24, 24, 24, 24, 24, 9, 24, 24, 1, 24, 24 }; - for (size_t i = 0; i < kNRows * page->row_stride; ++i) { - ASSERT_EQ(solution[i], h_accessor.gidx_iter[i]); + for (size_t i = 0; i < kNRows * page->info.row_stride; ++i) { + ASSERT_EQ(solution[i], h_acc.gidx_iter[i]); } } @@ -103,8 +105,9 @@ TEST(EllpackPage, FromCategoricalBasic) { auto n_uniques = std::unique(x_copy.begin(), x_copy.end()) - x_copy.begin(); ASSERT_EQ(n_uniques, kCats); - std::vector h_cuts_ptr(accessor.feature_segments.size()); - dh::CopyDeviceSpanToVector(&h_cuts_ptr, accessor.feature_segments); + std::vector h_cuts_ptr(accessor.NumFeatures() + 1); + dh::safe_cuda(cudaMemcpyAsync(h_cuts_ptr.data(), accessor.feature_segments, + sizeof(bst_feature_t) * h_cuts_ptr.size(), cudaMemcpyDefault)); std::vector 
h_cuts_values(accessor.gidx_fvalue_map.size()); dh::CopyDeviceSpanToVector(&h_cuts_values, accessor.gidx_fvalue_map); @@ -149,7 +152,7 @@ TEST(EllpackPage, Copy) { auto page = (*dmat->GetBatches(&ctx, param).begin()).Impl(); // Create an empty result page. - EllpackPageImpl result(&ctx, page->CutsShared(), page->is_dense, page->row_stride, kRows); + EllpackPageImpl result(&ctx, page->CutsShared(), page->is_dense, page->info.row_stride, kRows); // Copy batch pages into the result page. size_t offset = 0; @@ -195,7 +198,7 @@ TEST(EllpackPage, Compact) { auto page = (*dmat->GetBatches(&ctx, param).begin()).Impl(); // Create an empty result page. - EllpackPageImpl result(&ctx, page->CutsShared(), page->is_dense, page->row_stride, + EllpackPageImpl result(&ctx, page->CutsShared(), page->is_dense, page->info.row_stride, kCompactedRows); // Compact batch pages into the result page. @@ -240,7 +243,129 @@ TEST(EllpackPage, Compact) { } namespace { -class EllpackPageTest : public testing::TestWithParam { +// Test for treating sparse ellpack as a dense +class CompressedDense : public ::testing::TestWithParam { + auto InitSparsePage(std::size_t null_column) const { + bst_idx_t n_samples = 16, n_features = 8; + std::vector data(n_samples * n_features); + + std::iota(data.begin(), data.end(), 0.0f); + for (std::size_t i = 0; i < data.size(); i += n_features) { + data[i + null_column] = std::numeric_limits::quiet_NaN(); + } + auto p_fmat = GetDMatrixFromData(data, n_samples, n_features); + return p_fmat; + } + + void CheckBasic(Context const* ctx, BatchParam batch, std::size_t null_column, + EllpackPageImpl const& impl) { + ASSERT_FALSE(impl.IsDense()); + ASSERT_TRUE(impl.IsDenseCompressed()); + ASSERT_EQ(impl.NumSymbols(), batch.max_bin + 1); + + std::vector h_gidx; + auto h_acc = impl.GetHostAccessor(ctx, &h_gidx); + ASSERT_EQ(h_acc.row_stride, h_acc.NumFeatures()); + ASSERT_EQ(h_acc.NullValue(), batch.max_bin); + for (std::size_t i = 0; i < h_acc.row_stride * h_acc.n_rows; ++i) { + auto [m, n] = linalg::UnravelIndex(i, h_acc.n_rows, h_acc.row_stride); + if (n == null_column) { + ASSERT_EQ(static_cast(h_acc.gidx_iter[i]), h_acc.NullValue()); + } else { + ASSERT_EQ(static_cast(h_acc.gidx_iter[i]), m); + } + } + } + + public: + void CheckFromSparsePage(std::size_t null_column) { + auto p_fmat = this->InitSparsePage(null_column); + auto ctx = MakeCUDACtx(0); + auto batch = BatchParam{static_cast(p_fmat->Info().num_row_), + std::numeric_limits::quiet_NaN()}; + + for (auto const& ellpack : p_fmat->GetBatches(&ctx, batch)) { + auto impl = ellpack.Impl(); + this->CheckBasic(&ctx, batch, null_column, *impl); + } + } + + void CheckFromAdapter(std::size_t null_column) { + bst_idx_t n_samples = 16, n_features = 8; + HostDeviceVector data(n_samples * n_features); + auto& h_data = data.HostVector(); + std::iota(h_data.begin(), h_data.end(), 0.0f); + for (std::size_t i = 0; i < h_data.size(); i += n_features) { + h_data[i + null_column] = std::numeric_limits::quiet_NaN(); + } + auto p_fmat = GetDMatrixFromData(h_data, n_samples, n_features); + + auto arri = GetArrayInterface(&data, n_samples, n_features); + auto sarri = Json::Dump(arri); + data::CupyAdapter adapter{StringView{sarri}}; + + auto ctx = MakeCUDACtx(0); + Context cpu_ctx; + auto batch = BatchParam{static_cast(p_fmat->Info().num_row_), 0.8}; + + std::shared_ptr cuts; + for (auto const& page : p_fmat->GetBatches(&cpu_ctx, batch)) { + cuts = std::make_shared(page.Cuts()); + } + dh::device_vector row_counts(n_samples, n_features - 1); + auto d_row_counts = 
dh::ToSpan(row_counts); + auto impl = + EllpackPageImpl{&ctx, adapter.Value(), std::numeric_limits::quiet_NaN(), + false, d_row_counts, {}, + n_features - 1, n_samples, cuts}; + this->CheckBasic(&ctx, batch, null_column, impl); + } + + void CheckFromToGHist(std::size_t null_column) { + Context cpu_ctx; + auto ctx = MakeCUDACtx(0); + std::vector orig; + { + // Test from GHist + auto p_fmat = this->InitSparsePage(null_column); + auto batch = BatchParam{static_cast(p_fmat->Info().num_row_), 0.8}; + for (auto const& page : p_fmat->GetBatches(&cpu_ctx, batch)) { + orig = {page.data.cbegin(), page.data.cend()}; + auto impl = EllpackPageImpl{&ctx, page, {}}; + this->CheckBasic(&ctx, batch, null_column, impl); + } + } + + { + // Test to GHist + auto p_fmat = this->InitSparsePage(null_column); + auto batch = BatchParam{static_cast(p_fmat->Info().num_row_), 0.8}; + for (auto const& page : p_fmat->GetBatches(&ctx, batch)) { + auto gidx = GHistIndexMatrix{&ctx, p_fmat->Info(), page, batch}; + ASSERT_EQ(gidx.Size(), p_fmat->Info().num_row_); + for (std::size_t ridx = 0; ridx < gidx.Size(); ++ridx) { + auto rbegin = gidx.row_ptr[ridx]; + auto rend = gidx.row_ptr[ridx + 1]; + ASSERT_EQ(rend - rbegin, p_fmat->Info().num_col_ - 1); + } + ASSERT_FALSE(gidx.index.Offset()); + ASSERT_TRUE(std::equal(gidx.data.cbegin(), gidx.data.cend(), orig.cbegin())); + } + } + } +}; + +TEST_P(CompressedDense, FromSparsePage) { this->CheckFromSparsePage(this->GetParam()); } + +TEST_P(CompressedDense, FromAdapter) { this->CheckFromAdapter(this->GetParam()); } + +TEST_P(CompressedDense, FromToGHist) { this->CheckFromToGHist(this->GetParam()); } +} // anonymous namespace + +INSTANTIATE_TEST_SUITE_P(EllpackPage, CompressedDense, testing::Values(0ul, 1ul, 7ul)); + +namespace { +class SparseEllpack : public testing::TestWithParam { protected: void TestFromGHistIndex(float sparsity) const { // Only testing with small sample size as the cuts might be different between host and @@ -268,7 +393,7 @@ class EllpackPageTest : public testing::TestWithParam { std::vector h_gidx_from_sparse, h_gidx_from_ghist; auto from_ghist_acc = from_ghist->GetHostAccessor(&gpu_ctx, &h_gidx_from_ghist); auto from_sparse_acc = from_sparse_page->GetHostAccessor(&gpu_ctx, &h_gidx_from_sparse); - for (size_t i = 0; i < from_ghist->n_rows * from_ghist->row_stride; ++i) { + for (size_t i = 0; i < from_ghist->n_rows * from_ghist->info.row_stride; ++i) { ASSERT_EQ(from_ghist_acc.gidx_iter[i], from_sparse_acc.gidx_iter[i]); } } @@ -289,9 +414,9 @@ class EllpackPageTest : public testing::TestWithParam { }; } // namespace -TEST_P(EllpackPageTest, FromGHistIndex) { this->TestFromGHistIndex(GetParam()); } +TEST_P(SparseEllpack, FromGHistIndex) { this->TestFromGHistIndex(GetParam()); } -TEST_P(EllpackPageTest, NumNonMissing) { this->TestNumNonMissing(this->GetParam()); } +TEST_P(SparseEllpack, NumNonMissing) { this->TestNumNonMissing(this->GetParam()); } -INSTANTIATE_TEST_SUITE_P(EllpackPage, EllpackPageTest, testing::Values(.0f, .2f, .4f, .8f)); +INSTANTIATE_TEST_SUITE_P(EllpackPage, SparseEllpack, testing::Values(.0f, .2f, .4f, .8f)); } // namespace xgboost diff --git a/tests/cpp/data/test_ellpack_page_raw_format.cu b/tests/cpp/data/test_ellpack_page_raw_format.cu index a26aaedb5e07..87fd6db5fa05 100644 --- a/tests/cpp/data/test_ellpack_page_raw_format.cu +++ b/tests/cpp/data/test_ellpack_page_raw_format.cu @@ -56,7 +56,7 @@ class TestEllpackPageRawFormat : public ::testing::TestWithParam { ASSERT_EQ(loaded->Cuts().MinValues(), orig->Cuts().MinValues()); 
ASSERT_EQ(loaded->Cuts().Values(), orig->Cuts().Values()); ASSERT_EQ(loaded->base_rowid, orig->base_rowid); - ASSERT_EQ(loaded->row_stride, orig->row_stride); + ASSERT_EQ(loaded->info.row_stride, orig->info.row_stride); std::vector h_loaded, h_orig; [[maybe_unused]] auto h_loaded_acc = loaded->GetHostAccessor(&ctx, &h_loaded); [[maybe_unused]] auto h_orig_acc = orig->GetHostAccessor(&ctx, &h_orig); @@ -125,7 +125,8 @@ TEST_P(TestEllpackPageRawFormat, HostIO) { ASSERT_EQ(h_acc_orig.row_stride, h_acc.row_stride); ASSERT_EQ(h_acc_orig.n_rows, h_acc.n_rows); ASSERT_EQ(h_acc_orig.base_rowid, h_acc.base_rowid); - ASSERT_EQ(h_acc_orig.is_dense, h_acc.is_dense); + ASSERT_EQ(h_acc_orig.IsDenseCompressed(), h_acc.IsDenseCompressed()); + ASSERT_EQ(h_acc_orig.NullValue(), h_acc.NullValue()); } } } diff --git a/tests/cpp/data/test_extmem_quantile_dmatrix.cc b/tests/cpp/data/test_extmem_quantile_dmatrix.cc index 623637ea4ec2..691bcf3369f8 100644 --- a/tests/cpp/data/test_extmem_quantile_dmatrix.cc +++ b/tests/cpp/data/test_extmem_quantile_dmatrix.cc @@ -45,7 +45,8 @@ class ExtMemQuantileDMatrixCpu : public ::testing::TestWithParam { }; Context ctx; - TestExtMemQdmBasic(&ctx, false, sparsity, equal); + TestExtMemQdmBasic( + &ctx, false, sparsity, equal, [](GHistIndexMatrix const& page) { return page.IsDense(); }); } }; } // anonymous namespace diff --git a/tests/cpp/data/test_extmem_quantile_dmatrix.cu b/tests/cpp/data/test_extmem_quantile_dmatrix.cu index 3b65dffa19a1..0c0be6be8c40 100644 --- a/tests/cpp/data/test_extmem_quantile_dmatrix.cu +++ b/tests/cpp/data/test_extmem_quantile_dmatrix.cu @@ -29,10 +29,13 @@ class ExtMemQuantileDMatrixGpu : public ::testing::TestWithParam { auto equal = std::equal(h_orig.cbegin(), h_orig.cend(), h_sparse.cbegin()); ASSERT_TRUE(equal); }; + auto no_missing = [](EllpackPage const& page) { + return page.Impl()->IsDense(); + }; auto ctx = MakeCUDACtx(0); - TestExtMemQdmBasic(&ctx, true, sparsity, equal); - TestExtMemQdmBasic(&ctx, false, sparsity, equal); + TestExtMemQdmBasic(&ctx, true, sparsity, equal, no_missing); + TestExtMemQdmBasic(&ctx, false, sparsity, equal, no_missing); } }; diff --git a/tests/cpp/data/test_extmem_quantile_dmatrix.h b/tests/cpp/data/test_extmem_quantile_dmatrix.h index 25f2e06542c8..2d4958010b03 100644 --- a/tests/cpp/data/test_extmem_quantile_dmatrix.h +++ b/tests/cpp/data/test_extmem_quantile_dmatrix.h @@ -8,8 +8,9 @@ #include "../helpers.h" // for RandomDataGenerator namespace xgboost::data { -template -void TestExtMemQdmBasic(Context const* ctx, bool on_host, float sparsity, Equal&& check_equal) { +template +void TestExtMemQdmBasic(Context const* ctx, bool on_host, float sparsity, Equal&& check_equal, + NoMissing&& no_missing) { bst_idx_t n_samples = 256, n_features = 16, n_batches = 4; bst_bin_t max_bin = 64; bst_target_t n_targets = 3; @@ -31,7 +32,7 @@ void TestExtMemQdmBasic(Context const* ctx, bool on_host, float sparsity, Equal& ++batch_cnt; base_cnt += n_samples / n_batches; row_cnt += page.Size(); - ASSERT_EQ((sparsity == 0.0f), page.IsDense()); + ASSERT_EQ((sparsity == 0.0f), no_missing(page)); } ASSERT_EQ(n_batches, batch_cnt); ASSERT_EQ(p_fmat->Info().num_row_, n_samples); diff --git a/tests/cpp/data/test_iterative_dmatrix.cu b/tests/cpp/data/test_iterative_dmatrix.cu index 8d2e837ff38c..24bd93cbe5d0 100644 --- a/tests/cpp/data/test_iterative_dmatrix.cu +++ b/tests/cpp/data/test_iterative_dmatrix.cu @@ -20,14 +20,15 @@ void TestEquivalent(float sparsity) { std::numeric_limits::quiet_NaN(), 0, 256); std::size_t offset = 0; auto 
first = (*m.GetEllpackBatches(&ctx, {}).begin()).Impl(); - std::unique_ptr page_concatenated{new EllpackPageImpl( - &ctx, first->CutsShared(), first->is_dense, first->row_stride, 1000 * 100)}; + std::unique_ptr page_concatenated{new EllpackPageImpl{ + &ctx, first->CutsShared(), first->is_dense, first->info.row_stride, 1000 * 100}}; for (auto& batch : m.GetBatches(&ctx, {})) { auto page = batch.Impl(); size_t num_elements = page_concatenated->Copy(&ctx, page, offset); offset += num_elements; } - auto from_iter = page_concatenated->GetDeviceAccessor(&ctx); + std::vector h_iter_buffer; + auto from_iter = page_concatenated->GetHostAccessor(&ctx, &h_iter_buffer); ASSERT_EQ(m.Info().num_col_, CudaArrayIterForTest::Cols()); ASSERT_EQ(m.Info().num_row_, CudaArrayIterForTest::Rows()); @@ -37,33 +38,20 @@ void TestEquivalent(float sparsity) { DMatrix::Create(&adapter, std::numeric_limits::quiet_NaN(), 0)}; auto bp = BatchParam{256, tree::TrainParam::DftSparseThreshold()}; for (auto& ellpack : dm->GetBatches(&ctx, bp)) { - auto from_data = ellpack.Impl()->GetDeviceAccessor(&ctx); + std::vector h_data_buffer; + auto from_data = ellpack.Impl()->GetHostAccessor(&ctx, &h_data_buffer); - std::vector cuts_from_iter(from_iter.gidx_fvalue_map.size()); - std::vector min_fvalues_iter(from_iter.min_fvalue.size()); - std::vector cut_ptrs_iter(from_iter.feature_segments.size()); - dh::CopyDeviceSpanToVector(&cuts_from_iter, from_iter.gidx_fvalue_map); - dh::CopyDeviceSpanToVector(&min_fvalues_iter, from_iter.min_fvalue); - dh::CopyDeviceSpanToVector(&cut_ptrs_iter, from_iter.feature_segments); - - std::vector cuts_from_data(from_data.gidx_fvalue_map.size()); - std::vector min_fvalues_data(from_data.min_fvalue.size()); - std::vector cut_ptrs_data(from_data.feature_segments.size()); - dh::CopyDeviceSpanToVector(&cuts_from_data, from_data.gidx_fvalue_map); - dh::CopyDeviceSpanToVector(&min_fvalues_data, from_data.min_fvalue); - dh::CopyDeviceSpanToVector(&cut_ptrs_data, from_data.feature_segments); - - ASSERT_EQ(cuts_from_iter.size(), cuts_from_data.size()); - for (size_t i = 0; i < cuts_from_iter.size(); ++i) { - EXPECT_NEAR(cuts_from_iter[i], cuts_from_data[i], kRtEps); + ASSERT_EQ(from_iter.gidx_fvalue_map.size(), from_data.gidx_fvalue_map.size()); + for (size_t i = 0; i < from_iter.gidx_fvalue_map.size(); ++i) { + EXPECT_NEAR(from_iter.gidx_fvalue_map[i], from_data.gidx_fvalue_map[i], kRtEps); } - ASSERT_EQ(min_fvalues_iter.size(), min_fvalues_data.size()); - for (size_t i = 0; i < min_fvalues_iter.size(); ++i) { - ASSERT_NEAR(min_fvalues_iter[i], min_fvalues_data[i], kRtEps); + ASSERT_EQ(from_iter.min_fvalue.size(), from_data.min_fvalue.size()); + for (size_t i = 0; i < from_iter.min_fvalue.size(); ++i) { + ASSERT_NEAR(from_iter.min_fvalue[i], from_data.min_fvalue[i], kRtEps); } - ASSERT_EQ(cut_ptrs_iter.size(), cut_ptrs_data.size()); - for (size_t i = 0; i < cut_ptrs_iter.size(); ++i) { - ASSERT_EQ(cut_ptrs_iter[i], cut_ptrs_data[i]); + ASSERT_EQ(from_iter.NumFeatures(), from_data.NumFeatures()); + for (size_t i = 0; i < from_iter.NumFeatures() + 1; ++i) { + ASSERT_EQ(from_iter.feature_segments[i], from_data.feature_segments[i]); } std::vector buffer_from_iter, buffer_from_data; diff --git a/tests/cpp/data/test_sparse_page_dmatrix.cu b/tests/cpp/data/test_sparse_page_dmatrix.cu index ff65b6ae59b3..dd26eca609fc 100644 --- a/tests/cpp/data/test_sparse_page_dmatrix.cu +++ b/tests/cpp/data/test_sparse_page_dmatrix.cu @@ -203,8 +203,8 @@ class TestEllpackPageExt : public ::testing::TestWithParamGetBatches(&ctx, 
param).begin()).Impl(); ASSERT_EQ(impl->base_rowid, 0); ASSERT_EQ(impl->n_rows, kRows); - ASSERT_EQ(impl->is_dense, is_dense); - ASSERT_EQ(impl->row_stride, 2); + ASSERT_EQ(impl->IsDense(), is_dense); + ASSERT_EQ(impl->info.row_stride, 2); ASSERT_EQ(impl->Cuts().TotalBins(), 4); std::unique_ptr impl_ext; @@ -213,15 +213,15 @@ class TestEllpackPageExt : public ::testing::TestWithParam(&ctx, batch.Impl()->CutsShared(), batch.Impl()->is_dense, - batch.Impl()->row_stride, kRows); + batch.Impl()->info.row_stride, kRows); } auto n_elems = impl_ext->Copy(&ctx, batch.Impl(), offset); offset += n_elems; } ASSERT_EQ(impl_ext->base_rowid, 0); ASSERT_EQ(impl_ext->n_rows, kRows); - ASSERT_EQ(impl_ext->is_dense, is_dense); - ASSERT_EQ(impl_ext->row_stride, 2); + ASSERT_EQ(impl_ext->IsDense(), is_dense); + ASSERT_EQ(impl_ext->info.row_stride, 2); ASSERT_EQ(impl_ext->Cuts().TotalBins(), 4); std::vector buffer; diff --git a/tests/cpp/test_context.cu b/tests/cpp/test_context.cu index a2322d23b1f4..5d8a67c22b05 100644 --- a/tests/cpp/test_context.cu +++ b/tests/cpp/test_context.cu @@ -28,7 +28,7 @@ void TestCUDA(Context const& ctx, bst_d_ordinal_t ord) { } } // namespace -TEST(Context, DeviceOrdinal) { +TEST(Context, MGPUDeviceOrdinal) { Context ctx; auto n_vis = curt::AllVisibleGPUs(); auto ord = n_vis - 1; @@ -77,7 +77,7 @@ TEST(Context, DeviceOrdinal) { TestCUDA(ctx, 0); } -TEST(Context, GPUId) { +TEST(Context, MGPUId) { Context ctx; ctx.UpdateAllowUnknown(Args{{"gpu_id", "0"}}); TestCUDA(ctx, 0); diff --git a/tests/cpp/tree/gpu_hist/test_histogram.cu b/tests/cpp/tree/gpu_hist/test_histogram.cu index e26c8b980649..84be979d85ad 100644 --- a/tests/cpp/tree/gpu_hist/test_histogram.cu +++ b/tests/cpp/tree/gpu_hist/test_histogram.cu @@ -61,8 +61,7 @@ TEST(Histogram, SubtractionTrack) { auto page = BuildEllpackPage(&ctx, 64, 4); auto cuts = page->CutsShared(); - FeatureGroups fg{*cuts, true, std::numeric_limits::max(), - sizeof(GradientPairPrecise)}; + FeatureGroups fg{*cuts, true, std::numeric_limits::max()}; auto fg_acc = fg.DeviceAccessor(ctx.Device()); auto n_total_bins = cuts->TotalBins(); @@ -102,14 +101,7 @@ std::vector GetHostHistGpair() { void TestBuildHist(bool use_shared_memory_histograms) { int const kNRows = 16, kNCols = 8; - Context ctx{MakeCUDACtx(0)}; - - TrainParam param; - Args args{ - {"max_depth", "6"}, - {"max_leaves", "0"}, - }; - param.Init(args); + auto ctx = MakeCUDACtx(0); auto page = BuildEllpackPage(&ctx, kNRows, kNCols); BatchParam batch_param{}; @@ -129,7 +121,7 @@ void TestBuildHist(bool use_shared_memory_histograms) { auto quantiser = std::make_unique(&ctx, gpair.ConstDeviceSpan(), MetaInfo()); auto shm_size = use_shared_memory_histograms ? 
dh::MaxSharedMemoryOptin(ctx.Ordinal()) : 0; - FeatureGroups feature_groups(page->Cuts(), page->is_dense, shm_size, sizeof(GradientPairInt64)); + FeatureGroups feature_groups(page->Cuts(), page->IsDenseCompressed(), shm_size); DeviceHistogramBuilder builder; builder.Reset(&ctx, HistMakerTrainParam::CudaDefaultNodes(), @@ -161,7 +153,8 @@ TEST(Histogram, BuildHistSharedMem) { TestBuildHist(true); } -void TestDeterministicHistogram(bool is_dense, int shm_size, bool force_global) { +namespace { +void TestDeterministicHistogram(bool is_dense, std::size_t shm_size, bool force_global) { Context ctx = MakeCUDACtx(0); size_t constexpr kBins = 256, kCols = 120, kRows = 16384, kRounds = 16; float constexpr kLower = -1e-2, kUpper = 1e2; @@ -183,7 +176,7 @@ void TestDeterministicHistogram(bool is_dense, int shm_size, bool force_global) auto gpair = GenerateRandomGradients(kRows, kLower, kUpper); gpair.SetDevice(ctx.Device()); - FeatureGroups feature_groups(page->Cuts(), page->is_dense, shm_size, sizeof(GradientPairInt64)); + FeatureGroups feature_groups{page->Cuts(), page->IsDenseCompressed(), shm_size}; auto quantiser = GradientQuantiser(&ctx, gpair.DeviceSpan(), MetaInfo()); DeviceHistogramBuilder builder; @@ -211,8 +204,7 @@ void TestDeterministicHistogram(bool is_dense, int shm_size, bool force_global) std::vector new_histogram_h(num_bins); dh::safe_cuda(cudaMemcpy(new_histogram_h.data(), d_new_histogram.data(), - num_bins * sizeof(GradientPairInt64), - cudaMemcpyDeviceToHost)); + num_bins * sizeof(GradientPairInt64), cudaMemcpyDeviceToHost)); for (size_t j = 0; j < new_histogram_h.size(); ++j) { ASSERT_EQ(new_histogram_h[j].GetQuantisedGrad(), histogram_h[j].GetQuantisedGrad()); ASSERT_EQ(new_histogram_h[j].GetQuantisedHess(), histogram_h[j].GetQuantisedHess()); @@ -236,28 +228,30 @@ void TestDeterministicHistogram(bool is_dense, int shm_size, bool force_global) std::vector baseline_h(num_bins); dh::safe_cuda(cudaMemcpy(baseline_h.data(), baseline.data().get(), - num_bins * sizeof(GradientPairInt64), - cudaMemcpyDeviceToHost)); + num_bins * sizeof(GradientPairInt64), cudaMemcpyDeviceToHost)); for (size_t i = 0; i < baseline.size(); ++i) { - EXPECT_NEAR(baseline_h[i].GetQuantisedGrad(), histogram_h[i].GetQuantisedGrad(), + ASSERT_NEAR(baseline_h[i].GetQuantisedGrad(), histogram_h[i].GetQuantisedGrad(), baseline_h[i].GetQuantisedGrad() * 1e-3); } } } } - -TEST(Histogram, GPUDeterministic) { - std::vector is_dense_array{false, true}; - std::vector shm_sizes{48 * 1024, 64 * 1024, 160 * 1024}; - for (bool is_dense : is_dense_array) { - for (int shm_size : shm_sizes) { - for (bool force_global : {true, false}) { - TestDeterministicHistogram(is_dense, shm_size, force_global); - } - } +class TestGPUDeterministic : public ::testing::TestWithParam> { + protected: + void Run() { + auto [is_dense, shm_size, force_global] = this->GetParam(); + TestDeterministicHistogram(is_dense, shm_size, force_global); } -} +}; +} // anonymous namespace + +TEST_P(TestGPUDeterministic, Histogram) { this->Run(); } + +INSTANTIATE_TEST_SUITE_P(Histogram, TestGPUDeterministic, + ::testing::Combine(::testing::Bool(), + ::testing::Values(48 * 1024, 64 * 1024, 160 * 1024), + ::testing::Bool())); void ValidateCategoricalHistogram(size_t n_categories, common::Span onehot, common::Span cat) { @@ -513,13 +507,7 @@ TEST_P(HistogramExternalMemoryTest, ExternalMemory) { std::apply(&HistogramExternalMemoryTest::Run, std::tuple_cat(std::make_tuple(this), GetParam())); } -INSTANTIATE_TEST_SUITE_P(Histogram, HistogramExternalMemoryTest, 
::testing::ValuesIn([]() { - std::vector> params; - for (auto global : {true, false}) { - for (auto sparsity : {0.0f, 0.2f, 0.8f}) { - params.emplace_back(sparsity, global); - } - } - return params; - }())); +INSTANTIATE_TEST_SUITE_P(Histogram, HistogramExternalMemoryTest, + ::testing::Combine(::testing::Values(0.0f, 0.2f, 0.8f), + ::testing::Bool())); } // namespace xgboost::tree diff --git a/tests/cpp/tree/test_gpu_approx.cu b/tests/cpp/tree/test_gpu_approx.cu new file mode 100644 index 000000000000..7df60b8cbcd2 --- /dev/null +++ b/tests/cpp/tree/test_gpu_approx.cu @@ -0,0 +1,65 @@ +/** + * Copyright 2024, XGBoost contributors + */ +#include +#include // for Json +#include // for ObjInfo +#include // for RegTree +#include // for TreeUpdater + +#include "../../../src/tree/param.h" // for TrainParam +#include "../collective/test_worker.h" // for BaseMGPUTest +#include "../helpers.h" // for GenerateRandomGradients + +namespace xgboost::tree { +namespace { +RegTree GetApproxTree(Context const* ctx, DMatrix* dmat) { + ObjInfo task{ObjInfo::kRegression}; + std::unique_ptr approx_maker{TreeUpdater::Create("grow_gpu_approx", ctx, &task)}; + approx_maker->Configure(Args{}); + + TrainParam param; + param.UpdateAllowUnknown(Args{}); + + linalg::Matrix gpair({dmat->Info().num_row_}, ctx->Device()); + gpair.Data()->Copy(GenerateRandomGradients(dmat->Info().num_row_)); + + std::vector> position(1); + RegTree tree; + approx_maker->Update(¶m, &gpair, dmat, common::Span>{position}, + {&tree}); + return tree; +} + +void VerifyApproxColumnSplit(bst_idx_t rows, bst_feature_t cols, RegTree const& expected_tree) { + auto ctx = MakeCUDACtx(DistGpuIdx()); + + auto Xy = RandomDataGenerator{rows, cols, 0}.GenerateDMatrix(true); + auto const world_size = collective::GetWorldSize(); + auto const rank = collective::GetRank(); + std::unique_ptr sliced{Xy->SliceCol(world_size, rank)}; + + RegTree tree = GetApproxTree(&ctx, sliced.get()); + + Json json{Object{}}; + tree.SaveModel(&json); + Json expected_json{Object{}}; + expected_tree.SaveModel(&expected_json); + ASSERT_EQ(json, expected_json); +} +} // anonymous namespace + +class MGPUApproxTest : public collective::BaseMGPUTest {}; + +TEST_F(MGPUApproxTest, GPUApproxColumnSplit) { + auto constexpr kRows = 32; + auto constexpr kCols = 16; + + Context ctx(MakeCUDACtx(0)); + auto dmat = RandomDataGenerator{kRows, kCols, 0}.GenerateDMatrix(true); + RegTree expected_tree = GetApproxTree(&ctx, dmat.get()); + + this->DoTest([&] { VerifyApproxColumnSplit(kRows, kCols, expected_tree); }, true); + this->DoTest([&] { VerifyApproxColumnSplit(kRows, kCols, expected_tree); }, false); +} +} // namespace xgboost::tree diff --git a/tests/test_distributed/test_gpu_with_dask/test_gpu_external_memory.py b/tests/test_distributed/test_gpu_with_dask/test_gpu_external_memory.py new file mode 100644 index 000000000000..4e0f0bcc2f89 --- /dev/null +++ b/tests/test_distributed/test_gpu_with_dask/test_gpu_external_memory.py @@ -0,0 +1,31 @@ +"""Copyright 2024, XGBoost contributors""" + +import pytest +from dask_cuda import LocalCUDACluster +from distributed import Client + +import xgboost as xgb +from xgboost.testing.dask import check_external_memory + + +@pytest.mark.parametrize("is_qdm", [True, False]) +def test_external_memory(is_qdm: bool) -> None: + n_workers = 2 + with LocalCUDACluster(n_workers=2) as cluster: + with Client(cluster) as client: + args = client.sync( + xgb.dask._get_rabit_args, + 2, + None, + client, + ) + + futs = client.map( + check_external_memory, + range(n_workers), + 
n_workers=n_workers, + device="cuda", + comm_args=args, + is_qdm=is_qdm, + ) + client.gather(futs) From 4e1152cfb044fa2414b95a179493091f91f0caf4 Mon Sep 17 00:00:00 2001 From: Jiaming Yuan Date: Tue, 1 Oct 2024 18:53:49 +0800 Subject: [PATCH 2/6] Avoid over compression for now. cleanup. Fix. test. fix. --- src/data/ellpack_page.cu | 60 ++++++++++-------------- tests/cpp/data/test_ellpack_page.cu | 18 +++++-- tests/cpp/data/test_iterative_dmatrix.cu | 36 +++++++------- 3 files changed, 57 insertions(+), 57 deletions(-) diff --git a/src/data/ellpack_page.cu b/src/data/ellpack_page.cu index 57779aa55c91..fe804ebbcdd4 100644 --- a/src/data/ellpack_page.cu +++ b/src/data/ellpack_page.cu @@ -130,9 +130,10 @@ __global__ void CompressBinEllpackKernel( namespace { // Calculate the number of symbols for the compressed ellpack. Similar to what the CPU // implementation does, we compress the dense data by subtracting the bin values with the -// starting bin of its feature if it's dense. +// starting bin of its feature if it's dense. In addition, we treat the data as dense if +// there's no compression to be made by using ellpack. [[nodiscard]] EllpackPageImpl::Info CalcNumSymbols( - Context const* ctx, bst_idx_t n_samples, bst_idx_t row_stride, bool is_dense, + Context const* ctx, bst_idx_t row_stride, bool is_dense, std::shared_ptr cuts) { // Return the total number of symbols (total number of bins plus 1 for missing) // The null value equals the total number of bins. @@ -148,38 +149,26 @@ namespace { // Calculate the number of required symbols if we treat the data as dense. PtrT n_symbols_dense{0}; - { - CUDAContext const* cuctx = ctx->CUDACtx(); - auto it = dh::MakeTransformIterator( - thrust::make_counting_iterator(1ul), - [=] XGBOOST_DEVICE(std::size_t i) { return dptrs[i] - dptrs[i - 1]; }); - CHECK_GE(dptrs.size(), 2); - auto max_it = thrust::max_element(cuctx->CTP(), it, it + dptrs.size() - 1); - dh::CachingDeviceUVector max_element(1); - auto d_me = max_element.data(); - dh::LaunchN(1, cuctx->Stream(), [=] XGBOOST_DEVICE(std::size_t i) { d_me[i] = *max_it; }); - - dh::safe_cuda(cudaMemcpyAsync(&n_symbols_dense, d_me, sizeof(PtrT), cudaMemcpyDeviceToHost, - cuctx->Stream())); - cuctx->Stream().Sync(); - } - + CUDAContext const* cuctx = ctx->CUDACtx(); + auto it = dh::MakeTransformIterator( + thrust::make_counting_iterator(1ul), + [=] XGBOOST_DEVICE(std::size_t i) { return dptrs[i] - dptrs[i - 1]; }); + CHECK_GE(dptrs.size(), 2); + auto max_it = thrust::max_element(cuctx->CTP(), it, it + dptrs.size() - 1); + dh::CachingDeviceUVector max_element(1); + auto d_me = max_element.data(); + dh::LaunchN(1, cuctx->Stream(), [=] XGBOOST_DEVICE(std::size_t i) { d_me[i] = *max_it; }); + dh::safe_cuda(cudaMemcpyAsync(&n_symbols_dense, d_me, sizeof(PtrT), cudaMemcpyDeviceToHost, + cuctx->Stream())); + cuctx->Stream().Sync(); + // Decide the type of the data. CHECK_LE(row_stride, n_features); - // Treat the ellpack as dense if we can save memory. - auto calc_n_bytes = [&](bst_idx_t row_stride, auto n_symbols) { - return common::CompressedBufferWriter::CalculateBufferSize(row_stride * n_samples, n_symbols); - }; - auto as_dense = [&] { - bool no_compression = n_features == row_stride; - bool less = calc_n_bytes(n_features, n_symbols_dense + 1) < calc_n_bytes(row_stride, n_symbols); - return no_compression || less; - }; - if (is_dense) { // No missing, hence no null value, hence no + 1 symbol. 
LOG(INFO) << "Ellpack is dense."; return {n_features, n_symbols_dense}; - } else if (as_dense()) { + } else if (n_features == row_stride) { + // Treat the ellpack as dense if we can save memory. LOG(INFO) << "Ellpack is relatively dense."; return {n_features, n_symbols_dense + 1}; // +1 for missing value (null in ellpack) } else { @@ -196,7 +185,7 @@ EllpackPageImpl::EllpackPageImpl(Context const* ctx, : is_dense{is_dense}, n_rows{n_rows}, cuts_{std::move(cuts)}, - info{CalcNumSymbols(ctx, n_rows, row_stride, is_dense, this->cuts_)} { + info{CalcNumSymbols(ctx, row_stride, is_dense, this->cuts_)} { monitor_.Init("ellpack_page"); curt::SetDevice(ctx->Ordinal()); @@ -210,7 +199,7 @@ EllpackPageImpl::EllpackPageImpl(Context const* ctx, : is_dense{is_dense}, n_rows{page.Size()}, cuts_{std::move(cuts)}, - info{CalcNumSymbols(ctx, n_rows, row_stride, is_dense, this->cuts_)} { + info{CalcNumSymbols(ctx, row_stride, is_dense, this->cuts_)} { monitor_.Init("ellpack_page"); curt::SetDevice(ctx->Ordinal()); @@ -228,7 +217,7 @@ EllpackPageImpl::EllpackPageImpl(Context const* ctx, DMatrix* p_fmat, const Batc common::DeviceSketch(ctx, p_fmat, param.max_bin)) : std::make_shared( common::DeviceSketchWithHessian(ctx, p_fmat, param.max_bin, param.hess))}, - info{CalcNumSymbols(ctx, n_rows, GetRowStride(p_fmat), p_fmat->IsDense(), this->cuts_)} { + info{CalcNumSymbols(ctx, GetRowStride(p_fmat), p_fmat->IsDense(), this->cuts_)} { monitor_.Init("ellpack_page"); curt::SetDevice(ctx->Ordinal()); @@ -466,7 +455,7 @@ EllpackPageImpl::EllpackPageImpl(Context const* ctx, GHistIndexMatrix const& pag return cuts; }()}, info{CalcNumSymbols( - ctx, n_rows, + ctx, [&] { auto it = common::MakeIndexTransformIter( [&](bst_idx_t i) { return page.row_ptr[i + 1] - page.row_ptr[i]; }); @@ -523,8 +512,7 @@ struct CopyPage { bst_idx_t EllpackPageImpl::Copy(Context const* ctx, EllpackPageImpl const* page, bst_idx_t offset) { monitor_.Start(__func__); bst_idx_t num_elements = page->n_rows * page->info.row_stride; - CHECK_EQ(this->info.row_stride, - page->info.row_stride); // fixme: make sure row strides are correctly handled + CHECK_EQ(this->info.row_stride, page->info.row_stride); CHECK_EQ(NumSymbols(), page->NumSymbols()); CHECK_GE(this->n_rows * this->info.row_stride, offset + num_elements); if (page == this) { @@ -582,7 +570,7 @@ struct CompactPage { void EllpackPageImpl::Compact(Context const* ctx, EllpackPageImpl const* page, common::Span row_indexes) { monitor_.Start(__func__); - CHECK_EQ(this->info.row_stride, page->info.row_stride); // fixme: check in iterative dmatrix + CHECK_EQ(this->info.row_stride, page->info.row_stride); CHECK_EQ(this->NumSymbols(), page->NumSymbols()); CHECK_LE(page->base_rowid + page->n_rows, row_indexes.size()); auto cuctx = ctx->CUDACtx(); diff --git a/tests/cpp/data/test_ellpack_page.cu b/tests/cpp/data/test_ellpack_page.cu index 799e08d4a741..932d43ecdfa6 100644 --- a/tests/cpp/data/test_ellpack_page.cu +++ b/tests/cpp/data/test_ellpack_page.cu @@ -253,6 +253,7 @@ class CompressedDense : public ::testing::TestWithParam { for (std::size_t i = 0; i < data.size(); i += n_features) { data[i + null_column] = std::numeric_limits::quiet_NaN(); } + data[null_column] = null_column; // keep the first sample full. 
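// [Editor's sketch, not part of the patch] The CalcNumSymbols change above picks between global
// bin indices and feature-local ("dense compressed") indices, plus one extra symbol for the null
// value when missing entries exist. A rough model of why the feature-local encoding saves bits;
// SymbolBits is a hypothetical helper, not the library's API:
#include <cmath>
#include <cstddef>

inline std::size_t SymbolBits(std::size_t n_symbols) {
  return static_cast<std::size_t>(std::ceil(std::log2(static_cast<double>(n_symbols))));
}
// Example: 120 features x 256 bins needs SymbolBits(30720 + 1) == 15 bits per entry when bins are
// addressed globally, but only SymbolBits(256 + 1) == 9 bits with feature-local symbols.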
auto p_fmat = GetDMatrixFromData(data, n_samples, n_features); return p_fmat; } @@ -269,7 +270,7 @@ class CompressedDense : public ::testing::TestWithParam { ASSERT_EQ(h_acc.NullValue(), batch.max_bin); for (std::size_t i = 0; i < h_acc.row_stride * h_acc.n_rows; ++i) { auto [m, n] = linalg::UnravelIndex(i, h_acc.n_rows, h_acc.row_stride); - if (n == null_column) { + if (n == null_column && m != 0) { ASSERT_EQ(static_cast(h_acc.gidx_iter[i]), h_acc.NullValue()); } else { ASSERT_EQ(static_cast(h_acc.gidx_iter[i]), m); @@ -298,6 +299,7 @@ class CompressedDense : public ::testing::TestWithParam { for (std::size_t i = 0; i < h_data.size(); i += n_features) { h_data[i + null_column] = std::numeric_limits::quiet_NaN(); } + h_data[null_column] = null_column; // Keep the first sample full. auto p_fmat = GetDMatrixFromData(h_data, n_samples, n_features); auto arri = GetArrayInterface(&data, n_samples, n_features); @@ -313,11 +315,12 @@ class CompressedDense : public ::testing::TestWithParam { cuts = std::make_shared(page.Cuts()); } dh::device_vector row_counts(n_samples, n_features - 1); + row_counts[0] = n_features; auto d_row_counts = dh::ToSpan(row_counts); auto impl = - EllpackPageImpl{&ctx, adapter.Value(), std::numeric_limits::quiet_NaN(), - false, d_row_counts, {}, - n_features - 1, n_samples, cuts}; + EllpackPageImpl{&ctx, adapter.Value(), std::numeric_limits::quiet_NaN(), + false, d_row_counts, {}, + n_features, n_samples, cuts}; this->CheckBasic(&ctx, batch, null_column, impl); } @@ -346,8 +349,13 @@ class CompressedDense : public ::testing::TestWithParam { for (std::size_t ridx = 0; ridx < gidx.Size(); ++ridx) { auto rbegin = gidx.row_ptr[ridx]; auto rend = gidx.row_ptr[ridx + 1]; - ASSERT_EQ(rend - rbegin, p_fmat->Info().num_col_ - 1); + if (ridx == 0) { + ASSERT_EQ(rend - rbegin, p_fmat->Info().num_col_); + } else { + ASSERT_EQ(rend - rbegin, p_fmat->Info().num_col_ - 1); + } } + // GHist can't compress a dataset with missing values ASSERT_FALSE(gidx.index.Offset()); ASSERT_TRUE(std::equal(gidx.data.cbegin(), gidx.data.cend(), orig.cbegin())); } diff --git a/tests/cpp/data/test_iterative_dmatrix.cu b/tests/cpp/data/test_iterative_dmatrix.cu index 24bd93cbe5d0..eebc38fbeee2 100644 --- a/tests/cpp/data/test_iterative_dmatrix.cu +++ b/tests/cpp/data/test_iterative_dmatrix.cu @@ -110,39 +110,43 @@ TEST(IterativeDeviceDMatrix, RowMajor) { TEST(IterativeDeviceDMatrix, RowMajorMissing) { const float kMissing = std::numeric_limits::quiet_NaN(); - size_t rows = 10; - size_t cols = 2; - CudaArrayIterForTest iter(0.0f, rows, cols, 2); + bst_idx_t rows = 4; + size_t cols = 3; + CudaArrayIterForTest iter{0.0f, rows, cols, 2}; std::string interface_str = iter.AsArray(); - auto j_interface = - Json::Load({interface_str.c_str(), interface_str.size()}); - ArrayInterface<2> loaded {get(j_interface)}; + auto j_interface = Json::Load({interface_str.c_str(), interface_str.size()}); + ArrayInterface<2> loaded{get(j_interface)}; std::vector h_data(cols * rows); common::Span s_data{static_cast(loaded.data), cols * rows}; dh::CopyDeviceSpanToVector(&h_data, s_data); h_data[1] = kMissing; h_data[5] = kMissing; h_data[6] = kMissing; - auto ptr = thrust::device_ptr( - reinterpret_cast(get(j_interface["data"][0]))); + h_data[9] = kMissing; // idx = (2, 0) + h_data[10] = kMissing; // idx = (2, 1) + auto ptr = + thrust::device_ptr(reinterpret_cast(get(j_interface["data"][0]))); thrust::copy(h_data.cbegin(), h_data.cend(), ptr); - - IterativeDMatrix m(&iter, iter.Proxy(), nullptr, Reset, Next, - 
std::numeric_limits::quiet_NaN(), 0, 256); + IterativeDMatrix m{ + &iter, iter.Proxy(), nullptr, Reset, Next, std::numeric_limits::quiet_NaN(), 0, 256}; auto ctx = MakeCUDACtx(0); auto& ellpack = *m.GetBatches(&ctx, BatchParam{256, tree::TrainParam::DftSparseThreshold()}) .begin(); auto impl = ellpack.Impl(); std::vector h_gidx; - auto h_accessor = impl->GetHostAccessor(&ctx, &h_gidx); - EXPECT_EQ(h_accessor.gidx_iter[1], impl->GetDeviceAccessor(&ctx).NullValue()); - EXPECT_EQ(h_accessor.gidx_iter[5], impl->GetDeviceAccessor(&ctx).NullValue()); + auto h_acc = impl->GetHostAccessor(&ctx, &h_gidx); // null values get placed after valid values in a row - EXPECT_EQ(h_accessor.gidx_iter[7], impl->GetDeviceAccessor(&ctx).NullValue()); + ASSERT_FALSE(h_acc.IsDenseCompressed()); + ASSERT_EQ(h_acc.row_stride, cols - 1); + ASSERT_EQ(h_acc.gidx_iter[7], impl->GetDeviceAccessor(&ctx).NullValue()); + for (std::size_t i = 0; i < 7; ++i) { + ASSERT_NE(h_acc.gidx_iter[i], impl->GetDeviceAccessor(&ctx).NullValue()); + } + EXPECT_EQ(m.Info().num_col_, cols); EXPECT_EQ(m.Info().num_row_, rows); - EXPECT_EQ(m.Info().num_nonzero_, rows* cols - 3); + EXPECT_EQ(m.Info().num_nonzero_, rows * cols - 5); } TEST(IterativeDeviceDMatrix, IsDense) { From 5d7c22f7722a12701b1d9918bf50478a197168f6 Mon Sep 17 00:00:00 2001 From: Jiaming Yuan Date: Thu, 3 Oct 2024 00:19:17 +0800 Subject: [PATCH 3/6] Update & fix. --- src/data/ellpack_page.cuh | 9 ++++++++- src/data/ellpack_page.h | 2 ++ tests/cpp/data/test_ellpack_page.cu | 8 ++++++-- 3 files changed, 16 insertions(+), 3 deletions(-) diff --git a/src/data/ellpack_page.cuh b/src/data/ellpack_page.cuh index 7a2ce40add9d..e85bb0970d26 100644 --- a/src/data/ellpack_page.cuh +++ b/src/data/ellpack_page.cuh @@ -141,6 +141,13 @@ struct EllpackDeviceAccessor { class GHistIndexMatrix; +/** + * @brief This is either an Ellpack format matrix or a dense matrix. + * + * When there's no compression can be made by using ellpack, we use this structure as a + * simple dense matrix. For dense matrix, we can provide extra compression by counting the + * histogram bin for each feature instead of for the entire dataset. + */ class EllpackPageImpl { public: /** @@ -152,7 +159,7 @@ class EllpackPageImpl { EllpackPageImpl() = default; /** - * @brief Constructor from an existing EllpackInfo. + * @brief Constructor from existing ellpack matrics. * * This is used in the sampling case. The ELLPACK page is constructed from an existing * Ellpack page and the given number of rows. diff --git a/src/data/ellpack_page.h b/src/data/ellpack_page.h index bbd6db0eef7f..8e54855049a2 100644 --- a/src/data/ellpack_page.h +++ b/src/data/ellpack_page.h @@ -17,6 +17,8 @@ class EllpackPageImpl; * * This class uses the PImpl idiom (https://en.cppreference.com/w/cpp/language/pimpl) to avoid * including CUDA-specific implementation details in the header. + * + * See @ref EllpackPageImpl . 
*/ class EllpackPage { public: diff --git a/tests/cpp/data/test_ellpack_page.cu b/tests/cpp/data/test_ellpack_page.cu index 932d43ecdfa6..db843cd0564f 100644 --- a/tests/cpp/data/test_ellpack_page.cu +++ b/tests/cpp/data/test_ellpack_page.cu @@ -293,7 +293,9 @@ class CompressedDense : public ::testing::TestWithParam { void CheckFromAdapter(std::size_t null_column) { bst_idx_t n_samples = 16, n_features = 8; - HostDeviceVector data(n_samples * n_features); + + auto ctx = MakeCUDACtx(0); + HostDeviceVector data(n_samples * n_features, 0.0f, ctx.Device()); auto& h_data = data.HostVector(); std::iota(h_data.begin(), h_data.end(), 0.0f); for (std::size_t i = 0; i < h_data.size(); i += n_features) { @@ -302,11 +304,11 @@ class CompressedDense : public ::testing::TestWithParam { h_data[null_column] = null_column; // Keep the first sample full. auto p_fmat = GetDMatrixFromData(h_data, n_samples, n_features); + data.ConstDeviceSpan(); // Pull to device auto arri = GetArrayInterface(&data, n_samples, n_features); auto sarri = Json::Dump(arri); data::CupyAdapter adapter{StringView{sarri}}; - auto ctx = MakeCUDACtx(0); Context cpu_ctx; auto batch = BatchParam{static_cast(p_fmat->Info().num_row_), 0.8}; @@ -317,11 +319,13 @@ class CompressedDense : public ::testing::TestWithParam { dh::device_vector row_counts(n_samples, n_features - 1); row_counts[0] = n_features; auto d_row_counts = dh::ToSpan(row_counts); + ASSERT_EQ(adapter.NumColumns(), n_features); auto impl = EllpackPageImpl{&ctx, adapter.Value(), std::numeric_limits::quiet_NaN(), false, d_row_counts, {}, n_features, n_samples, cuts}; this->CheckBasic(&ctx, batch, null_column, impl); + dh::DefaultStream().Sync(); } void CheckFromToGHist(std::size_t null_column) { From d7e934456397339391aef6bbce7ba5249eb17043 Mon Sep 17 00:00:00 2001 From: Jiaming Yuan Date: Thu, 3 Oct 2024 16:44:17 +0800 Subject: [PATCH 4/6] empty. --- src/data/ellpack_page.cu | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/data/ellpack_page.cu b/src/data/ellpack_page.cu index fe804ebbcdd4..7ca3fcc065d8 100644 --- a/src/data/ellpack_page.cu +++ b/src/data/ellpack_page.cu @@ -328,7 +328,7 @@ void CopyDataToEllpack(Context const* ctx, const AdapterBatchT& batch, // For dense compressed data, we can simply copy the data with the input position. if (kIsDense) { - CHECK_EQ(batch.NumCols(), dst->info.row_stride); + CHECK(batch.NumRows() == 0 || batch.NumCols() == dst->info.row_stride); thrust::for_each_n(ctx->CUDACtx()->CTP(), cnt, dst->Size() * dst->info.row_stride, functor); return; } From adda02e32bd606f24f52c061a2f9e3f47f63a519 Mon Sep 17 00:00:00 2001 From: Jiaming Yuan Date: Thu, 3 Oct 2024 17:32:02 +0800 Subject: [PATCH 5/6] cleanup. 
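As background for the kIsDense -> kIsDenseCompressed rename in this cleanup, here is a rough, illustrative sketch of the dense-compressed encoding; the helper names below are made up for illustration and are not part of the XGBoost sources:

#include <cstddef>
#include <cstdint>
#include <vector>

// For a dense page, a bin index can be stored relative to the first cut of its
// feature, so the compressed buffer only needs enough symbols for the largest
// per-feature bin count (plus a null marker) instead of the total bin count
// across all features.
std::uint32_t EncodeDenseBin(std::vector<std::uint32_t> const& feature_segments,
                             std::uint32_t global_bin_idx, std::size_t fidx) {
  return global_bin_idx - feature_segments[fidx];
}

std::uint32_t DecodeDenseBin(std::vector<std::uint32_t> const& feature_segments,
                             std::uint32_t local_bin_idx, std::size_t fidx) {
  return local_bin_idx + feature_segments[fidx];
}

With cut segments {0, 3, 6, 9} (three features, three bins each), a global bin index of 4 in feature 1 is written as 4 - 3 = 1. This mirrors the bin_idx -= accessor.feature_segments[e.column_idx] adjustment in WriteCompressedEllpackFunctor below, and the gidx_iter[i] + feature_segments[fidx] reconstruction asserted in the BuildGidxDense test.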
--- src/data/ellpack_page.cu | 8 ++++---- src/data/ellpack_page.cuh | 2 +- src/tree/gpu_hist/gradient_based_sampler.cu | 4 ++-- src/tree/updater_gpu_hist.cu | 5 +++-- tests/cpp/data/test_ellpack_page.cu | 3 +++ 5 files changed, 13 insertions(+), 9 deletions(-) diff --git a/src/data/ellpack_page.cu b/src/data/ellpack_page.cu index 7ca3fcc065d8..ef7f7975baf0 100644 --- a/src/data/ellpack_page.cu +++ b/src/data/ellpack_page.cu @@ -257,7 +257,7 @@ struct WriteCompressedEllpackFunctor { // Tuple[2] = The index in the input data using Tuple = thrust::tuple; - template + template __device__ void Write(data::COOTuple const& e, bst_idx_t out_position) { bst_bin_t bin_idx = 0; if (common::IsCat(feature_types, e.column_idx)) { @@ -265,7 +265,7 @@ struct WriteCompressedEllpackFunctor { } else { bin_idx = accessor.SearchBin(e.value, e.column_idx); } - if constexpr (kIsDense) { + if constexpr (kIsDenseCompressed) { bin_idx -= accessor.feature_segments[e.column_idx]; } writer.AtomicWriteSymbol(d_buffer, bin_idx, out_position); @@ -306,7 +306,7 @@ struct TupleScanOp { // Here the data is already correctly ordered and simply needs to be compacted // to remove missing data -template +template void CopyDataToEllpack(Context const* ctx, const AdapterBatchT& batch, common::Span feature_types, EllpackPageImpl* dst, float missing) { @@ -327,7 +327,7 @@ void CopyDataToEllpack(Context const* ctx, const AdapterBatchT& batch, d_compressed_buffer, writer, batch, device_accessor, feature_types, is_valid}; // For dense compressed data, we can simply copy the data with the input position. - if (kIsDense) { + if (kIsDenseCompressed) { CHECK(batch.NumRows() == 0 || batch.NumCols() == dst->info.row_stride); thrust::for_each_n(ctx->CUDACtx()->CTP(), cnt, dst->Size() * dst->info.row_stride, functor); return; diff --git a/src/data/ellpack_page.cuh b/src/data/ellpack_page.cuh index e85bb0970d26..85e412f86398 100644 --- a/src/data/ellpack_page.cuh +++ b/src/data/ellpack_page.cuh @@ -237,7 +237,7 @@ class EllpackPageImpl { */ [[nodiscard]] bool IsDense() const { return this->is_dense; } /** - * @brief Stored as a dense matrix, but there are might be missing values. + * @brief Stored as a dense matrix, but there might be missing values. */ [[nodiscard]] bool IsDenseCompressed() const { return this->cuts_->NumFeatures() == this->info.row_stride; diff --git a/src/tree/gpu_hist/gradient_based_sampler.cu b/src/tree/gpu_hist/gradient_based_sampler.cu index 77a720fcc709..c9048e68ac77 100644 --- a/src/tree/gpu_hist/gradient_based_sampler.cu +++ b/src/tree/gpu_hist/gradient_based_sampler.cu @@ -206,7 +206,7 @@ GradientBasedSample ExternalMemoryUniformSampling::Sample(Context const* ctx, auto first_page = (*batch_iterator.begin()).Impl(); // Create a new ELLPACK page with empty rows. *page = EllpackPageImpl{ctx, first_page->CutsShared(), first_page->IsDense(), - first_page->info.row_stride, sample_rows}; // fixme: row_stride + first_page->info.row_stride, sample_rows}; // Compact the ELLPACK pages into the single sample page. thrust::fill(cuctx->CTP(), page->gidx_buffer.begin(), page->gidx_buffer.end(), 0); @@ -291,7 +291,7 @@ GradientBasedSample ExternalMemoryGradientBasedSampling::Sample(Context const* c // Create a new ELLPACK page with empty rows. *page = EllpackPageImpl{ctx, first_page->CutsShared(), dmat->IsDense(), - first_page->info.row_stride, sample_rows}; // fixme: row_stride + first_page->info.row_stride, sample_rows}; // Compact the ELLPACK pages into the single sample page. 
thrust::fill(cuctx->CTP(), page->gidx_buffer.begin(), page->gidx_buffer.end(), 0); for (auto& batch : batch_iterator) { diff --git a/src/tree/updater_gpu_hist.cu b/src/tree/updater_gpu_hist.cu index 1096518685a3..002f3e9a6ff1 100644 --- a/src/tree/updater_gpu_hist.cu +++ b/src/tree/updater_gpu_hist.cu @@ -754,7 +754,8 @@ struct GPUHistMakerDevice { }; std::pair, bool> InitBatchCuts( - Context const* ctx, DMatrix* p_fmat, BatchParam batch, std::vector* p_batch_ptr) { + Context const* ctx, DMatrix* p_fmat, BatchParam const& batch, + std::vector* p_batch_ptr) { std::vector& batch_ptr = *p_batch_ptr; batch_ptr = {0}; std::shared_ptr cuts; @@ -772,7 +773,7 @@ std::pair, bool> InitBatchCuts( CHECK(cuts); CHECK_EQ(p_fmat->NumBatches(), batch_ptr.size() - 1); std::partial_sum(batch_ptr.cbegin(), batch_ptr.cend(), batch_ptr.begin()); - return {cuts, dense_compressed}; + return {cuts, static_cast(dense_compressed)}; } class GPUHistMaker : public TreeUpdater { diff --git a/tests/cpp/data/test_ellpack_page.cu b/tests/cpp/data/test_ellpack_page.cu index db843cd0564f..0ccb79def2ec 100644 --- a/tests/cpp/data/test_ellpack_page.cu +++ b/tests/cpp/data/test_ellpack_page.cu @@ -62,6 +62,9 @@ TEST(EllpackPage, BuildGidxDense) { auto fidx = i % n_features; ASSERT_EQ(solution[i], h_accessor.gidx_iter[i] + h_accessor.feature_segments[fidx]); } + ASSERT_EQ(page->NumSymbols(), 3); + ASSERT_EQ(page->NumNonMissing(&ctx, {}), n_samples * n_features); + ASSERT_EQ(page->NumSymbols(), h_accessor.NullValue()); } TEST(EllpackPage, BuildGidxSparse) { From 5a2b0c4f7b9b3c510833595faa6beaca803683c0 Mon Sep 17 00:00:00 2001 From: Jiaming Yuan Date: Thu, 3 Oct 2024 17:53:26 +0800 Subject: [PATCH 6/6] Test. --- tests/cpp/common/test_compressed_iterator.cc | 24 ++++++++++++++++---- 1 file changed, 19 insertions(+), 5 deletions(-) diff --git a/tests/cpp/common/test_compressed_iterator.cc b/tests/cpp/common/test_compressed_iterator.cc index 93243c0b336e..41280dde7fa7 100644 --- a/tests/cpp/common/test_compressed_iterator.cc +++ b/tests/cpp/common/test_compressed_iterator.cc @@ -1,9 +1,25 @@ +/** + * Copyright 2017-2024, XGBoost contributors + */ #include "../../../src/common/compressed_iterator.h" #include "gtest/gtest.h" #include -namespace xgboost { -namespace common { +namespace xgboost::common { +TEST(CompressedIterator, Size) { + bst_idx_t n = 2048; + { + bst_idx_t n_symbols = 256; + auto n_bytes = CompressedBufferWriter::CalculateBufferSize(n, n_symbols); + ASSERT_EQ(n_bytes, 2052); + } + { + bst_idx_t n_symbols = 64; + auto n_bytes = CompressedBufferWriter::CalculateBufferSize(n, n_symbols); + ASSERT_EQ(n_bytes, 1540); + } +} + TEST(CompressedIterator, Test) { ASSERT_TRUE(detail::SymbolBits(256) == 8); ASSERT_TRUE(detail::SymbolBits(150) == 8); @@ -50,6 +66,4 @@ TEST(CompressedIterator, Test) { } } } - -} // namespace common -} // namespace xgboost +} // namespace xgboost::common
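For reference, the byte counts asserted in the new CompressedIterator.Size test follow from the bits-per-symbol arithmetic used by the writer. A minimal sketch, assuming the 4 bytes of padding implied by the expected values; the helper name is made up, the real logic lives in CompressedBufferWriter::CalculateBufferSize:

#include <cmath>
#include <cstddef>

// ceil(log2(n_symbols)) bits per stored value, rounded up to whole bytes, plus
// a small fixed padding.
std::size_t ApproxCompressedBytes(std::size_t n_values, std::size_t n_symbols) {
  auto bits = static_cast<std::size_t>(std::ceil(std::log2(static_cast<double>(n_symbols))));
  std::size_t padding = 4;  // assumed from the expected values 2052 and 1540
  return (n_values * bits + 7) / 8 + padding;
}

// ApproxCompressedBytes(2048, 256) == 2052 and ApproxCompressedBytes(2048, 64) == 1540,
// matching the assertions above: 256 symbols take 8 bits each, while 64 symbols take only 6.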