Commit

fix tests fails
Dmitry Razdoburdin committed Feb 7, 2024
1 parent 527af09 commit 54f577a
Showing 9 changed files with 73 additions and 53 deletions.
2 changes: 1 addition & 1 deletion include/xgboost/predictor.h
@@ -107,7 +107,7 @@ class Predictor {
*/
virtual void PredictBatch(DMatrix* dmat, PredictionCacheEntry* out_preds,
const gbm::GBTreeModel& model, uint32_t tree_begin,
uint32_t tree_end = 0) const = 0;
uint32_t tree_end = 0, bool training = false) const = 0;

/**
* \brief Inplace prediction.
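Note: the hunk above widens the PredictBatch interface with a defaulted `training` flag so callers in the training loop can identify themselves. The sketch below (stand-in types, illustrative only, not part of this commit) shows why the default keeps existing call sites source-compatible while letting training-aware callers opt in.

    #include <cstdint>
    #include <iostream>

    struct DMatrix {};               // stand-ins for the real xgboost types,
    struct PredictionCacheEntry {};  // for illustration only

    class PredictorLike {
     public:
      virtual ~PredictorLike() = default;
      // The extra parameter is defaulted, so callers that stop at tree_end still compile.
      virtual void PredictBatch(DMatrix* dmat, PredictionCacheEntry* out_preds,
                                std::uint32_t tree_begin, std::uint32_t tree_end = 0,
                                bool training = false) const = 0;
    };

    class ToyPredictor : public PredictorLike {
     public:
      void PredictBatch(DMatrix*, PredictionCacheEntry*, std::uint32_t,
                        std::uint32_t = 0, bool training = false) const override {
        std::cout << (training ? "training batch\n" : "inference batch\n");
      }
    };

    int main() {
      DMatrix dm;
      PredictionCacheEntry out;
      ToyPredictor p;
      p.PredictBatch(&dm, &out, 0, 1);                     // old-style call: training stays false
      p.PredictBatch(&dm, &out, 0, 1, /*training=*/true);  // the training loop opts in
    }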
28 changes: 14 additions & 14 deletions plugin/sycl/common/hist_util.h
@@ -6,6 +6,7 @@
#define PLUGIN_SYCL_COMMON_HIST_UTIL_H_

#include <vector>
#include <unordered_map>

#include "../data.h"
#include "row_set.h"
@@ -62,11 +63,11 @@ class HistCollection {

// Access histogram for i-th node
GHistRowT& operator[](bst_uint nid) {
return data_[nid];
return *(data_.at(nid));
}

const GHistRowT& operator[](bst_uint nid) const {
return data_[nid];
return *(data_.at(nid));
}

// Initialize histogram collection
@@ -78,20 +79,19 @@
}
}

// Reserve the space for hist rows
void Reserve(bst_uint max_nid) {
data_.reserve(max_nid + 1);
}

// Create an empty histogram for i-th node
::sycl::event AddHistRow(bst_uint nid) {
if (nid >= data_.size()) {
data_.resize(nid + 1);
}
::sycl::event event;
data_[nid].Resize(&qu_, nbins_,
xgboost::detail::GradientPairInternal<GradientSumT>(0, 0),
&event);
if (data_.count(nid) == 0) {
data_[nid] =
std::make_shared<GHistRowT>(&qu_, nbins_,
xgboost::detail::GradientPairInternal<GradientSumT>(0, 0),
&event);
} else {
data_[nid]->Resize(&qu_, nbins_,
xgboost::detail::GradientPairInternal<GradientSumT>(0, 0),
&event);
}
return event;
}

@@ -103,7 +103,7 @@
/*! \brief Number of all bins over all features */
uint32_t nbins_ = 0;

std::vector<GHistRowT> data_;
std::unordered_map<uint32_t, std::shared_ptr<GHistRowT>> data_;

::sycl::queue qu_;
};
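Note: a minimal sketch of the storage pattern this file switches to — histogram rows keyed by node id in an unordered_map of shared_ptr, created lazily and resized in place. HistRow below is a stand-in for GHistRowT, not the real SYCL-backed type; one likely benefit of the new layout is that adding rows for new nodes never relocates rows that are already referenced, unlike resizing the previous std::vector.

    #include <cstdint>
    #include <memory>
    #include <unordered_map>
    #include <vector>

    // Illustrative stand-in for a histogram row: a resizable buffer of bins.
    struct HistRow {
      explicit HistRow(std::size_t nbins) : bins(nbins, 0.0) {}
      void Reset(std::size_t nbins) { bins.assign(nbins, 0.0); }
      std::vector<double> bins;
    };

    class HistCollectionSketch {
     public:
      explicit HistCollectionSketch(std::uint32_t nbins) : nbins_(nbins) {}

      // Create the row for node `nid` on first use, or clear it if it already
      // exists -- the same create-or-resize branch as AddHistRow above.
      HistRow& AddHistRow(std::uint32_t nid) {
        auto it = data_.find(nid);
        if (it == data_.end()) {
          it = data_.emplace(nid, std::make_shared<HistRow>(nbins_)).first;
        } else {
          it->second->Reset(nbins_);
        }
        return *it->second;
      }

      // Rows live behind shared_ptr, so this reference remains valid even after
      // rows for other nodes are added later.
      HistRow& operator[](std::uint32_t nid) { return *data_.at(nid); }

     private:
      std::uint32_t nbins_;
      std::unordered_map<std::uint32_t, std::shared_ptr<HistRow>> data_;
    };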
33 changes: 24 additions & 9 deletions plugin/sycl/data.h
@@ -82,6 +82,12 @@ class USMVector {
qu->fill(data_.get(), v, size_).wait();
}

USMVector(::sycl::queue* qu, size_t size, T v,
::sycl::event* event) : size_(size), capacity_(size) {
data_ = allocate_memory_(qu, size_);
*event = qu->fill(data_.get(), v, size_, *event);
}

USMVector(::sycl::queue* qu, const std::vector<T> &vec) {
size_ = vec.size();
capacity_ = size_;
@@ -92,12 +98,9 @@
~USMVector() {
}

USMVector<T>& operator=(const USMVector<T>& other) {
size_ = other.size_;
capacity_ = other.capacity_;
data_ = other.data_;
return *this;
}
USMVector(const USMVector&) = delete;

USMVector<T>& operator=(const USMVector<T>& other) = delete;

T* Data() { return data_.get(); }
const T* DataConst() const { return data_.get(); }
@@ -220,14 +223,26 @@ struct DeviceMatrix {

DeviceMatrix() = default;

void Init(::sycl::queue qu, DMatrix* dmat) {
if (p_mat == dmat) {
DeviceMatrix(const DeviceMatrix& other) = delete;

DeviceMatrix& operator= (const DeviceMatrix& other) = delete;

// During training the same dmatrix is used, so we don't need to reload it on the device
bool ReinitializationRequired(DMatrix* dmat, bool training) {
if (!training) return true;
if (p_mat != dmat) return true;
return false;
}

void Init(::sycl::queue qu, DMatrix* dmat, bool training = false) {
qu_ = qu;
if (!ReinitializationRequired(dmat, training)) {
is_from_cache = true;
return;
}

is_from_cache = false;
p_mat = dmat;
qu_ = qu;

size_t num_row = 0;
size_t num_nonzero = 0;
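Note: the caching rule added to DeviceMatrix, restated as a self-contained sketch (all names below are stand-ins). Reinitialization is skipped only when the call comes from training and the same DMatrix pointer is seen again; plain prediction always reloads. Deleting the copy operations on USMVector and DeviceMatrix additionally turns accidental copies of device-side buffers into compile-time errors.

    #include <iostream>

    struct DMatrixStub {};  // stand-in for xgboost::DMatrix

    class DeviceMatrixSketch {
     public:
      DeviceMatrixSketch() = default;
      // As in this hunk: the device matrix is not copyable.
      DeviceMatrixSketch(const DeviceMatrixSketch&) = delete;
      DeviceMatrixSketch& operator=(const DeviceMatrixSketch&) = delete;

      // During training the same matrix arrives on every iteration, so the
      // device copy can be reused; any other situation forces a reload.
      bool ReinitializationRequired(DMatrixStub* dmat, bool training) const {
        if (!training) return true;
        return p_mat_ != dmat;
      }

      void Init(DMatrixStub* dmat, bool training = false) {
        if (!ReinitializationRequired(dmat, training)) {
          std::cout << "reusing cached device copy\n";
          return;
        }
        std::cout << "uploading matrix to device\n";
        p_mat_ = dmat;
      }

     private:
      DMatrixStub* p_mat_ = nullptr;
    };

    int main() {
      DMatrixStub train_data;
      DeviceMatrixSketch dm;
      dm.Init(&train_data, /*training=*/true);   // first boosting iteration: upload
      dm.Init(&train_data, /*training=*/true);   // later iterations: reuse
      dm.Init(&train_data, /*training=*/false);  // plain prediction: always reload
    }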
5 changes: 2 additions & 3 deletions plugin/sycl/predictor/predictor.cc
@@ -287,11 +287,10 @@ class Predictor : public xgboost::Predictor {

void PredictBatch(DMatrix *dmat, PredictionCacheEntry *predts,
const gbm::GBTreeModel &model, uint32_t tree_begin,
uint32_t tree_end = 0) const override {
uint32_t tree_end = 0, bool training = false) const override {
::sycl::queue qu = device_manager.GetQueue(ctx_->Device());

predictor_monitor_.Start("InitDeviceMatrix");
device_matrix.Init(qu, dmat);
device_matrix.Init(qu, dmat, training);
predictor_monitor_.Stop("InitDeviceMatrix");

auto* out_preds = &predts->predictions;
14 changes: 12 additions & 2 deletions plugin/sycl/tree/split_evaluator.h
@@ -45,21 +45,31 @@ class TreeEvaluator {
bool has_constraint_;

public:
TreeEvaluator(::sycl::queue qu, xgboost::tree::TrainParam const& p, bst_feature_t n_features) {
void Reset(::sycl::queue qu, xgboost::tree::TrainParam const& p, bst_feature_t n_features) {
qu_ = qu;
if (p.monotone_constraints.empty()) {
monotone_.Resize(&qu_, n_features, 0);
has_constraint_ = false;
} else {
monotone_ = USMVector<int32_t>(&qu_, p.monotone_constraints);
// monotone_ = USMVector<int32_t>(&qu_, p.monotone_constraints);
// monotone_.Resize(&qu_, n_features, 0);

monotone_.Resize(&qu_, n_features, 0);
qu_.memcpy(monotone_.Data(), p.monotone_constraints.data(),
sizeof(int32_t) * p.monotone_constraints.size());
qu_.wait();

lower_bounds_.Resize(&qu_, p.MaxNodes(), -std::numeric_limits<GradType>::max());
upper_bounds_.Resize(&qu_, p.MaxNodes(), std::numeric_limits<GradType>::max());
has_constraint_ = true;
}
param_ = TrainParam(p);
}

TreeEvaluator(::sycl::queue qu, xgboost::tree::TrainParam const& p, bst_feature_t n_features) {
Reset(qu, p, n_features);
}

struct SplitEvaluator {
int* constraints;
GradType* lower;
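Note: the constructor body moves into Reset because the evaluator's members (USMVector) are no longer copyable, so rebuilding the evaluator by assignment, as updater_quantile_hist.cc used to do with `tree_evaluator_ = TreeEvaluator<GradientSumT>(...)`, stops compiling; the updater now calls `tree_evaluator_.Reset(...)` instead. A minimal sketch of the pattern, with stand-in types:

    #include <cstddef>
    #include <vector>

    // Stand-in for USMVector: owns a device buffer, so copying is disallowed.
    struct DeviceBuffer {
      DeviceBuffer() = default;
      DeviceBuffer(const DeviceBuffer&) = delete;
      DeviceBuffer& operator=(const DeviceBuffer&) = delete;
      void Resize(std::size_t n, int v) { host_shadow.assign(n, v); }
      std::vector<int> host_shadow;  // placeholder for real device memory
    };

    class EvaluatorSketch {
     public:
      // Re-initialize the existing members in place; no copy or move required.
      void Reset(std::size_t n_features) {
        monotone_.Resize(n_features, 0);
        has_constraint_ = false;
      }

      // The constructor simply delegates to Reset, as TreeEvaluator does here.
      explicit EvaluatorSketch(std::size_t n_features) { Reset(n_features); }

     private:
      DeviceBuffer monotone_;
      bool has_constraint_ = false;
    };

    int main() {
      EvaluatorSketch ev(16);
      // EvaluatorSketch copy = ev;  // would not compile: the member's copy is deleted
      ev.Reset(32);                  // instead, the same object is reset on each Update()
    }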
36 changes: 16 additions & 20 deletions plugin/sycl/tree/updater_quantile_hist.cc
@@ -146,12 +146,12 @@ void BatchHistSynchronizer<GradientSumT>::SyncHistograms(BuilderT *builder,
hist_sync_events_.resize(builder->nodes_for_explicit_hist_build_.size());
for (int i = 0; i < builder->nodes_for_explicit_hist_build_.size(); i++) {
const auto entry = builder->nodes_for_explicit_hist_build_[i];
auto this_hist = builder->hist_[entry.nid];
auto& this_hist = builder->hist_[entry.nid];

if (!(*p_tree)[entry.nid].IsRoot() && entry.sibling_nid > -1) {
const size_t parent_id = (*p_tree)[entry.nid].Parent();
auto parent_hist = builder->hist_[parent_id];
auto sibling_hist = builder->hist_[entry.sibling_nid];
auto& parent_hist = builder->hist_[parent_id];
auto& sibling_hist = builder->hist_[entry.sibling_nid];
hist_sync_events_[i] = common::SubtractionHist(builder->qu_, &sibling_hist, parent_hist,
this_hist, nbins, ::sycl::event());
}
Expand All @@ -169,19 +169,19 @@ void DistributedHistSynchronizer<GradientSumT>::SyncHistograms(BuilderT* builder
const size_t nbins = builder->hist_builder_.GetNumBins();
for (int node = 0; node < builder->nodes_for_explicit_hist_build_.size(); node++) {
const auto entry = builder->nodes_for_explicit_hist_build_[node];
auto this_hist = builder->hist_[entry.nid];
auto& this_hist = builder->hist_[entry.nid];
// Store possible parent node
auto this_local = builder->hist_local_worker_[entry.nid];
auto& this_local = builder->hist_local_worker_[entry.nid];
common::CopyHist(builder->qu_, &this_local, this_hist, nbins);

if (!(*p_tree)[entry.nid].IsRoot() && entry.sibling_nid > -1) {
const size_t parent_id = (*p_tree)[entry.nid].Parent();
auto parent_hist = builder->hist_local_worker_[parent_id];
auto sibling_hist = builder->hist_[entry.sibling_nid];
auto& parent_hist = builder->hist_local_worker_[parent_id];
auto& sibling_hist = builder->hist_[entry.sibling_nid];
common::SubtractionHist(builder->qu_, &sibling_hist, parent_hist,
this_hist, nbins, ::sycl::event());
// Store possible parent node
auto sibling_local = builder->hist_local_worker_[entry.sibling_nid];
auto& sibling_local = builder->hist_local_worker_[entry.sibling_nid];
common::CopyHist(builder->qu_, &sibling_local, sibling_hist, nbins);
}
}
@@ -202,11 +202,11 @@ void DistributedHistSynchronizer<GradientSumT>::ParallelSubtractionHist(
for (int node = 0; node < nodes.size(); node++) {
const auto entry = nodes[node];
if (!((*p_tree)[entry.nid].IsLeftChild())) {
auto this_hist = builder->hist_[entry.nid];
auto& this_hist = builder->hist_[entry.nid];

if (!(*p_tree)[entry.nid].IsRoot() && entry.sibling_nid > -1) {
auto parent_hist = builder->hist_[(*p_tree)[entry.nid].Parent()];
auto sibling_hist = builder->hist_[entry.sibling_nid];
auto& parent_hist = builder->hist_[(*p_tree)[entry.nid].Parent()];
auto& sibling_hist = builder->hist_[entry.sibling_nid];
common::SubtractionHist(builder->qu_, &this_hist, parent_hist,
sibling_hist, nbins, ::sycl::event());
}
@@ -219,7 +219,7 @@ void QuantileHistMaker::Builder<GradientSumT>::ReduceHists(const std::vector<int
size_t nbins) {
std::vector<GradientPairT> reduce_buffer(sync_ids.size() * nbins);
for (size_t i = 0; i < sync_ids.size(); i++) {
auto this_hist = hist_[sync_ids[i]];
auto& this_hist = hist_[sync_ids[i]];
const GradientPairT* psrc = reinterpret_cast<const GradientPairT*>(this_hist.DataConst());
std::copy(psrc, psrc + nbins, reduce_buffer.begin() + i * nbins);
}
@@ -228,7 +228,7 @@ void QuantileHistMaker::Builder<GradientSumT>::ReduceHists(const std::vector<int
2 * nbins * sync_ids.size());
// histred_.Allreduce(reduce_buffer.data(), nbins * sync_ids.size());
for (size_t i = 0; i < sync_ids.size(); i++) {
auto this_hist = hist_[sync_ids[i]];
auto& this_hist = hist_[sync_ids[i]];
GradientPairT* psrc = reinterpret_cast<GradientPairT*>(this_hist.Data());
std::copy(reduce_buffer.begin() + i * nbins, reduce_buffer.begin() + (i + 1) * nbins, psrc);
}
@@ -249,7 +249,6 @@ void BatchHistRowsAdder<GradientSumT>::AddHistRows(BuilderT *builder,
max_nid = node.nid > max_nid ? node.nid : max_nid;
}

builder->hist_.Reserve(max_nid);
for (auto const& entry : builder->nodes_for_explicit_hist_build_) {
int nid = entry.nid;
auto event = builder->hist_.AddHistRow(nid);
@@ -611,7 +610,7 @@ void QuantileHistMaker::Builder<GradientSumT>::Update(
builder_monitor_.Start("Update");

const std::vector<GradientPair>& gpair_h = gpair->ConstHostVector();
tree_evaluator_ = TreeEvaluator<GradientSumT>(qu_, param_, p_fmat->Info().num_col_);
tree_evaluator_.Reset(qu_, param_, p_fmat->Info().num_col_);
interaction_constraints_.Reset();

this->InitData(ctx, gmat, gpair_h, gpair_device, *p_fmat, *p_tree);
@@ -847,10 +846,7 @@ void QuantileHistMaker::Builder<GradientSumT>::InitData(
}

// initialize histogram builder
#pragma omp parallel
{
this->nthread_ = omp_get_num_threads();
}
this->nthread_ = omp_get_num_threads();
hist_builder_ = GHistBuilder<GradientSumT>(qu_, nbins);

USMVector<size_t, MemoryType::on_device>* row_indices = &(row_set_collection_.Data());
@@ -1284,7 +1280,7 @@ void QuantileHistMaker::Builder<GradientSumT>::InitNewNode(int nid,
}

{
auto hist = hist_[nid];
auto& hist = hist_[nid];
GradientPairT grad_stat;
if (tree[nid].IsRoot()) {
if (data_layout_ == kDenseDataZeroBased || data_layout_ == kDenseDataOneBased) {
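Note: the repeated `auto` to `auto&` edits in this file follow from the histogram rows becoming non-copyable: `hist_[nid]` returns a reference, and binding it to a plain `auto` makes a copy of the row, which no longer compiles now that USMVector's copy operations are deleted. A tiny sketch of the distinction with a stand-in row type:

    #include <memory>
    #include <unordered_map>

    struct HistRowStub {
      HistRowStub() = default;
      HistRowStub(const HistRowStub&) = delete;             // like USMVector after this commit
      HistRowStub& operator=(const HistRowStub&) = delete;
      int* device_ptr = nullptr;
    };

    int main() {
      std::unordered_map<unsigned, std::shared_ptr<HistRowStub>> hist;
      hist[0] = std::make_shared<HistRowStub>();

      // auto row = *hist.at(0);   // would not compile: copies a non-copyable row
      auto& row   = *hist.at(0);   // binds a reference to the stored row; no copy
      row.device_ptr = nullptr;    // mutates the row owned by the collection
    }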
4 changes: 2 additions & 2 deletions src/gbm/gbtree.cc
@@ -514,7 +514,7 @@ void GBTree::PredictBatchImpl(DMatrix* p_fmat, PredictionCacheEntry* out_preds,
auto [tree_begin, tree_end] = detail::LayerToTree(model_, layer_begin, layer_end);
CHECK_LE(tree_end, model_.trees.size()) << "Invalid number of trees.";
if (tree_end > tree_begin) {
predictor->PredictBatch(p_fmat, out_preds, model_, tree_begin, tree_end);
predictor->PredictBatch(p_fmat, out_preds, model_, tree_begin, tree_end, is_training);
}
if (reset) {
out_preds->version = 0;
@@ -763,7 +763,7 @@ class Dart : public GBTree {
auto version = i / layer_trees();
p_out_preds->version = version;
predts.predictions.Fill(0);
predictor->PredictBatch(p_fmat, &predts, model_, i, i + 1);
predictor->PredictBatch(p_fmat, &predts, model_, i, i + 1, training);

// Multiply the weight into the output prediction.
auto w = this->weight_drop_.at(i);
2 changes: 1 addition & 1 deletion src/predictor/cpu_predictor.cc
@@ -699,7 +699,7 @@ class CPUPredictor : public Predictor {
explicit CPUPredictor(Context const *ctx) : Predictor::Predictor{ctx} {}

void PredictBatch(DMatrix *dmat, PredictionCacheEntry *predts, const gbm::GBTreeModel &model,
uint32_t tree_begin, uint32_t tree_end = 0) const override {
uint32_t tree_begin, uint32_t tree_end = 0, bool training = false) const override {
auto *out_preds = &predts->predictions;
// This is actually already handled in gbm, but large amount of tests rely on the
// behaviour.
2 changes: 1 addition & 1 deletion src/predictor/gpu_predictor.cu
@@ -928,7 +928,7 @@ class GPUPredictor : public xgboost::Predictor {

void PredictBatch(DMatrix* dmat, PredictionCacheEntry* predts,
const gbm::GBTreeModel& model, uint32_t tree_begin,
uint32_t tree_end = 0) const override {
uint32_t tree_end = 0, bool training = false) const override {
int device = ctx_->gpu_id;
CHECK_GE(device, 0) << "Set `gpu_id' to positive value for processing GPU data.";
auto* out_preds = &predts->predictions;

