Skip to content

Commit

Permalink
optimize UpdatePredictionCache
Browse files Browse the repository at this point in the history
  • Loading branch information
Dmitry Razdoburdin committed Apr 18, 2024
1 parent 300a25d commit a54282f
Show file tree
Hide file tree
Showing 6 changed files with 156 additions and 97 deletions.
13 changes: 13 additions & 0 deletions plugin/sycl/data.h
Original file line number Diff line number Diff line change
Expand Up @@ -138,6 +138,19 @@ class USMVector {
}
}

/* Resize the vector without preserving its contents.
 * On grow: a fresh USM allocation replaces the old one; the previous
 * elements are NOT copied over. On shrink (or equal size): only the
 * logical size changes, capacity and storage are untouched. */
void ResizeNoCopy(::sycl::queue* qu, size_t size_new) {
  if (size_new <= capacity_) {
    size_ = size_new;
  } else {
    // Hold a reference to the old allocation until the end of this scope;
    // presumably data_ is a smart pointer whose deleter releases the USM
    // memory, so this delays the free past the reassignment — TODO confirm
    // whether in-flight kernels on qu may still read the old buffer here.
    auto data_old = data_;
    size_ = size_new;
    capacity_ = size_new;
    data_ = allocate_memory_(qu, size_);
  }
}

void Resize(::sycl::queue* qu, size_t size_new, T v) {
if (size_new <= size_) {
size_ = size_new;
Expand Down
33 changes: 24 additions & 9 deletions plugin/sycl/tree/hist_updater.cc
Original file line number Diff line number Diff line change
Expand Up @@ -380,16 +380,27 @@ bool HistUpdater<GradientSumT>::UpdatePredictionCache(
CHECK_GT(out_preds.Size(), 0U);

const size_t stride = out_preds.Stride(0);
const int buffer_size = out_preds.Size()*stride - stride + 1;
const bool is_first_group = (out_pred_ptr == nullptr);
const size_t gid = out_pred_ptr == nullptr ? 0 : &out_preds(0) - out_pred_ptr;
const bool is_last_group = (gid + 1 == stride);

const int buffer_size = out_preds.Size() *stride;
if (buffer_size == 0) return true;
::sycl::buffer<float, 1> out_preds_buf(&out_preds(0), buffer_size);

::sycl::event event;
if (is_first_group) {
out_preds_buf_.ResizeNoCopy(&qu_, buffer_size);
out_pred_ptr = &out_preds(0);
event = qu_.memcpy(out_preds_buf_.Data(), out_pred_ptr, buffer_size * sizeof(bst_float), event);
}
auto* out_preds_buf_ptr = out_preds_buf_.Data();

size_t n_nodes = row_set_collection_.Size();
std::vector<::sycl::event> events(n_nodes);
for (size_t node = 0; node < n_nodes; node++) {
const common::RowSetCollection::Elem& rowset = row_set_collection_[node];
if (rowset.begin != nullptr && rowset.end != nullptr && rowset.Size() != 0) {
int nid = rowset.node_id;
bst_float leaf_value;
// if a node is marked as deleted by the pruner, traverse upward to locate
// a non-deleted leaf.
if ((*p_last_tree_)[nid].IsDeleted()) {
Expand All @@ -398,19 +409,23 @@ bool HistUpdater<GradientSumT>::UpdatePredictionCache(
}
CHECK((*p_last_tree_)[nid].IsLeaf());
}
leaf_value = (*p_last_tree_)[nid].LeafValue();

bst_float leaf_value = (*p_last_tree_)[nid].LeafValue();
const size_t* rid = rowset.begin;
const size_t num_rows = rowset.Size();

qu_.submit([&](::sycl::handler& cgh) {
auto out_predictions = out_preds_buf.get_access<::sycl::access::mode::read_write>(cgh);
events[node] = qu_.submit([&](::sycl::handler& cgh) {
cgh.depends_on(event);
cgh.parallel_for<>(::sycl::range<1>(num_rows), [=](::sycl::item<1> pid) {
out_predictions[rid[pid.get_id(0)]*stride] += leaf_value;
out_preds_buf_ptr[rid[pid.get_id(0)]*stride + gid] += leaf_value;
});
}).wait();
});
}
}
if (is_last_group) {
qu_.memcpy(out_pred_ptr, out_preds_buf_ptr, buffer_size * sizeof(bst_float), events);
out_pred_ptr = nullptr;
}
qu_.wait();

builder_monitor_.Stop("UpdatePredictionCache");
return true;
Expand Down
3 changes: 3 additions & 0 deletions plugin/sycl/tree/hist_updater.h
Original file line number Diff line number Diff line change
Expand Up @@ -295,6 +295,9 @@ class HistUpdater {
std::unique_ptr<HistSynchronizer<GradientSumT>> hist_synchronizer_;
std::unique_ptr<HistRowsAdder<GradientSumT>> hist_rows_adder_;

USMVector<bst_float, MemoryType::on_device> out_preds_buf_;
bst_float* out_pred_ptr = nullptr;

::sycl::queue qu_;
};

Expand Down
23 changes: 23 additions & 0 deletions tests/cpp/plugin/test_sycl_prediction_cache.cc
Original file line number Diff line number Diff line change
@@ -0,0 +1,23 @@
/**
* Copyright 2020-2024 by XGBoost contributors
*/
#include <gtest/gtest.h>

#pragma GCC diagnostic push
#pragma GCC diagnostic ignored "-Wtautological-constant-compare"
#pragma GCC diagnostic ignored "-W#pragma-messages"
#include "../tree/test_prediction_cache.h"
#pragma GCC diagnostic pop

namespace xgboost::sycl::tree {

// Reuses the generic prediction-cache fixture (TestPredictionCache) so the
// SYCL tree updater runs the same cache-consistency checks as the CPU/GPU ones.
class SyclPredictionCache : public xgboost::TestPredictionCache {};

// Checks that after training with the SYCL quantile-hist updater, cached
// predictions match predictions from a freshly loaded copy of the model
// (see TestPredictionCache::RunTest for the exact protocol).
TEST_F(SyclPredictionCache, Hist) {
  Context ctx;
  ctx.UpdateAllowUnknown(Args{{"device", "sycl"}});

  this->RunTest(&ctx, "grow_quantile_histmaker_sycl", "one_output_per_tree");
}

}  // namespace xgboost::sycl::tree
89 changes: 1 addition & 88 deletions tests/cpp/tree/test_prediction_cache.cc
Original file line number Diff line number Diff line change
Expand Up @@ -2,97 +2,10 @@
* Copyright 2021-2023 by XGBoost contributors
*/
#include <gtest/gtest.h>
#include <xgboost/host_device_vector.h>
#include <xgboost/tree_updater.h>

#include <memory>

#include "../../../src/tree/param.h" // for TrainParam
#include "../helpers.h"
#include "xgboost/task.h" // for ObjInfo
#include "test_prediction_cache.h"

namespace xgboost {

class TestPredictionCache : public ::testing::Test {
std::shared_ptr<DMatrix> Xy_;
std::size_t n_samples_{2048};

protected:
void SetUp() override {
std::size_t n_features = 13;
bst_target_t n_targets = 3;
Xy_ = RandomDataGenerator{n_samples_, n_features, 0}.Targets(n_targets).GenerateDMatrix(true);
}

void RunLearnerTest(Context const* ctx, std::string updater_name, float subsample,
std::string const& grow_policy, std::string const& strategy) {
std::unique_ptr<Learner> learner{Learner::Create({Xy_})};
learner->SetParam("device", ctx->DeviceName());
learner->SetParam("updater", updater_name);
learner->SetParam("multi_strategy", strategy);
learner->SetParam("grow_policy", grow_policy);
learner->SetParam("subsample", std::to_string(subsample));
learner->SetParam("nthread", "0");
learner->Configure();

for (size_t i = 0; i < 8; ++i) {
learner->UpdateOneIter(i, Xy_);
}

HostDeviceVector<float> out_prediction_cached;
learner->Predict(Xy_, false, &out_prediction_cached, 0, 0);

Json model{Object()};
learner->SaveModel(&model);

HostDeviceVector<float> out_prediction;
{
std::unique_ptr<Learner> learner{Learner::Create({Xy_})};
learner->LoadModel(model);
learner->Predict(Xy_, false, &out_prediction, 0, 0);
}

auto const h_predt_cached = out_prediction_cached.ConstHostSpan();
auto const h_predt = out_prediction.ConstHostSpan();

ASSERT_EQ(h_predt.size(), h_predt_cached.size());
for (size_t i = 0; i < h_predt.size(); ++i) {
ASSERT_NEAR(h_predt[i], h_predt_cached[i], kRtEps);
}
}

void RunTest(Context* ctx, std::string const& updater_name, std::string const& strategy) {
{
ctx->InitAllowUnknown(Args{{"nthread", "8"}});

ObjInfo task{ObjInfo::kRegression};
std::unique_ptr<TreeUpdater> updater{TreeUpdater::Create(updater_name, ctx, &task)};
RegTree tree;
std::vector<RegTree*> trees{&tree};
auto gpair = GenerateRandomGradients(n_samples_);
tree::TrainParam param;
param.UpdateAllowUnknown(Args{{"max_bin", "64"}});

updater->Configure(Args{});
std::vector<HostDeviceVector<bst_node_t>> position(1);
updater->Update(&param, &gpair, Xy_.get(), position, trees);
HostDeviceVector<float> out_prediction_cached;
out_prediction_cached.SetDevice(ctx->Device());
out_prediction_cached.Resize(n_samples_);
auto cache =
linalg::MakeTensorView(ctx, &out_prediction_cached, out_prediction_cached.Size(), 1);
ASSERT_TRUE(updater->UpdatePredictionCache(Xy_.get(), cache));
}

for (auto policy : {"depthwise", "lossguide"}) {
for (auto subsample : {1.0f, 0.4f}) {
this->RunLearnerTest(ctx, updater_name, subsample, policy, strategy);
this->RunLearnerTest(ctx, updater_name, subsample, policy, strategy);
}
}
}
};

TEST_F(TestPredictionCache, Approx) {
Context ctx;
this->RunTest(&ctx, "grow_histmaker", "one_output_per_tree");
Expand Down
92 changes: 92 additions & 0 deletions tests/cpp/tree/test_prediction_cache.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,92 @@
/**
 * Copyright 2021-2024 by XGBoost contributors
 */
#pragma once

#include <gtest/gtest.h>

#include <xgboost/host_device_vector.h>
#include <xgboost/tree_updater.h>

#include <memory>
#include <string>  // for string, to_string

#include "../../../src/tree/param.h"  // for TrainParam
#include "../helpers.h"
#include "xgboost/task.h"  // for ObjInfo

namespace xgboost {
/**
 * Shared fixture for prediction-cache tests. Verifies that predictions served
 * from a trained learner's internal cache agree with predictions produced by a
 * fresh learner loading the same model, across updaters/devices.
 */
class TestPredictionCache : public ::testing::Test {
  std::shared_ptr<DMatrix> Xy_;
  std::size_t n_samples_{2048};

 protected:
  void SetUp() override {
    std::size_t n_features = 13;
    bst_target_t n_targets = 3;
    Xy_ = RandomDataGenerator{n_samples_, n_features, 0}.Targets(n_targets).GenerateDMatrix(true);
  }

  /**
   * Train a learner, then compare its cached predictions against predictions
   * from a second learner that loads the serialized model.
   *
   * \param ctx          Context providing the target device.
   * \param updater_name Tree updater to exercise (e.g. "grow_histmaker").
   * \param subsample    Row subsampling ratio for training.
   * \param grow_policy  "depthwise" or "lossguide".
   * \param strategy     Multi-target strategy (e.g. "one_output_per_tree").
   */
  void RunLearnerTest(Context const* ctx, std::string const& updater_name, float subsample,
                      std::string const& grow_policy, std::string const& strategy) {
    std::unique_ptr<Learner> learner{Learner::Create({Xy_})};
    learner->SetParam("device", ctx->DeviceName());
    learner->SetParam("updater", updater_name);
    learner->SetParam("multi_strategy", strategy);
    learner->SetParam("grow_policy", grow_policy);
    learner->SetParam("subsample", std::to_string(subsample));
    learner->SetParam("nthread", "0");
    learner->Configure();

    for (size_t i = 0; i < 8; ++i) {
      learner->UpdateOneIter(i, Xy_);
    }

    // Predictions straight from the trained learner's internal cache.
    HostDeviceVector<float> out_prediction_cached;
    learner->Predict(Xy_, false, &out_prediction_cached, 0, 0);

    Json model{Object()};
    learner->SaveModel(&model);

    // Predictions from a fresh learner with no cache, loading the same model.
    HostDeviceVector<float> out_prediction;
    {
      std::unique_ptr<Learner> learner{Learner::Create({Xy_})};
      learner->LoadModel(model);
      learner->Predict(Xy_, false, &out_prediction, 0, 0);
    }

    auto const h_predt_cached = out_prediction_cached.ConstHostSpan();
    auto const h_predt = out_prediction.ConstHostSpan();

    ASSERT_EQ(h_predt.size(), h_predt_cached.size());
    for (size_t i = 0; i < h_predt.size(); ++i) {
      ASSERT_NEAR(h_predt[i], h_predt_cached[i], kRtEps);
    }
  }

  /**
   * Entry point for per-updater tests: first drives the updater directly and
   * asserts UpdatePredictionCache succeeds, then runs the learner-level
   * cached-vs-reloaded comparison across grow policies and subsample ratios.
   */
  void RunTest(Context* ctx, std::string const& updater_name, std::string const& strategy) {
    {
      ctx->InitAllowUnknown(Args{{"nthread", "8"}});

      ObjInfo task{ObjInfo::kRegression};
      std::unique_ptr<TreeUpdater> updater{TreeUpdater::Create(updater_name, ctx, &task)};
      RegTree tree;
      std::vector<RegTree*> trees{&tree};
      auto gpair = GenerateRandomGradients(n_samples_);
      tree::TrainParam param;
      param.UpdateAllowUnknown(Args{{"max_bin", "64"}});

      updater->Configure(Args{});
      std::vector<HostDeviceVector<bst_node_t>> position(1);
      updater->Update(&param, &gpair, Xy_.get(), position, trees);
      HostDeviceVector<float> out_prediction_cached;
      out_prediction_cached.SetDevice(ctx->Device());
      out_prediction_cached.Resize(n_samples_);
      auto cache =
          linalg::MakeTensorView(ctx, &out_prediction_cached, out_prediction_cached.Size(), 1);
      ASSERT_TRUE(updater->UpdatePredictionCache(Xy_.get(), cache));
    }

    for (auto policy : {"depthwise", "lossguide"}) {
      for (auto subsample : {1.0f, 0.4f}) {
        // NOTE(review): the learner test is deliberately invoked twice per
        // configuration in the original — presumably to check the run is
        // reproducible; confirm this duplication is intentional.
        this->RunLearnerTest(ctx, updater_name, subsample, policy, strategy);
        this->RunLearnerTest(ctx, updater_name, subsample, policy, strategy);
      }
    }
  }
};
}  // namespace xgboost

0 comments on commit a54282f

Please sign in to comment.