Skip to content

Commit

Permalink
optimize UpdatePredictionCache
Browse files Browse the repository at this point in the history
  • Loading branch information
Dmitry Razdoburdin committed Apr 18, 2024
1 parent 300a25d commit a54282f
Show file tree
Hide file tree
Showing 6 changed files with 156 additions and 97 deletions.
13 changes: 13 additions & 0 deletions plugin/sycl/data.h
Original file line number Diff line number Diff line change
Expand Up @@ -138,6 +138,19 @@ class USMVector {
}
}

/* Resize the vector without preserving its contents.
 * On grow: a fresh USM allocation replaces the old one; the previous
 * elements are NOT copied over. On shrink (or equal size): only the
 * logical size changes, capacity and storage are untouched. */
void ResizeNoCopy(::sycl::queue* qu, size_t size_new) {
  if (size_new <= capacity_) {
    size_ = size_new;
  } else {
    // Hold a reference to the old allocation until the end of this scope;
    // presumably data_ is a smart pointer whose deleter releases the USM
    // memory, so this delays the free past the reassignment — TODO confirm
    // whether in-flight kernels on qu may still read the old buffer here.
    auto data_old = data_;
    size_ = size_new;
    capacity_ = size_new;
    data_ = allocate_memory_(qu, size_);
  }
}

void Resize(::sycl::queue* qu, size_t size_new, T v) {
if (size_new <= size_) {
size_ = size_new;
Expand Down
33 changes: 24 additions & 9 deletions plugin/sycl/tree/hist_updater.cc
Original file line number Diff line number Diff line change
Expand Up @@ -380,16 +380,27 @@ bool HistUpdater<GradientSumT>::UpdatePredictionCache(
CHECK_GT(out_preds.Size(), 0U);

const size_t stride = out_preds.Stride(0);
const int buffer_size = out_preds.Size()*stride - stride + 1;
const bool is_first_group = (out_pred_ptr == nullptr);
const size_t gid = out_pred_ptr == nullptr ? 0 : &out_preds(0) - out_pred_ptr;
const bool is_last_group = (gid + 1 == stride);

const int buffer_size = out_preds.Size() *stride;
if (buffer_size == 0) return true;
::sycl::buffer<float, 1> out_preds_buf(&out_preds(0), buffer_size);

::sycl::event event;
if (is_first_group) {
out_preds_buf_.ResizeNoCopy(&qu_, buffer_size);
out_pred_ptr = &out_preds(0);
event = qu_.memcpy(out_preds_buf_.Data(), out_pred_ptr, buffer_size * sizeof(bst_float), event);
}
auto* out_preds_buf_ptr = out_preds_buf_.Data();

size_t n_nodes = row_set_collection_.Size();
std::vector<::sycl::event> events(n_nodes);
for (size_t node = 0; node < n_nodes; node++) {
const common::RowSetCollection::Elem& rowset = row_set_collection_[node];
if (rowset.begin != nullptr && rowset.end != nullptr && rowset.Size() != 0) {
int nid = rowset.node_id;
bst_float leaf_value;
// if a node is marked as deleted by the pruner, traverse upward to locate
// a non-deleted leaf.
if ((*p_last_tree_)[nid].IsDeleted()) {
Expand All @@ -398,19 +409,23 @@ bool HistUpdater<GradientSumT>::UpdatePredictionCache(
}
CHECK((*p_last_tree_)[nid].IsLeaf());
}
leaf_value = (*p_last_tree_)[nid].LeafValue();

bst_float leaf_value = (*p_last_tree_)[nid].LeafValue();
const size_t* rid = rowset.begin;
const size_t num_rows = rowset.Size();

qu_.submit([&](::sycl::handler& cgh) {
auto out_predictions = out_preds_buf.get_access<::sycl::access::mode::read_write>(cgh);
events[node] = qu_.submit([&](::sycl::handler& cgh) {
cgh.depends_on(event);
cgh.parallel_for<>(::sycl::range<1>(num_rows), [=](::sycl::item<1> pid) {
out_predictions[rid[pid.get_id(0)]*stride] += leaf_value;
out_preds_buf_ptr[rid[pid.get_id(0)]*stride + gid] += leaf_value;
});
}).wait();
});
}
}
if (is_last_group) {
qu_.memcpy(out_pred_ptr, out_preds_buf_ptr, buffer_size * sizeof(bst_float), events);
out_pred_ptr = nullptr;
}
qu_.wait();

builder_monitor_.Stop("UpdatePredictionCache");
return true;
Expand Down
3 changes: 3 additions & 0 deletions plugin/sycl/tree/hist_updater.h
Original file line number Diff line number Diff line change
Expand Up @@ -295,6 +295,9 @@ class HistUpdater {
std::unique_ptr<HistSynchronizer<GradientSumT>> hist_synchronizer_;
std::unique_ptr<HistRowsAdder<GradientSumT>> hist_rows_adder_;

USMVector<bst_float, MemoryType::on_device> out_preds_buf_;
bst_float* out_pred_ptr = nullptr;

::sycl::queue qu_;
};

Expand Down
23 changes: 23 additions & 0 deletions tests/cpp/plugin/test_sycl_prediction_cache.cc
Original file line number Diff line number Diff line change
@@ -0,0 +1,23 @@
/**
* Copyright 2020-2024 by XGBoost contributors
*/
#include <gtest/gtest.h>

#pragma GCC diagnostic push
#pragma GCC diagnostic ignored "-Wtautological-constant-compare"
#pragma GCC diagnostic ignored "-W#pragma-messages"
#include "../tree/test_prediction_cache.h"
#pragma GCC diagnostic pop

namespace xgboost::sycl::tree {

// Reuses the generic prediction-cache fixture (TestPredictionCache) so the
// SYCL tree updater runs the same cache-consistency checks as the CPU/GPU ones.
class SyclPredictionCache : public xgboost::TestPredictionCache {};

// Checks that after training with the SYCL quantile-hist updater, cached
// predictions match predictions from a freshly loaded copy of the model
// (see TestPredictionCache::RunTest for the exact protocol).
TEST_F(SyclPredictionCache, Hist) {
  Context ctx;
  ctx.UpdateAllowUnknown(Args{{"device", "sycl"}});

  this->RunTest(&ctx, "grow_quantile_histmaker_sycl", "one_output_per_tree");
}

}  // namespace xgboost::sycl::tree
89 changes: 1 addition & 88 deletions tests/cpp/tree/test_prediction_cache.cc
Original file line number Diff line number Diff line change
Expand Up @@ -2,97 +2,10 @@
* Copyright 2021-2023 by XGBoost contributors
*/
#include <gtest/gtest.h>
#include <xgboost/host_device_vector.h>
#include <xgboost/tree_updater.h>

#include <memory>

#include "../../../src/tree/param.h" // for TrainParam
#include "../helpers.h"
#include "xgboost/task.h" // for ObjInfo
#include "test_prediction_cache.h"

namespace xgboost {

class TestPredictionCache : public ::testing::Test {
std::shared_ptr<DMatrix> Xy_;
std::size_t n_samples_{2048};

protected:
void SetUp() override {
std::size_t n_features = 13;
bst_target_t n_targets = 3;
Xy_ = RandomDataGenerator{n_samples_, n_features, 0}.Targets(n_targets).GenerateDMatrix(true);
}

void RunLearnerTest(Context const* ctx, std::string updater_name, float subsample,
std::string const& grow_policy, std::string const& strategy) {
std::unique_ptr<Learner> learner{Learner::Create({Xy_})};
learner->SetParam("device", ctx->DeviceName());
learner->SetParam("updater", updater_name);
learner->SetParam("multi_strategy", strategy);
learner->SetParam("grow_policy", grow_policy);
learner->SetParam("subsample", std::to_string(subsample));
learner->SetParam("nthread", "0");
learner->Configure();

for (size_t i = 0; i < 8; ++i) {
learner->UpdateOneIter(i, Xy_);
}

HostDeviceVector<float> out_prediction_cached;
learner->Predict(Xy_, false, &out_prediction_cached, 0, 0);

Json model{Object()};
learner->SaveModel(&model);

HostDeviceVector<float> out_prediction;
{
std::unique_ptr<Learner> learner{Learner::Create({Xy_})};
learner->LoadModel(model);
learner->Predict(Xy_, false, &out_prediction, 0, 0);
}

auto const h_predt_cached = out_prediction_cached.ConstHostSpan();
auto const h_predt = out_prediction.ConstHostSpan();

ASSERT_EQ(h_predt.size(), h_predt_cached.size());
for (size_t i = 0; i < h_predt.size(); ++i) {
ASSERT_NEAR(h_predt[i], h_predt_cached[i], kRtEps);
}
}

void RunTest(Context* ctx, std::string const& updater_name, std::string const& strategy) {
{
ctx->InitAllowUnknown(Args{{"nthread", "8"}});

ObjInfo task{ObjInfo::kRegression};
std::unique_ptr<TreeUpdater> updater{TreeUpdater::Create(updater_name, ctx, &task)};
RegTree tree;
std::vector<RegTree*> trees{&tree};
auto gpair = GenerateRandomGradients(n_samples_);
tree::TrainParam param;
param.UpdateAllowUnknown(Args{{"max_bin", "64"}});

updater->Configure(Args{});
std::vector<HostDeviceVector<bst_node_t>> position(1);
updater->Update(&param, &gpair, Xy_.get(), position, trees);
HostDeviceVector<float> out_prediction_cached;
out_prediction_cached.SetDevice(ctx->Device());
out_prediction_cached.Resize(n_samples_);
auto cache =
linalg::MakeTensorView(ctx, &out_prediction_cached, out_prediction_cached.Size(), 1);
ASSERT_TRUE(updater->UpdatePredictionCache(Xy_.get(), cache));
}

for (auto policy : {"depthwise", "lossguide"}) {
for (auto subsample : {1.0f, 0.4f}) {
this->RunLearnerTest(ctx, updater_name, subsample, policy, strategy);
this->RunLearnerTest(ctx, updater_name, subsample, policy, strategy);
}
}
}
};

TEST_F(TestPredictionCache, Approx) {
Context ctx;
this->RunTest(&ctx, "grow_histmaker", "one_output_per_tree");
Expand Down
92 changes: 92 additions & 0 deletions tests/cpp/tree/test_prediction_cache.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,92 @@
/**
 * Copyright 2021-2024 by XGBoost contributors
 */
#pragma once

#include <gtest/gtest.h>

#include <xgboost/host_device_vector.h>
#include <xgboost/tree_updater.h>

#include <memory>
#include <string>  // for string, to_string

#include "../../../src/tree/param.h"  // for TrainParam
#include "../helpers.h"
#include "xgboost/task.h"  // for ObjInfo

namespace xgboost {
/**
 * Shared fixture for prediction-cache tests. Verifies that predictions served
 * from a trained learner's internal cache agree with predictions produced by a
 * fresh learner loading the same model, across updaters/devices.
 */
class TestPredictionCache : public ::testing::Test {
  std::shared_ptr<DMatrix> Xy_;
  std::size_t n_samples_{2048};

 protected:
  void SetUp() override {
    std::size_t n_features = 13;
    bst_target_t n_targets = 3;
    Xy_ = RandomDataGenerator{n_samples_, n_features, 0}.Targets(n_targets).GenerateDMatrix(true);
  }

  /**
   * Train a learner, then compare its cached predictions against predictions
   * from a second learner that loads the serialized model.
   *
   * \param ctx          Context providing the target device.
   * \param updater_name Tree updater to exercise (e.g. "grow_histmaker").
   * \param subsample    Row subsampling ratio for training.
   * \param grow_policy  "depthwise" or "lossguide".
   * \param strategy     Multi-target strategy (e.g. "one_output_per_tree").
   */
  void RunLearnerTest(Context const* ctx, std::string const& updater_name, float subsample,
                      std::string const& grow_policy, std::string const& strategy) {
    std::unique_ptr<Learner> learner{Learner::Create({Xy_})};
    learner->SetParam("device", ctx->DeviceName());
    learner->SetParam("updater", updater_name);
    learner->SetParam("multi_strategy", strategy);
    learner->SetParam("grow_policy", grow_policy);
    learner->SetParam("subsample", std::to_string(subsample));
    learner->SetParam("nthread", "0");
    learner->Configure();

    for (size_t i = 0; i < 8; ++i) {
      learner->UpdateOneIter(i, Xy_);
    }

    // Predictions straight from the trained learner's internal cache.
    HostDeviceVector<float> out_prediction_cached;
    learner->Predict(Xy_, false, &out_prediction_cached, 0, 0);

    Json model{Object()};
    learner->SaveModel(&model);

    // Predictions from a fresh learner with no cache, loading the same model.
    HostDeviceVector<float> out_prediction;
    {
      std::unique_ptr<Learner> learner{Learner::Create({Xy_})};
      learner->LoadModel(model);
      learner->Predict(Xy_, false, &out_prediction, 0, 0);
    }

    auto const h_predt_cached = out_prediction_cached.ConstHostSpan();
    auto const h_predt = out_prediction.ConstHostSpan();

    ASSERT_EQ(h_predt.size(), h_predt_cached.size());
    for (size_t i = 0; i < h_predt.size(); ++i) {
      ASSERT_NEAR(h_predt[i], h_predt_cached[i], kRtEps);
    }
  }

  /**
   * Entry point for per-updater tests: first drives the updater directly and
   * asserts UpdatePredictionCache succeeds, then runs the learner-level
   * cached-vs-reloaded comparison across grow policies and subsample ratios.
   */
  void RunTest(Context* ctx, std::string const& updater_name, std::string const& strategy) {
    {
      ctx->InitAllowUnknown(Args{{"nthread", "8"}});

      ObjInfo task{ObjInfo::kRegression};
      std::unique_ptr<TreeUpdater> updater{TreeUpdater::Create(updater_name, ctx, &task)};
      RegTree tree;
      std::vector<RegTree*> trees{&tree};
      auto gpair = GenerateRandomGradients(n_samples_);
      tree::TrainParam param;
      param.UpdateAllowUnknown(Args{{"max_bin", "64"}});

      updater->Configure(Args{});
      std::vector<HostDeviceVector<bst_node_t>> position(1);
      updater->Update(&param, &gpair, Xy_.get(), position, trees);
      HostDeviceVector<float> out_prediction_cached;
      out_prediction_cached.SetDevice(ctx->Device());
      out_prediction_cached.Resize(n_samples_);
      auto cache =
          linalg::MakeTensorView(ctx, &out_prediction_cached, out_prediction_cached.Size(), 1);
      ASSERT_TRUE(updater->UpdatePredictionCache(Xy_.get(), cache));
    }

    for (auto policy : {"depthwise", "lossguide"}) {
      for (auto subsample : {1.0f, 0.4f}) {
        // NOTE(review): the learner test is deliberately invoked twice per
        // configuration in the original — presumably to check the run is
        // reproducible; confirm this duplication is intentional.
        this->RunLearnerTest(ctx, updater_name, subsample, policy, strategy);
        this->RunLearnerTest(ctx, updater_name, subsample, policy, strategy);
      }
    }
  }
};
}  // namespace xgboost

0 comments on commit a54282f

Please sign in to comment.