From 68dd18b0da4c405f5517694f4eef9cb9dcd8e751 Mon Sep 17 00:00:00 2001
From: Jiho Chu 
Date: Mon, 5 Feb 2024 17:44:54 +0900
Subject: [PATCH] [Layer] Add update scale factor method

It adds a method for updating the scale factor.
The scale factor needs to be applied during the calcDerivative step when
it is enabled.

Signed-off-by: Jiho Chu 
---
 .../loss/cross_entropy_sigmoid_loss_layer.cpp |  3 +
 .../loss/cross_entropy_softmax_loss_layer.cpp | 14 ++--
 .../loss/cross_entropy_softmax_loss_layer.h   |  2 +-
 nntrainer/layers/loss/loss_layer.h            | 17 ++++-
 nntrainer/layers/loss/meson.build             |  4 +-
 nntrainer/layers/loss/mse_loss_layer.cpp      | 72 +++++++++++++++----
 6 files changed, 88 insertions(+), 24 deletions(-)

diff --git a/nntrainer/layers/loss/cross_entropy_sigmoid_loss_layer.cpp b/nntrainer/layers/loss/cross_entropy_sigmoid_loss_layer.cpp
index 60ea113418..feeff2b3d8 100644
--- a/nntrainer/layers/loss/cross_entropy_sigmoid_loss_layer.cpp
+++ b/nntrainer/layers/loss/cross_entropy_sigmoid_loss_layer.cpp
@@ -61,6 +61,9 @@ void CrossEntropySigmoidLossLayer::calcDerivative(RunLayerContext &context) {
   Tensor &y = context.getInput(SINGLE_INOUT_IDX);
 
   y.apply(ActiFunc::sigmoid, ret_derivative);
+
+  applyLossScale(ret_derivative);
+
   ret_derivative.subtract_i(y2);
   if (ret_derivative.divide_i(ret_derivative.size()) != ML_ERROR_NONE) {
     throw std::runtime_error("[CrossEntropySigmoidLossLayer::calcDerivative] "
diff --git a/nntrainer/layers/loss/cross_entropy_softmax_loss_layer.cpp b/nntrainer/layers/loss/cross_entropy_softmax_loss_layer.cpp
index 7899fa8e03..53022fce73 100644
--- a/nntrainer/layers/loss/cross_entropy_softmax_loss_layer.cpp
+++ b/nntrainer/layers/loss/cross_entropy_softmax_loss_layer.cpp
@@ -81,20 +81,22 @@ void CrossEntropySoftmaxLossLayer::calcDerivative(RunLayerContext &context) {
 #endif
 }
 
-  float loss_scale = std::get(loss_props).get();
-
   /// @note y and ret_derivative can be same here, so this has to be out-place
   /// operation
   // TODO: verify y and ret_derivative must not be same as loss layer is not
   // working in-place
-  ret.subtract(y2, ret_derivative);
+  if (ret.getDataType() != y2.getDataType()) {
+    ret.subtract(y2.clone(ret.getDataType()), ret_derivative);
+  } else {
+    ret.subtract(y2, ret_derivative);
+  }
+
+  applyLossScale(ret_derivative);
+
   if (ret_derivative.divide_i(ret.batch()) != ML_ERROR_NONE) {
     throw std::runtime_error("[CrossEntropySoftmaxLossLayer::calcDerivative] "
                              "Error when calculating loss");
   }
-
-  if (loss_scale != 0.0f)
-    ret_derivative.multiply_i(loss_scale);
 }
 
 } // namespace nntrainer
diff --git a/nntrainer/layers/loss/cross_entropy_softmax_loss_layer.h b/nntrainer/layers/loss/cross_entropy_softmax_loss_layer.h
index c9bc751fbf..ecc3104f36 100644
--- a/nntrainer/layers/loss/cross_entropy_softmax_loss_layer.h
+++ b/nntrainer/layers/loss/cross_entropy_softmax_loss_layer.h
@@ -37,7 +37,7 @@ class CrossEntropySoftmaxLossLayer : public LossLayer {
   ~CrossEntropySoftmaxLossLayer() = default;
 
   /**
-   * @copydoc Layer::forwarding(RunLayerContext &context, bool training)
+   * @copydoc Layer::forwarding(RunLayerContext &context, bool training
    */
   void forwarding(RunLayerContext &context, bool training) override;
 
diff --git a/nntrainer/layers/loss/loss_layer.h b/nntrainer/layers/loss/loss_layer.h
index 87d1fb2a55..bc7c2dec44 100644
--- a/nntrainer/layers/loss/loss_layer.h
+++ b/nntrainer/layers/loss/loss_layer.h
@@ -52,11 +52,19 @@ class LossLayer : public Layer {
    */
   virtual bool supportBackwarding() const override { return true; }
 
+  /**
+   * @brief Set loss scale factor
+   */
+  virtual void 
setLossSacle(float scale) override { loss_scale = scale; } + +private: /** * @copydoc Layer::requireLabel() */ bool requireLabel() const override { return true; } + float loss_scale; /**< loss scale factor */ + protected: /** * @brief update loss @@ -65,10 +73,17 @@ class LossLayer : public Layer { */ void updateLoss(RunLayerContext &context, const Tensor &l); + /** + * @brief apply loss scale + */ + void applyLossScale(Tensor &derivative) { + if (loss_scale != 0.0f) + derivative.multiply_i(loss_scale); + } + Tensor l; /**< loss tensor to store intermediate value to calculate loss value */ - std::tuple loss_props; }; } // namespace nntrainer diff --git a/nntrainer/layers/loss/meson.build b/nntrainer/layers/loss/meson.build index 9fccd0290d..8ec9928101 100644 --- a/nntrainer/layers/loss/meson.build +++ b/nntrainer/layers/loss/meson.build @@ -7,7 +7,9 @@ loss_layer_sources = [ 'constant_derivative_loss_layer.cpp' ] -loss_layer_headers = [] +loss_layer_headers = [ + 'loss_layer.h' +] loss_layer_deps = [] diff --git a/nntrainer/layers/loss/mse_loss_layer.cpp b/nntrainer/layers/loss/mse_loss_layer.cpp index 7f7bd1626f..26935531be 100644 --- a/nntrainer/layers/loss/mse_loss_layer.cpp +++ b/nntrainer/layers/loss/mse_loss_layer.cpp @@ -11,6 +11,7 @@ * */ +#include "tensor.h" #include #include @@ -20,24 +21,42 @@ static constexpr size_t SINGLE_INOUT_IDX = 0; void MSELossLayer::forwarding(RunLayerContext &context, bool training) { Tensor &hidden_ = context.getOutput(SINGLE_INOUT_IDX); - Tensor &y = context.getInput(SINGLE_INOUT_IDX); + Tensor &y_ = context.getInput(SINGLE_INOUT_IDX); // hidden_ <- y2 - y; - if (context.isLabelAvailable(SINGLE_INOUT_IDX)) { - Tensor &y2 = context.getLabel(SINGLE_INOUT_IDX); - y2.subtract(y, hidden_); + auto out_type = hidden_.getDataType(); + if (out_type != y_.getDataType()) { + Tensor y = y_.clone(out_type); + if (context.isLabelAvailable(SINGLE_INOUT_IDX)) { + Tensor &y2 = context.getLabel(SINGLE_INOUT_IDX); + y2.subtract(y, hidden_); - /** calculate sum of squares normalized by size */ - float l2norm = hidden_.l2norm(); - l2norm *= l2norm / hidden_.size(); + /** calculate sum of squares normalized by size */ + float l2norm = hidden_.l2norm(); + l2norm *= l2norm / hidden_.size(); - /** wrap in tensor for update loss */ - Tensor l = Tensor(TensorDim(1, 1, 1, 1), &l2norm); - LossLayer::updateLoss(context, l); - } + /** wrap in tensor for update loss */ + Tensor l = Tensor(TensorDim(1, 1, 1, 1), &l2norm); + LossLayer::updateLoss(context, l); + } + // fill the output + hidden_.fill(y); + } else { + if (context.isLabelAvailable(SINGLE_INOUT_IDX)) { + Tensor &y2 = context.getLabel(SINGLE_INOUT_IDX); + y2.subtract(y_, hidden_); + + /** calculate sum of squares normalized by size */ + float l2norm = hidden_.l2norm(); + l2norm *= l2norm / hidden_.size(); - // fill the output - hidden_.fill(y); + /** wrap in tensor for update loss */ + Tensor l = Tensor(TensorDim(1, 1, 1, 1), &l2norm); + LossLayer::updateLoss(context, l); + } + // fill the output + hidden_.fill(y_); + } } void MSELossLayer::calcDerivative(RunLayerContext &context) { @@ -45,9 +64,32 @@ void MSELossLayer::calcDerivative(RunLayerContext &context) { const Tensor &y2 = context.getIncomingDerivative(SINGLE_INOUT_IDX); Tensor &y = context.getInput(SINGLE_INOUT_IDX); - y.subtract(y2, ret_derivative); + const auto &in_type = y.getDataType(); + if (in_type != y2.getDataType()) { + Tensor y2_ = y2.clone(in_type); + y.subtract(y2_, ret_derivative); + } else { + y.subtract(y2, ret_derivative); + } + + 
applyLossScale(ret_derivative);
+
   float divider = ((float)y.size()) / 2;
-  if (ret_derivative.divide_i(divider) != ML_ERROR_NONE) {
+
+  /* ret_derivative may underflow to zero when divided by a big divider in
+   * fp16. So, the division is calculated with larger precision.
+   */
+  int ret;
+  if (ret_derivative.getDataType() != ml::train::TensorDim::DataType::FP32) {
+    Tensor ret_derivative_ =
+      ret_derivative.clone(ml::train::TensorDim::DataType::FP32);
+    ret = ret_derivative_.divide_i(divider);
+    ret_derivative.copyData(ret_derivative_);
+  } else {
+    ret = ret_derivative.divide_i(divider);
+  }
+
+  if (ret != ML_ERROR_NONE) {
     throw std::runtime_error(
       "[MSELossLayer::calcDerivative] Error when calculating loss");
   }
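
The pattern these hunks introduce is small: LossLayer keeps a loss_scale member, setLossSacle() records it, and each loss layer calls applyLossScale() on the outgoing derivative inside calcDerivative(), with 0.0f acting as the "disabled" value. Below is a minimal, self-contained sketch of that pattern written against plain std::vector<float> rather than nntrainer's Tensor API; apply_loss_scale() and the sample values are illustrative stand-ins, not code from this patch.

#include <cstdio>
#include <vector>

// Hypothetical stand-in for LossLayer::applyLossScale().
void apply_loss_scale(std::vector<float> &derivative, float loss_scale) {
  if (loss_scale == 0.0f)
    return; // 0.0f means "loss scaling disabled"
  for (float &d : derivative)
    d *= loss_scale;
}

int main() {
  // Pretend this is a loss derivative with small magnitudes.
  std::vector<float> derivative = {1.0e-4f, -2.0e-4f, 3.0e-4f};

  // Scale the derivative before it propagates to earlier layers, as the
  // loss layers above do inside calcDerivative().
  apply_loss_scale(derivative, 128.0f);

  for (float d : derivative)
    std::printf("%g ", d); // prints: 0.0128 -0.0256 0.0384
  std::printf("\n");
  return 0;
}

Scaling at the loss layer keeps small derivatives representable once tensors are stored in fp16; the matching unscaling of the gradients before the optimizer step is outside the scope of this patch.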
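
Several hunks also guard element-wise operations against mixed tensor data types by cloning one operand to the other's type first, for example ret.subtract(y2.clone(ret.getDataType()), ret_derivative). A rough sketch of the same idea using only the standard library, with double/float standing in for fp32/fp16; the widen() helper and the sample values are illustrative, not nntrainer code.

#include <cstddef>
#include <cstdio>
#include <vector>

// Stand-in for Tensor::clone(DataType): widen the label to the
// prediction's precision before arithmetic.
std::vector<double> widen(const std::vector<float> &v) {
  return std::vector<double>(v.begin(), v.end());
}

int main() {
  std::vector<double> pred = {0.7, 0.2, 0.1};    // plays the role of "ret"
  std::vector<float> label = {1.0f, 0.0f, 0.0f}; // plays the role of "y2"

  // Mirror of ret.subtract(y2.clone(ret.getDataType()), ret_derivative):
  // convert first, then subtract element-wise into the derivative buffer.
  std::vector<double> label_wide = widen(label);
  std::vector<double> derivative(pred.size());
  for (std::size_t i = 0; i < pred.size(); ++i)
    derivative[i] = pred[i] - label_wide[i];

  for (double d : derivative)
    std::printf("%g ", d); // prints: -0.3 0.2 0.1
  std::printf("\n");
  return 0;
}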
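
Finally, the FP32 detour in MSELossLayer::calcDerivative exists because dividing an fp16 value by divider = y.size() / 2 can push it below fp16's smallest subnormal (about 6e-8), where it rounds to zero. The sketch below illustrates the effect; it assumes a toolchain that provides the _Float16 extension type (recent GCC or Clang), and the element count and derivative value are made up for illustration.

#include <cstdio>

int main() {
  // Pretend the output tensor has 65536 elements, so divider = size / 2.
  const float grad = 5.0e-4f;     // a small but fp16-representable derivative
  const float divider = 32768.0f; // y.size() / 2

  _Float16 h = (_Float16)grad;
  h = h / (_Float16)divider;        // fp16 quotient falls below ~6e-8
  float fp16_path = (float)h;

  float fp32_path = grad / divider; // fp32 keeps the tiny quotient

  std::printf("fp16 path: %g\nfp32 path: %g\n", fp16_path, fp32_path);
  // Expected: the fp16 path prints 0, the fp32 path prints ~1.52588e-08,
  // which is why the patch divides in FP32 and copies the data back.
  return 0;
}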