Samsung · BalyshevArtem · Jun 18, 2024 · Jun 17, 2024
diff --git a/onert-micro/onert-micro/include/pal/common/PALFullyConnectedInputGrad.h b/onert-micro/onert-micro/include/pal/common/PALFullyConnectedInputGrad.h
@@ -0,0 +1,61 @@
+/*
+ * Copyright (c) 2024 Samsung Electronics Co., Ltd. All Rights Reserved
+ * Copyright 2020 The TensorFlow Authors. All Rights Reserved.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef ONERT_MICRO_EXECUTE_PAL_COMMON_FULLY_CONNECTED_INPUT_GRAD_H
+#define ONERT_MICRO_EXECUTE_PAL_COMMON_FULLY_CONNECTED_INPUT_GRAD_H
+
+#include "OMStatus.h"
+#include "PALUtils.h"
+
+#include <cmath>
+
+namespace onert_micro
+{
+namespace train
+{
+namespace pal
+{
+
+void inline FullyConnectedInputGrad(const float *dloss_doutput_data,
+                                    const core::OMRuntimeShape &dloss_doutput_shape,
+                                    const float *weight_data,
+                                    const core::OMRuntimeShape &weight_shape,
+                                    float *dloss_dinput_data)
+{
+  const uint32_t input_rows = dloss_doutput_shape.dims(0);
+  const uint32_t input_col = weight_shape.dims(1);
+  const uint32_t output_cols = dloss_doutput_shape.dims(1);
+
+  for (uint32_t i = 0; i < input_rows; ++i)
+  {
+    for (uint32_t j = 0; j < input_col; ++j)
+    {
+      float total = 0.f;
+      for (uint32_t o = 0; o < output_cols; ++o)
+      {
+        total += weight_data[o * input_col + j] * dloss_doutput_data[o + i * output_cols];
+      }
+      dloss_dinput_data[j + i * input_col] = total;
+    }
+  }
+}
+
+} // namespace pal
+} // namespace train
+} // namespace onert_micro
+
+#endif // ONERT_MICRO_EXECUTE_PAL_COMMON_FULLY_CONNECTED_WEIGHT_GRAD_H
diff --git a/onert-micro/onert-micro/include/pal/common/PALFullyConnectedWeightGrad.h b/onert-micro/onert-micro/include/pal/common/PALFullyConnectedWeightGrad.h
@@ -0,0 +1,70 @@
+/*
+ * Copyright (c) 2024 Samsung Electronics Co., Ltd. All Rights Reserved
+ * Copyright 2020 The TensorFlow Authors. All Rights Reserved.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef ONERT_MICRO_EXECUTE_PAL_COMMON_FULLY_CONNECTED_WEIGHT_GRAD_H
+#define ONERT_MICRO_EXECUTE_PAL_COMMON_FULLY_CONNECTED_WEIGHT_GRAD_H
+
+#include "OMStatus.h"
+#include "PALUtils.h"
+
+#include <cmath>
+
+namespace onert_micro
+{
+namespace train
+{
+namespace pal
+{
+
+void inline FullyConnectedWeightGrad(const float *dloss_doutput_data,
+                                     const core::OMRuntimeShape &dloss_doutput_shape,
+                                     const float *input_data,
+                                     const core::OMRuntimeShape &input_shape,
+                                     float *dloss_dweight_data)
+{
+  const uint32_t batches = input_shape.dims(0);
+  const uint32_t output_depth = dloss_doutput_shape.dims(1);
+  const uint32_t accum_depth = input_shape.dims(1);
+
+  for (uint32_t o = 0; o < output_depth; ++o)
+  {
+    float cur_dloss_doutput = dloss_doutput_data[o];
+    for (uint32_t i = 0; i < accum_depth; ++i)
+    {
+      dloss_dweight_data[i + o * accum_depth] = cur_dloss_doutput * input_data[i];
+    }
+  }
+
+  for (int b = 1; b < batches; ++b)
+  {
+    for (uint32_t o = 0; o < output_depth; ++o)
+    {
+      float cur_dloss_doutput = dloss_doutput_data[o + b * output_depth];
+      for (uint32_t i = 0; i < accum_depth; ++i)
+      {
+        dloss_dweight_data[i + o * accum_depth] +=
+          cur_dloss_doutput * input_data[i + b * accum_depth];
+      }
+    }
+  }
+}
+
+} // namespace pal
+} // namespace train
+} // namespace onert_micro
+
+#endif // ONERT_MICRO_EXECUTE_PAL_COMMON_FULLY_CONNECTED_WEIGHT_GRAD_H
diff --git a/onert-micro/onert-micro/include/pal/common/PALReluInputGrad.h b/onert-micro/onert-micro/include/pal/common/PALReluInputGrad.h
@@ -0,0 +1,49 @@
+/*
+ * Copyright (c) 2024 Samsung Electronics Co., Ltd. All Rights Reserved
+ * Copyright 2020 The TensorFlow Authors. All Rights Reserved.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef ONERT_MICRO_EXECUTE_PAL_COMMON_RELU_INPUT_GRAD_H
+#define ONERT_MICRO_EXECUTE_PAL_COMMON_RELU_INPUT_GRAD_H
+
+#include "OMStatus.h"
+#include "PALUtils.h"
+
+#include <cmath>
+
+namespace onert_micro
+{
+namespace train
+{
+namespace pal
+{
+
+// Note: Perform inplace calculation
+void inline ReluInputGrad(const float *input_relu_data, float *dloss_doutput_data,
+                          const core::OMRuntimeShape &dloss_doutput_shape)
+{
+  const uint32_t flat_size = dloss_doutput_shape.flatSize();
+
+  for (uint32_t i = 0; i < flat_size; ++i)
+  {
+    dloss_doutput_data[i] = input_relu_data[i] > 0 ? dloss_doutput_data[i] : 0.f;
+  }
+}
+
+} // namespace pal
+} // namespace train
+} // namespace onert_micro
+
+#endif // ONERT_MICRO_EXECUTE_PAL_COMMON_RELU_INPUT_GRAD_H
diff --git a/onert-micro/onert-micro/src/train/kernels/FullyConnected.cpp b/onert-micro/onert-micro/src/train/kernels/FullyConnected.cpp
@@ -0,0 +1,167 @@
+/*
+ * Copyright (c) 2024 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "OMStatus.h"
+#include "core/OMUtils.h"
+#include "core/OMDataType.h"
+#include "train/OMBackpropExecutionBuilder.h"
+#include "execute/OMRuntimeKernel.h"
+#include "PALFullyConnectedWeightGrad.h"
+#include "PALFullyConnectedInputGrad.h"
+#include "PALReluInputGrad.h"
+
+using namespace onert_micro;
+using namespace onert_micro::core;
+using namespace onert_micro::train;
+
+namespace
+{
+
+constexpr uint32_t inputTensorIdx = 0;
+constexpr uint32_t weightTensorIdx = 1;
+constexpr uint32_t biasTensorIdx = 2;
+
+constexpr uint32_t outputTensorIdx = 0;
+
+} // namespace
+
+/*
+ * - Calculate weight gradient (with bias)
+ * - Calculate input gradient - Optional (not required if it is last op)
+ */
+OMStatus onert_micro::train::train_kernel_CircleFullyConnected(const OMBackpropExecuteArgs &args)
+{
+  core::OMRuntimeStorage &forward_storage = args.forward_storage;
+  core::OMRuntimeStorage &backward_storage = args.backward_storage;
+  core::OMRuntimeContext &context = args.backward_context;
+  uint16_t op_index = args.kernel_index;
+
+  const circle::Tensor *input;
+  const circle::Tensor *weight;
+  const circle::Tensor *output;
+
+  uint8_t *input_data;
+  uint8_t *dloss_dinput_data;
+
+  uint8_t *weight_data;
+  uint8_t *dloss_dweight_data;
+
+  uint8_t *bias_data;
+  uint8_t *dloss_dbias_data;
+
+  uint8_t *output_data;
+  uint8_t *dloss_doutput_data;
+
+  const circle::FullyConnectedOptions *options;
+  // Read kernel
+  {
+    execute::OMRuntimeKernel runtime_kernel;
+    runtime_kernel.readKernel(op_index, context);
+
+    input = runtime_kernel.inputs[inputTensorIdx];
+    weight = runtime_kernel.inputs[weightTensorIdx];
+    output = runtime_kernel.outputs[outputTensorIdx];
+    assert(input != nullptr);
+    assert(weight != nullptr);
+    // Bias can be nullptr
+    assert(output != nullptr);
+
+    // Read forward storage
+    {
+      runtime_kernel.getDataFromStorage(op_index, forward_storage, context);
+
+      input_data = runtime_kernel.inputs_data[inputTensorIdx];
+      weight_data = runtime_kernel.inputs_data[weightTensorIdx];
+      bias_data = runtime_kernel.inputs_data[biasTensorIdx];
+      output_data = runtime_kernel.outputs_data[outputTensorIdx];
+      // Bias_data can be nullptr
+      // Output_data can be nullptr
+      assert(input_data != nullptr);
+      assert(weight_data != nullptr);
+    }
+
+    // Read backward storage
+    {
+      runtime_kernel.getDataFromStorage(op_index, backward_storage, context);
+
+      dloss_dinput_data = runtime_kernel.inputs_data[inputTensorIdx];
+      dloss_dweight_data = runtime_kernel.inputs_data[weightTensorIdx];
+      dloss_dbias_data = runtime_kernel.inputs_data[biasTensorIdx];
+      dloss_doutput_data = runtime_kernel.outputs_data[outputTensorIdx];
+      // Bias_data and dloss_dinput_data can be nullptr
+      // Note: dloss_dinput_data can be nullptr due to it can be last trainable node
+      assert(dloss_dweight_data != nullptr);
+      assert(dloss_doutput_data != nullptr);
+    }
+
+    options = runtime_kernel.first_operator->builtin_options_as_FullyConnectedOptions();
+  }
+
+  OMRuntimeShape input_shape(input);
+  OMRuntimeShape output_shape(output);
+
+  // 1. Handle activation functions
+  switch (options->fused_activation_function())
+  {
+    case circle::ActivationFunctionType_NONE:
+      // Do nothing
+      break;
+    case circle::ActivationFunctionType_RELU:
+    {
+      assert(output_data != nullptr);
+      pal::ReluInputGrad(utils::castInputData<float>(output_data),
+                         utils::castOutputData<float>(dloss_doutput_data), output_shape);
+      break;
+    }
+    default:
+    {
+      assert(false && "Unsupported activation type");
+      return UnsupportedType;
+    }
+  }
+
+  // 2. Calculate weight gradient
+  pal::FullyConnectedWeightGrad(core::utils::castInputData<float>(dloss_doutput_data), output_shape,
+                                core::utils::castInputData<float>(input_data), input_shape,
+                                core::utils::castOutputData<float>(dloss_dweight_data));
+
+  // 3. Calculate bias gradient
+  // Just copy dloss_doutput_data to dloss_dbias_data
+  // TODO: introduce training inplace
+  if (dloss_dbias_data)
+  {
+    assert(bias_data != nullptr);
+    if (bias_data == nullptr)
+      return UnknownError;
+
+    std::memcpy(dloss_dbias_data, dloss_doutput_data,
+                sizeof(OMDataType(output->type())) *
+                  output_shape.dims(output_shape.dimensionsCount() - 1));
+  }
+
+  // 4. Calculate (if needed) input grad
+  if (args.is_last_layer == false)
+  {
+    assert(dloss_dinput_data != nullptr);
+
+    pal::FullyConnectedInputGrad(core::utils::castInputData<float>(dloss_doutput_data),
+                                 output_shape, core::utils::castInputData<float>(weight_data),
+                                 OMRuntimeShape(weight),
+                                 core::utils::castOutputData<float>(dloss_dinput_data));
+  }
+
+  return Ok;
+}