[onert-micro] Add cmsis-nn Add kernel (#11565)
This commit adds the cmsis-nn Add kernel, providing int8 and int16 specializations backed by arm_elementwise_add_s8 and arm_elementwise_add_s16.

ONE-DCO-1.0-Signed-off-by: Artem Balyshev <[email protected]>

Co-authored-by: Artem Balyshev <[email protected]>
BalyshevArtem and Artem Balyshev authored Sep 21, 2023
1 parent 0eb8ad1 commit 347b36b
Showing 3 changed files with 131 additions and 2 deletions.
@@ -1,4 +1,5 @@
REGISTER_KERNEL(ABS, Abs)
REGISTER_KERNEL(ADD, Add)
REGISTER_KERNEL(ARG_MAX, ArgMax)
REGISTER_KERNEL(ARG_MIN, ArgMin)
REGISTER_KERNEL(DIV, Div)
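For context, kernel lists like the one above are typically consumed through the X-macro pattern: the interpreter defines REGISTER_KERNEL, includes the list file, and each entry expands in place into a registration. A minimal self-contained sketch of that pattern (the registry type and entry fields are hypothetical stand-ins, not taken from onert-micro):

```cpp
#include <vector>

// Hypothetical registry entry; real kernels take configure/execute
// functions with interpreter-specific signatures.
struct KernelEntry
{
  const char *op_name; // builtin operator name, e.g. "ADD"
  void (*execute)();   // kernel entry point (signature simplified)
};

void execute_Abs() {}
void execute_Add() {}

// Define the macro, then "include" the list; here two entries are inlined
// instead of the real `#include "KernelsToBuild.lst"`.
#define REGISTER_KERNEL(builtin_op, name) KernelEntry{#builtin_op, execute_##name},
static const std::vector<KernelEntry> kKernels = {
  REGISTER_KERNEL(ABS, Abs)
  REGISTER_KERNEL(ADD, Add)
};
#undef REGISTER_KERNEL
```

Adding `REGISTER_KERNEL(ADD, Add)` to the cmsis-nn list is therefore what makes the new PALAdd.h specializations below reachable in that build.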
53 changes: 53 additions & 0 deletions onert-micro/luci-interpreter/pal/cmsisnn/PALAdd.h
@@ -0,0 +1,53 @@
/*
* Copyright (c) 2023 Samsung Electronics Co., Ltd. All Rights Reserved
* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

#ifndef LUCI_INTERPRETER_PAL_ADD_H
#define LUCI_INTERPRETER_PAL_ADD_H

#include "PALAddCommon.h"
#include "arm_nnfunctions.h"

namespace luci_interpreter_pal
{
template <>
inline void Add<int8_t>(const ArithmeticParams &params, const int flat_size,
const int8_t *input1_data, const int8_t *input2_data, int8_t *output_data)
{
auto status = arm_elementwise_add_s8(
input1_data, input2_data, params.input1_offset, params.input1_multiplier, params.input1_shift,
params.input2_offset, params.input2_multiplier, params.input2_shift, params.left_shift,
output_data, params.output_offset, params.output_multiplier, params.output_shift,
params.quantized_activation_min, params.quantized_activation_max, flat_size);
assert(status == ARM_CMSIS_NN_SUCCESS);
}

template <>
inline void Add<int16_t>(const ArithmeticParams &params, const int flat_size,
const int16_t *input1_data, const int16_t *input2_data,
int16_t *output_data)
{
auto status = arm_elementwise_add_s16(
input1_data, input2_data, params.input1_offset, params.input1_multiplier, params.input1_shift,
params.input2_offset, params.input2_multiplier, params.input2_shift, params.left_shift,
output_data, params.output_offset, params.output_multiplier, params.output_shift,
params.quantized_activation_min, params.quantized_activation_max, flat_size);
assert(status == ARM_CMSIS_NN_SUCCESS);
}

} // namespace luci_interpreter_pal

#endif // LUCI_INTERPRETER_PAL_ADD_H
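The two specializations above only take effect because the included PALAddCommon.h declares a primary `Add` template with the same signature; calls with other element types fall through to the generic path, while int8/int16 calls pick the CMSIS-NN-backed versions. A minimal sketch of that dispatch shape (the struct and loop body here are assumptions for illustration, not the real PALAddCommon.h):

```cpp
#include <algorithm>
#include <cstdint>

// Hypothetical stand-in for the PAL parameter struct; only the fields
// used by this sketch are shown.
struct ArithmeticParams
{
  int32_t quantized_activation_min;
  int32_t quantized_activation_max;
};

// Primary template: a plain elementwise add with activation clamping.
// A full specialization such as Add<int8_t> (as in PALAdd.h above) is
// preferred over this definition at any matching call site.
template <typename T>
inline void Add(const ArithmeticParams &params, const int flat_size, const T *input1_data,
                const T *input2_data, T *output_data)
{
  for (int i = 0; i < flat_size; ++i)
  {
    const int32_t sum =
      static_cast<int32_t>(input1_data[i]) + static_cast<int32_t>(input2_data[i]);
    output_data[i] = static_cast<T>(std::min(params.quantized_activation_max,
                                             std::max(params.quantized_activation_min, sum)));
  }
}
```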
79 changes: 77 additions & 2 deletions onert-micro/luci-interpreter/src/kernels/Add.cpp
@@ -25,6 +25,74 @@
namespace luci_interpreter
{

namespace
{

#ifndef DIS_QUANT
void evalQuantized(const circle::Tensor *input1, const circle::Tensor *input2,
const circle::Tensor *output, const circle::AddOptions *options,
BaseRuntimeGraph *runtime_graph, DataType type)
{
assert((type == DataType::S16 or type == DataType::S8) && "Wrong Type");

luci_interpreter_pal::ArithmeticParams params{};
luci_interpreter::RuntimeShape input_shape1 =
kernels::getTensorRuntimeShape(input1, runtime_graph);
luci_interpreter::RuntimeShape input_shape2 =
kernels::getTensorRuntimeShape(input2, runtime_graph);

const bool need_broadcast =
luci_interpreter_pal::ProcessBroadcastShapes(input_shape1, input_shape2, &params);

assert(need_broadcast == false && "Broadcast for INT8 and INT16 not supported now");

params.input1_offset = -Tensor::zero_point(input1);
params.input2_offset = -Tensor::zero_point(input2);
params.output_offset = Tensor::zero_point(output);
params.left_shift = (type == DataType::S16) ? 15 : 20;

const auto input1_scale = Tensor::scale(input1);
const auto input2_scale = Tensor::scale(input2);
const auto output_scale = Tensor::scale(output);

const double twice_max_input_scale =
2 * static_cast<double>(std::max(input1_scale, input2_scale));
const double real_input1_multiplier = static_cast<double>(input1_scale / twice_max_input_scale);
const double real_input2_multiplier = static_cast<double>(input2_scale / twice_max_input_scale);
const double real_output_multiplier =
twice_max_input_scale / ((1 << params.left_shift) * static_cast<double>(output_scale));

kernels::quantizeMultiplierSmallerThanOneExp(real_input1_multiplier, &params.input1_multiplier,
&params.input1_shift);
kernels::quantizeMultiplierSmallerThanOneExp(real_input2_multiplier, &params.input2_multiplier,
&params.input2_shift);
kernels::quantizeMultiplierSmallerThanOneExp(real_output_multiplier, &params.output_multiplier,
&params.output_shift);

kernels::calculateActivationRangeQuantized(luci_actfunc(options->fused_activation_function()),
output, &params.quantized_activation_min,
&params.quantized_activation_max);
if (type == DataType::S8)
{
luci_interpreter_pal::Add(
params, input_shape1.flatSize(),
kernels::getTensorData<int8_t>(runtime_graph->getDataByTensor(input1)),
kernels::getTensorData<int8_t>(runtime_graph->getDataByTensor(input2)),
kernels::getTensorData<int8_t>(runtime_graph->getDataByTensor(output)));
}
else
{
luci_interpreter_pal::Add(
params, input_shape1.flatSize(),
kernels::getTensorData<int16_t>(runtime_graph->getDataByTensor(input1)),
kernels::getTensorData<int16_t>(runtime_graph->getDataByTensor(input2)),
kernels::getTensorData<int16_t>(runtime_graph->getDataByTensor(output)));
}
}
#endif // DIS_QUANT

} // namespace

void configure_kernel_CircleAdd(const circle::Operator *cur_op, BaseRuntimeGraph *runtime_graph)
{
kernels::TISOKernel kernel(cur_op, runtime_graph);
@@ -58,8 +126,8 @@ void execute_kernel_CircleAdd(const circle::Operator *cur_op, BaseRuntimeGraph *
kernels::getTensorRuntimeShape(kernel.input2(), runtime_graph);

bool is_inplace = runtime_graph->is_inplace_op(cur_op);

-switch (Tensor::element_type(kernel.input1()))
+const auto type = Tensor::element_type(kernel.input1());
+switch (type)
{
#ifndef DIS_FLOAT
case DataType::FLOAT32:
Expand Down Expand Up @@ -114,6 +182,13 @@ void execute_kernel_CircleAdd(const circle::Operator *cur_op, BaseRuntimeGraph *
}
}
break;
case DataType::S8:
case DataType::S16:
{
evalQuantized(kernel.input1(), kernel.input2(), kernel.output(), options, runtime_graph,
type);
}
break;
default:
assert(false && "Unsupported type.");
}
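To make the scale math in evalQuantized concrete: each input is pre-shifted left by left_shift (20 for S8, 15 for S16) and rescaled by scale / (2 * max(scale1, scale2)), which guarantees each real input multiplier is at most 0.5 and therefore representable by quantizeMultiplierSmallerThanOneExp; the output multiplier then undoes the shift and maps the sum onto the output scale. Below is a self-contained sketch with sample scales; the frexp-based helper mimics what the project's quantizeMultiplierSmallerThanOneExp is assumed to produce (a Q31 multiplier plus an exponent) but is not the real helper:

```cpp
#include <algorithm>
#include <cmath>
#include <cstdint>
#include <cstdio>

// Simplified stand-in for the quantize-multiplier helper: decompose
// real = m * 2^exp with m in [0.5, 1), store m as a Q31 integer.
// (The real helper also handles the edge case where m rounds up to 1.0.)
static void quantizeMultiplier(double real, int32_t *mult, int *shift)
{
  int exp = 0;
  const double m = std::frexp(real, &exp);
  *mult = static_cast<int32_t>(std::lround(m * (1ll << 31)));
  *shift = exp; // <= 0 whenever real <= 0.5
}

int main()
{
  // Sample tensor scales (assumed for illustration).
  const double input1_scale = 0.05, input2_scale = 0.1, output_scale = 0.2;
  const int left_shift = 20; // the S8 value used by evalQuantized above

  const double twice_max = 2 * std::max(input1_scale, input2_scale);      // 0.2
  const double real_in1 = input1_scale / twice_max;                       // 0.25
  const double real_in2 = input2_scale / twice_max;                       // 0.50
  const double real_out = twice_max / ((1 << left_shift) * output_scale); // 2^-20

  int32_t mult;
  int shift;
  quantizeMultiplier(real_in1, &mult, &shift);
  std::printf("input1: mult=%d shift=%d\n", mult, shift); // 1073741824, -1
  quantizeMultiplier(real_in2, &mult, &shift);
  std::printf("input2: mult=%d shift=%d\n", mult, shift); // 1073741824, 0
  quantizeMultiplier(real_out, &mult, &shift);
  std::printf("output: mult=%d shift=%d\n", mult, shift); // 1073741824, -19
  return 0;
}
```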
