[onert-micro] Add cmsis-nn Add kernel (#11565)
This commit adds the cmsis-nn Add kernel, providing int8 and int16 specializations backed by arm_elementwise_add_s8 and arm_elementwise_add_s16.

ONE-DCO-1.0-Signed-off-by: Artem Balyshev <[email protected]>

Co-authored-by: Artem Balyshev <[email protected]>
BalyshevArtem and Artem Balyshev authored Sep 21, 2023
1 parent 0eb8ad1 commit 347b36b
Showing 3 changed files with 131 additions and 2 deletions.
@@ -1,4 +1,5 @@
REGISTER_KERNEL(ABS, Abs)
REGISTER_KERNEL(ADD, Add)
REGISTER_KERNEL(ARG_MAX, ArgMax)
REGISTER_KERNEL(ARG_MIN, ArgMin)
REGISTER_KERNEL(DIV, Div)
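For context, kernel lists like the one above are typically consumed through the X-macro pattern: the interpreter defines REGISTER_KERNEL, includes the list file, and each entry expands in place into a registration. A minimal self-contained sketch of that pattern (the registry type and entry fields are hypothetical stand-ins, not taken from onert-micro):

```cpp
#include <vector>

// Hypothetical registry entry; real kernels take configure/execute
// functions with interpreter-specific signatures.
struct KernelEntry
{
  const char *op_name; // builtin operator name, e.g. "ADD"
  void (*execute)();   // kernel entry point (signature simplified)
};

void execute_Abs() {}
void execute_Add() {}

// Define the macro, then "include" the list; here two entries are inlined
// instead of the real `#include "KernelsToBuild.lst"`.
#define REGISTER_KERNEL(builtin_op, name) KernelEntry{#builtin_op, execute_##name},
static const std::vector<KernelEntry> kKernels = {
  REGISTER_KERNEL(ABS, Abs)
  REGISTER_KERNEL(ADD, Add)
};
#undef REGISTER_KERNEL
```

Adding `REGISTER_KERNEL(ADD, Add)` to the cmsis-nn list is therefore what makes the new PALAdd.h specializations below reachable in that build.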
53 changes: 53 additions & 0 deletions onert-micro/luci-interpreter/pal/cmsisnn/PALAdd.h
@@ -0,0 +1,53 @@
/*
* Copyright (c) 2023 Samsung Electronics Co., Ltd. All Rights Reserved
* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

#ifndef LUCI_INTERPRETER_PAL_ADD_H
#define LUCI_INTERPRETER_PAL_ADD_H

#include "PALAddCommon.h"
#include "arm_nnfunctions.h"

namespace luci_interpreter_pal
{
template <>
inline void Add<int8_t>(const ArithmeticParams &params, const int flat_size,
const int8_t *input1_data, const int8_t *input2_data, int8_t *output_data)
{
auto status = arm_elementwise_add_s8(
input1_data, input2_data, params.input1_offset, params.input1_multiplier, params.input1_shift,
params.input2_offset, params.input2_multiplier, params.input2_shift, params.left_shift,
output_data, params.output_offset, params.output_multiplier, params.output_shift,
params.quantized_activation_min, params.quantized_activation_max, flat_size);
assert(status == ARM_CMSIS_NN_SUCCESS);
}

template <>
inline void Add<int16_t>(const ArithmeticParams &params, const int flat_size,
const int16_t *input1_data, const int16_t *input2_data,
int16_t *output_data)
{
auto status = arm_elementwise_add_s16(
input1_data, input2_data, params.input1_offset, params.input1_multiplier, params.input1_shift,
params.input2_offset, params.input2_multiplier, params.input2_shift, params.left_shift,
output_data, params.output_offset, params.output_multiplier, params.output_shift,
params.quantized_activation_min, params.quantized_activation_max, flat_size);
assert(status == ARM_CMSIS_NN_SUCCESS);
}

} // namespace luci_interpreter_pal

#endif // LUCI_INTERPRETER_PAL_ADD_H
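The two specializations above only take effect because the included PALAddCommon.h declares a primary `Add` template with the same signature; calls with other element types fall through to the generic path, while int8/int16 calls pick the CMSIS-NN-backed versions. A minimal sketch of that dispatch shape (the struct and loop body here are assumptions for illustration, not the real PALAddCommon.h):

```cpp
#include <algorithm>
#include <cstdint>

// Hypothetical stand-in for the PAL parameter struct; only the fields
// used by this sketch are shown.
struct ArithmeticParams
{
  int32_t quantized_activation_min;
  int32_t quantized_activation_max;
};

// Primary template: a plain elementwise add with activation clamping.
// A full specialization such as Add<int8_t> (as in PALAdd.h above) is
// preferred over this definition at any matching call site.
template <typename T>
inline void Add(const ArithmeticParams &params, const int flat_size, const T *input1_data,
                const T *input2_data, T *output_data)
{
  for (int i = 0; i < flat_size; ++i)
  {
    const int32_t sum =
      static_cast<int32_t>(input1_data[i]) + static_cast<int32_t>(input2_data[i]);
    output_data[i] = static_cast<T>(std::min(params.quantized_activation_max,
                                             std::max(params.quantized_activation_min, sum)));
  }
}
```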
79 changes: 77 additions & 2 deletions onert-micro/luci-interpreter/src/kernels/Add.cpp
@@ -25,6 +25,74 @@
namespace luci_interpreter
{

namespace
{

#ifndef DIS_QUANT
void evalQuantized(const circle::Tensor *input1, const circle::Tensor *input2,
const circle::Tensor *output, const circle::AddOptions *options,
BaseRuntimeGraph *runtime_graph, DataType type)
{
assert((type == DataType::S16 or type == DataType::S8) && "Wrong Type");

luci_interpreter_pal::ArithmeticParams params{};
luci_interpreter::RuntimeShape input_shape1 =
kernels::getTensorRuntimeShape(input1, runtime_graph);
luci_interpreter::RuntimeShape input_shape2 =
kernels::getTensorRuntimeShape(input2, runtime_graph);

const bool need_broadcast =
luci_interpreter_pal::ProcessBroadcastShapes(input_shape1, input_shape2, &params);

assert(need_broadcast == false && "Broadcast for INT8 and INT16 not supported now");

params.input1_offset = -Tensor::zero_point(input1);
params.input2_offset = -Tensor::zero_point(input2);
params.output_offset = Tensor::zero_point(output);
params.left_shift = (type == DataType::S16) ? 15 : 20;

const auto input1_scale = Tensor::scale(input1);
const auto input2_scale = Tensor::scale(input2);
const auto output_scale = Tensor::scale(output);

const double twice_max_input_scale =
2 * static_cast<double>(std::max(input1_scale, input2_scale));
const double real_input1_multiplier = static_cast<double>(input1_scale / twice_max_input_scale);
const double real_input2_multiplier = static_cast<double>(input2_scale / twice_max_input_scale);
const double real_output_multiplier =
twice_max_input_scale / ((1 << params.left_shift) * static_cast<double>(output_scale));

kernels::quantizeMultiplierSmallerThanOneExp(real_input1_multiplier, &params.input1_multiplier,
&params.input1_shift);
kernels::quantizeMultiplierSmallerThanOneExp(real_input2_multiplier, &params.input2_multiplier,
&params.input2_shift);
kernels::quantizeMultiplierSmallerThanOneExp(real_output_multiplier, &params.output_multiplier,
&params.output_shift);

kernels::calculateActivationRangeQuantized(luci_actfunc(options->fused_activation_function()),
output, &params.quantized_activation_min,
&params.quantized_activation_max);
if (type == DataType::S8)
{
luci_interpreter_pal::Add(
params, input_shape1.flatSize(),
kernels::getTensorData<int8_t>(runtime_graph->getDataByTensor(input1)),
kernels::getTensorData<int8_t>(runtime_graph->getDataByTensor(input2)),
kernels::getTensorData<int8_t>(runtime_graph->getDataByTensor(output)));
}
else
{
luci_interpreter_pal::Add(
params, input_shape1.flatSize(),
kernels::getTensorData<int16_t>(runtime_graph->getDataByTensor(input1)),
kernels::getTensorData<int16_t>(runtime_graph->getDataByTensor(input2)),
kernels::getTensorData<int16_t>(runtime_graph->getDataByTensor(output)));
}
}
#endif // DIS_QUANT

} // namespace

void configure_kernel_CircleAdd(const circle::Operator *cur_op, BaseRuntimeGraph *runtime_graph)
{
kernels::TISOKernel kernel(cur_op, runtime_graph);
@@ -58,8 +126,8 @@ void execute_kernel_CircleAdd(const circle::Operator *cur_op, BaseRuntimeGraph *
kernels::getTensorRuntimeShape(kernel.input2(), runtime_graph);

bool is_inplace = runtime_graph->is_inplace_op(cur_op);

-switch (Tensor::element_type(kernel.input1()))
+const auto type = Tensor::element_type(kernel.input1());
+switch (type)
{
#ifndef DIS_FLOAT
case DataType::FLOAT32:
Expand Down Expand Up @@ -114,6 +182,13 @@ void execute_kernel_CircleAdd(const circle::Operator *cur_op, BaseRuntimeGraph *
}
}
break;
case DataType::S8:
case DataType::S16:
{
evalQuantized(kernel.input1(), kernel.input2(), kernel.output(), options, runtime_graph,
type);
}
break;
default:
assert(false && "Unsupported type.");
}
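To make the scale math in evalQuantized concrete: each input is pre-shifted left by left_shift (20 for S8, 15 for S16) and rescaled by scale / (2 * max(scale1, scale2)), which guarantees each real input multiplier is at most 0.5 and therefore representable by quantizeMultiplierSmallerThanOneExp; the output multiplier then undoes the shift and maps the sum onto the output scale. Below is a self-contained sketch with sample scales; the frexp-based helper mimics what the project's quantizeMultiplierSmallerThanOneExp is assumed to produce (a Q31 multiplier plus an exponent) but is not the real helper:

```cpp
#include <algorithm>
#include <cmath>
#include <cstdint>
#include <cstdio>

// Simplified stand-in for the quantize-multiplier helper: decompose
// real = m * 2^exp with m in [0.5, 1), store m as a Q31 integer.
// (The real helper also handles the edge case where m rounds up to 1.0.)
static void quantizeMultiplier(double real, int32_t *mult, int *shift)
{
  int exp = 0;
  const double m = std::frexp(real, &exp);
  *mult = static_cast<int32_t>(std::lround(m * (1ll << 31)));
  *shift = exp; // <= 0 whenever real <= 0.5
}

int main()
{
  // Sample tensor scales (assumed for illustration).
  const double input1_scale = 0.05, input2_scale = 0.1, output_scale = 0.2;
  const int left_shift = 20; // the S8 value used by evalQuantized above

  const double twice_max = 2 * std::max(input1_scale, input2_scale);      // 0.2
  const double real_in1 = input1_scale / twice_max;                       // 0.25
  const double real_in2 = input2_scale / twice_max;                       // 0.50
  const double real_out = twice_max / ((1 << left_shift) * output_scale); // 2^-20

  int32_t mult;
  int shift;
  quantizeMultiplier(real_in1, &mult, &shift);
  std::printf("input1: mult=%d shift=%d\n", mult, shift); // 1073741824, -1
  quantizeMultiplier(real_in2, &mult, &shift);
  std::printf("input2: mult=%d shift=%d\n", mult, shift); // 1073741824, 0
  quantizeMultiplier(real_out, &mult, &shift);
  std::printf("output: mult=%d shift=%d\n", mult, shift); // 1073741824, -19
  return 0;
}
```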
