From bfdd5fcf19aa89e9386c558891042e1858176ce2 Mon Sep 17 00:00:00 2001 From: Artem Balyshev Date: Fri, 4 Aug 2023 14:24:43 +0300 Subject: [PATCH] [onert-micro] Replace common kernels impl to pal/common This pr replaces common kernels impl to pal/common dir. ONE-DCO-1.0-Signed-off-by: Artem Balyshev --- .../pal/common/PALAddCommon.h | 94 +++ .../pal/common/PALAveragePool2DCommon.h | 93 +++ .../pal/common/PALConv2DCommon.h | 201 +++++++ .../pal/{mcu => common}/PALDiv.h | 6 +- .../pal/common/PALFullyConnectedCommon.h | 103 ++++ .../pal/common/PALMaxPool2DCommon.h | 146 +++++ .../pal/common/PALMulCommon.h | 115 ++++ .../pal/{mcu => common}/PALResizeBilinear.h | 6 +- .../pal/{mcu => common}/PALSoftmax.h | 8 +- .../pal/{mcu => common}/PALSub.h | 6 +- .../PALUnidirectionalSequenceLSTMCommon.h | 567 ++++++++++++++++++ onert-micro/luci-interpreter/pal/mcu/PALAdd.h | 71 +-- .../pal/mcu/PALAveragePool2D.h | 69 +-- .../luci-interpreter/pal/mcu/PALConv2d.h | 179 +----- .../pal/mcu/PALFullyConnected.h | 93 +-- .../luci-interpreter/pal/mcu/PALMaxPool2D.h | 122 +--- onert-micro/luci-interpreter/pal/mcu/PALMul.h | 91 +-- .../pal/mcu/PALUnidirectionalSequenceLSTM.h | 505 +--------------- 18 files changed, 1391 insertions(+), 1084 deletions(-) create mode 100644 onert-micro/luci-interpreter/pal/common/PALAddCommon.h create mode 100644 onert-micro/luci-interpreter/pal/common/PALAveragePool2DCommon.h create mode 100644 onert-micro/luci-interpreter/pal/common/PALConv2DCommon.h rename onert-micro/luci-interpreter/pal/{mcu => common}/PALDiv.h (97%) create mode 100644 onert-micro/luci-interpreter/pal/common/PALFullyConnectedCommon.h create mode 100644 onert-micro/luci-interpreter/pal/common/PALMaxPool2DCommon.h create mode 100644 onert-micro/luci-interpreter/pal/common/PALMulCommon.h rename onert-micro/luci-interpreter/pal/{mcu => common}/PALResizeBilinear.h (97%) rename onert-micro/luci-interpreter/pal/{mcu => common}/PALSoftmax.h (92%) rename onert-micro/luci-interpreter/pal/{mcu => 
common}/PALSub.h (96%) create mode 100644 onert-micro/luci-interpreter/pal/common/PALUnidirectionalSequenceLSTMCommon.h diff --git a/onert-micro/luci-interpreter/pal/common/PALAddCommon.h b/onert-micro/luci-interpreter/pal/common/PALAddCommon.h new file mode 100644 index 00000000000..57f9b107e46 --- /dev/null +++ b/onert-micro/luci-interpreter/pal/common/PALAddCommon.h @@ -0,0 +1,94 @@ +/* + * Copyright (c) 2023 Samsung Electronics Co., Ltd. All Rights Reserved + * Copyright 2017 The TensorFlow Authors. All Rights Reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#ifndef LUCI_INTERPRETER_PAL_ADD_COMMON_H +#define LUCI_INTERPRETER_PAL_ADD_COMMON_H + +#include "Params.h" +#include "PALUtils.h" +#include "ProcessBroadcastShapes.h" + +namespace luci_interpreter_pal +{ + +// TODO: check if there real activation value +template +inline void Add(const ArithmeticParams ¶ms, const int flat_size, const T *input1_data, + const T *input2_data, T *output_data) +{ + T activation_min, activation_max; + getActivationParams(params, &activation_min, &activation_max); + + for (int i = 0; i < flat_size; ++i) + output_data[i] = + std::min(std::max(input1_data[i] + input2_data[i], activation_min), activation_max); +} + +template +inline void +BroadcastAdd4DSlow(const ArithmeticParams ¶ms, + const luci_interpreter::RuntimeShape &input1_shape, const T *input1_data, + const luci_interpreter::RuntimeShape &input2_shape, const T *input2_data, + const luci_interpreter::RuntimeShape &output_shape, T *output_data) +{ + NdArrayDesc<4> desc1; + NdArrayDesc<4> desc2; + NdArrayDescsForElementwiseBroadcast(input1_shape, input2_shape, &desc1, &desc2); + const luci_interpreter::RuntimeShape extended_output_shape = + luci_interpreter::RuntimeShape::extendedShape(4, output_shape); + + T activation_min, activation_max; + getActivationParams(params, &activation_min, &activation_max); + + // In Tensorflow, the dimensions are canonically named (batch_number, row, + // col, channel), with extents (batches, height, width, depth), with the + // trailing dimension changing most rapidly (channels has the smallest stride, + // typically 1 element). + // + // In generated C code, we store arrays with the dimensions reversed. The + // first dimension has smallest stride. + // + // We name our variables by their Tensorflow convention, but generate C code + // nesting loops such that the innermost loop has the smallest stride for the + // best cache behavior. 
+ for (int b = 0; b < extended_output_shape.dims(0); ++b) + { + for (int y = 0; y < extended_output_shape.dims(1); ++y) + { + for (int x = 0; x < extended_output_shape.dims(2); ++x) + { + for (int c = 0; c < extended_output_shape.dims(3); ++c) + { + const int output_data_offset = + ((b * extended_output_shape.dims(1) + y) * extended_output_shape.dims(2) + x) * + extended_output_shape.dims(3) + + c; + + output_data[output_data_offset] = + std::min(std::max(input1_data[subscriptToIndex(desc1, b, y, x, c)] + + input2_data[subscriptToIndex(desc2, b, y, x, c)], + activation_min), + activation_max); + } + } + } + } +} + +} // namespace luci_interpreter_pal + +#endif // LUCI_INTERPRETER_PAL_ADD_COMMON_H diff --git a/onert-micro/luci-interpreter/pal/common/PALAveragePool2DCommon.h b/onert-micro/luci-interpreter/pal/common/PALAveragePool2DCommon.h new file mode 100644 index 00000000000..ec6bb55b5b5 --- /dev/null +++ b/onert-micro/luci-interpreter/pal/common/PALAveragePool2DCommon.h @@ -0,0 +1,93 @@ +/* + * Copyright (c) 2023 Samsung Electronics Co., Ltd. All Rights Reserved + * Copyright 2020 The TensorFlow Authors. All Rights Reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#ifndef LUCI_INTERPRETER_PAL_AVERAGE_POOL_2D_COMMON_H +#define LUCI_INTERPRETER_PAL_AVERAGE_POOL_2D_COMMON_H + +#include "Params.h" +#include "PALUtils.h" + +namespace luci_interpreter_pal +{ + +// TODO: reduce code duplication with MaxPool +inline void AveragePool(const PoolParams ¶ms, const luci_interpreter::RuntimeShape &input_shape, + const float *input_data, const luci_interpreter::RuntimeShape &output_shape, + float *output_data) +{ + const int batches = input_shape.dims(0); + const int depth = output_shape.dims(3); + const int input_height = input_shape.dims(1); + const int input_width = input_shape.dims(2); + const int output_height = output_shape.dims(1); + const int output_width = output_shape.dims(2); + const int stride_height = params.stride_height; + const int stride_width = params.stride_width; + for (int batch = 0; batch < batches; ++batch) + { + for (int out_y = 0; out_y < output_height; ++out_y) + { + for (int out_x = 0; out_x < output_width; ++out_x) + { + for (int channel = 0; channel < depth; ++channel) + { + const int in_x_origin = (out_x * stride_width) - params.padding_values.width; + const int in_y_origin = (out_y * stride_height) - params.padding_values.height; + // Compute the boundaries of the filter region clamped so as to + // ensure that the filter window fits in the input array. 
+ const int filter_x_start = std::max(0, -in_x_origin); + const int filter_x_end = std::min(params.filter_width, input_width - in_x_origin); + const int filter_y_start = std::max(0, -in_y_origin); + const int filter_y_end = std::min(params.filter_height, input_height - in_y_origin); + + float total = 0.f; + float filter_count = 0; + + for (int filter_y = filter_y_start; filter_y < filter_y_end; ++filter_y) + { + for (int filter_x = filter_x_start; filter_x < filter_x_end; ++filter_x) + { + const int in_x = in_x_origin + filter_x; + const int in_y = in_y_origin + filter_y; + + const int input_data_offset = + ((batch * input_shape.dims(1) + in_y) * input_shape.dims(2) + in_x) * + input_shape.dims(3) + + channel; + + total += input_data[input_data_offset]; + filter_count++; + } + } + const int output_data_offset = + ((batch * output_shape.dims(1) + out_y) * output_shape.dims(2) + out_x) * + output_shape.dims(3) + + channel; + + assert(filter_count != 0); + const float average = total / filter_count; + + output_data[output_data_offset] = + std::min(std::max(average, params.float_activation_min), params.float_activation_max); + } + } + } + } +} +} // namespace luci_interpreter_pal + +#endif // LUCI_INTERPRETER_PAL_AVERAGE_POOL_2D_COMMON_H diff --git a/onert-micro/luci-interpreter/pal/common/PALConv2DCommon.h b/onert-micro/luci-interpreter/pal/common/PALConv2DCommon.h new file mode 100644 index 00000000000..04b92cd48e2 --- /dev/null +++ b/onert-micro/luci-interpreter/pal/common/PALConv2DCommon.h @@ -0,0 +1,201 @@ +/* + * Copyright (c) 2023 Samsung Electronics Co., Ltd. All Rights Reserved + * Copyright 2017 The TensorFlow Authors. All Rights Reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef LUCI_INTERPRETER_PAL_CONV2D_COMMON_H +#define LUCI_INTERPRETER_PAL_CONV2D_COMMON_H +#include "Params.h" +#include "PALUtils.h" + +namespace luci_interpreter_pal +{ +static inline void Conv(const ConvParams ¶ms, const int32_t *input_shape, + const float *input_data, const int32_t *filter_shape, + const float *filter_data, const float *bias_data, + const int32_t *output_shape, float *output_data) +{ + const int stride_width = params.stride_width; + const int stride_height = params.stride_height; + const int dilation_width_factor = params.dilation_width_factor; + const int dilation_height_factor = params.dilation_height_factor; + const int pad_width = params.padding_values.width; + const int pad_height = params.padding_values.height; + const float output_activation_min = params.float_activation_min; + const float output_activation_max = params.float_activation_max; + + const auto batches = input_shape[0]; + const int input_height = input_shape[1]; + const int input_width = input_shape[2]; + const int input_depth = input_shape[3]; + const int output_depth = filter_shape[0]; + const int filter_height = filter_shape[1]; + const int filter_width = filter_shape[2]; + const int output_height = output_shape[1]; + const int output_width = output_shape[2]; + for (int batch = 0; batch < batches; ++batch) + { + for (int out_y = 0; out_y < output_height; ++out_y) + { + const int in_y_origin = (out_y * stride_height) - pad_height; + for (int out_x = 0; out_x < output_width; ++out_x) + { + const int in_x_origin = (out_x * stride_width) - pad_width; 
+ for (int out_channel = 0; out_channel < output_depth; ++out_channel) + { + float total = 0.f; + for (int filter_y = 0; filter_y < filter_height; ++filter_y) + { + const int in_y = in_y_origin + dilation_height_factor * filter_y; + for (int filter_x = 0; filter_x < filter_width; ++filter_x) + { + const int in_x = in_x_origin + dilation_width_factor * filter_x; + + // Zero padding by omitting the areas outside the image. + const bool is_point_inside_image = + (in_x >= 0) && (in_x < input_width) && (in_y >= 0) && (in_y < input_height); + + if (!is_point_inside_image) + { + continue; + } + + for (int in_channel = 0; in_channel < input_depth; ++in_channel) + { + const int input_data_offset = + ((batch * input_height + in_y) * input_width + in_x) * input_depth + in_channel; + + const int filter_data_offset = + ((out_channel * filter_height + filter_y) * filter_width + filter_x) * + input_depth + + in_channel; + + const float input_value = input_data[input_data_offset]; + const float filter_value = filter_data[filter_data_offset]; + total += (input_value * filter_value); + } + } + } + // float bias_value = 0.0f; + if (bias_data) + { + total += bias_data[out_channel]; + } + + const int output_data_offset = + ((batch * output_height + out_y) * output_width + out_x) * output_depth + out_channel; + + output_data[output_data_offset] = + std::min(std::max(total, output_activation_min), output_activation_max); + } + } + } + } +} + +static inline void Conv(const ConvParams ¶ms, const int32_t *input_shape, + const uint8_t *input_data, const int32_t *filter_shape, + const uint8_t *filter_data, const int32_t *bias_data, + const int32_t *output_shape, uint8_t *output_data) +{ + const int stride_width = params.stride_width; + const int stride_height = params.stride_height; + const int dilation_width_factor = params.dilation_width_factor; + const int dilation_height_factor = params.dilation_height_factor; + const int pad_width = params.padding_values.width; + const int pad_height = 
params.padding_values.height; + const int32_t input_offset = params.input_offset; + const int32_t filter_offset = params.weights_offset; + const int32_t output_offset = params.output_offset; + const int32_t output_multiplier = params.output_multiplier; + const int output_shift = params.output_shift; + const int32_t output_activation_min = params.quantized_activation_min; + const int32_t output_activation_max = params.quantized_activation_max; + + const auto batches = input_shape[0]; + const int input_height = input_shape[1]; + const int input_width = input_shape[2]; + const int input_depth = input_shape[3]; + const int output_depth = filter_shape[0]; + const int filter_height = filter_shape[1]; + const int filter_width = filter_shape[2]; + const int output_height = output_shape[1]; + const int output_width = output_shape[2]; + + for (int batch = 0; batch < batches; ++batch) + { + for (int out_y = 0; out_y < output_height; ++out_y) + { + const int in_y_origin = (out_y * stride_height) - pad_height; + for (int out_x = 0; out_x < output_width; ++out_x) + { + const int in_x_origin = (out_x * stride_width) - pad_width; + for (int out_channel = 0; out_channel < output_depth; ++out_channel) + { + int32_t acc = 0; + for (int filter_y = 0; filter_y < filter_height; ++filter_y) + { + const int in_y = in_y_origin + dilation_height_factor * filter_y; + for (int filter_x = 0; filter_x < filter_width; ++filter_x) + { + const int in_x = in_x_origin + dilation_width_factor * filter_x; + + // Zero padding by omitting the areas outside the image. 
+ const bool is_point_inside_image = + (in_x >= 0) && (in_x < input_width) && (in_y >= 0) && (in_y < input_height); + + if (!is_point_inside_image) + { + continue; + } + + for (int in_channel = 0; in_channel < input_depth; ++in_channel) + { + const int input_data_offset = + ((batch * input_height + in_y) * input_width + in_x) * input_depth + in_channel; + + const int filter_data_offset = + ((out_channel * filter_height + filter_y) * filter_width + filter_x) * + input_depth + + in_channel; + + const int32_t input_val = input_data[input_data_offset]; + const int32_t filter_val = filter_data[filter_data_offset]; + acc += (filter_val + filter_offset) * (input_val + input_offset); + } + } + } + if (bias_data) + { + acc += bias_data[out_channel]; + } + acc = multiplyByQuantizedMultiplier(acc, output_multiplier, output_shift); + acc += output_offset; + acc = std::max(acc, output_activation_min); + acc = std::min(acc, output_activation_max); + + const int output_data_offset = + ((batch * output_height + out_y) * output_width + out_x) * output_depth + out_channel; + + output_data[output_data_offset] = static_cast(acc); + } + } + } + } +} + +} // namespace luci_interpreter_pal + +#endif // LUCI_INTERPRETER_PAL_CONV2D_COMMON_H diff --git a/onert-micro/luci-interpreter/pal/mcu/PALDiv.h b/onert-micro/luci-interpreter/pal/common/PALDiv.h similarity index 97% rename from onert-micro/luci-interpreter/pal/mcu/PALDiv.h rename to onert-micro/luci-interpreter/pal/common/PALDiv.h index cf84a1007e0..cca85cd224a 100644 --- a/onert-micro/luci-interpreter/pal/mcu/PALDiv.h +++ b/onert-micro/luci-interpreter/pal/common/PALDiv.h @@ -15,8 +15,8 @@ * limitations under the License. 
*/ -#ifndef LUCI_INTERPRETER_PAL_DIV_H -#define LUCI_INTERPRETER_PAL_DIV_H +#ifndef LUCI_INTERPRETER_PAL_DIV_COMMON_H +#define LUCI_INTERPRETER_PAL_DIV_COMMON_H #include "Params.h" #include "PALUtils.h" @@ -112,4 +112,4 @@ BroadcastDiv4DSlow(const ArithmeticParams ¶ms, } // namespace luci_interpreter_pal -#endif // LUCI_INTERPRETER_PAL_DIV_H +#endif // LUCI_INTERPRETER_PAL_DIV_COMMON_H diff --git a/onert-micro/luci-interpreter/pal/common/PALFullyConnectedCommon.h b/onert-micro/luci-interpreter/pal/common/PALFullyConnectedCommon.h new file mode 100644 index 00000000000..14934cc720a --- /dev/null +++ b/onert-micro/luci-interpreter/pal/common/PALFullyConnectedCommon.h @@ -0,0 +1,103 @@ +/* + * Copyright (c) 2023 Samsung Electronics Co., Ltd. All Rights Reserved + * Copyright 2020 The TensorFlow Authors. All Rights Reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#ifndef LUCI_INTERPRETER_PAL_FULLY_CONNECTED_COMMON_H +#define LUCI_INTERPRETER_PAL_FULLY_CONNECTED_COMMON_H + +#include "PALUtils.h" +#include "Params.h" + +namespace luci_interpreter_pal +{ + +template +inline void FullyConnected(const FullyConnectedParams ¶ms, const int32_t *input_shape, + const InputType *input_data, const int32_t *filter_shape, + const WeightType *filter_data, const BiasType *bias_data, + const int32_t *output_shape, OutputType *output_data) +{ + const int32_t input_offset = params.input_offset; + const int32_t filter_offset = params.weights_offset; + const int32_t output_offset = params.output_offset; + const int32_t output_multiplier = params.output_multiplier; + const int output_shift = params.output_shift; + const int32_t output_activation_min = params.quantized_activation_min; + const int32_t output_activation_max = params.quantized_activation_max; + + const int batches = input_shape[0]; + const int output_depth = output_shape[1]; + const int accum_depth = filter_shape[1]; + + for (int b = 0; b < batches; ++b) + { + for (int out_c = 0; out_c < output_depth; ++out_c) + { + BiasType acc = 0; + for (int d = 0; d < accum_depth; ++d) + { + int32_t input_val = input_data[b * accum_depth + d]; + int32_t filter_val = filter_data[out_c * accum_depth + d]; + acc += (filter_val + filter_offset) * (input_val + input_offset); + } + if (bias_data) + { + acc += bias_data[out_c]; + } + int32_t acc_scaled = multiplyByQuantizedMultiplier(acc, output_multiplier, output_shift); + acc_scaled += output_offset; + acc_scaled = std::max(acc_scaled, output_activation_min); + acc_scaled = std::min(acc_scaled, output_activation_max); + output_data[out_c + output_depth * b] = static_cast(acc_scaled); + } + } +} +template <> +inline void FullyConnected(const FullyConnectedParams ¶ms, const int32_t *input_shape, + const float *input_data, const int32_t *filter_shape, + const float *filter_data, const float *bias_data, + const int32_t *output_shape, float 
*output_data) +{ + const float output_activation_min = params.float_activation_min; + const float output_activation_max = params.float_activation_max; + + const int batches = input_shape[0]; + const int output_depth = output_shape[1]; + const int accum_depth = filter_shape[1]; + + for (int b = 0; b < batches; ++b) + { + for (int out_c = 0; out_c < output_depth; ++out_c) + { + float total = 0.f; + for (int d = 0; d < accum_depth; ++d) + { + total += input_data[b * accum_depth + d] * filter_data[out_c * accum_depth + d]; + } + float bias_value = 0.0f; + if (bias_data) + { + bias_value = bias_data[out_c]; + } + output_data[out_c + output_depth * b] = + std::min(std::max(total + bias_value, output_activation_min), output_activation_max); + } + } +} + +} // namespace luci_interpreter_pal + +#endif // LUCI_INTERPRETER_PAL_FULLY_CONNECTED_COMMON_H diff --git a/onert-micro/luci-interpreter/pal/common/PALMaxPool2DCommon.h b/onert-micro/luci-interpreter/pal/common/PALMaxPool2DCommon.h new file mode 100644 index 00000000000..034319b8aea --- /dev/null +++ b/onert-micro/luci-interpreter/pal/common/PALMaxPool2DCommon.h @@ -0,0 +1,146 @@ +/* + * Copyright (c) 2023 Samsung Electronics Co., Ltd. All Rights Reserved + * Copyright 2020 The TensorFlow Authors. All Rights Reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#ifndef LUCI_INTERPRETER_PAL_MAX_POOL_2D_COMMON_H +#define LUCI_INTERPRETER_PAL_MAX_POOL_2D_COMMON_H + +#include "Params.h" +#include "PALUtils.h" + +namespace luci_interpreter_pal +{ + +inline void MaxPool(const PoolParams ¶ms, const luci_interpreter::RuntimeShape &input_shape, + const float *input_data, const luci_interpreter::RuntimeShape &output_shape, + float *output_data) +{ + const int batches = input_shape.dims(0); + const int depth = output_shape.dims(3); + const int input_height = input_shape.dims(1); + const int input_width = input_shape.dims(2); + const int output_height = output_shape.dims(1); + const int output_width = output_shape.dims(2); + const int stride_height = params.stride_height; + const int stride_width = params.stride_width; + for (int batch = 0; batch < batches; ++batch) + { + for (int out_y = 0; out_y < output_height; ++out_y) + { + for (int out_x = 0; out_x < output_width; ++out_x) + { + for (int channel = 0; channel < depth; ++channel) + { + const int in_x_origin = (out_x * stride_width) - params.padding_values.width; + const int in_y_origin = (out_y * stride_height) - params.padding_values.height; + // Compute the boundaries of the filter region clamped so as to + // ensure that the filter window fits in the input array. 
+ const int filter_x_start = std::max(0, -in_x_origin); + const int filter_x_end = std::min(params.filter_width, input_width - in_x_origin); + const int filter_y_start = std::max(0, -in_y_origin); + const int filter_y_end = std::min(params.filter_height, input_height - in_y_origin); + float max = std::numeric_limits::lowest(); + for (int filter_y = filter_y_start; filter_y < filter_y_end; ++filter_y) + { + for (int filter_x = filter_x_start; filter_x < filter_x_end; ++filter_x) + { + const int in_x = in_x_origin + filter_x; + const int in_y = in_y_origin + filter_y; + + const int input_data_offset = + ((batch * input_shape.dims(1) + in_y) * input_shape.dims(2) + in_x) * + input_shape.dims(3) + + channel; + + max = std::max(max, input_data[input_data_offset]); + } + } + const int output_data_offset = + ((batch * output_shape.dims(1) + out_y) * output_shape.dims(2) + out_x) * + output_shape.dims(3) + + channel; + + output_data[output_data_offset] = + std::min(std::max(max, params.float_activation_min), params.float_activation_max); + } + } + } + } +} + +template +inline void MaxPool(const PoolParams ¶ms, const luci_interpreter::RuntimeShape &input_shape, + const T *input_data, const luci_interpreter::RuntimeShape &output_shape, + T *output_data) +{ + const int batches = input_shape.dims(0); + const int depth = output_shape.dims(3); + const int input_height = input_shape.dims(1); + const int input_width = input_shape.dims(2); + const int output_height = output_shape.dims(1); + const int output_width = output_shape.dims(2); + const int stride_height = params.stride_height; + const int stride_width = params.stride_width; + for (int batch = 0; batch < batches; ++batch) + { + for (int out_y = 0; out_y < output_height; ++out_y) + { + for (int out_x = 0; out_x < output_width; ++out_x) + { + for (int channel = 0; channel < depth; ++channel) + { + const int in_x_origin = (out_x * stride_width) - params.padding_values.width; + const int in_y_origin = (out_y * stride_height) - 
params.padding_values.height; + // Compute the boundaries of the filter region clamped so as to + // ensure that the filter window fits in the input array. + const int filter_x_start = std::max(0, -in_x_origin); + const int filter_x_end = std::min(params.filter_width, input_width - in_x_origin); + const int filter_y_start = std::max(0, -in_y_origin); + const int filter_y_end = std::min(params.filter_height, input_height - in_y_origin); + T max = std::numeric_limits::lowest(); + for (int filter_y = filter_y_start; filter_y < filter_y_end; ++filter_y) + { + for (int filter_x = filter_x_start; filter_x < filter_x_end; ++filter_x) + { + const int in_x = in_x_origin + filter_x; + const int in_y = in_y_origin + filter_y; + + const int input_data_offset = + ((batch * input_shape.dims(1) + in_y) * input_shape.dims(2) + in_x) * + input_shape.dims(3) + + channel; + + max = std::max(max, input_data[input_data_offset]); + } + } + max = std::max(max, params.quantized_activation_min); + max = std::min(max, params.quantized_activation_max); + + const int output_data_offset = + ((batch * output_shape.dims(1) + out_y) * output_shape.dims(2) + out_x) * + output_shape.dims(3) + + channel; + + output_data[output_data_offset] = static_cast(max); + } + } + } + } +} + +} // namespace luci_interpreter_pal + +#endif // LUCI_INTERPRETER_PAL_MAX_POOL_2D_COMMON_H diff --git a/onert-micro/luci-interpreter/pal/common/PALMulCommon.h b/onert-micro/luci-interpreter/pal/common/PALMulCommon.h new file mode 100644 index 00000000000..f1710403016 --- /dev/null +++ b/onert-micro/luci-interpreter/pal/common/PALMulCommon.h @@ -0,0 +1,115 @@ +/* + * Copyright (c) 2021 Samsung Electronics Co., Ltd. All Rights Reserved + * Copyright 2019 The TensorFlow Authors. All Rights Reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef LUCI_INTERPRETER_PAL_MUL_COMMON_H +#define LUCI_INTERPRETER_PAL_MUL_COMMON_H + +#include "Params.h" +#include "PALUtils.h" +#include "ProcessBroadcastShapes.h" + +namespace luci_interpreter_pal +{ +template +inline void Mul(const ArithmeticParams ¶ms, const int flat_size, const T *input1_data, + const T *input2_data, T *output_data) +{ + T activation_min, activation_max; + getActivationParams(params, &activation_min, &activation_max); + + for (int i = 0; i < flat_size; ++i) + output_data[i] = + std::min(std::max(input1_data[i] * input2_data[i], activation_min), activation_max); +} + +template +inline void MulScalar(const ArithmeticParams ¶ms, const int flat_size, const T *input_data, + const T scalar_value, T *output_data) +{ + T activation_min, activation_max; + getActivationParams(params, &activation_min, &activation_max); + + for (int i = 0; i < flat_size; ++i) + output_data[i] = + std::min(std::max(input_data[i] * scalar_value, activation_min), activation_max); +} + +template +inline void +BroadcastMul4DSlow(const ArithmeticParams ¶ms, + const luci_interpreter::RuntimeShape &input1_shape, const T *input1_data, + const luci_interpreter::RuntimeShape &input2_shape, const T *input2_data, + const luci_interpreter::RuntimeShape &output_shape, T *output_data) +{ + const int flat_size = input1_shape.flatSize(); + + if (params.broadcast_category == BroadcastableOpCategory::kScalarFirstBroadcast) + { + return MulScalar(params, flat_size, input2_data, input1_data[0], output_data); + } + else if (params.broadcast_category == 
BroadcastableOpCategory::kScalarSecondBroadcast) + { + return MulScalar(params, flat_size, input1_data, input2_data[0], output_data); + } + + NdArrayDesc<4> desc1; + NdArrayDesc<4> desc2; + NdArrayDescsForElementwiseBroadcast(input1_shape, input2_shape, &desc1, &desc2); + const luci_interpreter::RuntimeShape extended_output_shape = + luci_interpreter::RuntimeShape::extendedShape(4, output_shape); + + T activation_min, activation_max; + getActivationParams(params, &activation_min, &activation_max); + + // In Tensorflow, the dimensions are canonically named (batch_number, row, + // col, channel), with extents (batches, height, width, depth), with the + // trailing dimension changing most rapidly (channels has the smallest stride, + // typically 1 element). + // + // In generated C code, we store arrays with the dimensions reversed. The + // first dimension has smallest stride. + // + // We name our variables by their Tensorflow convention, but generate C code + // nesting loops such that the innermost loop has the smallest stride for the + // best cache behavior. 
+ for (int b = 0; b < extended_output_shape.dims(0); ++b) + { + for (int y = 0; y < extended_output_shape.dims(1); ++y) + { + for (int x = 0; x < extended_output_shape.dims(2); ++x) + { + for (int c = 0; c < extended_output_shape.dims(3); ++c) + { + const int output_data_offset = + ((b * extended_output_shape.dims(1) + y) * extended_output_shape.dims(2) + x) * + extended_output_shape.dims(3) + + c; + + output_data[output_data_offset] = + std::min(std::max(input1_data[subscriptToIndex(desc1, b, y, x, c)] * + input2_data[subscriptToIndex(desc2, b, y, x, c)], + activation_min), + activation_max); + } + } + } + } +} + +} // namespace luci_interpreter_pal + +#endif // LUCI_INTERPRETER_PAL_MUL_COMMON_H diff --git a/onert-micro/luci-interpreter/pal/mcu/PALResizeBilinear.h b/onert-micro/luci-interpreter/pal/common/PALResizeBilinear.h similarity index 97% rename from onert-micro/luci-interpreter/pal/mcu/PALResizeBilinear.h rename to onert-micro/luci-interpreter/pal/common/PALResizeBilinear.h index b2ddfa75da4..19686b70231 100644 --- a/onert-micro/luci-interpreter/pal/mcu/PALResizeBilinear.h +++ b/onert-micro/luci-interpreter/pal/common/PALResizeBilinear.h @@ -14,8 +14,8 @@ * limitations under the License.
*/ -#ifndef LUCI_INTERPRETER_PAL_RESIZEBILINEAR_H -#define LUCI_INTERPRETER_PAL_RESIZEBILINEAR_H +#ifndef LUCI_INTERPRETER_PAL_RESIZEBILINEAR_COMMON_H +#define LUCI_INTERPRETER_PAL_RESIZEBILINEAR_COMMON_H #include "PALUtils.h" @@ -132,4 +132,4 @@ ResizeBilinear(const circle::ResizeBilinearOptions *op_params, } // namespace luci_interpreter_pal -#endif // LUCI_INTERPRETER_PAL_RESIZEBILINEAR_H +#endif // LUCI_INTERPRETER_PAL_RESIZEBILINEAR_COMMON_H diff --git a/onert-micro/luci-interpreter/pal/mcu/PALSoftmax.h b/onert-micro/luci-interpreter/pal/common/PALSoftmax.h similarity index 92% rename from onert-micro/luci-interpreter/pal/mcu/PALSoftmax.h rename to onert-micro/luci-interpreter/pal/common/PALSoftmax.h index fffd39b19f6..a67785675a6 100644 --- a/onert-micro/luci-interpreter/pal/mcu/PALSoftmax.h +++ b/onert-micro/luci-interpreter/pal/common/PALSoftmax.h @@ -15,10 +15,8 @@ * limitations under the License. */ -#ifndef LUCI_INTERPRETER_PAL_SOFTMAX_H -#define LUCI_INTERPRETER_PAL_SOFTMAX_H - -//#include +#ifndef LUCI_INTERPRETER_PAL_SOFTMAX_COMMON_H +#define LUCI_INTERPRETER_PAL_SOFTMAX_COMMON_H namespace luci_interpreter_pal { @@ -77,4 +75,4 @@ inline void Softmax(const double beta, const luci_interpreter::RuntimeShape &inp } // namespace luci_interpreter_pal -#endif // LUCI_INTERPRETER_PAL_SOFTMAX_H +#endif // LUCI_INTERPRETER_PAL_SOFTMAX_COMMON_H diff --git a/onert-micro/luci-interpreter/pal/mcu/PALSub.h b/onert-micro/luci-interpreter/pal/common/PALSub.h similarity index 96% rename from onert-micro/luci-interpreter/pal/mcu/PALSub.h rename to onert-micro/luci-interpreter/pal/common/PALSub.h index 7654a6413a1..faa94fdd394 100644 --- a/onert-micro/luci-interpreter/pal/mcu/PALSub.h +++ b/onert-micro/luci-interpreter/pal/common/PALSub.h @@ -14,8 +14,8 @@ * limitations under the License. 
*/ -#ifndef LUCI_INTERPRETER_PAL_SUB_H -#define LUCI_INTERPRETER_PAL_SUB_H +#ifndef LUCI_INTERPRETER_PAL_SUB_COMMON_H +#define LUCI_INTERPRETER_PAL_SUB_COMMON_H #include "PALUtils.h" @@ -86,4 +86,4 @@ BroadcastSub4DSlow(const ArithmeticParams ¶ms, } // namespace luci_interpreter_pal -#endif // LUCI_INTERPRETER_PAL_SUB_H +#endif // LUCI_INTERPRETER_PAL_SUB_COMMON_H diff --git a/onert-micro/luci-interpreter/pal/common/PALUnidirectionalSequenceLSTMCommon.h b/onert-micro/luci-interpreter/pal/common/PALUnidirectionalSequenceLSTMCommon.h new file mode 100644 index 00000000000..ad9631cf29b --- /dev/null +++ b/onert-micro/luci-interpreter/pal/common/PALUnidirectionalSequenceLSTMCommon.h @@ -0,0 +1,567 @@ +/* + * Copyright (c) 2023 Samsung Electronics Co., Ltd. All Rights Reserved + * Copyright 2017 The TensorFlow Authors. All Rights Reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef LUCI_INTERPRETER_PAL_UNIDIRECTIONAL_SEQUENCE_LSTM_COMMON_H +#define LUCI_INTERPRETER_PAL_UNIDIRECTIONAL_SEQUENCE_LSTM_COMMON_H + +#include "kernels/UnidirectionalSequenceLSTM.h" +#include "PALTanh.h" +#include "PALLogistic.h" +#include "PALFullyConnected.h" +#include "PALMul.h" +#include "PALUtils.h" + +namespace luci_interpreter_pal +{ +namespace lstm_internal +{ +namespace +{ +// Possible fused activation functions. 
+typedef enum +{ + kTfLiteActNone = 0, + kTfLiteActRelu, + kTfLiteActReluN1To1, // min(max(-1, x), 1) + kTfLiteActRelu6, // min(max(0, x), 6) + kTfLiteActTanh, + kTfLiteActSignBit, + kTfLiteActSigmoid, +} FusedActivation; + +} // namespace + +#ifndef DIS_QUANT + +template +void mulElementwise(int size, const ArithmeticParams *params, const InputType *input1_data, + const InputType *input2_data, OutputType *output_data) +{ + for (int i = 0; i < size; ++i) + { + const int32_t input1_val = params->input1_offset + input1_data[i]; + const int32_t input2_val = params->input2_offset + input2_data[i]; + const int32_t unclamped_result = + params->output_offset + multiplyByQuantizedMultiplier(input1_val * input2_val, + params->output_multiplier, + params->output_shift); + const int32_t clamped_output = + std::min(params->quantized_activation_max, + std::max(params->quantized_activation_min, unclamped_result)); + output_data[i] = static_cast(clamped_output); + } +} + +// Input and output have the same shape in LSTM +void mul(const luci_interpreter::RuntimeShape &shape, const ArithmeticParams *params, + const int16_t *input1_data, const int16_t *input2_data, int8_t *output_data) +{ + return mulElementwise(shape.flatSize(), params, input1_data, input2_data, + output_data); +} + +// Input and output have the same shape in LSTM +void mul(const luci_interpreter::RuntimeShape &shape, const ArithmeticParams *params, + const int16_t *input1_data, const int16_t *input2_data, int16_t *output_data) +{ + return mulElementwise(shape.flatSize(), params, input1_data, input2_data, output_data); +} + +void addElementWise(const int16_t *input_1, const int16_t *input_2, int n_batch, int n_input, + int16_t *output) +{ + for (int batch = 0; batch < n_batch; ++batch) + { + for (int i = 0; i < n_input; ++i) + { + const int index = batch * n_input + i; + int32_t sum = input_1[index] + input_2[index]; + const int32_t sum_clamped = + std::min(static_cast(std::numeric_limits::max()), + 
std::max(static_cast(std::numeric_limits::min()), sum)); + output[index] = static_cast(sum_clamped); + } + } +} + +void tanh(int32_t cell_state_scale_power, const luci_interpreter::RuntimeShape &input_data_shape, + int16_t *input_data, const luci_interpreter::RuntimeShape &output_data_shape, + int16_t *output_data) +{ + int32_t tanh_input_left_shift = (15 + cell_state_scale_power) - 3; + int32_t input_multiplier = 0; + if (tanh_input_left_shift < 0) /* handling negative shift value */ + { + tanh_input_left_shift = -tanh_input_left_shift; + input_multiplier = 3; + } + const int flat_size = input_data_shape.flatSize(); + luci_interpreter_pal::Tanh(input_multiplier, tanh_input_left_shift, flat_size, input_data, + output_data); +} + +void sigmoid(const luci_interpreter::RuntimeShape &data_shape, int16_t *data) +{ + luci_interpreter_pal::Logistic(0, 0, data_shape.flatSize(), data, data); +} + +void clipping(const int v_size, const luci_interpreter::lstm::CellStateInfo *cell_state_info, + int16_t *vector) +{ + for (int i = 0; i < v_size; i++) + { + vector[i] = std::max(std::min(cell_state_info->quantized_cell_clip, vector[i]), + static_cast(-cell_state_info->quantized_cell_clip)); + } +} +#endif // DIS_QUANT + +#ifndef DIS_FLOAT +// Input and output have the same shape in LSTM +void mul(const luci_interpreter::RuntimeShape &shape, const ArithmeticParams *params, + const float *input1_data, const float *input2_data, float *output_data) +{ + const int flat_size = shape.flatSize(); + return luci_interpreter_pal::Mul(*params, flat_size, input1_data, input2_data, output_data); +} + +void addElementWise(const float *input_1, const float *input_2, int n_batch, int n_input, + float *output) +{ + for (int batch = 0; batch < n_batch; ++batch) + { + for (int i = 0; i < n_input; ++i) + { + const int index = batch * n_input + i; + output[index] = input_1[index] + input_2[index]; + } + } +} + +void tanh(int32_t, const luci_interpreter::RuntimeShape &input_data_shape, float 
*input_data, + const luci_interpreter::RuntimeShape &output_data_shape, float *output_data) +{ + const int flat_size = input_data_shape.flatSize(); + luci_interpreter_pal::Tanh(flat_size, input_data, output_data); +} + +void sigmoid(const luci_interpreter::RuntimeShape &data_shape, float *data) +{ + const int flat_size = data_shape.flatSize(); + luci_interpreter_pal::Logistic(flat_size, data, data); +} + +void clipping(const int v_size, const luci_interpreter::lstm::CellStateInfo *cell_state_info, + float *vector) +{ + for (int i = 0; i < v_size; i++) + { + vector[i] = + std::max(std::min(cell_state_info->cell_clip, vector[i]), -cell_state_info->cell_clip); + } +} +#endif // DIS_FLOAT + +// Size information about the LSTM kernel, which is deduced from tensors stored +// in the flat buffer file. +struct LstmSizeInfo +{ + bool time_major; + int32_t batch_size; + int32_t time_steps; + int32_t input_dimension; + int32_t state_dimension; +}; + +class LstmStepManager +{ +public: + LstmStepManager() = delete; + // Does not take any ownership, and all pointers must refer to valid objects + // that outlive the one constructed. 
+ explicit LstmStepManager(const LstmSizeInfo &size_info) : size_info_(size_info) {} + + void updateTime() + { + current_time_ += 1; + // default as one batch per inference + int input_step = size_info_.input_dimension; + int output_step = size_info_.state_dimension; + // time major: batch inference + if (size_info_.time_major) + { + input_step = input_step * size_info_.batch_size; + output_step = output_step * size_info_.batch_size; + } + + input_offset_ += input_step; + output_offset_ += output_step; + } + + void updateBatch() + { + current_batch_ += 1; + // batch inference for time major: no action needed + if (size_info_.time_major) + { + return; + } + // otherwise: singe batch inference, go to the next batch + hidden_state_offset_ += size_info_.state_dimension; + cell_state_offset_ += size_info_.state_dimension; + } + + void resetTime() { current_time_ = 0; } + + luci_interpreter::RuntimeShape inputShape() const + { + int batch_size = 1; + if (size_info_.time_major) + { + batch_size = size_info_.batch_size; + } + const int dims[2] = {batch_size, size_info_.input_dimension}; + const int32_t *dims_data = reinterpret_cast(dims); + return luci_interpreter::RuntimeShape(2, dims_data); + } + + luci_interpreter::RuntimeShape stateShape() const + { + int batch_size = 1; + if (size_info_.time_major) + { + batch_size = size_info_.batch_size; + } + const int dims[2] = {batch_size, size_info_.state_dimension}; + const int32_t *dims_data = reinterpret_cast(dims); + return luci_interpreter::RuntimeShape(2, dims_data); + } + + int inputOffset() const { return input_offset_; } + + int outputOffset() const { return output_offset_; } + + int hiddenStateOffset() const { return hidden_state_offset_; } + + int cellStateOffset() const { return cell_state_offset_; } + +private: + int32_t current_time_ = 0; + int32_t current_batch_ = 0; + int32_t input_offset_ = 0; + int32_t output_offset_ = 0; + int32_t hidden_state_offset_ = 0; + int32_t cell_state_offset_ = 0; + + const 
LstmSizeInfo &size_info_; +}; + +// Calculates a single LSTM gate. +// Implements the following formula: +// gate = activate(FC(input) + FC(recurrent)) +// Activation is sigmoid except for the "cell" gate (configurable, usually tanh) +template +void calculateLstmGate(const LstmStepManager *step_info, + const luci_interpreter::lstm::GateParameters *gate_params, + // Input FC + ActivationType *input_data, const circle::Tensor *input_weight, + const circle::Tensor *input_bias, + // Recurrent FC + ActivationType *recurrent_data, const circle::Tensor *recurrent_weight, + const circle::Tensor *recurrent_bias, + // Output + CellType *gate_output, + // Scratch arrays + CellType *fc_output_buffer, const FusedActivation activation, + luci_interpreter::BaseRuntimeGraph *runtime_graph) +{ + // Input FC + const auto gate_output_shape = step_info->stateShape(); + { + FullyConnectedParams op_params{}; + op_params.input_offset = gate_params->input_fc_params.input_offset; + op_params.weights_offset = gate_params->input_fc_params.weights_offset; + op_params.output_offset = gate_params->input_fc_params.output_offset; + op_params.output_multiplier = gate_params->input_fc_params.output_multiplier; + op_params.output_shift = gate_params->input_fc_params.output_shift; + op_params.quantized_activation_min = gate_params->input_fc_params.quantized_activation_min; + op_params.quantized_activation_max = gate_params->input_fc_params.quantized_activation_max; + op_params.float_activation_max = gate_params->input_fc_params.float_activation_max; + op_params.float_activation_min = gate_params->input_fc_params.float_activation_min; + + int32_t input_weight_shape[luci_interpreter::kMaxSmallSize]; + luci_interpreter::kernels::getTensorDims(input_weight, runtime_graph, input_weight_shape); + + FullyConnected(op_params, step_info->inputShape().dimsData(), + input_data + step_info->inputOffset(), input_weight_shape, + luci_interpreter::kernels::getTensorData( + 
runtime_graph->getConstDataByTensor(input_weight)), + luci_interpreter::kernels::getTensorData( + runtime_graph->getConstDataByTensor(input_bias)), + gate_output_shape.dimsData(), gate_output); + } + + // Recurrent FC + { + FullyConnectedParams op_params{}; + op_params.input_offset = gate_params->recurrent_fc_params.input_offset; + op_params.weights_offset = gate_params->recurrent_fc_params.weights_offset; + op_params.output_offset = gate_params->recurrent_fc_params.output_offset; + op_params.output_multiplier = gate_params->recurrent_fc_params.output_multiplier; + op_params.output_shift = gate_params->recurrent_fc_params.output_shift; + op_params.quantized_activation_min = gate_params->recurrent_fc_params.quantized_activation_min; + op_params.quantized_activation_max = gate_params->recurrent_fc_params.quantized_activation_max; + op_params.float_activation_max = gate_params->recurrent_fc_params.float_activation_max; + op_params.float_activation_min = gate_params->recurrent_fc_params.float_activation_min; + + int32_t recurrent_weight_shape[luci_interpreter::kMaxSmallSize]; + luci_interpreter::kernels::getTensorDims(recurrent_weight, runtime_graph, + recurrent_weight_shape); + + FullyConnected(op_params, step_info->stateShape().dimsData(), + recurrent_data + step_info->hiddenStateOffset(), recurrent_weight_shape, + luci_interpreter::kernels::getTensorData( + runtime_graph->getConstDataByTensor(recurrent_weight)), + luci_interpreter::kernels::getTensorData( + runtime_graph->getConstDataByTensor(recurrent_bias)), + gate_output_shape.dimsData(), fc_output_buffer); + + addElementWise(gate_output, fc_output_buffer, /*n_batch=*/gate_output_shape.dimsData()[0], + /*n_state=*/gate_output_shape.dimsData()[1], gate_output); + + switch (activation) + { + case FusedActivation::kTfLiteActSigmoid: + sigmoid(gate_output_shape, gate_output); + break; + case FusedActivation::kTfLiteActTanh: + { + // Set the scale power to -12 to avoid shift + tanh(/*cell_state_scale_power=*/-12, 
gate_output_shape, gate_output, gate_output_shape, + gate_output); + } + break; + default: + // Only Sigmoid or Tanh is used. + assert(false && "Only Sigmoid or Tanh is used"); + } + } +} + +// Update the hidden state of the LSTM kernel using the following formula: +// updated_hidden_state = Tanh(updated_cell_state) * output_gate_output, * means +// element wise multiplication +template +void updateLstmHidden(const LstmStepManager *step_info, CellType *cell_state_data_base, + ActivationType *hidden_state_data, const CellType *output_gate_output, + const ArithmeticParams *mul_params, int32_t cell_state_scale_power, + CellType *buffer) +{ + auto cell_state_shape = step_info->stateShape(); + CellType *cell_state_data = cell_state_data_base + step_info->cellStateOffset(); + // Tanh(cell_state) + tanh(cell_state_scale_power, cell_state_shape, cell_state_data, cell_state_shape, buffer); + // Update the hidden state + mul(cell_state_shape, mul_params, buffer, output_gate_output, + hidden_state_data + step_info->hiddenStateOffset()); +} + +// Update the cell state using the output from the forget gate, input gate, and +// cell gate Formula: updated_cell_state = forget_gate_output*cell_state + +// input_gate_output * cell_gate_output, where * denotes element wise +// multiplication +template +void updateLstmCell(const LstmStepManager *step_info, CellType *cell_state_data, + // Gate outputs + CellType *forget_gate_output, const CellType *input_gate_output, + const CellType *cell_gate_output, + // Mul parameters + const ArithmeticParams &forget_cell_mul_params, + const ArithmeticParams &input_mul_params, + const luci_interpreter::lstm::CellStateInfo *cell_state_info, CellType *buffer) +{ + auto cell_state_shape = step_info->stateShape(); + // Forget Gate x Cell State + mul(cell_state_shape, &forget_cell_mul_params, forget_gate_output, + cell_state_data + step_info->cellStateOffset(), + cell_state_data + step_info->cellStateOffset()); + // Input Gate x Cell Gate + 
mul(cell_state_shape, &input_mul_params, input_gate_output, cell_gate_output, buffer); + + // Update the cell state + addElementWise(cell_state_data + step_info->cellStateOffset(), buffer, + /*n_batch=*/cell_state_shape.dimsData()[0], + /*n_state=*/cell_state_shape.dimsData()[1], + cell_state_data + step_info->cellStateOffset()); + + if (cell_state_info->cell_clip > 0) + { + clipping(cell_state_shape.flatSize(), cell_state_info, + cell_state_data + step_info->cellStateOffset()); + } +} + +template +void lstmStep(luci_interpreter::lstm::LSTMStruct *lstm_struct, + luci_interpreter::lstm::LSTMParameters *lstm_params, LstmStepManager *step_info, + luci_interpreter::lstm::CellStateInfo *cell_state_info, + ActivationType *output_state_data, CellType *cell_state_data, CellType *scratch0, + CellType *scratch1, CellType *scratch2, CellType *scratch3, + luci_interpreter::BaseRuntimeGraph *runtime_graph) +{ + /*Step1: Calculate gate outputs to prepare cell state update*/ + CellType *gate_internal_buffer = scratch3; + CellType *forget_gate_output = scratch0; + + auto input_data = luci_interpreter::kernels::getTensorData( + runtime_graph->getDataByTensor(lstm_struct->input())); + + calculateLstmGate( + step_info, &lstm_params->forget_gate_parameters, + // Input FC + input_data, lstm_struct->input_to_forget_weights(), lstm_struct->forget_gate_bias(), + // Recurrent FC + output_state_data, lstm_struct->recurrent_to_forget_weights(), nullptr, + // Output + forget_gate_output, gate_internal_buffer, FusedActivation::kTfLiteActSigmoid, runtime_graph); + + // Input Gate calculation; + CellType *input_gate_output = scratch1; + calculateLstmGate( + step_info, &lstm_params->input_gate_parameters, + // Input FC + input_data, lstm_struct->input_to_input_weights(), lstm_struct->input_gate_bias(), + // Recurrent FC + output_state_data, lstm_struct->recurrent_to_input_weights(), + /*recurrent_bias*/ nullptr, + // Output + input_gate_output, + // Scratch arrays + gate_internal_buffer, 
FusedActivation::kTfLiteActSigmoid, runtime_graph); + + // Cell Gate calculation + CellType *cell_gate_output = scratch2; + calculateLstmGate( + step_info, &lstm_params->cell_gate_parameters, + // Input FC + input_data, lstm_struct->input_to_cell_weights(), lstm_struct->cell_gate_bias(), + // Recurrent FC + output_state_data, lstm_struct->recurrent_to_cell_weights(), + /*recurrent_bias*/ nullptr, + // Output + cell_gate_output, + // Scratch arrays + gate_internal_buffer, FusedActivation::kTfLiteActTanh, runtime_graph); + + /*Step2: update the cell state */ + { + // const InterGateParameters& inter_gate_params = op_data.inter_gate_parameters; + CellType *updated_input_buffer = scratch1; // reuse buffer + + updateLstmCell( + step_info, cell_state_data, forget_gate_output, input_gate_output, cell_gate_output, + lstm_params->inter_gate_parameters.forget_cell_mul_params, + lstm_params->inter_gate_parameters.input_mul_params, cell_state_info, updated_input_buffer); + } + + { + /*Step3: update the hidden state */ + CellType *output_gate_output = scratch1; // reuse buffer + calculateLstmGate( + step_info, &lstm_params->output_gate_parameters, + // Input FC + input_data, lstm_struct->input_to_output_weights(), lstm_struct->output_gate_bias(), + // Recurrent FC + output_state_data, lstm_struct->recurrent_to_output_weights(), nullptr, + // Output + output_gate_output, + // Scratch arrays + gate_internal_buffer, FusedActivation::kTfLiteActSigmoid, runtime_graph); + CellType *tanh_activated_cell_buffer = scratch0; // reuse buffer + updateLstmHidden( + step_info, cell_state_data, output_state_data, output_gate_output, + &lstm_params->inter_gate_parameters.output_mul_params, + cell_state_info->cell_state_scale_power, tanh_activated_cell_buffer); + + ActivationType *output_ptr = luci_interpreter::kernels::getTensorData( + runtime_graph->getDataByTensor(lstm_struct->output())); + std::memcpy(output_ptr + step_info->outputOffset(), + output_state_data + 
step_info->hiddenStateOffset(), + step_info->stateShape().flatSize() * sizeof(ActivationType)); + } +} + +} // namespace lstm_internal + +// Evaluate the LSTM kernel with (potential) multi-steps and multi-batch input +template +void evalLSTM(luci_interpreter::lstm::LSTMStruct *lstm_struct, + luci_interpreter::lstm::LSTMParameters *lstm_params, + luci_interpreter::lstm::CellStateInfo *cell_state_info, + ActivationType *output_state_data, CellType *cell_state_data, CellType *scratch0, + CellType *scratch1, CellType *scratch2, CellType *scratch3, + luci_interpreter::BaseRuntimeGraph *runtime_graph) +{ + lstm_internal::LstmSizeInfo size_info; + + size_info.time_major = lstm_struct->options->time_major(); + size_info.batch_size = size_info.time_major + ? luci_interpreter::Tensor::dim(lstm_struct->input(), 1) + : luci_interpreter::Tensor::dim(lstm_struct->input(), 0); + size_info.time_steps = size_info.time_major + ? luci_interpreter::Tensor::dim(lstm_struct->input(), 0) + : luci_interpreter::Tensor::dim(lstm_struct->input(), 1); + size_info.input_dimension = luci_interpreter::Tensor::dim(lstm_struct->input(), 2); + size_info.state_dimension = luci_interpreter::Tensor::dim(lstm_struct->output_state(), 1); + + lstm_internal::LstmStepManager step_info(size_info); + + // time is the first dimention, enable batch computation + if (size_info.time_major) + { + for (int t = 0; t < size_info.time_steps; t++) + { + lstm_internal::lstmStep( + lstm_struct, lstm_params, &step_info, cell_state_info, output_state_data, cell_state_data, + scratch0, scratch1, scratch2, scratch3, runtime_graph); + // prepare for the next time step + step_info.updateTime(); + } + } + else + { + // batch first, unable to size the input data. 
single batch inference + for (int b = 0; b < size_info.batch_size; b++) + { + for (int t = 0; t < size_info.time_steps; t++) + { + lstm_internal::lstmStep( + lstm_struct, lstm_params, &step_info, cell_state_info, output_state_data, cell_state_data, + scratch0, scratch1, scratch2, scratch3, runtime_graph); + // prepare for the next time step + step_info.updateTime(); + } + // prepare for the next batch + step_info.updateBatch(); + step_info.resetTime(); + } + } +} + +} // namespace luci_interpreter_pal + +#endif // LUCI_INTERPRETER_PAL_UNIDIRECTIONAL_SEQUENCE_LSTM_COMMON_H diff --git a/onert-micro/luci-interpreter/pal/mcu/PALAdd.h b/onert-micro/luci-interpreter/pal/mcu/PALAdd.h index d45d0aab01f..d9d1f7865ae 100644 --- a/onert-micro/luci-interpreter/pal/mcu/PALAdd.h +++ b/onert-micro/luci-interpreter/pal/mcu/PALAdd.h @@ -18,75 +18,22 @@ #ifndef LUCI_INTERPRETER_PAL_ADD_H #define LUCI_INTERPRETER_PAL_ADD_H -#include "Params.h" -#include "PALUtils.h" -#include "ProcessBroadcastShapes.h" +#include "PALAddCommon.h" namespace luci_interpreter_pal { - -// TODO: check if there real activation value -template -inline void Add(const ArithmeticParams ¶ms, const int flat_size, const T *input1_data, - const T *input2_data, T *output_data) +template <> +inline void Add(const ArithmeticParams &, const int, const int8_t *, const int8_t *, + int8_t *) { - T activation_min, activation_max; - getActivationParams(params, &activation_min, &activation_max); - - for (int i = 0; i < flat_size; ++i) - output_data[i] = - std::min(std::max(input1_data[i] + input2_data[i], activation_min), activation_max); + assert(false && "Not IMPL yet"); } -template -inline void -BroadcastAdd4DSlow(const ArithmeticParams ¶ms, - const luci_interpreter::RuntimeShape &input1_shape, const T *input1_data, - const luci_interpreter::RuntimeShape &input2_shape, const T *input2_data, - const luci_interpreter::RuntimeShape &output_shape, T *output_data) +template <> +inline void Add(const ArithmeticParams &, const 
int, const int16_t *, const int16_t *, + int16_t *) { - NdArrayDesc<4> desc1; - NdArrayDesc<4> desc2; - NdArrayDescsForElementwiseBroadcast(input1_shape, input2_shape, &desc1, &desc2); - const luci_interpreter::RuntimeShape extended_output_shape = - luci_interpreter::RuntimeShape::extendedShape(4, output_shape); - - T activation_min, activation_max; - getActivationParams(params, &activation_min, &activation_max); - - // In Tensorflow, the dimensions are canonically named (batch_number, row, - // col, channel), with extents (batches, height, width, depth), with the - // trailing dimension changing most rapidly (channels has the smallest stride, - // typically 1 element). - // - // In generated C code, we store arrays with the dimensions reversed. The - // first dimension has smallest stride. - // - // We name our variables by their Tensorflow convention, but generate C code - // nesting loops such that the innermost loop has the smallest stride for the - // best cache behavior. - for (int b = 0; b < extended_output_shape.dims(0); ++b) - { - for (int y = 0; y < extended_output_shape.dims(1); ++y) - { - for (int x = 0; x < extended_output_shape.dims(2); ++x) - { - for (int c = 0; c < extended_output_shape.dims(3); ++c) - { - const int output_data_offset = - ((b * extended_output_shape.dims(1) + y) * extended_output_shape.dims(2) + x) * - extended_output_shape.dims(3) + - c; - - output_data[output_data_offset] = - std::min(std::max(input1_data[subscriptToIndex(desc1, b, y, x, c)] + - input2_data[subscriptToIndex(desc2, b, y, x, c)], - activation_min), - activation_max); - } - } - } - } + assert(false && "Not IMPL yet"); } } // namespace luci_interpreter_pal diff --git a/onert-micro/luci-interpreter/pal/mcu/PALAveragePool2D.h b/onert-micro/luci-interpreter/pal/mcu/PALAveragePool2D.h index e111abbb572..ef5fe72230c 100644 --- a/onert-micro/luci-interpreter/pal/mcu/PALAveragePool2D.h +++ b/onert-micro/luci-interpreter/pal/mcu/PALAveragePool2D.h @@ -18,76 +18,11 @@ #ifndef 
LUCI_INTERPRETER_PAL_AVERAGE_POOL_2D_H #define LUCI_INTERPRETER_PAL_AVERAGE_POOL_2D_H -#include "Params.h" -#include "PALUtils.h" +#include "PALAveragePool2DCommon.h" namespace luci_interpreter_pal { - -// TODO: reduce code duplication with MaxPool -inline void AveragePool(const PoolParams ¶ms, const luci_interpreter::RuntimeShape &input_shape, - const float *input_data, const luci_interpreter::RuntimeShape &output_shape, - float *output_data) -{ - const int batches = input_shape.dims(0); - const int depth = output_shape.dims(3); - const int input_height = input_shape.dims(1); - const int input_width = input_shape.dims(2); - const int output_height = output_shape.dims(1); - const int output_width = output_shape.dims(2); - const int stride_height = params.stride_height; - const int stride_width = params.stride_width; - for (int batch = 0; batch < batches; ++batch) - { - for (int out_y = 0; out_y < output_height; ++out_y) - { - for (int out_x = 0; out_x < output_width; ++out_x) - { - for (int channel = 0; channel < depth; ++channel) - { - const int in_x_origin = (out_x * stride_width) - params.padding_values.width; - const int in_y_origin = (out_y * stride_height) - params.padding_values.height; - // Compute the boundaries of the filter region clamped so as to - // ensure that the filter window fits in the input array. 
- const int filter_x_start = std::max(0, -in_x_origin); - const int filter_x_end = std::min(params.filter_width, input_width - in_x_origin); - const int filter_y_start = std::max(0, -in_y_origin); - const int filter_y_end = std::min(params.filter_height, input_height - in_y_origin); - - float total = 0.f; - float filter_count = 0; - - for (int filter_y = filter_y_start; filter_y < filter_y_end; ++filter_y) - { - for (int filter_x = filter_x_start; filter_x < filter_x_end; ++filter_x) - { - const int in_x = in_x_origin + filter_x; - const int in_y = in_y_origin + filter_y; - - const int input_data_offset = - ((batch * input_shape.dims(1) + in_y) * input_shape.dims(2) + in_x) * - input_shape.dims(3) + - channel; - - total += input_data[input_data_offset]; - filter_count++; - } - } - const int output_data_offset = - ((batch * output_shape.dims(1) + out_y) * output_shape.dims(2) + out_x) * - output_shape.dims(3) + - channel; - - assert(filter_count != 0); - const float average = total / filter_count; - - output_data[output_data_offset] = - std::min(std::max(average, params.float_activation_min), params.float_activation_max); - } - } - } - } -} +// TODO: add S8 and S16 kernel } // namespace luci_interpreter_pal #endif // LUCI_INTERPRETER_PAL_AVERAGE_POOL_2D_H diff --git a/onert-micro/luci-interpreter/pal/mcu/PALConv2d.h b/onert-micro/luci-interpreter/pal/mcu/PALConv2d.h index b7d6502c17c..c979f761031 100644 --- a/onert-micro/luci-interpreter/pal/mcu/PALConv2d.h +++ b/onert-micro/luci-interpreter/pal/mcu/PALConv2d.h @@ -17,185 +17,16 @@ #ifndef LUCI_INTERPRETER_PAL_CONV2D_H #define LUCI_INTERPRETER_PAL_CONV2D_H -#include "Params.h" -#include "PALUtils.h" +#include "PALConv2DCommon.h" namespace luci_interpreter_pal { -static inline void Conv(const ConvParams ¶ms, const int32_t *input_shape, - const float *input_data, const int32_t *filter_shape, - const float *filter_data, const float *bias_data, - const int32_t *output_shape, float *output_data) +static inline void 
QuantizedConvPerChannel(const ConvParams &, const int32_t *, const int8_t *, + const int32_t *, const int8_t *, const int32_t *, + const int32_t *, int8_t *) { - const int stride_width = params.stride_width; - const int stride_height = params.stride_height; - const int dilation_width_factor = params.dilation_width_factor; - const int dilation_height_factor = params.dilation_height_factor; - const int pad_width = params.padding_values.width; - const int pad_height = params.padding_values.height; - const float output_activation_min = params.float_activation_min; - const float output_activation_max = params.float_activation_max; - - const auto batches = input_shape[0]; - const int input_height = input_shape[1]; - const int input_width = input_shape[2]; - const int input_depth = input_shape[3]; - const int output_depth = filter_shape[0]; - const int filter_height = filter_shape[1]; - const int filter_width = filter_shape[2]; - const int output_height = output_shape[1]; - const int output_width = output_shape[2]; - for (int batch = 0; batch < batches; ++batch) - { - for (int out_y = 0; out_y < output_height; ++out_y) - { - const int in_y_origin = (out_y * stride_height) - pad_height; - for (int out_x = 0; out_x < output_width; ++out_x) - { - const int in_x_origin = (out_x * stride_width) - pad_width; - for (int out_channel = 0; out_channel < output_depth; ++out_channel) - { - float total = 0.f; - for (int filter_y = 0; filter_y < filter_height; ++filter_y) - { - const int in_y = in_y_origin + dilation_height_factor * filter_y; - for (int filter_x = 0; filter_x < filter_width; ++filter_x) - { - const int in_x = in_x_origin + dilation_width_factor * filter_x; - - // Zero padding by omitting the areas outside the image. 
- const bool is_point_inside_image = - (in_x >= 0) && (in_x < input_width) && (in_y >= 0) && (in_y < input_height); - - if (!is_point_inside_image) - { - continue; - } - - for (int in_channel = 0; in_channel < input_depth; ++in_channel) - { - const int input_data_offset = - ((batch * input_height + in_y) * input_width + in_x) * input_depth + in_channel; - - const int filter_data_offset = - ((out_channel * filter_height + filter_y) * filter_width + filter_x) * - input_depth + - in_channel; - - const float input_value = input_data[input_data_offset]; - const float filter_value = filter_data[filter_data_offset]; - total += (input_value * filter_value); - } - } - } - // float bias_value = 0.0f; - if (bias_data) - { - total += bias_data[out_channel]; - } - - const int output_data_offset = - ((batch * output_height + out_y) * output_width + out_x) * output_depth + out_channel; - - output_data[output_data_offset] = - std::min(std::max(total, output_activation_min), output_activation_max); - } - } - } - } + assert(false && "Not supported yet"); } - -static inline void Conv(const ConvParams ¶ms, const int32_t *input_shape, - const uint8_t *input_data, const int32_t *filter_shape, - const uint8_t *filter_data, const int32_t *bias_data, - const int32_t *output_shape, uint8_t *output_data) -{ - const int stride_width = params.stride_width; - const int stride_height = params.stride_height; - const int dilation_width_factor = params.dilation_width_factor; - const int dilation_height_factor = params.dilation_height_factor; - const int pad_width = params.padding_values.width; - const int pad_height = params.padding_values.height; - const int32_t input_offset = params.input_offset; - const int32_t filter_offset = params.weights_offset; - const int32_t output_offset = params.output_offset; - const int32_t output_multiplier = params.output_multiplier; - const int output_shift = params.output_shift; - const int32_t output_activation_min = params.quantized_activation_min; - const 
int32_t output_activation_max = params.quantized_activation_max; - - const auto batches = input_shape[0]; - const int input_height = input_shape[1]; - const int input_width = input_shape[2]; - const int input_depth = input_shape[3]; - const int output_depth = filter_shape[0]; - const int filter_height = filter_shape[1]; - const int filter_width = filter_shape[2]; - const int output_height = output_shape[1]; - const int output_width = output_shape[2]; - - for (int batch = 0; batch < batches; ++batch) - { - for (int out_y = 0; out_y < output_height; ++out_y) - { - const int in_y_origin = (out_y * stride_height) - pad_height; - for (int out_x = 0; out_x < output_width; ++out_x) - { - const int in_x_origin = (out_x * stride_width) - pad_width; - for (int out_channel = 0; out_channel < output_depth; ++out_channel) - { - int32_t acc = 0; - for (int filter_y = 0; filter_y < filter_height; ++filter_y) - { - const int in_y = in_y_origin + dilation_height_factor * filter_y; - for (int filter_x = 0; filter_x < filter_width; ++filter_x) - { - const int in_x = in_x_origin + dilation_width_factor * filter_x; - - // Zero padding by omitting the areas outside the image. 
- const bool is_point_inside_image = - (in_x >= 0) && (in_x < input_width) && (in_y >= 0) && (in_y < input_height); - - if (!is_point_inside_image) - { - continue; - } - - for (int in_channel = 0; in_channel < input_depth; ++in_channel) - { - const int input_data_offset = - ((batch * input_height + in_y) * input_width + in_x) * input_depth + in_channel; - - const int filter_data_offset = - ((out_channel * filter_height + filter_y) * filter_width + filter_x) * - input_depth + - in_channel; - - const int32_t input_val = input_data[input_data_offset]; - const int32_t filter_val = filter_data[filter_data_offset]; - acc += (filter_val + filter_offset) * (input_val + input_offset); - } - } - } - if (bias_data) - { - acc += bias_data[out_channel]; - } - acc = multiplyByQuantizedMultiplier(acc, output_multiplier, output_shift); - acc += output_offset; - acc = std::max(acc, output_activation_min); - acc = std::min(acc, output_activation_max); - - const int output_data_offset = - ((batch * output_height + out_y) * output_width + out_x) * output_depth + out_channel; - - output_data[output_data_offset] = static_cast(acc); - } - } - } - } -} - } // namespace luci_interpreter_pal #endif // LUCI_INTERPRETER_PAL_CONV2D_H diff --git a/onert-micro/luci-interpreter/pal/mcu/PALFullyConnected.h b/onert-micro/luci-interpreter/pal/mcu/PALFullyConnected.h index d1a151d7fcb..4a024b137ed 100644 --- a/onert-micro/luci-interpreter/pal/mcu/PALFullyConnected.h +++ b/onert-micro/luci-interpreter/pal/mcu/PALFullyConnected.h @@ -18,83 +18,36 @@ #ifndef LUCI_INTERPRETER_PAL_FULLY_CONNECTED_H #define LUCI_INTERPRETER_PAL_FULLY_CONNECTED_H -#include "Params.h" -#include "PALUtils.h" +#include "PALFullyConnectedCommon.h" namespace luci_interpreter_pal { -template -inline void FullyConnected(const FullyConnectedParams ¶ms, const int32_t *input_shape, - const InputType *input_data, const int32_t *filter_shape, - const WeightType *filter_data, const BiasType *bias_data, - const int32_t *output_shape, 
OutputType *output_data) -{ - const int32_t input_offset = params.input_offset; - const int32_t filter_offset = params.weights_offset; - const int32_t output_offset = params.output_offset; - const int32_t output_multiplier = params.output_multiplier; - const int output_shift = params.output_shift; - const int32_t output_activation_min = params.quantized_activation_min; - const int32_t output_activation_max = params.quantized_activation_max; - - const int batches = input_shape[0]; - const int output_depth = output_shape[1]; - const int accum_depth = filter_shape[1]; - for (int b = 0; b < batches; ++b) - { - for (int out_c = 0; out_c < output_depth; ++out_c) - { - BiasType acc = 0; - for (int d = 0; d < accum_depth; ++d) - { - int32_t input_val = input_data[b * accum_depth + d]; - int32_t filter_val = filter_data[out_c * accum_depth + d]; - acc += (filter_val + filter_offset) * (input_val + input_offset); - } - if (bias_data) - { - acc += bias_data[out_c]; - } - int32_t acc_scaled = multiplyByQuantizedMultiplier(acc, output_multiplier, output_shift); - acc_scaled += output_offset; - acc_scaled = std::max(acc_scaled, output_activation_min); - acc_scaled = std::min(acc_scaled, output_activation_max); - output_data[out_c + output_depth * b] = static_cast(acc_scaled); - } - } -} template <> -inline void FullyConnected(const FullyConnectedParams ¶ms, const int32_t *input_shape, - const float *input_data, const int32_t *filter_shape, - const float *filter_data, const float *bias_data, - const int32_t *output_shape, float *output_data) +inline void +FullyConnected(const luci_interpreter_pal::FullyConnectedParams ¶ms, const int32_t *input_shape, + const int8_t *input_data, const int32_t *filter_shape, const int8_t *filter_data, + const int32_t *bias_data, const int32_t *output_shape, int8_t *output_data) { - const float output_activation_min = params.float_activation_min; - const float output_activation_max = params.float_activation_max; - - const int batches = 
input_shape[0]; - const int output_depth = output_shape[1]; - const int accum_depth = filter_shape[1]; + // MARK: At this moment this operation doesn't support + assert(false && "FullyConnected INT8 NYI"); + (void)params; + (void)input_shape; + (void)input_data; + (void)filter_shape; + (void)filter_data; + (void)bias_data; + (void)output_shape; + (void)output_data; +} - for (int b = 0; b < batches; ++b) - { - for (int out_c = 0; out_c < output_depth; ++out_c) - { - float total = 0.f; - for (int d = 0; d < accum_depth; ++d) - { - total += input_data[b * accum_depth + d] * filter_data[out_c * accum_depth + d]; - } - float bias_value = 0.0f; - if (bias_data) - { - bias_value = bias_data[out_c]; - } - output_data[out_c + output_depth * b] = - std::min(std::max(total + bias_value, output_activation_min), output_activation_max); - } - } +template <> +inline void FullyConnected(const luci_interpreter_pal::FullyConnectedParams &, const int32_t *, + const int16_t *, const int32_t *, const int8_t *, const int64_t *, + const int32_t *, int16_t *) +{ + // MARK: At this moment this operation doesn't support + assert(false && "FullyConnected INT8 NYI"); } } // namespace luci_interpreter_pal diff --git a/onert-micro/luci-interpreter/pal/mcu/PALMaxPool2D.h b/onert-micro/luci-interpreter/pal/mcu/PALMaxPool2D.h index dab583594c8..a0fff0c6d0e 100644 --- a/onert-micro/luci-interpreter/pal/mcu/PALMaxPool2D.h +++ b/onert-micro/luci-interpreter/pal/mcu/PALMaxPool2D.h @@ -18,129 +18,11 @@ #ifndef LUCI_INTERPRETER_PAL_MAX_POOL_2D_H #define LUCI_INTERPRETER_PAL_MAX_POOL_2D_H -#include "Params.h" -#include "PALUtils.h" +#include "PALMaxPool2DCommon.h" namespace luci_interpreter_pal { - -inline void MaxPool(const PoolParams ¶ms, const luci_interpreter::RuntimeShape &input_shape, - const float *input_data, const luci_interpreter::RuntimeShape &output_shape, - float *output_data) -{ - const int batches = input_shape.dims(0); - const int depth = output_shape.dims(3); - const int input_height = 
input_shape.dims(1); - const int input_width = input_shape.dims(2); - const int output_height = output_shape.dims(1); - const int output_width = output_shape.dims(2); - const int stride_height = params.stride_height; - const int stride_width = params.stride_width; - for (int batch = 0; batch < batches; ++batch) - { - for (int out_y = 0; out_y < output_height; ++out_y) - { - for (int out_x = 0; out_x < output_width; ++out_x) - { - for (int channel = 0; channel < depth; ++channel) - { - const int in_x_origin = (out_x * stride_width) - params.padding_values.width; - const int in_y_origin = (out_y * stride_height) - params.padding_values.height; - // Compute the boundaries of the filter region clamped so as to - // ensure that the filter window fits in the input array. - const int filter_x_start = std::max(0, -in_x_origin); - const int filter_x_end = std::min(params.filter_width, input_width - in_x_origin); - const int filter_y_start = std::max(0, -in_y_origin); - const int filter_y_end = std::min(params.filter_height, input_height - in_y_origin); - float max = std::numeric_limits::lowest(); - for (int filter_y = filter_y_start; filter_y < filter_y_end; ++filter_y) - { - for (int filter_x = filter_x_start; filter_x < filter_x_end; ++filter_x) - { - const int in_x = in_x_origin + filter_x; - const int in_y = in_y_origin + filter_y; - - const int input_data_offset = - ((batch * input_shape.dims(1) + in_y) * input_shape.dims(2) + in_x) * - input_shape.dims(3) + - channel; - - max = std::max(max, input_data[input_data_offset]); - } - } - const int output_data_offset = - ((batch * output_shape.dims(1) + out_y) * output_shape.dims(2) + out_x) * - output_shape.dims(3) + - channel; - - output_data[output_data_offset] = - std::min(std::max(max, params.float_activation_min), params.float_activation_max); - } - } - } - } -} - -template -inline void MaxPool(const PoolParams ¶ms, const luci_interpreter::RuntimeShape &input_shape, - const T *input_data, const 
luci_interpreter::RuntimeShape &output_shape, - T *output_data) -{ - const int batches = input_shape.dims(0); - const int depth = output_shape.dims(3); - const int input_height = input_shape.dims(1); - const int input_width = input_shape.dims(2); - const int output_height = output_shape.dims(1); - const int output_width = output_shape.dims(2); - const int stride_height = params.stride_height; - const int stride_width = params.stride_width; - for (int batch = 0; batch < batches; ++batch) - { - for (int out_y = 0; out_y < output_height; ++out_y) - { - for (int out_x = 0; out_x < output_width; ++out_x) - { - for (int channel = 0; channel < depth; ++channel) - { - const int in_x_origin = (out_x * stride_width) - params.padding_values.width; - const int in_y_origin = (out_y * stride_height) - params.padding_values.height; - // Compute the boundaries of the filter region clamped so as to - // ensure that the filter window fits in the input array. - const int filter_x_start = std::max(0, -in_x_origin); - const int filter_x_end = std::min(params.filter_width, input_width - in_x_origin); - const int filter_y_start = std::max(0, -in_y_origin); - const int filter_y_end = std::min(params.filter_height, input_height - in_y_origin); - T max = std::numeric_limits::lowest(); - for (int filter_y = filter_y_start; filter_y < filter_y_end; ++filter_y) - { - for (int filter_x = filter_x_start; filter_x < filter_x_end; ++filter_x) - { - const int in_x = in_x_origin + filter_x; - const int in_y = in_y_origin + filter_y; - - const int input_data_offset = - ((batch * input_shape.dims(1) + in_y) * input_shape.dims(2) + in_x) * - input_shape.dims(3) + - channel; - - max = std::max(max, input_data[input_data_offset]); - } - } - max = std::max(max, params.quantized_activation_min); - max = std::min(max, params.quantized_activation_max); - - const int output_data_offset = - ((batch * output_shape.dims(1) + out_y) * output_shape.dims(2) + out_x) * - output_shape.dims(3) + - channel; - - 
output_data[output_data_offset] = static_cast(max); - } - } - } - } -} - +// TODO: Add INT8, INT16 kernels } // namespace luci_interpreter_pal #endif // LUCI_INTERPRETER_PAL_MAX_POOL_2D_H diff --git a/onert-micro/luci-interpreter/pal/mcu/PALMul.h b/onert-micro/luci-interpreter/pal/mcu/PALMul.h index 94ed17ef2bc..7b55cd1c832 100644 --- a/onert-micro/luci-interpreter/pal/mcu/PALMul.h +++ b/onert-micro/luci-interpreter/pal/mcu/PALMul.h @@ -18,96 +18,23 @@ #ifndef LUCI_INTERPRETER_PAL_MUL_H #define LUCI_INTERPRETER_PAL_MUL_H -#include "Params.h" -#include "PALUtils.h" -#include "ProcessBroadcastShapes.h" +#include "PALMulCommon.h" namespace luci_interpreter_pal { -template -inline void Mul(const ArithmeticParams ¶ms, const int flat_size, const T *input1_data, - const T *input2_data, T *output_data) -{ - T activation_min, activation_max; - getActivationParams(params, &activation_min, &activation_max); - - for (int i = 0; i < flat_size; ++i) - output_data[i] = - std::min(std::max(input1_data[i] * input2_data[i], activation_min), activation_max); -} -template -inline void MulScalar(const ArithmeticParams ¶ms, const int flat_size, const T *input_data, - const T scalar_value, T *output_data) +template <> +inline void Mul(const ArithmeticParams &, const int, const int8_t *, const int8_t *, + int8_t *) { - T activation_min, activation_max; - getActivationParams(params, &activation_min, &activation_max); - - for (int i = 0; i < flat_size; ++i) - output_data[i] = - std::min(std::max(input_data[i] * scalar_value, activation_min), activation_max); + assert(false && "Not IMPL yet"); } -template -inline void -BroadcastMul4DSlow(const ArithmeticParams ¶ms, - const luci_interpreter::RuntimeShape &input1_shape, const T *input1_data, - const luci_interpreter::RuntimeShape &input2_shape, const T *input2_data, - const luci_interpreter::RuntimeShape &output_shape, T *output_data) +template <> +inline void Mul(const ArithmeticParams &, const int, const int16_t *, const int16_t *, + int16_t 
*) { - const int flat_size = input1_shape.flatSize(); - - if (params.broadcast_category == BroadcastableOpCategory::kScalarFirstBroadcast) - { - return MulScalar(params, flat_size, input2_data, input1_data[0], output_data); - } - else if (params.broadcast_category == BroadcastableOpCategory::kScalarSecondBroadcast) - { - return MulScalar(params, flat_size, input1_data, input2_data[0], output_data); - } - - NdArrayDesc<4> desc1; - NdArrayDesc<4> desc2; - NdArrayDescsForElementwiseBroadcast(input1_shape, input2_shape, &desc1, &desc2); - const luci_interpreter::RuntimeShape extended_output_shape = - luci_interpreter::RuntimeShape::extendedShape(4, output_shape); - - T activation_min, activation_max; - getActivationParams(params, &activation_min, &activation_max); - - // In Tensorflow, the dimensions are canonically named (batch_number, row, - // col, channel), with extents (batches, height, width, depth), with the - // trailing dimension changing most rapidly (channels has the smallest stride, - // typically 1 element). - // - // In generated C code, we store arrays with the dimensions reversed. The - // first dimension has smallest stride. - // - // We name our variables by their Tensorflow convention, but generate C code - // nesting loops such that the innermost loop has the smallest stride for the - // best cache behavior. 
- for (int b = 0; b < extended_output_shape.dims(0); ++b) - { - for (int y = 0; y < extended_output_shape.dims(1); ++y) - { - for (int x = 0; x < extended_output_shape.dims(2); ++x) - { - for (int c = 0; c < extended_output_shape.dims(3); ++c) - { - const int output_data_offset = - ((b * extended_output_shape.dims(1) + y) * extended_output_shape.dims(2) + x) * - extended_output_shape.dims(3) + - c; - - output_data[output_data_offset] = - std::min(std::max(input1_data[subscriptToIndex(desc1, b, y, x, c)] * - input2_data[subscriptToIndex(desc2, b, y, x, c)], - activation_min), - activation_max); - } - } - } - } + assert(false && "Not IMPL yet"); } } // namespace luci_interpreter_pal diff --git a/onert-micro/luci-interpreter/pal/mcu/PALUnidirectionalSequenceLSTM.h b/onert-micro/luci-interpreter/pal/mcu/PALUnidirectionalSequenceLSTM.h index 0e267d3ed5b..35592ac6663 100644 --- a/onert-micro/luci-interpreter/pal/mcu/PALUnidirectionalSequenceLSTM.h +++ b/onert-micro/luci-interpreter/pal/mcu/PALUnidirectionalSequenceLSTM.h @@ -18,503 +18,18 @@ #ifndef LUCI_INTERPRETER_PAL_UNIDIRECTIONAL_SEQUENCE_LSTM_H #define LUCI_INTERPRETER_PAL_UNIDIRECTIONAL_SEQUENCE_LSTM_H -#include "kernels/UnidirectionalSequenceLSTM.h" -#include "PALTanh.h" -#include "PALLogistic.h" -#include "PALFullyConnected.h" -#include "PALMul.h" -#include "PALUtils.h" +#include "PALUnidirectionalSequenceLSTMCommon.h" namespace luci_interpreter_pal { -namespace lstm_internal -{ -namespace -{ -// Possible fused activation functions. 
-typedef enum -{ - kTfLiteActNone = 0, - kTfLiteActRelu, - kTfLiteActReluN1To1, // min(max(-1, x), 1) - kTfLiteActRelu6, // min(max(0, x), 6) - kTfLiteActTanh, - kTfLiteActSignBit, - kTfLiteActSigmoid, -} FusedActivation; - -} // namespace - -#ifndef DIS_QUANT - -template -void mulElementwise(int size, const ArithmeticParams *params, const InputType *input1_data, - const InputType *input2_data, OutputType *output_data) -{ - for (int i = 0; i < size; ++i) - { - const int32_t input1_val = params->input1_offset + input1_data[i]; - const int32_t input2_val = params->input2_offset + input2_data[i]; - const int32_t unclamped_result = - params->output_offset + multiplyByQuantizedMultiplier(input1_val * input2_val, - params->output_multiplier, - params->output_shift); - const int32_t clamped_output = - std::min(params->quantized_activation_max, - std::max(params->quantized_activation_min, unclamped_result)); - output_data[i] = static_cast(clamped_output); - } -} - -// Input and output have the same shape in LSTM -void mul(const luci_interpreter::RuntimeShape &shape, const ArithmeticParams *params, - const int16_t *input1_data, const int16_t *input2_data, int8_t *output_data) -{ - return mulElementwise(shape.flatSize(), params, input1_data, input2_data, - output_data); -} - -// Input and output have the same shape in LSTM -void mul(const luci_interpreter::RuntimeShape &shape, const ArithmeticParams *params, - const int16_t *input1_data, const int16_t *input2_data, int16_t *output_data) -{ - return mulElementwise(shape.flatSize(), params, input1_data, input2_data, output_data); -} - -void addElementWise(const int16_t *input_1, const int16_t *input_2, int n_batch, int n_input, - int16_t *output) -{ - for (int batch = 0; batch < n_batch; ++batch) - { - for (int i = 0; i < n_input; ++i) - { - const int index = batch * n_input + i; - int32_t sum = input_1[index] + input_2[index]; - const int32_t sum_clamped = - std::min(static_cast(std::numeric_limits::max()), - 
std::max(static_cast(std::numeric_limits::min()), sum)); - output[index] = static_cast(sum_clamped); - } - } -} - -void tanh(int32_t cell_state_scale_power, const luci_interpreter::RuntimeShape &input_data_shape, - int16_t *input_data, const luci_interpreter::RuntimeShape &output_data_shape, - int16_t *output_data) -{ - int32_t tanh_input_left_shift = (15 + cell_state_scale_power) - 3; - int32_t input_multiplier = 0; - if (tanh_input_left_shift < 0) /* handling negative shift value */ - { - tanh_input_left_shift = -tanh_input_left_shift; - input_multiplier = 3; - } - const int flat_size = input_data_shape.flatSize(); - luci_interpreter_pal::Tanh(input_multiplier, tanh_input_left_shift, flat_size, input_data, - output_data); -} - -void sigmoid(const luci_interpreter::RuntimeShape &data_shape, int16_t *data) -{ - luci_interpreter_pal::Logistic(0, 0, data_shape.flatSize(), data, data); -} - -void clipping(const int v_size, const luci_interpreter::lstm::CellStateInfo *cell_state_info, - int16_t *vector) -{ - for (int i = 0; i < v_size; i++) - { - vector[i] = std::max(std::min(cell_state_info->quantized_cell_clip, vector[i]), - static_cast(-cell_state_info->quantized_cell_clip)); - } -} -#endif // DIS_QUANT - -#ifndef DIS_FLOAT -// Input and output have the same shape in LSTM -void mul(const luci_interpreter::RuntimeShape &shape, const ArithmeticParams *params, - const float *input1_data, const float *input2_data, float *output_data) -{ - const int flat_size = shape.flatSize(); - return luci_interpreter_pal::Mul(*params, flat_size, input1_data, input2_data, output_data); -} - -void addElementWise(const float *input_1, const float *input_2, int n_batch, int n_input, - float *output) -{ - for (int batch = 0; batch < n_batch; ++batch) - { - for (int i = 0; i < n_input; ++i) - { - const int index = batch * n_input + i; - output[index] = input_1[index] + input_2[index]; - } - } -} - -void tanh(int32_t, const luci_interpreter::RuntimeShape &input_data_shape, float 
*input_data, - const luci_interpreter::RuntimeShape &output_data_shape, float *output_data) -{ - const int flat_size = input_data_shape.flatSize(); - luci_interpreter_pal::Tanh(flat_size, input_data, output_data); -} - -void sigmoid(const luci_interpreter::RuntimeShape &data_shape, float *data) -{ - const int flat_size = data_shape.flatSize(); - luci_interpreter_pal::Logistic(flat_size, data, data); -} - -void clipping(const int v_size, const luci_interpreter::lstm::CellStateInfo *cell_state_info, - float *vector) -{ - for (int i = 0; i < v_size; i++) - { - vector[i] = - std::max(std::min(cell_state_info->cell_clip, vector[i]), -cell_state_info->cell_clip); - } -} -#endif // DIS_FLOAT - -// Size information about the LSTM kernel, which is deduced from tensors stored -// in the flat buffer file. -struct LstmSizeInfo -{ - bool time_major; - int32_t batch_size; - int32_t time_steps; - int32_t input_dimension; - int32_t state_dimension; -}; - -class LstmStepManager -{ -public: - LstmStepManager() = delete; - // Does not take any ownership, and all pointers must refer to valid objects - // that outlive the one constructed. 
- explicit LstmStepManager(const LstmSizeInfo &size_info) : size_info_(size_info) {} - - void updateTime() - { - current_time_ += 1; - // default as one batch per inference - int input_step = size_info_.input_dimension; - int output_step = size_info_.state_dimension; - // time major: batch inference - if (size_info_.time_major) - { - input_step = input_step * size_info_.batch_size; - output_step = output_step * size_info_.batch_size; - } - - input_offset_ += input_step; - output_offset_ += output_step; - } - - void updateBatch() - { - current_batch_ += 1; - // batch inference for time major: no action needed - if (size_info_.time_major) - { - return; - } - // otherwise: singe batch inference, go to the next batch - hidden_state_offset_ += size_info_.state_dimension; - cell_state_offset_ += size_info_.state_dimension; - } - - void resetTime() { current_time_ = 0; } - - luci_interpreter::RuntimeShape inputShape() const - { - int batch_size = 1; - if (size_info_.time_major) - { - batch_size = size_info_.batch_size; - } - const int dims[2] = {batch_size, size_info_.input_dimension}; - const int32_t *dims_data = reinterpret_cast(dims); - return luci_interpreter::RuntimeShape(2, dims_data); - } - - luci_interpreter::RuntimeShape stateShape() const - { - int batch_size = 1; - if (size_info_.time_major) - { - batch_size = size_info_.batch_size; - } - const int dims[2] = {batch_size, size_info_.state_dimension}; - const int32_t *dims_data = reinterpret_cast(dims); - return luci_interpreter::RuntimeShape(2, dims_data); - } - - int inputOffset() const { return input_offset_; } - - int outputOffset() const { return output_offset_; } - - int hiddenStateOffset() const { return hidden_state_offset_; } - - int cellStateOffset() const { return cell_state_offset_; } - -private: - int32_t current_time_ = 0; - int32_t current_batch_ = 0; - int32_t input_offset_ = 0; - int32_t output_offset_ = 0; - int32_t hidden_state_offset_ = 0; - int32_t cell_state_offset_ = 0; - - const 
LstmSizeInfo &size_info_; -}; - -// Calculates a single LSTM gate. -// Implements the following formula: -// gate = activate(FC(input) + FC(recurrent)) -// Activation is sigmoid except for the "cell" gate (configurable, usually tanh) -template -void calculateLstmGate(const LstmStepManager *step_info, - const luci_interpreter::lstm::GateParameters *gate_params, - // Input FC - ActivationType *input_data, const circle::Tensor *input_weight, - const circle::Tensor *input_bias, - // Recurrent FC - ActivationType *recurrent_data, const circle::Tensor *recurrent_weight, - const circle::Tensor *recurrent_bias, - // Output - CellType *gate_output, - // Scratch arrays - CellType *fc_output_buffer, const FusedActivation activation, - luci_interpreter::BaseRuntimeGraph *runtime_graph) -{ - // Input FC - const auto gate_output_shape = step_info->stateShape(); - { - FullyConnectedParams op_params{}; - op_params.input_offset = gate_params->input_fc_params.input_offset; - op_params.weights_offset = gate_params->input_fc_params.weights_offset; - op_params.output_offset = gate_params->input_fc_params.output_offset; - op_params.output_multiplier = gate_params->input_fc_params.output_multiplier; - op_params.output_shift = gate_params->input_fc_params.output_shift; - op_params.quantized_activation_min = gate_params->input_fc_params.quantized_activation_min; - op_params.quantized_activation_max = gate_params->input_fc_params.quantized_activation_max; - op_params.float_activation_max = gate_params->input_fc_params.float_activation_max; - op_params.float_activation_min = gate_params->input_fc_params.float_activation_min; - - int32_t input_weight_shape[luci_interpreter::kMaxSmallSize]; - luci_interpreter::kernels::getTensorDims(input_weight, runtime_graph, input_weight_shape); - - FullyConnected(op_params, step_info->inputShape().dimsData(), - input_data + step_info->inputOffset(), input_weight_shape, - luci_interpreter::kernels::getTensorData( - 
runtime_graph->getConstDataByTensor(input_weight)), - luci_interpreter::kernels::getTensorData( - runtime_graph->getConstDataByTensor(input_bias)), - gate_output_shape.dimsData(), gate_output); - } - - // Recurrent FC - { - FullyConnectedParams op_params{}; - op_params.input_offset = gate_params->recurrent_fc_params.input_offset; - op_params.weights_offset = gate_params->recurrent_fc_params.weights_offset; - op_params.output_offset = gate_params->recurrent_fc_params.output_offset; - op_params.output_multiplier = gate_params->recurrent_fc_params.output_multiplier; - op_params.output_shift = gate_params->recurrent_fc_params.output_shift; - op_params.quantized_activation_min = gate_params->recurrent_fc_params.quantized_activation_min; - op_params.quantized_activation_max = gate_params->recurrent_fc_params.quantized_activation_max; - op_params.float_activation_max = gate_params->recurrent_fc_params.float_activation_max; - op_params.float_activation_min = gate_params->recurrent_fc_params.float_activation_min; - - int32_t recurrent_weight_shape[luci_interpreter::kMaxSmallSize]; - luci_interpreter::kernels::getTensorDims(recurrent_weight, runtime_graph, - recurrent_weight_shape); - - FullyConnected(op_params, step_info->stateShape().dimsData(), - recurrent_data + step_info->hiddenStateOffset(), recurrent_weight_shape, - luci_interpreter::kernels::getTensorData( - runtime_graph->getConstDataByTensor(recurrent_weight)), - luci_interpreter::kernels::getTensorData( - runtime_graph->getConstDataByTensor(recurrent_bias)), - gate_output_shape.dimsData(), fc_output_buffer); - - addElementWise(gate_output, fc_output_buffer, /*n_batch=*/gate_output_shape.dimsData()[0], - /*n_state=*/gate_output_shape.dimsData()[1], gate_output); - - switch (activation) - { - case FusedActivation::kTfLiteActSigmoid: - sigmoid(gate_output_shape, gate_output); - break; - case FusedActivation::kTfLiteActTanh: - { - // Set the scale power to -12 to avoid shift - tanh(/*cell_state_scale_power=*/-12, 
gate_output_shape, gate_output, gate_output_shape, - gate_output); - } - break; - default: - // Only Sigmoid or Tanh is used. - assert(false && "Only Sigmoid or Tanh is used"); - } - } -} - -// Update the hidden state of the LSTM kernel using the following formula: -// updated_hidden_state = Tanh(updated_cell_state) * output_gate_output, * means -// element wise multiplication -template -void updateLstmHidden(const LstmStepManager *step_info, CellType *cell_state_data_base, - ActivationType *hidden_state_data, const CellType *output_gate_output, - const ArithmeticParams *mul_params, int32_t cell_state_scale_power, - CellType *buffer) -{ - auto cell_state_shape = step_info->stateShape(); - CellType *cell_state_data = cell_state_data_base + step_info->cellStateOffset(); - // Tanh(cell_state) - tanh(cell_state_scale_power, cell_state_shape, cell_state_data, cell_state_shape, buffer); - // Update the hidden state - mul(cell_state_shape, mul_params, buffer, output_gate_output, - hidden_state_data + step_info->hiddenStateOffset()); -} - -// Update the cell state using the output from the forget gate, input gate, and -// cell gate Formula: updated_cell_state = forget_gate_output*cell_state + -// input_gate_output * cell_gate_output, where * denotes element wise -// multiplication -template -void updateLstmCell(const LstmStepManager *step_info, CellType *cell_state_data, - // Gate outputs - CellType *forget_gate_output, const CellType *input_gate_output, - const CellType *cell_gate_output, - // Mul parameters - const ArithmeticParams &forget_cell_mul_params, - const ArithmeticParams &input_mul_params, - const luci_interpreter::lstm::CellStateInfo *cell_state_info, CellType *buffer) -{ - auto cell_state_shape = step_info->stateShape(); - // Forget Gate x Cell State - mul(cell_state_shape, &forget_cell_mul_params, forget_gate_output, - cell_state_data + step_info->cellStateOffset(), - cell_state_data + step_info->cellStateOffset()); - // Input Gate x Cell Gate - 
mul(cell_state_shape, &input_mul_params, input_gate_output, cell_gate_output, buffer); - - // Update the cell state - addElementWise(cell_state_data + step_info->cellStateOffset(), buffer, - /*n_batch=*/cell_state_shape.dimsData()[0], - /*n_state=*/cell_state_shape.dimsData()[1], - cell_state_data + step_info->cellStateOffset()); - - if (cell_state_info->cell_clip > 0) - { - clipping(cell_state_shape.flatSize(), cell_state_info, - cell_state_data + step_info->cellStateOffset()); - } -} - -template -void lstmStep(luci_interpreter::lstm::LSTMStruct *lstm_struct, - luci_interpreter::lstm::LSTMParameters *lstm_params, LstmStepManager *step_info, - luci_interpreter::lstm::CellStateInfo *cell_state_info, - ActivationType *output_state_data, CellType *cell_state_data, CellType *scratch0, - CellType *scratch1, CellType *scratch2, CellType *scratch3, - luci_interpreter::BaseRuntimeGraph *runtime_graph) -{ - /*Step1: Calculate gate outputs to prepare cell state update*/ - CellType *gate_internal_buffer = scratch3; - CellType *forget_gate_output = scratch0; - - auto input_data = luci_interpreter::kernels::getTensorData( - runtime_graph->getDataByTensor(lstm_struct->input())); - - calculateLstmGate( - step_info, &lstm_params->forget_gate_parameters, - // Input FC - input_data, lstm_struct->input_to_forget_weights(), lstm_struct->forget_gate_bias(), - // Recurrent FC - output_state_data, lstm_struct->recurrent_to_forget_weights(), nullptr, - // Output - forget_gate_output, gate_internal_buffer, FusedActivation::kTfLiteActSigmoid, runtime_graph); - - // Input Gate calculation; - CellType *input_gate_output = scratch1; - calculateLstmGate( - step_info, &lstm_params->input_gate_parameters, - // Input FC - input_data, lstm_struct->input_to_input_weights(), lstm_struct->input_gate_bias(), - // Recurrent FC - output_state_data, lstm_struct->recurrent_to_input_weights(), - /*recurrent_bias*/ nullptr, - // Output - input_gate_output, - // Scratch arrays - gate_internal_buffer, 
FusedActivation::kTfLiteActSigmoid, runtime_graph); - - // Cell Gate calculation - CellType *cell_gate_output = scratch2; - calculateLstmGate( - step_info, &lstm_params->cell_gate_parameters, - // Input FC - input_data, lstm_struct->input_to_cell_weights(), lstm_struct->cell_gate_bias(), - // Recurrent FC - output_state_data, lstm_struct->recurrent_to_cell_weights(), - /*recurrent_bias*/ nullptr, - // Output - cell_gate_output, - // Scratch arrays - gate_internal_buffer, FusedActivation::kTfLiteActTanh, runtime_graph); - - /*Step2: update the cell state */ - { - // const InterGateParameters& inter_gate_params = op_data.inter_gate_parameters; - CellType *updated_input_buffer = scratch1; // reuse buffer - - updateLstmCell( - step_info, cell_state_data, forget_gate_output, input_gate_output, cell_gate_output, - lstm_params->inter_gate_parameters.forget_cell_mul_params, - lstm_params->inter_gate_parameters.input_mul_params, cell_state_info, updated_input_buffer); - } - - { - /*Step3: update the hidden state */ - CellType *output_gate_output = scratch1; // reuse buffer - calculateLstmGate( - step_info, &lstm_params->output_gate_parameters, - // Input FC - input_data, lstm_struct->input_to_output_weights(), lstm_struct->output_gate_bias(), - // Recurrent FC - output_state_data, lstm_struct->recurrent_to_output_weights(), nullptr, - // Output - output_gate_output, - // Scratch arrays - gate_internal_buffer, FusedActivation::kTfLiteActSigmoid, runtime_graph); - CellType *tanh_activated_cell_buffer = scratch0; // reuse buffer - updateLstmHidden( - step_info, cell_state_data, output_state_data, output_gate_output, - &lstm_params->inter_gate_parameters.output_mul_params, - cell_state_info->cell_state_scale_power, tanh_activated_cell_buffer); - - ActivationType *output_ptr = luci_interpreter::kernels::getTensorData( - runtime_graph->getDataByTensor(lstm_struct->output())); - std::memcpy(output_ptr + step_info->outputOffset(), - output_state_data + 
step_info->hiddenStateOffset(), - step_info->stateShape().flatSize() * sizeof(ActivationType)); - } -} - -} // namespace lstm_internal - // Evaluate the LSTM kernel with (potential) multi-steps and multi-batch input -template -void evalLSTM(luci_interpreter::lstm::LSTMStruct *lstm_struct, - luci_interpreter::lstm::LSTMParameters *lstm_params, - luci_interpreter::lstm::CellStateInfo *cell_state_info, - ActivationType *output_state_data, CellType *cell_state_data, CellType *scratch0, - CellType *scratch1, CellType *scratch2, CellType *scratch3, - luci_interpreter::BaseRuntimeGraph *runtime_graph) +template <> +void evalLSTM( + luci_interpreter::lstm::LSTMStruct *lstm_struct, + luci_interpreter::lstm::LSTMParameters *lstm_params, + luci_interpreter::lstm::CellStateInfo *cell_state_info, int8_t *output_state_data, + int16_t *cell_state_data, int16_t *scratch0, int16_t *scratch1, int16_t *scratch2, + int16_t *scratch3, luci_interpreter::BaseRuntimeGraph *runtime_graph) { lstm_internal::LstmSizeInfo size_info; @@ -535,7 +50,7 @@ void evalLSTM(luci_interpreter::lstm::LSTMStruct *lstm_struct, { for (int t = 0; t < size_info.time_steps; t++) { - lstm_internal::lstmStep( + lstm_internal::lstmStep( lstm_struct, lstm_params, &step_info, cell_state_info, output_state_data, cell_state_data, scratch0, scratch1, scratch2, scratch3, runtime_graph); // prepare for the next time step @@ -549,7 +64,7 @@ void evalLSTM(luci_interpreter::lstm::LSTMStruct *lstm_struct, { for (int t = 0; t < size_info.time_steps; t++) { - lstm_internal::lstmStep( + lstm_internal::lstmStep( lstm_struct, lstm_params, &step_info, cell_state_info, output_state_data, cell_state_data, scratch0, scratch1, scratch2, scratch3, runtime_graph); // prepare for the next time step