diff --git a/onert-micro/luci-interpreter/pal/common/Broadcast.h b/onert-micro/luci-interpreter/pal/common/Broadcast.h
new file mode 100644
index 00000000000..42cd01ac320
--- /dev/null
+++ b/onert-micro/luci-interpreter/pal/common/Broadcast.h
@@ -0,0 +1,72 @@
+/*
+ * Copyright (c) 2023 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef LUCI_INTERPRETER_BROADCAST_H
+#define LUCI_INTERPRETER_BROADCAST_H
+
+#include <functional>
+#include <cmath>
+#include "ProcessBroadcastShapes.h"
+namespace luci_interpreter_pal
+{
+template <typename T>
+inline void
+BroadcastTISO4DSlow(const luci_interpreter::RuntimeShape &input1_shape, const T *input1_data,
+                    const luci_interpreter::RuntimeShape &input2_shape, const T *input2_data,
+                    const luci_interpreter::RuntimeShape &output_shape, T *output_data,
+                    std::function<const T &(const T &, const T &)> func)
+{
+  NdArrayDesc<4> desc1;
+  NdArrayDesc<4> desc2;
+  NdArrayDescsForElementwiseBroadcast(input1_shape, input2_shape, &desc1, &desc2);
+
+  const luci_interpreter::RuntimeShape extended_output_shape =
+    luci_interpreter::RuntimeShape::extendedShape(4, output_shape);
+
+  // In Tensorflow, the dimensions are canonically named (batch_number, row,
+  // col, channel), with extents (batches, height, width, depth), with the
+  // trailing dimension changing most rapidly (channels has the smallest stride,
+  // typically 1 element).
+  //
+  // In generated C code, we store arrays with the dimensions reversed. The
+  // first dimension has smallest stride.
+  //
+  // We name our variables by their Tensorflow convention, but generate C code
+  // nesting loops such that the innermost loop has the smallest stride for the
+  // best cache behavior.
+
+  for (int b = 0; b < extended_output_shape.dims(0); ++b)
+  {
+    for (int y = 0; y < extended_output_shape.dims(1); ++y)
+    {
+      for (int x = 0; x < extended_output_shape.dims(2); ++x)
+      {
+        for (int c = 0; c < extended_output_shape.dims(3); ++c)
+        {
+          const int output_data_offset =
+            ((b * extended_output_shape.dims(1) + y) * extended_output_shape.dims(2) + x) *
+              extended_output_shape.dims(3) +
+            c;
+
+          output_data[output_data_offset] = func(input1_data[subscriptToIndex(desc1, b, y, x, c)],
+                                                 input2_data[subscriptToIndex(desc2, b, y, x, c)]);
+        }
+      }
+    }
+  }
+}
+} // namespace luci_interpreter_pal
+#endif // LUCI_INTERPRETER_BROADCAST_H
diff --git a/onert-micro/luci-interpreter/pal/common/PALMinimumCommon.h b/onert-micro/luci-interpreter/pal/common/PALMinimumCommon.h
index 187ca95ee27..283314f94da 100644
--- a/onert-micro/luci-interpreter/pal/common/PALMinimumCommon.h
+++ b/onert-micro/luci-interpreter/pal/common/PALMinimumCommon.h
@@ -21,6 +21,7 @@
 #include "Params.h"
 #include "PALUtils.h"
 #include "ProcessBroadcastShapes.h"
+#include "Broadcast.h"
 
 namespace luci_interpreter_pal
 {
@@ -33,52 +34,16 @@ inline void Minimum(const int flat_size, const float *input1_data, const float *
   }
 }
 
+template <typename T>
 inline void
-BroadcastMinimum4DSlow(const luci_interpreter::RuntimeShape &input1_shape, const float *input1_data,
-                       const luci_interpreter::RuntimeShape &input2_shape, const float *input2_data,
-                       const luci_interpreter::RuntimeShape &output_shape, float *output_data)
+BroadcastMinimum4DSlow(const luci_interpreter::RuntimeShape &input1_shape, const T *input1_data,
+                       const luci_interpreter::RuntimeShape &input2_shape, const T *input2_data,
+                       const luci_interpreter::RuntimeShape &output_shape, T *output_data)
 {
-  NdArrayDesc<4> desc1;
-  NdArrayDesc<4> desc2;
-  NdArrayDescsForElementwiseBroadcast(input1_shape, input2_shape, &desc1, &desc2);
-
-  const luci_interpreter::RuntimeShape extended_output_shape =
-    luci_interpreter::RuntimeShape::extendedShape(4, output_shape);
-
-  // In Tensorflow, the dimensions are canonically named (batch_number, row,
-  // col, channel), with extents (batches, height, width, depth), with the
-  // trailing dimension changing most rapidly (channels has the smallest stride,
-  // typically 1 element).
-  //
-  // In generated C code, we store arrays with the dimensions reversed. The
-  // first dimension has smallest stride.
-  //
-  // We name our variables by their Tensorflow convention, but generate C code
-  // nesting loops such that the innermost loop has the smallest stride for the
-  // best cache behavior.
-
-  for (int b = 0; b < extended_output_shape.dims(0); ++b)
-  {
-    for (int y = 0; y < extended_output_shape.dims(1); ++y)
-    {
-      for (int x = 0; x < extended_output_shape.dims(2); ++x)
-      {
-        for (int c = 0; c < extended_output_shape.dims(3); ++c)
-        {
-          const int output_data_offset =
-            ((b * extended_output_shape.dims(1) + y) * extended_output_shape.dims(2) + x) *
-              extended_output_shape.dims(3) +
-            c;
-
-          output_data[output_data_offset] =
-            std::min(input1_data[subscriptToIndex(desc1, b, y, x, c)],
-                     input2_data[subscriptToIndex(desc2, b, y, x, c)]);
-        }
-      }
-    }
-  }
+  auto func = [](const T &a, const T &b) -> const T & { return std::min(a, b); };
+  BroadcastTISO4DSlow(input1_shape, input1_data, input2_shape, input2_data, output_shape,
+                      output_data, func);
 }
-
 } // namespace luci_interpreter_pal
 
 #endif // LUCI_INTERPRETER_PAL_MINIMUM_COMMON_H
diff --git a/onert-micro/luci-interpreter/src/kernels/Minimum.cpp b/onert-micro/luci-interpreter/src/kernels/Minimum.cpp
index 223176751a5..0949b8990dc 100644
--- a/onert-micro/luci-interpreter/src/kernels/Minimum.cpp
+++ b/onert-micro/luci-interpreter/src/kernels/Minimum.cpp
@@ -81,7 +81,7 @@ void execute_kernel_CircleMinimum(const circle::Operator *cur_op, BaseRuntimeGra
   }
   else
   {
-    luci_interpreter_pal::BroadcastMinimum4DSlow(
+    luci_interpreter_pal::BroadcastMinimum4DSlow<float>(
       input_shape1, kernels::getTensorData<float>(input_data1), input_shape2,
       kernels::getTensorData<float>(input_data2), output_shape,
       kernels::getTensorData<float>(output_data));