[onert-micro] Refactor Minimum kernel (Samsung#12044)
This commit refactors the Minimum kernel.

ONE-DCO-1.0-Signed-off-by: Vyacheslav Bazhenov <[email protected]>

Co-authored-by: Vyacheslav Bazhenov <[email protected]>
SlavikMIPT and Vyacheslav Bazhenov authored Nov 21, 2023
1 parent 9e89d52 commit d292936
Showing 3 changed files with 81 additions and 44 deletions.
72 changes: 72 additions & 0 deletions onert-micro/luci-interpreter/pal/common/Broadcast.h
@@ -0,0 +1,72 @@
/*
* Copyright (c) 2023 Samsung Electronics Co., Ltd. All Rights Reserved
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

#ifndef LUCI_INTERPRETER_BROADCAST_H
#define LUCI_INTERPRETER_BROADCAST_H

#include <functional>
#include <luci_interpreter/core/Tensor.h>
#include "ProcessBroadcastShapes.h"
namespace luci_interpreter_pal
{
template <typename T>
inline void
BroadcastTISO4DSlow(const luci_interpreter::RuntimeShape &input1_shape, const T *input1_data,
                    const luci_interpreter::RuntimeShape &input2_shape, const T *input2_data,
                    const luci_interpreter::RuntimeShape &output_shape, T *output_data,
                    std::function<const T &(const T &, const T &)> func)
{
  NdArrayDesc<4> desc1;
  NdArrayDesc<4> desc2;
  NdArrayDescsForElementwiseBroadcast(input1_shape, input2_shape, &desc1, &desc2);

  const luci_interpreter::RuntimeShape extended_output_shape =
    luci_interpreter::RuntimeShape::extendedShape(4, output_shape);

  // In Tensorflow, the dimensions are canonically named (batch_number, row,
  // col, channel), with extents (batches, height, width, depth), with the
  // trailing dimension changing most rapidly (channels has the smallest stride,
  // typically 1 element).
  //
  // In generated C code, we store arrays with the dimensions reversed. The
  // first dimension has smallest stride.
  //
  // We name our variables by their Tensorflow convention, but generate C code
  // nesting loops such that the innermost loop has the smallest stride for the
  // best cache behavior.

  for (int b = 0; b < extended_output_shape.dims(0); ++b)
  {
    for (int y = 0; y < extended_output_shape.dims(1); ++y)
    {
      for (int x = 0; x < extended_output_shape.dims(2); ++x)
      {
        for (int c = 0; c < extended_output_shape.dims(3); ++c)
        {
          const int output_data_offset =
            ((b * extended_output_shape.dims(1) + y) * extended_output_shape.dims(2) + x) *
              extended_output_shape.dims(3) +
            c;

          output_data[output_data_offset] = func(input1_data[subscriptToIndex(desc1, b, y, x, c)],
                                                 input2_data[subscriptToIndex(desc2, b, y, x, c)]);
        }
      }
    }
  }
}
} // namespace luci_interpreter_pal
#endif // LUCI_INTERPRETER_BROADCAST_H
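
The helper above is a generic two-input, single-output (TISO) broadcast loop: the element-wise operation itself is passed in as a functor, so other binary kernels can reuse it. As a hedged illustration only (this Maximum variant is hypothetical and not part of the commit, and it assumes <algorithm> is available for std::max), another kernel could delegate to the same helper:

// Hypothetical sketch, not part of this commit: reusing BroadcastTISO4DSlow for Maximum.
template <typename T>
inline void
BroadcastMaximum4DSlow(const luci_interpreter::RuntimeShape &input1_shape, const T *input1_data,
                       const luci_interpreter::RuntimeShape &input2_shape, const T *input2_data,
                       const luci_interpreter::RuntimeShape &output_shape, T *output_data)
{
  // std::max, like std::min, returns a const reference to one of its arguments,
  // so the lambda matches std::function<const T &(const T &, const T &)>.
  auto func = [](const T &a, const T &b) -> const T & { return std::max(a, b); };
  BroadcastTISO4DSlow<T>(input1_shape, input1_data, input2_shape, input2_data, output_shape,
                         output_data, func);
}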
51 changes: 8 additions & 43 deletions onert-micro/luci-interpreter/pal/common/PALMinimumCommon.h
@@ -21,6 +21,7 @@
#include "Params.h"
#include "PALUtils.h"
#include "ProcessBroadcastShapes.h"
#include "Broadcast.h"

namespace luci_interpreter_pal
{
@@ -33,52 +34,16 @@ inline void Minimum(const int flat_size, const float *input1_data, const float *
}
}

template <typename T>
inline void
BroadcastMinimum4DSlow(const luci_interpreter::RuntimeShape &input1_shape, const float *input1_data,
const luci_interpreter::RuntimeShape &input2_shape, const float *input2_data,
const luci_interpreter::RuntimeShape &output_shape, float *output_data)
BroadcastMinimum4DSlow(const luci_interpreter::RuntimeShape &input1_shape, const T *input1_data,
const luci_interpreter::RuntimeShape &input2_shape, const T *input2_data,
const luci_interpreter::RuntimeShape &output_shape, T *output_data)
{
NdArrayDesc<4> desc1;
NdArrayDesc<4> desc2;
NdArrayDescsForElementwiseBroadcast(input1_shape, input2_shape, &desc1, &desc2);

const luci_interpreter::RuntimeShape extended_output_shape =
luci_interpreter::RuntimeShape::extendedShape(4, output_shape);

// In Tensorflow, the dimensions are canonically named (batch_number, row,
// col, channel), with extents (batches, height, width, depth), with the
// trailing dimension changing most rapidly (channels has the smallest stride,
// typically 1 element).
//
// In generated C code, we store arrays with the dimensions reversed. The
// first dimension has smallest stride.
//
// We name our variables by their Tensorflow convention, but generate C code
// nesting loops such that the innermost loop has the smallest stride for the
// best cache behavior.

for (int b = 0; b < extended_output_shape.dims(0); ++b)
{
for (int y = 0; y < extended_output_shape.dims(1); ++y)
{
for (int x = 0; x < extended_output_shape.dims(2); ++x)
{
for (int c = 0; c < extended_output_shape.dims(3); ++c)
{
const int output_data_offset =
((b * extended_output_shape.dims(1) + y) * extended_output_shape.dims(2) + x) *
extended_output_shape.dims(3) +
c;

output_data[output_data_offset] =
std::min(input1_data[subscriptToIndex(desc1, b, y, x, c)],
input2_data[subscriptToIndex(desc2, b, y, x, c)]);
}
}
}
}
auto func = [](const T &a, const T &b) -> const T & { return std::min(a, b); };
BroadcastTISO4DSlow<float>(input1_shape, input1_data, input2_shape, input2_data, output_shape,
output_data, func);
}

} // namespace luci_interpreter_pal

#endif // LUCI_INTERPRETER_PAL_MINIMUM_COMMON_H
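
For reference, the output_data_offset arithmetic that both the removed loop and the new helper use is a plain row-major flattening of the (b, y, x, c) subscript over the extended 4-D output shape. A minimal standalone sketch with an assumed shape (illustration only, not from the commit):

#include <cassert>

int main()
{
  // Assumed extended output shape {batches, height, width, depth} = {2, 3, 4, 5}.
  const int dims[4] = {2, 3, 4, 5};
  const int b = 1, y = 2, x = 3, c = 4; // last element of the buffer
  const int offset = ((b * dims[1] + y) * dims[2] + x) * dims[3] + c;
  // ((1 * 3 + 2) * 4 + 3) * 5 + 4 = 119, i.e. the last index of a 2*3*4*5 = 120 element buffer.
  assert(offset == 119);
  return 0;
}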
2 changes: 1 addition & 1 deletion onert-micro/luci-interpreter/src/kernels/Minimum.cpp
@@ -81,7 +81,7 @@ void execute_kernel_CircleMinimum(const circle::Operator *cur_op, BaseRuntimeGra
}
else
{
luci_interpreter_pal::BroadcastMinimum4DSlow(
luci_interpreter_pal::BroadcastMinimum4DSlow<float>(
input_shape1, kernels::getTensorData<float>(input_data1), input_shape2,
kernels::getTensorData<float>(input_data2), output_shape,
kernels::getTensorData<float>(output_data));