[onert-micro] Replace common kernels impl to pal/common
This PR moves the common kernel implementations into the pal/common directory.

ONE-DCO-1.0-Signed-off-by: Artem Balyshev <[email protected]>
Artem Balyshev committed Aug 4, 2023
1 parent 2e8ff58 commit bfdd5fc
Showing 18 changed files with 1,391 additions and 1,084 deletions.
94 changes: 94 additions & 0 deletions onert-micro/luci-interpreter/pal/common/PALAddCommon.h
@@ -0,0 +1,94 @@
/*
* Copyright (c) 2023 Samsung Electronics Co., Ltd. All Rights Reserved
* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

#ifndef LUCI_INTERPRETER_PAL_ADD_COMMON_H
#define LUCI_INTERPRETER_PAL_ADD_COMMON_H

#include "Params.h"
#include "PALUtils.h"
#include "ProcessBroadcastShapes.h"

namespace luci_interpreter_pal
{

// TODO: check whether a real activation value is present
template <typename T>
inline void Add(const ArithmeticParams &params, const int flat_size, const T *input1_data,
const T *input2_data, T *output_data)
{
T activation_min, activation_max;
getActivationParams(params, &activation_min, &activation_max);

for (int i = 0; i < flat_size; ++i)
output_data[i] =
std::min(std::max(input1_data[i] + input2_data[i], activation_min), activation_max);
}

template <typename T>
inline void
BroadcastAdd4DSlow(const ArithmeticParams &params,
const luci_interpreter::RuntimeShape &input1_shape, const T *input1_data,
const luci_interpreter::RuntimeShape &input2_shape, const T *input2_data,
const luci_interpreter::RuntimeShape &output_shape, T *output_data)
{
NdArrayDesc<4> desc1;
NdArrayDesc<4> desc2;
NdArrayDescsForElementwiseBroadcast(input1_shape, input2_shape, &desc1, &desc2);
const luci_interpreter::RuntimeShape extended_output_shape =
luci_interpreter::RuntimeShape::extendedShape(4, output_shape);

T activation_min, activation_max;
getActivationParams(params, &activation_min, &activation_max);

// In TensorFlow, the dimensions are canonically named (batch_number, row,
// col, channel), with extents (batches, height, width, depth), with the
// trailing dimension changing most rapidly (channels has the smallest stride,
// typically 1 element).
//
// In generated C code, we store arrays with the dimensions reversed. The
// first dimension has smallest stride.
//
// We name our variables by their Tensorflow convention, but generate C code
// nesting loops such that the innermost loop has the smallest stride for the
// best cache behavior.
for (int b = 0; b < extended_output_shape.dims(0); ++b)
{
for (int y = 0; y < extended_output_shape.dims(1); ++y)
{
for (int x = 0; x < extended_output_shape.dims(2); ++x)
{
for (int c = 0; c < extended_output_shape.dims(3); ++c)
{
const int output_data_offset =
((b * extended_output_shape.dims(1) + y) * extended_output_shape.dims(2) + x) *
extended_output_shape.dims(3) +
c;

output_data[output_data_offset] =
std::min(std::max(input1_data[subscriptToIndex(desc1, b, y, x, c)] +
input2_data[subscriptToIndex(desc2, b, y, x, c)],
activation_min),
activation_max);
}
}
}
}
}

} // namespace luci_interpreter_pal

#endif // LUCI_INTERPRETER_PAL_ADD_COMMON_H
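
For context, a minimal usage sketch of the non-broadcast float Add path above. The float_activation_min / float_activation_max field names on ArithmeticParams are an assumption (the kernel only reads them through getActivationParams), so treat this as illustrative rather than definitive.

#include <limits>
#include "PALAddCommon.h"

// Hypothetical usage: element-wise float Add with the activation clamp disabled.
void add_float_example()
{
  const float in1[4] = {1.f, 2.f, 3.f, 4.f};
  const float in2[4] = {10.f, 20.f, 30.f, 40.f};
  float out[4] = {};

  luci_interpreter_pal::ArithmeticParams params{};
  params.float_activation_min = std::numeric_limits<float>::lowest(); // assumed field name
  params.float_activation_max = std::numeric_limits<float>::max();    // assumed field name

  // Both inputs share one shape, so the flat (non-broadcast) overload is enough;
  // BroadcastAdd4DSlow is only needed when the shapes differ.
  luci_interpreter_pal::Add<float>(params, /*flat_size=*/4, in1, in2, out);
  // out is now {11, 22, 33, 44}.
}
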
93 changes: 93 additions & 0 deletions onert-micro/luci-interpreter/pal/common/PALAveragePool2DCommon.h
@@ -0,0 +1,93 @@
/*
* Copyright (c) 2023 Samsung Electronics Co., Ltd. All Rights Reserved
* Copyright 2020 The TensorFlow Authors. All Rights Reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

#ifndef LUCI_INTERPRETER_PAL_AVERAGE_POOL_2D_COMMON_H
#define LUCI_INTERPRETER_PAL_AVERAGE_POOL_2D_COMMON_H

#include "Params.h"
#include "PALUtils.h"

namespace luci_interpreter_pal
{

// TODO: reduce code duplication with MaxPool
inline void AveragePool(const PoolParams &params, const luci_interpreter::RuntimeShape &input_shape,
const float *input_data, const luci_interpreter::RuntimeShape &output_shape,
float *output_data)
{
const int batches = input_shape.dims(0);
const int depth = output_shape.dims(3);
const int input_height = input_shape.dims(1);
const int input_width = input_shape.dims(2);
const int output_height = output_shape.dims(1);
const int output_width = output_shape.dims(2);
const int stride_height = params.stride_height;
const int stride_width = params.stride_width;
for (int batch = 0; batch < batches; ++batch)
{
for (int out_y = 0; out_y < output_height; ++out_y)
{
for (int out_x = 0; out_x < output_width; ++out_x)
{
for (int channel = 0; channel < depth; ++channel)
{
const int in_x_origin = (out_x * stride_width) - params.padding_values.width;
const int in_y_origin = (out_y * stride_height) - params.padding_values.height;
// Compute the boundaries of the filter region clamped so as to
// ensure that the filter window fits in the input array.
const int filter_x_start = std::max(0, -in_x_origin);
const int filter_x_end = std::min(params.filter_width, input_width - in_x_origin);
const int filter_y_start = std::max(0, -in_y_origin);
const int filter_y_end = std::min(params.filter_height, input_height - in_y_origin);

float total = 0.f;
float filter_count = 0;

for (int filter_y = filter_y_start; filter_y < filter_y_end; ++filter_y)
{
for (int filter_x = filter_x_start; filter_x < filter_x_end; ++filter_x)
{
const int in_x = in_x_origin + filter_x;
const int in_y = in_y_origin + filter_y;

const int input_data_offset =
((batch * input_shape.dims(1) + in_y) * input_shape.dims(2) + in_x) *
input_shape.dims(3) +
channel;

total += input_data[input_data_offset];
filter_count++;
}
}
const int output_data_offset =
((batch * output_shape.dims(1) + out_y) * output_shape.dims(2) + out_x) *
output_shape.dims(3) +
channel;

assert(filter_count != 0);
const float average = total / filter_count;

output_data[output_data_offset] =
std::min(std::max(average, params.float_activation_min), params.float_activation_max);
}
}
}
}
}
} // namespace luci_interpreter_pal

#endif // LUCI_INTERPRETER_PAL_AVERAGE_POOL_2D_COMMON_H
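
The pooling loop above indexes NHWC tensors and clamps the filter window by hand. A small standalone sketch of those two pieces, kept independent of the interpreter types and purely illustrative:

#include <algorithm>

// Flat offset of element (b, y, x, c) in an NHWC tensor with height H, width W,
// depth C; this is the ((b * H + y) * W + x) * C + c pattern used in the kernel.
inline int nhwcOffset(int b, int y, int x, int c, int H, int W, int C)
{
  return ((b * H + y) * W + x) * C + c;
}

// Clamp a filter window whose origin may fall outside the input, mirroring the
// filter_x_start / filter_x_end (and filter_y_*) computation above.
inline void clampWindow(int in_origin, int filter_extent, int input_extent, int *start, int *end)
{
  *start = std::max(0, -in_origin);
  *end = std::min(filter_extent, input_extent - in_origin);
}

// Example: a 3-wide window at in_x_origin = -1 on a 4-wide input is clamped to
// filter_x in [1, 3), so only the two in-bounds columns enter the average and
// filter_count matches the divisor actually used.
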
201 changes: 201 additions & 0 deletions onert-micro/luci-interpreter/pal/common/PALConv2DCommon.h
@@ -0,0 +1,201 @@
/*
* Copyright (c) 2023 Samsung Electronics Co., Ltd. All Rights Reserved
* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

#ifndef LUCI_INTERPRETER_PAL_CONV2D_COMMON_H
#define LUCI_INTERPRETER_PAL_CONV2D_COMMON_H
#include "Params.h"
#include "PALUtils.h"

namespace luci_interpreter_pal
{
static inline void Conv(const ConvParams &params, const int32_t *input_shape,
const float *input_data, const int32_t *filter_shape,
const float *filter_data, const float *bias_data,
const int32_t *output_shape, float *output_data)
{
const int stride_width = params.stride_width;
const int stride_height = params.stride_height;
const int dilation_width_factor = params.dilation_width_factor;
const int dilation_height_factor = params.dilation_height_factor;
const int pad_width = params.padding_values.width;
const int pad_height = params.padding_values.height;
const float output_activation_min = params.float_activation_min;
const float output_activation_max = params.float_activation_max;

const auto batches = input_shape[0];
const int input_height = input_shape[1];
const int input_width = input_shape[2];
const int input_depth = input_shape[3];
const int output_depth = filter_shape[0];
const int filter_height = filter_shape[1];
const int filter_width = filter_shape[2];
const int output_height = output_shape[1];
const int output_width = output_shape[2];
for (int batch = 0; batch < batches; ++batch)
{
for (int out_y = 0; out_y < output_height; ++out_y)
{
const int in_y_origin = (out_y * stride_height) - pad_height;
for (int out_x = 0; out_x < output_width; ++out_x)
{
const int in_x_origin = (out_x * stride_width) - pad_width;
for (int out_channel = 0; out_channel < output_depth; ++out_channel)
{
float total = 0.f;
for (int filter_y = 0; filter_y < filter_height; ++filter_y)
{
const int in_y = in_y_origin + dilation_height_factor * filter_y;
for (int filter_x = 0; filter_x < filter_width; ++filter_x)
{
const int in_x = in_x_origin + dilation_width_factor * filter_x;

// Zero padding by omitting the areas outside the image.
const bool is_point_inside_image =
(in_x >= 0) && (in_x < input_width) && (in_y >= 0) && (in_y < input_height);

if (!is_point_inside_image)
{
continue;
}

for (int in_channel = 0; in_channel < input_depth; ++in_channel)
{
const int input_data_offset =
((batch * input_height + in_y) * input_width + in_x) * input_depth + in_channel;

const int filter_data_offset =
((out_channel * filter_height + filter_y) * filter_width + filter_x) *
input_depth +
in_channel;

const float input_value = input_data[input_data_offset];
const float filter_value = filter_data[filter_data_offset];
total += (input_value * filter_value);
}
}
}
// float bias_value = 0.0f;
if (bias_data)
{
total += bias_data[out_channel];
}

const int output_data_offset =
((batch * output_height + out_y) * output_width + out_x) * output_depth + out_channel;

output_data[output_data_offset] =
std::min(std::max(total, output_activation_min), output_activation_max);
}
}
}
}
}

static inline void Conv(const ConvParams &params, const int32_t *input_shape,
const uint8_t *input_data, const int32_t *filter_shape,
const uint8_t *filter_data, const int32_t *bias_data,
const int32_t *output_shape, uint8_t *output_data)
{
const int stride_width = params.stride_width;
const int stride_height = params.stride_height;
const int dilation_width_factor = params.dilation_width_factor;
const int dilation_height_factor = params.dilation_height_factor;
const int pad_width = params.padding_values.width;
const int pad_height = params.padding_values.height;
const int32_t input_offset = params.input_offset;
const int32_t filter_offset = params.weights_offset;
const int32_t output_offset = params.output_offset;
const int32_t output_multiplier = params.output_multiplier;
const int output_shift = params.output_shift;
const int32_t output_activation_min = params.quantized_activation_min;
const int32_t output_activation_max = params.quantized_activation_max;

const auto batches = input_shape[0];
const int input_height = input_shape[1];
const int input_width = input_shape[2];
const int input_depth = input_shape[3];
const int output_depth = filter_shape[0];
const int filter_height = filter_shape[1];
const int filter_width = filter_shape[2];
const int output_height = output_shape[1];
const int output_width = output_shape[2];

for (int batch = 0; batch < batches; ++batch)
{
for (int out_y = 0; out_y < output_height; ++out_y)
{
const int in_y_origin = (out_y * stride_height) - pad_height;
for (int out_x = 0; out_x < output_width; ++out_x)
{
const int in_x_origin = (out_x * stride_width) - pad_width;
for (int out_channel = 0; out_channel < output_depth; ++out_channel)
{
int32_t acc = 0;
for (int filter_y = 0; filter_y < filter_height; ++filter_y)
{
const int in_y = in_y_origin + dilation_height_factor * filter_y;
for (int filter_x = 0; filter_x < filter_width; ++filter_x)
{
const int in_x = in_x_origin + dilation_width_factor * filter_x;

// Zero padding by omitting the areas outside the image.
const bool is_point_inside_image =
(in_x >= 0) && (in_x < input_width) && (in_y >= 0) && (in_y < input_height);

if (!is_point_inside_image)
{
continue;
}

for (int in_channel = 0; in_channel < input_depth; ++in_channel)
{
const int input_data_offset =
((batch * input_height + in_y) * input_width + in_x) * input_depth + in_channel;

const int filter_data_offset =
((out_channel * filter_height + filter_y) * filter_width + filter_x) *
input_depth +
in_channel;

const int32_t input_val = input_data[input_data_offset];
const int32_t filter_val = filter_data[filter_data_offset];
acc += (filter_val + filter_offset) * (input_val + input_offset);
}
}
}
if (bias_data)
{
acc += bias_data[out_channel];
}
acc = multiplyByQuantizedMultiplier(acc, output_multiplier, output_shift);
acc += output_offset;
acc = std::max(acc, output_activation_min);
acc = std::min(acc, output_activation_max);

const int output_data_offset =
((batch * output_height + out_y) * output_width + out_x) * output_depth + out_channel;

output_data[output_data_offset] = static_cast<uint8_t>(acc);
}
}
}
}
}

} // namespace luci_interpreter_pal

#endif // LUCI_INTERPRETER_PAL_CONV2D_COMMON_H
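
For context, a minimal sketch of calling the float Conv overload above with raw NHWC input / OHWI filter shape arrays. The ConvParams field names match those read inside the kernel; the 1x1 convolution data and the brace-initialization of ConvParams are illustrative assumptions, not part of this commit.

#include <cstdint>
#include <limits>
#include "PALConv2DCommon.h"

// Hypothetical usage: 1x2x2x1 input, a single 1x1 filter, stride 1, no padding.
void conv_float_example()
{
  const int32_t input_shape[4] = {1, 2, 2, 1};  // NHWC
  const int32_t filter_shape[4] = {1, 1, 1, 1}; // OHWI
  const int32_t output_shape[4] = {1, 2, 2, 1}; // NHWC

  const float input[4] = {1.f, 2.f, 3.f, 4.f};
  const float filter[1] = {2.f};
  const float bias[1] = {0.5f};
  float output[4] = {};

  luci_interpreter_pal::ConvParams params{}; // aggregate initialization assumed
  params.stride_width = 1;
  params.stride_height = 1;
  params.dilation_width_factor = 1;
  params.dilation_height_factor = 1;
  params.padding_values.width = 0;
  params.padding_values.height = 0;
  params.float_activation_min = std::numeric_limits<float>::lowest();
  params.float_activation_max = std::numeric_limits<float>::max();

  luci_interpreter_pal::Conv(params, input_shape, input, filter_shape, filter, bias,
                             output_shape, output);
  // Each output element is input * 2 + 0.5: {2.5, 4.5, 6.5, 8.5}.
}
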