[onert-micro] Replace common kernels impl to pal/common
This PR moves the common kernel implementations into the pal/common directory.

ONE-DCO-1.0-Signed-off-by: Artem Balyshev <[email protected]>
Artem Balyshev committed Aug 4, 2023
1 parent 2e8ff58 commit bfdd5fc
Showing 18 changed files with 1,391 additions and 1,084 deletions.
94 changes: 94 additions & 0 deletions onert-micro/luci-interpreter/pal/common/PALAddCommon.h
@@ -0,0 +1,94 @@
/*
* Copyright (c) 2023 Samsung Electronics Co., Ltd. All Rights Reserved
* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

#ifndef LUCI_INTERPRETER_PAL_ADD_COMMON_H
#define LUCI_INTERPRETER_PAL_ADD_COMMON_H

#include "Params.h"
#include "PALUtils.h"
#include "ProcessBroadcastShapes.h"

namespace luci_interpreter_pal
{

// TODO: check whether a real activation value is present
template <typename T>
inline void Add(const ArithmeticParams &params, const int flat_size, const T *input1_data,
const T *input2_data, T *output_data)
{
T activation_min, activation_max;
getActivationParams(params, &activation_min, &activation_max);

for (int i = 0; i < flat_size; ++i)
output_data[i] =
std::min(std::max(input1_data[i] + input2_data[i], activation_min), activation_max);
}

template <typename T>
inline void
BroadcastAdd4DSlow(const ArithmeticParams &params,
const luci_interpreter::RuntimeShape &input1_shape, const T *input1_data,
const luci_interpreter::RuntimeShape &input2_shape, const T *input2_data,
const luci_interpreter::RuntimeShape &output_shape, T *output_data)
{
NdArrayDesc<4> desc1;
NdArrayDesc<4> desc2;
NdArrayDescsForElementwiseBroadcast(input1_shape, input2_shape, &desc1, &desc2);
const luci_interpreter::RuntimeShape extended_output_shape =
luci_interpreter::RuntimeShape::extendedShape(4, output_shape);

T activation_min, activation_max;
getActivationParams(params, &activation_min, &activation_max);

// In TensorFlow, the dimensions are canonically named (batch_number, row,
// col, channel), with extents (batches, height, width, depth), with the
// trailing dimension changing most rapidly (channels has the smallest stride,
// typically 1 element).
//
// In generated C code, we store arrays with the dimensions reversed. The
// first dimension has smallest stride.
//
// We name our variables by their Tensorflow convention, but generate C code
// nesting loops such that the innermost loop has the smallest stride for the
// best cache behavior.
for (int b = 0; b < extended_output_shape.dims(0); ++b)
{
for (int y = 0; y < extended_output_shape.dims(1); ++y)
{
for (int x = 0; x < extended_output_shape.dims(2); ++x)
{
for (int c = 0; c < extended_output_shape.dims(3); ++c)
{
const int output_data_offset =
((b * extended_output_shape.dims(1) + y) * extended_output_shape.dims(2) + x) *
extended_output_shape.dims(3) +
c;

output_data[output_data_offset] =
std::min(std::max(input1_data[subscriptToIndex(desc1, b, y, x, c)] +
input2_data[subscriptToIndex(desc2, b, y, x, c)],
activation_min),
activation_max);
}
}
}
}
}

} // namespace luci_interpreter_pal

#endif // LUCI_INTERPRETER_PAL_ADD_COMMON_H
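
For context, a minimal usage sketch of the non-broadcast float Add path above. The float_activation_min / float_activation_max field names on ArithmeticParams are an assumption (the kernel only reads them through getActivationParams), so treat this as illustrative rather than definitive.

#include <limits>
#include "PALAddCommon.h"

// Hypothetical usage: element-wise float Add with the activation clamp disabled.
void add_float_example()
{
  const float in1[4] = {1.f, 2.f, 3.f, 4.f};
  const float in2[4] = {10.f, 20.f, 30.f, 40.f};
  float out[4] = {};

  luci_interpreter_pal::ArithmeticParams params{};
  params.float_activation_min = std::numeric_limits<float>::lowest(); // assumed field name
  params.float_activation_max = std::numeric_limits<float>::max();    // assumed field name

  // Both inputs share one shape, so the flat (non-broadcast) overload is enough;
  // BroadcastAdd4DSlow is only needed when the shapes differ.
  luci_interpreter_pal::Add<float>(params, /*flat_size=*/4, in1, in2, out);
  // out is now {11, 22, 33, 44}.
}
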
93 changes: 93 additions & 0 deletions onert-micro/luci-interpreter/pal/common/PALAveragePool2DCommon.h
@@ -0,0 +1,93 @@
/*
* Copyright (c) 2023 Samsung Electronics Co., Ltd. All Rights Reserved
* Copyright 2020 The TensorFlow Authors. All Rights Reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

#ifndef LUCI_INTERPRETER_PAL_AVERAGE_POOL_2D_COMMON_H
#define LUCI_INTERPRETER_PAL_AVERAGE_POOL_2D_COMMON_H

#include "Params.h"
#include "PALUtils.h"

namespace luci_interpreter_pal
{

// TODO: reduce code duplication with MaxPool
inline void AveragePool(const PoolParams &params, const luci_interpreter::RuntimeShape &input_shape,
const float *input_data, const luci_interpreter::RuntimeShape &output_shape,
float *output_data)
{
const int batches = input_shape.dims(0);
const int depth = output_shape.dims(3);
const int input_height = input_shape.dims(1);
const int input_width = input_shape.dims(2);
const int output_height = output_shape.dims(1);
const int output_width = output_shape.dims(2);
const int stride_height = params.stride_height;
const int stride_width = params.stride_width;
for (int batch = 0; batch < batches; ++batch)
{
for (int out_y = 0; out_y < output_height; ++out_y)
{
for (int out_x = 0; out_x < output_width; ++out_x)
{
for (int channel = 0; channel < depth; ++channel)
{
const int in_x_origin = (out_x * stride_width) - params.padding_values.width;
const int in_y_origin = (out_y * stride_height) - params.padding_values.height;
// Compute the boundaries of the filter region clamped so as to
// ensure that the filter window fits in the input array.
const int filter_x_start = std::max(0, -in_x_origin);
const int filter_x_end = std::min(params.filter_width, input_width - in_x_origin);
const int filter_y_start = std::max(0, -in_y_origin);
const int filter_y_end = std::min(params.filter_height, input_height - in_y_origin);

float total = 0.f;
float filter_count = 0;

for (int filter_y = filter_y_start; filter_y < filter_y_end; ++filter_y)
{
for (int filter_x = filter_x_start; filter_x < filter_x_end; ++filter_x)
{
const int in_x = in_x_origin + filter_x;
const int in_y = in_y_origin + filter_y;

const int input_data_offset =
((batch * input_shape.dims(1) + in_y) * input_shape.dims(2) + in_x) *
input_shape.dims(3) +
channel;

total += input_data[input_data_offset];
filter_count++;
}
}
const int output_data_offset =
((batch * output_shape.dims(1) + out_y) * output_shape.dims(2) + out_x) *
output_shape.dims(3) +
channel;

assert(filter_count != 0);
const float average = total / filter_count;

output_data[output_data_offset] =
std::min(std::max(average, params.float_activation_min), params.float_activation_max);
}
}
}
}
}
} // namespace luci_interpreter_pal

#endif // LUCI_INTERPRETER_PAL_AVERAGE_POOL_2D_COMMON_H
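
The pooling loop above indexes NHWC tensors and clamps the filter window by hand. A small standalone sketch of those two pieces, kept independent of the interpreter types and purely illustrative:

#include <algorithm>

// Flat offset of element (b, y, x, c) in an NHWC tensor with height H, width W,
// depth C; this is the ((b * H + y) * W + x) * C + c pattern used in the kernel.
inline int nhwcOffset(int b, int y, int x, int c, int H, int W, int C)
{
  return ((b * H + y) * W + x) * C + c;
}

// Clamp a filter window whose origin may fall outside the input, mirroring the
// filter_x_start / filter_x_end (and filter_y_*) computation above.
inline void clampWindow(int in_origin, int filter_extent, int input_extent, int *start, int *end)
{
  *start = std::max(0, -in_origin);
  *end = std::min(filter_extent, input_extent - in_origin);
}

// Example: a 3-wide window at in_x_origin = -1 on a 4-wide input is clamped to
// filter_x in [1, 3), so only the two in-bounds columns enter the average and
// filter_count matches the divisor actually used.
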
201 changes: 201 additions & 0 deletions onert-micro/luci-interpreter/pal/common/PALConv2DCommon.h
@@ -0,0 +1,201 @@
/*
* Copyright (c) 2023 Samsung Electronics Co., Ltd. All Rights Reserved
* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

#ifndef LUCI_INTERPRETER_PAL_CONV2D_COMMON_H
#define LUCI_INTERPRETER_PAL_CONV2D_COMMON_H
#include "Params.h"
#include "PALUtils.h"

namespace luci_interpreter_pal
{
static inline void Conv(const ConvParams &params, const int32_t *input_shape,
const float *input_data, const int32_t *filter_shape,
const float *filter_data, const float *bias_data,
const int32_t *output_shape, float *output_data)
{
const int stride_width = params.stride_width;
const int stride_height = params.stride_height;
const int dilation_width_factor = params.dilation_width_factor;
const int dilation_height_factor = params.dilation_height_factor;
const int pad_width = params.padding_values.width;
const int pad_height = params.padding_values.height;
const float output_activation_min = params.float_activation_min;
const float output_activation_max = params.float_activation_max;

const auto batches = input_shape[0];
const int input_height = input_shape[1];
const int input_width = input_shape[2];
const int input_depth = input_shape[3];
const int output_depth = filter_shape[0];
const int filter_height = filter_shape[1];
const int filter_width = filter_shape[2];
const int output_height = output_shape[1];
const int output_width = output_shape[2];
for (int batch = 0; batch < batches; ++batch)
{
for (int out_y = 0; out_y < output_height; ++out_y)
{
const int in_y_origin = (out_y * stride_height) - pad_height;
for (int out_x = 0; out_x < output_width; ++out_x)
{
const int in_x_origin = (out_x * stride_width) - pad_width;
for (int out_channel = 0; out_channel < output_depth; ++out_channel)
{
float total = 0.f;
for (int filter_y = 0; filter_y < filter_height; ++filter_y)
{
const int in_y = in_y_origin + dilation_height_factor * filter_y;
for (int filter_x = 0; filter_x < filter_width; ++filter_x)
{
const int in_x = in_x_origin + dilation_width_factor * filter_x;

// Zero padding by omitting the areas outside the image.
const bool is_point_inside_image =
(in_x >= 0) && (in_x < input_width) && (in_y >= 0) && (in_y < input_height);

if (!is_point_inside_image)
{
continue;
}

for (int in_channel = 0; in_channel < input_depth; ++in_channel)
{
const int input_data_offset =
((batch * input_height + in_y) * input_width + in_x) * input_depth + in_channel;

const int filter_data_offset =
((out_channel * filter_height + filter_y) * filter_width + filter_x) *
input_depth +
in_channel;

const float input_value = input_data[input_data_offset];
const float filter_value = filter_data[filter_data_offset];
total += (input_value * filter_value);
}
}
}
// float bias_value = 0.0f;
if (bias_data)
{
total += bias_data[out_channel];
}

const int output_data_offset =
((batch * output_height + out_y) * output_width + out_x) * output_depth + out_channel;

output_data[output_data_offset] =
std::min(std::max(total, output_activation_min), output_activation_max);
}
}
}
}
}

static inline void Conv(const ConvParams &params, const int32_t *input_shape,
const uint8_t *input_data, const int32_t *filter_shape,
const uint8_t *filter_data, const int32_t *bias_data,
const int32_t *output_shape, uint8_t *output_data)
{
const int stride_width = params.stride_width;
const int stride_height = params.stride_height;
const int dilation_width_factor = params.dilation_width_factor;
const int dilation_height_factor = params.dilation_height_factor;
const int pad_width = params.padding_values.width;
const int pad_height = params.padding_values.height;
const int32_t input_offset = params.input_offset;
const int32_t filter_offset = params.weights_offset;
const int32_t output_offset = params.output_offset;
const int32_t output_multiplier = params.output_multiplier;
const int output_shift = params.output_shift;
const int32_t output_activation_min = params.quantized_activation_min;
const int32_t output_activation_max = params.quantized_activation_max;

const auto batches = input_shape[0];
const int input_height = input_shape[1];
const int input_width = input_shape[2];
const int input_depth = input_shape[3];
const int output_depth = filter_shape[0];
const int filter_height = filter_shape[1];
const int filter_width = filter_shape[2];
const int output_height = output_shape[1];
const int output_width = output_shape[2];

for (int batch = 0; batch < batches; ++batch)
{
for (int out_y = 0; out_y < output_height; ++out_y)
{
const int in_y_origin = (out_y * stride_height) - pad_height;
for (int out_x = 0; out_x < output_width; ++out_x)
{
const int in_x_origin = (out_x * stride_width) - pad_width;
for (int out_channel = 0; out_channel < output_depth; ++out_channel)
{
int32_t acc = 0;
for (int filter_y = 0; filter_y < filter_height; ++filter_y)
{
const int in_y = in_y_origin + dilation_height_factor * filter_y;
for (int filter_x = 0; filter_x < filter_width; ++filter_x)
{
const int in_x = in_x_origin + dilation_width_factor * filter_x;

// Zero padding by omitting the areas outside the image.
const bool is_point_inside_image =
(in_x >= 0) && (in_x < input_width) && (in_y >= 0) && (in_y < input_height);

if (!is_point_inside_image)
{
continue;
}

for (int in_channel = 0; in_channel < input_depth; ++in_channel)
{
const int input_data_offset =
((batch * input_height + in_y) * input_width + in_x) * input_depth + in_channel;

const int filter_data_offset =
((out_channel * filter_height + filter_y) * filter_width + filter_x) *
input_depth +
in_channel;

const int32_t input_val = input_data[input_data_offset];
const int32_t filter_val = filter_data[filter_data_offset];
acc += (filter_val + filter_offset) * (input_val + input_offset);
}
}
}
if (bias_data)
{
acc += bias_data[out_channel];
}
acc = multiplyByQuantizedMultiplier(acc, output_multiplier, output_shift);
acc += output_offset;
acc = std::max(acc, output_activation_min);
acc = std::min(acc, output_activation_max);

const int output_data_offset =
((batch * output_height + out_y) * output_width + out_x) * output_depth + out_channel;

output_data[output_data_offset] = static_cast<uint8_t>(acc);
}
}
}
}
}

} // namespace luci_interpreter_pal

#endif // LUCI_INTERPRETER_PAL_CONV2D_COMMON_H
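
For context, a minimal sketch of calling the float Conv overload above with raw NHWC input / OHWI filter shape arrays. The ConvParams field names match those read inside the kernel; the 1x1 convolution data and the brace-initialization of ConvParams are illustrative assumptions, not part of this commit.

#include <cstdint>
#include <limits>
#include "PALConv2DCommon.h"

// Hypothetical usage: 1x2x2x1 input, a single 1x1 filter, stride 1, no padding.
void conv_float_example()
{
  const int32_t input_shape[4] = {1, 2, 2, 1};  // NHWC
  const int32_t filter_shape[4] = {1, 1, 1, 1}; // OHWI
  const int32_t output_shape[4] = {1, 2, 2, 1}; // NHWC

  const float input[4] = {1.f, 2.f, 3.f, 4.f};
  const float filter[1] = {2.f};
  const float bias[1] = {0.5f};
  float output[4] = {};

  luci_interpreter_pal::ConvParams params{}; // aggregate initialization assumed
  params.stride_width = 1;
  params.stride_height = 1;
  params.dilation_width_factor = 1;
  params.dilation_height_factor = 1;
  params.padding_values.width = 0;
  params.padding_values.height = 0;
  params.float_activation_min = std::numeric_limits<float>::lowest();
  params.float_activation_max = std::numeric_limits<float>::max();

  luci_interpreter_pal::Conv(params, input_shape, input, filter_shape, filter, bias,
                             output_shape, output);
  // Each output element is input * 2 + 0.5: {2.5, 4.5, 6.5, 8.5}.
}
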