[onert-micro] Refactor Minimum kernel (Samsung#12044)
This commit refactors the Minimum kernel.

ONE-DCO-1.0-Signed-off-by: Vyacheslav Bazhenov <[email protected]>

Co-authored-by: Vyacheslav Bazhenov <[email protected]>
SlavikMIPT and Vyacheslav Bazhenov authored Nov 21, 2023
1 parent 9e89d52 commit d292936
Showing 3 changed files with 81 additions and 44 deletions.
72 changes: 72 additions & 0 deletions onert-micro/luci-interpreter/pal/common/Broadcast.h
@@ -0,0 +1,72 @@
/*
* Copyright (c) 2023 Samsung Electronics Co., Ltd. All Rights Reserved
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

#ifndef LUCI_INTERPRETER_BROADCAST_H
#define LUCI_INTERPRETER_BROADCAST_H

#include <functional>
#include <luci_interpreter/core/Tensor.h>
#include "ProcessBroadcastShapes.h"
namespace luci_interpreter_pal
{
template <typename T>
inline void
BroadcastTISO4DSlow(const luci_interpreter::RuntimeShape &input1_shape, const T *input1_data,
                    const luci_interpreter::RuntimeShape &input2_shape, const T *input2_data,
                    const luci_interpreter::RuntimeShape &output_shape, T *output_data,
                    std::function<const T &(const T &, const T &)> func)
{
  NdArrayDesc<4> desc1;
  NdArrayDesc<4> desc2;
  NdArrayDescsForElementwiseBroadcast(input1_shape, input2_shape, &desc1, &desc2);

  const luci_interpreter::RuntimeShape extended_output_shape =
    luci_interpreter::RuntimeShape::extendedShape(4, output_shape);

  // In Tensorflow, the dimensions are canonically named (batch_number, row,
  // col, channel), with extents (batches, height, width, depth), with the
  // trailing dimension changing most rapidly (channels has the smallest stride,
  // typically 1 element).
  //
  // In generated C code, we store arrays with the dimensions reversed. The
  // first dimension has smallest stride.
  //
  // We name our variables by their Tensorflow convention, but generate C code
  // nesting loops such that the innermost loop has the smallest stride for the
  // best cache behavior.

  for (int b = 0; b < extended_output_shape.dims(0); ++b)
  {
    for (int y = 0; y < extended_output_shape.dims(1); ++y)
    {
      for (int x = 0; x < extended_output_shape.dims(2); ++x)
      {
        for (int c = 0; c < extended_output_shape.dims(3); ++c)
        {
          const int output_data_offset =
            ((b * extended_output_shape.dims(1) + y) * extended_output_shape.dims(2) + x) *
              extended_output_shape.dims(3) +
            c;

          output_data[output_data_offset] = func(input1_data[subscriptToIndex(desc1, b, y, x, c)],
                                                 input2_data[subscriptToIndex(desc2, b, y, x, c)]);
        }
      }
    }
  }
}
} // namespace luci_interpreter_pal
#endif // LUCI_INTERPRETER_BROADCAST_H
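
The helper above is a generic two-input, single-output (TISO) broadcast loop: the element-wise operation itself is passed in as a functor, so other binary kernels can reuse it. As a hedged illustration only (this Maximum variant is hypothetical and not part of the commit, and it assumes <algorithm> is available for std::max), another kernel could delegate to the same helper:

// Hypothetical sketch, not part of this commit: reusing BroadcastTISO4DSlow for Maximum.
template <typename T>
inline void
BroadcastMaximum4DSlow(const luci_interpreter::RuntimeShape &input1_shape, const T *input1_data,
                       const luci_interpreter::RuntimeShape &input2_shape, const T *input2_data,
                       const luci_interpreter::RuntimeShape &output_shape, T *output_data)
{
  // std::max, like std::min, returns a const reference to one of its arguments,
  // so the lambda matches std::function<const T &(const T &, const T &)>.
  auto func = [](const T &a, const T &b) -> const T & { return std::max(a, b); };
  BroadcastTISO4DSlow<T>(input1_shape, input1_data, input2_shape, input2_data, output_shape,
                         output_data, func);
}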
51 changes: 8 additions & 43 deletions onert-micro/luci-interpreter/pal/common/PALMinimumCommon.h
@@ -21,6 +21,7 @@
#include "Params.h"
#include "PALUtils.h"
#include "ProcessBroadcastShapes.h"
#include "Broadcast.h"

namespace luci_interpreter_pal
{
@@ -33,52 +34,16 @@ inline void Minimum(const int flat_size, const float *input1_data, const float *
}
}

template <typename T>
inline void
BroadcastMinimum4DSlow(const luci_interpreter::RuntimeShape &input1_shape, const float *input1_data,
const luci_interpreter::RuntimeShape &input2_shape, const float *input2_data,
const luci_interpreter::RuntimeShape &output_shape, float *output_data)
BroadcastMinimum4DSlow(const luci_interpreter::RuntimeShape &input1_shape, const T *input1_data,
const luci_interpreter::RuntimeShape &input2_shape, const T *input2_data,
const luci_interpreter::RuntimeShape &output_shape, T *output_data)
{
NdArrayDesc<4> desc1;
NdArrayDesc<4> desc2;
NdArrayDescsForElementwiseBroadcast(input1_shape, input2_shape, &desc1, &desc2);

const luci_interpreter::RuntimeShape extended_output_shape =
luci_interpreter::RuntimeShape::extendedShape(4, output_shape);

// In Tensorflow, the dimensions are canonically named (batch_number, row,
// col, channel), with extents (batches, height, width, depth), with the
// trailing dimension changing most rapidly (channels has the smallest stride,
// typically 1 element).
//
// In generated C code, we store arrays with the dimensions reversed. The
// first dimension has smallest stride.
//
// We name our variables by their Tensorflow convention, but generate C code
// nesting loops such that the innermost loop has the smallest stride for the
// best cache behavior.

for (int b = 0; b < extended_output_shape.dims(0); ++b)
{
for (int y = 0; y < extended_output_shape.dims(1); ++y)
{
for (int x = 0; x < extended_output_shape.dims(2); ++x)
{
for (int c = 0; c < extended_output_shape.dims(3); ++c)
{
const int output_data_offset =
((b * extended_output_shape.dims(1) + y) * extended_output_shape.dims(2) + x) *
extended_output_shape.dims(3) +
c;

output_data[output_data_offset] =
std::min(input1_data[subscriptToIndex(desc1, b, y, x, c)],
input2_data[subscriptToIndex(desc2, b, y, x, c)]);
}
}
}
}
auto func = [](const T &a, const T &b) -> const T & { return std::min(a, b); };
BroadcastTISO4DSlow<float>(input1_shape, input1_data, input2_shape, input2_data, output_shape,
output_data, func);
}

} // namespace luci_interpreter_pal

#endif // LUCI_INTERPRETER_PAL_MINIMUM_COMMON_H
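
For reference, the output_data_offset arithmetic that both the removed loop and the new helper use is a plain row-major flattening of the (b, y, x, c) subscript over the extended 4-D output shape. A minimal standalone sketch with an assumed shape (illustration only, not from the commit):

#include <cassert>

int main()
{
  // Assumed extended output shape {batches, height, width, depth} = {2, 3, 4, 5}.
  const int dims[4] = {2, 3, 4, 5};
  const int b = 1, y = 2, x = 3, c = 4; // last element of the buffer
  const int offset = ((b * dims[1] + y) * dims[2] + x) * dims[3] + c;
  // ((1 * 3 + 2) * 4 + 3) * 5 + 4 = 119, i.e. the last index of a 2*3*4*5 = 120 element buffer.
  assert(offset == 119);
  return 0;
}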
2 changes: 1 addition & 1 deletion onert-micro/luci-interpreter/src/kernels/Minimum.cpp
@@ -81,7 +81,7 @@ void execute_kernel_CircleMinimum(const circle::Operator *cur_op, BaseRuntimeGra
}
else
{
luci_interpreter_pal::BroadcastMinimum4DSlow(
luci_interpreter_pal::BroadcastMinimum4DSlow<float>(
input_shape1, kernels::getTensorData<float>(input_data1), input_shape2,
kernels::getTensorData<float>(input_data2), output_shape,
kernels::getTensorData<float>(output_data));