From bfdd5fcf19aa89e9386c558891042e1858176ce2 Mon Sep 17 00:00:00 2001 From: Artem Balyshev Date: Fri, 4 Aug 2023 14:24:43 +0300 Subject: [PATCH] [onert-micro] Replace common kernels impl to pal/common This pr replaces common kernels impl to pal/common dir. ONE-DCO-1.0-Signed-off-by: Artem Balyshev --- .../pal/common/PALAddCommon.h | 94 +++ .../pal/common/PALAveragePool2DCommon.h | 93 +++ .../pal/common/PALConv2DCommon.h | 201 +++++++ .../pal/{mcu => common}/PALDiv.h | 6 +- .../pal/common/PALFullyConnectedCommon.h | 103 ++++ .../pal/common/PALMaxPool2DCommon.h | 146 +++++ .../pal/common/PALMulCommon.h | 115 ++++ .../pal/{mcu => common}/PALResizeBilinear.h | 6 +- .../pal/{mcu => common}/PALSoftmax.h | 8 +- .../pal/{mcu => common}/PALSub.h | 6 +- .../PALUnidirectionalSequenceLSTMCommon.h | 567 ++++++++++++++++++ onert-micro/luci-interpreter/pal/mcu/PALAdd.h | 71 +-- .../pal/mcu/PALAveragePool2D.h | 69 +-- .../luci-interpreter/pal/mcu/PALConv2d.h | 179 +----- .../pal/mcu/PALFullyConnected.h | 93 +-- .../luci-interpreter/pal/mcu/PALMaxPool2D.h | 122 +--- onert-micro/luci-interpreter/pal/mcu/PALMul.h | 91 +-- .../pal/mcu/PALUnidirectionalSequenceLSTM.h | 505 +--------------- 18 files changed, 1391 insertions(+), 1084 deletions(-) create mode 100644 onert-micro/luci-interpreter/pal/common/PALAddCommon.h create mode 100644 onert-micro/luci-interpreter/pal/common/PALAveragePool2DCommon.h create mode 100644 onert-micro/luci-interpreter/pal/common/PALConv2DCommon.h rename onert-micro/luci-interpreter/pal/{mcu => common}/PALDiv.h (97%) create mode 100644 onert-micro/luci-interpreter/pal/common/PALFullyConnectedCommon.h create mode 100644 onert-micro/luci-interpreter/pal/common/PALMaxPool2DCommon.h create mode 100644 onert-micro/luci-interpreter/pal/common/PALMulCommon.h rename onert-micro/luci-interpreter/pal/{mcu => common}/PALResizeBilinear.h (97%) rename onert-micro/luci-interpreter/pal/{mcu => common}/PALSoftmax.h (92%) rename onert-micro/luci-interpreter/pal/{mcu => 
common}/PALSub.h (96%) create mode 100644 onert-micro/luci-interpreter/pal/common/PALUnidirectionalSequenceLSTMCommon.h diff --git a/onert-micro/luci-interpreter/pal/common/PALAddCommon.h b/onert-micro/luci-interpreter/pal/common/PALAddCommon.h new file mode 100644 index 00000000000..57f9b107e46 --- /dev/null +++ b/onert-micro/luci-interpreter/pal/common/PALAddCommon.h @@ -0,0 +1,94 @@ +/* + * Copyright (c) 2023 Samsung Electronics Co., Ltd. All Rights Reserved + * Copyright 2017 The TensorFlow Authors. All Rights Reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#ifndef LUCI_INTERPRETER_PAL_ADD_COMMON_H +#define LUCI_INTERPRETER_PAL_ADD_COMMON_H + +#include "Params.h" +#include "PALUtils.h" +#include "ProcessBroadcastShapes.h" + +namespace luci_interpreter_pal +{ + +// TODO: check if there real activation value +template +inline void Add(const ArithmeticParams ¶ms, const int flat_size, const T *input1_data, + const T *input2_data, T *output_data) +{ + T activation_min, activation_max; + getActivationParams(params, &activation_min, &activation_max); + + for (int i = 0; i < flat_size; ++i) + output_data[i] = + std::min(std::max(input1_data[i] + input2_data[i], activation_min), activation_max); +} + +template +inline void +BroadcastAdd4DSlow(const ArithmeticParams ¶ms, + const luci_interpreter::RuntimeShape &input1_shape, const T *input1_data, + const luci_interpreter::RuntimeShape &input2_shape, const T *input2_data, + const luci_interpreter::RuntimeShape &output_shape, T *output_data) +{ + NdArrayDesc<4> desc1; + NdArrayDesc<4> desc2; + NdArrayDescsForElementwiseBroadcast(input1_shape, input2_shape, &desc1, &desc2); + const luci_interpreter::RuntimeShape extended_output_shape = + luci_interpreter::RuntimeShape::extendedShape(4, output_shape); + + T activation_min, activation_max; + getActivationParams(params, &activation_min, &activation_max); + + // In Tensorflow, the dimensions are canonically named (batch_number, row, + // col, channel), with extents (batches, height, width, depth), with the + // trailing dimension changing most rapidly (channels has the smallest stride, + // typically 1 element). + // + // In generated C code, we store arrays with the dimensions reversed. The + // first dimension has smallest stride. + // + // We name our variables by their Tensorflow convention, but generate C code + // nesting loops such that the innermost loop has the smallest stride for the + // best cache behavior. 
+ for (int b = 0; b < extended_output_shape.dims(0); ++b) + { + for (int y = 0; y < extended_output_shape.dims(1); ++y) + { + for (int x = 0; x < extended_output_shape.dims(2); ++x) + { + for (int c = 0; c < extended_output_shape.dims(3); ++c) + { + const int output_data_offset = + ((b * extended_output_shape.dims(1) + y) * extended_output_shape.dims(2) + x) * + extended_output_shape.dims(3) + + c; + + output_data[output_data_offset] = + std::min(std::max(input1_data[subscriptToIndex(desc1, b, y, x, c)] + + input2_data[subscriptToIndex(desc2, b, y, x, c)], + activation_min), + activation_max); + } + } + } + } +} + +} // namespace luci_interpreter_pal + +#endif // LUCI_INTERPRETER_PAL_ADD_COMMON_H diff --git a/onert-micro/luci-interpreter/pal/common/PALAveragePool2DCommon.h b/onert-micro/luci-interpreter/pal/common/PALAveragePool2DCommon.h new file mode 100644 index 00000000000..ec6bb55b5b5 --- /dev/null +++ b/onert-micro/luci-interpreter/pal/common/PALAveragePool2DCommon.h @@ -0,0 +1,93 @@ +/* + * Copyright (c) 2023 Samsung Electronics Co., Ltd. All Rights Reserved + * Copyright 2020 The TensorFlow Authors. All Rights Reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#ifndef LUCI_INTERPRETER_PAL_AVERAGE_POOL_2D_COMMON_H +#define LUCI_INTERPRETER_PAL_AVERAGE_POOL_2D_COMMON_H + +#include "Params.h" +#include "PALUtils.h" + +namespace luci_interpreter_pal +{ + +// TODO: reduce code duplication with MaxPool +inline void AveragePool(const PoolParams ¶ms, const luci_interpreter::RuntimeShape &input_shape, + const float *input_data, const luci_interpreter::RuntimeShape &output_shape, + float *output_data) +{ + const int batches = input_shape.dims(0); + const int depth = output_shape.dims(3); + const int input_height = input_shape.dims(1); + const int input_width = input_shape.dims(2); + const int output_height = output_shape.dims(1); + const int output_width = output_shape.dims(2); + const int stride_height = params.stride_height; + const int stride_width = params.stride_width; + for (int batch = 0; batch < batches; ++batch) + { + for (int out_y = 0; out_y < output_height; ++out_y) + { + for (int out_x = 0; out_x < output_width; ++out_x) + { + for (int channel = 0; channel < depth; ++channel) + { + const int in_x_origin = (out_x * stride_width) - params.padding_values.width; + const int in_y_origin = (out_y * stride_height) - params.padding_values.height; + // Compute the boundaries of the filter region clamped so as to + // ensure that the filter window fits in the input array. 
+ const int filter_x_start = std::max(0, -in_x_origin); + const int filter_x_end = std::min(params.filter_width, input_width - in_x_origin); + const int filter_y_start = std::max(0, -in_y_origin); + const int filter_y_end = std::min(params.filter_height, input_height - in_y_origin); + + float total = 0.f; + float filter_count = 0; + + for (int filter_y = filter_y_start; filter_y < filter_y_end; ++filter_y) + { + for (int filter_x = filter_x_start; filter_x < filter_x_end; ++filter_x) + { + const int in_x = in_x_origin + filter_x; + const int in_y = in_y_origin + filter_y; + + const int input_data_offset = + ((batch * input_shape.dims(1) + in_y) * input_shape.dims(2) + in_x) * + input_shape.dims(3) + + channel; + + total += input_data[input_data_offset]; + filter_count++; + } + } + const int output_data_offset = + ((batch * output_shape.dims(1) + out_y) * output_shape.dims(2) + out_x) * + output_shape.dims(3) + + channel; + + assert(filter_count != 0); + const float average = total / filter_count; + + output_data[output_data_offset] = + std::min(std::max(average, params.float_activation_min), params.float_activation_max); + } + } + } + } +} +} // namespace luci_interpreter_pal + +#endif // LUCI_INTERPRETER_PAL_AVERAGE_POOL_2D_COMMON_H diff --git a/onert-micro/luci-interpreter/pal/common/PALConv2DCommon.h b/onert-micro/luci-interpreter/pal/common/PALConv2DCommon.h new file mode 100644 index 00000000000..04b92cd48e2 --- /dev/null +++ b/onert-micro/luci-interpreter/pal/common/PALConv2DCommon.h @@ -0,0 +1,201 @@ +/* + * Copyright (c) 2023 Samsung Electronics Co., Ltd. All Rights Reserved + * Copyright 2017 The TensorFlow Authors. All Rights Reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef LUCI_INTERPRETER_PAL_CONV2D_COMMON_H +#define LUCI_INTERPRETER_PAL_CONV2D_COMMON_H +#include "Params.h" +#include "PALUtils.h" + +namespace luci_interpreter_pal +{ +static inline void Conv(const ConvParams ¶ms, const int32_t *input_shape, + const float *input_data, const int32_t *filter_shape, + const float *filter_data, const float *bias_data, + const int32_t *output_shape, float *output_data) +{ + const int stride_width = params.stride_width; + const int stride_height = params.stride_height; + const int dilation_width_factor = params.dilation_width_factor; + const int dilation_height_factor = params.dilation_height_factor; + const int pad_width = params.padding_values.width; + const int pad_height = params.padding_values.height; + const float output_activation_min = params.float_activation_min; + const float output_activation_max = params.float_activation_max; + + const auto batches = input_shape[0]; + const int input_height = input_shape[1]; + const int input_width = input_shape[2]; + const int input_depth = input_shape[3]; + const int output_depth = filter_shape[0]; + const int filter_height = filter_shape[1]; + const int filter_width = filter_shape[2]; + const int output_height = output_shape[1]; + const int output_width = output_shape[2]; + for (int batch = 0; batch < batches; ++batch) + { + for (int out_y = 0; out_y < output_height; ++out_y) + { + const int in_y_origin = (out_y * stride_height) - pad_height; + for (int out_x = 0; out_x < output_width; ++out_x) + { + const int in_x_origin = (out_x * stride_width) - pad_width; 
+ for (int out_channel = 0; out_channel < output_depth; ++out_channel) + { + float total = 0.f; + for (int filter_y = 0; filter_y < filter_height; ++filter_y) + { + const int in_y = in_y_origin + dilation_height_factor * filter_y; + for (int filter_x = 0; filter_x < filter_width; ++filter_x) + { + const int in_x = in_x_origin + dilation_width_factor * filter_x; + + // Zero padding by omitting the areas outside the image. + const bool is_point_inside_image = + (in_x >= 0) && (in_x < input_width) && (in_y >= 0) && (in_y < input_height); + + if (!is_point_inside_image) + { + continue; + } + + for (int in_channel = 0; in_channel < input_depth; ++in_channel) + { + const int input_data_offset = + ((batch * input_height + in_y) * input_width + in_x) * input_depth + in_channel; + + const int filter_data_offset = + ((out_channel * filter_height + filter_y) * filter_width + filter_x) * + input_depth + + in_channel; + + const float input_value = input_data[input_data_offset]; + const float filter_value = filter_data[filter_data_offset]; + total += (input_value * filter_value); + } + } + } + // float bias_value = 0.0f; + if (bias_data) + { + total += bias_data[out_channel]; + } + + const int output_data_offset = + ((batch * output_height + out_y) * output_width + out_x) * output_depth + out_channel; + + output_data[output_data_offset] = + std::min(std::max(total, output_activation_min), output_activation_max); + } + } + } + } +} + +static inline void Conv(const ConvParams ¶ms, const int32_t *input_shape, + const uint8_t *input_data, const int32_t *filter_shape, + const uint8_t *filter_data, const int32_t *bias_data, + const int32_t *output_shape, uint8_t *output_data) +{ + const int stride_width = params.stride_width; + const int stride_height = params.stride_height; + const int dilation_width_factor = params.dilation_width_factor; + const int dilation_height_factor = params.dilation_height_factor; + const int pad_width = params.padding_values.width; + const int pad_height = 
params.padding_values.height; + const int32_t input_offset = params.input_offset; + const int32_t filter_offset = params.weights_offset; + const int32_t output_offset = params.output_offset; + const int32_t output_multiplier = params.output_multiplier; + const int output_shift = params.output_shift; + const int32_t output_activation_min = params.quantized_activation_min; + const int32_t output_activation_max = params.quantized_activation_max; + + const auto batches = input_shape[0]; + const int input_height = input_shape[1]; + const int input_width = input_shape[2]; + const int input_depth = input_shape[3]; + const int output_depth = filter_shape[0]; + const int filter_height = filter_shape[1]; + const int filter_width = filter_shape[2]; + const int output_height = output_shape[1]; + const int output_width = output_shape[2]; + + for (int batch = 0; batch < batches; ++batch) + { + for (int out_y = 0; out_y < output_height; ++out_y) + { + const int in_y_origin = (out_y * stride_height) - pad_height; + for (int out_x = 0; out_x < output_width; ++out_x) + { + const int in_x_origin = (out_x * stride_width) - pad_width; + for (int out_channel = 0; out_channel < output_depth; ++out_channel) + { + int32_t acc = 0; + for (int filter_y = 0; filter_y < filter_height; ++filter_y) + { + const int in_y = in_y_origin + dilation_height_factor * filter_y; + for (int filter_x = 0; filter_x < filter_width; ++filter_x) + { + const int in_x = in_x_origin + dilation_width_factor * filter_x; + + // Zero padding by omitting the areas outside the image. 
+ const bool is_point_inside_image = + (in_x >= 0) && (in_x < input_width) && (in_y >= 0) && (in_y < input_height); + + if (!is_point_inside_image) + { + continue; + } + + for (int in_channel = 0; in_channel < input_depth; ++in_channel) + { + const int input_data_offset = + ((batch * input_height + in_y) * input_width + in_x) * input_depth + in_channel; + + const int filter_data_offset = + ((out_channel * filter_height + filter_y) * filter_width + filter_x) * + input_depth + + in_channel; + + const int32_t input_val = input_data[input_data_offset]; + const int32_t filter_val = filter_data[filter_data_offset]; + acc += (filter_val + filter_offset) * (input_val + input_offset); + } + } + } + if (bias_data) + { + acc += bias_data[out_channel]; + } + acc = multiplyByQuantizedMultiplier(acc, output_multiplier, output_shift); + acc += output_offset; + acc = std::max(acc, output_activation_min); + acc = std::min(acc, output_activation_max); + + const int output_data_offset = + ((batch * output_height + out_y) * output_width + out_x) * output_depth + out_channel; + + output_data[output_data_offset] = static_cast(acc); + } + } + } + } +} + +} // namespace luci_interpreter_pal + +#endif // LUCI_INTERPRETER_PAL_CONV2D_COMMON_H diff --git a/onert-micro/luci-interpreter/pal/mcu/PALDiv.h b/onert-micro/luci-interpreter/pal/common/PALDiv.h similarity index 97% rename from onert-micro/luci-interpreter/pal/mcu/PALDiv.h rename to onert-micro/luci-interpreter/pal/common/PALDiv.h index cf84a1007e0..cca85cd224a 100644 --- a/onert-micro/luci-interpreter/pal/mcu/PALDiv.h +++ b/onert-micro/luci-interpreter/pal/common/PALDiv.h @@ -15,8 +15,8 @@ * limitations under the License. 
*/ -#ifndef LUCI_INTERPRETER_PAL_DIV_H -#define LUCI_INTERPRETER_PAL_DIV_H +#ifndef LUCI_INTERPRETER_PAL_DIV_COMMON_H +#define LUCI_INTERPRETER_PAL_DIV_COMMON_H #include "Params.h" #include "PALUtils.h" @@ -112,4 +112,4 @@ BroadcastDiv4DSlow(const ArithmeticParams ¶ms, } // namespace luci_interpreter_pal -#endif // LUCI_INTERPRETER_PAL_DIV_H +#endif // LUCI_INTERPRETER_PAL_DIV_COMMON_H diff --git a/onert-micro/luci-interpreter/pal/common/PALFullyConnectedCommon.h b/onert-micro/luci-interpreter/pal/common/PALFullyConnectedCommon.h new file mode 100644 index 00000000000..14934cc720a --- /dev/null +++ b/onert-micro/luci-interpreter/pal/common/PALFullyConnectedCommon.h @@ -0,0 +1,103 @@ +/* + * Copyright (c) 2023 Samsung Electronics Co., Ltd. All Rights Reserved + * Copyright 2020 The TensorFlow Authors. All Rights Reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#ifndef LUCI_INTERPRETER_PAL_FULLY_CONNECTED_COMMON_H +#define LUCI_INTERPRETER_PAL_FULLY_CONNECTED_COMMON_H + +#include "PALUtils.h" +#include "Params.h" + +namespace luci_interpreter_pal +{ + +template +inline void FullyConnected(const FullyConnectedParams ¶ms, const int32_t *input_shape, + const InputType *input_data, const int32_t *filter_shape, + const WeightType *filter_data, const BiasType *bias_data, + const int32_t *output_shape, OutputType *output_data) +{ + const int32_t input_offset = params.input_offset; + const int32_t filter_offset = params.weights_offset; + const int32_t output_offset = params.output_offset; + const int32_t output_multiplier = params.output_multiplier; + const int output_shift = params.output_shift; + const int32_t output_activation_min = params.quantized_activation_min; + const int32_t output_activation_max = params.quantized_activation_max; + + const int batches = input_shape[0]; + const int output_depth = output_shape[1]; + const int accum_depth = filter_shape[1]; + + for (int b = 0; b < batches; ++b) + { + for (int out_c = 0; out_c < output_depth; ++out_c) + { + BiasType acc = 0; + for (int d = 0; d < accum_depth; ++d) + { + int32_t input_val = input_data[b * accum_depth + d]; + int32_t filter_val = filter_data[out_c * accum_depth + d]; + acc += (filter_val + filter_offset) * (input_val + input_offset); + } + if (bias_data) + { + acc += bias_data[out_c]; + } + int32_t acc_scaled = multiplyByQuantizedMultiplier(acc, output_multiplier, output_shift); + acc_scaled += output_offset; + acc_scaled = std::max(acc_scaled, output_activation_min); + acc_scaled = std::min(acc_scaled, output_activation_max); + output_data[out_c + output_depth * b] = static_cast(acc_scaled); + } + } +} +template <> +inline void FullyConnected(const FullyConnectedParams ¶ms, const int32_t *input_shape, + const float *input_data, const int32_t *filter_shape, + const float *filter_data, const float *bias_data, + const int32_t *output_shape, float 
*output_data) +{ + const float output_activation_min = params.float_activation_min; + const float output_activation_max = params.float_activation_max; + + const int batches = input_shape[0]; + const int output_depth = output_shape[1]; + const int accum_depth = filter_shape[1]; + + for (int b = 0; b < batches; ++b) + { + for (int out_c = 0; out_c < output_depth; ++out_c) + { + float total = 0.f; + for (int d = 0; d < accum_depth; ++d) + { + total += input_data[b * accum_depth + d] * filter_data[out_c * accum_depth + d]; + } + float bias_value = 0.0f; + if (bias_data) + { + bias_value = bias_data[out_c]; + } + output_data[out_c + output_depth * b] = + std::min(std::max(total + bias_value, output_activation_min), output_activation_max); + } + } +} + +} // namespace luci_interpreter_pal + +#endif // LUCI_INTERPRETER_PAL_FULLY_CONNECTED_COMMON_H diff --git a/onert-micro/luci-interpreter/pal/common/PALMaxPool2DCommon.h b/onert-micro/luci-interpreter/pal/common/PALMaxPool2DCommon.h new file mode 100644 index 00000000000..034319b8aea --- /dev/null +++ b/onert-micro/luci-interpreter/pal/common/PALMaxPool2DCommon.h @@ -0,0 +1,146 @@ +/* + * Copyright (c) 2023 Samsung Electronics Co., Ltd. All Rights Reserved + * Copyright 2020 The TensorFlow Authors. All Rights Reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#ifndef LUCI_INTERPRETER_PAL_MAX_POOL_2D_COMMON_H +#define LUCI_INTERPRETER_PAL_MAX_POOL_2D_COMMON_H + +#include "Params.h" +#include "PALUtils.h" + +namespace luci_interpreter_pal +{ + +inline void MaxPool(const PoolParams ¶ms, const luci_interpreter::RuntimeShape &input_shape, + const float *input_data, const luci_interpreter::RuntimeShape &output_shape, + float *output_data) +{ + const int batches = input_shape.dims(0); + const int depth = output_shape.dims(3); + const int input_height = input_shape.dims(1); + const int input_width = input_shape.dims(2); + const int output_height = output_shape.dims(1); + const int output_width = output_shape.dims(2); + const int stride_height = params.stride_height; + const int stride_width = params.stride_width; + for (int batch = 0; batch < batches; ++batch) + { + for (int out_y = 0; out_y < output_height; ++out_y) + { + for (int out_x = 0; out_x < output_width; ++out_x) + { + for (int channel = 0; channel < depth; ++channel) + { + const int in_x_origin = (out_x * stride_width) - params.padding_values.width; + const int in_y_origin = (out_y * stride_height) - params.padding_values.height; + // Compute the boundaries of the filter region clamped so as to + // ensure that the filter window fits in the input array. 
+ const int filter_x_start = std::max(0, -in_x_origin); + const int filter_x_end = std::min(params.filter_width, input_width - in_x_origin); + const int filter_y_start = std::max(0, -in_y_origin); + const int filter_y_end = std::min(params.filter_height, input_height - in_y_origin); + float max = std::numeric_limits::lowest(); + for (int filter_y = filter_y_start; filter_y < filter_y_end; ++filter_y) + { + for (int filter_x = filter_x_start; filter_x < filter_x_end; ++filter_x) + { + const int in_x = in_x_origin + filter_x; + const int in_y = in_y_origin + filter_y; + + const int input_data_offset = + ((batch * input_shape.dims(1) + in_y) * input_shape.dims(2) + in_x) * + input_shape.dims(3) + + channel; + + max = std::max(max, input_data[input_data_offset]); + } + } + const int output_data_offset = + ((batch * output_shape.dims(1) + out_y) * output_shape.dims(2) + out_x) * + output_shape.dims(3) + + channel; + + output_data[output_data_offset] = + std::min(std::max(max, params.float_activation_min), params.float_activation_max); + } + } + } + } +} + +template +inline void MaxPool(const PoolParams ¶ms, const luci_interpreter::RuntimeShape &input_shape, + const T *input_data, const luci_interpreter::RuntimeShape &output_shape, + T *output_data) +{ + const int batches = input_shape.dims(0); + const int depth = output_shape.dims(3); + const int input_height = input_shape.dims(1); + const int input_width = input_shape.dims(2); + const int output_height = output_shape.dims(1); + const int output_width = output_shape.dims(2); + const int stride_height = params.stride_height; + const int stride_width = params.stride_width; + for (int batch = 0; batch < batches; ++batch) + { + for (int out_y = 0; out_y < output_height; ++out_y) + { + for (int out_x = 0; out_x < output_width; ++out_x) + { + for (int channel = 0; channel < depth; ++channel) + { + const int in_x_origin = (out_x * stride_width) - params.padding_values.width; + const int in_y_origin = (out_y * stride_height) - 
params.padding_values.height; + // Compute the boundaries of the filter region clamped so as to + // ensure that the filter window fits in the input array. + const int filter_x_start = std::max(0, -in_x_origin); + const int filter_x_end = std::min(params.filter_width, input_width - in_x_origin); + const int filter_y_start = std::max(0, -in_y_origin); + const int filter_y_end = std::min(params.filter_height, input_height - in_y_origin); + T max = std::numeric_limits::lowest(); + for (int filter_y = filter_y_start; filter_y < filter_y_end; ++filter_y) + { + for (int filter_x = filter_x_start; filter_x < filter_x_end; ++filter_x) + { + const int in_x = in_x_origin + filter_x; + const int in_y = in_y_origin + filter_y; + + const int input_data_offset = + ((batch * input_shape.dims(1) + in_y) * input_shape.dims(2) + in_x) * + input_shape.dims(3) + + channel; + + max = std::max(max, input_data[input_data_offset]); + } + } + max = std::max(max, params.quantized_activation_min); + max = std::min(max, params.quantized_activation_max); + + const int output_data_offset = + ((batch * output_shape.dims(1) + out_y) * output_shape.dims(2) + out_x) * + output_shape.dims(3) + + channel; + + output_data[output_data_offset] = static_cast(max); + } + } + } + } +} + +} // namespace luci_interpreter_pal + +#endif // LUCI_INTERPRETER_PAL_MAX_POOL_2D_COMMON_H diff --git a/onert-micro/luci-interpreter/pal/common/PALMulCommon.h b/onert-micro/luci-interpreter/pal/common/PALMulCommon.h new file mode 100644 index 00000000000..f1710403016 --- /dev/null +++ b/onert-micro/luci-interpreter/pal/common/PALMulCommon.h @@ -0,0 +1,115 @@ +/* + * Copyright (c) 2021 Samsung Electronics Co., Ltd. All Rights Reserved + * Copyright 2019 The TensorFlow Authors. All Rights Reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef LUCI_INTERPRETER_PAL_MUL_COMMON_H +#define LUCI_INTERPRETER_PAL_MUL_COMMON_H + +#include "Params.h" +#include "PALUtils.h" +#include "ProcessBroadcastShapes.h" + +namespace luci_interpreter_pal +{ +template +inline void Mul(const ArithmeticParams ¶ms, const int flat_size, const T *input1_data, + const T *input2_data, T *output_data) +{ + T activation_min, activation_max; + getActivationParams(params, &activation_min, &activation_max); + + for (int i = 0; i < flat_size; ++i) + output_data[i] = + std::min(std::max(input1_data[i] * input2_data[i], activation_min), activation_max); +} + +template +inline void MulScalar(const ArithmeticParams ¶ms, const int flat_size, const T *input_data, + const T scalar_value, T *output_data) +{ + T activation_min, activation_max; + getActivationParams(params, &activation_min, &activation_max); + + for (int i = 0; i < flat_size; ++i) + output_data[i] = + std::min(std::max(input_data[i] * scalar_value, activation_min), activation_max); +} + +template +inline void +BroadcastMul4DSlow(const ArithmeticParams ¶ms, + const luci_interpreter::RuntimeShape &input1_shape, const T *input1_data, + const luci_interpreter::RuntimeShape &input2_shape, const T *input2_data, + const luci_interpreter::RuntimeShape &output_shape, T *output_data) +{ + const int flat_size = input1_shape.flatSize(); + + if (params.broadcast_category == BroadcastableOpCategory::kScalarFirstBroadcast) + { + return MulScalar(params, flat_size, input2_data, input1_data[0], output_data); + } + else if (params.broadcast_category == 
BroadcastableOpCategory::kScalarSecondBroadcast) + { + return MulScalar(params, flat_size, input1_data, input2_data[0], output_data); + } + + NdArrayDesc<4> desc1; + NdArrayDesc<4> desc2; + NdArrayDescsForElementwiseBroadcast(input1_shape, input2_shape, &desc1, &desc2); + const luci_interpreter::RuntimeShape extended_output_shape = + luci_interpreter::RuntimeShape::extendedShape(4, output_shape); + + T activation_min, activation_max; + getActivationParams(params, &activation_min, &activation_max); + + // In Tensorflow, the dimensions are canonically named (batch_number, row, + // col, channel), with extents (batches, height, width, depth), with the + // trailing dimension changing most rapidly (channels has the smallest stride, + // typically 1 element). + // + // In generated C code, we store arrays with the dimensions reversed. The + // first dimension has smallest stride. + // + // We name our variables by their Tensorflow convention, but generate C code + // nesting loops such that the innermost loop has the smallest stride for the + // best cache behavior. 
+ for (int b = 0; b < extended_output_shape.dims(0); ++b) + { + for (int y = 0; y < extended_output_shape.dims(1); ++y) + { + for (int x = 0; x < extended_output_shape.dims(2); ++x) + { + for (int c = 0; c < extended_output_shape.dims(3); ++c) + { + const int output_data_offset = + ((b * extended_output_shape.dims(1) + y) * extended_output_shape.dims(2) + x) * + extended_output_shape.dims(3) + + c; + + output_data[output_data_offset] = + std::min(std::max(input1_data[subscriptToIndex(desc1, b, y, x, c)] * + input2_data[subscriptToIndex(desc2, b, y, x, c)], + activation_min), + activation_max); + } + } + } + } +} + +} // namespace luci_interpreter_pal + +#endif // LUCI_INTERPRETER_PAL_MUL_COMMON_H diff --git a/onert-micro/luci-interpreter/pal/mcu/PALResizeBilinear.h b/onert-micro/luci-interpreter/pal/common/PALResizeBilinear.h similarity index 97% rename from onert-micro/luci-interpreter/pal/mcu/PALResizeBilinear.h rename to onert-micro/luci-interpreter/pal/common/PALResizeBilinear.h index b2ddfa75da4..19686b70231 100644 --- a/onert-micro/luci-interpreter/pal/mcu/PALResizeBilinear.h +++ b/onert-micro/luci-interpreter/pal/common/PALResizeBilinear.h @@ -14,8 +14,8 @@ * limitations under the License.
*/ -#ifndef LUCI_INTERPRETER_PAL_RESIZEBILINEAR_H -#define LUCI_INTERPRETER_PAL_RESIZEBILINEAR_H +#ifndef LUCI_INTERPRETER_PAL_RESIZEBILINEAR_COMMON_H +#define LUCI_INTERPRETER_PAL_RESIZEBILINEAR_COMMON_H #include "PALUtils.h" @@ -132,4 +132,4 @@ ResizeBilinear(const circle::ResizeBilinearOptions *op_params, } // namespace luci_interpreter_pal -#endif // LUCI_INTERPRETER_PAL_RESIZEBILINEAR_H +#endif // LUCI_INTERPRETER_PAL_RESIZEBILINEAR_COMMON_H diff --git a/onert-micro/luci-interpreter/pal/mcu/PALSoftmax.h b/onert-micro/luci-interpreter/pal/common/PALSoftmax.h similarity index 92% rename from onert-micro/luci-interpreter/pal/mcu/PALSoftmax.h rename to onert-micro/luci-interpreter/pal/common/PALSoftmax.h index fffd39b19f6..a67785675a6 100644 --- a/onert-micro/luci-interpreter/pal/mcu/PALSoftmax.h +++ b/onert-micro/luci-interpreter/pal/common/PALSoftmax.h @@ -15,10 +15,8 @@ * limitations under the License. */ -#ifndef LUCI_INTERPRETER_PAL_SOFTMAX_H -#define LUCI_INTERPRETER_PAL_SOFTMAX_H - -//#include +#ifndef LUCI_INTERPRETER_PAL_SOFTMAX_COMMON_H +#define LUCI_INTERPRETER_PAL_SOFTMAX_COMMON_H namespace luci_interpreter_pal { @@ -77,4 +75,4 @@ inline void Softmax(const double beta, const luci_interpreter::RuntimeShape &inp } // namespace luci_interpreter_pal -#endif // LUCI_INTERPRETER_PAL_SOFTMAX_H +#endif // LUCI_INTERPRETER_PAL_SOFTMAX_COMMON_H diff --git a/onert-micro/luci-interpreter/pal/mcu/PALSub.h b/onert-micro/luci-interpreter/pal/common/PALSub.h similarity index 96% rename from onert-micro/luci-interpreter/pal/mcu/PALSub.h rename to onert-micro/luci-interpreter/pal/common/PALSub.h index 7654a6413a1..faa94fdd394 100644 --- a/onert-micro/luci-interpreter/pal/mcu/PALSub.h +++ b/onert-micro/luci-interpreter/pal/common/PALSub.h @@ -14,8 +14,8 @@ * limitations under the License. 
*/ -#ifndef LUCI_INTERPRETER_PAL_SUB_H -#define LUCI_INTERPRETER_PAL_SUB_H +#ifndef LUCI_INTERPRETER_PAL_SUB_COMMON_H +#define LUCI_INTERPRETER_PAL_SUB_COMMON_H #include "PALUtils.h" @@ -86,4 +86,4 @@ BroadcastSub4DSlow(const ArithmeticParams ¶ms, } // namespace luci_interpreter_pal -#endif // LUCI_INTERPRETER_PAL_SUB_H +#endif // LUCI_INTERPRETER_PAL_SUB_COMMON_H diff --git a/onert-micro/luci-interpreter/pal/common/PALUnidirectionalSequenceLSTMCommon.h b/onert-micro/luci-interpreter/pal/common/PALUnidirectionalSequenceLSTMCommon.h new file mode 100644 index 00000000000..ad9631cf29b --- /dev/null +++ b/onert-micro/luci-interpreter/pal/common/PALUnidirectionalSequenceLSTMCommon.h @@ -0,0 +1,567 @@ +/* + * Copyright (c) 2023 Samsung Electronics Co., Ltd. All Rights Reserved + * Copyright 2017 The TensorFlow Authors. All Rights Reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef LUCI_INTERPRETER_PAL_UNIDIRECTIONAL_SEQUENCE_LSTM_COMMON_H +#define LUCI_INTERPRETER_PAL_UNIDIRECTIONAL_SEQUENCE_LSTM_COMMON_H + +#include "kernels/UnidirectionalSequenceLSTM.h" +#include "PALTanh.h" +#include "PALLogistic.h" +#include "PALFullyConnected.h" +#include "PALMul.h" +#include "PALUtils.h" + +namespace luci_interpreter_pal +{ +namespace lstm_internal +{ +namespace +{ +// Possible fused activation functions. 
+typedef enum +{ + kTfLiteActNone = 0, + kTfLiteActRelu, + kTfLiteActReluN1To1, // min(max(-1, x), 1) + kTfLiteActRelu6, // min(max(0, x), 6) + kTfLiteActTanh, + kTfLiteActSignBit, + kTfLiteActSigmoid, +} FusedActivation; + +} // namespace + +#ifndef DIS_QUANT + +template +void mulElementwise(int size, const ArithmeticParams *params, const InputType *input1_data, + const InputType *input2_data, OutputType *output_data) +{ + for (int i = 0; i < size; ++i) + { + const int32_t input1_val = params->input1_offset + input1_data[i]; + const int32_t input2_val = params->input2_offset + input2_data[i]; + const int32_t unclamped_result = + params->output_offset + multiplyByQuantizedMultiplier(input1_val * input2_val, + params->output_multiplier, + params->output_shift); + const int32_t clamped_output = + std::min(params->quantized_activation_max, + std::max(params->quantized_activation_min, unclamped_result)); + output_data[i] = static_cast(clamped_output); + } +} + +// Input and output have the same shape in LSTM +void mul(const luci_interpreter::RuntimeShape &shape, const ArithmeticParams *params, + const int16_t *input1_data, const int16_t *input2_data, int8_t *output_data) +{ + return mulElementwise(shape.flatSize(), params, input1_data, input2_data, + output_data); +} + +// Input and output have the same shape in LSTM +void mul(const luci_interpreter::RuntimeShape &shape, const ArithmeticParams *params, + const int16_t *input1_data, const int16_t *input2_data, int16_t *output_data) +{ + return mulElementwise(shape.flatSize(), params, input1_data, input2_data, output_data); +} + +void addElementWise(const int16_t *input_1, const int16_t *input_2, int n_batch, int n_input, + int16_t *output) +{ + for (int batch = 0; batch < n_batch; ++batch) + { + for (int i = 0; i < n_input; ++i) + { + const int index = batch * n_input + i; + int32_t sum = input_1[index] + input_2[index]; + const int32_t sum_clamped = + std::min(static_cast(std::numeric_limits::max()), + 
std::max(static_cast(std::numeric_limits::min()), sum)); + output[index] = static_cast(sum_clamped); + } + } +} + +void tanh(int32_t cell_state_scale_power, const luci_interpreter::RuntimeShape &input_data_shape, + int16_t *input_data, const luci_interpreter::RuntimeShape &output_data_shape, + int16_t *output_data) +{ + int32_t tanh_input_left_shift = (15 + cell_state_scale_power) - 3; + int32_t input_multiplier = 0; + if (tanh_input_left_shift < 0) /* handling negative shift value */ + { + tanh_input_left_shift = -tanh_input_left_shift; + input_multiplier = 3; + } + const int flat_size = input_data_shape.flatSize(); + luci_interpreter_pal::Tanh(input_multiplier, tanh_input_left_shift, flat_size, input_data, + output_data); +} + +void sigmoid(const luci_interpreter::RuntimeShape &data_shape, int16_t *data) +{ + luci_interpreter_pal::Logistic(0, 0, data_shape.flatSize(), data, data); +} + +void clipping(const int v_size, const luci_interpreter::lstm::CellStateInfo *cell_state_info, + int16_t *vector) +{ + for (int i = 0; i < v_size; i++) + { + vector[i] = std::max(std::min(cell_state_info->quantized_cell_clip, vector[i]), + static_cast(-cell_state_info->quantized_cell_clip)); + } +} +#endif // DIS_QUANT + +#ifndef DIS_FLOAT +// Input and output have the same shape in LSTM +void mul(const luci_interpreter::RuntimeShape &shape, const ArithmeticParams *params, + const float *input1_data, const float *input2_data, float *output_data) +{ + const int flat_size = shape.flatSize(); + return luci_interpreter_pal::Mul(*params, flat_size, input1_data, input2_data, output_data); +} + +void addElementWise(const float *input_1, const float *input_2, int n_batch, int n_input, + float *output) +{ + for (int batch = 0; batch < n_batch; ++batch) + { + for (int i = 0; i < n_input; ++i) + { + const int index = batch * n_input + i; + output[index] = input_1[index] + input_2[index]; + } + } +} + +void tanh(int32_t, const luci_interpreter::RuntimeShape &input_data_shape, float 
*input_data, + const luci_interpreter::RuntimeShape &output_data_shape, float *output_data) +{ + const int flat_size = input_data_shape.flatSize(); + luci_interpreter_pal::Tanh(flat_size, input_data, output_data); +} + +void sigmoid(const luci_interpreter::RuntimeShape &data_shape, float *data) +{ + const int flat_size = data_shape.flatSize(); + luci_interpreter_pal::Logistic(flat_size, data, data); +} + +void clipping(const int v_size, const luci_interpreter::lstm::CellStateInfo *cell_state_info, + float *vector) +{ + for (int i = 0; i < v_size; i++) + { + vector[i] = + std::max(std::min(cell_state_info->cell_clip, vector[i]), -cell_state_info->cell_clip); + } +} +#endif // DIS_FLOAT + +// Size information about the LSTM kernel, which is deduced from tensors stored +// in the flat buffer file. +struct LstmSizeInfo +{ + bool time_major; + int32_t batch_size; + int32_t time_steps; + int32_t input_dimension; + int32_t state_dimension; +}; + +class LstmStepManager +{ +public: + LstmStepManager() = delete; + // Does not take any ownership, and all pointers must refer to valid objects + // that outlive the one constructed. 
+ explicit LstmStepManager(const LstmSizeInfo &size_info) : size_info_(size_info) {} + + void updateTime() + { + current_time_ += 1; + // default as one batch per inference + int input_step = size_info_.input_dimension; + int output_step = size_info_.state_dimension; + // time major: batch inference + if (size_info_.time_major) + { + input_step = input_step * size_info_.batch_size; + output_step = output_step * size_info_.batch_size; + } + + input_offset_ += input_step; + output_offset_ += output_step; + } + + void updateBatch() + { + current_batch_ += 1; + // batch inference for time major: no action needed + if (size_info_.time_major) + { + return; + } + // otherwise: singe batch inference, go to the next batch + hidden_state_offset_ += size_info_.state_dimension; + cell_state_offset_ += size_info_.state_dimension; + } + + void resetTime() { current_time_ = 0; } + + luci_interpreter::RuntimeShape inputShape() const + { + int batch_size = 1; + if (size_info_.time_major) + { + batch_size = size_info_.batch_size; + } + const int dims[2] = {batch_size, size_info_.input_dimension}; + const int32_t *dims_data = reinterpret_cast(dims); + return luci_interpreter::RuntimeShape(2, dims_data); + } + + luci_interpreter::RuntimeShape stateShape() const + { + int batch_size = 1; + if (size_info_.time_major) + { + batch_size = size_info_.batch_size; + } + const int dims[2] = {batch_size, size_info_.state_dimension}; + const int32_t *dims_data = reinterpret_cast(dims); + return luci_interpreter::RuntimeShape(2, dims_data); + } + + int inputOffset() const { return input_offset_; } + + int outputOffset() const { return output_offset_; } + + int hiddenStateOffset() const { return hidden_state_offset_; } + + int cellStateOffset() const { return cell_state_offset_; } + +private: + int32_t current_time_ = 0; + int32_t current_batch_ = 0; + int32_t input_offset_ = 0; + int32_t output_offset_ = 0; + int32_t hidden_state_offset_ = 0; + int32_t cell_state_offset_ = 0; + + const 
LstmSizeInfo &size_info_; +}; + +// Calculates a single LSTM gate. +// Implements the following formula: +// gate = activate(FC(input) + FC(recurrent)) +// Activation is sigmoid except for the "cell" gate (configurable, usually tanh) +template +void calculateLstmGate(const LstmStepManager *step_info, + const luci_interpreter::lstm::GateParameters *gate_params, + // Input FC + ActivationType *input_data, const circle::Tensor *input_weight, + const circle::Tensor *input_bias, + // Recurrent FC + ActivationType *recurrent_data, const circle::Tensor *recurrent_weight, + const circle::Tensor *recurrent_bias, + // Output + CellType *gate_output, + // Scratch arrays + CellType *fc_output_buffer, const FusedActivation activation, + luci_interpreter::BaseRuntimeGraph *runtime_graph) +{ + // Input FC + const auto gate_output_shape = step_info->stateShape(); + { + FullyConnectedParams op_params{}; + op_params.input_offset = gate_params->input_fc_params.input_offset; + op_params.weights_offset = gate_params->input_fc_params.weights_offset; + op_params.output_offset = gate_params->input_fc_params.output_offset; + op_params.output_multiplier = gate_params->input_fc_params.output_multiplier; + op_params.output_shift = gate_params->input_fc_params.output_shift; + op_params.quantized_activation_min = gate_params->input_fc_params.quantized_activation_min; + op_params.quantized_activation_max = gate_params->input_fc_params.quantized_activation_max; + op_params.float_activation_max = gate_params->input_fc_params.float_activation_max; + op_params.float_activation_min = gate_params->input_fc_params.float_activation_min; + + int32_t input_weight_shape[luci_interpreter::kMaxSmallSize]; + luci_interpreter::kernels::getTensorDims(input_weight, runtime_graph, input_weight_shape); + + FullyConnected(op_params, step_info->inputShape().dimsData(), + input_data + step_info->inputOffset(), input_weight_shape, + luci_interpreter::kernels::getTensorData( + 
runtime_graph->getConstDataByTensor(input_weight)), + luci_interpreter::kernels::getTensorData( + runtime_graph->getConstDataByTensor(input_bias)), + gate_output_shape.dimsData(), gate_output); + } + + // Recurrent FC + { + FullyConnectedParams op_params{}; + op_params.input_offset = gate_params->recurrent_fc_params.input_offset; + op_params.weights_offset = gate_params->recurrent_fc_params.weights_offset; + op_params.output_offset = gate_params->recurrent_fc_params.output_offset; + op_params.output_multiplier = gate_params->recurrent_fc_params.output_multiplier; + op_params.output_shift = gate_params->recurrent_fc_params.output_shift; + op_params.quantized_activation_min = gate_params->recurrent_fc_params.quantized_activation_min; + op_params.quantized_activation_max = gate_params->recurrent_fc_params.quantized_activation_max; + op_params.float_activation_max = gate_params->recurrent_fc_params.float_activation_max; + op_params.float_activation_min = gate_params->recurrent_fc_params.float_activation_min; + + int32_t recurrent_weight_shape[luci_interpreter::kMaxSmallSize]; + luci_interpreter::kernels::getTensorDims(recurrent_weight, runtime_graph, + recurrent_weight_shape); + + FullyConnected(op_params, step_info->stateShape().dimsData(), + recurrent_data + step_info->hiddenStateOffset(), recurrent_weight_shape, + luci_interpreter::kernels::getTensorData( + runtime_graph->getConstDataByTensor(recurrent_weight)), + luci_interpreter::kernels::getTensorData( + runtime_graph->getConstDataByTensor(recurrent_bias)), + gate_output_shape.dimsData(), fc_output_buffer); + + addElementWise(gate_output, fc_output_buffer, /*n_batch=*/gate_output_shape.dimsData()[0], + /*n_state=*/gate_output_shape.dimsData()[1], gate_output); + + switch (activation) + { + case FusedActivation::kTfLiteActSigmoid: + sigmoid(gate_output_shape, gate_output); + break; + case FusedActivation::kTfLiteActTanh: + { + // Set the scale power to -12 to avoid shift + tanh(/*cell_state_scale_power=*/-12, 
gate_output_shape, gate_output, gate_output_shape, + gate_output); + } + break; + default: + // Only Sigmoid or Tanh is used. + assert(false && "Only Sigmoid or Tanh is used"); + } + } +} + +// Update the hidden state of the LSTM kernel using the following formula: +// updated_hidden_state = Tanh(updated_cell_state) * output_gate_output, * means +// element wise multiplication +template +void updateLstmHidden(const LstmStepManager *step_info, CellType *cell_state_data_base, + ActivationType *hidden_state_data, const CellType *output_gate_output, + const ArithmeticParams *mul_params, int32_t cell_state_scale_power, + CellType *buffer) +{ + auto cell_state_shape = step_info->stateShape(); + CellType *cell_state_data = cell_state_data_base + step_info->cellStateOffset(); + // Tanh(cell_state) + tanh(cell_state_scale_power, cell_state_shape, cell_state_data, cell_state_shape, buffer); + // Update the hidden state + mul(cell_state_shape, mul_params, buffer, output_gate_output, + hidden_state_data + step_info->hiddenStateOffset()); +} + +// Update the cell state using the output from the forget gate, input gate, and +// cell gate Formula: updated_cell_state = forget_gate_output*cell_state + +// input_gate_output * cell_gate_output, where * denotes element wise +// multiplication +template +void updateLstmCell(const LstmStepManager *step_info, CellType *cell_state_data, + // Gate outputs + CellType *forget_gate_output, const CellType *input_gate_output, + const CellType *cell_gate_output, + // Mul parameters + const ArithmeticParams &forget_cell_mul_params, + const ArithmeticParams &input_mul_params, + const luci_interpreter::lstm::CellStateInfo *cell_state_info, CellType *buffer) +{ + auto cell_state_shape = step_info->stateShape(); + // Forget Gate x Cell State + mul(cell_state_shape, &forget_cell_mul_params, forget_gate_output, + cell_state_data + step_info->cellStateOffset(), + cell_state_data + step_info->cellStateOffset()); + // Input Gate x Cell Gate + 
mul(cell_state_shape, &input_mul_params, input_gate_output, cell_gate_output, buffer); + + // Update the cell state + addElementWise(cell_state_data + step_info->cellStateOffset(), buffer, + /*n_batch=*/cell_state_shape.dimsData()[0], + /*n_state=*/cell_state_shape.dimsData()[1], + cell_state_data + step_info->cellStateOffset()); + + if (cell_state_info->cell_clip > 0) + { + clipping(cell_state_shape.flatSize(), cell_state_info, + cell_state_data + step_info->cellStateOffset()); + } +} + +template +void lstmStep(luci_interpreter::lstm::LSTMStruct *lstm_struct, + luci_interpreter::lstm::LSTMParameters *lstm_params, LstmStepManager *step_info, + luci_interpreter::lstm::CellStateInfo *cell_state_info, + ActivationType *output_state_data, CellType *cell_state_data, CellType *scratch0, + CellType *scratch1, CellType *scratch2, CellType *scratch3, + luci_interpreter::BaseRuntimeGraph *runtime_graph) +{ + /*Step1: Calculate gate outputs to prepare cell state update*/ + CellType *gate_internal_buffer = scratch3; + CellType *forget_gate_output = scratch0; + + auto input_data = luci_interpreter::kernels::getTensorData( + runtime_graph->getDataByTensor(lstm_struct->input())); + + calculateLstmGate( + step_info, &lstm_params->forget_gate_parameters, + // Input FC + input_data, lstm_struct->input_to_forget_weights(), lstm_struct->forget_gate_bias(), + // Recurrent FC + output_state_data, lstm_struct->recurrent_to_forget_weights(), nullptr, + // Output + forget_gate_output, gate_internal_buffer, FusedActivation::kTfLiteActSigmoid, runtime_graph); + + // Input Gate calculation; + CellType *input_gate_output = scratch1; + calculateLstmGate( + step_info, &lstm_params->input_gate_parameters, + // Input FC + input_data, lstm_struct->input_to_input_weights(), lstm_struct->input_gate_bias(), + // Recurrent FC + output_state_data, lstm_struct->recurrent_to_input_weights(), + /*recurrent_bias*/ nullptr, + // Output + input_gate_output, + // Scratch arrays + gate_internal_buffer, 
FusedActivation::kTfLiteActSigmoid, runtime_graph); + + // Cell Gate calculation + CellType *cell_gate_output = scratch2; + calculateLstmGate( + step_info, &lstm_params->cell_gate_parameters, + // Input FC + input_data, lstm_struct->input_to_cell_weights(), lstm_struct->cell_gate_bias(), + // Recurrent FC + output_state_data, lstm_struct->recurrent_to_cell_weights(), + /*recurrent_bias*/ nullptr, + // Output + cell_gate_output, + // Scratch arrays + gate_internal_buffer, FusedActivation::kTfLiteActTanh, runtime_graph); + + /*Step2: update the cell state */ + { + // const InterGateParameters& inter_gate_params = op_data.inter_gate_parameters; + CellType *updated_input_buffer = scratch1; // reuse buffer + + updateLstmCell( + step_info, cell_state_data, forget_gate_output, input_gate_output, cell_gate_output, + lstm_params->inter_gate_parameters.forget_cell_mul_params, + lstm_params->inter_gate_parameters.input_mul_params, cell_state_info, updated_input_buffer); + } + + { + /*Step3: update the hidden state */ + CellType *output_gate_output = scratch1; // reuse buffer + calculateLstmGate( + step_info, &lstm_params->output_gate_parameters, + // Input FC + input_data, lstm_struct->input_to_output_weights(), lstm_struct->output_gate_bias(), + // Recurrent FC + output_state_data, lstm_struct->recurrent_to_output_weights(), nullptr, + // Output + output_gate_output, + // Scratch arrays + gate_internal_buffer, FusedActivation::kTfLiteActSigmoid, runtime_graph); + CellType *tanh_activated_cell_buffer = scratch0; // reuse buffer + updateLstmHidden( + step_info, cell_state_data, output_state_data, output_gate_output, + &lstm_params->inter_gate_parameters.output_mul_params, + cell_state_info->cell_state_scale_power, tanh_activated_cell_buffer); + + ActivationType *output_ptr = luci_interpreter::kernels::getTensorData( + runtime_graph->getDataByTensor(lstm_struct->output())); + std::memcpy(output_ptr + step_info->outputOffset(), + output_state_data + 
step_info->hiddenStateOffset(), + step_info->stateShape().flatSize() * sizeof(ActivationType)); + } +} + +} // namespace lstm_internal + +// Evaluate the LSTM kernel with (potential) multi-steps and multi-batch input +template +void evalLSTM(luci_interpreter::lstm::LSTMStruct *lstm_struct, + luci_interpreter::lstm::LSTMParameters *lstm_params, + luci_interpreter::lstm::CellStateInfo *cell_state_info, + ActivationType *output_state_data, CellType *cell_state_data, CellType *scratch0, + CellType *scratch1, CellType *scratch2, CellType *scratch3, + luci_interpreter::BaseRuntimeGraph *runtime_graph) +{ + lstm_internal::LstmSizeInfo size_info; + + size_info.time_major = lstm_struct->options->time_major(); + size_info.batch_size = size_info.time_major + ? luci_interpreter::Tensor::dim(lstm_struct->input(), 1) + : luci_interpreter::Tensor::dim(lstm_struct->input(), 0); + size_info.time_steps = size_info.time_major + ? luci_interpreter::Tensor::dim(lstm_struct->input(), 0) + : luci_interpreter::Tensor::dim(lstm_struct->input(), 1); + size_info.input_dimension = luci_interpreter::Tensor::dim(lstm_struct->input(), 2); + size_info.state_dimension = luci_interpreter::Tensor::dim(lstm_struct->output_state(), 1); + + lstm_internal::LstmStepManager step_info(size_info); + + // time is the first dimention, enable batch computation + if (size_info.time_major) + { + for (int t = 0; t < size_info.time_steps; t++) + { + lstm_internal::lstmStep( + lstm_struct, lstm_params, &step_info, cell_state_info, output_state_data, cell_state_data, + scratch0, scratch1, scratch2, scratch3, runtime_graph); + // prepare for the next time step + step_info.updateTime(); + } + } + else + { + // batch first, unable to size the input data. 
single batch inference + for (int b = 0; b < size_info.batch_size; b++) + { + for (int t = 0; t < size_info.time_steps; t++) + { + lstm_internal::lstmStep( + lstm_struct, lstm_params, &step_info, cell_state_info, output_state_data, cell_state_data, + scratch0, scratch1, scratch2, scratch3, runtime_graph); + // prepare for the next time step + step_info.updateTime(); + } + // prepare for the next batch + step_info.updateBatch(); + step_info.resetTime(); + } + } +} + +} // namespace luci_interpreter_pal + +#endif // LUCI_INTERPRETER_PAL_UNIDIRECTIONAL_SEQUENCE_LSTM_COMMON_H diff --git a/onert-micro/luci-interpreter/pal/mcu/PALAdd.h b/onert-micro/luci-interpreter/pal/mcu/PALAdd.h index d45d0aab01f..d9d1f7865ae 100644 --- a/onert-micro/luci-interpreter/pal/mcu/PALAdd.h +++ b/onert-micro/luci-interpreter/pal/mcu/PALAdd.h @@ -18,75 +18,22 @@ #ifndef LUCI_INTERPRETER_PAL_ADD_H #define LUCI_INTERPRETER_PAL_ADD_H -#include "Params.h" -#include "PALUtils.h" -#include "ProcessBroadcastShapes.h" +#include "PALAddCommon.h" namespace luci_interpreter_pal { - -// TODO: check if there real activation value -template -inline void Add(const ArithmeticParams ¶ms, const int flat_size, const T *input1_data, - const T *input2_data, T *output_data) +template <> +inline void Add(const ArithmeticParams &, const int, const int8_t *, const int8_t *, + int8_t *) { - T activation_min, activation_max; - getActivationParams(params, &activation_min, &activation_max); - - for (int i = 0; i < flat_size; ++i) - output_data[i] = - std::min(std::max(input1_data[i] + input2_data[i], activation_min), activation_max); + assert(false && "Not IMPL yet"); } -template -inline void -BroadcastAdd4DSlow(const ArithmeticParams ¶ms, - const luci_interpreter::RuntimeShape &input1_shape, const T *input1_data, - const luci_interpreter::RuntimeShape &input2_shape, const T *input2_data, - const luci_interpreter::RuntimeShape &output_shape, T *output_data) +template <> +inline void Add(const ArithmeticParams &, const 
int, const int16_t *, const int16_t *, + int16_t *) { - NdArrayDesc<4> desc1; - NdArrayDesc<4> desc2; - NdArrayDescsForElementwiseBroadcast(input1_shape, input2_shape, &desc1, &desc2); - const luci_interpreter::RuntimeShape extended_output_shape = - luci_interpreter::RuntimeShape::extendedShape(4, output_shape); - - T activation_min, activation_max; - getActivationParams(params, &activation_min, &activation_max); - - // In Tensorflow, the dimensions are canonically named (batch_number, row, - // col, channel), with extents (batches, height, width, depth), with the - // trailing dimension changing most rapidly (channels has the smallest stride, - // typically 1 element). - // - // In generated C code, we store arrays with the dimensions reversed. The - // first dimension has smallest stride. - // - // We name our variables by their Tensorflow convention, but generate C code - // nesting loops such that the innermost loop has the smallest stride for the - // best cache behavior. - for (int b = 0; b < extended_output_shape.dims(0); ++b) - { - for (int y = 0; y < extended_output_shape.dims(1); ++y) - { - for (int x = 0; x < extended_output_shape.dims(2); ++x) - { - for (int c = 0; c < extended_output_shape.dims(3); ++c) - { - const int output_data_offset = - ((b * extended_output_shape.dims(1) + y) * extended_output_shape.dims(2) + x) * - extended_output_shape.dims(3) + - c; - - output_data[output_data_offset] = - std::min(std::max(input1_data[subscriptToIndex(desc1, b, y, x, c)] + - input2_data[subscriptToIndex(desc2, b, y, x, c)], - activation_min), - activation_max); - } - } - } - } + assert(false && "Not IMPL yet"); } } // namespace luci_interpreter_pal diff --git a/onert-micro/luci-interpreter/pal/mcu/PALAveragePool2D.h b/onert-micro/luci-interpreter/pal/mcu/PALAveragePool2D.h index e111abbb572..ef5fe72230c 100644 --- a/onert-micro/luci-interpreter/pal/mcu/PALAveragePool2D.h +++ b/onert-micro/luci-interpreter/pal/mcu/PALAveragePool2D.h @@ -18,76 +18,11 @@ #ifndef 
LUCI_INTERPRETER_PAL_AVERAGE_POOL_2D_H #define LUCI_INTERPRETER_PAL_AVERAGE_POOL_2D_H -#include "Params.h" -#include "PALUtils.h" +#include "PALAveragePool2DCommon.h" namespace luci_interpreter_pal { - -// TODO: reduce code duplication with MaxPool -inline void AveragePool(const PoolParams ¶ms, const luci_interpreter::RuntimeShape &input_shape, - const float *input_data, const luci_interpreter::RuntimeShape &output_shape, - float *output_data) -{ - const int batches = input_shape.dims(0); - const int depth = output_shape.dims(3); - const int input_height = input_shape.dims(1); - const int input_width = input_shape.dims(2); - const int output_height = output_shape.dims(1); - const int output_width = output_shape.dims(2); - const int stride_height = params.stride_height; - const int stride_width = params.stride_width; - for (int batch = 0; batch < batches; ++batch) - { - for (int out_y = 0; out_y < output_height; ++out_y) - { - for (int out_x = 0; out_x < output_width; ++out_x) - { - for (int channel = 0; channel < depth; ++channel) - { - const int in_x_origin = (out_x * stride_width) - params.padding_values.width; - const int in_y_origin = (out_y * stride_height) - params.padding_values.height; - // Compute the boundaries of the filter region clamped so as to - // ensure that the filter window fits in the input array. 
- const int filter_x_start = std::max(0, -in_x_origin); - const int filter_x_end = std::min(params.filter_width, input_width - in_x_origin); - const int filter_y_start = std::max(0, -in_y_origin); - const int filter_y_end = std::min(params.filter_height, input_height - in_y_origin); - - float total = 0.f; - float filter_count = 0; - - for (int filter_y = filter_y_start; filter_y < filter_y_end; ++filter_y) - { - for (int filter_x = filter_x_start; filter_x < filter_x_end; ++filter_x) - { - const int in_x = in_x_origin + filter_x; - const int in_y = in_y_origin + filter_y; - - const int input_data_offset = - ((batch * input_shape.dims(1) + in_y) * input_shape.dims(2) + in_x) * - input_shape.dims(3) + - channel; - - total += input_data[input_data_offset]; - filter_count++; - } - } - const int output_data_offset = - ((batch * output_shape.dims(1) + out_y) * output_shape.dims(2) + out_x) * - output_shape.dims(3) + - channel; - - assert(filter_count != 0); - const float average = total / filter_count; - - output_data[output_data_offset] = - std::min(std::max(average, params.float_activation_min), params.float_activation_max); - } - } - } - } -} +// TODO: add S8 and S16 kernel } // namespace luci_interpreter_pal #endif // LUCI_INTERPRETER_PAL_AVERAGE_POOL_2D_H diff --git a/onert-micro/luci-interpreter/pal/mcu/PALConv2d.h b/onert-micro/luci-interpreter/pal/mcu/PALConv2d.h index b7d6502c17c..c979f761031 100644 --- a/onert-micro/luci-interpreter/pal/mcu/PALConv2d.h +++ b/onert-micro/luci-interpreter/pal/mcu/PALConv2d.h @@ -17,185 +17,16 @@ #ifndef LUCI_INTERPRETER_PAL_CONV2D_H #define LUCI_INTERPRETER_PAL_CONV2D_H -#include "Params.h" -#include "PALUtils.h" +#include "PALConv2DCommon.h" namespace luci_interpreter_pal { -static inline void Conv(const ConvParams ¶ms, const int32_t *input_shape, - const float *input_data, const int32_t *filter_shape, - const float *filter_data, const float *bias_data, - const int32_t *output_shape, float *output_data) +static inline void 
QuantizedConvPerChannel(const ConvParams &, const int32_t *, const int8_t *, + const int32_t *, const int8_t *, const int32_t *, + const int32_t *, int8_t *) { - const int stride_width = params.stride_width; - const int stride_height = params.stride_height; - const int dilation_width_factor = params.dilation_width_factor; - const int dilation_height_factor = params.dilation_height_factor; - const int pad_width = params.padding_values.width; - const int pad_height = params.padding_values.height; - const float output_activation_min = params.float_activation_min; - const float output_activation_max = params.float_activation_max; - - const auto batches = input_shape[0]; - const int input_height = input_shape[1]; - const int input_width = input_shape[2]; - const int input_depth = input_shape[3]; - const int output_depth = filter_shape[0]; - const int filter_height = filter_shape[1]; - const int filter_width = filter_shape[2]; - const int output_height = output_shape[1]; - const int output_width = output_shape[2]; - for (int batch = 0; batch < batches; ++batch) - { - for (int out_y = 0; out_y < output_height; ++out_y) - { - const int in_y_origin = (out_y * stride_height) - pad_height; - for (int out_x = 0; out_x < output_width; ++out_x) - { - const int in_x_origin = (out_x * stride_width) - pad_width; - for (int out_channel = 0; out_channel < output_depth; ++out_channel) - { - float total = 0.f; - for (int filter_y = 0; filter_y < filter_height; ++filter_y) - { - const int in_y = in_y_origin + dilation_height_factor * filter_y; - for (int filter_x = 0; filter_x < filter_width; ++filter_x) - { - const int in_x = in_x_origin + dilation_width_factor * filter_x; - - // Zero padding by omitting the areas outside the image. 
- const bool is_point_inside_image = - (in_x >= 0) && (in_x < input_width) && (in_y >= 0) && (in_y < input_height); - - if (!is_point_inside_image) - { - continue; - } - - for (int in_channel = 0; in_channel < input_depth; ++in_channel) - { - const int input_data_offset = - ((batch * input_height + in_y) * input_width + in_x) * input_depth + in_channel; - - const int filter_data_offset = - ((out_channel * filter_height + filter_y) * filter_width + filter_x) * - input_depth + - in_channel; - - const float input_value = input_data[input_data_offset]; - const float filter_value = filter_data[filter_data_offset]; - total += (input_value * filter_value); - } - } - } - // float bias_value = 0.0f; - if (bias_data) - { - total += bias_data[out_channel]; - } - - const int output_data_offset = - ((batch * output_height + out_y) * output_width + out_x) * output_depth + out_channel; - - output_data[output_data_offset] = - std::min(std::max(total, output_activation_min), output_activation_max); - } - } - } - } + assert(false && "Not supported yet"); } - -static inline void Conv(const ConvParams ¶ms, const int32_t *input_shape, - const uint8_t *input_data, const int32_t *filter_shape, - const uint8_t *filter_data, const int32_t *bias_data, - const int32_t *output_shape, uint8_t *output_data) -{ - const int stride_width = params.stride_width; - const int stride_height = params.stride_height; - const int dilation_width_factor = params.dilation_width_factor; - const int dilation_height_factor = params.dilation_height_factor; - const int pad_width = params.padding_values.width; - const int pad_height = params.padding_values.height; - const int32_t input_offset = params.input_offset; - const int32_t filter_offset = params.weights_offset; - const int32_t output_offset = params.output_offset; - const int32_t output_multiplier = params.output_multiplier; - const int output_shift = params.output_shift; - const int32_t output_activation_min = params.quantized_activation_min; - const 
int32_t output_activation_max = params.quantized_activation_max; - - const auto batches = input_shape[0]; - const int input_height = input_shape[1]; - const int input_width = input_shape[2]; - const int input_depth = input_shape[3]; - const int output_depth = filter_shape[0]; - const int filter_height = filter_shape[1]; - const int filter_width = filter_shape[2]; - const int output_height = output_shape[1]; - const int output_width = output_shape[2]; - - for (int batch = 0; batch < batches; ++batch) - { - for (int out_y = 0; out_y < output_height; ++out_y) - { - const int in_y_origin = (out_y * stride_height) - pad_height; - for (int out_x = 0; out_x < output_width; ++out_x) - { - const int in_x_origin = (out_x * stride_width) - pad_width; - for (int out_channel = 0; out_channel < output_depth; ++out_channel) - { - int32_t acc = 0; - for (int filter_y = 0; filter_y < filter_height; ++filter_y) - { - const int in_y = in_y_origin + dilation_height_factor * filter_y; - for (int filter_x = 0; filter_x < filter_width; ++filter_x) - { - const int in_x = in_x_origin + dilation_width_factor * filter_x; - - // Zero padding by omitting the areas outside the image. 
- const bool is_point_inside_image = - (in_x >= 0) && (in_x < input_width) && (in_y >= 0) && (in_y < input_height); - - if (!is_point_inside_image) - { - continue; - } - - for (int in_channel = 0; in_channel < input_depth; ++in_channel) - { - const int input_data_offset = - ((batch * input_height + in_y) * input_width + in_x) * input_depth + in_channel; - - const int filter_data_offset = - ((out_channel * filter_height + filter_y) * filter_width + filter_x) * - input_depth + - in_channel; - - const int32_t input_val = input_data[input_data_offset]; - const int32_t filter_val = filter_data[filter_data_offset]; - acc += (filter_val + filter_offset) * (input_val + input_offset); - } - } - } - if (bias_data) - { - acc += bias_data[out_channel]; - } - acc = multiplyByQuantizedMultiplier(acc, output_multiplier, output_shift); - acc += output_offset; - acc = std::max(acc, output_activation_min); - acc = std::min(acc, output_activation_max); - - const int output_data_offset = - ((batch * output_height + out_y) * output_width + out_x) * output_depth + out_channel; - - output_data[output_data_offset] = static_cast(acc); - } - } - } - } -} - } // namespace luci_interpreter_pal #endif // LUCI_INTERPRETER_PAL_CONV2D_H diff --git a/onert-micro/luci-interpreter/pal/mcu/PALFullyConnected.h b/onert-micro/luci-interpreter/pal/mcu/PALFullyConnected.h index d1a151d7fcb..4a024b137ed 100644 --- a/onert-micro/luci-interpreter/pal/mcu/PALFullyConnected.h +++ b/onert-micro/luci-interpreter/pal/mcu/PALFullyConnected.h @@ -18,83 +18,36 @@ #ifndef LUCI_INTERPRETER_PAL_FULLY_CONNECTED_H #define LUCI_INTERPRETER_PAL_FULLY_CONNECTED_H -#include "Params.h" -#include "PALUtils.h" +#include "PALFullyConnectedCommon.h" namespace luci_interpreter_pal { -template -inline void FullyConnected(const FullyConnectedParams ¶ms, const int32_t *input_shape, - const InputType *input_data, const int32_t *filter_shape, - const WeightType *filter_data, const BiasType *bias_data, - const int32_t *output_shape, 
OutputType *output_data) -{ - const int32_t input_offset = params.input_offset; - const int32_t filter_offset = params.weights_offset; - const int32_t output_offset = params.output_offset; - const int32_t output_multiplier = params.output_multiplier; - const int output_shift = params.output_shift; - const int32_t output_activation_min = params.quantized_activation_min; - const int32_t output_activation_max = params.quantized_activation_max; - - const int batches = input_shape[0]; - const int output_depth = output_shape[1]; - const int accum_depth = filter_shape[1]; - for (int b = 0; b < batches; ++b) - { - for (int out_c = 0; out_c < output_depth; ++out_c) - { - BiasType acc = 0; - for (int d = 0; d < accum_depth; ++d) - { - int32_t input_val = input_data[b * accum_depth + d]; - int32_t filter_val = filter_data[out_c * accum_depth + d]; - acc += (filter_val + filter_offset) * (input_val + input_offset); - } - if (bias_data) - { - acc += bias_data[out_c]; - } - int32_t acc_scaled = multiplyByQuantizedMultiplier(acc, output_multiplier, output_shift); - acc_scaled += output_offset; - acc_scaled = std::max(acc_scaled, output_activation_min); - acc_scaled = std::min(acc_scaled, output_activation_max); - output_data[out_c + output_depth * b] = static_cast(acc_scaled); - } - } -} template <> -inline void FullyConnected(const FullyConnectedParams ¶ms, const int32_t *input_shape, - const float *input_data, const int32_t *filter_shape, - const float *filter_data, const float *bias_data, - const int32_t *output_shape, float *output_data) +inline void +FullyConnected(const luci_interpreter_pal::FullyConnectedParams ¶ms, const int32_t *input_shape, + const int8_t *input_data, const int32_t *filter_shape, const int8_t *filter_data, + const int32_t *bias_data, const int32_t *output_shape, int8_t *output_data) { - const float output_activation_min = params.float_activation_min; - const float output_activation_max = params.float_activation_max; - - const int batches = 
input_shape[0]; - const int output_depth = output_shape[1]; - const int accum_depth = filter_shape[1]; + // MARK: At this moment this operation doesn't support + assert(false && "FullyConnected INT8 NYI"); + (void)params; + (void)input_shape; + (void)input_data; + (void)filter_shape; + (void)filter_data; + (void)bias_data; + (void)output_shape; + (void)output_data; +} - for (int b = 0; b < batches; ++b) - { - for (int out_c = 0; out_c < output_depth; ++out_c) - { - float total = 0.f; - for (int d = 0; d < accum_depth; ++d) - { - total += input_data[b * accum_depth + d] * filter_data[out_c * accum_depth + d]; - } - float bias_value = 0.0f; - if (bias_data) - { - bias_value = bias_data[out_c]; - } - output_data[out_c + output_depth * b] = - std::min(std::max(total + bias_value, output_activation_min), output_activation_max); - } - } +template <> +inline void FullyConnected(const luci_interpreter_pal::FullyConnectedParams &, const int32_t *, + const int16_t *, const int32_t *, const int8_t *, const int64_t *, + const int32_t *, int16_t *) +{ + // MARK: At this moment this operation doesn't support + assert(false && "FullyConnected INT8 NYI"); } } // namespace luci_interpreter_pal diff --git a/onert-micro/luci-interpreter/pal/mcu/PALMaxPool2D.h b/onert-micro/luci-interpreter/pal/mcu/PALMaxPool2D.h index dab583594c8..a0fff0c6d0e 100644 --- a/onert-micro/luci-interpreter/pal/mcu/PALMaxPool2D.h +++ b/onert-micro/luci-interpreter/pal/mcu/PALMaxPool2D.h @@ -18,129 +18,11 @@ #ifndef LUCI_INTERPRETER_PAL_MAX_POOL_2D_H #define LUCI_INTERPRETER_PAL_MAX_POOL_2D_H -#include "Params.h" -#include "PALUtils.h" +#include "PALMaxPool2DCommon.h" namespace luci_interpreter_pal { - -inline void MaxPool(const PoolParams ¶ms, const luci_interpreter::RuntimeShape &input_shape, - const float *input_data, const luci_interpreter::RuntimeShape &output_shape, - float *output_data) -{ - const int batches = input_shape.dims(0); - const int depth = output_shape.dims(3); - const int input_height = 
input_shape.dims(1); - const int input_width = input_shape.dims(2); - const int output_height = output_shape.dims(1); - const int output_width = output_shape.dims(2); - const int stride_height = params.stride_height; - const int stride_width = params.stride_width; - for (int batch = 0; batch < batches; ++batch) - { - for (int out_y = 0; out_y < output_height; ++out_y) - { - for (int out_x = 0; out_x < output_width; ++out_x) - { - for (int channel = 0; channel < depth; ++channel) - { - const int in_x_origin = (out_x * stride_width) - params.padding_values.width; - const int in_y_origin = (out_y * stride_height) - params.padding_values.height; - // Compute the boundaries of the filter region clamped so as to - // ensure that the filter window fits in the input array. - const int filter_x_start = std::max(0, -in_x_origin); - const int filter_x_end = std::min(params.filter_width, input_width - in_x_origin); - const int filter_y_start = std::max(0, -in_y_origin); - const int filter_y_end = std::min(params.filter_height, input_height - in_y_origin); - float max = std::numeric_limits::lowest(); - for (int filter_y = filter_y_start; filter_y < filter_y_end; ++filter_y) - { - for (int filter_x = filter_x_start; filter_x < filter_x_end; ++filter_x) - { - const int in_x = in_x_origin + filter_x; - const int in_y = in_y_origin + filter_y; - - const int input_data_offset = - ((batch * input_shape.dims(1) + in_y) * input_shape.dims(2) + in_x) * - input_shape.dims(3) + - channel; - - max = std::max(max, input_data[input_data_offset]); - } - } - const int output_data_offset = - ((batch * output_shape.dims(1) + out_y) * output_shape.dims(2) + out_x) * - output_shape.dims(3) + - channel; - - output_data[output_data_offset] = - std::min(std::max(max, params.float_activation_min), params.float_activation_max); - } - } - } - } -} - -template -inline void MaxPool(const PoolParams ¶ms, const luci_interpreter::RuntimeShape &input_shape, - const T *input_data, const 
luci_interpreter::RuntimeShape &output_shape, - T *output_data) -{ - const int batches = input_shape.dims(0); - const int depth = output_shape.dims(3); - const int input_height = input_shape.dims(1); - const int input_width = input_shape.dims(2); - const int output_height = output_shape.dims(1); - const int output_width = output_shape.dims(2); - const int stride_height = params.stride_height; - const int stride_width = params.stride_width; - for (int batch = 0; batch < batches; ++batch) - { - for (int out_y = 0; out_y < output_height; ++out_y) - { - for (int out_x = 0; out_x < output_width; ++out_x) - { - for (int channel = 0; channel < depth; ++channel) - { - const int in_x_origin = (out_x * stride_width) - params.padding_values.width; - const int in_y_origin = (out_y * stride_height) - params.padding_values.height; - // Compute the boundaries of the filter region clamped so as to - // ensure that the filter window fits in the input array. - const int filter_x_start = std::max(0, -in_x_origin); - const int filter_x_end = std::min(params.filter_width, input_width - in_x_origin); - const int filter_y_start = std::max(0, -in_y_origin); - const int filter_y_end = std::min(params.filter_height, input_height - in_y_origin); - T max = std::numeric_limits::lowest(); - for (int filter_y = filter_y_start; filter_y < filter_y_end; ++filter_y) - { - for (int filter_x = filter_x_start; filter_x < filter_x_end; ++filter_x) - { - const int in_x = in_x_origin + filter_x; - const int in_y = in_y_origin + filter_y; - - const int input_data_offset = - ((batch * input_shape.dims(1) + in_y) * input_shape.dims(2) + in_x) * - input_shape.dims(3) + - channel; - - max = std::max(max, input_data[input_data_offset]); - } - } - max = std::max(max, params.quantized_activation_min); - max = std::min(max, params.quantized_activation_max); - - const int output_data_offset = - ((batch * output_shape.dims(1) + out_y) * output_shape.dims(2) + out_x) * - output_shape.dims(3) + - channel; - - 
output_data[output_data_offset] = static_cast(max); - } - } - } - } -} - +// TODO: Add INT8, INT16 kernels } // namespace luci_interpreter_pal #endif // LUCI_INTERPRETER_PAL_MAX_POOL_2D_H diff --git a/onert-micro/luci-interpreter/pal/mcu/PALMul.h b/onert-micro/luci-interpreter/pal/mcu/PALMul.h index 94ed17ef2bc..7b55cd1c832 100644 --- a/onert-micro/luci-interpreter/pal/mcu/PALMul.h +++ b/onert-micro/luci-interpreter/pal/mcu/PALMul.h @@ -18,96 +18,23 @@ #ifndef LUCI_INTERPRETER_PAL_MUL_H #define LUCI_INTERPRETER_PAL_MUL_H -#include "Params.h" -#include "PALUtils.h" -#include "ProcessBroadcastShapes.h" +#include "PALMulCommon.h" namespace luci_interpreter_pal { -template -inline void Mul(const ArithmeticParams ¶ms, const int flat_size, const T *input1_data, - const T *input2_data, T *output_data) -{ - T activation_min, activation_max; - getActivationParams(params, &activation_min, &activation_max); - - for (int i = 0; i < flat_size; ++i) - output_data[i] = - std::min(std::max(input1_data[i] * input2_data[i], activation_min), activation_max); -} -template -inline void MulScalar(const ArithmeticParams ¶ms, const int flat_size, const T *input_data, - const T scalar_value, T *output_data) +template <> +inline void Mul(const ArithmeticParams &, const int, const int8_t *, const int8_t *, + int8_t *) { - T activation_min, activation_max; - getActivationParams(params, &activation_min, &activation_max); - - for (int i = 0; i < flat_size; ++i) - output_data[i] = - std::min(std::max(input_data[i] * scalar_value, activation_min), activation_max); + assert(false && "Not IMPL yet"); } -template -inline void -BroadcastMul4DSlow(const ArithmeticParams ¶ms, - const luci_interpreter::RuntimeShape &input1_shape, const T *input1_data, - const luci_interpreter::RuntimeShape &input2_shape, const T *input2_data, - const luci_interpreter::RuntimeShape &output_shape, T *output_data) +template <> +inline void Mul(const ArithmeticParams &, const int, const int16_t *, const int16_t *, + int16_t 
*) { - const int flat_size = input1_shape.flatSize(); - - if (params.broadcast_category == BroadcastableOpCategory::kScalarFirstBroadcast) - { - return MulScalar(params, flat_size, input2_data, input1_data[0], output_data); - } - else if (params.broadcast_category == BroadcastableOpCategory::kScalarSecondBroadcast) - { - return MulScalar(params, flat_size, input1_data, input2_data[0], output_data); - } - - NdArrayDesc<4> desc1; - NdArrayDesc<4> desc2; - NdArrayDescsForElementwiseBroadcast(input1_shape, input2_shape, &desc1, &desc2); - const luci_interpreter::RuntimeShape extended_output_shape = - luci_interpreter::RuntimeShape::extendedShape(4, output_shape); - - T activation_min, activation_max; - getActivationParams(params, &activation_min, &activation_max); - - // In Tensorflow, the dimensions are canonically named (batch_number, row, - // col, channel), with extents (batches, height, width, depth), with the - // trailing dimension changing most rapidly (channels has the smallest stride, - // typically 1 element). - // - // In generated C code, we store arrays with the dimensions reversed. The - // first dimension has smallest stride. - // - // We name our variables by their Tensorflow convention, but generate C code - // nesting loops such that the innermost loop has the smallest stride for the - // best cache behavior. 
- for (int b = 0; b < extended_output_shape.dims(0); ++b) - { - for (int y = 0; y < extended_output_shape.dims(1); ++y) - { - for (int x = 0; x < extended_output_shape.dims(2); ++x) - { - for (int c = 0; c < extended_output_shape.dims(3); ++c) - { - const int output_data_offset = - ((b * extended_output_shape.dims(1) + y) * extended_output_shape.dims(2) + x) * - extended_output_shape.dims(3) + - c; - - output_data[output_data_offset] = - std::min(std::max(input1_data[subscriptToIndex(desc1, b, y, x, c)] * - input2_data[subscriptToIndex(desc2, b, y, x, c)], - activation_min), - activation_max); - } - } - } - } + assert(false && "Not IMPL yet"); } } // namespace luci_interpreter_pal diff --git a/onert-micro/luci-interpreter/pal/mcu/PALUnidirectionalSequenceLSTM.h b/onert-micro/luci-interpreter/pal/mcu/PALUnidirectionalSequenceLSTM.h index 0e267d3ed5b..35592ac6663 100644 --- a/onert-micro/luci-interpreter/pal/mcu/PALUnidirectionalSequenceLSTM.h +++ b/onert-micro/luci-interpreter/pal/mcu/PALUnidirectionalSequenceLSTM.h @@ -18,503 +18,18 @@ #ifndef LUCI_INTERPRETER_PAL_UNIDIRECTIONAL_SEQUENCE_LSTM_H #define LUCI_INTERPRETER_PAL_UNIDIRECTIONAL_SEQUENCE_LSTM_H -#include "kernels/UnidirectionalSequenceLSTM.h" -#include "PALTanh.h" -#include "PALLogistic.h" -#include "PALFullyConnected.h" -#include "PALMul.h" -#include "PALUtils.h" +#include "PALUnidirectionalSequenceLSTMCommon.h" namespace luci_interpreter_pal { -namespace lstm_internal -{ -namespace -{ -// Possible fused activation functions. 
-typedef enum -{ - kTfLiteActNone = 0, - kTfLiteActRelu, - kTfLiteActReluN1To1, // min(max(-1, x), 1) - kTfLiteActRelu6, // min(max(0, x), 6) - kTfLiteActTanh, - kTfLiteActSignBit, - kTfLiteActSigmoid, -} FusedActivation; - -} // namespace - -#ifndef DIS_QUANT - -template -void mulElementwise(int size, const ArithmeticParams *params, const InputType *input1_data, - const InputType *input2_data, OutputType *output_data) -{ - for (int i = 0; i < size; ++i) - { - const int32_t input1_val = params->input1_offset + input1_data[i]; - const int32_t input2_val = params->input2_offset + input2_data[i]; - const int32_t unclamped_result = - params->output_offset + multiplyByQuantizedMultiplier(input1_val * input2_val, - params->output_multiplier, - params->output_shift); - const int32_t clamped_output = - std::min(params->quantized_activation_max, - std::max(params->quantized_activation_min, unclamped_result)); - output_data[i] = static_cast(clamped_output); - } -} - -// Input and output have the same shape in LSTM -void mul(const luci_interpreter::RuntimeShape &shape, const ArithmeticParams *params, - const int16_t *input1_data, const int16_t *input2_data, int8_t *output_data) -{ - return mulElementwise(shape.flatSize(), params, input1_data, input2_data, - output_data); -} - -// Input and output have the same shape in LSTM -void mul(const luci_interpreter::RuntimeShape &shape, const ArithmeticParams *params, - const int16_t *input1_data, const int16_t *input2_data, int16_t *output_data) -{ - return mulElementwise(shape.flatSize(), params, input1_data, input2_data, output_data); -} - -void addElementWise(const int16_t *input_1, const int16_t *input_2, int n_batch, int n_input, - int16_t *output) -{ - for (int batch = 0; batch < n_batch; ++batch) - { - for (int i = 0; i < n_input; ++i) - { - const int index = batch * n_input + i; - int32_t sum = input_1[index] + input_2[index]; - const int32_t sum_clamped = - std::min(static_cast(std::numeric_limits::max()), - 
std::max(static_cast(std::numeric_limits::min()), sum)); - output[index] = static_cast(sum_clamped); - } - } -} - -void tanh(int32_t cell_state_scale_power, const luci_interpreter::RuntimeShape &input_data_shape, - int16_t *input_data, const luci_interpreter::RuntimeShape &output_data_shape, - int16_t *output_data) -{ - int32_t tanh_input_left_shift = (15 + cell_state_scale_power) - 3; - int32_t input_multiplier = 0; - if (tanh_input_left_shift < 0) /* handling negative shift value */ - { - tanh_input_left_shift = -tanh_input_left_shift; - input_multiplier = 3; - } - const int flat_size = input_data_shape.flatSize(); - luci_interpreter_pal::Tanh(input_multiplier, tanh_input_left_shift, flat_size, input_data, - output_data); -} - -void sigmoid(const luci_interpreter::RuntimeShape &data_shape, int16_t *data) -{ - luci_interpreter_pal::Logistic(0, 0, data_shape.flatSize(), data, data); -} - -void clipping(const int v_size, const luci_interpreter::lstm::CellStateInfo *cell_state_info, - int16_t *vector) -{ - for (int i = 0; i < v_size; i++) - { - vector[i] = std::max(std::min(cell_state_info->quantized_cell_clip, vector[i]), - static_cast(-cell_state_info->quantized_cell_clip)); - } -} -#endif // DIS_QUANT - -#ifndef DIS_FLOAT -// Input and output have the same shape in LSTM -void mul(const luci_interpreter::RuntimeShape &shape, const ArithmeticParams *params, - const float *input1_data, const float *input2_data, float *output_data) -{ - const int flat_size = shape.flatSize(); - return luci_interpreter_pal::Mul(*params, flat_size, input1_data, input2_data, output_data); -} - -void addElementWise(const float *input_1, const float *input_2, int n_batch, int n_input, - float *output) -{ - for (int batch = 0; batch < n_batch; ++batch) - { - for (int i = 0; i < n_input; ++i) - { - const int index = batch * n_input + i; - output[index] = input_1[index] + input_2[index]; - } - } -} - -void tanh(int32_t, const luci_interpreter::RuntimeShape &input_data_shape, float 
*input_data, - const luci_interpreter::RuntimeShape &output_data_shape, float *output_data) -{ - const int flat_size = input_data_shape.flatSize(); - luci_interpreter_pal::Tanh(flat_size, input_data, output_data); -} - -void sigmoid(const luci_interpreter::RuntimeShape &data_shape, float *data) -{ - const int flat_size = data_shape.flatSize(); - luci_interpreter_pal::Logistic(flat_size, data, data); -} - -void clipping(const int v_size, const luci_interpreter::lstm::CellStateInfo *cell_state_info, - float *vector) -{ - for (int i = 0; i < v_size; i++) - { - vector[i] = - std::max(std::min(cell_state_info->cell_clip, vector[i]), -cell_state_info->cell_clip); - } -} -#endif // DIS_FLOAT - -// Size information about the LSTM kernel, which is deduced from tensors stored -// in the flat buffer file. -struct LstmSizeInfo -{ - bool time_major; - int32_t batch_size; - int32_t time_steps; - int32_t input_dimension; - int32_t state_dimension; -}; - -class LstmStepManager -{ -public: - LstmStepManager() = delete; - // Does not take any ownership, and all pointers must refer to valid objects - // that outlive the one constructed. 
- explicit LstmStepManager(const LstmSizeInfo &size_info) : size_info_(size_info) {} - - void updateTime() - { - current_time_ += 1; - // default as one batch per inference - int input_step = size_info_.input_dimension; - int output_step = size_info_.state_dimension; - // time major: batch inference - if (size_info_.time_major) - { - input_step = input_step * size_info_.batch_size; - output_step = output_step * size_info_.batch_size; - } - - input_offset_ += input_step; - output_offset_ += output_step; - } - - void updateBatch() - { - current_batch_ += 1; - // batch inference for time major: no action needed - if (size_info_.time_major) - { - return; - } - // otherwise: singe batch inference, go to the next batch - hidden_state_offset_ += size_info_.state_dimension; - cell_state_offset_ += size_info_.state_dimension; - } - - void resetTime() { current_time_ = 0; } - - luci_interpreter::RuntimeShape inputShape() const - { - int batch_size = 1; - if (size_info_.time_major) - { - batch_size = size_info_.batch_size; - } - const int dims[2] = {batch_size, size_info_.input_dimension}; - const int32_t *dims_data = reinterpret_cast(dims); - return luci_interpreter::RuntimeShape(2, dims_data); - } - - luci_interpreter::RuntimeShape stateShape() const - { - int batch_size = 1; - if (size_info_.time_major) - { - batch_size = size_info_.batch_size; - } - const int dims[2] = {batch_size, size_info_.state_dimension}; - const int32_t *dims_data = reinterpret_cast(dims); - return luci_interpreter::RuntimeShape(2, dims_data); - } - - int inputOffset() const { return input_offset_; } - - int outputOffset() const { return output_offset_; } - - int hiddenStateOffset() const { return hidden_state_offset_; } - - int cellStateOffset() const { return cell_state_offset_; } - -private: - int32_t current_time_ = 0; - int32_t current_batch_ = 0; - int32_t input_offset_ = 0; - int32_t output_offset_ = 0; - int32_t hidden_state_offset_ = 0; - int32_t cell_state_offset_ = 0; - - const 
LstmSizeInfo &size_info_; -}; - -// Calculates a single LSTM gate. -// Implements the following formula: -// gate = activate(FC(input) + FC(recurrent)) -// Activation is sigmoid except for the "cell" gate (configurable, usually tanh) -template -void calculateLstmGate(const LstmStepManager *step_info, - const luci_interpreter::lstm::GateParameters *gate_params, - // Input FC - ActivationType *input_data, const circle::Tensor *input_weight, - const circle::Tensor *input_bias, - // Recurrent FC - ActivationType *recurrent_data, const circle::Tensor *recurrent_weight, - const circle::Tensor *recurrent_bias, - // Output - CellType *gate_output, - // Scratch arrays - CellType *fc_output_buffer, const FusedActivation activation, - luci_interpreter::BaseRuntimeGraph *runtime_graph) -{ - // Input FC - const auto gate_output_shape = step_info->stateShape(); - { - FullyConnectedParams op_params{}; - op_params.input_offset = gate_params->input_fc_params.input_offset; - op_params.weights_offset = gate_params->input_fc_params.weights_offset; - op_params.output_offset = gate_params->input_fc_params.output_offset; - op_params.output_multiplier = gate_params->input_fc_params.output_multiplier; - op_params.output_shift = gate_params->input_fc_params.output_shift; - op_params.quantized_activation_min = gate_params->input_fc_params.quantized_activation_min; - op_params.quantized_activation_max = gate_params->input_fc_params.quantized_activation_max; - op_params.float_activation_max = gate_params->input_fc_params.float_activation_max; - op_params.float_activation_min = gate_params->input_fc_params.float_activation_min; - - int32_t input_weight_shape[luci_interpreter::kMaxSmallSize]; - luci_interpreter::kernels::getTensorDims(input_weight, runtime_graph, input_weight_shape); - - FullyConnected(op_params, step_info->inputShape().dimsData(), - input_data + step_info->inputOffset(), input_weight_shape, - luci_interpreter::kernels::getTensorData( - 
runtime_graph->getConstDataByTensor(input_weight)), - luci_interpreter::kernels::getTensorData( - runtime_graph->getConstDataByTensor(input_bias)), - gate_output_shape.dimsData(), gate_output); - } - - // Recurrent FC - { - FullyConnectedParams op_params{}; - op_params.input_offset = gate_params->recurrent_fc_params.input_offset; - op_params.weights_offset = gate_params->recurrent_fc_params.weights_offset; - op_params.output_offset = gate_params->recurrent_fc_params.output_offset; - op_params.output_multiplier = gate_params->recurrent_fc_params.output_multiplier; - op_params.output_shift = gate_params->recurrent_fc_params.output_shift; - op_params.quantized_activation_min = gate_params->recurrent_fc_params.quantized_activation_min; - op_params.quantized_activation_max = gate_params->recurrent_fc_params.quantized_activation_max; - op_params.float_activation_max = gate_params->recurrent_fc_params.float_activation_max; - op_params.float_activation_min = gate_params->recurrent_fc_params.float_activation_min; - - int32_t recurrent_weight_shape[luci_interpreter::kMaxSmallSize]; - luci_interpreter::kernels::getTensorDims(recurrent_weight, runtime_graph, - recurrent_weight_shape); - - FullyConnected(op_params, step_info->stateShape().dimsData(), - recurrent_data + step_info->hiddenStateOffset(), recurrent_weight_shape, - luci_interpreter::kernels::getTensorData( - runtime_graph->getConstDataByTensor(recurrent_weight)), - luci_interpreter::kernels::getTensorData( - runtime_graph->getConstDataByTensor(recurrent_bias)), - gate_output_shape.dimsData(), fc_output_buffer); - - addElementWise(gate_output, fc_output_buffer, /*n_batch=*/gate_output_shape.dimsData()[0], - /*n_state=*/gate_output_shape.dimsData()[1], gate_output); - - switch (activation) - { - case FusedActivation::kTfLiteActSigmoid: - sigmoid(gate_output_shape, gate_output); - break; - case FusedActivation::kTfLiteActTanh: - { - // Set the scale power to -12 to avoid shift - tanh(/*cell_state_scale_power=*/-12, 
gate_output_shape, gate_output, gate_output_shape, - gate_output); - } - break; - default: - // Only Sigmoid or Tanh is used. - assert(false && "Only Sigmoid or Tanh is used"); - } - } -} - -// Update the hidden state of the LSTM kernel using the following formula: -// updated_hidden_state = Tanh(updated_cell_state) * output_gate_output, * means -// element wise multiplication -template -void updateLstmHidden(const LstmStepManager *step_info, CellType *cell_state_data_base, - ActivationType *hidden_state_data, const CellType *output_gate_output, - const ArithmeticParams *mul_params, int32_t cell_state_scale_power, - CellType *buffer) -{ - auto cell_state_shape = step_info->stateShape(); - CellType *cell_state_data = cell_state_data_base + step_info->cellStateOffset(); - // Tanh(cell_state) - tanh(cell_state_scale_power, cell_state_shape, cell_state_data, cell_state_shape, buffer); - // Update the hidden state - mul(cell_state_shape, mul_params, buffer, output_gate_output, - hidden_state_data + step_info->hiddenStateOffset()); -} - -// Update the cell state using the output from the forget gate, input gate, and -// cell gate Formula: updated_cell_state = forget_gate_output*cell_state + -// input_gate_output * cell_gate_output, where * denotes element wise -// multiplication -template -void updateLstmCell(const LstmStepManager *step_info, CellType *cell_state_data, - // Gate outputs - CellType *forget_gate_output, const CellType *input_gate_output, - const CellType *cell_gate_output, - // Mul parameters - const ArithmeticParams &forget_cell_mul_params, - const ArithmeticParams &input_mul_params, - const luci_interpreter::lstm::CellStateInfo *cell_state_info, CellType *buffer) -{ - auto cell_state_shape = step_info->stateShape(); - // Forget Gate x Cell State - mul(cell_state_shape, &forget_cell_mul_params, forget_gate_output, - cell_state_data + step_info->cellStateOffset(), - cell_state_data + step_info->cellStateOffset()); - // Input Gate x Cell Gate - 
mul(cell_state_shape, &input_mul_params, input_gate_output, cell_gate_output, buffer); - - // Update the cell state - addElementWise(cell_state_data + step_info->cellStateOffset(), buffer, - /*n_batch=*/cell_state_shape.dimsData()[0], - /*n_state=*/cell_state_shape.dimsData()[1], - cell_state_data + step_info->cellStateOffset()); - - if (cell_state_info->cell_clip > 0) - { - clipping(cell_state_shape.flatSize(), cell_state_info, - cell_state_data + step_info->cellStateOffset()); - } -} - -template -void lstmStep(luci_interpreter::lstm::LSTMStruct *lstm_struct, - luci_interpreter::lstm::LSTMParameters *lstm_params, LstmStepManager *step_info, - luci_interpreter::lstm::CellStateInfo *cell_state_info, - ActivationType *output_state_data, CellType *cell_state_data, CellType *scratch0, - CellType *scratch1, CellType *scratch2, CellType *scratch3, - luci_interpreter::BaseRuntimeGraph *runtime_graph) -{ - /*Step1: Calculate gate outputs to prepare cell state update*/ - CellType *gate_internal_buffer = scratch3; - CellType *forget_gate_output = scratch0; - - auto input_data = luci_interpreter::kernels::getTensorData( - runtime_graph->getDataByTensor(lstm_struct->input())); - - calculateLstmGate( - step_info, &lstm_params->forget_gate_parameters, - // Input FC - input_data, lstm_struct->input_to_forget_weights(), lstm_struct->forget_gate_bias(), - // Recurrent FC - output_state_data, lstm_struct->recurrent_to_forget_weights(), nullptr, - // Output - forget_gate_output, gate_internal_buffer, FusedActivation::kTfLiteActSigmoid, runtime_graph); - - // Input Gate calculation; - CellType *input_gate_output = scratch1; - calculateLstmGate( - step_info, &lstm_params->input_gate_parameters, - // Input FC - input_data, lstm_struct->input_to_input_weights(), lstm_struct->input_gate_bias(), - // Recurrent FC - output_state_data, lstm_struct->recurrent_to_input_weights(), - /*recurrent_bias*/ nullptr, - // Output - input_gate_output, - // Scratch arrays - gate_internal_buffer, 
FusedActivation::kTfLiteActSigmoid, runtime_graph); - - // Cell Gate calculation - CellType *cell_gate_output = scratch2; - calculateLstmGate( - step_info, &lstm_params->cell_gate_parameters, - // Input FC - input_data, lstm_struct->input_to_cell_weights(), lstm_struct->cell_gate_bias(), - // Recurrent FC - output_state_data, lstm_struct->recurrent_to_cell_weights(), - /*recurrent_bias*/ nullptr, - // Output - cell_gate_output, - // Scratch arrays - gate_internal_buffer, FusedActivation::kTfLiteActTanh, runtime_graph); - - /*Step2: update the cell state */ - { - // const InterGateParameters& inter_gate_params = op_data.inter_gate_parameters; - CellType *updated_input_buffer = scratch1; // reuse buffer - - updateLstmCell( - step_info, cell_state_data, forget_gate_output, input_gate_output, cell_gate_output, - lstm_params->inter_gate_parameters.forget_cell_mul_params, - lstm_params->inter_gate_parameters.input_mul_params, cell_state_info, updated_input_buffer); - } - - { - /*Step3: update the hidden state */ - CellType *output_gate_output = scratch1; // reuse buffer - calculateLstmGate( - step_info, &lstm_params->output_gate_parameters, - // Input FC - input_data, lstm_struct->input_to_output_weights(), lstm_struct->output_gate_bias(), - // Recurrent FC - output_state_data, lstm_struct->recurrent_to_output_weights(), nullptr, - // Output - output_gate_output, - // Scratch arrays - gate_internal_buffer, FusedActivation::kTfLiteActSigmoid, runtime_graph); - CellType *tanh_activated_cell_buffer = scratch0; // reuse buffer - updateLstmHidden( - step_info, cell_state_data, output_state_data, output_gate_output, - &lstm_params->inter_gate_parameters.output_mul_params, - cell_state_info->cell_state_scale_power, tanh_activated_cell_buffer); - - ActivationType *output_ptr = luci_interpreter::kernels::getTensorData( - runtime_graph->getDataByTensor(lstm_struct->output())); - std::memcpy(output_ptr + step_info->outputOffset(), - output_state_data + 
step_info->hiddenStateOffset(), - step_info->stateShape().flatSize() * sizeof(ActivationType)); - } -} - -} // namespace lstm_internal - // Evaluate the LSTM kernel with (potential) multi-steps and multi-batch input -template -void evalLSTM(luci_interpreter::lstm::LSTMStruct *lstm_struct, - luci_interpreter::lstm::LSTMParameters *lstm_params, - luci_interpreter::lstm::CellStateInfo *cell_state_info, - ActivationType *output_state_data, CellType *cell_state_data, CellType *scratch0, - CellType *scratch1, CellType *scratch2, CellType *scratch3, - luci_interpreter::BaseRuntimeGraph *runtime_graph) +template <> +void evalLSTM( + luci_interpreter::lstm::LSTMStruct *lstm_struct, + luci_interpreter::lstm::LSTMParameters *lstm_params, + luci_interpreter::lstm::CellStateInfo *cell_state_info, int8_t *output_state_data, + int16_t *cell_state_data, int16_t *scratch0, int16_t *scratch1, int16_t *scratch2, + int16_t *scratch3, luci_interpreter::BaseRuntimeGraph *runtime_graph) { lstm_internal::LstmSizeInfo size_info; @@ -535,7 +50,7 @@ void evalLSTM(luci_interpreter::lstm::LSTMStruct *lstm_struct, { for (int t = 0; t < size_info.time_steps; t++) { - lstm_internal::lstmStep( + lstm_internal::lstmStep( lstm_struct, lstm_params, &step_info, cell_state_info, output_state_data, cell_state_data, scratch0, scratch1, scratch2, scratch3, runtime_graph); // prepare for the next time step @@ -549,7 +64,7 @@ void evalLSTM(luci_interpreter::lstm::LSTMStruct *lstm_struct, { for (int t = 0; t < size_info.time_steps; t++) { - lstm_internal::lstmStep( + lstm_internal::lstmStep( lstm_struct, lstm_params, &step_info, cell_state_info, output_state_data, cell_state_data, scratch0, scratch1, scratch2, scratch3, runtime_graph); // prepare for the next time step