Skip to content

Commit

Permalink
Support for per channel quantized FC (#142)
Browse files Browse the repository at this point in the history
* Adds new FC top-level functions as well as a support function for
per-channel quantization.
* Adds a new FC wrapper.
* Remove gcc unroll pragma for MVE in current FC support function.
  • Loading branch information
mansnils authored Aug 21, 2024
1 parent 9d924bd commit b958a3f
Show file tree
Hide file tree
Showing 57 changed files with 2,132 additions and 120 deletions.
3 changes: 3 additions & 0 deletions ARM.CMSIS-NN.pdsc
Original file line number Diff line number Diff line change
Expand Up @@ -103,6 +103,7 @@
<file category="source" name="Source/ReshapeFunctions/arm_reshape_s8.c"/>
<file category="source" name="Source/NNSupportFunctions/arm_nn_vec_mat_mult_t_s4.c"/>
<file category="source" name="Source/NNSupportFunctions/arm_nn_vec_mat_mult_t_s8.c"/>
<file category="source" name="Source/NNSupportFunctions/arm_nn_vec_mat_mult_t_per_ch_s8.c"/>
<file category="source" name="Source/NNSupportFunctions/arm_nn_vec_mat_mult_t_s16.c"/>
<file category="source" name="Source/NNSupportFunctions/arm_nn_vec_mat_mult_t_s16_s16.c"/>
<file category="source" name="Source/NNSupportFunctions/arm_nn_vec_mat_mult_t_svdf_s8.c"/>
Expand Down Expand Up @@ -132,6 +133,8 @@
<file category="source" name="Source/FullyConnectedFunctions/arm_batch_matmul_s16.c"/>
<file category="source" name="Source/FullyConnectedFunctions/arm_fully_connected_s4.c"/>
<file category="source" name="Source/FullyConnectedFunctions/arm_fully_connected_s8.c"/>
<file category="source" name="Source/FullyConnectedFunctions/arm_fully_connected_per_channel_s8.c"/>
<file category="source" name="Source/FullyConnectedFunctions/arm_fully_connected_wrapper_s8.c"/>
<file category="source" name="Source/FullyConnectedFunctions/arm_fully_connected_s16.c"/>
<file category="source" name="Source/FullyConnectedFunctions/arm_fully_connected_get_buffer_sizes_s16.c"/>
<file category="source" name="Source/FullyConnectedFunctions/arm_fully_connected_get_buffer_sizes_s8.c"/>
Expand Down
15 changes: 13 additions & 2 deletions Include/arm_nn_types.h
Original file line number Diff line number Diff line change
Expand Up @@ -22,8 +22,8 @@
* Description: Public header file to contain the CMSIS-NN structs for the
* TensorFlowLite micro compliant functions
*
* $Date: 19 June 2024
* $Revision: V.3.3.0
* $Date: 19 Aug 2024
* $Revision: V.3.4.0
*
* Target : Arm(R) M-Profile Architecture
* -------------------------------------------------------------------- */
Expand Down Expand Up @@ -111,6 +111,17 @@ typedef struct
int32_t shift; /**< Shift value */
} cmsis_nn_per_tensor_quant_params;

/** CMSIS-NN object for quantization parameters.
 * This struct supports both per-tensor and per-channel requantization
 * and is recommended for new operators.
 */
typedef struct
{
int32_t *multiplier; /**< Multiplier values */
int32_t *shift; /**< Shift values */
int32_t is_per_channel; /**< Non-zero indicates per-channel quantization; zero indicates per-tensor */
} cmsis_nn_quant_params;

/** CMSIS-NN object for the quantized Relu activation */
typedef struct
{
Expand Down
110 changes: 105 additions & 5 deletions Include/arm_nnfunctions.h
Original file line number Diff line number Diff line change
Expand Up @@ -21,8 +21,8 @@
* Title: arm_nnfunctions.h
* Description: Public header file for CMSIS NN Library
*
* $Date: 19 June 2024
* $Revision: V.16.2.0
* $Date: 19 Aug 2024
* $Revision: V.16.3.0
*
* Target : Arm(R) M-Profile Architecture
* -------------------------------------------------------------------- */
Expand Down Expand Up @@ -1499,7 +1499,7 @@ int32_t arm_depthwise_conv_s4_opt_get_buffer_size(const cmsis_nn_dims *input_dim
* fc_params->filter_offset : 0
* Range of fc_params->output_offset : [-128, 127]
* @param[in] quant_params Per-tensor quantization info.
* It contains the multiplier and shift values to be applied to the output tensor.
* It contains the multiplier and shift value to be applied to the output tensor.
* @param[in] input_dims Input (activation) tensor dimensions. Format: [N, H, W, C_IN]
* Input dimension is taken as Nx(H * W * C_IN)
* @param[in] input_data Input (activation) data pointer. Data type: int8
Expand Down Expand Up @@ -1547,7 +1547,7 @@ arm_cmsis_nn_status arm_fully_connected_s4(const cmsis_nn_context *ctx,
* fc_params->filter_offset : 0
* Range of fc_params->output_offset : [-128, 127]
* @param[in] quant_params Per-tensor quantization info.
* It contains the multiplier and shift values to be applied to the output tensor.
* It contains the multiplier and shift value to be applied to the output tensor.
* @param[in] input_dims Input (activation) tensor dimensions. Format: [N, H, W, C_IN]
* Input dimension is taken as Nx(H * W * C_IN)
* @param[in] input_data Input (activation) data pointer. Data type: int8
Expand Down Expand Up @@ -1584,6 +1584,106 @@ arm_cmsis_nn_status arm_fully_connected_s8(const cmsis_nn_context *ctx,
const cmsis_nn_dims *output_dims,
int8_t *output_data);

/**
* @brief Basic s8 Fully Connected function using per channel quantization.
*
* @param[in, out] ctx Function context (e.g. temporary buffer). Check the function
* definition file to see if an additional buffer is required.
* Optional function {API}_get_buffer_size() provides the buffer
* size if an additional buffer is required.
* The caller is expected to clear the buffer, if applicable, for security reasons.
* @param[in] fc_params Fully Connected layer parameters.
* Range of fc_params->input_offset : [-127, 128]
* fc_params->filter_offset : 0
* Range of fc_params->output_offset : [-128, 127]
* @param[in] quant_params Per-channel quantization info.
* It contains the multiplier and shift values to be applied to each output channel
* @param[in] input_dims Input (activation) tensor dimensions. Format: [N, H, W, C_IN]
* Input dimension is taken as Nx(H * W * C_IN)
* @param[in] input_data Input (activation) data pointer. Data type: int8
* @param[in] filter_dims Two dimensional filter dimensions. Format: [N, C]
* N : accumulation depth and equals (H * W * C_IN) from input_dims
* C : output depth and equals C_OUT in output_dims
* H & W : Not used
* @param[in] filter_data Filter data pointer. Data type: int8
* @param[in] bias_dims Bias tensor dimensions. Format: [C_OUT]
* N, H, W : Not used
* @param[in] bias_data Bias data pointer. Data type: int32
* @param[in] output_dims Output tensor dimensions. Format: [N, C_OUT]
* N : Batches
* C_OUT : Output depth
* H & W : Not used.
* @param[in, out] output_data Output data pointer. Data type: int8
*
* @return The function returns either
* <code>ARM_CMSIS_NN_ARG_ERROR</code> if argument constraints fail, or
* <code>ARM_CMSIS_NN_SUCCESS</code> on successful completion.
*
* @details
* - Supported framework: TensorFlow Lite
*/
arm_cmsis_nn_status arm_fully_connected_per_channel_s8(const cmsis_nn_context *ctx,
const cmsis_nn_fc_params *fc_params,
const cmsis_nn_per_channel_quant_params *quant_params,
const cmsis_nn_dims *input_dims,
const int8_t *input_data,
const cmsis_nn_dims *filter_dims,
const int8_t *filter_data,
const cmsis_nn_dims *bias_dims,
const int32_t *bias_data,
const cmsis_nn_dims *output_dims,
int8_t *output_data);

/**
* @brief s8 Fully Connected layer wrapper function
*
* @param[in, out] ctx Function context (e.g. temporary buffer). Check the function
* definition file to see if an additional buffer is required.
* Optional function {API}_get_buffer_size() provides the buffer
* size if an additional buffer is required.
* The caller is expected to clear the buffer, if applicable, for security reasons.
* @param[in] fc_params Fully Connected layer parameters.
* Range of fc_params->input_offset : [-127, 128]
* fc_params->filter_offset : 0
* Range of fc_params->output_offset : [-128, 127]
* @param[in] quant_params Per-channel or per-tensor quantization info. Check struct definition for details.
* It contains the multiplier and shift value(s) to be applied to each output channel
* @param[in] input_dims Input (activation) tensor dimensions. Format: [N, H, W, C_IN]
* Input dimension is taken as Nx(H * W * C_IN)
* @param[in] input_data Input (activation) data pointer. Data type: int8
* @param[in] filter_dims Two dimensional filter dimensions. Format: [N, C]
* N : accumulation depth and equals (H * W * C_IN) from input_dims
* C : output depth and equals C_OUT in output_dims
* H & W : Not used
* @param[in] filter_data Filter data pointer. Data type: int8
* @param[in] bias_dims Bias tensor dimensions. Format: [C_OUT]
* N, H, W : Not used
* @param[in] bias_data Bias data pointer. Data type: int32
* @param[in] output_dims Output tensor dimensions. Format: [N, C_OUT]
* N : Batches
* C_OUT : Output depth
* H & W : Not used.
* @param[in, out] output_data Output data pointer. Data type: int8
*
* @return The function returns either
* <code>ARM_CMSIS_NN_ARG_ERROR</code> if argument constraints fail, or
* <code>ARM_CMSIS_NN_SUCCESS</code> on successful completion.
*
* @details
* - Supported framework: TensorFlow Lite
*/
arm_cmsis_nn_status arm_fully_connected_wrapper_s8(const cmsis_nn_context *ctx,
const cmsis_nn_fc_params *fc_params,
const cmsis_nn_quant_params *quant_params,
const cmsis_nn_dims *input_dims,
const int8_t *input_data,
const cmsis_nn_dims *filter_dims,
const int8_t *filter_data,
const cmsis_nn_dims *bias_dims,
const int32_t *bias_data,
const cmsis_nn_dims *output_dims,
int8_t *output_data);

/**
* @brief Calculate the sum of each row in vector_data, multiply by lhs_offset and optionally add s32 bias_data.
* @param[in, out] vector_sum_buf Buffer for vector sums
Expand Down Expand Up @@ -1662,7 +1762,7 @@ int32_t arm_fully_connected_s8_get_buffer_size_mve(const cmsis_nn_dims *filter_d
* fc_params->filter_offset : 0
* fc_params->output_offset : 0
* @param[in] quant_params Per-tensor quantization info.
* It contains the multiplier and shift values to be applied to the output tensor.
* It contains the multiplier and shift value to be applied to the output tensor.
* @param[in] input_dims Input (activation) tensor dimensions. Format: [N, H, W, C_IN]
* Input dimension is taken as Nx(H * W * C_IN)
* @param[in] input_data Input (activation) data pointer. Data type: int16
Expand Down
45 changes: 43 additions & 2 deletions Include/arm_nnsupportfunctions.h
Original file line number Diff line number Diff line change
Expand Up @@ -21,8 +21,8 @@
* Title: arm_nnsupportfunctions.h
* Description: Public header file of support functions for CMSIS NN Library
*
* $Date: 19 June 2024
* $Revision: V.22.2.0
* $Date: 12 Jul 2024
* $Revision: V.22.3.0
*
* Target : Arm(R) M-Profile Architecture
* -------------------------------------------------------------------- */
Expand Down Expand Up @@ -703,6 +703,47 @@ arm_cmsis_nn_status arm_nn_vec_mat_mult_t_s8(const int8_t *lhs,
const int32_t address_offset,
const int32_t rhs_offset);

/**
* @brief s8 Vector by Matrix (transposed) multiplication using per channel quantization for output
*
* @param[in] lhs Input left-hand side vector
* @param[in] rhs Input right-hand side matrix (transposed)
* @param[in] kernel_sum Kernel sums of the kernels (rhs). See arm_vector_sum_s8 for more info.
* @param[in] bias Input bias
* @param[out] dst Output vector
* @param[in] lhs_offset Offset to be added to the input values of the left-hand side vector.
* Range: -127 to 128
* @param[in] dst_offset Offset to be added to the output values. Range: -127 to 128
* @param[in] dst_multiplier Output multipliers
* @param[in] dst_shift Output shifts
* @param[in] rhs_cols Number of columns in the right-hand side input matrix
* @param[in] rhs_rows Number of rows in the right-hand side input matrix
* @param[in] activation_min Minimum value to clamp the output to. Range: int8
* @param[in] activation_max Maximum value to clamp the output to. Range: int8
* @param[in] address_offset Memory position offset for dst. First output is stored at 'dst', the
* second at 'dst + address_offset' and so on. Default value is typically 1.
* @param[in] rhs_offset Offset to be added to the input values of the right-hand side vector.
* Range: -127 to 128
*
* @return The function returns <code>ARM_CMSIS_NN_SUCCESS</code>
*
*/
arm_cmsis_nn_status arm_nn_vec_mat_mult_t_per_ch_s8(const int8_t *lhs,
const int8_t *rhs,
const int32_t *kernel_sum,
const int32_t *bias,
int8_t *dst,
const int32_t lhs_offset,
const int32_t dst_offset,
const int32_t *dst_multiplier,
const int32_t *dst_shift,
const int32_t rhs_cols,
const int32_t rhs_rows,
const int32_t activation_min,
const int32_t activation_max,
const int32_t address_offset,
const int32_t rhs_offset);

/**
* @brief s16 Vector by s8 Matrix (transposed) multiplication
*
Expand Down
102 changes: 102 additions & 0 deletions Source/FullyConnectedFunctions/arm_fully_connected_per_channel_s8.c
Original file line number Diff line number Diff line change
@@ -0,0 +1,102 @@
/*
* SPDX-FileCopyrightText: Copyright 2010-2024 Arm Limited and/or its affiliates <[email protected]>
*
* SPDX-License-Identifier: Apache-2.0
*
* Licensed under the Apache License, Version 2.0 (the License); you may
* not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an AS IS BASIS, WITHOUT
* WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

/* ----------------------------------------------------------------------
* Project: CMSIS NN Library
* Title: arm_fully_connected_per_channel_s8
* Description: Fully connected function compatible with TF Lite.
*
* $Date: 15 Aug 2024
* $Revision: V.1.0.0
*
* Target : Arm(R) M-Profile Architecture
*
* -------------------------------------------------------------------- */

#include "arm_nnfunctions.h"
#include "arm_nnsupportfunctions.h"

/**
* @ingroup Public
*/

/**
* @addtogroup FC
* @{
*/

/*
* S8 basic fully-connected and matrix multiplication layer function using per-channel quantization for TensorFlow Lite
*
* Refer header file for details.
*
*/
arm_cmsis_nn_status arm_fully_connected_per_channel_s8(const cmsis_nn_context *ctx,
                                                       const cmsis_nn_fc_params *fc_params,
                                                       const cmsis_nn_per_channel_quant_params *quant_params,
                                                       const cmsis_nn_dims *input_dims,
                                                       const int8_t *input_data,
                                                       const cmsis_nn_dims *filter_dims,
                                                       const int8_t *kernel,
                                                       const cmsis_nn_dims *bias_dims,
                                                       const int32_t *bias_data,
                                                       const cmsis_nn_dims *output_dims,
                                                       int8_t *output_data)
{
    (void)bias_dims;

#if defined(ARM_MATH_MVEI)
    /* The MVE path requires a scratch buffer holding precomputed kernel sums. */
    if (ctx->buf == NULL)
    {
        return (ARM_CMSIS_NN_ARG_ERROR);
    }
#endif

    const int32_t *kernel_sum = (const int32_t *)ctx->buf;

    /* Accumulation depth per output (col_dim) and number of outputs (row_dim). */
    const int32_t accum_depth = filter_dims->n;
    const int32_t output_depth = output_dims->c;

    /* Process one batch at a time: each batch is a vector-by-matrix
     * multiplication with per-channel requantization of the result. */
    for (int32_t batch = input_dims->n; batch > 0; batch--)
    {
        arm_nn_vec_mat_mult_t_per_ch_s8(input_data,
                                        kernel,
                                        kernel_sum,
                                        bias_data,
                                        output_data,
                                        fc_params->input_offset,
                                        fc_params->output_offset,
                                        quant_params->multiplier,
                                        quant_params->shift,
                                        accum_depth,
                                        output_depth,
                                        fc_params->activation.min,
                                        fc_params->activation.max,
                                        1L, /* Contiguous output: address_offset of one */
                                        fc_params->filter_offset);

        input_data += accum_depth;
        output_data += output_depth;
    }

    return (ARM_CMSIS_NN_SUCCESS);
}

/**
* @} end of FC group
*/
Loading

0 comments on commit b958a3f

Please sign in to comment.