Skip to content

Commit

Permalink
Support for per channel quantized FC (#142)
Browse files Browse the repository at this point in the history
* Adds new FC top-level functions as well as a support function for
per-channel quantization.
* Adds a new FC wrapper.
* Remove gcc unroll pragma for MVE in current FC support function.
  • Loading branch information
mansnils authored Aug 21, 2024
1 parent 9d924bd commit b958a3f
Show file tree
Hide file tree
Showing 57 changed files with 2,132 additions and 120 deletions.
3 changes: 3 additions & 0 deletions ARM.CMSIS-NN.pdsc
Original file line number Diff line number Diff line change
Expand Up @@ -103,6 +103,7 @@
<file category="source" name="Source/ReshapeFunctions/arm_reshape_s8.c"/>
<file category="source" name="Source/NNSupportFunctions/arm_nn_vec_mat_mult_t_s4.c"/>
<file category="source" name="Source/NNSupportFunctions/arm_nn_vec_mat_mult_t_s8.c"/>
<file category="source" name="Source/NNSupportFunctions/arm_nn_vec_mat_mult_t_per_ch_s8.c"/>
<file category="source" name="Source/NNSupportFunctions/arm_nn_vec_mat_mult_t_s16.c"/>
<file category="source" name="Source/NNSupportFunctions/arm_nn_vec_mat_mult_t_s16_s16.c"/>
<file category="source" name="Source/NNSupportFunctions/arm_nn_vec_mat_mult_t_svdf_s8.c"/>
Expand Down Expand Up @@ -132,6 +133,8 @@
<file category="source" name="Source/FullyConnectedFunctions/arm_batch_matmul_s16.c"/>
<file category="source" name="Source/FullyConnectedFunctions/arm_fully_connected_s4.c"/>
<file category="source" name="Source/FullyConnectedFunctions/arm_fully_connected_s8.c"/>
<file category="source" name="Source/FullyConnectedFunctions/arm_fully_connected_per_channel_s8.c"/>
<file category="source" name="Source/FullyConnectedFunctions/arm_fully_connected_wrapper_s8.c"/>
<file category="source" name="Source/FullyConnectedFunctions/arm_fully_connected_s16.c"/>
<file category="source" name="Source/FullyConnectedFunctions/arm_fully_connected_get_buffer_sizes_s16.c"/>
<file category="source" name="Source/FullyConnectedFunctions/arm_fully_connected_get_buffer_sizes_s8.c"/>
Expand Down
15 changes: 13 additions & 2 deletions Include/arm_nn_types.h
Original file line number Diff line number Diff line change
Expand Up @@ -22,8 +22,8 @@
* Description: Public header file to contain the CMSIS-NN structs for the
* TensorFlowLite micro compliant functions
*
* $Date: 19 June 2024
* $Revision: V.3.3.0
* $Date: 19 Aug 2024
* $Revision: V.3.4.0
*
* Target : Arm(R) M-Profile Architecture
* -------------------------------------------------------------------- */
Expand Down Expand Up @@ -111,6 +111,17 @@ typedef struct
int32_t shift; /**< Shift value */
} cmsis_nn_per_tensor_quant_params;

/** CMSIS-NN object for quantization parameters.
 * This struct supports both per-tensor and per-channel requantization
 * and is recommended for new operators.
 */
typedef struct
{
int32_t *multiplier; /**< Multiplier values */
int32_t *shift; /**< Shift values */
int32_t is_per_channel; /**< Non-zero indicates per-channel quantization; zero indicates per-tensor */
} cmsis_nn_quant_params;

/** CMSIS-NN object for the quantized Relu activation */
typedef struct
{
Expand Down
110 changes: 105 additions & 5 deletions Include/arm_nnfunctions.h
Original file line number Diff line number Diff line change
Expand Up @@ -21,8 +21,8 @@
* Title: arm_nnfunctions.h
* Description: Public header file for CMSIS NN Library
*
* $Date: 19 June 2024
* $Revision: V.16.2.0
* $Date: 19 Aug 2024
* $Revision: V.16.3.0
*
* Target : Arm(R) M-Profile Architecture
* -------------------------------------------------------------------- */
Expand Down Expand Up @@ -1499,7 +1499,7 @@ int32_t arm_depthwise_conv_s4_opt_get_buffer_size(const cmsis_nn_dims *input_dim
* fc_params->filter_offset : 0
* Range of fc_params->output_offset : [-128, 127]
* @param[in] quant_params Per-tensor quantization info.
* It contains the multiplier and shift values to be applied to the output tensor.
* It contains the multiplier and shift value to be applied to the output tensor.
* @param[in] input_dims Input (activation) tensor dimensions. Format: [N, H, W, C_IN]
* Input dimension is taken as Nx(H * W * C_IN)
* @param[in] input_data Input (activation) data pointer. Data type: int8
Expand Down Expand Up @@ -1547,7 +1547,7 @@ arm_cmsis_nn_status arm_fully_connected_s4(const cmsis_nn_context *ctx,
* fc_params->filter_offset : 0
* Range of fc_params->output_offset : [-128, 127]
* @param[in] quant_params Per-tensor quantization info.
* It contains the multiplier and shift values to be applied to the output tensor.
* It contains the multiplier and shift value to be applied to the output tensor.
* @param[in] input_dims Input (activation) tensor dimensions. Format: [N, H, W, C_IN]
* Input dimension is taken as Nx(H * W * C_IN)
* @param[in] input_data Input (activation) data pointer. Data type: int8
Expand Down Expand Up @@ -1584,6 +1584,106 @@ arm_cmsis_nn_status arm_fully_connected_s8(const cmsis_nn_context *ctx,
const cmsis_nn_dims *output_dims,
int8_t *output_data);

/**
* @brief Basic s8 Fully Connected function using per channel quantization.
*
* @param[in, out] ctx Function context (e.g. temporary buffer). Check the function
* definition file to see if an additional buffer is required.
* Optional function {API}_get_buffer_size() provides the buffer
* size if an additional buffer is required.
* The caller is expected to clear the buffer, if applicable, for security reasons.
* @param[in] fc_params Fully Connected layer parameters.
* Range of fc_params->input_offset : [-127, 128]
* fc_params->filter_offset : 0
* Range of fc_params->output_offset : [-128, 127]
* @param[in] quant_params Per-channel quantization info.
* It contains the multiplier and shift values to be applied to each output channel
* @param[in] input_dims Input (activation) tensor dimensions. Format: [N, H, W, C_IN]
* Input dimension is taken as Nx(H * W * C_IN)
* @param[in] input_data Input (activation) data pointer. Data type: int8
* @param[in] filter_dims Two dimensional filter dimensions. Format: [N, C]
* N : accumulation depth and equals (H * W * C_IN) from input_dims
* C : output depth and equals C_OUT in output_dims
* H & W : Not used
* @param[in] filter_data Filter data pointer. Data type: int8
* @param[in] bias_dims Bias tensor dimensions. Format: [C_OUT]
* N, H, W : Not used
* @param[in] bias_data Bias data pointer. Data type: int32
* @param[in] output_dims Output tensor dimensions. Format: [N, C_OUT]
* N : Batches
* C_OUT : Output depth
* H & W : Not used.
* @param[in, out] output_data Output data pointer. Data type: int8
*
* @return The function returns either
* <code>ARM_CMSIS_NN_ARG_ERROR</code> if argument constraints fail, or
* <code>ARM_CMSIS_NN_SUCCESS</code> on successful completion.
*
* @details
* - Supported framework: TensorFlow Lite
*/
arm_cmsis_nn_status arm_fully_connected_per_channel_s8(const cmsis_nn_context *ctx,
const cmsis_nn_fc_params *fc_params,
const cmsis_nn_per_channel_quant_params *quant_params,
const cmsis_nn_dims *input_dims,
const int8_t *input_data,
const cmsis_nn_dims *filter_dims,
const int8_t *filter_data,
const cmsis_nn_dims *bias_dims,
const int32_t *bias_data,
const cmsis_nn_dims *output_dims,
int8_t *output_data);

/**
* @brief s8 Fully Connected layer wrapper function
*
* @param[in, out] ctx Function context (e.g. temporary buffer). Check the function
* definition file to see if an additional buffer is required.
* Optional function {API}_get_buffer_size() provides the buffer
* size if an additional buffer is required.
* The caller is expected to clear the buffer, if applicable, for security reasons.
* @param[in] fc_params Fully Connected layer parameters.
* Range of fc_params->input_offset : [-127, 128]
* fc_params->filter_offset : 0
* Range of fc_params->output_offset : [-128, 127]
* @param[in] quant_params Per-channel or per-tensor quantization info. Check struct definition for details.
* It contains the multiplier and shift value(s) to be applied to each output channel
* @param[in] input_dims Input (activation) tensor dimensions. Format: [N, H, W, C_IN]
* Input dimension is taken as Nx(H * W * C_IN)
* @param[in] input_data Input (activation) data pointer. Data type: int8
* @param[in] filter_dims Two dimensional filter dimensions. Format: [N, C]
* N : accumulation depth and equals (H * W * C_IN) from input_dims
* C : output depth and equals C_OUT in output_dims
* H & W : Not used
* @param[in] filter_data Filter data pointer. Data type: int8
* @param[in] bias_dims Bias tensor dimensions. Format: [C_OUT]
* N, H, W : Not used
* @param[in] bias_data Bias data pointer. Data type: int32
* @param[in] output_dims Output tensor dimensions. Format: [N, C_OUT]
* N : Batches
* C_OUT : Output depth
* H & W : Not used.
* @param[in, out] output_data Output data pointer. Data type: int8
*
* @return The function returns either
* <code>ARM_CMSIS_NN_ARG_ERROR</code> if argument constraints fail, or
* <code>ARM_CMSIS_NN_SUCCESS</code> on successful completion.
*
* @details
* - Supported framework: TensorFlow Lite
*/
arm_cmsis_nn_status arm_fully_connected_wrapper_s8(const cmsis_nn_context *ctx,
const cmsis_nn_fc_params *fc_params,
const cmsis_nn_quant_params *quant_params,
const cmsis_nn_dims *input_dims,
const int8_t *input_data,
const cmsis_nn_dims *filter_dims,
const int8_t *filter_data,
const cmsis_nn_dims *bias_dims,
const int32_t *bias_data,
const cmsis_nn_dims *output_dims,
int8_t *output_data);

/**
* @brief Calculate the sum of each row in vector_data, multiply by lhs_offset and optionally add s32 bias_data.
* @param[in, out] vector_sum_buf Buffer for vector sums
Expand Down Expand Up @@ -1662,7 +1762,7 @@ int32_t arm_fully_connected_s8_get_buffer_size_mve(const cmsis_nn_dims *filter_d
* fc_params->filter_offset : 0
* fc_params->output_offset : 0
* @param[in] quant_params Per-tensor quantization info.
* It contains the multiplier and shift values to be applied to the output tensor.
* It contains the multiplier and shift value to be applied to the output tensor.
* @param[in] input_dims Input (activation) tensor dimensions. Format: [N, H, W, C_IN]
* Input dimension is taken as Nx(H * W * C_IN)
* @param[in] input_data Input (activation) data pointer. Data type: int16
Expand Down
45 changes: 43 additions & 2 deletions Include/arm_nnsupportfunctions.h
Original file line number Diff line number Diff line change
Expand Up @@ -21,8 +21,8 @@
* Title: arm_nnsupportfunctions.h
* Description: Public header file of support functions for CMSIS NN Library
*
* $Date: 19 June 2024
* $Revision: V.22.2.0
* $Date: 12 Jul 2024
* $Revision: V.22.3.0
*
* Target : Arm(R) M-Profile Architecture
* -------------------------------------------------------------------- */
Expand Down Expand Up @@ -703,6 +703,47 @@ arm_cmsis_nn_status arm_nn_vec_mat_mult_t_s8(const int8_t *lhs,
const int32_t address_offset,
const int32_t rhs_offset);

/**
* @brief s8 Vector by Matrix (transposed) multiplication using per channel quantization for output
*
* @param[in] lhs Input left-hand side vector
* @param[in] rhs Input right-hand side matrix (transposed)
* @param[in] kernel_sum Kernel sums of the kernels (rhs). See arm_vector_sum_s8 for more info.
* @param[in] bias Input bias
* @param[out] dst Output vector
* @param[in] lhs_offset Offset to be added to the input values of the left-hand side vector.
* Range: -127 to 128
* @param[in] dst_offset Offset to be added to the output values. Range: -127 to 128
* @param[in] dst_multiplier Output multipliers
* @param[in] dst_shift Output shifts
* @param[in] rhs_cols Number of columns in the right-hand side input matrix
* @param[in] rhs_rows Number of rows in the right-hand side input matrix
* @param[in] activation_min Minimum value to clamp the output to. Range: int8
* @param[in] activation_max Maximum value to clamp the output to. Range: int8
* @param[in] address_offset Memory position offset for dst. First output is stored at 'dst', the
* second at 'dst + address_offset' and so on. Default value is typically 1.
* @param[in] rhs_offset Offset to be added to the input values of the right-hand side vector.
* Range: -127 to 128
*
* @return The function returns <code>ARM_CMSIS_NN_SUCCESS</code>
*
*/
arm_cmsis_nn_status arm_nn_vec_mat_mult_t_per_ch_s8(const int8_t *lhs,
const int8_t *rhs,
const int32_t *kernel_sum,
const int32_t *bias,
int8_t *dst,
const int32_t lhs_offset,
const int32_t dst_offset,
const int32_t *dst_multiplier,
const int32_t *dst_shift,
const int32_t rhs_cols,
const int32_t rhs_rows,
const int32_t activation_min,
const int32_t activation_max,
const int32_t address_offset,
const int32_t rhs_offset);

/**
* @brief s16 Vector by s8 Matrix (transposed) multiplication
*
Expand Down
102 changes: 102 additions & 0 deletions Source/FullyConnectedFunctions/arm_fully_connected_per_channel_s8.c
Original file line number Diff line number Diff line change
@@ -0,0 +1,102 @@
/*
* SPDX-FileCopyrightText: Copyright 2010-2024 Arm Limited and/or its affiliates <[email protected]>
*
* SPDX-License-Identifier: Apache-2.0
*
* Licensed under the Apache License, Version 2.0 (the License); you may
* not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an AS IS BASIS, WITHOUT
* WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

/* ----------------------------------------------------------------------
* Project: CMSIS NN Library
* Title: arm_fully_connected_per_channel_s8
* Description: Fully connected function compatible with TF Lite.
*
* $Date: 15 Aug 2024
* $Revision: V.1.0.0
*
* Target : Arm(R) M-Profile Architecture
*
* -------------------------------------------------------------------- */

#include "arm_nnfunctions.h"
#include "arm_nnsupportfunctions.h"

/**
* @ingroup Public
*/

/**
* @addtogroup FC
* @{
*/

/*
* S8 basic fully-connected and matrix multiplication layer function using per-channel quantization for TensorFlow Lite
*
* Refer header file for details.
*
*/
arm_cmsis_nn_status arm_fully_connected_per_channel_s8(const cmsis_nn_context *ctx,
                                                       const cmsis_nn_fc_params *fc_params,
                                                       const cmsis_nn_per_channel_quant_params *quant_params,
                                                       const cmsis_nn_dims *input_dims,
                                                       const int8_t *input_data,
                                                       const cmsis_nn_dims *filter_dims,
                                                       const int8_t *kernel,
                                                       const cmsis_nn_dims *bias_dims,
                                                       const int32_t *bias_data,
                                                       const cmsis_nn_dims *output_dims,
                                                       int8_t *output_data)
{
    (void)bias_dims;

#if defined(ARM_MATH_MVEI)
    /* The MVE path requires a scratch buffer holding precomputed kernel sums. */
    if (ctx->buf == NULL)
    {
        return (ARM_CMSIS_NN_ARG_ERROR);
    }
#endif

    const int32_t *kernel_sum = (const int32_t *)ctx->buf;

    /* Accumulation depth per output (col_dim) and number of outputs (row_dim). */
    const int32_t accum_depth = filter_dims->n;
    const int32_t output_depth = output_dims->c;

    /* Process one batch at a time: each batch is a vector-by-matrix
     * multiplication with per-channel requantization of the result. */
    for (int32_t batch = input_dims->n; batch > 0; batch--)
    {
        arm_nn_vec_mat_mult_t_per_ch_s8(input_data,
                                        kernel,
                                        kernel_sum,
                                        bias_data,
                                        output_data,
                                        fc_params->input_offset,
                                        fc_params->output_offset,
                                        quant_params->multiplier,
                                        quant_params->shift,
                                        accum_depth,
                                        output_depth,
                                        fc_params->activation.min,
                                        fc_params->activation.max,
                                        1L, /* Contiguous output: address_offset of one */
                                        fc_params->filter_offset);

        input_data += accum_depth;
        output_data += output_depth;
    }

    return (ARM_CMSIS_NN_SUCCESS);
}

/**
* @} end of FC group
*/
Loading

0 comments on commit b958a3f

Please sign in to comment.