ARM-software · christophe0606 · Jan 24, 2025 · Jun 24, 2024 · Jun 24, 2024 · Jun 24, 2024
diff --git a/.github/workflows/runneontest.yaml b/.github/workflows/runneontest.yaml
@@ -0,0 +1,118 @@
+name: Neon tests
+on:  # manual dispatch, pull requests targeting main, pushes to main
+  workflow_dispatch:
+  pull_request:
+    branches: [main]
+  push:
+    branches: [main]
+
+permissions:  # least privilege: any scope not listed here is set to none
+  actions: read
+  contents: read  # required by actions/checkout; no step uploads code-scanning results
+
+jobs:
+  CI_test_run:  # single job; the steps below run in order
+    runs-on: ubuntu-22.04-arm
+
+    steps:
+      - name: Checkout
+        uses: actions/checkout@v4
+
+      - name: Setup Python 3.10
+        uses: actions/setup-python@v5
+        with:
+          python-version: '3.10'  # quoted so YAML keeps the string 3.10 instead of the float 3.1
+
+      - name: Install system packages
+        run: |
+          sudo add-apt-repository -y ppa:deadsnakes/ppa  # -y: never wait for a confirmation on the unattended runner
+          sudo apt-get install -y libpython3.9 libtinfo5
+          sudo apt-get install -y build-essential  # apt-get instead of apt: stable interface for scripts
+          sudo update-alternatives --install /usr/bin/gcc gcc /usr/bin/gcc-11 60 --slave /usr/bin/g++ g++ /usr/bin/g++-11
+          sudo update-alternatives --install /usr/bin/gcc gcc /usr/bin/gcc-12 40 --slave /usr/bin/g++ g++ /usr/bin/g++-12
+          sudo update-alternatives --set gcc  /usr/bin/gcc-12
+
+
+      - name: Activate vcpkg
+        uses: ARM-software/cmsis-actions/vcpkg@v1
+        with:
+          config: './vcpkg-neon-configuration.json'  # literal path, no escapes needed
+
+      - name: Prepare framework
+        run: |
+          # Create the working folders, preprocess the test descriptions, then configure the build.
+          cd Testing
+          echo "Create missing folders"
+          # Plain mkdir (no -p), one folder at a time: the first failure is the one reported.
+          for folder in FullBenchmark Output GeneratedInclude GeneratedSource \
+                        GeneratedIncludeBench GeneratedSourceBench build; do
+            mkdir "$folder"
+          done
+
+          echo "Install missing python packages"
+          pip install -r requirements.txt
+
+          echo "Preprocess test description"
+          # "" selects desc.txt / Output.pickle; the suffixes select the neon and f16 files.
+          for suite in "" _neon _f16; do
+            python preprocess.py -f "desc${suite}.txt" -o "Output${suite}.pickle"
+          done
+
+          echo "Generate missing CPP headers"
+          for suite in "" _neon _f16; do
+            python processTests.py -gen . -p Patterns -d Parameters -f "Output${suite}.pickle" -e
+          done
+
+          cd build
+
+          cmake -G "Ninja" ..
+
+#      - name: Setup tmate session
+#        uses: mxschmitt/action-tmate@v3
+
+      - name: Execute generic tests
+        working-directory: Testing/build
+        run: |
+          # Run processTests.py for Output.pickle, build with ninja, run ./test, render its output as HTML.
+          python ../processTests.py -p ../Patterns -d ../Parameters -gen .. -e -f ../Output.pickle
+          ninja
+          ./test > result.txt
+          python ../processResult.py --noerr -e -f ../Output.pickle -r result.txt -html > result.html
+
+      - name: Execute neon specific C tests
+        working-directory: Testing/build
+        run: |
+          # Run processTests.py for Output_neon.pickle, build with ninja, run ./test, render its output as HTML.
+          python ../processTests.py -p ../Patterns -d ../Parameters -gen .. -e -f ../Output_neon.pickle
+          ninja
+          ./test > result_neon.txt
+          python ../processResult.py --noerr -e -f ../Output_neon.pickle -r result_neon.txt -html > result_neon.html
+
+      - name: Execute f16 C tests
+        working-directory: Testing/build
+        run: |
+          # Run processTests.py for Output_f16.pickle, build with ninja, run ./test, render its output as HTML.
+          python ../processTests.py -p ../Patterns -d ../Parameters -gen .. -e -f ../Output_f16.pickle
+          ninja
+          ./test > result_f16.txt
+          python ../processResult.py --noerr -e -f ../Output_f16.pickle -r result_f16.txt -html > result_f16.html
+
+      - name: Upload test report
+        uses: actions/upload-artifact@v4
+        with:  # the three HTML reports are bundled into one artifact
+          name: neon-test-report
+          path: |
+            Testing/build/result.html
+            Testing/build/result_neon.html
+            Testing/build/result_f16.html
+
+
+      - name: Check error
+        working-directory: Testing/build
+        run: |
+          echo "Checking output..."
+          for report in result.html result_neon.html result_f16.html; do
+            test -s "$report"  # a missing or empty report must fail instead of counting as zero failures
+            test "$(grep -c "FAILED" "$report")" -eq 0
+          done
+
diff --git a/Include/dsp/matrix_utils.h b/Include/dsp/matrix_utils.h
@@ -51,7 +51,7 @@ extern "C"
                                         \
   for(_w=0;_w < nb; _w++)                  \
   {                                     \
-     *data *= CAST v;                   \
+     *data = CAST *data * CAST v;                   \
      data += _numCols;                   \
   }                                     \
 }
@@ -178,54 +178,63 @@ extern "C"
   }                                    \
 }
 
-#define SCALE_ROW_F16(A,COL,v,i)       \
-{                                      \
+#define SCALE_ROW_F16(A,COL,v,i)        \
+{                                       \
  int32_t _w;                           \
-  float16_t *data = (A)->pData;        \
+  float16_t *data = (A)->pData;         \
  const int32_t _numCols = (A)->numCols;\
  const int32_t nb = _numCols-(COL);    \
-                                       \
+                                        \
  data += i*_numCols + (COL);           \
-                                       \
-  for(_w=0;_w < nb; _w++)                 \
-  {                                    \
-     *data++ *= (_Float16)v;           \
-  }                                    \
+  /* A[i][COL..] *= v via a _Float16 temp */\
+  _Float16 _scaled;                     \
+  for(_w=0;_w < nb; _w++)               \
+  {                                     \
+     _scaled = *data;                   \
+     _scaled *= (_Float16)(v);          \
+     *data++ = _scaled;                 \
+  }                                     \
 }
 
 
-#define MAC_ROW_F16(COL,A,i,v,B,j)                \
-{                                                 \
-  int32_t _w;                                      \
-  float16_t *dataA = (A)->pData;                  \
-  float16_t *dataB = (B)->pData;                  \
-  const int32_t _numCols = (A)->numCols;           \
-  const int32_t nb = _numCols-(COL);               \
-                                                  \
-  dataA += i*_numCols + (COL);                     \
-  dataB += j*_numCols + (COL);                     \
-                                                  \
-  for(_w=0;_w < nb; _w++)                            \
-  {                                               \
-     *dataA++ += (_Float16)v * (_Float16)*dataB++;\
-  }                                               \
+#define MAC_ROW_F16(COL,A,i,v,B,j)           \
+{                                            \
+  int32_t _w;                                \
+  float16_t *dataA = (A)->pData;             \
+  float16_t *dataB = (B)->pData;             \
+  const int32_t _numCols = (A)->numCols;     \
+  const int32_t nb = _numCols-(COL);         \
+                                             \
+  dataA += (i)*_numCols + (COL);             \
+  dataB += (j)*_numCols + (COL);             \
+  /* A[i][COL..] += v * B[j][COL..] */       \
+  _Float16 _acc;                             \
+  for(_w=0;_w < nb; _w++)                    \
+  {                                          \
+     _acc = *dataA;                          \
+     _acc += (_Float16)(v) * (_Float16)*dataB++;\
+     *dataA++ = _acc;                        \
+  }                                          \
 }
 
-#define MAS_ROW_F16(COL,A,i,v,B,j)                \
-{                                                 \
-  int32_t _w;                                      \
-  float16_t *dataA = (A)->pData;                  \
-  float16_t *dataB = (B)->pData;                  \
-  const int32_t _numCols = (A)->numCols;           \
-  const int32_t nb = _numCols-(COL);               \
-                                                  \
-  dataA += i*_numCols + (COL);                     \
-  dataB += j*_numCols + (COL);                     \
-                                                  \
-  for(_w=0;_w < nb; _w++)                            \
-  {                                               \
-     *dataA++ -= (_Float16)v * (_Float16)*dataB++;\
-  }                                               \
+#define MAS_ROW_F16(COL,A,i,v,B,j)           \
+{                                            \
+  int32_t _w;                                \
+  float16_t *dataA = (A)->pData;             \
+  float16_t *dataB = (B)->pData;             \
+  const int32_t _numCols = (A)->numCols;     \
+  const int32_t nb = _numCols-(COL);         \
+                                             \
+  dataA += (i)*_numCols + (COL);             \
+  dataB += (j)*_numCols + (COL);             \
+  /* A[i][COL..] -= v * B[j][COL..] */       \
+  _Float16 _acc;                             \
+  for(_w=0;_w < nb; _w++)                    \
+  {                                          \
+     _acc = *dataA;                          \
+     _acc -= (_Float16)(v) * (_Float16)*dataB++;\
+     *dataA++ = _acc;                        \
+  }                                          \
 }
 
 #endif /*defined(ARM_MATH_MVE_FLOAT16) && !defined(ARM_MATH_AUTOVECTORIZE)*/

diff --git a/Ne10/CMSIS_NE10_fft.neonintrinsic.h b/Ne10/CMSIS_NE10_fft.neonintrinsic.h
@@ -110,17 +110,17 @@
 
 #define VDUPQ_N_F32(VAR) { VAR, VAR, VAR, VAR }
 
-#define CONST_TW_81   0.70710678
-#define CONST_TW_81N -0.70710678
+#define CONST_TW_81   0.70710678f /* sqrt(2) / 2 as a single-precision literal */
+#define CONST_TW_81N -0.70710678f /* -CONST_TW_81 */
 
-const static float32x4_t Q_TW_81    = VDUPQ_N_F32(CONST_TW_81 );
-const static float32x4_t Q_TW_81N   = VDUPQ_N_F32(CONST_TW_81N);
+static const float32x4_t Q_TW_81    = VDUPQ_N_F32(CONST_TW_81 ); /* CONST_TW_81 in all four lanes */
+static const float32x4_t Q_TW_81N   = VDUPQ_N_F32(CONST_TW_81N); /* CONST_TW_81N in all four lanes */
 
 #define DIV_TW81   1.4142136f
 #define DIV_TW81N -1.4142136f
 
-const static float32x4_t DIV_TW81_NEON  = VDUPQ_N_F32(DIV_TW81);
-const static float32x4_t DIV_TW81N_NEON = VDUPQ_N_F32(DIV_TW81N);
+static const float32x4_t DIV_TW81_NEON  = VDUPQ_N_F32(DIV_TW81); /* DIV_TW81 in all four lanes */
+static const float32x4_t DIV_TW81N_NEON = VDUPQ_N_F32(DIV_TW81N); /* DIV_TW81N in all four lanes */
 
 #define NE10_RADIX8x4_R2C_NEON_KERNEL_S1(Q_OUT,Q_IN) do {   \
         Q_OUT ## 0 = vaddq_f32 (Q_IN ## 0, Q_IN ## 4);      \

diff --git a/Ne10/CMSIS_NE10_fft.neonintrinsic_f16.h b/Ne10/CMSIS_NE10_fft.neonintrinsic_f16.h
@@ -119,14 +119,14 @@
 #define CONST_TW_81   0.70710678f16
 #define CONST_TW_81N -0.70710678f16
 
-const static float16x4_t Q_TW_81    = VDUPQ_N_F16(CONST_TW_81 );
-const static float16x4_t Q_TW_81N   = VDUPQ_N_F16(CONST_TW_81N);
+static const float16x4_t Q_TW_81    = VDUPQ_N_F16(CONST_TW_81 ); /* NOTE(review): Q_ name but 4-lane float16x4_t; VDUPQ_N_F16 is defined outside this hunk - confirm lane count */
+static const float16x4_t Q_TW_81N   = VDUPQ_N_F16(CONST_TW_81N);
 
 #define DIV_TW81   1.4142136f16
 #define DIV_TW81N -1.4142136f16
 
-const static float16x4_t DIV_TW81_NEON  = VDUPQ_N_F16(DIV_TW81);
-const static float16x4_t DIV_TW81N_NEON = VDUPQ_N_F16(DIV_TW81N);
+static const float16x4_t DIV_TW81_NEON  = VDUPQ_N_F16(DIV_TW81);
+static const float16x4_t DIV_TW81N_NEON = VDUPQ_N_F16(DIV_TW81N);
 
 #define NE10_RADIX8x4_R2C_NEON_KERNEL_S1(Q_OUT,Q_IN) do {   \
         Q_OUT ## 0 = vadd_f16 (Q_IN ## 0, Q_IN ## 4);      \

diff --git a/Ne10/CMSIS_NE10_fft_common_variables.h b/Ne10/CMSIS_NE10_fft_common_variables.h
@@ -40,60 +40,60 @@
 ///////////////////////////
 
 /* Twiddles used in Radix-8 FFT */
-const static ne10_float32_t TW_81_F32  =  0.70710678; // sqrt (2) / 2
-const static ne10_float32_t TW_81N_F32 = -0.70710678; // - TW_81_F32
+static const ne10_float32_t TW_81_F32  =  0.70710678f; // sqrt (2) / 2
+static const ne10_float32_t TW_81N_F32 = -0.70710678f; // - TW_81_F32
 
 /* Twiddles used in Radix-5 FFT */
-const static ne10_fft_cpx_float32_t TW_5A_F32 =
+static const ne10_fft_cpx_float32_t TW_5A_F32 =
         {
-             0.309016994374947, //   cos (2 * pi / 5)
-            -0.951056516295154  // - sin (2 * pi / 5)
+             0.309016994374947f, //   cos (2 * pi / 5)
+            -0.951056516295154f  // - sin (2 * pi / 5)
         };
-const static ne10_fft_cpx_int32_t TW_5A_S32 =
+static const ne10_fft_cpx_int32_t TW_5A_S32 =
         {
               663608942, // round (TW_5A_F32.r * 2^31)
             -2042378317  // round (TW_5A_F32.i * 2^31)
         };
 
-const static ne10_fft_cpx_float32_t TW_5B_F32 =
+static const ne10_fft_cpx_float32_t TW_5B_F32 =
         {
-            -0.809016994374947, //   cos (4 * pi / 5)
-            -0.587785252292473  // - sin (4 * pi / 5)
+            -0.809016994374947f, //   cos (4 * pi / 5)
+            -0.587785252292473f  // - sin (4 * pi / 5)
         };
-const static ne10_fft_cpx_int32_t TW_5B_S32 =
+static const ne10_fft_cpx_int32_t TW_5B_S32 =
         {
             -1737350766, // round (TW_5B_F32.r * 2^31)
             -1262259218  // round (TW_5B_F32.i * 2^31)
         };
 
 /* Twiddles used in Radix-3 FFT */
-const static ne10_float32_t TW_3I_F32  =   0.866025403784439; // sqrt (3) / 2
-const static ne10_float32_t TW_3IN_F32 = - 0.866025403784439; // - TW_3IN_F32
-const static ne10_int32_t TW_3I_S32 = 1859775393; // round (TW_3I_F32 * 2^31)
-const static ne10_int32_t TW_3IN_S32 = -1859775393; // round (TW_3IN_F32 * 2^31)
+static const ne10_float32_t TW_3I_F32  =   0.866025403784439f; // sqrt (3) / 2
+static const ne10_float32_t TW_3IN_F32 = - 0.866025403784439f; // - TW_3I_F32
+static const ne10_int32_t TW_3I_S32 = 1859775393; // round (TW_3I_F32 * 2^31)
+static const ne10_int32_t TW_3IN_S32 = -1859775393; // round (TW_3IN_F32 * 2^31)
 
 #if defined(ARM_MATH_NEON_FLOAT16) && defined(ARM_FLOAT16_SUPPORTED)
 
 /* Twiddles used in Radix-8 FFT */
-const static ne10_float16_t TW_81_F16  =  0.70710678f16; // sqrt (2) / 2
-const static ne10_float16_t TW_81N_F16 = -0.70710678f16; // - TW_81_F32
+static const ne10_float16_t TW_81_F16  =  0.70710678f16; // sqrt (2) / 2
+static const ne10_float16_t TW_81N_F16 = -0.70710678f16; // - TW_81_F16
 
 /* Twiddles used in Radix-5 FFT */
-const static ne10_fft_cpx_float16_t TW_5A_F16 =
+static const ne10_fft_cpx_float16_t TW_5A_F16 =
         {
              0.309016994374947f16, //   cos (2 * pi / 5)
             -0.951056516295154f16  // - sin (2 * pi / 5)
         };
 
-const static ne10_fft_cpx_float16_t TW_5B_F16 =
+static const ne10_fft_cpx_float16_t TW_5B_F16 =
         {
             -0.809016994374947f16, //   cos (4 * pi / 5)
             -0.587785252292473f16  // - sin (4 * pi / 5)
         };
 
 /* Twiddles used in Radix-3 FFT */
-const static ne10_float16_t TW_3I_F16  =   0.866025403784439f16; // sqrt (3) / 2
-const static ne10_float16_t TW_3IN_F16 = - 0.866025403784439f16; // - TW_3IN_F32
+static const ne10_float16_t TW_3I_F16  =   0.866025403784439f16; // sqrt (3) / 2
+static const ne10_float16_t TW_3IN_F16 = - 0.866025403784439f16; // - TW_3I_F16
 #endif 
 
 #endif // NE10_FFT_COMMON_VARIBLES_H