From 1f6c3291ac6a0b0a9c2c0ad3fdb9fb59bd7ebea0 Mon Sep 17 00:00:00 2001
From: "peter.marcisovsky"
Date: Tue, 17 Dec 2024 15:10:43 +0100
Subject: [PATCH] feat(lvgl_port_simd): RGB565 image blend to RGB565

- RGB565 blend to RGB565 (optimized memcpy for RGB565 type)
- esp32s3 assembly implementation using SIMD instructions
- esp32 assembly fallback
---
 components/esp_lvgl_port/CMakeLists.txt       |   5 +
 .../include/esp_lvgl_port_lv_blend.h          |  20 +
 .../src/lvgl9/simd/lv_macro_memcpy.S          |  60 +++
 .../lv_rgb565_blend_normal_to_rgb565_esp32.S  | 264 +++++++++++++
 ...lv_rgb565_blend_normal_to_rgb565_esp32s3.S | 372 ++++++++++++++++++
 .../esp_lvgl_port/test_apps/simd/README.md    |  13 +-
 .../test_apps/simd/main/CMakeLists.txt        |  12 +-
 .../main/lv_blend/include/lv_draw_sw_blend.h  |   1 +
 .../simd/main/lv_blend/include/lv_string.h    |  79 ++++
 .../simd/main/lv_blend/include/lv_types.h     |   2 +
 .../src/lv_draw_sw_blend_to_argb8888.c        |  10 +-
 .../lv_blend/src/lv_draw_sw_blend_to_rgb565.c |   8 +-
 .../main/lv_blend/src/lv_string_builtin.c     | 188 +++++++++
 .../test_apps/simd/main/lv_fill_common.h      |   2 +-
 .../test_apps/simd/main/lv_image_common.h     | 111 ++++++
 .../simd/main/test_lv_image_benchmark.c       | 171 ++++++++
 .../simd/main/test_lv_image_functionality.c   | 351 +++++++++++++++++
 17 files changed, 1658 insertions(+), 11 deletions(-)
 create mode 100644 components/esp_lvgl_port/src/lvgl9/simd/lv_macro_memcpy.S
 create mode 100644 components/esp_lvgl_port/src/lvgl9/simd/lv_rgb565_blend_normal_to_rgb565_esp32.S
 create mode 100644 components/esp_lvgl_port/src/lvgl9/simd/lv_rgb565_blend_normal_to_rgb565_esp32s3.S
 create mode 100644 components/esp_lvgl_port/test_apps/simd/main/lv_blend/include/lv_string.h
 create mode 100644 components/esp_lvgl_port/test_apps/simd/main/lv_blend/src/lv_string_builtin.c
 create mode 100644 components/esp_lvgl_port/test_apps/simd/main/lv_image_common.h
 create mode 100644 components/esp_lvgl_port/test_apps/simd/main/test_lv_image_benchmark.c
 create mode 100644 components/esp_lvgl_port/test_apps/simd/main/test_lv_image_functionality.c

diff --git a/components/esp_lvgl_port/CMakeLists.txt b/components/esp_lvgl_port/CMakeLists.txt
index 8dc53693..4a84cbf0 100644
--- a/components/esp_lvgl_port/CMakeLists.txt
+++ b/components/esp_lvgl_port/CMakeLists.txt
@@ -85,6 +85,10 @@ if((lvgl_ver VERSION_GREATER_EQUAL "9.1.0") AND (lvgl_ver VERSION_LESS "9.2.0"))
     else()
         file(GLOB_RECURSE ASM_SRCS ${PORT_PATH}/simd/*_esp32.S)     # Select only esp32 related files
     endif()
+
+    # Explicitly add all assembly macro files
+    file(GLOB_RECURSE ASM_MACROS ${PORT_PATH}/simd/lv_macro_*.S)
+    list(APPEND ADD_SRCS ${ASM_MACROS})
     list(APPEND ADD_SRCS ${ASM_SRCS})
 
     # Include component libraries, so lvgl component would see lvgl_port includes
@@ -94,6 +98,7 @@ if((lvgl_ver VERSION_GREATER_EQUAL "9.1.0") AND (lvgl_ver VERSION_LESS "9.2.0"))
         # Force link .S files
         set_property(TARGET ${COMPONENT_LIB} APPEND PROPERTY INTERFACE_LINK_LIBRARIES "-u lv_color_blend_to_argb8888_esp")
         set_property(TARGET ${COMPONENT_LIB} APPEND PROPERTY INTERFACE_LINK_LIBRARIES "-u lv_color_blend_to_rgb565_esp")
+        set_property(TARGET ${COMPONENT_LIB} APPEND PROPERTY INTERFACE_LINK_LIBRARIES "-u lv_rgb565_blend_normal_to_rgb565_esp")
     endif()
 endif()

diff --git a/components/esp_lvgl_port/include/esp_lvgl_port_lv_blend.h b/components/esp_lvgl_port/include/esp_lvgl_port_lv_blend.h
index c00de1c0..999153fe 100644
--- a/components/esp_lvgl_port/include/esp_lvgl_port_lv_blend.h
+++ b/components/esp_lvgl_port/include/esp_lvgl_port_lv_blend.h
@@ -32,6 +32,10 @@ extern "C" {
     _lv_color_blend_to_rgb565_esp(dsc)
 #endif
 
+#ifndef LV_DRAW_SW_RGB565_BLEND_NORMAL_TO_RGB565
+#define LV_DRAW_SW_RGB565_BLEND_NORMAL_TO_RGB565(dsc) \
+    _lv_rgb565_blend_normal_to_rgb565_esp(dsc)
+#endif
 
 /**********************
  *      TYPEDEFS
  **********************/
@@ -83,6 +87,22 @@ static inline lv_result_t _lv_color_blend_to_rgb565_esp(_lv_draw_sw_blend_fill_d
     return lv_color_blend_to_rgb565_esp(&asm_dsc);
 }
 
+extern int lv_rgb565_blend_normal_to_rgb565_esp(asm_dsc_t *asm_dsc);
+
+static inline lv_result_t _lv_rgb565_blend_normal_to_rgb565_esp(_lv_draw_sw_blend_image_dsc_t * dsc)
+{
+    asm_dsc_t asm_dsc = {
+        .dst_buf = dsc->dest_buf,
+        .dst_w = dsc->dest_w,
+        .dst_h = dsc->dest_h,
+        .dst_stride = dsc->dest_stride,
+        .src_buf = dsc->src_buf,
+        .src_stride = dsc->src_stride
+    };
+
+    return lv_rgb565_blend_normal_to_rgb565_esp(&asm_dsc);
+}
+
 #endif // CONFIG_LV_DRAW_SW_ASM_CUSTOM
 
 #ifdef __cplusplus

diff --git a/components/esp_lvgl_port/src/lvgl9/simd/lv_macro_memcpy.S b/components/esp_lvgl_port/src/lvgl9/simd/lv_macro_memcpy.S
new file mode 100644
index 00000000..377f9285
--- /dev/null
+++ b/components/esp_lvgl_port/src/lvgl9/simd/lv_macro_memcpy.S
@@ -0,0 +1,60 @@
+/*
+ * SPDX-FileCopyrightText: 2025 Espressif Systems (Shanghai) CO LTD
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ */
+
+// Memcpy macros for modulo checking
+// After the main loop has run, the bytes remaining outside of the main loop still have to be copied
+// The macros work with both aligned and unaligned (4-byte boundary) memory,
+// but performance is significantly lower for unaligned memory, because of the unaligned-memory-access exception
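+//
+// For reference, a hedged C sketch of the tail handling these macros implement,
+// where len is the remaining byte count and each of its low bits is tested once
+// (an illustration, not the authoritative implementation):
+//
+//     if (len & 8) { memcpy(dest, src, 8); src += 8; dest += 8; }
+//     if (len & 4) { memcpy(dest, src, 4); src += 4; dest += 4; }
+//     if (len & 2) { memcpy(dest, src, 2); src += 2; dest += 2; }
+//     if (len & 1) { *dest++ = *src++; }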
+
+// Macro for checking modulo 8
+    .macro macro_memcpy_mod_8 src_buf, dest_buf, condition, x1, x2, JUMP_TAG
+    // Check bit 3 of \condition; if set, copy 8 bytes
+    bbci    \condition, 3, ._mod_8_check_\JUMP_TAG  // Branch if bit 3 of \condition is clear
+    l32i.n  \x1, \src_buf, 0                        // Load 32 bits from \src_buf to \x1, offset 0
+    l32i.n  \x2, \src_buf, 4                        // Load 32 bits from \src_buf to \x2, offset 4
+    s32i.n  \x1, \dest_buf, 0                       // Save 32 bits from \x1 to \dest_buf, offset 0
+    s32i.n  \x2, \dest_buf, 4                       // Save 32 bits from \x2 to \dest_buf, offset 4
+    addi.n  \src_buf, \src_buf, 8                   // Increment \src_buf pointer by 8
+    addi.n  \dest_buf, \dest_buf, 8                 // Increment \dest_buf pointer by 8
+    ._mod_8_check_\JUMP_TAG:
+.endm   // macro_memcpy_mod_8
+
+
+// Macro for checking modulo 4
+    .macro macro_memcpy_mod_4 src_buf, dest_buf, condition, x1, JUMP_TAG
+    // Check bit 2 of \condition; if set, copy 4 bytes
+    bbci    \condition, 2, ._mod_4_check_\JUMP_TAG  // Branch if bit 2 of \condition is clear
+    l32i.n  \x1, \src_buf, 0                        // Load 32 bits from \src_buf to \x1, offset 0
+    addi.n  \src_buf, \src_buf, 4                   // Increment \src_buf pointer by 4
+    s32i.n  \x1, \dest_buf, 0                       // Save 32 bits from \x1 to \dest_buf, offset 0
+    addi.n  \dest_buf, \dest_buf, 4                 // Increment \dest_buf pointer by 4
+    ._mod_4_check_\JUMP_TAG:
+.endm   // macro_memcpy_mod_4
+
+
+// Macro for checking modulo 2
+    .macro macro_memcpy_mod_2 src_buf, dest_buf, condition, x1, JUMP_TAG
+    // Check bit 1 of \condition; if set, copy 2 bytes
+    bbci    \condition, 1, ._mod_2_check_\JUMP_TAG  // Branch if bit 1 of \condition is clear
+    l16ui   \x1, \src_buf, 0                        // Load 16 bits from \src_buf to \x1, offset 0
+    addi.n  \src_buf, \src_buf, 2                   // Increment \src_buf pointer by 2
+    s16i    \x1, \dest_buf, 0                       // Save 16 bits from \x1 to \dest_buf, offset 0
+    addi.n  \dest_buf, \dest_buf, 2                 // Increment \dest_buf pointer by 2
+    ._mod_2_check_\JUMP_TAG:
+.endm   // macro_memcpy_mod_2
+
+
+// Macro for checking modulo 1
+    .macro macro_memcpy_mod_1 src_buf, dest_buf, condition, x1, JUMP_TAG
+    // Check bit 0 of \condition; if set, copy 1 byte
+    bbci    \condition, 0, ._mod_1_check_\JUMP_TAG  // Branch if bit 0 of \condition is clear
+    l8ui    \x1, \src_buf, 0                        // Load 8 bits from \src_buf to \x1, offset 0
+    addi.n  \src_buf, \src_buf, 1                   // Increment \src_buf pointer by 1
+    s8i     \x1, \dest_buf, 0                       // Save 8 bits from \x1 to \dest_buf, offset 0
+    addi.n  \dest_buf, \dest_buf, 1                 // Increment \dest_buf pointer by 1
+    ._mod_1_check_\JUMP_TAG:
+.endm   // macro_memcpy_mod_1
\ No newline at end of file

diff --git a/components/esp_lvgl_port/src/lvgl9/simd/lv_rgb565_blend_normal_to_rgb565_esp32.S b/components/esp_lvgl_port/src/lvgl9/simd/lv_rgb565_blend_normal_to_rgb565_esp32.S
new file mode 100644
index 00000000..7752aae4
--- /dev/null
+++ b/components/esp_lvgl_port/src/lvgl9/simd/lv_rgb565_blend_normal_to_rgb565_esp32.S
@@ -0,0 +1,264 @@
+/*
+ * SPDX-FileCopyrightText: 2024 Espressif Systems (Shanghai) CO LTD
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ */
+
+#include "lv_macro_memcpy.S"    // Memcpy macros
+
+// This is LVGL RGB565 image blend to RGB565 for the ESP32 processor
+
+    .section .text
+    .align  4
+    .global lv_rgb565_blend_normal_to_rgb565_esp
+    .type   lv_rgb565_blend_normal_to_rgb565_esp,@function
+// The function implements the following C code:
+// void rgb565_image_blend(_lv_draw_sw_blend_image_dsc_t * dsc);
+
+// Input params
+//
+// dsc - a2
+
+// typedef struct {
+//     uint32_t opa;                  l32i    0
+//     void * dst_buf;                l32i    4
+//     uint32_t dst_w;                l32i    8
+//     uint32_t dst_h;                l32i    12
+//     uint32_t dst_stride;           l32i    16
+//     const void * src_buf;          l32i    20
+//     uint32_t src_stride;           l32i    24
+//     const lv_opa_t * mask_buf;     l32i    28
+//     uint32_t mask_stride;          l32i    32
+// } asm_dsc_t;
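+
+// For orientation, a hedged C sketch of the copy this routine performs
+// (an illustration only; strides are in bytes, dst_w is in pixels):
+//
+//     uint8_t *dest = dsc->dst_buf;
+//     const uint8_t *src = dsc->src_buf;
+//     for (uint32_t y = 0; y < dsc->dst_h; y++) {
+//         memcpy(dest, src, dsc->dst_w * sizeof(uint16_t));
+//         dest += dsc->dst_stride;
+//         src  += dsc->src_stride;
+//     }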
+
+lv_rgb565_blend_normal_to_rgb565_esp:
+
+    entry    a1, 32
+    l32i.n   a3, a2, 4                      // a3 - dest_buff
+    l32i.n   a4, a2, 8                      // a4 - dest_w in uint16_t
+    l32i.n   a5, a2, 12                     // a5 - dest_h in uint16_t
+    l32i.n   a6, a2, 16                     // a6 - dest_stride in bytes
+    l32i.n   a7, a2, 20                     // a7 - src_buff
+    l32i.n   a8, a2, 24                     // a8 - src_stride in bytes
+    slli     a11, a4, 1                     // a11 - dest_w_bytes = sizeof(uint16_t) * dest_w
+
+    // No need to convert any colors here, we are copying from rgb565 to rgb565
+
+    // Check dest_w length
+    bltui    a4, 8, _matrix_width_check     // Branch if dest_w (a4) is lower than 8
+
+    // Check memory alignment and input parameter lengths, and decide which implementation to use
+    movi.n   a10, 0x3                       // a10 = 0x3 alignment mask (4-byte alignment)
+    or       a15, a7, a3                    // a15 = src_buff (a7) OR dest_buff (a3)
+    or       a15, a15, a6                   // a15 = a15 OR dest_stride (a6)
+    or       a15, a15, a8                   // a15 = a15 OR src_stride (a8)
+    or       a15, a15, a11                  // a15 = a15 OR dest_w_bytes (a11)
+    and      a15, a15, a10                  // a15 = a15 AND alignment mask (a10)
+    bnez     a15, _alignment_check          // Branch if a15 does not equal zero
+
+//**********************************************************************************************************************
+
+    // The most ideal case - both arrays aligned, both strides and dest_w_bytes are multiples of 4
+
+    // dest_buff (a3)     - 4-byte aligned
+    // src_buff (a7)      - 4-byte aligned
+    // dest_stride (a6)   - 4-byte multiple
+    // src_stride (a8)    - 4-byte multiple
+    // dest_w_bytes (a11) - 4-byte multiple
+
+    srli     a9, a4, 3                      // a9 - loop_len = dest_w / 8
+    // Convert strides to matrix paddings
+    sub      a6, a6, a11                    // dest_matrix_padding (a6) = dest_stride (a6) - dest_w_bytes (a11)
+    sub      a8, a8, a11                    // src_matrix_padding (a8) = src_stride (a8) - dest_w_bytes (a11)
+
+    .outer_loop_align:
+
+    // Run main loop which copies 16 bytes (8 RGB565 pixels) in one loop run
+    loopnez  a9, ._main_loop_aligned
+        l32i.n   a15, a7, 0                 // Load 32 bits from src_buff a7 to a15, offset 0
+        l32i.n   a14, a7, 4                 // Load 32 bits from src_buff a7 to a14, offset 4
+        l32i.n   a13, a7, 8                 // Load 32 bits from src_buff a7 to a13, offset 8
+        l32i.n   a12, a7, 12                // Load 32 bits from src_buff a7 to a12, offset 12
+        s32i.n   a15, a3, 0                 // Save 32 bits from a15 to dest_buff a3, offset 0
+        s32i.n   a14, a3, 4                 // Save 32 bits from a14 to dest_buff a3, offset 4
+        s32i.n   a13, a3, 8                 // Save 32 bits from a13 to dest_buff a3, offset 8
+        s32i.n   a12, a3, 12                // Save 32 bits from a12 to dest_buff a3, offset 12
+        addi.n   a7, a7, 16                 // Increment src_buff pointer a7 by 16
+        addi.n   a3, a3, 16                 // Increment dest_buff pointer a3 by 16
+    ._main_loop_aligned:
+
+    // Finish the remaining bytes out of the main loop
+
+    // Check modulo 8 of the dest_w_bytes (a11); if set, copy 8 bytes (4 RGB565 pixels)
+    // src_buff a7, dest_buff a3, dest_w_bytes a11, copy registers a14 a15
+    macro_memcpy_mod_8 a7, a3, a11, a14, a15, __LINE__
+
+    // Check modulo 4 of the dest_w_bytes (a11); if set, copy 4 bytes (2 RGB565 pixels)
+    // src_buff a7, dest_buff a3, dest_w_bytes a11, copy register a15
+    macro_memcpy_mod_4 a7, a3, a11, a15, __LINE__
+
+    // Check modulo 2 of the dest_w_bytes (a11); if set, copy 2 bytes (1 RGB565 pixel)
+    // src_buff a7, dest_buff a3, dest_w_bytes a11, copy register a15
+    macro_memcpy_mod_2 a7, a3, a11, a15, __LINE__
+
+    // Check modulo 1 of the dest_w_bytes (a11); if set, copy 1 byte (1/2 RGB565 pixel)
+    // src_buff a7, dest_buff a3, dest_w_bytes a11, copy register a15
+    macro_memcpy_mod_1 a7, a3, a11, a15, __LINE__
+
+    add      a3, a3, a6                     // dest_buff (a3) = dest_buff (a3) + dest_matrix_padding (a6)
+    add      a7, a7, a8                     // src_buff (a7) = src_buff (a7) + src_matrix_padding (a8)
+    addi.n   a5, a5, -1                     // Decrease the outer loop
+    bnez     a5, .outer_loop_align
+
+    movi.n   a2, 1                          // Return LV_RESULT_OK = 1
+    retw.n                                  // Return
+
+
+//**********************************************************************************************************************
+
+    // The most general case - at least one array is not aligned, or one parameter is not a multiple of 4
+    _alignment_check:
+
+    // dest_buff (a3)     - 4-byte aligned, or not
+    // src_buff (a7)      - 4-byte aligned, or not
+    // dest_stride (a6)   - 4-byte multiple, or not
+    // src_stride (a8)    - 4-byte multiple, or not
+    // dest_w_bytes (a11) - 4-byte multiple, or not
+
+    // Convert strides to matrix paddings
+    sub      a6, a6, a11                    // dest_matrix_padding (a6) = dest_stride (a6) - dest_w_bytes (a11)
+    sub      a8, a8, a11                    // src_matrix_padding (a8) = src_stride (a8) - dest_w_bytes (a11)
+
+    .outer_loop_unalign:
+
+    extui    a13, a3, 0, 2                  // Get the last two bits of the dest_buff address a3, to a13
+    movi.n   a15, 4                         // Move 4 to a15, for calculation of the destination alignment loop
+    sub      a14, a15, a13                  // Calculate destination alignment loop length (a14 = 4 - a13)
+
+    // In case the dest_buff a3 is already aligned (for example by matrix padding), correct the a14 value
+    // to prevent the destination-aligning loop from running 4 times (i.e. from aligning already-aligned memory)
+    moveqz   a14, a13, a13                  // If a13 is zero, move a13 to a14 (a14 = a13 = 0)
+
+    sub      a10, a11, a14                  // Get the dest_w_bytes left after the aligning loop
+    srli     a9, a10, 4                     // Calculate the main loop length (a9 = dest_w_bytes_local / 16)
+
+    // Run dest_buff aligning loop byte by byte
+    loopnez  a14, ._dest_aligning_loop
+        l8ui     a15, a7, 0                 // Load 8 bits from src_buff a7 to a15, offset 0
+        addi.n   a7, a7, 1                  // Increment src_buff pointer a7 by 1
+        s8i      a15, a3, 0                 // Save 8 bits from a15 to dest_buff a3, offset 0
+        addi.n   a3, a3, 1                  // Increment dest_buff pointer a3 by 1
+    ._dest_aligning_loop:
+
+    // Destination is aligned, source is unaligned
+
+    // For more information about this implementation, see chapter 3.3.2 Shifts and the Shift Amount Register (SAR)
+    // in the Xtensa Instruction Set Architecture (ISA) Reference Manual
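+
+    // Hedged illustration of the funnel-shift technique used below (not the
+    // authoritative semantics): with SAR set from the source address, each SRC
+    // instruction combines two adjacent aligned words so that every 32-bit store
+    // delivers 4 bytes of the original unaligned source stream, conceptually:
+    //
+    //     out = funnel_shift(next_word, prev_word, SAR_BYTE);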
+
+    ssa8l    a7                             // Set SAR_BYTE from the src_buff a7 unalignment
+    extui    a4, a7, 0, 2                   // Get the last 2 bits of the src_buff, a4 = src_buff_unalignment
+    sub      a7, a7, a4                     // "Align" the src_buff a7 to a 4-byte boundary by decreasing its pointer to the nearest aligned boundary
+
+    // First preload for the loopnez cycle
+    l32i.n   a15, a7, 0                     // Load 32 bits from 4-byte aligned src_buff a7 to a15, offset 0
+
+    // Run main loop which copies 16 bytes (8 RGB565 pixels) in one loop run
+    loopnez  a9, ._main_loop_unalign
+        l32i.n   a14, a7, 4                 // Load 32 bits from 4-byte aligned src_buff a7 to a14, offset 4
+        l32i.n   a13, a7, 8                 // Load 32 bits from 4-byte aligned src_buff a7 to a13, offset 8
+        src      a15, a14, a15              // Concatenate a14 and a15 and shift by the SAR_BYTE amount to a15
+        s32i.n   a15, a3, 0                 // Save 32 bits from shift-corrected a15 to dest_buff a3, offset 0
+        l32i.n   a12, a7, 12                // Load 32 bits from 4-byte aligned src_buff a7 to a12, offset 12
+        src      a14, a13, a14              // Concatenate a13 and a14 and shift by the SAR_BYTE amount to a14
+        s32i.n   a14, a3, 4                 // Save 32 bits from shift-corrected a14 to dest_buff a3, offset 4
+        l32i.n   a15, a7, 16                // Load 32 bits from 4-byte aligned src_buff a7 to a15, offset 16
+        src      a13, a12, a13              // Concatenate a12 and a13 and shift by the SAR_BYTE amount to a13
+        s32i.n   a13, a3, 8                 // Save 32 bits from shift-corrected a13 to dest_buff a3, offset 8
+        addi.n   a7, a7, 16                 // Increment src_buff pointer a7 by 16
+        src      a12, a15, a12              // Concatenate a15 and a12 and shift by the SAR_BYTE amount to a12
+        s32i.n   a12, a3, 12                // Save 32 bits from shift-corrected a12 to dest_buff a3, offset 12
+        addi.n   a3, a3, 16                 // Increment dest_buff pointer a3 by 16
+    ._main_loop_unalign:
+
+    // Finish the remaining bytes out of the loop
+    // Check modulo 8 of the dest_w_bytes_local (a10); if set, copy 8 bytes
+    bbci     a10, 3, _mod_8_check           // Branch if bit 3 of dest_w_bytes_local is clear
+    l32i.n   a14, a7, 4                     // Load 32 bits from 4-byte aligned src_buff a7 to a14, offset 4
+    l32i.n   a13, a7, 8                     // Load 32 bits from 4-byte aligned src_buff a7 to a13, offset 8
+    src      a15, a14, a15                  // Concatenate a14 and a15 and shift by the SAR_BYTE amount to a15 (the value in a15 is already prepared from previous steps)
+    s32i.n   a15, a3, 0                     // Save 32 bits from shift-corrected a15 to dest_buff a3, offset 0
+    addi.n   a7, a7, 8                      // Increment src_buff pointer a7 by 8
+    src      a14, a13, a14                  // Concatenate a13 and a14 and shift by the SAR_BYTE amount to a14
+    s32i.n   a14, a3, 4                     // Save 32 bits from shift-corrected a14 to dest_buff a3, offset 4
+    addi.n   a3, a3, 8                      // Increment dest_buff pointer a3 by 8
+    mov      a15, a13                       // Prepare a15 for the next steps (copy a13 to a15)
+    _mod_8_check:
+
+    // Check modulo 4 of the dest_w_bytes_local (a10); if set, copy 4 bytes
+    bbci     a10, 2, _mod_4_check           // Branch if bit 2 of dest_w_bytes_local is clear
+    l32i.n   a14, a7, 4                     // Load 32 bits from 4-byte aligned src_buff a7 to a14, offset 4
+    addi.n   a7, a7, 4                      // Increment src_buff pointer a7 by 4
+    src      a15, a14, a15                  // Concatenate a14 and a15 and shift by the SAR_BYTE amount to a15 (the value in a15 is already prepared from previous steps)
+    s32i.n   a15, a3, 0                     // Save 32 bits from shift-corrected a15 to dest_buff a3, offset 0
+    addi.n   a3, a3, 4                      // Increment dest_buff pointer a3 by 4
+    mov      a15, a14                       // Prepare a15 for the next steps (copy a14 to a15)
+    _mod_4_check:
+
+    extui    a13, a10, 0, 2                 // Get the last 2 bits of the dest_w_bytes_local (a10), a13 = a10[1:0], to find out how many bytes still need to be copied and to increase the src and dest pointers accordingly
+    beqz     a13, _mod_1_2_check            // Branch if a13 equals zero, i.e. there are no bytes left to copy
+    l32i.n   a14, a7, 4                     // Load 32 bits from 4-byte aligned src_buff a7 to a14, offset 4
+    l32i.n   a12, a3, 0                     // Get the dest_buff value: load 32 bits from 4-byte aligned dest_buff a3 to a12, offset 0
+    src      a15, a14, a15                  // Concatenate a14 and a15 and shift by the SAR_BYTE amount to a15 (the value in a15 is already prepared from previous steps)
+    ssa8l    a10                            // Set SAR_BYTE from the dest_w_bytes_local a10 length
+    sll      a15, a15                       // Shift the src word a15 left by the SAR_BYTE amount
+    srl      a12, a12                       // Shift the dest word a12 right by the SAR_BYTE amount
+    ssa8b    a10                            // Set SAR_BYTE from the dest_w_bytes_local a10 length
+    src      a12, a12, a15                  // Concatenate a12 and a15 and shift by the SAR_BYTE amount to a12
+    s32i.n   a12, a3, 0                     // Save 32 bits from shift-corrected a12 to dest_buff a3, offset 0
+    add      a7, a7, a13                    // Increment src_buff pointer a7 by the amount of copied bytes (a13)
+    add      a3, a3, a13                    // Increment dest_buff pointer a3 by the amount of copied bytes (a13)
+    _mod_1_2_check:
+
+    add      a7, a7, a4                     // Correct the src_buff back by src_buff_unalignment (a4), after it was force-aligned to a 4-byte boundary before the main loop
+    add      a3, a3, a6                     // dest_buff (a3) = dest_buff (a3) + dest_matrix_padding (a6)
+    add      a7, a7, a8                     // src_buff (a7) = src_buff (a7) + src_matrix_padding (a8)
+    addi.n   a5, a5, -1                     // Decrease the outer loop
+    bnez     a5, .outer_loop_unalign
+
+    movi.n   a2, 1                          // Return LV_RESULT_OK = 1
+    retw.n                                  // Return
+
+//**********************************************************************************************************************
+
+    // Small matrix width - keep it simple for widths of less than 8 pixels
+    _matrix_width_check:                    // Matrix width is less than 8 pixels
+
+    // Convert strides to matrix paddings
+    sub      a6, a6, a11                    // dest_matrix_padding (a6) = dest_stride (a6) - dest_w_bytes (a11)
+    sub      a8, a8, a11                    // src_matrix_padding (a8) = src_stride (a8) - dest_w_bytes (a11)
+
+    .outer_loop_short_matrix_length:
+
+    // Run main loop which copies 2 bytes (one RGB565 pixel) in one loop run
+    loopnez  a4, ._main_loop_short_matrix_length
+        l8ui     a15, a7, 0                 // Load 8 bits from src_buff a7 to a15, offset 0
+        l8ui     a14, a7, 1                 // Load 8 bits from src_buff a7 to a14, offset 1
+        s8i      a15, a3, 0                 // Save 8 bits from a15 to dest_buff a3, offset 0
+        s8i      a14, a3, 1                 // Save 8 bits from a14 to dest_buff a3, offset 1
+        addi.n   a7, a7, 2                  // Increment src_buff pointer a7 by 2
+        addi.n   a3, a3, 2                  // Increment dest_buff pointer a3 by 2
+    ._main_loop_short_matrix_length:
+
+    // Finish the remaining byte out of the main loop
+
+    // Check modulo 1 of the dest_w_bytes (a11); if set, copy 1 byte (1/2 RGB565 pixel)
+    // src_buff a7, dest_buff a3, dest_w_bytes a11, copy register a15
+    macro_memcpy_mod_1 a7, a3, a11, a15, __LINE__
+
+    add      a3, a3, a6                     // dest_buff (a3) = dest_buff (a3) + dest_matrix_padding (a6)
+    add      a7, a7, a8                     // src_buff (a7) = src_buff (a7) + src_matrix_padding (a8)
+    addi.n   a5, a5, -1                     // Decrease the outer loop
+    bnez     a5, .outer_loop_short_matrix_length
+
+    movi.n   a2, 1                          // Return LV_RESULT_OK = 1
+    retw.n                                  // Return
\ No newline at end of file
diff --git a/components/esp_lvgl_port/src/lvgl9/simd/lv_rgb565_blend_normal_to_rgb565_esp32s3.S b/components/esp_lvgl_port/src/lvgl9/simd/lv_rgb565_blend_normal_to_rgb565_esp32s3.S
new file mode 100644
index 00000000..a0dc9066
--- /dev/null
+++ b/components/esp_lvgl_port/src/lvgl9/simd/lv_rgb565_blend_normal_to_rgb565_esp32s3.S
@@ -0,0 +1,372 @@
+/*
+ * SPDX-FileCopyrightText: 2024 Espressif Systems (Shanghai) CO LTD
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ */
+
+#include "lv_macro_memcpy.S"    // Memcpy macros
+
+// This is LVGL RGB565 image blend to RGB565 for the ESP32-S3 processor
+
+    .section .text
+    .align  4
+    .global lv_rgb565_blend_normal_to_rgb565_esp
+    .type   lv_rgb565_blend_normal_to_rgb565_esp,@function
+// The function implements the following C code:
+// void rgb565_image_blend(_lv_draw_sw_blend_image_dsc_t * dsc);
+
+// Input params
+//
+// dsc - a2
+
+// typedef struct {
+//     uint32_t opa;                  l32i    0
+//     void * dst_buf;                l32i    4
+//     uint32_t dst_w;                l32i    8
+//     uint32_t dst_h;                l32i    12
+//     uint32_t dst_stride;           l32i    16
+//     const void * src_buf;          l32i    20
+//     uint32_t src_stride;           l32i    24
+//     const lv_opa_t * mask_buf;     l32i    28
+//     uint32_t mask_stride;          l32i    32
+// } asm_dsc_t;
+
+lv_rgb565_blend_normal_to_rgb565_esp:
+
+    entry    a1, 32
+    l32i.n   a3, a2, 4                      // a3 - dest_buff
+    l32i.n   a4, a2, 8                      // a4 - dest_w in uint16_t
+    l32i.n   a5, a2, 12                     // a5 - dest_h in uint16_t
+    l32i.n   a6, a2, 16                     // a6 - dest_stride in bytes
+    l32i.n   a7, a2, 20                     // a7 - src_buff
+    l32i.n   a8, a2, 24                     // a8 - src_stride in bytes
+    movi.n   a10, 0xf                       // 0xf alignment mask (16-byte alignment)
+    slli     a11, a4, 1                     // a11 - dest_w_bytes = sizeof(uint16_t) * dest_w
+
+    // No need to convert any colors here, we are copying from rgb565 to rgb565
+
+    // Check dest_w length
+    bltui    a4, 8, _matrix_width_check     // Branch if dest_w (a4) is lower than 8
+
+    // Check dest_buff alignment first
+    and      a15, a10, a3                   // 16-byte alignment mask AND dest_buff pointer a3
+    bnez     a15, _src_unalign_dest_unalign // Branch if a15 does not equal zero
+    // Jump straight to the last implementation, since it is the only one which deals with unaligned destination arrays
+
+    // Check src_buff alignment
+    and      a15, a10, a7                   // 16-byte alignment mask AND src_buff pointer a7
+    bnez     a15, _src_align_dest_unalign   // Branch if a15 does not equal zero
+    // Jump to check whether the second or third implementation can be used (depends on both strides and dest_w)
+
+    // Check dest_stride alignment
+    and      a15, a10, a6                   // 16-byte alignment mask AND dest_stride a6
+    bnez     a15, _src_unalign_dest_unalign // Branch if a15 does not equal zero
+    // Jump straight to the last implementation, since it is the only one which deals with an unaligned destination stride
+
+    // Check src_stride alignment
+    and      a15, a10, a8                   // 16-byte alignment mask AND src_stride a8
+    bnez     a15, _src_align_dest_unalign   // Branch if a15 does not equal zero
+    // Jump to check whether the second or third implementation can be used (depends on dest_w_bytes)
+
+    // Check dest_w_bytes alignment
+    and      a15, a10, a11                  // 16-byte alignment mask AND dest_w_bytes
+    bnez     a15, _src_unalign_dest_unalign // Branch if a15 does not equal zero
+    // Jump straight to the last implementation, since it is the only one which deals with unaligned dest_w_bytes
+
+//**********************************************************************************************************************
+
+    // The most ideal case - both arrays aligned, both strides and dest_w_bytes are multiples of 16
+
+    // dest_buff (a3)     - 16-byte aligned
+    // src_buff (a7)      - 16-byte aligned
+    // dest_stride (a6)   - 16-byte multiple
+    // src_stride (a8)    - 16-byte multiple
+    // dest_w_bytes (a11) - 16-byte multiple
+
+    srli     a9, a4, 4                      // a9 - loop_len = dest_w / 16
+    // Convert strides to matrix paddings
+    sub      a6, a6, a11                    // dest_matrix_padding (a6) = dest_stride (a6) - dest_w_bytes (a11)
+    sub      a8, a8, a11                    // src_matrix_padding (a8) = src_stride (a8) - dest_w_bytes (a11)
+
+    .outer_loop_align:
+
+    // Run main loop which copies 32 bytes (16 RGB565 pixels) in one loop run
+    loopnez  a9, ._main_loop_align
+        ee.vld.128.ip   q0, a7, 16          // Load 16 bytes from src_buff a7 to q0, increase src_buff pointer a7 by 16
+        ee.vld.128.ip   q1, a7, 16          // Load 16 bytes from src_buff a7 to q1, increase src_buff pointer a7 by 16
+        ee.vst.128.ip   q0, a3, 16          // Store 16 bytes from q0 to dest_buff a3, increase dest_buff pointer a3 by 16
+        ee.vst.128.ip   q1, a3, 16          // Store 16 bytes from q1 to dest_buff a3, increase dest_buff pointer a3 by 16
+    ._main_loop_align:
+
+    // Finish the remaining bytes out of the main loop
+
+    // Check modulo 16 of the dest_w_bytes; if set, copy 16 bytes (8 RGB565 pixels)
+    bbci     a11, 4, _align_mod_16_check    // Branch if bit 4 of dest_w_bytes a11 is clear
+    ee.vld.128.ip   q0, a7, 16              // Load 16 bytes from src_buff a7 to q0, increase src_buff pointer a7 by 16
+    ee.vst.128.ip   q0, a3, 16              // Store 16 bytes from q0 to dest_buff a3, increase dest_buff pointer a3 by 16
+    _align_mod_16_check:
+
+    add      a3, a3, a6                     // dest_buff (a3) = dest_buff (a3) + dest_matrix_padding (a6)
+    add      a7, a7, a8                     // src_buff (a7) = src_buff (a7) + src_matrix_padding (a8)
+    addi.n   a5, a5, -1                     // Decrease the outer loop
+    bnez     a5, .outer_loop_align
+
+    movi.n   a2, 1                          // Return LV_RESULT_OK = 1
+    retw.n                                  // Return
+
+
+    _src_align_dest_unalign:
+
+    // Check dest_stride alignment
+    and      a15, a10, a6                   // 16-byte alignment mask AND dest_stride a6
+    bnez     a15, _src_unalign_dest_unalign // Branch if a15 does not equal zero
+
+    // Check dest_w_bytes alignment
+    and      a15, a10, a11                  // 16-byte alignment mask AND dest_w_bytes a11
+    bnez     a15, _src_unalign_dest_unalign // Branch if a15 does not equal zero
+
+    // We don't check src_stride alignment for this implementation, as it can be either aligned or unaligned
+
+//**********************************************************************************************************************
+
+    // Less ideal case - only the destination array is aligned, the source array is unaligned
+    // The source stride can be aligned or unaligned; the destination stride and dest_w_bytes must be aligned
+
+    // dest_buff (a3)     - 16-byte aligned
+    // src_buff (a7)      - unaligned
+    // dest_stride (a6)   - 16-byte multiple
+    // src_stride (a8)    - does not matter if 16-byte multiple
+    // dest_w_bytes (a11) - 16-byte multiple
+
+    // Convert strides to matrix paddings
+    sub      a6, a6, a11                    // dest_matrix_padding (a6) = dest_stride (a6) - dest_w_bytes (a11)
+    sub      a8, a8, a11                    // src_matrix_padding (a8) = src_stride (a8) - dest_w_bytes (a11)
+
+    // Calculate modulo for non-aligned data
+    movi     a15, 48                        // a15 = 48 (the main loop copies 48 bytes)
+    quou     a9, a11, a15                   // a9 = dest_w_bytes (a11) DIV 48 (a15)
+    remu     a12, a11, a15                  // a12 = dest_w_bytes (a11) MOD 48 (a15)
+
+    .outer_loop_src_unalign_dest_align:
+
+    ee.ld.128.usar.ip   q2, a7, 16          // Preload 16 bytes from src_buff a7 to q2, get the SAR_BYTE value, increase src_buff pointer a7 by 16
+    ee.ld.128.usar.ip   q3, a7, 16          // Preload 16 bytes from src_buff a7 to q3, get the SAR_BYTE value, increase src_buff pointer a7 by 16
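+
+    // Hedged illustration of the register rotation in the loop below: q2, q3 and q4
+    // hold a sliding window of the unaligned source; each EE.SRC.Q.LD.IP both loads
+    // the next 16 bytes and funnel-shifts the previous register pair by SAR_BYTE,
+    // so every EE.VST.128.IP can store 16 shift-corrected bytes to the aligned destination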
+
+    // Run main loop which copies 48 bytes (24 RGB565 pixels) in one loop run
+    loopnez  a9, ._main_loop_src_unalign_dest_align
+        ee.src.q.ld.ip  q4, a7, 16, q2, q3  // Load 16 bytes from src_buff a7 to q4, concatenate q2 and q3 and shift to q2 by the SAR_BYTE amount, increase src_buff pointer a7 by 16
+        ee.vst.128.ip   q2, a3, 16          // Store 16 bytes from q2 to aligned dest_buff a3, increase dest_buff pointer a3 by 16
+        ee.src.q.ld.ip  q2, a7, 16, q3, q4  // Load 16 bytes from src_buff a7 to q2, concatenate q3 and q4 and shift to q3 by the SAR_BYTE amount, increase src_buff pointer a7 by 16
+        ee.vst.128.ip   q3, a3, 16          // Store 16 bytes from q3 to aligned dest_buff a3, increase dest_buff pointer a3 by 16
+        ee.src.q.ld.ip  q3, a7, 16, q4, q2  // Load 16 bytes from src_buff a7 to q3, concatenate q4 and q2 and shift to q4 by the SAR_BYTE amount, increase src_buff pointer a7 by 16
+        ee.vst.128.ip   q4, a3, 16          // Store 16 bytes from q4 to aligned dest_buff a3, increase dest_buff pointer a3 by 16
+    ._main_loop_src_unalign_dest_align:
+
+    // Finish the main loop outside of the loop body, using the Q register preloads
+
+    // Check modulo 32 of the loop_len_remainder; if set, copy 32 bytes (16 RGB565 pixels)
+    bbci     a12, 5, _unalign_mod_32_check  // Branch if bit 5 of loop_len_remainder a12 is clear
+    ee.src.q.ld.ip  q4, a7, 0, q2, q3       // Load 16 bytes from src_buff a7 to q4, concatenate q2 and q3 and shift to q2 by the SAR_BYTE amount, don't increase src_buff pointer a7
+    ee.vst.128.ip   q2, a3, 16              // Store 16 bytes from q2 to aligned dest_buff a3, increase dest_buff pointer a3 by 16
+    ee.src.q        q3, q3, q4              // Concatenate q3 and q4 and shift to q3 by the SAR_BYTE amount
+    ee.vst.128.ip   q3, a3, 16              // Store 16 bytes from q3 to aligned dest_buff a3, increase dest_buff pointer a3 by 16
+    j _end_of_row_src_unalign_dest_align
+    _unalign_mod_32_check:
+
+    // Check modulo 16 of the loop_len_remainder; if set, copy 16 bytes (8 RGB565 pixels)
+    bbci     a12, 4, _unalign_mod_16_check  // Branch if bit 4 of loop_len_remainder a12 is clear
+    ee.src.q        q2, q2, q3              // Concatenate q2 and q3 and shift to q2 by the SAR_BYTE amount
+    ee.vst.128.ip   q2, a3, 16              // Store 16 bytes from q2 to aligned dest_buff a3, increase dest_buff pointer a3 by 16
+    addi     a7, a7, -16                    // Correct the src_buff pointer a7, caused by the q register preload
+    j _end_of_row_src_unalign_dest_align
+    _unalign_mod_16_check:
+
+    // Nothing to copy outside of the main loop
+    addi     a7, a7, -32                    // Correct the src_buff pointer a7, caused by the q register preload
+
+    _end_of_row_src_unalign_dest_align:
+
+    add      a3, a3, a6                     // dest_buff (a3) = dest_buff (a3) + dest_matrix_padding (a6)
+    add      a7, a7, a8                     // src_buff (a7) = src_buff (a7) + src_matrix_padding (a8)
+    addi.n   a5, a5, -1                     // Decrease the outer loop
+    bnez     a5, .outer_loop_src_unalign_dest_align
+
+    movi.n   a2, 1                          // Return LV_RESULT_OK = 1
+    retw.n                                  // Return
+
+
+    _src_unalign_dest_unalign:
+
+//**********************************************************************************************************************
+
+    // The most general case - can handle all possible combinations
+
+    // dest_buff (a3)     - unaligned
+    // src_buff (a7)      - unaligned
+    // dest_stride (a6)   - not a 16-byte multiple
+    // src_stride (a8)    - not a 16-byte multiple
+    // dest_w_bytes (a11) - not a 16-byte multiple
+
+    // Convert strides to matrix paddings
+    sub      a6, a6, a11                    // dest_matrix_padding (a6) = dest_stride (a6) - dest_w_bytes (a11)
+    sub      a8, a8, a11                    // src_matrix_padding (a8) = src_stride (a8) - dest_w_bytes (a11)
+
+    .outer_loop_all_unalign:
+
+    // dest_buff alignment check
+    and      a13, a10, a3                   // Alignment mask 0xf (a10) AND dest_buff pointer a3
+    beqz     a13, _dest_buff_aligned        // Branch if a13 = 0 (dest_buff is aligned)
+
+    movi.n   a14, 16                        // a14 = 16
+    sub      a13, a14, a13                  // a13 = 16 - unalignment
+
+    // Check modulo 8 of the unalignment a13; if set, copy 8 bytes (4 RGB565 pixels)
+    // src_buff a7, dest_buff a3, unalignment a13, copy registers a15, a14
+    macro_memcpy_mod_8 a7, a3, a13, a15, a14, __LINE__
+
+    // Check modulo 4 of the unalignment; if set, copy 4 bytes (2 RGB565 pixels)
+    // src_buff a7, dest_buff a3, unalignment a13, copy register a15
+    macro_memcpy_mod_4 a7, a3, a13, a15, __LINE__
+
+    // Check modulo 2 of the unalignment; if set, copy 2 bytes (1 RGB565 pixel)
+    // src_buff a7, dest_buff a3, unalignment a13, copy register a15
+    macro_memcpy_mod_2 a7, a3, a13, a15, __LINE__
+
+    // Check modulo 1 of the unalignment; if set, copy 1 byte (1/2 of an RGB565 pixel)
+    // src_buff a7, dest_buff a3, unalignment a13, copy register a15
+    macro_memcpy_mod_1 a7, a3, a13, a15, __LINE__
+
+    _dest_buff_aligned:
+
+    // Calculate modulo for non-aligned data
+    sub      a11, a11, a13                  // local_dest_w_bytes (a11) = dest_w_bytes (a11) - (16 - unalignment) (a13)
+    movi     a15, 48                        // a15 = 48
+    quou     a9, a11, a15                   // a9 = local_dest_w_bytes (a11) DIV 48 (a15)
+    remu     a12, a11, a15                  // a12 = local_dest_w_bytes (a11) MOD 48 (a15)
+
+    ee.ld.128.usar.ip   q2, a7, 16          // Preload 16 bytes from src_buff a7 to q2, get the SAR_BYTE value, increase src_buff pointer a7 by 16
+    ee.ld.128.usar.ip   q3, a7, 16          // Preload 16 bytes from src_buff a7 to q3, get the SAR_BYTE value, increase src_buff pointer a7 by 16
+
+    // Run main loop which copies 48 bytes (24 RGB565 pixels) in one loop run
+    loopnez  a9, ._main_loop_all_unalign
+        ee.src.q.ld.ip  q4, a7, 16, q2, q3  // Load 16 bytes from src_buff a7 to q4, concatenate q2 and q3 and shift to q2 by the SAR_BYTE amount, increase src_buff pointer a7 by 16
+        ee.vst.128.ip   q2, a3, 16          // Store 16 bytes from q2 to aligned dest_buff a3, increase dest_buff pointer a3 by 16
+        ee.src.q.ld.ip  q2, a7, 16, q3, q4  // Load 16 bytes from src_buff a7 to q2, concatenate q3 and q4 and shift to q3 by the SAR_BYTE amount, increase src_buff pointer a7 by 16
+        ee.vst.128.ip   q3, a3, 16          // Store 16 bytes from q3 to aligned dest_buff a3, increase dest_buff pointer a3 by 16
+        ee.src.q.ld.ip  q3, a7, 16, q4, q2  // Load 16 bytes from src_buff a7 to q3, concatenate q4 and q2 and shift to q4 by the SAR_BYTE amount, increase src_buff pointer a7 by 16
+        ee.vst.128.ip   q4, a3, 16          // Store 16 bytes from q4 to aligned dest_buff a3, increase dest_buff pointer a3 by 16
+    ._main_loop_all_unalign:
+
+    // Finish the main loop outside of the loop body, using the Q register preloads
+
+    // Check modulo 32 and modulo 8 of the loop_len_remainder a12
+    bbci     a12, 5, _all_unalign_mod_32_check        // Branch if bit 5 of loop_len_remainder a12 is clear
+    bbsi     a12, 3, _all_unalign_mod_32_mod_8_check  // Branch if bit 3 of loop_len_remainder a12 is set
+
+    // Copy 32 bytes (16 RGB565 pixels)
+    ee.src.q.ld.ip  q4, a7, 0, q2, q3       // Load 16 bytes from src_buff a7 to q4, concatenate q2 and q3 and shift to q2 by the SAR_BYTE amount, don't increase src_buff pointer a7
+    ee.vst.128.ip   q2, a3, 16              // Store 16 bytes from q2 to aligned dest_buff a3, increase dest_buff pointer a3 by 16
+    ee.src.q        q3, q3, q4              // Concatenate q3 and q4 and shift to q3 by the SAR_BYTE amount
+    ee.vst.128.ip   q3, a3, 16              // Store 16 bytes from q3 to aligned dest_buff a3, increase dest_buff pointer a3 by 16
+    j _skip_mod16
+
+    _all_unalign_mod_32_mod_8_check:
+    // Copy 40 bytes (20 RGB565 pixels)
+    ee.src.q.ld.ip  q4, a7, 16, q2, q3      // Load 16 bytes from src_buff a7 to q4, concatenate q2 and q3 and shift to q2 by the SAR_BYTE amount, increase src_buff pointer a7 by 16
+    ee.vst.128.ip   q2, a3, 16              // Store 16 bytes from q2 to aligned dest_buff a3, increase dest_buff pointer a3 by 16
+    ee.src.q.ld.ip  q2, a7, 0, q3, q4       // Load 16 bytes from src_buff a7 to q2, concatenate q3 and q4 and shift to q3 by the SAR_BYTE amount, don't increase src_buff pointer a7
+    ee.vst.128.ip   q3, a3, 16              // Store 16 bytes from q3 to aligned dest_buff a3, increase dest_buff pointer a3 by 16
+    ee.src.q        q4, q4, q2              // Concatenate q4 and q2 and shift to q4 by the SAR_BYTE amount
+    ee.vst.l.64.ip  q4, a3, 8               // Store the lower 8 bytes from q4 to aligned dest_buff a3, increase dest_buff pointer a3 by 8
+    addi     a7, a7, -8                     // Correct the src_buff pointer a7, caused by the q register preload
+    j _skip_mod16
+
+    _all_unalign_mod_32_check:
+
+    // Check modulo 16 and modulo 8 of the loop_len_remainder a12
+    bbci     a12, 4, _all_unalign_mod_16_check        // Branch if bit 4 of loop_len_remainder a12 is clear
+    bbsi     a12, 3, _all_unalign_mod_16_mod_8_check  // Branch if bit 3 of loop_len_remainder a12 is set
+
+    // Copy 16 bytes (8 RGB565 pixels)
+    ee.src.q        q2, q2, q3              // Concatenate q2 and q3 and shift to q2 by the SAR_BYTE amount
+    ee.vst.128.ip   q2, a3, 16              // Store 16 bytes from q2 to aligned dest_buff a3, increase dest_buff pointer a3 by 16
+    addi     a7, a7, -16                    // Correct the src_buff pointer a7, caused by the q register preload
+    j _skip_mod16
+
+    _all_unalign_mod_16_mod_8_check:
+    // Copy 24 bytes (12 RGB565 pixels)
+    ee.src.q.ld.ip  q4, a7, 0, q2, q3       // Load 16 bytes from src_buff a7 to q4, concatenate q2 and q3 and shift to q2 by the SAR_BYTE amount, don't increase src_buff pointer a7
+    ee.vst.128.ip   q2, a3, 16              // Store 16 bytes from q2 to aligned dest_buff a3, increase dest_buff pointer a3 by 16
+    ee.src.q        q3, q3, q4              // Concatenate q3 and q4 and shift to q3 by the SAR_BYTE amount
+    ee.vst.l.64.ip  q3, a3, 8               // Store the lower 8 bytes from q3 to aligned dest_buff a3, increase dest_buff pointer a3 by 8
+    addi     a7, a7, -8                     // Correct the src_buff pointer a7, caused by the q register preload
+    j _skip_mod16
+    _all_unalign_mod_16_check:
+
+    bbci     a12, 3, _all_unalign_mod_8_check         // Branch if bit 3 of loop_len_remainder a12 is clear
+    // Copy 8 bytes (4 RGB565 pixels)
+    ee.src.q        q2, q2, q3              // Concatenate q2 and q3 and shift to q2 by the SAR_BYTE amount
+    ee.vst.l.64.ip  q2, a3, 8               // Store the lower 8 bytes from q2 to aligned dest_buff a3, increase dest_buff pointer a3 by 8
+    addi     a7, a7, -24                    // Correct the src_buff pointer a7, caused by the q register preload
+    j _skip_mod16
+    _all_unalign_mod_8_check:
+
+    addi     a7, a7, -32                    // Correct the src_buff pointer a7, caused by the q register preload
+
+    _skip_mod16:
+
+    // Check modulo 4 of the loop_len_remainder; if set, copy 4 bytes (2 RGB565 pixels)
+    // src_buff a7, dest_buff a3, loop_len_remainder a12, copy register a15
+    macro_memcpy_mod_4 a7, a3, a12, a15, __LINE__
+
+    // Check modulo 2 of the loop_len_remainder; if set, copy 2 bytes (1 RGB565 pixel)
+    // src_buff a7, dest_buff a3, loop_len_remainder a12, copy register a15
+    macro_memcpy_mod_2 a7, a3, a12, a15, __LINE__
+
+    // Check modulo 1 of the loop_len_remainder; if set, copy 1 byte (1/2 RGB565 pixel)
+    // src_buff a7, dest_buff a3, loop_len_remainder a12, copy register a15
+    macro_memcpy_mod_1 a7, a3, a12, a15, __LINE__
+
+    slli     a11, a4, 1                     // Refresh dest_w_bytes
+    add      a3, a3, a6                     // dest_buff (a3) = dest_buff (a3) + dest_matrix_padding (a6)
+    add      a7, a7, a8                     // src_buff (a7) = src_buff (a7) + src_matrix_padding (a8)
+    addi.n   a5, a5, -1                     // Decrease the outer loop
+    bnez     a5, .outer_loop_all_unalign
+
+    movi.n   a2, 1                          // Return LV_RESULT_OK = 1
+    retw.n                                  // Return
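+
+//**********************************************************************************************************************
+
+    // Hedged C sketch of the small-width path that follows (an illustration only;
+    // the copy is done byte-wise because no alignment can be assumed):
+    //
+    //     for (uint32_t y = 0; y < dest_h; y++) {
+    //         for (uint32_t i = 0; i < dest_w_bytes; i++) { dest[i] = src[i]; }
+    //         dest += dest_stride;
+    //         src  += src_stride;
+    //     }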
+
+    // Small matrix width - keep it simple for widths of less than 8 pixels
+    _matrix_width_check:                    // Matrix width is less than 8 pixels
+
+    // Convert strides to matrix paddings
+    sub      a6, a6, a11                    // dest_matrix_padding (a6) = dest_stride (a6) - dest_w_bytes (a11)
+    sub      a8, a8, a11                    // src_matrix_padding (a8) = src_stride (a8) - dest_w_bytes (a11)
+
+    .outer_loop_short_matrix_length:
+
+    // Run main loop which copies 2 bytes (one RGB565 pixel) in one loop run
+    loopnez  a4, ._main_loop_short_matrix_length
+        l8ui     a15, a7, 0                 // Load 8 bits from src_buff a7 to a15, offset 0
+        l8ui     a14, a7, 1                 // Load 8 bits from src_buff a7 to a14, offset 1
+        s8i      a15, a3, 0                 // Save 8 bits from a15 to dest_buff a3, offset 0
+        s8i      a14, a3, 1                 // Save 8 bits from a14 to dest_buff a3, offset 1
+        addi.n   a7, a7, 2                  // Increment src_buff pointer a7 by 2
+        addi.n   a3, a3, 2                  // Increment dest_buff pointer a3 by 2
+    ._main_loop_short_matrix_length:
+
+    // Finish the remaining byte out of the main loop
+
+    // Check modulo 1 of the dest_w_bytes (a11); if set, copy 1 byte (1/2 RGB565 pixel)
+    // src_buff a7, dest_buff a3, dest_w_bytes a11, copy register a15
+    macro_memcpy_mod_1 a7, a3, a11, a15, __LINE__
+
+    add      a3, a3, a6                     // dest_buff (a3) = dest_buff (a3) + dest_matrix_padding (a6)
+    add      a7, a7, a8                     // src_buff (a7) = src_buff (a7) + src_matrix_padding (a8)
+    addi.n   a5, a5, -1                     // Decrease the outer loop
+    bnez     a5, .outer_loop_short_matrix_length
+
+    movi.n   a2, 1                          // Return LV_RESULT_OK = 1
+    retw.n                                  // Return

diff --git a/components/esp_lvgl_port/test_apps/simd/README.md b/components/esp_lvgl_port/test_apps/simd/README.md
index d319e2e3..90efa55c 100644
--- a/components/esp_lvgl_port/test_apps/simd/README.md
+++ b/components/esp_lvgl_port/test_apps/simd/README.md
@@ -4,7 +4,7 @@ Test app accommodates two types of tests: [`functionality test`](#Functionality-
 Assembly source files could be found in the [`lvgl_port`](../../src/lvgl9/simd/) component.
 Header file with the assembly function prototypes is provided into the LVGL using Kconfig option `LV_DRAW_SW_ASM_CUSTOM_INCLUDE` and can be found in the [`lvgl_port/include`](../../include/esp_lvgl_port_lv_blend.h)
 
-## Benchmark results
+## Benchmark results for LV Fill functions (memset)
 
 | Color format | Matrix size | Memory alignment | ASM version    | ANSI C version |
 | :----------- | :---------- | :--------------- | :------------- | :------------- |
@@ -15,6 +15,15 @@ Assembly source files could be found in the [`lvgl_port`](../../src/lvgl9/simd/)
 * this data was obtained by running [benchmark tests](#benchmark-test) on 128x128 16 byte aligned matrix (ideal case) and 127x127 1 byte aligned matrix (worst case)
 * the values represent cycles per sample to perform simple fill of the matrix on esp32s3
 
+## Benchmark results for LV Image functions (memcpy)
+
+| Color format | Matrix size | Memory alignment | ASM version    | ANSI C version |
+| :----------- | :---------- | :--------------- | :------------- | :------------- |
+| RGB565       | 128x128     | 16 byte          | 0.352          | 3.437          |
+|              | 127x128     | 1 byte           | 0.866          | 5.978          |
+
+* this data was obtained by running [benchmark tests](#benchmark-test) on 128x128 16 byte aligned matrix (ideal case) and 127x128 1 byte aligned matrix (worst case)
+* the values represent cycles per sample to perform memory copy between two matrices on esp32s3
+
 ## Functionality test
 * Tests, whether the HW accelerated assembly version of an LVGL function provides the same results as the ANSI version
 * A top-level flow of the functionality test:
@@ -62,6 +71,8 @@ Here's the test menu, pick your combo:
 (2)     "Test fill functionality RGB565" [fill][functionality][RGB565]
 (3)     "LV Fill benchmark ARGB8888" [fill][benchmark][ARGB8888]
 (4)     "LV Fill benchmark RGB565" [fill][benchmark][RGB565]
+(5)     "LV Image functionality RGB565 blend to RGB565" [image][functionality][RGB565]
+(6)     "LV Image benchmark RGB565 blend to RGB565" [image][benchmark][RGB565]
 
 Enter test for running.
 ```
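+
+A hedged sketch of the override mechanism (the macro itself is taken verbatim from
+`esp_lvgl_port_lv_blend.h`): when `CONFIG_LV_DRAW_SW_ASM_CUSTOM` is enabled, LVGL's
+software-blend code resolves the RGB565-to-RGB565 image blend to the port's assembly through:
+
+```c
+#ifndef LV_DRAW_SW_RGB565_BLEND_NORMAL_TO_RGB565
+#define LV_DRAW_SW_RGB565_BLEND_NORMAL_TO_RGB565(dsc) \
+    _lv_rgb565_blend_normal_to_rgb565_esp(dsc)
+#endif
+```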

diff --git a/components/esp_lvgl_port/test_apps/simd/main/CMakeLists.txt b/components/esp_lvgl_port/test_apps/simd/main/CMakeLists.txt
index 0a6d5da4..20c061ff 100644
--- a/components/esp_lvgl_port/test_apps/simd/main/CMakeLists.txt
+++ b/components/esp_lvgl_port/test_apps/simd/main/CMakeLists.txt
@@ -8,6 +8,9 @@ if(CONFIG_IDF_TARGET_ESP32 OR CONFIG_IDF_TARGET_ESP32S3)
     else()
         file(GLOB_RECURSE ASM_SOURCES ${PORT_PATH}/simd/*_esp32.S)  # Select only esp32 related files
     endif()
+
+    file(GLOB_RECURSE ASM_MACROS ${PORT_PATH}/simd/lv_macro_*.S)    # Explicitly add all assembler macro files
+
 else()
     message(WARNING "This test app is intended only for esp32 and esp32s3")
 endif()
@@ -15,7 +18,14 @@ endif()
 # Hard copy of LV files
 file(GLOB_RECURSE BLEND_SRCS lv_blend/src/*.c)
 
-idf_component_register(SRCS "test_app_main.c" "test_lv_fill_functionality.c" "test_lv_fill_benchmark.c" ${BLEND_SRCS} ${ASM_SOURCES}
+idf_component_register(SRCS "test_app_main.c"
+                            "test_lv_fill_functionality.c"      # memset tests
+                            "test_lv_fill_benchmark.c"
+                            "test_lv_image_functionality.c"     # memcpy tests
+                            "test_lv_image_benchmark.c"
+                            ${BLEND_SRCS}                       # Hard copy of LVGL's blend API, to simplify testing
+                            ${ASM_SOURCES}                      # Assembly src files
+                            ${ASM_MACROS}                       # Assembly macro files
                        INCLUDE_DIRS "lv_blend/include" "../../../include"
                        REQUIRES unity
                        WHOLE_ARCHIVE)

diff --git a/components/esp_lvgl_port/test_apps/simd/main/lv_blend/include/lv_draw_sw_blend.h b/components/esp_lvgl_port/test_apps/simd/main/lv_blend/include/lv_draw_sw_blend.h
index 01c5f769..40ab1e84 100644
--- a/components/esp_lvgl_port/test_apps/simd/main/lv_blend/include/lv_draw_sw_blend.h
+++ b/components/esp_lvgl_port/test_apps/simd/main/lv_blend/include/lv_draw_sw_blend.h
@@ -57,6 +57,7 @@ typedef struct {
     lv_color_format_t src_color_format;
     lv_opa_t opa;
     lv_blend_mode_t blend_mode;
+    bool use_asm;
 } _lv_draw_sw_blend_image_dsc_t;
 
 /**********************

diff --git a/components/esp_lvgl_port/test_apps/simd/main/lv_blend/include/lv_string.h b/components/esp_lvgl_port/test_apps/simd/main/lv_blend/include/lv_string.h
new file mode 100644
index 00000000..9747af98
--- /dev/null
+++ b/components/esp_lvgl_port/test_apps/simd/main/lv_blend/include/lv_string.h
@@ -0,0 +1,79 @@
+/*
+ * SPDX-FileCopyrightText: 2024 Espressif Systems (Shanghai) CO LTD
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * This file is derived from the LVGL project.
+ * See https://github.com/lvgl/lvgl for details.
+ */
+
+/**
+ * @file lv_string.h
+ *
+ */
+
+#ifndef LV_STRING_H
+#define LV_STRING_H
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/*********************
+ *      INCLUDES
+ *********************/
+//#include "../lv_conf_internal.h"
+#include <stddef.h>
+#include <stdint.h>
+#include "lv_types.h"
+
+/*********************
+ *      DEFINES
+ *********************/
+
+/**********************
+ *      TYPEDEFS
+ **********************/
+
+/**********************
+ * GLOBAL PROTOTYPES
+ **********************/
+
+/**
+ * @brief Copies a block of memory from a source address to a destination address.
+ * @param dst Pointer to the destination array where the content is to be copied.
+ * @param src Pointer to the source of data to be copied.
+ * @param len Number of bytes to copy.
+ * @return Pointer to the destination array.
+ * @note The function does not check for any overlapping of the source and destination memory blocks.
+ */
+void * lv_memcpy(void * dst, const void * src, size_t len);
+
+/**
+ * @brief Fills a block of memory with a specified value.
+ * @param dst Pointer to the destination array to fill with the specified value.
+ * @param v Value to be set. The value is passed as an int, but the function fills
+ *          the block of memory using the unsigned char conversion of this value.
+ * @param len Number of bytes to be set to the value.
+ */
+void lv_memset(void * dst, uint8_t v, size_t len);
+
+/**
+ * @brief Move a block of memory from source to destination
+ * @param dst Pointer to the destination array where the content is to be copied.
+ * @param src Pointer to the source of data to be copied.
+ * @param len Number of bytes to copy
+ * @return Pointer to the destination array.
+ */
+void * lv_memmove(void * dst, const void * src, size_t len);
+
+
+/**********************
+ *      MACROS
+ **********************/
+
+#ifdef __cplusplus
+} /*extern "C"*/
+#endif
+
+#endif /*LV_STRING_H*/

diff --git a/components/esp_lvgl_port/test_apps/simd/main/lv_blend/include/lv_types.h b/components/esp_lvgl_port/test_apps/simd/main/lv_blend/include/lv_types.h
index 2e9244fe..f97a51eb 100644
--- a/components/esp_lvgl_port/test_apps/simd/main/lv_blend/include/lv_types.h
+++ b/components/esp_lvgl_port/test_apps/simd/main/lv_blend/include/lv_types.h
@@ -19,6 +19,8 @@ extern "C" {
 #endif
 
+#include <stdint.h>
+
 /**********************
  *      TYPEDEFS
  **********************/

diff --git a/components/esp_lvgl_port/test_apps/simd/main/lv_blend/src/lv_draw_sw_blend_to_argb8888.c b/components/esp_lvgl_port/test_apps/simd/main/lv_blend/src/lv_draw_sw_blend_to_argb8888.c
index f18e3670..2f95bc74 100644
--- a/components/esp_lvgl_port/test_apps/simd/main/lv_blend/src/lv_draw_sw_blend_to_argb8888.c
+++ b/components/esp_lvgl_port/test_apps/simd/main/lv_blend/src/lv_draw_sw_blend_to_argb8888.c
@@ -23,7 +23,7 @@
 #include "lv_draw_sw_blend.h"
 #include "lv_math.h"
 #include "lv_color.h"
-#include "string.h"
+#include "lv_string.h"
 
 #include "esp_lvgl_port_lv_blend.h"
 
@@ -628,7 +628,7 @@ static void LV_ATTRIBUTE_FAST_MEM rgb888_image_blend(_lv_draw_sw_blend_image_dsc
     if (src_px_size == 4) {
         uint32_t line_in_bytes = w * 4;
         for (y = 0; y < h; y++) {
-            memcpy(dest_buf_c32, src_buf, line_in_bytes);   // lv_memcpy
+            lv_memcpy(dest_buf_c32, src_buf, line_in_bytes);
             dest_buf_c32 = drawbuf_next_row(dest_buf_c32, dest_stride);
             src_buf = drawbuf_next_row(src_buf, src_stride);
         }
@@ -870,9 +870,9 @@ static inline lv_color32_t LV_ATTRIBUTE_FAST_MEM lv_color_32_32_mix(lv_color32_t
 
 void lv_color_mix_with_alpha_cache_init(lv_color_mix_alpha_cache_t *cache)
 {
-    memset(&cache->fg_saved, 0x00, sizeof(lv_color32_t));   //lv_memzero
-    memset(&cache->bg_saved, 0x00, sizeof(lv_color32_t));   //lv_memzero
-    memset(&cache->res_saved, 0x00, sizeof(lv_color32_t));  //lv_memzero
+    lv_memset(&cache->fg_saved, 0x00, sizeof(lv_color32_t));    //lv_memzero
+    lv_memset(&cache->bg_saved, 0x00, sizeof(lv_color32_t));    //lv_memzero
+    lv_memset(&cache->res_saved, 0x00, sizeof(lv_color32_t));   //lv_memzero
     cache->res_alpha_saved = 255;
     cache->ratio_saved = 255;
 }

diff --git a/components/esp_lvgl_port/test_apps/simd/main/lv_blend/src/lv_draw_sw_blend_to_rgb565.c b/components/esp_lvgl_port/test_apps/simd/main/lv_blend/src/lv_draw_sw_blend_to_rgb565.c
index 361571ff..dd6e5392 100644
--- a/components/esp_lvgl_port/test_apps/simd/main/lv_blend/src/lv_draw_sw_blend_to_rgb565.c
+++ b/components/esp_lvgl_port/test_apps/simd/main/lv_blend/src/lv_draw_sw_blend_to_rgb565.c
@@ -23,7 +23,7 @@
 #include "lv_draw_sw_blend.h"
 #include "lv_math.h"
 #include "lv_color.h"
-#include "string.h"
+#include "lv_string.h"
 
 #include "esp_lvgl_port_lv_blend.h"
 
@@ -601,10 +601,12 @@ static void LV_ATTRIBUTE_FAST_MEM rgb565_image_blend(_lv_draw_sw_blend_image_dsc
 
     if (dsc->blend_mode == LV_BLEND_MODE_NORMAL) {
         if (mask_buf == NULL && opa >= LV_OPA_MAX) {
-            if (LV_RESULT_INVALID == LV_DRAW_SW_RGB565_BLEND_NORMAL_TO_RGB565(dsc)) {
+            if (dsc->use_asm) {
+                LV_DRAW_SW_RGB565_BLEND_NORMAL_TO_RGB565(dsc);
+            } else {
                 uint32_t line_in_bytes = w * 2;
                 for (y = 0; y < h; y++) {
-                    memcpy(dest_buf_u16, src_buf_u16, line_in_bytes);   // lv_memcpy
+                    lv_memcpy(dest_buf_u16, src_buf_u16, line_in_bytes);
                     dest_buf_u16 = drawbuf_next_row(dest_buf_u16, dest_stride);
                     src_buf_u16 = drawbuf_next_row(src_buf_u16, src_stride);
                 }

diff --git a/components/esp_lvgl_port/test_apps/simd/main/lv_blend/src/lv_string_builtin.c b/components/esp_lvgl_port/test_apps/simd/main/lv_blend/src/lv_string_builtin.c
new file mode 100644
index 00000000..0e604048
--- /dev/null
+++ b/components/esp_lvgl_port/test_apps/simd/main/lv_blend/src/lv_string_builtin.c
@@ -0,0 +1,188 @@
+/*
+ * SPDX-FileCopyrightText: 2025 Espressif Systems (Shanghai) CO LTD
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * This file is derived from the LVGL project.
+ * See https://github.com/lvgl/lvgl for details.
+ */
+
+/**
+ * @file lv_string.c
+ */
+
+/*********************
+ *      INCLUDES
+ *********************/
+//#include "../../lv_conf_internal.h"
+#if LV_USE_STDLIB_STRING == LV_STDLIB_BUILTIN
+#include "lv_assert.h"
+#include "lv_log.h"
+#include "lv_math.h"
+#include "lv_string.h"
+
+/*********************
+ *      DEFINES
+ *********************/
+#ifdef LV_ARCH_64
+    #define MEM_UNIT uint64_t
+    #define ALIGN_MASK 0x7
+#else
+    #define MEM_UNIT uint32_t
+    #define ALIGN_MASK 0x3
+#endif
+
+#define LV_ATTRIBUTE_FAST_MEM
+
+/**********************
+ *      TYPEDEFS
+ **********************/
+
+/**********************
+ *  STATIC PROTOTYPES
+ **********************/
+
+/**********************
+ *  STATIC VARIABLES
+ **********************/
+
+/**********************
+ *      MACROS
+ **********************/
+#if LV_USE_LOG && LV_LOG_TRACE_MEM
+    #define LV_TRACE_MEM(...) LV_LOG_TRACE(__VA_ARGS__)
+#else
+    #define LV_TRACE_MEM(...)
+#endif
+
+#define _COPY(d, s) *d = *s; d++; s++;
+#define _SET(d, v) *d = v; d++;
+#define _REPEAT8(expr) expr expr expr expr expr expr expr expr
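+
+/* For clarity, a hedged expansion example (illustration only): with 32-bit words,
+ * _REPEAT8(_COPY(d32, s32)) unrolls to eight word copies, i.e. 32 bytes per loop pass:
+ *
+ *     *d32 = *s32; d32++; s32++;   // ...repeated 8 times in total
+ */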
+
+/**********************
+ *   GLOBAL FUNCTIONS
+ **********************/
+
+void * LV_ATTRIBUTE_FAST_MEM lv_memcpy(void * dst, const void * src, size_t len)
+{
+    uint8_t * d8 = dst;
+    const uint8_t * s8 = src;
+
+    /*Simplify for small memories*/
+    if(len < 16) {
+        while(len) {
+            *d8 = *s8;
+            d8++;
+            s8++;
+            len--;
+        }
+        return dst;
+    }
+
+    lv_uintptr_t d_align = (lv_uintptr_t)d8 & ALIGN_MASK;
+    lv_uintptr_t s_align = (lv_uintptr_t)s8 & ALIGN_MASK;
+
+    /*Byte copy for unaligned memories*/
+    if(s_align != d_align) {
+        while(len > 32) {
+            _REPEAT8(_COPY(d8, s8));
+            _REPEAT8(_COPY(d8, s8));
+            _REPEAT8(_COPY(d8, s8));
+            _REPEAT8(_COPY(d8, s8));
+            len -= 32;
+        }
+        while(len) {
+            _COPY(d8, s8)
+            len--;
+        }
+        return dst;
+    }
+
+    /*Make the memories aligned*/
+    if(d_align) {
+        d_align = ALIGN_MASK + 1 - d_align;
+        while(d_align && len) {
+            _COPY(d8, s8);
+            d_align--;
+            len--;
+        }
+    }
+
+    uint32_t * d32 = (uint32_t *)d8;
+    const uint32_t * s32 = (const uint32_t *)s8;
+    while(len > 32) {
+        _REPEAT8(_COPY(d32, s32))
+        len -= 32;
+    }
+
+    d8 = (uint8_t *)d32;
+    s8 = (const uint8_t *)s32;
+    while(len) {
+        _COPY(d8, s8)
+        len--;
+    }
+
+    return dst;
+}
+
+void LV_ATTRIBUTE_FAST_MEM lv_memset(void * dst, uint8_t v, size_t len)
+{
+    uint8_t * d8 = (uint8_t *)dst;
+    lv_uintptr_t d_align = (lv_uintptr_t) d8 & ALIGN_MASK;
+
+    /*Make the address aligned*/
+    if(d_align) {
+        d_align = ALIGN_MASK + 1 - d_align;
+        while(d_align && len) {
+            _SET(d8, v);
+            len--;
+            d_align--;
+        }
+    }
+
+    uint32_t v32 = (uint32_t)v + ((uint32_t)v << 8) + ((uint32_t)v << 16) + ((uint32_t)v << 24);
+    uint32_t * d32 = (uint32_t *)d8;
+
+    while(len > 32) {
+        _REPEAT8(_SET(d32, v32));
+        len -= 32;
+    }
+
+    d8 = (uint8_t *)d32;
+    while(len) {
+        _SET(d8, v);
+        len--;
+    }
+}
+
+void * LV_ATTRIBUTE_FAST_MEM lv_memmove(void * dst, const void * src, size_t len)
+{
+    if(dst < src || (char *)dst > ((char *)src + len)) {
+        return lv_memcpy(dst, src, len);
+    }
+
+    if(dst > src) {
+        char * tmp = (char *)dst + len - 1;
+        char * s = (char *)src + len - 1;
+
+        while(len--) {
+            *tmp-- = *s--;
+        }
+    }
+    else {
+        char * tmp = (char *)dst;
+        char * s = (char *)src;
+
+        while(len--) {
+            *tmp++ = *s++;
+        }
+    }
+
+    return dst;
+}
+
+/**********************
+ *   STATIC FUNCTIONS
+ **********************/
+
+#endif /*LV_STDLIB_BUILTIN*/

diff --git a/components/esp_lvgl_port/test_apps/simd/main/lv_fill_common.h b/components/esp_lvgl_port/test_apps/simd/main/lv_fill_common.h
index 5243857e..b208819c 100644
--- a/components/esp_lvgl_port/test_apps/simd/main/lv_fill_common.h
+++ b/components/esp_lvgl_port/test_apps/simd/main/lv_fill_common.h
@@ -64,7 +64,7 @@ typedef struct {
     unsigned int cc_width;          // Corner case test array width
     unsigned int benchmark_cycles;  // Count of benchmark cycles
     void *array_align16;            // test array with 16 byte alignment - testing most ideal case
-    void *array_align1;             // test array with 1 byte alignment - testing wort case
+    void *array_align1;             // test array with 1 byte alignment - testing worst case
     void (*blend_api_func)(_lv_draw_sw_blend_fill_dsc_t *);     // pointer to LVGL API function
 } bench_test_case_params_t;

diff --git a/components/esp_lvgl_port/test_apps/simd/main/lv_image_common.h b/components/esp_lvgl_port/test_apps/simd/main/lv_image_common.h
new file mode 100644
index 00000000..f2c64827
--- /dev/null
+++ b/components/esp_lvgl_port/test_apps/simd/main/lv_image_common.h
@@ -0,0 +1,111 @@
+/*
+ * SPDX-FileCopyrightText: 2025 Espressif Systems (Shanghai) CO LTD
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ */
+
+#pragma once
+
+#include "esp_err.h"
+#include <stddef.h>
+#include "lv_color.h"
+#include "lv_draw_sw_blend.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+// ------------------------------------------------- Macros and Types --------------------------------------------------
+
+/**
+ * @brief Type of blend DUT function
+ */
+typedef enum {
+    OPERATION_FILL,
+    OPERATION_FILL_WITH_OPA,
+} blend_operation_t;
+
+/**
+ * @brief Canary pixels amount depending on data type
+ * @note
+ * - We should use at least 16 bytes of memory for canary pixels, because of the esp32s3 TIE 16-byte wide Q registers
+ * - Canary pixels are multiplied by sizeof(used_data_type) to get the memory length occupied by the canary pixels
+ * - The memory occupied by canary pixels should be a 16-byte multiple, to achieve the 16-byte memory alignment in the functionality test
+ * - For example, ideally, for RGB565 we would need 8 canary pixels -> 8 * sizeof(uint16_t) = 16
+ */
+typedef enum {
+    CANARY_PIXELS_ARGB8888 = 4,     /*!< Canary pixels: 4 * sizeof(uint32_t) = 16 */
+    CANARY_PIXELS_RGB565 = 8,       /*!< Canary pixels: 8 * sizeof(uint16_t) = 16 */
+} canary_pixels_t;
+
+/**
+ * @brief Functionality test combinations for LV Image
+ */
+typedef struct {
+    unsigned int min_w;                     /*!< Minimum width of the test array */
+    unsigned int min_h;                     /*!< Minimum height of the test array */
+    unsigned int max_w;                     /*!< Maximum width of the test array */
+    unsigned int max_h;                     /*!< Maximum height of the test array */
+    unsigned int src_min_unalign_byte;      /*!< Minimum amount of unaligned bytes of the source test array */
+    unsigned int dest_min_unalign_byte;     /*!< Minimum amount of unaligned bytes of the destination test array */
+    unsigned int src_max_unalign_byte;      /*!< Maximum amount of unaligned bytes of the source test array */
+    unsigned int dest_max_unalign_byte;     /*!< Maximum amount of unaligned bytes of the destination test array */
+    unsigned int src_unalign_step;          /*!< Increment step in bytes of the source test array unalignment */
+    unsigned int dest_unalign_step;         /*!< Increment step in bytes of the destination test array unalignment */
+    unsigned int src_stride_step;           /*!< Increment step in the stride of the source test array */
+    unsigned int dest_stride_step;          /*!< Increment step in the stride of the destination test array */
+    unsigned int test_combinations_count;   /*!< Count of test combinations */
+} test_matrix_lv_image_params_t;
+
+
+/**
+ * @brief Functionality test case parameters for LV Image
+ */
+typedef struct {
+    struct {
+        void *p_src;                /*!< pointer to the source test buff (common src buffer for both the ANSI and ASM) */
+        void *p_src_alloc;          /*!< pointer to the beginning of the memory allocated for the source ASM test buf, used in free() */
+        void *p_dest_asm;           /*!< pointer to the destination ASM test buf */
+        void *p_dest_ansi;          /*!< pointer to the destination ANSI test buf */
+        void *p_dest_asm_alloc;     /*!< pointer to the beginning of the memory allocated for the destination ASM test buf, used in free() */
+        void *p_dest_ansi_alloc;    /*!< pointer to the beginning of the memory allocated for the destination ANSI test buf, used in free() */
+    } buf;
+    void (*blend_api_func)(_lv_draw_sw_blend_image_dsc_t *);    /*!< pointer to LVGL API function */
+    lv_color_format_t color_format;         /*!< LV color format */
+    size_t src_data_type_size;              /*!< Used data type size in the source buffer, eg sizeof(src_buff[0]) */
+
+/**
+ * @brief Functionality test combinations for LV Image
+ */
+typedef struct {
+    unsigned int min_w;                     /*!< Minimum width of the test array */
+    unsigned int min_h;                     /*!< Minimum height of the test array */
+    unsigned int max_w;                     /*!< Maximum width of the test array */
+    unsigned int max_h;                     /*!< Maximum height of the test array */
+    unsigned int src_min_unalign_byte;      /*!< Minimum amount of unaligned bytes of the source test array */
+    unsigned int dest_min_unalign_byte;     /*!< Minimum amount of unaligned bytes of the destination test array */
+    unsigned int src_max_unalign_byte;      /*!< Maximum amount of unaligned bytes of the source test array */
+    unsigned int dest_max_unalign_byte;     /*!< Maximum amount of unaligned bytes of the destination test array */
+    unsigned int src_unalign_step;          /*!< Increment step, in bytes, of the source test array unalignment */
+    unsigned int dest_unalign_step;         /*!< Increment step, in bytes, of the destination test array unalignment */
+    unsigned int src_stride_step;           /*!< Increment step of the source test array stride */
+    unsigned int dest_stride_step;          /*!< Increment step of the destination test array stride */
+    unsigned int test_combinations_count;   /*!< Count of test combinations */
+} test_matrix_lv_image_params_t;
+
+
+/**
+ * @brief Functionality test case parameters for LV Image
+ */
+typedef struct {
+    struct {
+        void *p_src;                /*!< pointer to the source test buf (common src buffer for both the ANSI and ASM) */
+        void *p_src_alloc;          /*!< pointer to the beginning of the memory allocated for the source ASM test buf, used in free() */
+        void *p_dest_asm;           /*!< pointer to the destination ASM test buf */
+        void *p_dest_ansi;          /*!< pointer to the destination ANSI test buf */
+        void *p_dest_asm_alloc;     /*!< pointer to the beginning of the memory allocated for the destination ASM test buf, used in free() */
+        void *p_dest_ansi_alloc;    /*!< pointer to the beginning of the memory allocated for the destination ANSI test buf, used in free() */
+    } buf;
+    void (*blend_api_func)(_lv_draw_sw_blend_image_dsc_t *);    /*!< pointer to LVGL API function */
+    lv_color_format_t color_format;     /*!< LV color format */
+    size_t src_data_type_size;          /*!< Used data type size in the source buffer, eg sizeof(src_buf[0]) */
+    size_t dest_data_type_size;         /*!< Used data type size in the destination buffer, eg sizeof(dest_buf[0]) */
+    size_t src_buf_len;                 /*!< Length of the source buffer, including matrix padding (no canary pixels are used for the source buffer) */
+    size_t active_dest_buf_len;         /*!< Length of the destination buffer where the actual data are stored, including matrix padding, not including canary pixels */
+    size_t total_dest_buf_len;          /*!< Total length of the destination buffer (including canary pixels and matrix padding) */
+    size_t canary_pixels;               /*!< Canary pixels must be adjusted according to the used color type, to achieve the aligned memory effect */
+    unsigned int dest_w;                /*!< Destination buffer width */
+    unsigned int dest_h;                /*!< Destination buffer height */
+    unsigned int src_stride;            /*!< Source buffer stride */
+    unsigned int dest_stride;           /*!< Destination buffer stride */
+    unsigned int src_unalign_byte;      /*!< Source buffer memory unalignment */
+    unsigned int dest_unalign_byte;     /*!< Destination buffer memory unalignment */
+    blend_operation_t operation_type;   /*!< Type of fundamental blend operation */
+} func_test_case_lv_image_params_t;
+
+
+/**
+ * @brief Benchmark test case parameters for LV Image
+ */
+typedef struct {
+    unsigned int height;            /*!< Test array height */
+    unsigned int width;             /*!< Test array width */
+    unsigned int dest_stride;       /*!< Destination test array stride */
+    unsigned int src_stride;        /*!< Source test array stride */
+    unsigned int cc_height;         /*!< Corner case test array height */
+    unsigned int cc_width;          /*!< Corner case test array width */
+    unsigned int benchmark_cycles;  /*!< Count of benchmark cycles */
+    void *src_array_align16;        /*!< Source test array with 16 byte alignment - testing most ideal case */
+    void *src_array_align1;         /*!< Source test array with 1 byte alignment - testing worst case */
+    void *dest_array_align16;       /*!< Destination test array with 16 byte alignment - testing most ideal case */
+    void *dest_array_align1;        /*!< Destination test array with 1 byte alignment - testing worst case */
+    void (*blend_api_func)(_lv_draw_sw_blend_image_dsc_t *);    /*!< pointer to LVGL API function */
+} bench_test_case_lv_image_params_t;
+
+#ifdef __cplusplus
+} /*extern "C"*/
+#endif
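Note the unit mismatch between the two structures: the functionality test case stores strides in pixels and multiplies by the data type size when building the LVGL descriptor, while the benchmark structure stores byte strides directly. A quick illustration with hypothetical numbers:

    // A 24-pixel RGB565 row with 2 pixels of padding
    unsigned int dest_stride_px    = 26;                                  // as stored in func_test_case_lv_image_params_t
    unsigned int dest_stride_bytes = dest_stride_px * sizeof(uint16_t);   // 52, as expected by _lv_draw_sw_blend_image_dsc_t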
diff --git a/components/esp_lvgl_port/test_apps/simd/main/test_lv_image_benchmark.c b/components/esp_lvgl_port/test_apps/simd/main/test_lv_image_benchmark.c
new file mode 100644
index 00000000..2372f07a
--- /dev/null
+++ b/components/esp_lvgl_port/test_apps/simd/main/test_lv_image_benchmark.c
@@ -0,0 +1,171 @@
+/*
+ * SPDX-FileCopyrightText: 2025 Espressif Systems (Shanghai) CO LTD
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ */
+
+#include <stdlib.h>
+#include <malloc.h>
+#include <inttypes.h>
+
+#include "unity.h"
+#include "esp_log.h"
+#include "freertos/FreeRTOS.h"      // for xthal_get_ccount()
+#include "lv_image_common.h"
+#include "lv_draw_sw_blend.h"
+#include "lv_draw_sw_blend_to_rgb565.h"
+
+#define COMMON_DIM 128              // Common matrix dimension 128x128 pixels
+#define WIDTH COMMON_DIM
+#define HEIGHT COMMON_DIM
+#define STRIDE WIDTH
+#define UNALIGN_BYTES 3
+#define BENCHMARK_CYCLES 1000
+
+// ------------------------------------------------ Static variables ---------------------------------------------------
+
+static const char *TAG_LV_IMAGE_BENCH = "LV Image Benchmark";
+static const char *asm_ansi_func[] = {"ASM", "ANSI"};
+
+// ------------------------------------------------ Static function headers --------------------------------------------
+
+/**
+ * @brief Initialize the benchmark test
+ */
+static void lv_image_benchmark_init(bench_test_case_lv_image_params_t *test_params);
+
+/**
+ * @brief Run the benchmark test
+ */
+static float lv_image_benchmark_run(bench_test_case_lv_image_params_t *test_params, _lv_draw_sw_blend_image_dsc_t *dsc);
+
+// ------------------------------------------------ Test cases ---------------------------------------------------------
+
+/*
+Benchmark tests
+
+Requires:
+    - Functionality tests to pass first
+
+Purpose:
+    - Test that an acceleration is achieved by the assembly implementation of the LVGL blending API
+
+Procedure:
+    - Initialize input parameters (test array length, width, allocate array...) of the benchmark test
+    - Run the assembly version of the LVGL blending API multiple times (1000 times or so)
+    - First use input test parameters for the most ideal case (16-byte aligned arrays, array widths divisible by 2 for the RGB565 color format)
+    - Then use worst-case input test parameters (1-byte aligned arrays, array widths NOT divisible by 2 for the RGB565 color format)
+    - Count how many CPU cycles it takes to run a function from the LVGL blending API for each case (ideal and worst case)
+    - Run the ANSI version of the LVGL blending API multiple times (1000 times or so) and repeat the two steps above for the ANSI version
+    - Compare the results
+    - Free the test arrays and structures needed for the LVGL blending API
+
+Inducing the most ideal and worst case scenarios:
+    - Most ideal case:
+        - Both the source and the destination buffers should be aligned on 16-byte (Xtensa PIE) or 4-byte (Xtensa base) boundaries
+        - Matrix width (in pixels) should be equal to the main loop length in the assembly src code,
+          typically multiples of 16 bytes (for RGB565 it's either 32 bytes - 16 pixels, or 48 bytes - 24 pixels)
+        - Matrix height does not have any effect on benchmark unit tests, until the matrix is so large that cache limitations start to affect the performance
+        - Matrix strides should be equal to the matrix widths (0 matrix padding), or their multiples (matrix width = matrix padding)
+    - Worst case:
+        - Both the source and the destination buffers should NOT be aligned on 16-byte (Xtensa PIE) or 4-byte (Xtensa base) boundaries;
+          the source buffer unalignment should be different from the destination unalignment, with one unalignment being even, the other being odd
+          The unalignments shall be small numbers (preferably 1 or 2 bytes)
+        - Matrix width should be one pixel smaller than the matrix width for the most ideal case
+        - Matrix height does not have any effect on benchmark unit tests, until the matrix is so large that cache limitations start to affect the performance
+        - Matrix strides should NOT be equal to the matrix widths (non-zero matrix padding)
+*/
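The worst case is induced purely by pointer arithmetic on an over-allocated, 16-byte-aligned block, which is what the test case below does. A minimal sketch of the trick (len_bytes is a placeholder, not a name from the patch):

    uint16_t *buf16 = (uint16_t *)memalign(16, len_bytes + UNALIGN_BYTES);  // aligned base with spare bytes
    uint16_t *buf1  = (uint16_t *)((uint8_t *)buf16 + 1);                   // now (uintptr_t)buf1 % 16 == 1
    // Only buf16 may ever be passed to free(); buf1 is just a view into the same block.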
+// ------------------------------------------------ Test cases stages --------------------------------------------------
+
+TEST_CASE("LV Image benchmark RGB565 blend to RGB565", "[image][benchmark][RGB565]")
+{
+    uint16_t *dest_array_align16 = (uint16_t *)memalign(16, STRIDE * HEIGHT * sizeof(uint16_t) + UNALIGN_BYTES);
+    uint16_t *src_array_align16 = (uint16_t *)memalign(16, STRIDE * HEIGHT * sizeof(uint16_t) + UNALIGN_BYTES);
+    TEST_ASSERT_NOT_EQUAL(NULL, dest_array_align16);
+    TEST_ASSERT_NOT_EQUAL(NULL, src_array_align16);
+
+    // Apply byte unalignment for the worst-case test scenario
+    uint16_t *dest_array_align1 = (uint16_t *)((uint8_t *)dest_array_align16 + UNALIGN_BYTES - 2);
+    uint16_t *src_array_align1 = (uint16_t *)((uint8_t *)src_array_align16 + UNALIGN_BYTES);
+
+    bench_test_case_lv_image_params_t test_params = {
+        .height = HEIGHT,
+        .width = WIDTH,
+        .dest_stride = STRIDE * sizeof(uint16_t),
+        .src_stride = STRIDE * sizeof(uint16_t),
+        .cc_height = HEIGHT,
+        .cc_width = WIDTH - 1,
+        .benchmark_cycles = BENCHMARK_CYCLES,
+        .src_array_align16 = (void *)src_array_align16,
+        .src_array_align1 = (void *)src_array_align1,
+        .dest_array_align16 = (void *)dest_array_align16,
+        .dest_array_align1 = (void *)dest_array_align1,
+        .blend_api_func = &lv_draw_sw_blend_image_to_rgb565,
+    };
+
+    ESP_LOGI(TAG_LV_IMAGE_BENCH, "running test for RGB565 color format");
+    lv_image_benchmark_init(&test_params);
+    free(dest_array_align16);
+    free(src_array_align16);
+}
+// ------------------------------------------------ Static test functions ----------------------------------------------
+
+static void lv_image_benchmark_init(bench_test_case_lv_image_params_t *test_params)
+{
+    // Init structure for LVGL blend API, to call the Assembly API
+    _lv_draw_sw_blend_image_dsc_t dsc = {
+        .dest_buf = test_params->dest_array_align16,
+        .dest_w = test_params->width,
+        .dest_h = test_params->height,
+        .dest_stride = test_params->dest_stride,    // stride * sizeof()
+        .mask_buf = NULL,
+        .src_buf = test_params->src_array_align16,
+        .src_stride = test_params->src_stride,
+        .src_color_format = LV_COLOR_FORMAT_RGB565,
+        .opa = LV_OPA_MAX,
+        .blend_mode = LV_BLEND_MODE_NORMAL,
+        .use_asm = true,
+    };
+
+    // Init structure for LVGL blend API, to run the corner case (worst-case unaligned parameters)
+    _lv_draw_sw_blend_image_dsc_t dsc_cc = dsc;
+    dsc_cc.dest_buf = test_params->dest_array_align1;
+    dsc_cc.dest_w = test_params->cc_width;
+    dsc_cc.dest_h = test_params->cc_height;
+    dsc_cc.src_buf = test_params->src_array_align1;
+
+    // Run the benchmark 2 times:
+    // First run using assembly, second run using ANSI
+    for (int i = 0; i < 2; i++) {
+
+        // Run benchmark with the most ideal input parameters
+        float cycles = lv_image_benchmark_run(test_params, &dsc);          // Call Benchmark cycle
+        float per_sample = cycles / ((float)(dsc.dest_w * dsc.dest_h));
+        ESP_LOGI(TAG_LV_IMAGE_BENCH, " %s ideal case: %.3f cycles for %"PRIi32"x%"PRIi32" matrix, %.3f cycles per sample", asm_ansi_func[i], cycles, dsc.dest_w, dsc.dest_h, per_sample);
+
+        // Run benchmark with the corner case input parameters
+        cycles = lv_image_benchmark_run(test_params, &dsc_cc);             // Call Benchmark cycle
+        per_sample = cycles / ((float)(dsc_cc.dest_w * dsc_cc.dest_h));
+        ESP_LOGI(TAG_LV_IMAGE_BENCH, " %s corner case: %.3f cycles for %"PRIi32"x%"PRIi32" matrix, %.3f cycles per sample\n", asm_ansi_func[i], cycles, dsc_cc.dest_w, dsc_cc.dest_h, per_sample);
+
+        // Change to ANSI
+        dsc.use_asm = false;
+        dsc_cc.use_asm = false;
+    }
+}
+
+static float lv_image_benchmark_run(bench_test_case_lv_image_params_t *test_params, _lv_draw_sw_blend_image_dsc_t *dsc)
+{
+    // Call the DUT function once before measuring, to init the benchmark test and warm the caches
+    test_params->blend_api_func(dsc);
+
+    const unsigned int start_b = xthal_get_ccount();
+    for (int i = 0; i < test_params->benchmark_cycles; i++) {
+        test_params->blend_api_func(dsc);
+    }
+    const unsigned int end_b = xthal_get_ccount();
+
+    const float total_b = end_b - start_b;
+    const float cycles = total_b / (test_params->benchmark_cycles);
+    return cycles;
+}
diff --git a/components/esp_lvgl_port/test_apps/simd/main/test_lv_image_functionality.c b/components/esp_lvgl_port/test_apps/simd/main/test_lv_image_functionality.c
new file mode 100644
index 00000000..17317deb
--- /dev/null
+++ b/components/esp_lvgl_port/test_apps/simd/main/test_lv_image_functionality.c
@@ -0,0 +1,351 @@
+/*
+ * SPDX-FileCopyrightText: 2025 Espressif Systems (Shanghai) CO LTD
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ */
+
+#include <stdio.h>
+#include <string.h>
+#include <malloc.h>
+#include "sdkconfig.h"
+#include "unity.h"
+#include "esp_log.h"
+#include "lv_image_common.h"
+#include "lv_draw_sw_blend.h"
+#include "lv_draw_sw_blend_to_rgb565.h"
+
+// ------------------------------------------------- Defines -----------------------------------------------------------
+
+#define DBG_PRINT_OUTPUT false
+
+// ------------------------------------------------- Macros and Types --------------------------------------------------
+
+#define UPDATE_TEST_CASE(test_case_ptr, dest_w, dest_h, src_stride, dest_stride, src_unalign_byte, dest_unalign_byte) ({ \
+    (test_case_ptr)->src_buf_len = (size_t)(dest_h * src_stride); \
+    (test_case_ptr)->active_dest_buf_len = (size_t)(dest_h * dest_stride); \
+    (test_case_ptr)->total_dest_buf_len = (size_t)((dest_h * dest_stride) + (test_case_ptr->canary_pixels * 2)); \
+    (test_case_ptr)->dest_w = (dest_w); \
+    (test_case_ptr)->dest_h = (dest_h); \
+    (test_case_ptr)->src_stride = (src_stride); \
+    (test_case_ptr)->dest_stride = (dest_stride); \
+    (test_case_ptr)->src_unalign_byte = (src_unalign_byte); \
+    (test_case_ptr)->dest_unalign_byte = (dest_unalign_byte); \
+})
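One concrete combination makes the derived lengths easier to follow (numbers are illustrative only):

    // RGB565: dest_w = 8, dest_h = 2, src_stride = 10, dest_stride = 9, canary_pixels = 8
    // src_buf_len         = 2 * 10     = 20 pixels
    // active_dest_buf_len = 2 * 9      = 18 pixels
    // total_dest_buf_len  = 18 + 2 * 8 = 34 pixels (68 bytes)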
+// ------------------------------------------------ Static variables ---------------------------------------------------
+
+static const char *TAG_LV_IMAGE_FUNC = "LV Image Functionality";
+static char test_msg_buf[200];
+
+static const test_matrix_lv_image_params_t default_test_matrix_image_rgb565_blend_rgb565 = {
+#if CONFIG_IDF_TARGET_ESP32S3
+    .min_w = 8,                     // 8 is the lower limit for the esp32s3 asm implementation, otherwise the esp32 version is executed
+    .min_h = 1,
+    .max_w = 24,
+    .max_h = 2,
+    .src_max_unalign_byte = 16,     // Use 16-byte boundary check for Xtensa PIE
+    .dest_max_unalign_byte = 16,
+    .dest_unalign_step = 1,         // Step 1, as the destination array is always being aligned in the assembly code
+    .src_unalign_step = 3,          // Step 3 (more relaxed), as the source array is used unaligned in the assembly code
+    .src_stride_step = 3,
+    .dest_stride_step = 3,
+#else
+    .min_w = 1,
+    .min_h = 1,
+    .max_w = 16,
+    .max_h = 2,
+    .src_max_unalign_byte = 4,      // Use 4-byte boundary check for Xtensa base
+    .dest_max_unalign_byte = 4,
+    .dest_unalign_step = 1,
+    .src_unalign_step = 1,
+    .src_stride_step = 1,
+    .dest_stride_step = 1,
+#endif
+    .src_min_unalign_byte = 0,
+    .dest_min_unalign_byte = 0,
+    .test_combinations_count = 0,
+};
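The matrix above expands multiplicatively in the nested loops below, which is presumably why the steps are relaxed on the S3 target. For the S3 settings, the unalignment loops alone contribute:

    // per (dest_w, dest_h, src_stride, dest_stride) tuple:
    // n_src_unalign  = 6    (0..16 in steps of 3: 0, 3, 6, 9, 12, 15)
    // n_dest_unalign = 17   (0..16 in steps of 1)
    // -> 6 * 17 = 102 functionality runs for every width/height/stride tuple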
+
+// ------------------------------------------------ Static function headers --------------------------------------------
+
+/**
+ * @brief Generate all the functionality test combinations
+ *
+ * - generate functionality test combinations, based on the provided test_matrix struct
+ *
+ * @param[in] test_matrix Pointer to structure defining test matrix - all the test combinations
+ * @param[in] test_case Pointer to structure defining functionality test case
+ */
+static void functionality_test_matrix(test_matrix_lv_image_params_t *test_matrix, func_test_case_lv_image_params_t *test_case);
+
+/**
+ * @brief Fill test buffers for image functionality test
+ *
+ * @param[in] test_case Pointer to structure defining functionality test case
+ */
+static void fill_test_bufs(func_test_case_lv_image_params_t *test_case);
+
+/**
+ * @brief The actual functionality test
+ *
+ * - function prepares structures for functionality testing and runs the LVGL API
+ *
+ * @param[in] test_case Pointer to structure defining functionality test case
+ */
+static void lv_image_functionality(func_test_case_lv_image_params_t *test_case);
+
+/**
+ * @brief Evaluate results of LV Image functionality for 16bit data length
+ *
+ * @param[in] test_case Pointer to structure defining functionality test case
+ */
+static void test_eval_image_16bit_data(func_test_case_lv_image_params_t *test_case);
+
+// ------------------------------------------------ Test cases ---------------------------------------------------------
+
+/*
+Functionality tests
+
+Purpose:
+    - Test that the assembly version of the LVGL blending API achieves the same results as the ANSI version
+
+Procedure:
+    - Prepare a testing matrix to cover all possible combinations of destination and source array widths,
+      lengths, strides and memory alignments
+    - Run the assembly version of the LVGL blending API
+    - Run the ANSI C version of the LVGL blending API
+    - Compare the results
+    - Repeat the above 3 steps for each test matrix setup
+*/
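The core of each iteration can be reduced to a twin-descriptor pattern (illustrative fragment, fields abbreviated; blend stands for the blend_api_func pointer):

    _lv_draw_sw_blend_image_dsc_t dsc_asm  = { /* identical inputs */ .use_asm = true };
    _lv_draw_sw_blend_image_dsc_t dsc_ansi = dsc_asm;
    dsc_ansi.dest_buf = dest_buf_ansi;    // separate, identically pre-filled destination
    dsc_ansi.use_asm  = false;
    blend(&dsc_asm);
    blend(&dsc_ansi);
    // then require bit-exact equality of the two destination buffers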
+
+// ------------------------------------------------ Test cases stages --------------------------------------------------
+
+TEST_CASE("LV Image functionality RGB565 blend to RGB565", "[image][functionality][RGB565]")
+{
+    test_matrix_lv_image_params_t test_matrix = default_test_matrix_image_rgb565_blend_rgb565;
+
+    func_test_case_lv_image_params_t test_case = {
+        .blend_api_func = &lv_draw_sw_blend_image_to_rgb565,
+        .color_format = LV_COLOR_FORMAT_RGB565,
+        .canary_pixels = CANARY_PIXELS_RGB565,
+        .src_data_type_size = sizeof(uint16_t),
+        .dest_data_type_size = sizeof(uint16_t),
+        .operation_type = OPERATION_FILL,
+    };
+
+    ESP_LOGI(TAG_LV_IMAGE_FUNC, "running test for RGB565 color format");
+    functionality_test_matrix(&test_matrix, &test_case);
+}
+
+// ------------------------------------------------ Static test functions ----------------------------------------------
+
+static void functionality_test_matrix(test_matrix_lv_image_params_t *test_matrix, func_test_case_lv_image_params_t *test_case)
+{
+    // Step destination array width
+    for (int dest_w = test_matrix->min_w; dest_w <= test_matrix->max_w; dest_w++) {
+
+        // Step destination array height
+        for (int dest_h = test_matrix->min_h; dest_h <= test_matrix->max_h; dest_h++) {
+
+            // Step source array stride
+            for (int src_stride = dest_w; src_stride <= dest_w * 2; src_stride += test_matrix->src_stride_step) {
+
+                // Step destination array stride
+                for (int dest_stride = dest_w; dest_stride <= dest_w * 2; dest_stride += test_matrix->dest_stride_step) {
+
+                    // Step source array unalignment
+                    for (int src_unalign_byte = test_matrix->src_min_unalign_byte; src_unalign_byte <= test_matrix->src_max_unalign_byte; src_unalign_byte += test_matrix->src_unalign_step) {
+
+                        // Step destination array unalignment
+                        for (int dest_unalign_byte = test_matrix->dest_min_unalign_byte; dest_unalign_byte <= test_matrix->dest_max_unalign_byte; dest_unalign_byte += test_matrix->dest_unalign_step) {
+
+                            // Call functionality test
+                            UPDATE_TEST_CASE(test_case, dest_w, dest_h, src_stride, dest_stride, src_unalign_byte, dest_unalign_byte);
+                            lv_image_functionality(test_case);
+                            test_matrix->test_combinations_count++;
+                        }
+                    }
+                }
+            }
+        }
+    }
+    ESP_LOGI(TAG_LV_IMAGE_FUNC, "test combinations: %d\n", test_matrix->test_combinations_count);
+}
+
+static void lv_image_functionality(func_test_case_lv_image_params_t *test_case)
+{
+    fill_test_bufs(test_case);
+
+    _lv_draw_sw_blend_image_dsc_t dsc_asm = {
+        .dest_buf = test_case->buf.p_dest_asm,
+        .dest_w = test_case->dest_w,
+        .dest_h = test_case->dest_h,
+        .dest_stride = test_case->dest_stride * test_case->dest_data_type_size,     // dest_stride * sizeof(data_type)
+        .mask_buf = NULL,
+        .mask_stride = 0,
+        .src_buf = test_case->buf.p_src,
+        .src_stride = test_case->src_stride * test_case->src_data_type_size,        // src_stride * sizeof(data_type)
+        .src_color_format = test_case->color_format,
+        .opa = LV_OPA_MAX,
+        .blend_mode = LV_BLEND_MODE_NORMAL,
+        .use_asm = true,
+    };
+
+    // Init structure for LVGL blend API, to call the ANSI API
+    _lv_draw_sw_blend_image_dsc_t dsc_ansi = dsc_asm;
+    dsc_ansi.dest_buf = test_case->buf.p_dest_ansi;
+    dsc_ansi.use_asm = false;
+
+    test_case->blend_api_func(&dsc_asm);     // Call the LVGL API with Assembly code
+    test_case->blend_api_func(&dsc_ansi);    // Call the LVGL API with ANSI code
+
+    // Shift array pointers by (Canary pixels amount * data type length) back
+    test_case->buf.p_dest_asm -= test_case->canary_pixels * test_case->dest_data_type_size;
+    test_case->buf.p_dest_ansi -= test_case->canary_pixels * test_case->dest_data_type_size;
+
+    // Evaluate the results
+    sprintf(test_msg_buf, "Test case: dest_w = %d, dest_h = %d, dest_stride = %d, src_stride = %d, dest_unalign_byte = %d, src_unalign_byte = %d\n",
+            test_case->dest_w, test_case->dest_h, test_case->dest_stride, test_case->src_stride, test_case->dest_unalign_byte, test_case->src_unalign_byte);
+#if DBG_PRINT_OUTPUT
+    printf("%s\n", test_msg_buf);
+#endif
+    switch (test_case->color_format) {
+    case LV_COLOR_FORMAT_RGB565:
+        test_eval_image_16bit_data(test_case);
+        break;
+    default:
+        TEST_ASSERT_MESSAGE(false, "LV Color format not found");
+        break;
+    }
+
+    // Free memory allocated for test buffers
+    free(test_case->buf.p_dest_asm_alloc);
+    free(test_case->buf.p_dest_ansi_alloc);
+    free(test_case->buf.p_src_alloc);
+}
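Note the pointer bookkeeping around the canary region: fill_test_bufs() advances the destination pointers past the leading canaries before blending, and the rewind above is done directly on the void * members, which works because GNU C treats void * arithmetic as byte-sized. The round trip, side by side:

    dest_buf_asm += canary_pixels * dest_data_type_size;                // in fill_test_bufs(): skip leading canaries
    /* ... blend runs on the shifted pointers ... */
    test_case->buf.p_dest_asm -= canary_pixels * dest_data_type_size;   // before evaluation: rewind to the canaries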
+
+static void fill_test_bufs(func_test_case_lv_image_params_t *test_case)
+{
+    const size_t src_data_type_size = test_case->src_data_type_size;        // sizeof() of used data type in the source buffer
+    const size_t dest_data_type_size = test_case->dest_data_type_size;      // sizeof() of used data type in the destination buffer
+    const size_t src_buf_len = test_case->src_buf_len;                      // Total source buffer length, data part of the source buffer including matrix padding (no canary pixels are used for the source buffer)
+    const size_t total_dest_buf_len = test_case->total_dest_buf_len;        // Total destination buffer length, data part of the destination buffer including the canary pixels and matrix padding
+    const size_t active_dest_buf_len = test_case->active_dest_buf_len;      // Length of the data part of the destination buffer including matrix padding
+    const size_t canary_pixels = test_case->canary_pixels;                  // Canary pixels, according to the data type
+    const unsigned int src_unalign_byte = test_case->src_unalign_byte;      // Unalignment bytes for source buffer
+    const unsigned int dest_unalign_byte = test_case->dest_unalign_byte;    // Unalignment bytes for destination buffer
+
+    // Allocate destination arrays and source array for Assembly and ANSI LVGL Blend API
+    void *src_mem_common = memalign(16, (src_buf_len * src_data_type_size) + src_unalign_byte);
+    void *dest_mem_asm = memalign(16, (total_dest_buf_len * dest_data_type_size) + dest_unalign_byte);
+    void *dest_mem_ansi = memalign(16, (total_dest_buf_len * dest_data_type_size) + dest_unalign_byte);
+    TEST_ASSERT_NOT_NULL_MESSAGE(src_mem_common, "Lack of memory");
+    TEST_ASSERT_NOT_NULL_MESSAGE(dest_mem_asm, "Lack of memory");
+    TEST_ASSERT_NOT_NULL_MESSAGE(dest_mem_ansi, "Lack of memory");
+
+    // Save a pointer to the beginning of the allocated memory, which will be used to free()
+    test_case->buf.p_src_alloc = src_mem_common;
+    test_case->buf.p_dest_asm_alloc = dest_mem_asm;
+    test_case->buf.p_dest_ansi_alloc = dest_mem_ansi;
+
+    // Apply destination and source array unalignment
+    uint8_t *src_buf_common = (uint8_t *)src_mem_common + src_unalign_byte;
+    uint8_t *dest_buf_asm = (uint8_t *)dest_mem_asm + dest_unalign_byte;
+    uint8_t *dest_buf_ansi = (uint8_t *)dest_mem_ansi + dest_unalign_byte;
+
+    // Set the whole buffer to 0, including the canary pixels part
+    memset(src_buf_common, 0, src_buf_len * src_data_type_size);
+    memset(dest_buf_asm, 0, total_dest_buf_len * dest_data_type_size);
+    memset(dest_buf_ansi, 0, total_dest_buf_len * dest_data_type_size);
+
+    switch (test_case->operation_type) {
+    case OPERATION_FILL:
+        // Fill the actual part of the destination buffers with known values;
+        // both buffers must hold identical data, because the stride padding area is compared too
+
+        if (test_case->color_format == LV_COLOR_FORMAT_RGB565) {
+            uint16_t *dest_buf_asm_uint16 = (uint16_t *)dest_buf_asm;
+            uint16_t *dest_buf_ansi_uint16 = (uint16_t *)dest_buf_ansi;
+            uint16_t *src_buf_uint16 = (uint16_t *)src_buf_common;
+
+            // Fill destination buffers
+            for (int i = 0; i < active_dest_buf_len; i++) {
+                dest_buf_asm_uint16[canary_pixels + i] = i + ((i & 1) ? 0x6699 : 0x9966);
+                dest_buf_ansi_uint16[canary_pixels + i] = dest_buf_asm_uint16[canary_pixels + i];
+            }
+
+            // Fill source buffer
+            for (int i = 0; i < src_buf_len; i++) {
+                src_buf_uint16[i] = i + ((i & 1) ? 0x55AA : 0xAA55);
+            }
+        }
+
+        break;
+    default:
+        TEST_ASSERT_MESSAGE(false, "LV Operation not found");
+        break;
+    }
+
+    // Shift array pointers by (Canary pixels amount * data type length) forward
+    dest_buf_asm += canary_pixels * dest_data_type_size;
+    dest_buf_ansi += canary_pixels * dest_data_type_size;
+
+    // Save a pointer to the working part of the memory, where the test data are stored
+    test_case->buf.p_src = (void *)src_buf_common;
+    test_case->buf.p_dest_asm = (void *)dest_buf_asm;
+    test_case->buf.p_dest_ansi = (void *)dest_buf_ansi;
+
+#if DBG_PRINT_OUTPUT
+    printf("Destination buffers fill:\n");
+    for (uint32_t i = 0; i < test_case->active_dest_buf_len; i++) {
+        printf("dest_buf[%"PRIi32"] %s ansi = %8"PRIx16" \t asm = %8"PRIx16" \n", i, ((i < 10) ? (" ") : ("")), ((uint16_t *)test_case->buf.p_dest_ansi)[i], ((uint16_t *)test_case->buf.p_dest_asm)[i]);
+    }
+    printf("\n");
+
+    printf("Source buffer fill:\n");
+    for (uint32_t i = 0; i < test_case->src_buf_len; i++) {
+        printf("src_buf[%"PRIi32"] %s = %8"PRIx16" \n", i, ((i < 10) ? (" ") : ("")), ((uint16_t *)test_case->buf.p_src)[i]);
+    }
+    printf("\n");
+#endif
+
+}
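The fill constants appear to be chosen so that consecutive pixels differ in both bytes, so a byte swap or an off-by-one-pixel copy cannot reproduce the expected pattern by accident. The first few generated values:

    // src:  0xAA55, 0x55AB, 0xAA57, 0x55AD, ...   (i + alternating 0xAA55 / 0x55AA)
    // dest: 0x9966, 0x669A, 0x9968, 0x669C, ...   (i + alternating 0x9966 / 0x6699)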
(" ") : ("")), ((uint16_t *)test_case->buf.p_src)[i]); + } + printf("\n"); +#endif + + // Canary pixels area must stay 0 + const size_t canary_pixels = test_case->canary_pixels; + TEST_ASSERT_EACH_EQUAL_UINT16_MESSAGE(0, (uint16_t *)test_case->buf.p_dest_ansi, canary_pixels, test_msg_buf); + TEST_ASSERT_EACH_EQUAL_UINT16_MESSAGE(0, (uint16_t *)test_case->buf.p_dest_asm, canary_pixels, test_msg_buf); + + // dest_buf_asm and dest_buf_ansi must be equal + TEST_ASSERT_EQUAL_UINT16_ARRAY_MESSAGE((uint16_t *)test_case->buf.p_dest_ansi + canary_pixels, (uint16_t *)test_case->buf.p_dest_asm + canary_pixels, test_case->active_dest_buf_len, test_msg_buf); + + // Data part of the destination buffer and source buffer (not considering matrix padding) must be equal + uint16_t *dest_row_begin = (uint16_t *)test_case->buf.p_dest_asm + canary_pixels; + uint16_t *src_row_begin = (uint16_t *)test_case->buf.p_src; + for (int row = 0; row < test_case->dest_h; row++) { + TEST_ASSERT_EQUAL_UINT16_ARRAY_MESSAGE(dest_row_begin, src_row_begin, test_case->dest_w, test_msg_buf); + dest_row_begin += test_case->dest_stride; // Move pointer of the destination buffer to the next row + src_row_begin += test_case->src_stride; // Move pointer of the source buffer to the next row + } + + // Canary pixels area must stay 0 + TEST_ASSERT_EACH_EQUAL_UINT16_MESSAGE(0, (uint16_t *)test_case->buf.p_dest_ansi + (test_case->total_dest_buf_len - canary_pixels), canary_pixels, test_msg_buf); + TEST_ASSERT_EACH_EQUAL_UINT16_MESSAGE(0, (uint16_t *)test_case->buf.p_dest_asm + (test_case->total_dest_buf_len - canary_pixels), canary_pixels, test_msg_buf); +}