From 4e493251cb39157f630e4b3d83c58bed1d877b47 Mon Sep 17 00:00:00 2001 From: Pavel Borcin Date: Thu, 28 Nov 2024 11:34:19 +0100 Subject: [PATCH] feature(lvgl_port): RGB888 SIMD fill --- components/esp_lvgl_port/CMakeLists.txt | 1 + .../include/esp_lvgl_port_lv_blend.h | 19 + .../simd/lv_color_blend_to_argb8888_esp32s3.S | 3 +- .../simd/lv_color_blend_to_rgb565_esp32s3.S | 3 +- .../simd/lv_color_blend_to_rgb888_esp32.S | 105 ++ .../simd/lv_color_blend_to_rgb888_esp32s3.S | 351 +++++++ .../esp_lvgl_port/test_apps/simd/README.md | 3 + .../include/lv_draw_sw_blend_to_rgb888.h | 53 + .../lv_blend/src/lv_draw_sw_blend_to_rgb888.c | 953 ++++++++++++++++++ .../test_apps/simd/main/lv_fill_common.h | 6 +- .../simd/main/test_lv_fill_benchmark.c | 47 +- .../simd/main/test_lv_fill_functionality.c | 88 +- 12 files changed, 1613 insertions(+), 19 deletions(-) create mode 100644 components/esp_lvgl_port/src/lvgl9/simd/lv_color_blend_to_rgb888_esp32.S create mode 100644 components/esp_lvgl_port/src/lvgl9/simd/lv_color_blend_to_rgb888_esp32s3.S create mode 100644 components/esp_lvgl_port/test_apps/simd/main/lv_blend/include/lv_draw_sw_blend_to_rgb888.h create mode 100644 components/esp_lvgl_port/test_apps/simd/main/lv_blend/src/lv_draw_sw_blend_to_rgb888.c diff --git a/components/esp_lvgl_port/CMakeLists.txt b/components/esp_lvgl_port/CMakeLists.txt index 8dc53693..0148fd77 100644 --- a/components/esp_lvgl_port/CMakeLists.txt +++ b/components/esp_lvgl_port/CMakeLists.txt @@ -94,6 +94,7 @@ if((lvgl_ver VERSION_GREATER_EQUAL "9.1.0") AND (lvgl_ver VERSION_LESS "9.2.0")) # Force link .S files set_property(TARGET ${COMPONENT_LIB} APPEND PROPERTY INTERFACE_LINK_LIBRARIES "-u lv_color_blend_to_argb8888_esp") set_property(TARGET ${COMPONENT_LIB} APPEND PROPERTY INTERFACE_LINK_LIBRARIES "-u lv_color_blend_to_rgb565_esp") + set_property(TARGET ${COMPONENT_LIB} APPEND PROPERTY INTERFACE_LINK_LIBRARIES "-u lv_color_blend_to_rgb888_esp") endif() endif() diff --git 
a/components/esp_lvgl_port/include/esp_lvgl_port_lv_blend.h b/components/esp_lvgl_port/include/esp_lvgl_port_lv_blend.h index c00de1c0..cb9560ab 100644 --- a/components/esp_lvgl_port/include/esp_lvgl_port_lv_blend.h +++ b/components/esp_lvgl_port/include/esp_lvgl_port_lv_blend.h @@ -32,6 +32,10 @@ extern "C" { _lv_color_blend_to_rgb565_esp(dsc) #endif +#ifndef LV_DRAW_SW_COLOR_BLEND_TO_RGB888 +#define LV_DRAW_SW_COLOR_BLEND_TO_RGB888(dsc) \ + _lv_color_blend_to_rgb888_esp(dsc) +#endif /********************** * TYPEDEFS @@ -83,6 +87,21 @@ static inline lv_result_t _lv_color_blend_to_rgb565_esp(_lv_draw_sw_blend_fill_d return lv_color_blend_to_rgb565_esp(&asm_dsc); } +extern int lv_color_blend_to_rgb888_esp(asm_dsc_t *asm_dsc); + +static inline lv_result_t _lv_color_blend_to_rgb888_esp(_lv_draw_sw_blend_fill_dsc_t *dsc) +{ + asm_dsc_t asm_dsc = { + .dst_buf = dsc->dest_buf, + .dst_w = dsc->dest_w, + .dst_h = dsc->dest_h, + .dst_stride = dsc->dest_stride, + .src_buf = &dsc->color, + }; + + return lv_color_blend_to_rgb888_esp(&asm_dsc); +} + #endif // CONFIG_LV_DRAW_SW_ASM_CUSTOM #ifdef __cplusplus diff --git a/components/esp_lvgl_port/src/lvgl9/simd/lv_color_blend_to_argb8888_esp32s3.S b/components/esp_lvgl_port/src/lvgl9/simd/lv_color_blend_to_argb8888_esp32s3.S index bb3956e6..10276f4f 100644 --- a/components/esp_lvgl_port/src/lvgl9/simd/lv_color_blend_to_argb8888_esp32s3.S +++ b/components/esp_lvgl_port/src/lvgl9/simd/lv_color_blend_to_argb8888_esp32s3.S @@ -32,8 +32,7 @@ lv_color_blend_to_argb8888_esp: - entry a1, 32 - ee.zero.q q0 // dummy TIE instruction, to enable the TIE + entry a1, 32 l32i.n a3, a2, 4 // a3 - dest_buff l32i.n a4, a2, 8 // a4 - dest_w in uint32_t diff --git a/components/esp_lvgl_port/src/lvgl9/simd/lv_color_blend_to_rgb565_esp32s3.S b/components/esp_lvgl_port/src/lvgl9/simd/lv_color_blend_to_rgb565_esp32s3.S index ee9f8a9c..3a9fe43c 100644 --- a/components/esp_lvgl_port/src/lvgl9/simd/lv_color_blend_to_rgb565_esp32s3.S +++ 
b/components/esp_lvgl_port/src/lvgl9/simd/lv_color_blend_to_rgb565_esp32s3.S @@ -31,8 +31,7 @@ lv_color_blend_to_rgb565_esp: - entry a1, 32 - ee.zero.q q0 // dummy TIE instruction, to enable the TIE + entry a1, 32 l32i.n a3, a2, 4 // a3 - dest_buff l32i.n a4, a2, 8 // a4 - dest_w in uint16_t diff --git a/components/esp_lvgl_port/src/lvgl9/simd/lv_color_blend_to_rgb888_esp32.S b/components/esp_lvgl_port/src/lvgl9/simd/lv_color_blend_to_rgb888_esp32.S new file mode 100644 index 00000000..467b5348 --- /dev/null +++ b/components/esp_lvgl_port/src/lvgl9/simd/lv_color_blend_to_rgb888_esp32.S @@ -0,0 +1,105 @@ +/* + * SPDX-FileCopyrightText: 2024 Espressif Systems (Shanghai) CO LTD + * + * SPDX-License-Identifier: Apache-2.0 + */ + +// This is LVGL RGB888 simple fill for ESP32 processor + + .section .text + .align 4 + .global lv_color_blend_to_rgb888_esp + .type lv_color_blend_to_rgb888_esp,@function +// The function implements the following C code: +// void lv_color_blend_to_rgb888(_lv_draw_sw_blend_fill_dsc_t * dsc); + +// Input params +// +// dsc - a2 + +// typedef struct { +// uint32_t opa; l32i 0 +// void * dst_buf; l32i 4 +// uint32_t dst_w; l32i 8 +// uint32_t dst_h; l32i 12 +// uint32_t dst_stride; l32i 16 +// const void * src_buf; l32i 20 +// uint32_t src_stride; l32i 24 +// const lv_opa_t * mask_buf; l32i 28 +// uint32_t mask_stride; l32i 32 +// } asm_dsc_t; + +lv_color_blend_to_rgb888_esp: + + entry a1, 32 + + l32i.n a3, a2, 4 // a3 - dest_buff + l32i.n a4, a2, 8 // a4 - dest_w in uint24_t + l32i.n a5, a2, 12 // a5 - dest_h in uint16_t + l32i.n a6, a2, 16 // a6 - dest_stride in bytes + l32i.n a7, a2, 20 // a7 - src_buff (color) + l32i.n a8, a7, 0 // a8 - color as value + + // a11 - dest_w_bytes = sizeof(uint24_t) * dest_w = 3 * a4 + slli a11, a4, 1 // a11 - dest_w_bytes = 2 * dest_w + add a11, a11, a4 // a11 - dest_w_bytes = a11 + a4 + + // Prepare register combinations + // a13 - 0xBBRRGGBB a14 - 0xGGBBRRGG a15 - 0xRRGGBBRR + l8ui a13, a7, 0 // 
blue 000B + slli a13, a13, 24 // shift to B000 + or a13, a13, a8 // a13 BRGB + + srli a14, a8, 8 // a14 00RG + slli a10, a8, 16 // a10 GB00 + or a14, a14, a10 // a14 GBRG + + slli a15, a8, 8 // a15 RGB0 + l8ui a10, a7, 2 // a7 000R + or a15, a15, a10 // a15 RGBR + + sub a6, a6, a11 // dest_stride = dest_stride - dest_w_bytes + + // Prepare main loop length and dest_w_bytes + srli a9, a4, 2 // a9 = loop_len = dest_w / 4, calculate main loop_len for original dest_w + movi.n a8, 0x3 // a8 = 0x3, remainder mask + and a10, a4, a8 // a10 - remainder after division by 4 = a4 and 0x3 + + .outer_loop: + + // Run main loop which sets 12 bytes (4 rgb888) in one loop run + loopnez a9, ._main_loop + s32i.n a13, a3, 0 // save 32 bits from 32-bit color a13 to dest_buff a3, offset 0 + s32i.n a14, a3, 4 // save 32 bits from 32-bit color a14 to dest_buff a3, offset 4 + s32i.n a15, a3, 8 // save 32 bits from 32-bit color a15 to dest_buff a3, offset 8 + addi.n a3, a3, 12 // increment dest_buff pointer by 12 + ._main_loop: + + bnei a10, 0x3, _less_than_3 // branch if less than 3 values left + s32i.n a13, a3, 0 // save 32 bits from a13 to dest_buff a3, offset 0 bytes + s32i.n a14, a3, 4 // save 32 bits from a14 to dest_buff a3, offset 4 bytes + s8i a15, a3, 8 // save 8 bits from a15 to dest_buff a3, offset 8 bytes + addi.n a3, a3, 9 // increment dest_buff pointer by 9 bytes + j _less_than_1 + _less_than_3: + + bnei a10, 0x2, _less_than_2 // branch if less than 2 values left + s32i.n a13, a3, 0 // save 32 bits from a13 to dest_buff a3, offset 0 bytes + s16i a14, a3, 4 // save 16 bits from a14 to dest_buff a3, offset 4 bytes + addi.n a3, a3, 6 // increment dest_buff pointer by 6 bytes + j _less_than_1 + _less_than_2: + + bnei a10, 0x1, _less_than_1 // branch if less than 1 value left + s16i a13, a3, 0 // save 16 bits from a13 to dest_buff a3, offset 0 bytes + s8i a15, a3, 2 // save 8 bits from a15 to dest_buff a3, offset 2 bytes + addi.n a3, a3, 3 // increment dest_buff pointer by 3 bytes 
+ _less_than_1: + + add a3, a3, a6 // dest_buff + dest_stride + addi.n a5, a5, -1 // decrease the outer loop + and a7, a8, a3 // a7 = dest_buff AND 0x3 (check if the address is 4-byte aligned) + bnez a5, .outer_loop + + movi.n a2, 1 // return LV_RESULT_OK = 1 + retw.n // return diff --git a/components/esp_lvgl_port/src/lvgl9/simd/lv_color_blend_to_rgb888_esp32s3.S b/components/esp_lvgl_port/src/lvgl9/simd/lv_color_blend_to_rgb888_esp32s3.S new file mode 100644 index 00000000..bb69f75e --- /dev/null +++ b/components/esp_lvgl_port/src/lvgl9/simd/lv_color_blend_to_rgb888_esp32s3.S @@ -0,0 +1,351 @@ +/* + * SPDX-FileCopyrightText: 2024 Espressif Systems (Shanghai) CO LTD + * + * SPDX-License-Identifier: Apache-2.0 + */ + +// This is LVGL RGB888 simple fill for ESP32S3 processor + + .section .text + .align 4 + .global lv_color_blend_to_rgb888_esp + .type lv_color_blend_to_rgb888_esp,@function +// The function implements the following C code: +// void lv_color_blend_to_rgb888(_lv_draw_sw_blend_fill_dsc_t * dsc); + +// Input params +// +// dsc - a2 + +// typedef struct { +// uint32_t opa; l32i 0 +// void * dst_buf; l32i 4 +// uint32_t dst_w; l32i 8 +// uint32_t dst_h; l32i 12 +// uint32_t dst_stride; l32i 16 +// const void * src_buf; l32i 20 +// uint32_t src_stride; l32i 24 +// const lv_opa_t * mask_buf; l32i 28 +// uint32_t mask_stride; l32i 32 +// } asm_dsc_t; + +lv_color_blend_to_rgb888_esp: + + entry a1, 32 + + l32i.n a3, a2, 4 // a3 - dest_buff + l32i.n a4, a2, 8 // a4 - dest_w in uint24_t + l32i.n a5, a2, 12 // a5 - dest_h in uint16_t + l32i.n a6, a2, 16 // a6 - dest_stride in bytes + l32i.n a7, a2, 20 // a7 - src_buff (color) + l32i.n a8, a7, 0 // a8 - color as value + + // a11 - dest_w_bytes = sizeof(uint24_t) * dest_w = 3 * a4 + slli a11, a4, 1 // a11 - dest_w_bytes = 2 * dest_w + add a11, a11, a4 // a11 - dest_w_bytes = a11 + a4 + + // Prepare register combinations + // a13 - 0xBBRRGGBB a14 - 0xGGBBRRGG a15 - 0xRRGGBBRR + l8ui a13, a7, 0 // blue 000B + slli a13, 
a13, 24 // shift to B000 + or a13, a13, a8 // a13 BRGB + + srli a14, a8, 8 // a14 00RG + slli a10, a8, 16 // a10 GB00 + or a14, a14, a10 // a14 GBRG + + slli a15, a8, 8 // a15 RGB0 + l8ui a10, a7, 2 // a7 000R + or a15, a15, a10 // a15 RGBR + + sub a6, a6, a11 // dest_stride = dest_stride - dest_w_bytes + + // Check for short lengths + // dest_w should be at least 12, otherwise it's not worth using esp32s3 TIE + bgei a4, 12, _esp32s3_implementation // Branch if dest_w is greater than or equal to 12 + j .lv_color_blend_to_rgb888_esp32_body // Jump to esp32 implementation + + _esp32s3_implementation: + + // Prepare q registers for the main loop + ee.movi.32.q q3, a13, 0 // fill q3 register from a13 by 32 bits + ee.movi.32.q q3, a14, 1 // fill q3 register from a14 by 32 bits + ee.movi.32.q q3, a15, 2 // fill q3 register from a15 by 32 bits + ee.movi.32.q q3, a13, 3 // fill q3 register from a13 by 32 bits + + ee.movi.32.q q4, a14, 0 // fill q4 register from a14 by 32 bits + ee.movi.32.q q4, a15, 1 // fill q4 register from a15 by 32 bits + ee.movi.32.q q4, a13, 2 // fill q4 register from a13 by 32 bits + ee.movi.32.q q4, a14, 3 // fill q4 register from a14 by 32 bits + + ee.movi.32.q q5, a15, 0 // fill q5 register from a15 by 32 bits + ee.movi.32.q q5, a13, 1 // fill q5 register from a13 by 32 bits + ee.movi.32.q q5, a14, 2 // fill q5 register from a14 by 32 bits + ee.movi.32.q q5, a15, 3 // fill q5 register from a15 by 32 bits + + .outer_loop_aligned: + + // Copy q3 to q0 + ee.zero.q q0 // clear q0 + ee.orq q0, q0, q3 // copy q3 to q0 + + // Copy q4 to q1 + ee.zero.q q1 // clear q1 + ee.orq q1, q1, q4 // copy q4 to q1 + + // Copy q5 to q2 + ee.zero.q q2 // clear q2 + ee.orq q2, q2, q5 // copy q5 to q2 + + + // alignment check + extui a8, a3, 0, 4 // a8 = a3 AND 0xf + + // if a8 = 0 skip unalignment computation + bnez a8, _unaligned_dest_buff // If not aligned, jump to the unaligned handler + mov.n a10, a11 // a10 - local_dest_w_bytes = dest_w_bytes + j _aligned_dest_buff + 
_unaligned_dest_buff: + + // length + movi.n a12, 16 // a12 - 16 + sub a2, a12, a8 // a2 = 16 - unalignment (lower 4 bits of dest_buff address) + sub a10, a11, a2 // local_dest_w_bytes = len - (16 - unalignment) + + _aligned_dest_buff: + movi a12, 48 // a12 = 48 (main loop copies 48 bytes) + quou a9, a10, a12 // a9 = local_dest_w_bytes (a10) DIV 48 (a12) + remu a10, a10, a12 // a10 = local_dest_w_bytes (a10) remainder div 48 (a12) + + beqz a8, _dest_buff_aligned // If already aligned, skip aligning + + movi a7, unalignment_table // Load unalignment_table address + + addx4 a7, a8, a7 // a7 = offset * 4 + jump_table address + l32i a7, a7, 0 // Load target address from jump table + jx a7 // Jump to the corresponding handler + + +// a13 - 0xBBRRGGBB a14 - 0xGGBBRRGG a15 - 0xRRGGBBRR +handle_0: +handle_1: + s8i a13, a3, 0 // save 8 bits from a13 to dest_buff a3, offset 0 bytes + addi.n a3, a3, 1 // increment dest_buff pointer by 1 byte + s16i a14, a3, 0 // save 16 bits from a14 to dest_buff a3, offset 0 bytes + addi.n a3, a3, 2 // increment dest_buff pointer by 2 bytes + s32i a13, a3, 0 // save 32 bits from a13 to dest_buff a3, offset 0 bytes + addi.n a3, a3, 4 // increment dest_buff pointer by 4 bytes + ee.vst.l.64.ip q1, a3, 8 // save lower 64 bits from q0 to dest_buff a3, increase dest_buff pointer by 8 bytes + j _shift_q_regs +handle_2: + s16i a13, a3, 0 // save 16 bits from a13 to dest_buff a3, offset 0 bytes + addi.n a3, a3, 2 // increment dest_buff pointer by 2 bytes + s32i a15, a3, 0 // save 32 bits from a15 to dest_buff a3, offset 0 bytes + addi.n a3, a3, 4 // increment dest_buff pointer by 4 bytes + ee.vst.l.64.ip q0, a3, 8 // save lower 64 bits from q0 to dest_buff a3, increase dest_buff pointer by 8 bytes + j _shift_q_regs +handle_3: + s8i a13, a3, 0 // save 8 bits from a13 to dest_buff a3, offset 0 bytes + addi.n a3, a3, 1 // increment dest_buff pointer by 1 byte + s32i a14, a3, 0 // save 32 bits from a14 to dest_buff a3, offset 0 bytes + addi.n a3, a3, 4 
// increment dest_buff pointer by 4 bytes + ee.vst.l.64.ip q2, a3, 8 // save lower 64 bits from q0 to dest_buff a3, increase dest_buff pointer by 8 bytes + j _shift_q_regs +handle_4: + s32i a13, a3, 0 // save 32 bits from a13 to dest_buff a3, offset 0 bytes + addi.n a3, a3, 4 // increment dest_buff pointer by 4 bytes + ee.vst.l.64.ip q1, a3, 8 // save lower 64 bits from q0 to dest_buff a3, increase dest_buff pointer by 8 bytes + j _shift_q_regs +handle_5: + s8i a13, a3, 0 // save 8 bits from a13 to dest_buff a3, offset 0 bytes + addi.n a3, a3, 1 // increment dest_buff pointer by 1 byte + s16i a14, a3, 0 // save 16 bits from a14 to dest_buff a3, offset 0 bytes + addi.n a3, a3, 2 // increment dest_buff pointer by 2 bytes + ee.vst.l.64.ip q0, a3, 8 // save lower 64 bits from q0 to dest_buff a3, increase dest_buff pointer by 8 bytes + j _shift_q_regs +handle_6: + s16i a13, a3, 0 // save 16 bits from a13 to dest_buff a3, offset 0 byte + addi.n a3, a3, 2 // increment dest_buff pointer by 2 bytes + ee.vst.l.64.ip q2, a3, 8 // save lower 64 bits from q0 to dest_buff a3, increase dest_buff pointer by 8 bytes + j _shift_q_regs +handle_7: + s8i a13, a3, 0 // save 8 bits from a13 to dest_buff a3, offset 0 bytes + addi.n a3, a3, 1 // increment dest_buff pointer by 1 byte + ee.vst.l.64.ip q1, a3, 8 // save lower 64 bits from q0 to dest_buff a3, increase dest_buff pointer by 8 bytes + j _shift_q_regs +handle_8: + ee.vst.l.64.ip q0, a3, 8 // save lower 64 bits from q0 to dest_buff a3, increase dest_buff pointer by 8 bytes + j _shift_q_regs + +handle_9: + s8i a13, a3, 0 // save 8 bits from a13 to dest_buff a3, offset 0 bytes + addi.n a3, a3, 1 // increment dest_buff pointer by 1 byte + s16i a14, a3, 0 // save 16 bits from a14 to dest_buff a3, offset 0 bytes + addi.n a3, a3, 2 // increment dest_buff pointer by 2 bytes + s32i a13, a3, 0 // save 32 bits from a13 to dest_buff a3, offset 0 bytes + addi.n a3, a3, 4 // increment dest_buff pointer by 4 bytes + j _shift_q_regs +handle_10: + 
s16i a13, a3, 0 // save 16 bits from a13 to dest_buff a3, offset 0 bytes + addi.n a3, a3, 2 // increment dest_buff pointer by 2 bytes + s32i a15, a3, 0 // save 32 bits from a15 to dest_buff a3, offset 0 bytes + addi.n a3, a3, 4 // increment dest_buff pointer by 4 bytes + j _shift_q_regs +handle_11: + s8i a13, a3, 0 // save 8 bits from a13 to dest_buff a3, offset 0 bytes + addi.n a3, a3, 1 // increment dest_buff pointer by 1 byte + s32i a14, a3, 0 // save 32 bits from a14 to dest_buff a3, offset 0 bytes + addi.n a3, a3, 4 // increment dest_buff pointer by 4 bytes + j _shift_q_regs +handle_12: + s32i a13, a3, 0 // save 32 bits from a13 to dest_buff a3, offset 0 bytes + addi.n a3, a3, 4 // increment dest_buff pointer by 4 bytes + j _shift_q_regs +handle_13: + s8i a13, a3, 0 // save 8 bits from a13 to dest_buff a3, offset 0 bytes + addi.n a3, a3, 1 // increment dest_buff pointer by 1 byte + s16i a14, a3, 0 // save 16 bits from a14 to dest_buff a3, offset 0 bytes + addi.n a3, a3, 2 // increment dest_buff pointer by 2 bytes + j _shift_q_regs +handle_14: + s16i a13, a3, 0 // save 16 bits from a13 to dest_buff a3, offset 0 bytes + addi.n a3, a3, 2 // increment dest_buff pointer by 2 bytes + j _shift_q_regs +handle_15: + s8i a13, a3, 0 // save 8 bits from a13 to dest_buff a3, offset 0 bytes + addi.n a3, a3, 1 // increment dest_buff pointer by 1 byte + j _shift_q_regs + +.align 4 + +unalignment_table: + .word handle_0 // Case 0: Dummy case for easier address computation + .word handle_1 // Case 1: Align 15 bytes + .word handle_2 // Case 2: Align 14 bytes + .word handle_3 // Case 3: Align 13 bytes + .word handle_4 // Case 4: Align 12 bytes + .word handle_5 // Case 5: Align 11 bytes + .word handle_6 // Case 6: Align 10 bytes + .word handle_7 // Case 7: Align 9 bytes + .word handle_8 // Case 8: Align 8 bytes + .word handle_9 // Case 9: Align 7 bytes + .word handle_10 // Case 10: Align 6 bytes + .word handle_11 // Case 11: Align 5 bytes + .word handle_12 // Case 12: Align 4 
bytes + .word handle_13 // Case 13: Align 3 bytes + .word handle_14 // Case 14: Align 2 bytes + .word handle_15 // Case 15: Align 1 byte + + + _shift_q_regs: + wur.sar_byte a2 // apply unalignment to the SAR_BYTE + ee.src.q q0, q0, q1 // shift concat. of q0 and q1 to q0 by SAR_BYTE amount + ee.src.q q1, q1, q2 // shift concat. of q1 and q2 to q1 by SAR_BYTE amount + ee.src.q q2, q2, q3 // shift concat. of q2 and q3 to q2 by SAR_BYTE amount + + _dest_buff_aligned: + loopnez a9, ._main_loop_aligned // 48 bytes (16 rgb888) in one loop + ee.vst.128.ip q0, a3, 16 // store 16 bytes from q0 to dest_buff a3 + ee.vst.128.ip q1, a3, 16 // store 16 bytes from q1 to dest_buff a3 + ee.vst.128.ip q2, a3, 16 // store 16 bytes from q2 to dest_buff a3 + ._main_loop_aligned: + + // Check modulo 32 of the unalignment, if - then set 32 bytes + bbci a10, 5, .lt_32 // branch if 5-th bit of local_dest_w_bytes a10 is clear + ee.vst.128.ip q0, a3, 16 // store 16 bytes from q0 to dest_buff a3 + ee.vst.128.ip q1, a3, 16 // store 16 bytes from q1 to dest_buff a3 + + ee.srci.2q q0, q1, 1 // shift q0 register to have next bytes to store ready from LSB + .lt_32: + + // Check modulo 16 of the unalignment, if - then set 16 bytes + bbci a10, 4, .lt_16 // branch if 4-th bit of local_dest_w_bytes a10 is clear + ee.vst.128.ip q0, a3, 16 // store 16 bytes from q0 to dest_buff a3 + + ee.srci.2q q0, q1, 0 // shift q0 register to have next bytes to store ready from LSB + .lt_16: + + // Check modulo 8 of the unalignment, if - then set 8 bytes + bbci a10, 3, .lt_8 + ee.vst.l.64.ip q0, a3, 8 // store 8 bytes from q0 to dest_buff a3 + + ee.srci.2q q0, q1, 1 // shift q0 register to have next bytes to store ready from LSB + .lt_8: + + // Check modulo 4 of the unalignment, if - then set 4 bytes + bbci a10, 2, .lt_4 + ee.movi.32.a q0, a2, 0 // move lowest 32 bits of q0 to a2 + s32i.n a2, a3, 0 // save 32 bits from a2 to dest_buff a3, offset 0 + addi.n a3, a3, 4 // increment dest_buff pointer by 4 bytes + + 
ee.srci.2q q0, q1, 0 // shift q0 register to have next bytes to store ready from LSB + .lt_4: + + // Check modulo 2 of the unalignment, if - then set 2 bytes + bbci a10, 1, .lt_2 + ee.movi.32.a q0, a2, 0 // move lowest 32 bits of q0 to a2 + s16i a2, a3, 0 // save 16 bits from a2 to dest_buff a3, offset 0 + addi.n a3, a3, 2 // increment dest_buff pointer by 2 bytes + + ee.srci.2q q0, q1, 1 // shift q0 register to have next bytes to store ready from LSB + .lt_2: + + // Check modulo 1 of the unalignment, if - then set 1 byte + bbci a10, 0, .lt_1 + ee.movi.32.a q0, a2, 0 // move lowest 32 bits of q0 to a2 + s8i a2, a3, 0 // save 8 bits from a2 to dest_buff a3, offset 0 + addi.n a3, a3, 1 // increment dest_buff pointer by 1 byte + .lt_1: + + add a3, a3, a6 // dest_buff + dest_stride + addi.n a5, a5, -1 // decrease the outer loop + bnez a5, .outer_loop_aligned + + movi.n a2, 1 // return LV_RESULT_OK = 1 + retw.n // return + + .lv_color_blend_to_rgb888_esp32_body: + + // Prepare main loop length and dest_w_bytes + srli a9, a4, 2 // a9 = loop_len = dest_w / 4, calculate main loop_len for original dest_w + movi.n a8, 0x3 // a8 = 0x3, remainder mask + and a10, a4, a8 // a10 - remainder after division by 4 = a4 & 0x3 + + .outer_loop: + + // Run main loop which sets 12 bytes (4 rgb888) in one loop run + loopnez a9, ._main_loop + s32i.n a13, a3, 0 // save 32 bits from 32-bit color a13 to dest_buff a3, offset 0 + s32i.n a14, a3, 4 // save 32 bits from 32-bit color a14 to dest_buff a3, offset 4 + s32i.n a15, a3, 8 // save 32 bits from 32-bit color a15 to dest_buff a3, offset 8 + addi.n a3, a3, 12 // increment dest_buff pointer by 12 + ._main_loop: + + bnei a10, 0x3, _less_than_3 // branch if less than 3 values left + s32i.n a13, a3, 0 // save 32 bits from a13 to dest_buff a3, offset 0 bytes + s32i.n a14, a3, 4 // save 32 bits from a14 to dest_buff a3, offset 4 bytes + s8i a15, a3, 8 // save 8 bits from a15 to dest_buff a3, offset 8 bytes + addi.n a3, a3, 9 // increment dest_buff 
pointer by 9 bytes + j _less_than_1 + _less_than_3: + + bnei a10, 0x2, _less_than_2 // branch if less than 2 values left + s32i.n a13, a3, 0 // save 32 bits from a13 to dest_buff a3, offset 0 bytes + s16i a14, a3, 4 // save 16 bits from a14 to dest_buff a3, offset 4 bytes + addi.n a3, a3, 6 // increment dest_buff pointer by 6 bytes + j _less_than_1 + _less_than_2: + + bnei a10, 0x1, _less_than_1 // branch if less than 1 value left + s16i a13, a3, 0 // save 16 bits from a13 to dest_buff a3, offset 0 bytes + s8i a15, a3, 2 // save 8 bits from a15 to dest_buff a3, offset 2 bytes + addi.n a3, a3, 3 // increment dest_buff pointer by 3 bytes + _less_than_1: + + add a3, a3, a6 // dest_buff + dest_stride + addi.n a5, a5, -1 // decrease the outer loop + and a7, a8, a3 // a7 = dest_buff AND 0x3 (check if the address is 4-byte aligned) + bnez a5, .outer_loop + + movi.n a2, 1 // return LV_RESULT_OK = 1 + retw.n // return diff --git a/components/esp_lvgl_port/test_apps/simd/README.md b/components/esp_lvgl_port/test_apps/simd/README.md index d319e2e3..6a994d31 100644 --- a/components/esp_lvgl_port/test_apps/simd/README.md +++ b/components/esp_lvgl_port/test_apps/simd/README.md @@ -12,6 +12,8 @@ Assembly source files could be found in the [`lvgl_port`](../../src/lvgl9/simd/) | | 127x127 | 1 byte | 0.488 | 1.597 | | RGB565 | 128x128 | 16 byte | 0.196 | 1.146 | | | 127x127 | 1 byte | 0.497 | 1.124 | +| RGB888 | 128x128 | 16 byte | 0.608 | 2.247 | +| | 127x127 | 1 byte | 0.826 | 2.413 | * this data was obtained by running [benchmark tests](#benchmark-test) on 128x128 16 byte aligned matrix (ideal case) and 127x127 1 byte aligned matrix (worst case) * the values represent cycles per sample to perform simple fill of the matrix on esp32s3 @@ -118,3 +120,4 @@ Example of an best and corner case input parameters for benchmark test, for a co | :----------------- | :--------------- | :------------- | :------------- | :------------- | | Best case | 16-byte aligned | Multiple of 8 | Multiple 
of 8 | Multiple of 8 | | Corner case | 1-byte aligned | Not power of 2 | Not power of 2 | Not power of 2 | + diff --git a/components/esp_lvgl_port/test_apps/simd/main/lv_blend/include/lv_draw_sw_blend_to_rgb888.h b/components/esp_lvgl_port/test_apps/simd/main/lv_blend/include/lv_draw_sw_blend_to_rgb888.h new file mode 100644 index 00000000..3c7ac340 --- /dev/null +++ b/components/esp_lvgl_port/test_apps/simd/main/lv_blend/include/lv_draw_sw_blend_to_rgb888.h @@ -0,0 +1,53 @@ +/* + * SPDX-FileCopyrightText: 2024 Espressif Systems (Shanghai) CO LTD + * + * SPDX-License-Identifier: Apache-2.0 + * + * This file is derived from the LVGL project. + * See https://github.com/lvgl/lvgl for details. + */ + +/** + * @file lv_draw_sw_blend_rgb888.h + * + */ + +#ifndef LV_DRAW_SW_BLEND_RGB888_H +#define LV_DRAW_SW_BLEND_RGB888_H + +#ifdef __cplusplus +extern "C" { +#endif + +/********************* + * INCLUDES + *********************/ +#include "lv_draw_sw_blend.h" + +/********************* + * DEFINES + *********************/ + +/********************** + * TYPEDEFS + **********************/ + +/********************** + * GLOBAL PROTOTYPES + **********************/ + +void /* LV_ATTRIBUTE_FAST_MEM */ lv_draw_sw_blend_color_to_rgb888(_lv_draw_sw_blend_fill_dsc_t *dsc, + uint32_t dest_px_size); + +void /* LV_ATTRIBUTE_FAST_MEM */ lv_draw_sw_blend_image_to_rgb888(_lv_draw_sw_blend_image_dsc_t *dsc, + uint32_t dest_px_size); + +/********************** + * MACROS + **********************/ + +#ifdef __cplusplus +} /*extern "C"*/ +#endif + +#endif /*LV_DRAW_SW_BLEND_RGB888_H*/ diff --git a/components/esp_lvgl_port/test_apps/simd/main/lv_blend/src/lv_draw_sw_blend_to_rgb888.c b/components/esp_lvgl_port/test_apps/simd/main/lv_blend/src/lv_draw_sw_blend_to_rgb888.c new file mode 100644 index 00000000..344c6d1b --- /dev/null +++ b/components/esp_lvgl_port/test_apps/simd/main/lv_blend/src/lv_draw_sw_blend_to_rgb888.c @@ -0,0 +1,953 @@ +/* + * SPDX-FileCopyrightText: 2024 Espressif Systems 
(Shanghai) CO LTD + * + * SPDX-License-Identifier: Apache-2.0 + * + * This file is derived from the LVGL project. + * See https://github.com/lvgl/lvgl for details. + */ + +/** + * @file lv_draw_sw_blend_to_rgb888.c + * + */ + +/********************* + * INCLUDES + *********************/ +#include "lv_draw_sw_blend_to_rgb888.h" + +#include "lv_assert.h" +#include "lv_types.h" +#include "lv_log.h" +#include "lv_draw_sw_blend.h" +#include "lv_math.h" +#include "lv_color.h" +#include "string.h" + +#include "esp_lvgl_port_lv_blend.h" + +/********************* + * DEFINES + *********************/ + +#define LV_ATTRIBUTE_FAST_MEM + +/********************** + * TYPEDEFS + **********************/ + +/********************** + * STATIC PROTOTYPES + **********************/ + +static void /* LV_ATTRIBUTE_FAST_MEM */ al88_image_blend(_lv_draw_sw_blend_image_dsc_t *dsc, uint32_t dest_px_size); + +static void /* LV_ATTRIBUTE_FAST_MEM */ i1_image_blend(_lv_draw_sw_blend_image_dsc_t *dsc, uint32_t dest_px_size); + +static inline uint8_t /* LV_ATTRIBUTE_FAST_MEM */ get_bit(const uint8_t *buf, int32_t bit_idx); + +static void /* LV_ATTRIBUTE_FAST_MEM */ l8_image_blend(_lv_draw_sw_blend_image_dsc_t *dsc, uint32_t dest_px_size); + +static void /* LV_ATTRIBUTE_FAST_MEM */ rgb565_image_blend(_lv_draw_sw_blend_image_dsc_t *dsc, uint32_t dest_px_size); + +static void /* LV_ATTRIBUTE_FAST_MEM */ rgb888_image_blend(_lv_draw_sw_blend_image_dsc_t *dsc, + const uint8_t dest_px_size, + uint32_t src_px_size); + +static void /* LV_ATTRIBUTE_FAST_MEM */ argb8888_image_blend(_lv_draw_sw_blend_image_dsc_t *dsc, + uint32_t dest_px_size); + +static inline void /* LV_ATTRIBUTE_FAST_MEM */ lv_color_8_24_mix(const uint8_t src, uint8_t *dest, uint8_t mix); + +static inline void /* LV_ATTRIBUTE_FAST_MEM */ lv_color_24_24_mix(const uint8_t *src, uint8_t *dest, uint8_t mix); + +static inline void /* LV_ATTRIBUTE_FAST_MEM */ blend_non_normal_pixel(uint8_t *dest, lv_color32_t src, + lv_blend_mode_t mode); 
+static inline void * /* LV_ATTRIBUTE_FAST_MEM */ drawbuf_next_row(const void *buf, uint32_t stride); + +/********************** + * STATIC VARIABLES + **********************/ + +/********************** + * MACROS + **********************/ + +#ifndef LV_DRAW_SW_COLOR_BLEND_TO_RGB888 +#define LV_DRAW_SW_COLOR_BLEND_TO_RGB888(...) LV_RESULT_INVALID +#endif + +#ifndef LV_DRAW_SW_COLOR_BLEND_TO_RGB888_WITH_OPA +#define LV_DRAW_SW_COLOR_BLEND_TO_RGB888_WITH_OPA(...) LV_RESULT_INVALID +#endif + +#ifndef LV_DRAW_SW_COLOR_BLEND_TO_RGB888_WITH_MASK +#define LV_DRAW_SW_COLOR_BLEND_TO_RGB888_WITH_MASK(...) LV_RESULT_INVALID +#endif + +#ifndef LV_DRAW_SW_COLOR_BLEND_TO_RGB888_MIX_MASK_OPA +#define LV_DRAW_SW_COLOR_BLEND_TO_RGB888_MIX_MASK_OPA(...) LV_RESULT_INVALID +#endif + +#ifndef LV_DRAW_SW_L8_BLEND_NORMAL_TO_RGB888 +#define LV_DRAW_SW_L8_BLEND_NORMAL_TO_RGB888(...) LV_RESULT_INVALID +#endif + +#ifndef LV_DRAW_SW_L8_BLEND_NORMAL_TO_RGB888_WITH_OPA +#define LV_DRAW_SW_L8_BLEND_NORMAL_TO_RGB888_WITH_OPA(...) LV_RESULT_INVALID +#endif + +#ifndef LV_DRAW_SW_L8_BLEND_NORMAL_TO_RGB888_WITH_MASK +#define LV_DRAW_SW_L8_BLEND_NORMAL_TO_RGB888_WITH_MASK(...) LV_RESULT_INVALID +#endif + +#ifndef LV_DRAW_SW_L8_BLEND_NORMAL_TO_RGB888_MIX_MASK_OPA +#define LV_DRAW_SW_L8_BLEND_NORMAL_TO_RGB888_MIX_MASK_OPA(...) LV_RESULT_INVALID +#endif + +#ifndef LV_DRAW_SW_RGB565_BLEND_NORMAL_TO_RGB888 +#define LV_DRAW_SW_RGB565_BLEND_NORMAL_TO_RGB888(...) LV_RESULT_INVALID +#endif + +#ifndef LV_DRAW_SW_RGB565_BLEND_NORMAL_TO_RGB888_WITH_OPA +#define LV_DRAW_SW_RGB565_BLEND_NORMAL_TO_RGB888_WITH_OPA(...) LV_RESULT_INVALID +#endif + +#ifndef LV_DRAW_SW_RGB565_BLEND_NORMAL_TO_RGB888_WITH_MASK +#define LV_DRAW_SW_RGB565_BLEND_NORMAL_TO_RGB888_WITH_MASK(...) LV_RESULT_INVALID +#endif + +#ifndef LV_DRAW_SW_RGB565_BLEND_NORMAL_TO_RGB888_MIX_MASK_OPA +#define LV_DRAW_SW_RGB565_BLEND_NORMAL_TO_RGB888_MIX_MASK_OPA(...) 
LV_RESULT_INVALID +#endif + +#ifndef LV_DRAW_SW_RGB888_BLEND_NORMAL_TO_RGB888 +#define LV_DRAW_SW_RGB888_BLEND_NORMAL_TO_RGB888(...) LV_RESULT_INVALID +#endif + +#ifndef LV_DRAW_SW_RGB888_BLEND_NORMAL_TO_RGB888_WITH_OPA +#define LV_DRAW_SW_RGB888_BLEND_NORMAL_TO_RGB888_WITH_OPA(...) LV_RESULT_INVALID +#endif + +#ifndef LV_DRAW_SW_RGB888_BLEND_NORMAL_TO_RGB888_WITH_MASK +#define LV_DRAW_SW_RGB888_BLEND_NORMAL_TO_RGB888_WITH_MASK(...) LV_RESULT_INVALID +#endif + +#ifndef LV_DRAW_SW_RGB888_BLEND_NORMAL_TO_RGB888_MIX_MASK_OPA +#define LV_DRAW_SW_RGB888_BLEND_NORMAL_TO_RGB888_MIX_MASK_OPA(...) LV_RESULT_INVALID +#endif + +#ifndef LV_DRAW_SW_ARGB8888_BLEND_NORMAL_TO_RGB888 +#define LV_DRAW_SW_ARGB8888_BLEND_NORMAL_TO_RGB888(...) LV_RESULT_INVALID +#endif + +#ifndef LV_DRAW_SW_ARGB8888_BLEND_NORMAL_TO_RGB888_WITH_OPA +#define LV_DRAW_SW_ARGB8888_BLEND_NORMAL_TO_RGB888_WITH_OPA(...) LV_RESULT_INVALID +#endif + +#ifndef LV_DRAW_SW_ARGB8888_BLEND_NORMAL_TO_RGB888_WITH_MASK +#define LV_DRAW_SW_ARGB8888_BLEND_NORMAL_TO_RGB888_WITH_MASK(...) LV_RESULT_INVALID +#endif + +#ifndef LV_DRAW_SW_ARGB8888_BLEND_NORMAL_TO_RGB888_MIX_MASK_OPA +#define LV_DRAW_SW_ARGB8888_BLEND_NORMAL_TO_RGB888_MIX_MASK_OPA(...) LV_RESULT_INVALID +#endif + +#ifndef LV_DRAW_SW_I1_BLEND_NORMAL_TO_888 +#define LV_DRAW_SW_I1_BLEND_NORMAL_TO_888(...) LV_RESULT_INVALID +#endif + +#ifndef LV_DRAW_SW_I1_BLEND_NORMAL_TO_888_WITH_OPA +#define LV_DRAW_SW_I1_BLEND_NORMAL_TO_888_WITH_OPA(...) LV_RESULT_INVALID +#endif + +#ifndef LV_DRAW_SW_I1_BLEND_NORMAL_TO_888_WITH_MASK +#define LV_DRAW_SW_I1_BLEND_NORMAL_TO_888_WITH_MASK(...) LV_RESULT_INVALID +#endif + +#ifndef LV_DRAW_SW_I1_BLEND_NORMAL_TO_888_MIX_MASK_OPA +#define LV_DRAW_SW_I1_BLEND_NORMAL_TO_888_MIX_MASK_OPA(...) 
LV_RESULT_INVALID +#endif + +/********************** + * GLOBAL FUNCTIONS + **********************/ + +void LV_ATTRIBUTE_FAST_MEM lv_draw_sw_blend_color_to_rgb888(_lv_draw_sw_blend_fill_dsc_t *dsc, uint32_t dest_px_size) +{ + int32_t w = dsc->dest_w; + int32_t h = dsc->dest_h; + lv_opa_t opa = dsc->opa; + const lv_opa_t *mask = dsc->mask_buf; + int32_t mask_stride = dsc->mask_stride; + int32_t dest_stride = dsc->dest_stride; + + int32_t x; + int32_t y; + + LV_UNUSED(w); + LV_UNUSED(h); + LV_UNUSED(x); + LV_UNUSED(y); + LV_UNUSED(opa); + LV_UNUSED(mask); + LV_UNUSED(mask_stride); + LV_UNUSED(dest_stride); + + /*Simple fill*/ + if (mask == NULL && opa >= LV_OPA_MAX) { + if (dsc->use_asm && dest_px_size == 3) { + LV_DRAW_SW_COLOR_BLEND_TO_RGB888(dsc); + } else { + if (dest_px_size == 3) { + uint8_t *dest_buf_u8 = dsc->dest_buf; + uint8_t *dest_buf_ori = dsc->dest_buf; + w *= dest_px_size; + + for (x = 0; x < w; x += 3) { + dest_buf_u8[x + 0] = dsc->color.blue; + dest_buf_u8[x + 1] = dsc->color.green; + dest_buf_u8[x + 2] = dsc->color.red; + } + + dest_buf_u8 += dest_stride; + + for (y = 1; y < h; y++) { + // TODO: lv_memcpy + memcpy(dest_buf_u8, dest_buf_ori, w); + dest_buf_u8 += dest_stride; + } + } + if (dest_px_size == 4) { + uint32_t color32 = lv_color_to_u32(dsc->color); + uint32_t *dest_buf_u32 = dsc->dest_buf; + for (y = 0; y < h; y++) { + for (x = 0; x <= w - 16; x += 16) { + dest_buf_u32[x + 0] = color32; + dest_buf_u32[x + 1] = color32; + dest_buf_u32[x + 2] = color32; + dest_buf_u32[x + 3] = color32; + + dest_buf_u32[x + 4] = color32; + dest_buf_u32[x + 5] = color32; + dest_buf_u32[x + 6] = color32; + dest_buf_u32[x + 7] = color32; + + dest_buf_u32[x + 8] = color32; + dest_buf_u32[x + 9] = color32; + dest_buf_u32[x + 10] = color32; + dest_buf_u32[x + 11] = color32; + + dest_buf_u32[x + 12] = color32; + dest_buf_u32[x + 13] = color32; + dest_buf_u32[x + 14] = color32; + dest_buf_u32[x + 15] = color32; + } + for (; x < w; x ++) { + dest_buf_u32[x] = color32; 
+ } + + dest_buf_u32 = drawbuf_next_row(dest_buf_u32, dest_stride); + } + } + } + } + /*Opacity only*/ + else if (mask == NULL && opa < LV_OPA_MAX) { + if (LV_RESULT_INVALID == LV_DRAW_SW_COLOR_BLEND_TO_RGB888_WITH_OPA(dsc, dest_px_size)) { + uint32_t color32 = lv_color_to_u32(dsc->color); + uint8_t *dest_buf = dsc->dest_buf; + w *= dest_px_size; + for (y = 0; y < h; y++) { + for (x = 0; x < w; x += dest_px_size) { + lv_color_24_24_mix((const uint8_t *)&color32, &dest_buf[x], opa); + } + + dest_buf = drawbuf_next_row(dest_buf, dest_stride); + } + } + } + /*Masked with full opacity*/ + else if (mask && opa >= LV_OPA_MAX) { + if (LV_RESULT_INVALID == LV_DRAW_SW_COLOR_BLEND_TO_RGB888_WITH_MASK(dsc, dest_px_size)) { + uint32_t color32 = lv_color_to_u32(dsc->color); + uint8_t *dest_buf = dsc->dest_buf; + w *= dest_px_size; + + for (y = 0; y < h; y++) { + uint32_t mask_x; + for (x = 0, mask_x = 0; x < w; x += dest_px_size, mask_x++) { + lv_color_24_24_mix((const uint8_t *)&color32, &dest_buf[x], mask[mask_x]); + } + dest_buf += dest_stride; + mask += mask_stride; + } + } + } + /*Masked with opacity*/ + else { + if (LV_RESULT_INVALID == LV_DRAW_SW_COLOR_BLEND_TO_RGB888_MIX_MASK_OPA(dsc, dest_px_size)) { + uint32_t color32 = lv_color_to_u32(dsc->color); + uint8_t *dest_buf = dsc->dest_buf; + w *= dest_px_size; + + for (y = 0; y < h; y++) { + uint32_t mask_x; + for (x = 0, mask_x = 0; x < w; x += dest_px_size, mask_x++) { + lv_color_24_24_mix((const uint8_t *) &color32, &dest_buf[x], LV_OPA_MIX2(opa, mask[mask_x])); + } + dest_buf += dest_stride; + mask += mask_stride; + } + } + } +} + +void LV_ATTRIBUTE_FAST_MEM lv_draw_sw_blend_image_to_rgb888(_lv_draw_sw_blend_image_dsc_t *dsc, uint32_t dest_px_size) +{ + + switch (dsc->src_color_format) { + case LV_COLOR_FORMAT_RGB565: + rgb565_image_blend(dsc, dest_px_size); + break; + case LV_COLOR_FORMAT_RGB888: + rgb888_image_blend(dsc, dest_px_size, 3); + break; + case LV_COLOR_FORMAT_XRGB8888: + rgb888_image_blend(dsc, 
dest_px_size, 4); + break; + case LV_COLOR_FORMAT_ARGB8888: + argb8888_image_blend(dsc, dest_px_size); + break; + case LV_COLOR_FORMAT_L8: + l8_image_blend(dsc, dest_px_size); + break; + case LV_COLOR_FORMAT_AL88: + al88_image_blend(dsc, dest_px_size); + break; + case LV_COLOR_FORMAT_I1: + i1_image_blend(dsc, dest_px_size); + break; + default: + LV_LOG_WARN("Not supported source color format"); + break; + } +} + +/********************** + * STATIC FUNCTIONS + **********************/ + +static void LV_ATTRIBUTE_FAST_MEM i1_image_blend(_lv_draw_sw_blend_image_dsc_t *dsc, uint32_t dest_px_size) +{ + int32_t w = dsc->dest_w; + int32_t h = dsc->dest_h; + lv_opa_t opa = dsc->opa; + uint8_t *dest_buf_u8 = dsc->dest_buf; + int32_t dest_stride = dsc->dest_stride; + const uint8_t *src_buf_i1 = dsc->src_buf; + int32_t src_stride = dsc->src_stride; + const lv_opa_t *mask_buf = dsc->mask_buf; + int32_t mask_stride = dsc->mask_stride; + + int32_t dest_x; + int32_t src_x; + int32_t y; + + if (dsc->blend_mode == LV_BLEND_MODE_NORMAL) { + if (mask_buf == NULL && opa >= LV_OPA_MAX) { + if (LV_RESULT_INVALID == LV_DRAW_SW_I1_BLEND_NORMAL_TO_888(dsc)) { + for (y = 0; y < h; y++) { + for (dest_x = 0, src_x = 0; src_x < w; dest_x += dest_px_size, src_x++) { + uint8_t chan_val = get_bit(src_buf_i1, src_x) * 255; + dest_buf_u8[dest_x + 2] = chan_val; + dest_buf_u8[dest_x + 1] = chan_val; + dest_buf_u8[dest_x + 0] = chan_val; + } + dest_buf_u8 = drawbuf_next_row(dest_buf_u8, dest_stride); + src_buf_i1 = drawbuf_next_row(src_buf_i1, src_stride); + } + } + } else if (mask_buf == NULL && opa < LV_OPA_MAX) { + if (LV_RESULT_INVALID == LV_DRAW_SW_I1_BLEND_NORMAL_TO_888_WITH_OPA(dsc)) { + for (y = 0; y < h; y++) { + for (dest_x = 0, src_x = 0; src_x < w; dest_x += dest_px_size, src_x++) { + uint8_t chan_val = get_bit(src_buf_i1, src_x) * 255; + lv_color_8_24_mix(chan_val, &dest_buf_u8[dest_x], opa); + } + dest_buf_u8 = drawbuf_next_row(dest_buf_u8, dest_stride); + src_buf_i1 = 
drawbuf_next_row(src_buf_i1, src_stride); + } + } + } else if (mask_buf && opa >= LV_OPA_MAX) { + if (LV_RESULT_INVALID == LV_DRAW_SW_I1_BLEND_NORMAL_TO_888_WITH_MASK(dsc)) { + for (y = 0; y < h; y++) { + for (dest_x = 0, src_x = 0; src_x < w; dest_x += dest_px_size, src_x++) { + uint8_t chan_val = get_bit(src_buf_i1, src_x) * 255; + lv_color_8_24_mix(chan_val, &dest_buf_u8[dest_x], mask_buf[src_x]); + } + dest_buf_u8 = drawbuf_next_row(dest_buf_u8, dest_stride); + src_buf_i1 = drawbuf_next_row(src_buf_i1, src_stride); + mask_buf += mask_stride; + } + } + } else if (mask_buf && opa < LV_OPA_MAX) { + if (LV_RESULT_INVALID == LV_DRAW_SW_I1_BLEND_NORMAL_TO_888_MIX_MASK_OPA(dsc)) { + for (y = 0; y < h; y++) { + for (dest_x = 0, src_x = 0; src_x < w; dest_x += dest_px_size, src_x++) { + uint8_t chan_val = get_bit(src_buf_i1, src_x) * 255; + lv_color_8_24_mix(chan_val, &dest_buf_u8[dest_x], LV_OPA_MIX2(opa, mask_buf[src_x])); + } + dest_buf_u8 = drawbuf_next_row(dest_buf_u8, dest_stride); + src_buf_i1 = drawbuf_next_row(src_buf_i1, src_stride); + mask_buf += mask_stride; + } + } + } + } else { + for (y = 0; y < h; y++) { + for (dest_x = 0, src_x = 0; src_x < w; dest_x += dest_px_size, src_x++) { + lv_color32_t src_argb; + src_argb.red = get_bit(src_buf_i1, src_x) * 255; + src_argb.green = src_argb.red; + src_argb.blue = src_argb.red; + if (mask_buf == NULL) { + src_argb.alpha = opa; + } else { + src_argb.alpha = LV_OPA_MIX2(mask_buf[src_x], opa); + } + blend_non_normal_pixel(&dest_buf_u8[dest_x], src_argb, dsc->blend_mode); + } + if (mask_buf) { + mask_buf += mask_stride; + } + dest_buf_u8 = drawbuf_next_row(dest_buf_u8, dest_stride); + src_buf_i1 = drawbuf_next_row(src_buf_i1, src_stride); + } + } +} + +static void LV_ATTRIBUTE_FAST_MEM al88_image_blend(_lv_draw_sw_blend_image_dsc_t *dsc, uint32_t dest_px_size) +{ + int32_t w = dsc->dest_w; + int32_t h = dsc->dest_h; + lv_opa_t opa = dsc->opa; + uint8_t *dest_buf_u8 = dsc->dest_buf; + int32_t dest_stride = 
dsc->dest_stride; + const lv_color16a_t *src_buf_al88 = dsc->src_buf; + int32_t src_stride = dsc->src_stride; + const lv_opa_t *mask_buf = dsc->mask_buf; + int32_t mask_stride = dsc->mask_stride; + + int32_t dest_x; + int32_t src_x; + int32_t y; + + if (dsc->blend_mode == LV_BLEND_MODE_NORMAL) { + if (mask_buf == NULL && opa >= LV_OPA_MAX) { + if (LV_RESULT_INVALID == LV_DRAW_SW_L8_BLEND_NORMAL_TO_RGB888(dsc, dest_px_size)) { + for (y = 0; y < h; y++) { + for (dest_x = 0, src_x = 0; src_x < w; dest_x += dest_px_size, src_x++) { + lv_color_8_24_mix(src_buf_al88[src_x].lumi, &dest_buf_u8[dest_x], src_buf_al88[src_x].alpha); + } + dest_buf_u8 += dest_stride; + src_buf_al88 = drawbuf_next_row(src_buf_al88, src_stride); + } + } + } else if (mask_buf == NULL && opa < LV_OPA_MAX) { + if (LV_RESULT_INVALID == LV_DRAW_SW_L8_BLEND_NORMAL_TO_RGB888_WITH_OPA(dsc, dest_px_size)) { + for (y = 0; y < h; y++) { + for (dest_x = 0, src_x = 0; src_x < w; dest_x += dest_px_size, src_x++) { + lv_color_8_24_mix(src_buf_al88[src_x].lumi, &dest_buf_u8[dest_x], LV_OPA_MIX2(src_buf_al88[src_x].alpha, opa)); + } + dest_buf_u8 += dest_stride; + src_buf_al88 = drawbuf_next_row(src_buf_al88, src_stride); + } + } + } else if (mask_buf && opa >= LV_OPA_MAX) { + if (LV_RESULT_INVALID == LV_DRAW_SW_L8_BLEND_NORMAL_TO_RGB888_WITH_MASK(dsc, dest_px_size)) { + for (y = 0; y < h; y++) { + for (dest_x = 0, src_x = 0; src_x < w; dest_x += dest_px_size, src_x++) { + lv_color_8_24_mix(src_buf_al88[src_x].lumi, &dest_buf_u8[dest_x], LV_OPA_MIX2(src_buf_al88[src_x].alpha, + mask_buf[src_x])); + } + dest_buf_u8 += dest_stride; + src_buf_al88 = drawbuf_next_row(src_buf_al88, src_stride); + mask_buf += mask_stride; + } + } + } else if (mask_buf && opa < LV_OPA_MAX) { + if (LV_RESULT_INVALID == LV_DRAW_SW_L8_BLEND_NORMAL_TO_RGB888_MIX_MASK_OPA(dsc, dest_px_size)) { + for (y = 0; y < h; y++) { + for (dest_x = 0, src_x = 0; src_x < w; dest_x += dest_px_size, src_x++) { + lv_color_8_24_mix(src_buf_al88[src_x].lumi, 
&dest_buf_u8[dest_x], LV_OPA_MIX3(src_buf_al88[src_x].alpha, + mask_buf[src_x], opa)); + } + dest_buf_u8 += dest_stride; + src_buf_al88 = drawbuf_next_row(src_buf_al88, src_stride); + mask_buf += mask_stride; + } + } + } + } else { + for (y = 0; y < h; y++) { + for (dest_x = 0, src_x = 0; src_x < w; dest_x += dest_px_size, src_x++) { + lv_color32_t src_argb; + src_argb.red = src_argb.green = src_argb.blue = src_buf_al88[src_x].lumi; + if (mask_buf == NULL) { + src_argb.alpha = LV_OPA_MIX2(src_buf_al88[src_x].alpha, opa); + } else { + src_argb.alpha = LV_OPA_MIX3(src_buf_al88[src_x].alpha, mask_buf[dest_x], opa); + } + blend_non_normal_pixel(&dest_buf_u8[dest_x], src_argb, dsc->blend_mode); + } + if (mask_buf) { + mask_buf += mask_stride; + } + dest_buf_u8 += dest_stride; + src_buf_al88 = drawbuf_next_row(src_buf_al88, src_stride); + } + } +} + +static void LV_ATTRIBUTE_FAST_MEM l8_image_blend(_lv_draw_sw_blend_image_dsc_t *dsc, uint32_t dest_px_size) +{ + int32_t w = dsc->dest_w; + int32_t h = dsc->dest_h; + lv_opa_t opa = dsc->opa; + uint8_t *dest_buf_u8 = dsc->dest_buf; + int32_t dest_stride = dsc->dest_stride; + const uint8_t *src_buf_l8 = dsc->src_buf; + int32_t src_stride = dsc->src_stride; + const lv_opa_t *mask_buf = dsc->mask_buf; + int32_t mask_stride = dsc->mask_stride; + + int32_t dest_x; + int32_t src_x; + int32_t y; + + if (dsc->blend_mode == LV_BLEND_MODE_NORMAL) { + if (mask_buf == NULL && opa >= LV_OPA_MAX) { + if (LV_RESULT_INVALID == LV_DRAW_SW_L8_BLEND_NORMAL_TO_RGB888(dsc, dest_px_size)) { + for (y = 0; y < h; y++) { + for (dest_x = 0, src_x = 0; src_x < w; dest_x += dest_px_size, src_x++) { + dest_buf_u8[dest_x + 2] = src_buf_l8[src_x]; + dest_buf_u8[dest_x + 1] = src_buf_l8[src_x]; + dest_buf_u8[dest_x + 0] = src_buf_l8[src_x]; + } + dest_buf_u8 += dest_stride; + src_buf_l8 = drawbuf_next_row(src_buf_l8, src_stride); + } + } + } else if (mask_buf == NULL && opa < LV_OPA_MAX) { + if (LV_RESULT_INVALID == 
LV_DRAW_SW_L8_BLEND_NORMAL_TO_RGB888_WITH_OPA(dsc, dest_px_size)) { + for (y = 0; y < h; y++) { + for (dest_x = 0, src_x = 0; src_x < w; dest_x += dest_px_size, src_x++) { + lv_color_8_24_mix(src_buf_l8[src_x], &dest_buf_u8[dest_x], opa); + } + dest_buf_u8 += dest_stride; + src_buf_l8 = drawbuf_next_row(src_buf_l8, src_stride); + } + } + } else if (mask_buf && opa >= LV_OPA_MAX) { + if (LV_RESULT_INVALID == LV_DRAW_SW_L8_BLEND_NORMAL_TO_RGB888_WITH_MASK(dsc, dest_px_size)) { + for (y = 0; y < h; y++) { + for (dest_x = 0, src_x = 0; src_x < w; dest_x += dest_px_size, src_x++) { + lv_color_8_24_mix(src_buf_l8[src_x], &dest_buf_u8[dest_x], mask_buf[src_x]); + } + dest_buf_u8 += dest_stride; + src_buf_l8 = drawbuf_next_row(src_buf_l8, src_stride); + mask_buf += mask_stride; + } + } + } else if (mask_buf && opa < LV_OPA_MAX) { + if (LV_RESULT_INVALID == LV_DRAW_SW_L8_BLEND_NORMAL_TO_RGB888_MIX_MASK_OPA(dsc, dest_px_size)) { + for (y = 0; y < h; y++) { + for (dest_x = 0, src_x = 0; src_x < w; dest_x += dest_px_size, src_x++) { + lv_color_8_24_mix(src_buf_l8[src_x], &dest_buf_u8[dest_x], LV_OPA_MIX2(opa, mask_buf[src_x])); + } + dest_buf_u8 += dest_stride; + src_buf_l8 = drawbuf_next_row(src_buf_l8, src_stride); + mask_buf += mask_stride; + } + } + } + } else { + lv_color32_t src_argb; + for (y = 0; y < h; y++) { + for (dest_x = 0, src_x = 0; src_x < w; dest_x += dest_px_size, src_x++) { + src_argb.red = src_buf_l8[src_x]; + src_argb.green = src_buf_l8[src_x]; + src_argb.blue = src_buf_l8[src_x]; + if (mask_buf == NULL) { + src_argb.alpha = opa; + } else { + src_argb.alpha = LV_OPA_MIX2(mask_buf[dest_x], opa); + } + blend_non_normal_pixel(&dest_buf_u8[dest_x], src_argb, dsc->blend_mode); + } + if (mask_buf) { + mask_buf += mask_stride; + } + dest_buf_u8 += dest_stride; + src_buf_l8 = drawbuf_next_row(src_buf_l8, src_stride); + } + } +} + +static void LV_ATTRIBUTE_FAST_MEM rgb565_image_blend(_lv_draw_sw_blend_image_dsc_t *dsc, uint32_t dest_px_size) +{ + int32_t w = 
dsc->dest_w; + int32_t h = dsc->dest_h; + lv_opa_t opa = dsc->opa; + uint8_t *dest_buf_u8 = dsc->dest_buf; + int32_t dest_stride = dsc->dest_stride; + const lv_color16_t *src_buf_c16 = (const lv_color16_t *) dsc->src_buf; + int32_t src_stride = dsc->src_stride; + const lv_opa_t *mask_buf = dsc->mask_buf; + int32_t mask_stride = dsc->mask_stride; + + int32_t src_x; + int32_t dest_x; + int32_t y; + + if (dsc->blend_mode == LV_BLEND_MODE_NORMAL) { + if (mask_buf == NULL && opa >= LV_OPA_MAX) { + if (LV_RESULT_INVALID == LV_DRAW_SW_RGB565_BLEND_NORMAL_TO_RGB888(dsc, dest_px_size)) { + for (y = 0; y < h; y++) { + for (src_x = 0, dest_x = 0; src_x < w; dest_x += dest_px_size, src_x++) { + dest_buf_u8[dest_x + 2] = (src_buf_c16[src_x].red * 2106) >> 8; /*To make it rounded*/ + dest_buf_u8[dest_x + 1] = (src_buf_c16[src_x].green * 1037) >> 8; + dest_buf_u8[dest_x + 0] = (src_buf_c16[src_x].blue * 2106) >> 8; + } + dest_buf_u8 += dest_stride; + src_buf_c16 = drawbuf_next_row(src_buf_c16, src_stride); + } + } + } else if (mask_buf == NULL && opa < LV_OPA_MAX) { + if (LV_RESULT_INVALID == LV_DRAW_SW_RGB565_BLEND_NORMAL_TO_RGB888_WITH_OPA(dsc, dest_px_size)) { + uint8_t res[3]; + for (y = 0; y < h; y++) { + for (src_x = 0, dest_x = 0; src_x < w; dest_x += dest_px_size, src_x++) { + res[2] = (src_buf_c16[src_x].red * 2106) >> 8; /*To make it rounded*/ + res[1] = (src_buf_c16[src_x].green * 1037) >> 8; + res[0] = (src_buf_c16[src_x].blue * 2106) >> 8; + lv_color_24_24_mix(res, &dest_buf_u8[dest_x], opa); + } + dest_buf_u8 += dest_stride; + src_buf_c16 = drawbuf_next_row(src_buf_c16, src_stride); + } + } + } else if (mask_buf && opa >= LV_OPA_MAX) { + if (LV_RESULT_INVALID == LV_DRAW_SW_RGB565_BLEND_NORMAL_TO_RGB888_WITH_MASK(dsc, dest_px_size)) { + uint8_t res[3]; + for (y = 0; y < h; y++) { + for (src_x = 0, dest_x = 0; src_x < w; dest_x += dest_px_size, src_x++) { + res[2] = (src_buf_c16[src_x].red * 2106) >> 8; /*To make it rounded*/ + res[1] = (src_buf_c16[src_x].green * 
1037) >> 8; + res[0] = (src_buf_c16[src_x].blue * 2106) >> 8; + lv_color_24_24_mix(res, &dest_buf_u8[dest_x], mask_buf[src_x]); + } + dest_buf_u8 += dest_stride; + src_buf_c16 = drawbuf_next_row(src_buf_c16, src_stride); + mask_buf += mask_stride; + } + } + } else { + if (LV_RESULT_INVALID == LV_DRAW_SW_RGB565_BLEND_NORMAL_TO_RGB888_MIX_MASK_OPA(dsc, dest_px_size)) { + uint8_t res[3]; + for (y = 0; y < h; y++) { + for (src_x = 0, dest_x = 0; src_x < w; dest_x += dest_px_size, src_x++) { + res[2] = (src_buf_c16[src_x].red * 2106) >> 8; /*To make it rounded*/ + res[1] = (src_buf_c16[src_x].green * 1037) >> 8; + res[0] = (src_buf_c16[src_x].blue * 2106) >> 8; + lv_color_24_24_mix(res, &dest_buf_u8[dest_x], LV_OPA_MIX2(opa, mask_buf[src_x])); + } + dest_buf_u8 += dest_stride; + src_buf_c16 = drawbuf_next_row(src_buf_c16, src_stride); + mask_buf += mask_stride; + } + } + } + } else { + lv_color32_t src_argb; + for (y = 0; y < h; y++) { + for (src_x = 0, dest_x = 0; src_x < w; src_x++, dest_x += dest_px_size) { + src_argb.red = (src_buf_c16[src_x].red * 2106) >> 8; + src_argb.green = (src_buf_c16[src_x].green * 1037) >> 8; + src_argb.blue = (src_buf_c16[src_x].blue * 2106) >> 8; + if (mask_buf == NULL) { + src_argb.alpha = opa; + } else { + src_argb.alpha = LV_OPA_MIX2(mask_buf[src_x], opa); + } + blend_non_normal_pixel(&dest_buf_u8[dest_x], src_argb, dsc->blend_mode); + } + if (mask_buf) { + mask_buf += mask_stride; + } + dest_buf_u8 += dest_stride; + src_buf_c16 = drawbuf_next_row(src_buf_c16, src_stride); + } + } +} + +static void LV_ATTRIBUTE_FAST_MEM rgb888_image_blend(_lv_draw_sw_blend_image_dsc_t *dsc, const uint8_t dest_px_size, + uint32_t src_px_size) +{ + int32_t w = dsc->dest_w * dest_px_size; + int32_t h = dsc->dest_h; + lv_opa_t opa = dsc->opa; + uint8_t *dest_buf = dsc->dest_buf; + int32_t dest_stride = dsc->dest_stride; + const uint8_t *src_buf = dsc->src_buf; + int32_t src_stride = dsc->src_stride; + const lv_opa_t *mask_buf = dsc->mask_buf; + int32_t 
mask_stride = dsc->mask_stride; + + int32_t dest_x; + int32_t src_x; + int32_t y; + + if (dsc->blend_mode == LV_BLEND_MODE_NORMAL) { + /*Special case*/ + if (mask_buf == NULL && opa >= LV_OPA_MAX) { + if (LV_RESULT_INVALID == LV_DRAW_SW_RGB888_BLEND_NORMAL_TO_RGB888(dsc, dest_px_size, src_px_size)) { + if (src_px_size == dest_px_size) { + for (y = 0; y < h; y++) { + memcpy(dest_buf, src_buf, w); + dest_buf += dest_stride; + src_buf += src_stride; + } + } else { + for (y = 0; y < h; y++) { + for (dest_x = 0, src_x = 0; dest_x < w; dest_x += dest_px_size, src_x += src_px_size) { + dest_buf[dest_x + 0] = src_buf[src_x + 0]; + dest_buf[dest_x + 1] = src_buf[src_x + 1]; + dest_buf[dest_x + 2] = src_buf[src_x + 2]; + } + dest_buf += dest_stride; + src_buf += src_stride; + } + } + } + } + if (mask_buf == NULL && opa < LV_OPA_MAX) { + if (LV_RESULT_INVALID == LV_DRAW_SW_RGB888_BLEND_NORMAL_TO_RGB888_WITH_OPA(dsc, dest_px_size, src_px_size)) { + for (y = 0; y < h; y++) { + for (dest_x = 0, src_x = 0; dest_x < w; dest_x += dest_px_size, src_x += src_px_size) { + lv_color_24_24_mix(&src_buf[src_x], &dest_buf[dest_x], opa); + } + dest_buf += dest_stride; + src_buf += src_stride; + } + } + } + if (mask_buf && opa >= LV_OPA_MAX) { + if (LV_RESULT_INVALID == LV_DRAW_SW_RGB888_BLEND_NORMAL_TO_RGB888_WITH_MASK(dsc, dest_px_size, src_px_size)) { + uint32_t mask_x; + for (y = 0; y < h; y++) { + for (mask_x = 0, dest_x = 0, src_x = 0; dest_x < w; mask_x++, dest_x += dest_px_size, src_x += src_px_size) { + lv_color_24_24_mix(&src_buf[src_x], &dest_buf[dest_x], mask_buf[mask_x]); + } + dest_buf += dest_stride; + src_buf += src_stride; + mask_buf += mask_stride; + } + } + } + if (mask_buf && opa < LV_OPA_MAX) { + if (LV_RESULT_INVALID == LV_DRAW_SW_RGB888_BLEND_NORMAL_TO_RGB888_MIX_MASK_OPA(dsc, dest_px_size, src_px_size)) { + uint32_t mask_x; + for (y = 0; y < h; y++) { + for (mask_x = 0, dest_x = 0, src_x = 0; dest_x < w; mask_x++, dest_x += dest_px_size, src_x += src_px_size) { + 
lv_color_24_24_mix(&src_buf[src_x], &dest_buf[dest_x], LV_OPA_MIX2(opa, mask_buf[mask_x])); + } + dest_buf += dest_stride; + src_buf += src_stride; + mask_buf += mask_stride; + } + } + } + } else { + lv_color32_t src_argb; + for (y = 0; y < h; y++) { + for (dest_x = 0, src_x = 0; dest_x < w; dest_x += dest_px_size, src_x += src_px_size) { + src_argb.red = src_buf[src_x + 2]; + src_argb.green = src_buf[src_x + 1]; + src_argb.blue = src_buf[src_x + 0]; + if (mask_buf == NULL) { + src_argb.alpha = opa; + } else { + src_argb.alpha = LV_OPA_MIX2(mask_buf[dest_x], opa); + } + + blend_non_normal_pixel(&dest_buf[dest_x], src_argb, dsc->blend_mode); + } + if (mask_buf) { + mask_buf += mask_stride; + } + dest_buf += dest_stride; + src_buf += src_stride; + } + } +} + +static void LV_ATTRIBUTE_FAST_MEM argb8888_image_blend(_lv_draw_sw_blend_image_dsc_t *dsc, uint32_t dest_px_size) +{ + int32_t w = dsc->dest_w; + int32_t h = dsc->dest_h; + lv_opa_t opa = dsc->opa; + uint8_t *dest_buf = dsc->dest_buf; + int32_t dest_stride = dsc->dest_stride; + const lv_color32_t *src_buf_c32 = dsc->src_buf; + int32_t src_stride = dsc->src_stride; + const lv_opa_t *mask_buf = dsc->mask_buf; + int32_t mask_stride = dsc->mask_stride; + + int32_t dest_x; + int32_t src_x; + int32_t y; + + if (dsc->blend_mode == LV_BLEND_MODE_NORMAL) { + if (mask_buf == NULL && opa >= LV_OPA_MAX) { + if (LV_RESULT_INVALID == LV_DRAW_SW_ARGB8888_BLEND_NORMAL_TO_RGB888(dsc, dest_px_size)) { + for (y = 0; y < h; y++) { + for (dest_x = 0, src_x = 0; src_x < w; dest_x += dest_px_size, src_x++) { + lv_color_24_24_mix((const uint8_t *)&src_buf_c32[src_x], &dest_buf[dest_x], src_buf_c32[src_x].alpha); + } + dest_buf += dest_stride; + src_buf_c32 = drawbuf_next_row(src_buf_c32, src_stride); + } + } + } else if (mask_buf == NULL && opa < LV_OPA_MAX) { + if (LV_RESULT_INVALID == LV_DRAW_SW_ARGB8888_BLEND_NORMAL_TO_RGB888_WITH_OPA(dsc, dest_px_size)) { + for (y = 0; y < h; y++) { + for (dest_x = 0, src_x = 0; src_x < w; dest_x 
+= dest_px_size, src_x++) { + lv_color_24_24_mix((const uint8_t *)&src_buf_c32[src_x], &dest_buf[dest_x], LV_OPA_MIX2(src_buf_c32[src_x].alpha, opa)); + } + dest_buf += dest_stride; + src_buf_c32 = drawbuf_next_row(src_buf_c32, src_stride); + } + } + } else if (mask_buf && opa >= LV_OPA_MAX) { + if (LV_RESULT_INVALID == LV_DRAW_SW_ARGB8888_BLEND_NORMAL_TO_RGB888_WITH_MASK(dsc, dest_px_size)) { + for (y = 0; y < h; y++) { + for (dest_x = 0, src_x = 0; src_x < w; dest_x += dest_px_size, src_x++) { + lv_color_24_24_mix((const uint8_t *)&src_buf_c32[src_x], &dest_buf[dest_x], + LV_OPA_MIX2(src_buf_c32[src_x].alpha, mask_buf[src_x])); + } + dest_buf += dest_stride; + src_buf_c32 = drawbuf_next_row(src_buf_c32, src_stride); + mask_buf += mask_stride; + } + } + } else if (mask_buf && opa < LV_OPA_MAX) { + if (LV_RESULT_INVALID == LV_DRAW_SW_ARGB8888_BLEND_NORMAL_TO_RGB888_MIX_MASK_OPA(dsc, dest_px_size)) { + for (y = 0; y < h; y++) { + for (dest_x = 0, src_x = 0; src_x < w; dest_x += dest_px_size, src_x++) { + lv_color_24_24_mix((const uint8_t *)&src_buf_c32[src_x], &dest_buf[dest_x], + LV_OPA_MIX3(src_buf_c32[src_x].alpha, mask_buf[src_x], opa)); + } + dest_buf += dest_stride; + src_buf_c32 = drawbuf_next_row(src_buf_c32, src_stride); + mask_buf += mask_stride; + } + } + } + } else { + lv_color32_t src_argb; + for (y = 0; y < h; y++) { + for (dest_x = 0, src_x = 0; src_x < w; dest_x += dest_px_size, src_x ++) { + src_argb = src_buf_c32[src_x]; + if (mask_buf == NULL) { + src_argb.alpha = LV_OPA_MIX2(src_argb.alpha, opa); + } else { + src_argb.alpha = LV_OPA_MIX3(src_argb.alpha, mask_buf[dest_x], opa); + } + + blend_non_normal_pixel(&dest_buf[dest_x], src_argb, dsc->blend_mode); + } + if (mask_buf) { + mask_buf += mask_stride; + } + dest_buf += dest_stride; + src_buf_c32 = drawbuf_next_row(src_buf_c32, src_stride); + } + } +} + +static inline void LV_ATTRIBUTE_FAST_MEM blend_non_normal_pixel(uint8_t *dest, lv_color32_t src, lv_blend_mode_t mode) +{ + uint8_t res[3] = {0, 
0, 0}; + switch (mode) { + case LV_BLEND_MODE_ADDITIVE: + res[0] = LV_MIN(dest[0] + src.blue, 255); + res[1] = LV_MIN(dest[1] + src.green, 255); + res[2] = LV_MIN(dest[2] + src.red, 255); + break; + case LV_BLEND_MODE_SUBTRACTIVE: + res[0] = LV_MAX(dest[0] - src.blue, 0); + res[1] = LV_MAX(dest[1] - src.green, 0); + res[2] = LV_MAX(dest[2] - src.red, 0); + break; + case LV_BLEND_MODE_MULTIPLY: + res[0] = (dest[0] * src.blue) >> 8; + res[1] = (dest[1] * src.green) >> 8; + res[2] = (dest[2] * src.red) >> 8; + break; + default: + LV_LOG_WARN("Not supported blend mode: %d", mode); + return; + } + lv_color_24_24_mix(res, dest, src.alpha); +} + +static inline void LV_ATTRIBUTE_FAST_MEM lv_color_8_24_mix(const uint8_t src, uint8_t *dest, uint8_t mix) +{ + + if (mix == 0) { + return; + } + + if (mix >= LV_OPA_MAX) { + dest[0] = src; + dest[1] = src; + dest[2] = src; + } else { + lv_opa_t mix_inv = 255 - mix; + dest[0] = (uint32_t)((uint32_t)src * mix + dest[0] * mix_inv) >> 8; + dest[1] = (uint32_t)((uint32_t)src * mix + dest[1] * mix_inv) >> 8; + dest[2] = (uint32_t)((uint32_t)src * mix + dest[2] * mix_inv) >> 8; + } +} + +static inline void LV_ATTRIBUTE_FAST_MEM lv_color_24_24_mix(const uint8_t *src, uint8_t *dest, uint8_t mix) +{ + + if (mix == 0) { + return; + } + + if (mix >= LV_OPA_MAX) { + dest[0] = src[0]; + dest[1] = src[1]; + dest[2] = src[2]; + } else { + lv_opa_t mix_inv = 255 - mix; + dest[0] = (uint32_t)((uint32_t)src[0] * mix + dest[0] * mix_inv) >> 8; + dest[1] = (uint32_t)((uint32_t)src[1] * mix + dest[1] * mix_inv) >> 8; + dest[2] = (uint32_t)((uint32_t)src[2] * mix + dest[2] * mix_inv) >> 8; + } +} + +static inline uint8_t LV_ATTRIBUTE_FAST_MEM get_bit(const uint8_t *buf, int32_t bit_idx) +{ + return (buf[bit_idx / 8] >> (7 - (bit_idx % 8))) & 1; +} + +static inline void *LV_ATTRIBUTE_FAST_MEM drawbuf_next_row(const void *buf, uint32_t stride) +{ + return (void *)((uint8_t *)buf + stride); +} diff --git 
a/components/esp_lvgl_port/test_apps/simd/main/lv_fill_common.h b/components/esp_lvgl_port/test_apps/simd/main/lv_fill_common.h index 5243857e..fb0ddece 100644 --- a/components/esp_lvgl_port/test_apps/simd/main/lv_fill_common.h +++ b/components/esp_lvgl_port/test_apps/simd/main/lv_fill_common.h @@ -42,7 +42,8 @@ typedef struct { void *p_asm_alloc; // pointer to the beginning of the memory allocated for ASM test buf, used in free() void *p_ansi_alloc; // pointer to the beginning of the memory allocated for ANSI test buf, used in free() } buf; - void (*blend_api_func)(_lv_draw_sw_blend_fill_dsc_t *); // pointer to LVGL API function + void (*blend_api_func)(_lv_draw_sw_blend_fill_dsc_t *); // pointer to LVGL API function + void (*blend_api_px_func)(_lv_draw_sw_blend_fill_dsc_t *, uint32_t); // pointer to LVGL API function with dest_px_size argument lv_color_format_t color_format; // LV color format size_t data_type_size; // Used data type size, eg sizeof() size_t active_buf_len; // Length of buffer, where the actual data are stored (not including Canary bytes) @@ -65,7 +66,8 @@ typedef struct { unsigned int benchmark_cycles; // Count of benchmark cycles void *array_align16; // test array with 16 byte alignment - testing most ideal case void *array_align1; // test array with 1 byte alignment - testing wort case - void (*blend_api_func)(_lv_draw_sw_blend_fill_dsc_t *); // pointer to LVGL API function + void (*blend_api_func)(_lv_draw_sw_blend_fill_dsc_t *); // pointer to LVGL API function + void (*blend_api_px_func)(_lv_draw_sw_blend_fill_dsc_t *, uint32_t); // pointer to LVGL API function with dest_px_size argument } bench_test_case_params_t; #ifdef __cplusplus diff --git a/components/esp_lvgl_port/test_apps/simd/main/test_lv_fill_benchmark.c b/components/esp_lvgl_port/test_apps/simd/main/test_lv_fill_benchmark.c index 85935985..f038e679 100644 --- a/components/esp_lvgl_port/test_apps/simd/main/test_lv_fill_benchmark.c +++ 
b/components/esp_lvgl_port/test_apps/simd/main/test_lv_fill_benchmark.c @@ -15,6 +15,7 @@ #include "lv_draw_sw_blend.h" #include "lv_draw_sw_blend_to_argb8888.h" #include "lv_draw_sw_blend_to_rgb565.h" +#include "lv_draw_sw_blend_to_rgb888.h" #define WIDTH 128 #define HEIGHT 128 @@ -115,6 +116,31 @@ TEST_CASE("LV Fill benchmark RGB565", "[fill][benchmark][RGB565]") lv_fill_benchmark_init(&test_params); free(dest_array_align16); } + +TEST_CASE("LV Fill benchmark RGB888", "[fill][benchmark][RGB888]") +{ + uint8_t *dest_array_align16 = (uint8_t *)memalign(16, STRIDE * HEIGHT * 3 + UNALIGN_BYTES); + TEST_ASSERT_NOT_EQUAL(NULL, dest_array_align16); + + // Apply byte unalignment for the worst-case test scenario + uint8_t *dest_array_align1 = dest_array_align16 + UNALIGN_BYTES; + + bench_test_case_params_t test_params = { + .height = HEIGHT, + .width = WIDTH, + .stride = STRIDE * 3, + .cc_height = HEIGHT - 1, + .cc_width = WIDTH - 1, + .benchmark_cycles = BENCHMARK_CYCLES, + .array_align16 = (void *)dest_array_align16, + .array_align1 = (void *)dest_array_align1, + .blend_api_px_func = &lv_draw_sw_blend_color_to_rgb888, + }; + + ESP_LOGI(TAG_LV_FILL_BENCH, "running test for RGB888 color format"); + lv_fill_benchmark_init(&test_params); + free(dest_array_align16); +} // ------------------------------------------------ Static test functions ---------------------------------------------- static void lv_fill_benchmark_init(bench_test_case_params_t *test_params) @@ -143,6 +169,7 @@ static void lv_fill_benchmark_init(bench_test_case_params_t *test_params) // Run benchmark with the most ideal input parameters // Dest array is 16 byte aligned, dest_w and dest_h are dividable by 4 + float cycles = lv_fill_benchmark_run(test_params, &dsc); // Call Benchmark cycle float per_sample = cycles / ((float)(dsc.dest_w * dsc.dest_h)); ESP_LOGI(TAG_LV_FILL_BENCH, " %s ideal case: %.3f cycles for %"PRIi32"x%"PRIi32" matrix, %.3f cycles per sample", asm_ansi_func[i], cycles, dsc.dest_w, 
dsc.dest_h, per_sample); @@ -162,15 +189,29 @@ static void lv_fill_benchmark_init(bench_test_case_params_t *test_params) static float lv_fill_benchmark_run(bench_test_case_params_t *test_params, _lv_draw_sw_blend_fill_dsc_t *dsc) { // Call the DUT function for the first time to init the benchmark test - test_params->blend_api_func(dsc); + if (test_params->blend_api_func != NULL) { + test_params->blend_api_func(dsc); + } else if (test_params->blend_api_px_func != NULL) { + test_params->blend_api_px_func(dsc, 3); + } + const unsigned int start_b = xthal_get_ccount(); - for (int i = 0; i < test_params->benchmark_cycles; i++) { - test_params->blend_api_func(dsc); + + if (test_params->blend_api_func != NULL) { + for (int i = 0; i < test_params->benchmark_cycles; i++) { + test_params->blend_api_func(dsc); + } + } else if (test_params->blend_api_px_func != NULL) { + for (int i = 0; i < test_params->benchmark_cycles; i++) { + test_params->blend_api_px_func(dsc, 3); + } } + const unsigned int end_b = xthal_get_ccount(); const float total_b = end_b - start_b; const float cycles = total_b / (test_params->benchmark_cycles); + return cycles; } diff --git a/components/esp_lvgl_port/test_apps/simd/main/test_lv_fill_functionality.c b/components/esp_lvgl_port/test_apps/simd/main/test_lv_fill_functionality.c index 5bf29558..45ebd219 100644 --- a/components/esp_lvgl_port/test_apps/simd/main/test_lv_fill_functionality.c +++ b/components/esp_lvgl_port/test_apps/simd/main/test_lv_fill_functionality.c @@ -13,6 +13,7 @@ #include "lv_draw_sw_blend.h" #include "lv_draw_sw_blend_to_argb8888.h" #include "lv_draw_sw_blend_to_rgb565.h" +#include "lv_draw_sw_blend_to_rgb888.h" // ------------------------------------------------- Defines ----------------------------------------------------------- @@ -47,14 +48,14 @@ static lv_color_t test_color = { * - generate functionality test combinations, based on the provided test_matrix struct * * @param[in] test_matrix Pointer to structure defining test 
matrix - all the test combinations - * @param[in] test_case Pointer ot structure defining functionality test case + * @param[in] test_case Pointer to structure defining functionality test case */ static void functionality_test_matrix(test_matrix_params_t *test_matrix, func_test_case_params_t *test_case); /** * @brief Fill test buffers for functionality test * - * @param[in] test_case Pointer ot structure defining functionality test case + * @param[in] test_case Pointer to structure defining functionality test case */ static void fill_test_bufs(func_test_case_params_t *test_case); @@ -63,24 +64,31 @@ static void fill_test_bufs(func_test_case_params_t *test_case); * * - function prepares structures for functionality testing and runs the LVGL API * - * @param[in] test_case Pointer ot structure defining functionality test case + * @param[in] test_case Pointer to structure defining functionality test case */ static void lv_fill_functionality(func_test_case_params_t *test_case); /** * @brief Evaluate results for 32bit data length * - * @param[in] test_case Pointer ot structure defining functionality test case + * @param[in] test_case Pointer to structure defining functionality test case */ static void test_eval_32bit_data(func_test_case_params_t *test_case); /** * @brief Evaluate results for 16bit data length * - * @param[in] test_case Pointer ot structure defining functionality test case + * @param[in] test_case Pointer to structure defining functionality test case */ static void test_eval_16bit_data(func_test_case_params_t *test_case); +/** + * @brief Evaluate results for 24bit data length + * + * @param[in] test_case Pointer to structure defining functionality test case + */ +static void test_eval_24bit_data(func_test_case_params_t *test_case); + // ------------------------------------------------ Test cases --------------------------------------------------------- /* @@ -147,6 +155,29 @@ TEST_CASE("Test fill functionality RGB565", "[fill][functionality][RGB565]") 
functionality_test_matrix(&test_matrix, &test_case); } +TEST_CASE("Test fill functionality RGB888", "[fill][functionality][RGB888]") +{ + test_matrix_params_t test_matrix = { + .min_w = 12, // 12 is the lower limit for the esp32s3 asm implementation, otherwise esp32 is executed + .min_h = 1, + .max_w = 32, + .max_h = 3, + .min_unalign_byte = 0, + .max_unalign_byte = 16, + .unalign_step = 1, + .dest_stride_step = 1, + .test_combinations_count = 0, + }; + + func_test_case_params_t test_case = { + .blend_api_px_func = &lv_draw_sw_blend_color_to_rgb888, + .color_format = LV_COLOR_FORMAT_RGB888, + .data_type_size = sizeof(uint8_t) * 3, // 24-bit data length + }; + + ESP_LOGI(TAG_LV_FILL_FUNC, "running test for RGB888 color format"); + functionality_test_matrix(&test_matrix, &test_case); +} // ------------------------------------------------ Static test functions ---------------------------------------------- static void functionality_test_matrix(test_matrix_params_t *test_matrix, func_test_case_params_t *test_case) @@ -195,8 +226,13 @@ static void lv_fill_functionality(func_test_case_params_t *test_case) dsc_ansi.dest_buf = test_case->buf.p_ansi; dsc_ansi.use_asm = false; - test_case->blend_api_func(&dsc_asm); // Call the LVGL API with Assembly code - test_case->blend_api_func(&dsc_ansi); // Call the LVGL API with ANSI code + if (test_case->blend_api_func != NULL) { + test_case->blend_api_func(&dsc_asm); // Call the LVGL API with Assembly code + test_case->blend_api_func(&dsc_ansi); // Call the LVGL API with ANSI code + } else if (test_case->blend_api_px_func != NULL) { + test_case->blend_api_px_func(&dsc_asm, 3); // Call the LVGL API with Assembly code + test_case->blend_api_px_func(&dsc_ansi, 3); // Call the LVGL API with ANSI code + } // Shift array pointers by Canary Bytes amount back test_case->buf.p_asm -= CANARY_BYTES * test_case->data_type_size; @@ -216,6 +252,11 @@ static void lv_fill_functionality(func_test_case_params_t *test_case) break; } + case 
LV_COLOR_FORMAT_RGB888: { + test_eval_24bit_data(test_case); + break; + } + default: TEST_ASSERT_MESSAGE(false, "LV Color format not found"); } @@ -233,6 +274,7 @@ static void fill_test_bufs(func_test_case_params_t *test_case) const unsigned int unalign_byte = test_case->unalign_byte; // Allocate destination arrays for Assembly and ANSI LVGL Blend API + void *mem_asm = memalign(16, (total_buf_len * data_type_size) + unalign_byte); void *mem_ansi = memalign(16, (total_buf_len * data_type_size) + unalign_byte); TEST_ASSERT_NOT_NULL_MESSAGE(mem_asm, "Lack of memory"); @@ -275,7 +317,6 @@ static void test_eval_32bit_data(func_test_case_params_t *test_case) } printf("\n"); #endif - // Canary bytes area must stay 0 TEST_ASSERT_EACH_EQUAL_UINT32_MESSAGE(0, (uint32_t *)test_case->buf.p_ansi, CANARY_BYTES, test_msg_buf); TEST_ASSERT_EACH_EQUAL_UINT32_MESSAGE(0, (uint32_t *)test_case->buf.p_asm, CANARY_BYTES, test_msg_buf); @@ -297,8 +338,6 @@ static void test_eval_16bit_data(func_test_case_params_t *test_case) } printf("\n"); #endif - - // Canary bytes area must stay 0 TEST_ASSERT_EACH_EQUAL_UINT16_MESSAGE(0, (uint16_t *)test_case->buf.p_ansi, CANARY_BYTES, test_msg_buf); TEST_ASSERT_EACH_EQUAL_UINT16_MESSAGE(0, (uint16_t *)test_case->buf.p_asm, CANARY_BYTES, test_msg_buf); @@ -309,3 +348,32 @@ static void test_eval_16bit_data(func_test_case_params_t *test_case) TEST_ASSERT_EACH_EQUAL_UINT16_MESSAGE(0, (uint16_t *)test_case->buf.p_ansi + (test_case->total_buf_len - CANARY_BYTES), CANARY_BYTES, test_msg_buf); TEST_ASSERT_EACH_EQUAL_UINT16_MESSAGE(0, (uint16_t *)test_case->buf.p_asm + (test_case->total_buf_len - CANARY_BYTES), CANARY_BYTES, test_msg_buf); } + +static void test_eval_24bit_data(func_test_case_params_t *test_case) +{ + // Print results, 24bit data +#if DBG_PRINT_OUTPUT + size_t data_type_size = test_case->data_type_size; + for (uint32_t i = 0; i < test_case->total_buf_len; i++) { + uint32_t ansi_value = ((uint8_t *)test_case->buf.p_ansi)[i * data_type_size] + | 
(((uint8_t *)test_case->buf.p_ansi)[i * data_type_size + 1] << 8) + | (((uint8_t *)test_case->buf.p_ansi)[i * data_type_size + 2] << 16); + uint32_t asm_value = ((uint8_t *)test_case->buf.p_asm)[i * data_type_size] + | (((uint8_t *)test_case->buf.p_asm)[i * data_type_size + 1] << 8) + | (((uint8_t *)test_case->buf.p_asm)[i * data_type_size + 2] << 16); + printf("dest_buf[%"PRIi32"] %s ansi = %8"PRIx32" \t asm = %8"PRIx32" \n", i, ((i < 10) ? (" ") : ("")), ansi_value, asm_value); + } + printf("\n"); +#endif + + // Canary bytes area must stay 0 + TEST_ASSERT_EACH_EQUAL_UINT8_MESSAGE(0, (uint8_t *)test_case->buf.p_ansi, CANARY_BYTES * test_case->data_type_size, test_msg_buf); + TEST_ASSERT_EACH_EQUAL_UINT8_MESSAGE(0, (uint8_t *)test_case->buf.p_asm, CANARY_BYTES * test_case->data_type_size, test_msg_buf); + + // dest_buf_asm and dest_buf_ansi must be equal + TEST_ASSERT_EQUAL_UINT8_ARRAY_MESSAGE((uint8_t *)test_case->buf.p_asm + CANARY_BYTES * test_case->data_type_size, (uint8_t *)test_case->buf.p_ansi + CANARY_BYTES * test_case->data_type_size, test_case->active_buf_len * test_case->data_type_size, test_msg_buf); + + // Canary bytes area must stay 0 + TEST_ASSERT_EACH_EQUAL_UINT8_MESSAGE(0, (uint8_t *)test_case->buf.p_ansi + test_case->active_buf_len * test_case->data_type_size + CANARY_BYTES * test_case->data_type_size, CANARY_BYTES * test_case->data_type_size, test_msg_buf); + TEST_ASSERT_EACH_EQUAL_UINT8_MESSAGE(0, (uint8_t *)test_case->buf.p_asm + test_case->active_buf_len * test_case->data_type_size + CANARY_BYTES * test_case->data_type_size, CANARY_BYTES * test_case->data_type_size, test_msg_buf); +}