From 4e493251cb39157f630e4b3d83c58bed1d877b47 Mon Sep 17 00:00:00 2001 From: Pavel Borcin Date: Thu, 28 Nov 2024 11:34:19 +0100 Subject: [PATCH] feature(lvgl_port): RGB888 SIMD fill --- components/esp_lvgl_port/CMakeLists.txt | 1 + .../include/esp_lvgl_port_lv_blend.h | 19 + .../simd/lv_color_blend_to_argb8888_esp32s3.S | 3 +- .../simd/lv_color_blend_to_rgb565_esp32s3.S | 3 +- .../simd/lv_color_blend_to_rgb888_esp32.S | 105 ++ .../simd/lv_color_blend_to_rgb888_esp32s3.S | 351 +++++++ .../esp_lvgl_port/test_apps/simd/README.md | 3 + .../include/lv_draw_sw_blend_to_rgb888.h | 53 + .../lv_blend/src/lv_draw_sw_blend_to_rgb888.c | 953 ++++++++++++++++++ .../test_apps/simd/main/lv_fill_common.h | 6 +- .../simd/main/test_lv_fill_benchmark.c | 47 +- .../simd/main/test_lv_fill_functionality.c | 88 +- 12 files changed, 1613 insertions(+), 19 deletions(-) create mode 100644 components/esp_lvgl_port/src/lvgl9/simd/lv_color_blend_to_rgb888_esp32.S create mode 100644 components/esp_lvgl_port/src/lvgl9/simd/lv_color_blend_to_rgb888_esp32s3.S create mode 100644 components/esp_lvgl_port/test_apps/simd/main/lv_blend/include/lv_draw_sw_blend_to_rgb888.h create mode 100644 components/esp_lvgl_port/test_apps/simd/main/lv_blend/src/lv_draw_sw_blend_to_rgb888.c diff --git a/components/esp_lvgl_port/CMakeLists.txt b/components/esp_lvgl_port/CMakeLists.txt index 8dc53693..0148fd77 100644 --- a/components/esp_lvgl_port/CMakeLists.txt +++ b/components/esp_lvgl_port/CMakeLists.txt @@ -94,6 +94,7 @@ if((lvgl_ver VERSION_GREATER_EQUAL "9.1.0") AND (lvgl_ver VERSION_LESS "9.2.0")) # Force link .S files set_property(TARGET ${COMPONENT_LIB} APPEND PROPERTY INTERFACE_LINK_LIBRARIES "-u lv_color_blend_to_argb8888_esp") set_property(TARGET ${COMPONENT_LIB} APPEND PROPERTY INTERFACE_LINK_LIBRARIES "-u lv_color_blend_to_rgb565_esp") + set_property(TARGET ${COMPONENT_LIB} APPEND PROPERTY INTERFACE_LINK_LIBRARIES "-u lv_color_blend_to_rgb888_esp") endif() endif() diff --git 
a/components/esp_lvgl_port/include/esp_lvgl_port_lv_blend.h b/components/esp_lvgl_port/include/esp_lvgl_port_lv_blend.h index c00de1c0..cb9560ab 100644 --- a/components/esp_lvgl_port/include/esp_lvgl_port_lv_blend.h +++ b/components/esp_lvgl_port/include/esp_lvgl_port_lv_blend.h @@ -32,6 +32,10 @@ extern "C" { _lv_color_blend_to_rgb565_esp(dsc) #endif +#ifndef LV_DRAW_SW_COLOR_BLEND_TO_RGB888 +#define LV_DRAW_SW_COLOR_BLEND_TO_RGB888(dsc) \ + _lv_color_blend_to_rgb888_esp(dsc) +#endif /********************** * TYPEDEFS @@ -83,6 +87,21 @@ static inline lv_result_t _lv_color_blend_to_rgb565_esp(_lv_draw_sw_blend_fill_d return lv_color_blend_to_rgb565_esp(&asm_dsc); } +extern int lv_color_blend_to_rgb888_esp(asm_dsc_t *asm_dsc); + +static inline lv_result_t _lv_color_blend_to_rgb888_esp(_lv_draw_sw_blend_fill_dsc_t *dsc) +{ + asm_dsc_t asm_dsc = { + .dst_buf = dsc->dest_buf, + .dst_w = dsc->dest_w, + .dst_h = dsc->dest_h, + .dst_stride = dsc->dest_stride, + .src_buf = &dsc->color, + }; + + return lv_color_blend_to_rgb888_esp(&asm_dsc); +} + #endif // CONFIG_LV_DRAW_SW_ASM_CUSTOM #ifdef __cplusplus diff --git a/components/esp_lvgl_port/src/lvgl9/simd/lv_color_blend_to_argb8888_esp32s3.S b/components/esp_lvgl_port/src/lvgl9/simd/lv_color_blend_to_argb8888_esp32s3.S index bb3956e6..10276f4f 100644 --- a/components/esp_lvgl_port/src/lvgl9/simd/lv_color_blend_to_argb8888_esp32s3.S +++ b/components/esp_lvgl_port/src/lvgl9/simd/lv_color_blend_to_argb8888_esp32s3.S @@ -32,8 +32,7 @@ lv_color_blend_to_argb8888_esp: - entry a1, 32 - ee.zero.q q0 // dummy TIE instruction, to enable the TIE + entry a1, 32 l32i.n a3, a2, 4 // a3 - dest_buff l32i.n a4, a2, 8 // a4 - dest_w in uint32_t diff --git a/components/esp_lvgl_port/src/lvgl9/simd/lv_color_blend_to_rgb565_esp32s3.S b/components/esp_lvgl_port/src/lvgl9/simd/lv_color_blend_to_rgb565_esp32s3.S index ee9f8a9c..3a9fe43c 100644 --- a/components/esp_lvgl_port/src/lvgl9/simd/lv_color_blend_to_rgb565_esp32s3.S +++ 
b/components/esp_lvgl_port/src/lvgl9/simd/lv_color_blend_to_rgb565_esp32s3.S @@ -31,8 +31,7 @@ lv_color_blend_to_rgb565_esp: - entry a1, 32 - ee.zero.q q0 // dummy TIE instruction, to enable the TIE + entry a1, 32 l32i.n a3, a2, 4 // a3 - dest_buff l32i.n a4, a2, 8 // a4 - dest_w in uint16_t diff --git a/components/esp_lvgl_port/src/lvgl9/simd/lv_color_blend_to_rgb888_esp32.S b/components/esp_lvgl_port/src/lvgl9/simd/lv_color_blend_to_rgb888_esp32.S new file mode 100644 index 00000000..467b5348 --- /dev/null +++ b/components/esp_lvgl_port/src/lvgl9/simd/lv_color_blend_to_rgb888_esp32.S @@ -0,0 +1,105 @@ +/* + * SPDX-FileCopyrightText: 2024 Espressif Systems (Shanghai) CO LTD + * + * SPDX-License-Identifier: Apache-2.0 + */ + +// This is LVGL RGB888 simple fill for ESP32 processor + + .section .text + .align 4 + .global lv_color_blend_to_rgb888_esp + .type lv_color_blend_to_rgb888_esp,@function +// The function implements the following C code: +// void lv_color_blend_to_rgb888(_lv_draw_sw_blend_fill_dsc_t * dsc); + +// Input params +// +// dsc - a2 + +// typedef struct { +// uint32_t opa; l32i 0 +// void * dst_buf; l32i 4 +// uint32_t dst_w; l32i 8 +// uint32_t dst_h; l32i 12 +// uint32_t dst_stride; l32i 16 +// const void * src_buf; l32i 20 +// uint32_t src_stride; l32i 24 +// const lv_opa_t * mask_buf; l32i 28 +// uint32_t mask_stride; l32i 32 +// } asm_dsc_t; + +lv_color_blend_to_rgb888_esp: + + entry a1, 32 + + l32i.n a3, a2, 4 // a3 - dest_buff + l32i.n a4, a2, 8 // a4 - dest_w in uint24_t + l32i.n a5, a2, 12 // a5 - dest_h in uint16_t + l32i.n a6, a2, 16 // a6 - dest_stride in bytes + l32i.n a7, a2, 20 // a7 - src_buff (color) + l32i.n a8, a7, 0 // a8 - color as value + + // a11 - dest_w_bytes = sizeof(uint24_t) * dest_w = 3 * a4 + slli a11, a4, 1 // a11 - dest_w_bytes = 2 * dest_w + add a11, a11, a4 // a11 - dest_w_bytes = a11 + a4 + + // Prepare register combinations + // a13 - 0xBBRRGGBB a14 - 0xGGBBRRGG a15 - 0xRRGGBBRR + l8ui a13, a7, 0 // 
blue 000B + slli a13, a13, 24 // shift to B000 + or a13, a13, a8 // a13 BRGB + + srli a14, a8, 8 // a14 00RG + slli a10, a8, 16 // a10 GB00 + or a14, a14, a10 // a14 GBRG + + slli a15, a8, 8 // a15 RGB0 + l8ui a10, a7, 2 // a7 000R + or a15, a15, a10 // a15 RGBR + + sub a6, a6, a11 // dest_stride = dest_stride - dest_w_bytes + + // Prepare main loop length and dest_w_bytes + srli a9, a4, 2 // a9 = loop_len = dest_w / 4, calculate main loop_len for original dest_w + movi.n a8, 0x3 // a8 = 0x3, remainder mask + and a10, a4, a8 // a10 - remainder after division by 4 = a4 and 0x3 + + .outer_loop: + + // Run main loop which sets 12 bytes (4 rgb888) in one loop run + loopnez a9, ._main_loop + s32i.n a13, a3, 0 // save 32 bits from 32-bit color a13 to dest_buff a3, offset 0 + s32i.n a14, a3, 4 // save 32 bits from 32-bit color a14 to dest_buff a3, offset 4 + s32i.n a15, a3, 8 // save 32 bits from 32-bit color a15 to dest_buff a3, offset 8 + addi.n a3, a3, 12 // increment dest_buff pointer by 12 + ._main_loop: + + bnei a10, 0x3, _less_than_3 // branch if less than 3 values left + s32i.n a13, a3, 0 // save 32 bits from a13 to dest_buff a3, offset 0 bytes + s32i.n a14, a3, 4 // save 32 bits from a14 to dest_buff a3, offset 4 bytes + s8i a15, a3, 8 // save 8 bits from a15 to dest_buff a3, offset 8 bytes + addi.n a3, a3, 9 // increment dest_buff pointer by 9 bytes + j _less_than_1 + _less_than_3: + + bnei a10, 0x2, _less_than_2 // branch if less than 2 values left + s32i.n a13, a3, 0 // save 32 bits from a13 to dest_buff a3, offset 0 bytes + s16i a14, a3, 4 // save 16 bits from a14 to dest_buff a3, offset 4 bytes + addi.n a3, a3, 6 // increment dest_buff pointer by 6 bytes + j _less_than_1 + _less_than_2: + + bnei a10, 0x1, _less_than_1 // branch if less than 1 value left + s16i a13, a3, 0 // save 16 bits from a13 to dest_buff a3, offset 0 bytes + s8i a15, a3, 2 // save 8 bits from a15 to dest_buff a3, offset 2 bytes + addi.n a3, a3, 3 // increment dest_buff pointer by 3 bytes 
+ _less_than_1: + + add a3, a3, a6 // dest_buff + dest_stride + addi.n a5, a5, -1 // decrease the outer loop + and a7, a8, a3 // a7 = dest_buff AND 0x3 (check if the address is 4-byte aligned) + bnez a5, .outer_loop + + movi.n a2, 1 // return LV_RESULT_OK = 1 + retw.n // return diff --git a/components/esp_lvgl_port/src/lvgl9/simd/lv_color_blend_to_rgb888_esp32s3.S b/components/esp_lvgl_port/src/lvgl9/simd/lv_color_blend_to_rgb888_esp32s3.S new file mode 100644 index 00000000..bb69f75e --- /dev/null +++ b/components/esp_lvgl_port/src/lvgl9/simd/lv_color_blend_to_rgb888_esp32s3.S @@ -0,0 +1,351 @@ +/* + * SPDX-FileCopyrightText: 2024 Espressif Systems (Shanghai) CO LTD + * + * SPDX-License-Identifier: Apache-2.0 + */ + +// This is LVGL RGB888 simple fill for ESP32S3 processor + + .section .text + .align 4 + .global lv_color_blend_to_rgb888_esp + .type lv_color_blend_to_rgb888_esp,@function +// The function implements the following C code: +// void lv_color_blend_to_rgb888(_lv_draw_sw_blend_fill_dsc_t * dsc); + +// Input params +// +// dsc - a2 + +// typedef struct { +// uint32_t opa; l32i 0 +// void * dst_buf; l32i 4 +// uint32_t dst_w; l32i 8 +// uint32_t dst_h; l32i 12 +// uint32_t dst_stride; l32i 16 +// const void * src_buf; l32i 20 +// uint32_t src_stride; l32i 24 +// const lv_opa_t * mask_buf; l32i 28 +// uint32_t mask_stride; l32i 32 +// } asm_dsc_t; + +lv_color_blend_to_rgb888_esp: + + entry a1, 32 + + l32i.n a3, a2, 4 // a3 - dest_buff + l32i.n a4, a2, 8 // a4 - dest_w in uint24_t + l32i.n a5, a2, 12 // a5 - dest_h in uint16_t + l32i.n a6, a2, 16 // a6 - dest_stride in bytes + l32i.n a7, a2, 20 // a7 - src_buff (color) + l32i.n a8, a7, 0 // a8 - color as value + + // a11 - dest_w_bytes = sizeof(uint24_t) * dest_w = 3 * a4 + slli a11, a4, 1 // a11 - dest_w_bytes = 2 * dest_w + add a11, a11, a4 // a11 - dest_w_bytes = a11 + a4 + + // Prepare register combinations + // a13 - 0xBBRRGGBB a14 - 0xGGBBRRGG a15 - 0xRRGGBBRR + l8ui a13, a7, 0 // blue 000B + slli a13, 
a13, 24 // shift to B000 + or a13, a13, a8 // a13 BRGB + + srli a14, a8, 8 // a14 00RG + slli a10, a8, 16 // a10 GB00 + or a14, a14, a10 // a14 GBRG + + slli a15, a8, 8 // a15 RGB0 + l8ui a10, a7, 2 // a7 000R + or a15, a15, a10 // a15 RGBR + + sub a6, a6, a11 // dest_stride = dest_stride - dest_w_bytes + + // Check for short lengths + // dest_w should be at least 12, otherwise it's not worth using esp32s3 TIE + bgei a4, 12, _esp32s3_implementation // Branch if dest_w is greater than or equal to 12 + j .lv_color_blend_to_rgb888_esp32_body // Jump to esp32 implementation + + _esp32s3_implementation: + + // Prepare q registers for the main loop + ee.movi.32.q q3, a13, 0 // fill q3 register from a13 by 32 bits + ee.movi.32.q q3, a14, 1 // fill q3 register from a14 by 32 bits + ee.movi.32.q q3, a15, 2 // fill q3 register from a15 by 32 bits + ee.movi.32.q q3, a13, 3 // fill q3 register from a13 by 32 bits + + ee.movi.32.q q4, a14, 0 // fill q4 register from a14 by 32 bits + ee.movi.32.q q4, a15, 1 // fill q4 register from a15 by 32 bits + ee.movi.32.q q4, a13, 2 // fill q4 register from a13 by 32 bits + ee.movi.32.q q4, a14, 3 // fill q4 register from a14 by 32 bits + + ee.movi.32.q q5, a15, 0 // fill q5 register from a15 by 32 bits + ee.movi.32.q q5, a13, 1 // fill q5 register from a13 by 32 bits + ee.movi.32.q q5, a14, 2 // fill q5 register from a14 by 32 bits + ee.movi.32.q q5, a15, 3 // fill q5 register from a15 by 32 bits + + .outer_loop_aligned: + + // Copy q3 to q0 + ee.zero.q q0 // clear q0 + ee.orq q0, q0, q3 // copy q3 to q0 + + // Copy q4 to q1 + ee.zero.q q1 // clear q1 + ee.orq q1, q1, q4 // copy q4 to q1 + + // Copy q5 to q2 + ee.zero.q q2 // clear q2 + ee.orq q2, q2, q5 // copy q5 to q2 + + + // alignment check + extui a8, a3, 0, 4 // a8 = a3 AND 0xf + + // if a8 = 0 skip unalignment computation + bnez a8, _unaligned_dest_buff // If not aligned, jump to the unaligned handler + mov.n a10, a11 // a10 - local_dest_w_bytes = dest_w_bytes + j _aligned_dest_buff + 
_unaligned_dest_buff: + + // length + movi.n a12, 16 // a12 - 16 + sub a2, a12, a8 // a2 = 16 - unalignment (lower 4 bits of dest_buff address) + sub a10, a11, a2 // local_dest_w_bytes = len - (16 - unalignment) + + _aligned_dest_buff: + movi a12, 48 // a12 = 48 (main loop copies 48 bytes) + quou a9, a10, a12 // a9 = local_dest_w_bytes (a10) DIV 48 (a12) + remu a10, a10, a12 // a10 = local_dest_w_bytes (a10) remainder div 48 (a12) + + beqz a8, _dest_buff_aligned // If already aligned, skip aligning + + movi a7, unalignment_table // Load unalignment_table address + + addx4 a7, a8, a7 // a7 = offset * 4 + jump_table address + l32i a7, a7, 0 // Load target address from jump table + jx a7 // Jump to the corresponding handler + + +// a13 - 0xBBRRGGBB a14 - 0xGGBBRRGG a15 - 0xRRGGBBRR +handle_0: +handle_1: + s8i a13, a3, 0 // save 8 bits from a13 to dest_buff a3, offset 0 bytes + addi.n a3, a3, 1 // increment dest_buff pointer by 1 byte + s16i a14, a3, 0 // save 16 bits from a14 to dest_buff a3, offset 0 bytes + addi.n a3, a3, 2 // increment dest_buff pointer by 2 bytes + s32i a13, a3, 0 // save 32 bits from a13 to dest_buff a3, offset 0 bytes + addi.n a3, a3, 4 // increment dest_buff pointer by 4 bytes + ee.vst.l.64.ip q1, a3, 8 // save lower 64 bits from q0 to dest_buff a3, increase dest_buff pointer by 8 bytes + j _shift_q_regs +handle_2: + s16i a13, a3, 0 // save 16 bits from a13 to dest_buff a3, offset 0 bytes + addi.n a3, a3, 2 // increment dest_buff pointer by 2 bytes + s32i a15, a3, 0 // save 32 bits from a15 to dest_buff a3, offset 0 bytes + addi.n a3, a3, 4 // increment dest_buff pointer by 4 bytes + ee.vst.l.64.ip q0, a3, 8 // save lower 64 bits from q0 to dest_buff a3, increase dest_buff pointer by 8 bytes + j _shift_q_regs +handle_3: + s8i a13, a3, 0 // save 8 bits from a13 to dest_buff a3, offset 0 bytes + addi.n a3, a3, 1 // increment dest_buff pointer by 1 byte + s32i a14, a3, 0 // save 32 bits from a14 to dest_buff a3, offset 0 bytes + addi.n a3, a3, 4 
// increment dest_buff pointer by 4 bytes + ee.vst.l.64.ip q2, a3, 8 // save lower 64 bits from q0 to dest_buff a3, increase dest_buff pointer by 8 bytes + j _shift_q_regs +handle_4: + s32i a13, a3, 0 // save 32 bits from a13 to dest_buff a3, offset 0 bytes + addi.n a3, a3, 4 // increment dest_buff pointer by 4 bytes + ee.vst.l.64.ip q1, a3, 8 // save lower 64 bits from q0 to dest_buff a3, increase dest_buff pointer by 8 bytes + j _shift_q_regs +handle_5: + s8i a13, a3, 0 // save 8 bits from a13 to dest_buff a3, offset 0 bytes + addi.n a3, a3, 1 // increment dest_buff pointer by 1 byte + s16i a14, a3, 0 // save 16 bits from a14 to dest_buff a3, offset 0 bytes + addi.n a3, a3, 2 // increment dest_buff pointer by 2 bytes + ee.vst.l.64.ip q0, a3, 8 // save lower 64 bits from q0 to dest_buff a3, increase dest_buff pointer by 8 bytes + j _shift_q_regs +handle_6: + s16i a13, a3, 0 // save 16 bits from a13 to dest_buff a3, offset 0 byte + addi.n a3, a3, 2 // increment dest_buff pointer by 2 bytes + ee.vst.l.64.ip q2, a3, 8 // save lower 64 bits from q0 to dest_buff a3, increase dest_buff pointer by 8 bytes + j _shift_q_regs +handle_7: + s8i a13, a3, 0 // save 8 bits from a13 to dest_buff a3, offset 0 bytes + addi.n a3, a3, 1 // increment dest_buff pointer by 1 byte + ee.vst.l.64.ip q1, a3, 8 // save lower 64 bits from q0 to dest_buff a3, increase dest_buff pointer by 8 bytes + j _shift_q_regs +handle_8: + ee.vst.l.64.ip q0, a3, 8 // save lower 64 bits from q0 to dest_buff a3, increase dest_buff pointer by 8 bytes + j _shift_q_regs + +handle_9: + s8i a13, a3, 0 // save 8 bits from a13 to dest_buff a3, offset 0 bytes + addi.n a3, a3, 1 // increment dest_buff pointer by 1 byte + s16i a14, a3, 0 // save 16 bits from a14 to dest_buff a3, offset 0 bytes + addi.n a3, a3, 2 // increment dest_buff pointer by 2 bytes + s32i a13, a3, 0 // save 32 bits from a13 to dest_buff a3, offset 0 bytes + addi.n a3, a3, 4 // increment dest_buff pointer by 4 bytes + j _shift_q_regs +handle_10: + 
s16i a13, a3, 0 // save 16 bits from a13 to dest_buff a3, offset 0 bytes + addi.n a3, a3, 2 // increment dest_buff pointer by 2 bytes + s32i a15, a3, 0 // save 32 bits from a15 to dest_buff a3, offset 0 bytes + addi.n a3, a3, 4 // increment dest_buff pointer by 4 bytes + j _shift_q_regs +handle_11: + s8i a13, a3, 0 // save 8 bits from a13 to dest_buff a3, offset 0 bytes + addi.n a3, a3, 1 // increment dest_buff pointer by 1 byte + s32i a14, a3, 0 // save 32 bits from a14 to dest_buff a3, offset 0 bytes + addi.n a3, a3, 4 // increment dest_buff pointer by 4 bytes + j _shift_q_regs +handle_12: + s32i a13, a3, 0 // save 32 bits from a13 to dest_buff a3, offset 0 bytes + addi.n a3, a3, 4 // increment dest_buff pointer by 4 bytes + j _shift_q_regs +handle_13: + s8i a13, a3, 0 // save 8 bits from a13 to dest_buff a3, offset 0 bytes + addi.n a3, a3, 1 // increment dest_buff pointer by 1 byte + s16i a14, a3, 0 // save 16 bits from a14 to dest_buff a3, offset 0 bytes + addi.n a3, a3, 2 // increment dest_buff pointer by 2 bytes + j _shift_q_regs +handle_14: + s16i a13, a3, 0 // save 16 bits from a13 to dest_buff a3, offset 0 bytes + addi.n a3, a3, 2 // increment dest_buff pointer by 2 bytes + j _shift_q_regs +handle_15: + s8i a13, a3, 0 // save 8 bits from a13 to dest_buff a3, offset 0 bytes + addi.n a3, a3, 1 // increment dest_buff pointer by 1 byte + j _shift_q_regs + +.align 4 + +unalignment_table: + .word handle_0 // Case 0: Dummy case for easier address computation + .word handle_1 // Case 1: Align 15 bytes + .word handle_2 // Case 2: Align 14 bytes + .word handle_3 // Case 3: Align 13 bytes + .word handle_4 // Case 4: Align 12 bytes + .word handle_5 // Case 5: Align 11 bytes + .word handle_6 // Case 6: Align 10 bytes + .word handle_7 // Case 7: Align 9 bytes + .word handle_8 // Case 8: Align 8 bytes + .word handle_9 // Case 9: Align 7 bytes + .word handle_10 // Case 10: Align 6 bytes + .word handle_11 // Case 11: Align 5 bytes + .word handle_12 // Case 12: Align 4 
bytes + .word handle_13 // Case 13: Align 3 bytes + .word handle_14 // Case 14: Align 2 bytes + .word handle_15 // Case 15: Align 1 byte + + + _shift_q_regs: + wur.sar_byte a2 // apply unalignment to the SAR_BYTE + ee.src.q q0, q0, q1 // shift concat. of q0 and q1 to q0 by SAR_BYTE amount + ee.src.q q1, q1, q2 // shift concat. of q1 and q2 to q1 by SAR_BYTE amount + ee.src.q q2, q2, q3 // shift concat. of q2 and q3 to q2 by SAR_BYTE amount + + _dest_buff_aligned: + loopnez a9, ._main_loop_aligned // 48 bytes (16 rgb888) in one loop + ee.vst.128.ip q0, a3, 16 // store 16 bytes from q0 to dest_buff a3 + ee.vst.128.ip q1, a3, 16 // store 16 bytes from q1 to dest_buff a3 + ee.vst.128.ip q2, a3, 16 // store 16 bytes from q2 to dest_buff a3 + ._main_loop_aligned: + + // Check modulo 32 of the unalignment, if - then set 32 bytes + bbci a10, 5, .lt_32 // branch if 5-th bit of local_dest_w_bytes a10 is clear + ee.vst.128.ip q0, a3, 16 // store 16 bytes from q0 to dest_buff a3 + ee.vst.128.ip q1, a3, 16 // store 16 bytes from q1 to dest_buff a3 + + ee.srci.2q q0, q1, 1 // shift q0 register to have next bytes to store ready from LSB + .lt_32: + + // Check modulo 16 of the unalignment, if - then set 16 bytes + bbci a10, 4, .lt_16 // branch if 4-th bit of local_dest_w_bytes a10 is clear + ee.vst.128.ip q0, a3, 16 // store 16 bytes from q0 to dest_buff a3 + + ee.srci.2q q0, q1, 0 // shift q0 register to have next bytes to store ready from LSB + .lt_16: + + // Check modulo 8 of the unalignment, if - then set 8 bytes + bbci a10, 3, .lt_8 + ee.vst.l.64.ip q0, a3, 8 // store 8 bytes from q0 to dest_buff a3 + + ee.srci.2q q0, q1, 1 // shift q0 register to have next bytes to store ready from LSB + .lt_8: + + // Check modulo 4 of the unalignment, if - then set 4 bytes + bbci a10, 2, .lt_4 + ee.movi.32.a q0, a2, 0 // move lowest 32 bits of q0 to a2 + s32i.n a2, a3, 0 // save 32 bits from a2 to dest_buff a3, offset 0 + addi.n a3, a3, 4 // increment dest_buff pointer by 4 bytes + + 
ee.srci.2q q0, q1, 0 // shift q0 register to have next bytes to store ready from LSB + .lt_4: + + // Check modulo 2 of the unalignment, if - then set 2 bytes + bbci a10, 1, .lt_2 + ee.movi.32.a q0, a2, 0 // move lowest 32 bits of q0 to a2 + s16i a2, a3, 0 // save 16 bits from a2 to dest_buff a3, offset 0 + addi.n a3, a3, 2 // increment dest_buff pointer by 2 bytes + + ee.srci.2q q0, q1, 1 // shift q0 register to have next bytes to store ready from LSB + .lt_2: + + // Check modulo 1 of the unalignment, if - then set 1 byte + bbci a10, 0, .lt_1 + ee.movi.32.a q0, a2, 0 // move lowest 32 bits of q0 to a2 + s8i a2, a3, 0 // save 8 bits from a2 to dest_buff a3, offset 0 + addi.n a3, a3, 1 // increment dest_buff pointer by 1 byte + .lt_1: + + add a3, a3, a6 // dest_buff + dest_stride + addi.n a5, a5, -1 // decrease the outer loop + bnez a5, .outer_loop_aligned + + movi.n a2, 1 // return LV_RESULT_OK = 1 + retw.n // return + + .lv_color_blend_to_rgb888_esp32_body: + + // Prepare main loop length and dest_w_bytes + srli a9, a4, 2 // a9 = loop_len = dest_w / 4, calculate main loop_len for original dest_w + movi.n a8, 0x3 // a8 = 0x3, remainder mask + and a10, a4, a8 // a10 - remainder after division by 4 = a4 & 0x3 + + .outer_loop: + + // Run main loop which sets 12 bytes (4 rgb888) in one loop run + loopnez a9, ._main_loop + s32i.n a13, a3, 0 // save 32 bits from 32-bit color a13 to dest_buff a3, offset 0 + s32i.n a14, a3, 4 // save 32 bits from 32-bit color a14 to dest_buff a3, offset 4 + s32i.n a15, a3, 8 // save 32 bits from 32-bit color a15 to dest_buff a3, offset 8 + addi.n a3, a3, 12 // increment dest_buff pointer by 12 + ._main_loop: + + bnei a10, 0x3, _less_than_3 // branch if less than 3 values left + s32i.n a13, a3, 0 // save 32 bits from a13 to dest_buff a3, offset 0 bytes + s32i.n a14, a3, 4 // save 32 bits from a14 to dest_buff a3, offset 4 bytes + s8i a15, a3, 8 // save 8 bits from a15 to dest_buff a3, offset 8 bytes + addi.n a3, a3, 9 // increment dest_buff 
pointer by 9 bytes + j _less_than_1 + _less_than_3: + + bnei a10, 0x2, _less_than_2 // branch if less than 2 values left + s32i.n a13, a3, 0 // save 32 bits from a13 to dest_buff a3, offset 0 bytes + s16i a14, a3, 4 // save 16 bits from a14 to dest_buff a3, offset 4 bytes + addi.n a3, a3, 6 // increment dest_buff pointer by 6 bytes + j _less_than_1 + _less_than_2: + + bnei a10, 0x1, _less_than_1 // branch if less than 1 value left + s16i a13, a3, 0 // save 16 bits from a13 to dest_buff a3, offset 0 bytes + s8i a15, a3, 2 // save 8 bits from a15 to dest_buff a3, offset 2 bytes + addi.n a3, a3, 3 // increment dest_buff pointer by 3 bytes + _less_than_1: + + add a3, a3, a6 // dest_buff + dest_stride + addi.n a5, a5, -1 // decrease the outer loop + and a7, a8, a3 // a7 = dest_buff AND 0x3 (check if the address is 4-byte aligned) + bnez a5, .outer_loop + + movi.n a2, 1 // return LV_RESULT_OK = 1 + retw.n // return diff --git a/components/esp_lvgl_port/test_apps/simd/README.md b/components/esp_lvgl_port/test_apps/simd/README.md index d319e2e3..6a994d31 100644 --- a/components/esp_lvgl_port/test_apps/simd/README.md +++ b/components/esp_lvgl_port/test_apps/simd/README.md @@ -12,6 +12,8 @@ Assembly source files could be found in the [`lvgl_port`](../../src/lvgl9/simd/) | | 127x127 | 1 byte | 0.488 | 1.597 | | RGB565 | 128x128 | 16 byte | 0.196 | 1.146 | | | 127x127 | 1 byte | 0.497 | 1.124 | +| RGB888 | 128x128 | 16 byte | 0.608 | 2.247 | +| | 127x127 | 1 byte | 0.826 | 2.413 | * this data was obtained by running [benchmark tests](#benchmark-test) on 128x128 16 byte aligned matrix (ideal case) and 127x127 1 byte aligned matrix (worst case) * the values represent cycles per sample to perform simple fill of the matrix on esp32s3 @@ -118,3 +120,4 @@ Example of an best and corner case input parameters for benchmark test, for a co | :----------------- | :--------------- | :------------- | :------------- | :------------- | | Best case | 16-byte aligned | Multiple of 8 | Multiple 
of 8 | Multiple of 8 | | Corner case | 1-byte aligned | Not power of 2 | Not power of 2 | Not power of 2 | + diff --git a/components/esp_lvgl_port/test_apps/simd/main/lv_blend/include/lv_draw_sw_blend_to_rgb888.h b/components/esp_lvgl_port/test_apps/simd/main/lv_blend/include/lv_draw_sw_blend_to_rgb888.h new file mode 100644 index 00000000..3c7ac340 --- /dev/null +++ b/components/esp_lvgl_port/test_apps/simd/main/lv_blend/include/lv_draw_sw_blend_to_rgb888.h @@ -0,0 +1,53 @@ +/* + * SPDX-FileCopyrightText: 2024 Espressif Systems (Shanghai) CO LTD + * + * SPDX-License-Identifier: Apache-2.0 + * + * This file is derived from the LVGL project. + * See https://github.com/lvgl/lvgl for details. + */ + +/** + * @file lv_draw_sw_blend_rgb888.h + * + */ + +#ifndef LV_DRAW_SW_BLEND_RGB888_H +#define LV_DRAW_SW_BLEND_RGB888_H + +#ifdef __cplusplus +extern "C" { +#endif + +/********************* + * INCLUDES + *********************/ +#include "lv_draw_sw_blend.h" + +/********************* + * DEFINES + *********************/ + +/********************** + * TYPEDEFS + **********************/ + +/********************** + * GLOBAL PROTOTYPES + **********************/ + +void /* LV_ATTRIBUTE_FAST_MEM */ lv_draw_sw_blend_color_to_rgb888(_lv_draw_sw_blend_fill_dsc_t *dsc, + uint32_t dest_px_size); + +void /* LV_ATTRIBUTE_FAST_MEM */ lv_draw_sw_blend_image_to_rgb888(_lv_draw_sw_blend_image_dsc_t *dsc, + uint32_t dest_px_size); + +/********************** + * MACROS + **********************/ + +#ifdef __cplusplus +} /*extern "C"*/ +#endif + +#endif /*LV_DRAW_SW_BLEND_RGB888_H*/ diff --git a/components/esp_lvgl_port/test_apps/simd/main/lv_blend/src/lv_draw_sw_blend_to_rgb888.c b/components/esp_lvgl_port/test_apps/simd/main/lv_blend/src/lv_draw_sw_blend_to_rgb888.c new file mode 100644 index 00000000..344c6d1b --- /dev/null +++ b/components/esp_lvgl_port/test_apps/simd/main/lv_blend/src/lv_draw_sw_blend_to_rgb888.c @@ -0,0 +1,953 @@ +/* + * SPDX-FileCopyrightText: 2024 Espressif Systems 
(Shanghai) CO LTD + * + * SPDX-License-Identifier: Apache-2.0 + * + * This file is derived from the LVGL project. + * See https://github.com/lvgl/lvgl for details. + */ + +/** + * @file lv_draw_sw_blend_to_rgb888.c + * + */ + +/********************* + * INCLUDES + *********************/ +#include "lv_draw_sw_blend_to_rgb888.h" + +#include "lv_assert.h" +#include "lv_types.h" +#include "lv_log.h" +#include "lv_draw_sw_blend.h" +#include "lv_math.h" +#include "lv_color.h" +#include "string.h" + +#include "esp_lvgl_port_lv_blend.h" + +/********************* + * DEFINES + *********************/ + +#define LV_ATTRIBUTE_FAST_MEM + +/********************** + * TYPEDEFS + **********************/ + +/********************** + * STATIC PROTOTYPES + **********************/ + +static void /* LV_ATTRIBUTE_FAST_MEM */ al88_image_blend(_lv_draw_sw_blend_image_dsc_t *dsc, uint32_t dest_px_size); + +static void /* LV_ATTRIBUTE_FAST_MEM */ i1_image_blend(_lv_draw_sw_blend_image_dsc_t *dsc, uint32_t dest_px_size); + +static inline uint8_t /* LV_ATTRIBUTE_FAST_MEM */ get_bit(const uint8_t *buf, int32_t bit_idx); + +static void /* LV_ATTRIBUTE_FAST_MEM */ l8_image_blend(_lv_draw_sw_blend_image_dsc_t *dsc, uint32_t dest_px_size); + +static void /* LV_ATTRIBUTE_FAST_MEM */ rgb565_image_blend(_lv_draw_sw_blend_image_dsc_t *dsc, uint32_t dest_px_size); + +static void /* LV_ATTRIBUTE_FAST_MEM */ rgb888_image_blend(_lv_draw_sw_blend_image_dsc_t *dsc, + const uint8_t dest_px_size, + uint32_t src_px_size); + +static void /* LV_ATTRIBUTE_FAST_MEM */ argb8888_image_blend(_lv_draw_sw_blend_image_dsc_t *dsc, + uint32_t dest_px_size); + +static inline void /* LV_ATTRIBUTE_FAST_MEM */ lv_color_8_24_mix(const uint8_t src, uint8_t *dest, uint8_t mix); + +static inline void /* LV_ATTRIBUTE_FAST_MEM */ lv_color_24_24_mix(const uint8_t *src, uint8_t *dest, uint8_t mix); + +static inline void /* LV_ATTRIBUTE_FAST_MEM */ blend_non_normal_pixel(uint8_t *dest, lv_color32_t src, + lv_blend_mode_t mode); 
+static inline void * /* LV_ATTRIBUTE_FAST_MEM */ drawbuf_next_row(const void *buf, uint32_t stride); + +/********************** + * STATIC VARIABLES + **********************/ + +/********************** + * MACROS + **********************/ + +#ifndef LV_DRAW_SW_COLOR_BLEND_TO_RGB888 +#define LV_DRAW_SW_COLOR_BLEND_TO_RGB888(...) LV_RESULT_INVALID +#endif + +#ifndef LV_DRAW_SW_COLOR_BLEND_TO_RGB888_WITH_OPA +#define LV_DRAW_SW_COLOR_BLEND_TO_RGB888_WITH_OPA(...) LV_RESULT_INVALID +#endif + +#ifndef LV_DRAW_SW_COLOR_BLEND_TO_RGB888_WITH_MASK +#define LV_DRAW_SW_COLOR_BLEND_TO_RGB888_WITH_MASK(...) LV_RESULT_INVALID +#endif + +#ifndef LV_DRAW_SW_COLOR_BLEND_TO_RGB888_MIX_MASK_OPA +#define LV_DRAW_SW_COLOR_BLEND_TO_RGB888_MIX_MASK_OPA(...) LV_RESULT_INVALID +#endif + +#ifndef LV_DRAW_SW_L8_BLEND_NORMAL_TO_RGB888 +#define LV_DRAW_SW_L8_BLEND_NORMAL_TO_RGB888(...) LV_RESULT_INVALID +#endif + +#ifndef LV_DRAW_SW_L8_BLEND_NORMAL_TO_RGB888_WITH_OPA +#define LV_DRAW_SW_L8_BLEND_NORMAL_TO_RGB888_WITH_OPA(...) LV_RESULT_INVALID +#endif + +#ifndef LV_DRAW_SW_L8_BLEND_NORMAL_TO_RGB888_WITH_MASK +#define LV_DRAW_SW_L8_BLEND_NORMAL_TO_RGB888_WITH_MASK(...) LV_RESULT_INVALID +#endif + +#ifndef LV_DRAW_SW_L8_BLEND_NORMAL_TO_RGB888_MIX_MASK_OPA +#define LV_DRAW_SW_L8_BLEND_NORMAL_TO_RGB888_MIX_MASK_OPA(...) LV_RESULT_INVALID +#endif + +#ifndef LV_DRAW_SW_RGB565_BLEND_NORMAL_TO_RGB888 +#define LV_DRAW_SW_RGB565_BLEND_NORMAL_TO_RGB888(...) LV_RESULT_INVALID +#endif + +#ifndef LV_DRAW_SW_RGB565_BLEND_NORMAL_TO_RGB888_WITH_OPA +#define LV_DRAW_SW_RGB565_BLEND_NORMAL_TO_RGB888_WITH_OPA(...) LV_RESULT_INVALID +#endif + +#ifndef LV_DRAW_SW_RGB565_BLEND_NORMAL_TO_RGB888_WITH_MASK +#define LV_DRAW_SW_RGB565_BLEND_NORMAL_TO_RGB888_WITH_MASK(...) LV_RESULT_INVALID +#endif + +#ifndef LV_DRAW_SW_RGB565_BLEND_NORMAL_TO_RGB888_MIX_MASK_OPA +#define LV_DRAW_SW_RGB565_BLEND_NORMAL_TO_RGB888_MIX_MASK_OPA(...) 
LV_RESULT_INVALID +#endif + +#ifndef LV_DRAW_SW_RGB888_BLEND_NORMAL_TO_RGB888 +#define LV_DRAW_SW_RGB888_BLEND_NORMAL_TO_RGB888(...) LV_RESULT_INVALID +#endif + +#ifndef LV_DRAW_SW_RGB888_BLEND_NORMAL_TO_RGB888_WITH_OPA +#define LV_DRAW_SW_RGB888_BLEND_NORMAL_TO_RGB888_WITH_OPA(...) LV_RESULT_INVALID +#endif + +#ifndef LV_DRAW_SW_RGB888_BLEND_NORMAL_TO_RGB888_WITH_MASK +#define LV_DRAW_SW_RGB888_BLEND_NORMAL_TO_RGB888_WITH_MASK(...) LV_RESULT_INVALID +#endif + +#ifndef LV_DRAW_SW_RGB888_BLEND_NORMAL_TO_RGB888_MIX_MASK_OPA +#define LV_DRAW_SW_RGB888_BLEND_NORMAL_TO_RGB888_MIX_MASK_OPA(...) LV_RESULT_INVALID +#endif + +#ifndef LV_DRAW_SW_ARGB8888_BLEND_NORMAL_TO_RGB888 +#define LV_DRAW_SW_ARGB8888_BLEND_NORMAL_TO_RGB888(...) LV_RESULT_INVALID +#endif + +#ifndef LV_DRAW_SW_ARGB8888_BLEND_NORMAL_TO_RGB888_WITH_OPA +#define LV_DRAW_SW_ARGB8888_BLEND_NORMAL_TO_RGB888_WITH_OPA(...) LV_RESULT_INVALID +#endif + +#ifndef LV_DRAW_SW_ARGB8888_BLEND_NORMAL_TO_RGB888_WITH_MASK +#define LV_DRAW_SW_ARGB8888_BLEND_NORMAL_TO_RGB888_WITH_MASK(...) LV_RESULT_INVALID +#endif + +#ifndef LV_DRAW_SW_ARGB8888_BLEND_NORMAL_TO_RGB888_MIX_MASK_OPA +#define LV_DRAW_SW_ARGB8888_BLEND_NORMAL_TO_RGB888_MIX_MASK_OPA(...) LV_RESULT_INVALID +#endif + +#ifndef LV_DRAW_SW_I1_BLEND_NORMAL_TO_888 +#define LV_DRAW_SW_I1_BLEND_NORMAL_TO_888(...) LV_RESULT_INVALID +#endif + +#ifndef LV_DRAW_SW_I1_BLEND_NORMAL_TO_888_WITH_OPA +#define LV_DRAW_SW_I1_BLEND_NORMAL_TO_888_WITH_OPA(...) LV_RESULT_INVALID +#endif + +#ifndef LV_DRAW_SW_I1_BLEND_NORMAL_TO_888_WITH_MASK +#define LV_DRAW_SW_I1_BLEND_NORMAL_TO_888_WITH_MASK(...) LV_RESULT_INVALID +#endif + +#ifndef LV_DRAW_SW_I1_BLEND_NORMAL_TO_888_MIX_MASK_OPA +#define LV_DRAW_SW_I1_BLEND_NORMAL_TO_888_MIX_MASK_OPA(...) 
LV_RESULT_INVALID +#endif + +/********************** + * GLOBAL FUNCTIONS + **********************/ + +void LV_ATTRIBUTE_FAST_MEM lv_draw_sw_blend_color_to_rgb888(_lv_draw_sw_blend_fill_dsc_t *dsc, uint32_t dest_px_size) +{ + int32_t w = dsc->dest_w; + int32_t h = dsc->dest_h; + lv_opa_t opa = dsc->opa; + const lv_opa_t *mask = dsc->mask_buf; + int32_t mask_stride = dsc->mask_stride; + int32_t dest_stride = dsc->dest_stride; + + int32_t x; + int32_t y; + + LV_UNUSED(w); + LV_UNUSED(h); + LV_UNUSED(x); + LV_UNUSED(y); + LV_UNUSED(opa); + LV_UNUSED(mask); + LV_UNUSED(mask_stride); + LV_UNUSED(dest_stride); + + /*Simple fill*/ + if (mask == NULL && opa >= LV_OPA_MAX) { + if (dsc->use_asm && dest_px_size == 3) { + LV_DRAW_SW_COLOR_BLEND_TO_RGB888(dsc); + } else { + if (dest_px_size == 3) { + uint8_t *dest_buf_u8 = dsc->dest_buf; + uint8_t *dest_buf_ori = dsc->dest_buf; + w *= dest_px_size; + + for (x = 0; x < w; x += 3) { + dest_buf_u8[x + 0] = dsc->color.blue; + dest_buf_u8[x + 1] = dsc->color.green; + dest_buf_u8[x + 2] = dsc->color.red; + } + + dest_buf_u8 += dest_stride; + + for (y = 1; y < h; y++) { + // TODO: lv_memcpy + memcpy(dest_buf_u8, dest_buf_ori, w); + dest_buf_u8 += dest_stride; + } + } + if (dest_px_size == 4) { + uint32_t color32 = lv_color_to_u32(dsc->color); + uint32_t *dest_buf_u32 = dsc->dest_buf; + for (y = 0; y < h; y++) { + for (x = 0; x <= w - 16; x += 16) { + dest_buf_u32[x + 0] = color32; + dest_buf_u32[x + 1] = color32; + dest_buf_u32[x + 2] = color32; + dest_buf_u32[x + 3] = color32; + + dest_buf_u32[x + 4] = color32; + dest_buf_u32[x + 5] = color32; + dest_buf_u32[x + 6] = color32; + dest_buf_u32[x + 7] = color32; + + dest_buf_u32[x + 8] = color32; + dest_buf_u32[x + 9] = color32; + dest_buf_u32[x + 10] = color32; + dest_buf_u32[x + 11] = color32; + + dest_buf_u32[x + 12] = color32; + dest_buf_u32[x + 13] = color32; + dest_buf_u32[x + 14] = color32; + dest_buf_u32[x + 15] = color32; + } + for (; x < w; x ++) { + dest_buf_u32[x] = color32; 
+ } + + dest_buf_u32 = drawbuf_next_row(dest_buf_u32, dest_stride); + } + } + } + } + /*Opacity only*/ + else if (mask == NULL && opa < LV_OPA_MAX) { + if (LV_RESULT_INVALID == LV_DRAW_SW_COLOR_BLEND_TO_RGB888_WITH_OPA(dsc, dest_px_size)) { + uint32_t color32 = lv_color_to_u32(dsc->color); + uint8_t *dest_buf = dsc->dest_buf; + w *= dest_px_size; + for (y = 0; y < h; y++) { + for (x = 0; x < w; x += dest_px_size) { + lv_color_24_24_mix((const uint8_t *)&color32, &dest_buf[x], opa); + } + + dest_buf = drawbuf_next_row(dest_buf, dest_stride); + } + } + } + /*Masked with full opacity*/ + else if (mask && opa >= LV_OPA_MAX) { + if (LV_RESULT_INVALID == LV_DRAW_SW_COLOR_BLEND_TO_RGB888_WITH_MASK(dsc, dest_px_size)) { + uint32_t color32 = lv_color_to_u32(dsc->color); + uint8_t *dest_buf = dsc->dest_buf; + w *= dest_px_size; + + for (y = 0; y < h; y++) { + uint32_t mask_x; + for (x = 0, mask_x = 0; x < w; x += dest_px_size, mask_x++) { + lv_color_24_24_mix((const uint8_t *)&color32, &dest_buf[x], mask[mask_x]); + } + dest_buf += dest_stride; + mask += mask_stride; + } + } + } + /*Masked with opacity*/ + else { + if (LV_RESULT_INVALID == LV_DRAW_SW_COLOR_BLEND_TO_RGB888_MIX_MASK_OPA(dsc, dest_px_size)) { + uint32_t color32 = lv_color_to_u32(dsc->color); + uint8_t *dest_buf = dsc->dest_buf; + w *= dest_px_size; + + for (y = 0; y < h; y++) { + uint32_t mask_x; + for (x = 0, mask_x = 0; x < w; x += dest_px_size, mask_x++) { + lv_color_24_24_mix((const uint8_t *) &color32, &dest_buf[x], LV_OPA_MIX2(opa, mask[mask_x])); + } + dest_buf += dest_stride; + mask += mask_stride; + } + } + } +} + +void LV_ATTRIBUTE_FAST_MEM lv_draw_sw_blend_image_to_rgb888(_lv_draw_sw_blend_image_dsc_t *dsc, uint32_t dest_px_size) +{ + + switch (dsc->src_color_format) { + case LV_COLOR_FORMAT_RGB565: + rgb565_image_blend(dsc, dest_px_size); + break; + case LV_COLOR_FORMAT_RGB888: + rgb888_image_blend(dsc, dest_px_size, 3); + break; + case LV_COLOR_FORMAT_XRGB8888: + rgb888_image_blend(dsc, 
dest_px_size, 4); + break; + case LV_COLOR_FORMAT_ARGB8888: + argb8888_image_blend(dsc, dest_px_size); + break; + case LV_COLOR_FORMAT_L8: + l8_image_blend(dsc, dest_px_size); + break; + case LV_COLOR_FORMAT_AL88: + al88_image_blend(dsc, dest_px_size); + break; + case LV_COLOR_FORMAT_I1: + i1_image_blend(dsc, dest_px_size); + break; + default: + LV_LOG_WARN("Not supported source color format"); + break; + } +} + +/********************** + * STATIC FUNCTIONS + **********************/ + +static void LV_ATTRIBUTE_FAST_MEM i1_image_blend(_lv_draw_sw_blend_image_dsc_t *dsc, uint32_t dest_px_size) +{ + int32_t w = dsc->dest_w; + int32_t h = dsc->dest_h; + lv_opa_t opa = dsc->opa; + uint8_t *dest_buf_u8 = dsc->dest_buf; + int32_t dest_stride = dsc->dest_stride; + const uint8_t *src_buf_i1 = dsc->src_buf; + int32_t src_stride = dsc->src_stride; + const lv_opa_t *mask_buf = dsc->mask_buf; + int32_t mask_stride = dsc->mask_stride; + + int32_t dest_x; + int32_t src_x; + int32_t y; + + if (dsc->blend_mode == LV_BLEND_MODE_NORMAL) { + if (mask_buf == NULL && opa >= LV_OPA_MAX) { + if (LV_RESULT_INVALID == LV_DRAW_SW_I1_BLEND_NORMAL_TO_888(dsc)) { + for (y = 0; y < h; y++) { + for (dest_x = 0, src_x = 0; src_x < w; dest_x += dest_px_size, src_x++) { + uint8_t chan_val = get_bit(src_buf_i1, src_x) * 255; + dest_buf_u8[dest_x + 2] = chan_val; + dest_buf_u8[dest_x + 1] = chan_val; + dest_buf_u8[dest_x + 0] = chan_val; + } + dest_buf_u8 = drawbuf_next_row(dest_buf_u8, dest_stride); + src_buf_i1 = drawbuf_next_row(src_buf_i1, src_stride); + } + } + } else if (mask_buf == NULL && opa < LV_OPA_MAX) { + if (LV_RESULT_INVALID == LV_DRAW_SW_I1_BLEND_NORMAL_TO_888_WITH_OPA(dsc)) { + for (y = 0; y < h; y++) { + for (dest_x = 0, src_x = 0; src_x < w; dest_x += dest_px_size, src_x++) { + uint8_t chan_val = get_bit(src_buf_i1, src_x) * 255; + lv_color_8_24_mix(chan_val, &dest_buf_u8[dest_x], opa); + } + dest_buf_u8 = drawbuf_next_row(dest_buf_u8, dest_stride); + src_buf_i1 = 
drawbuf_next_row(src_buf_i1, src_stride); + } + } + } else if (mask_buf && opa >= LV_OPA_MAX) { + if (LV_RESULT_INVALID == LV_DRAW_SW_I1_BLEND_NORMAL_TO_888_WITH_MASK(dsc)) { + for (y = 0; y < h; y++) { + for (dest_x = 0, src_x = 0; src_x < w; dest_x += dest_px_size, src_x++) { + uint8_t chan_val = get_bit(src_buf_i1, src_x) * 255; + lv_color_8_24_mix(chan_val, &dest_buf_u8[dest_x], mask_buf[src_x]); + } + dest_buf_u8 = drawbuf_next_row(dest_buf_u8, dest_stride); + src_buf_i1 = drawbuf_next_row(src_buf_i1, src_stride); + mask_buf += mask_stride; + } + } + } else if (mask_buf && opa < LV_OPA_MAX) { + if (LV_RESULT_INVALID == LV_DRAW_SW_I1_BLEND_NORMAL_TO_888_MIX_MASK_OPA(dsc)) { + for (y = 0; y < h; y++) { + for (dest_x = 0, src_x = 0; src_x < w; dest_x += dest_px_size, src_x++) { + uint8_t chan_val = get_bit(src_buf_i1, src_x) * 255; + lv_color_8_24_mix(chan_val, &dest_buf_u8[dest_x], LV_OPA_MIX2(opa, mask_buf[src_x])); + } + dest_buf_u8 = drawbuf_next_row(dest_buf_u8, dest_stride); + src_buf_i1 = drawbuf_next_row(src_buf_i1, src_stride); + mask_buf += mask_stride; + } + } + } + } else { + for (y = 0; y < h; y++) { + for (dest_x = 0, src_x = 0; src_x < w; dest_x += dest_px_size, src_x++) { + lv_color32_t src_argb; + src_argb.red = get_bit(src_buf_i1, src_x) * 255; + src_argb.green = src_argb.red; + src_argb.blue = src_argb.red; + if (mask_buf == NULL) { + src_argb.alpha = opa; + } else { + src_argb.alpha = LV_OPA_MIX2(mask_buf[src_x], opa); + } + blend_non_normal_pixel(&dest_buf_u8[dest_x], src_argb, dsc->blend_mode); + } + if (mask_buf) { + mask_buf += mask_stride; + } + dest_buf_u8 = drawbuf_next_row(dest_buf_u8, dest_stride); + src_buf_i1 = drawbuf_next_row(src_buf_i1, src_stride); + } + } +} + +static void LV_ATTRIBUTE_FAST_MEM al88_image_blend(_lv_draw_sw_blend_image_dsc_t *dsc, uint32_t dest_px_size) +{ + int32_t w = dsc->dest_w; + int32_t h = dsc->dest_h; + lv_opa_t opa = dsc->opa; + uint8_t *dest_buf_u8 = dsc->dest_buf; + int32_t dest_stride = 
dsc->dest_stride; + const lv_color16a_t *src_buf_al88 = dsc->src_buf; + int32_t src_stride = dsc->src_stride; + const lv_opa_t *mask_buf = dsc->mask_buf; + int32_t mask_stride = dsc->mask_stride; + + int32_t dest_x; + int32_t src_x; + int32_t y; + + if (dsc->blend_mode == LV_BLEND_MODE_NORMAL) { + if (mask_buf == NULL && opa >= LV_OPA_MAX) { + if (LV_RESULT_INVALID == LV_DRAW_SW_L8_BLEND_NORMAL_TO_RGB888(dsc, dest_px_size)) { + for (y = 0; y < h; y++) { + for (dest_x = 0, src_x = 0; src_x < w; dest_x += dest_px_size, src_x++) { + lv_color_8_24_mix(src_buf_al88[src_x].lumi, &dest_buf_u8[dest_x], src_buf_al88[src_x].alpha); + } + dest_buf_u8 += dest_stride; + src_buf_al88 = drawbuf_next_row(src_buf_al88, src_stride); + } + } + } else if (mask_buf == NULL && opa < LV_OPA_MAX) { + if (LV_RESULT_INVALID == LV_DRAW_SW_L8_BLEND_NORMAL_TO_RGB888_WITH_OPA(dsc, dest_px_size)) { + for (y = 0; y < h; y++) { + for (dest_x = 0, src_x = 0; src_x < w; dest_x += dest_px_size, src_x++) { + lv_color_8_24_mix(src_buf_al88[src_x].lumi, &dest_buf_u8[dest_x], LV_OPA_MIX2(src_buf_al88[src_x].alpha, opa)); + } + dest_buf_u8 += dest_stride; + src_buf_al88 = drawbuf_next_row(src_buf_al88, src_stride); + } + } + } else if (mask_buf && opa >= LV_OPA_MAX) { + if (LV_RESULT_INVALID == LV_DRAW_SW_L8_BLEND_NORMAL_TO_RGB888_WITH_MASK(dsc, dest_px_size)) { + for (y = 0; y < h; y++) { + for (dest_x = 0, src_x = 0; src_x < w; dest_x += dest_px_size, src_x++) { + lv_color_8_24_mix(src_buf_al88[src_x].lumi, &dest_buf_u8[dest_x], LV_OPA_MIX2(src_buf_al88[src_x].alpha, + mask_buf[src_x])); + } + dest_buf_u8 += dest_stride; + src_buf_al88 = drawbuf_next_row(src_buf_al88, src_stride); + mask_buf += mask_stride; + } + } + } else if (mask_buf && opa < LV_OPA_MAX) { + if (LV_RESULT_INVALID == LV_DRAW_SW_L8_BLEND_NORMAL_TO_RGB888_MIX_MASK_OPA(dsc, dest_px_size)) { + for (y = 0; y < h; y++) { + for (dest_x = 0, src_x = 0; src_x < w; dest_x += dest_px_size, src_x++) { + lv_color_8_24_mix(src_buf_al88[src_x].lumi, 
&dest_buf_u8[dest_x], LV_OPA_MIX3(src_buf_al88[src_x].alpha, + mask_buf[src_x], opa)); + } + dest_buf_u8 += dest_stride; + src_buf_al88 = drawbuf_next_row(src_buf_al88, src_stride); + mask_buf += mask_stride; + } + } + } + } else { + for (y = 0; y < h; y++) { + for (dest_x = 0, src_x = 0; src_x < w; dest_x += dest_px_size, src_x++) { + lv_color32_t src_argb; + src_argb.red = src_argb.green = src_argb.blue = src_buf_al88[src_x].lumi; + if (mask_buf == NULL) { + src_argb.alpha = LV_OPA_MIX2(src_buf_al88[src_x].alpha, opa); + } else { + src_argb.alpha = LV_OPA_MIX3(src_buf_al88[src_x].alpha, mask_buf[dest_x], opa); + } + blend_non_normal_pixel(&dest_buf_u8[dest_x], src_argb, dsc->blend_mode); + } + if (mask_buf) { + mask_buf += mask_stride; + } + dest_buf_u8 += dest_stride; + src_buf_al88 = drawbuf_next_row(src_buf_al88, src_stride); + } + } +} + +static void LV_ATTRIBUTE_FAST_MEM l8_image_blend(_lv_draw_sw_blend_image_dsc_t *dsc, uint32_t dest_px_size) +{ + int32_t w = dsc->dest_w; + int32_t h = dsc->dest_h; + lv_opa_t opa = dsc->opa; + uint8_t *dest_buf_u8 = dsc->dest_buf; + int32_t dest_stride = dsc->dest_stride; + const uint8_t *src_buf_l8 = dsc->src_buf; + int32_t src_stride = dsc->src_stride; + const lv_opa_t *mask_buf = dsc->mask_buf; + int32_t mask_stride = dsc->mask_stride; + + int32_t dest_x; + int32_t src_x; + int32_t y; + + if (dsc->blend_mode == LV_BLEND_MODE_NORMAL) { + if (mask_buf == NULL && opa >= LV_OPA_MAX) { + if (LV_RESULT_INVALID == LV_DRAW_SW_L8_BLEND_NORMAL_TO_RGB888(dsc, dest_px_size)) { + for (y = 0; y < h; y++) { + for (dest_x = 0, src_x = 0; src_x < w; dest_x += dest_px_size, src_x++) { + dest_buf_u8[dest_x + 2] = src_buf_l8[src_x]; + dest_buf_u8[dest_x + 1] = src_buf_l8[src_x]; + dest_buf_u8[dest_x + 0] = src_buf_l8[src_x]; + } + dest_buf_u8 += dest_stride; + src_buf_l8 = drawbuf_next_row(src_buf_l8, src_stride); + } + } + } else if (mask_buf == NULL && opa < LV_OPA_MAX) { + if (LV_RESULT_INVALID == 
LV_DRAW_SW_L8_BLEND_NORMAL_TO_RGB888_WITH_OPA(dsc, dest_px_size)) { + for (y = 0; y < h; y++) { + for (dest_x = 0, src_x = 0; src_x < w; dest_x += dest_px_size, src_x++) { + lv_color_8_24_mix(src_buf_l8[src_x], &dest_buf_u8[dest_x], opa); + } + dest_buf_u8 += dest_stride; + src_buf_l8 = drawbuf_next_row(src_buf_l8, src_stride); + } + } + } else if (mask_buf && opa >= LV_OPA_MAX) { + if (LV_RESULT_INVALID == LV_DRAW_SW_L8_BLEND_NORMAL_TO_RGB888_WITH_MASK(dsc, dest_px_size)) { + for (y = 0; y < h; y++) { + for (dest_x = 0, src_x = 0; src_x < w; dest_x += dest_px_size, src_x++) { + lv_color_8_24_mix(src_buf_l8[src_x], &dest_buf_u8[dest_x], mask_buf[src_x]); + } + dest_buf_u8 += dest_stride; + src_buf_l8 = drawbuf_next_row(src_buf_l8, src_stride); + mask_buf += mask_stride; + } + } + } else if (mask_buf && opa < LV_OPA_MAX) { + if (LV_RESULT_INVALID == LV_DRAW_SW_L8_BLEND_NORMAL_TO_RGB888_MIX_MASK_OPA(dsc, dest_px_size)) { + for (y = 0; y < h; y++) { + for (dest_x = 0, src_x = 0; src_x < w; dest_x += dest_px_size, src_x++) { + lv_color_8_24_mix(src_buf_l8[src_x], &dest_buf_u8[dest_x], LV_OPA_MIX2(opa, mask_buf[src_x])); + } + dest_buf_u8 += dest_stride; + src_buf_l8 = drawbuf_next_row(src_buf_l8, src_stride); + mask_buf += mask_stride; + } + } + } + } else { + lv_color32_t src_argb; + for (y = 0; y < h; y++) { + for (dest_x = 0, src_x = 0; src_x < w; dest_x += dest_px_size, src_x++) { + src_argb.red = src_buf_l8[src_x]; + src_argb.green = src_buf_l8[src_x]; + src_argb.blue = src_buf_l8[src_x]; + if (mask_buf == NULL) { + src_argb.alpha = opa; + } else { + src_argb.alpha = LV_OPA_MIX2(mask_buf[dest_x], opa); + } + blend_non_normal_pixel(&dest_buf_u8[dest_x], src_argb, dsc->blend_mode); + } + if (mask_buf) { + mask_buf += mask_stride; + } + dest_buf_u8 += dest_stride; + src_buf_l8 = drawbuf_next_row(src_buf_l8, src_stride); + } + } +} + +static void LV_ATTRIBUTE_FAST_MEM rgb565_image_blend(_lv_draw_sw_blend_image_dsc_t *dsc, uint32_t dest_px_size) +{ + int32_t w = 
dsc->dest_w; + int32_t h = dsc->dest_h; + lv_opa_t opa = dsc->opa; + uint8_t *dest_buf_u8 = dsc->dest_buf; + int32_t dest_stride = dsc->dest_stride; + const lv_color16_t *src_buf_c16 = (const lv_color16_t *) dsc->src_buf; + int32_t src_stride = dsc->src_stride; + const lv_opa_t *mask_buf = dsc->mask_buf; + int32_t mask_stride = dsc->mask_stride; + + int32_t src_x; + int32_t dest_x; + int32_t y; + + if (dsc->blend_mode == LV_BLEND_MODE_NORMAL) { + if (mask_buf == NULL && opa >= LV_OPA_MAX) { + if (LV_RESULT_INVALID == LV_DRAW_SW_RGB565_BLEND_NORMAL_TO_RGB888(dsc, dest_px_size)) { + for (y = 0; y < h; y++) { + for (src_x = 0, dest_x = 0; src_x < w; dest_x += dest_px_size, src_x++) { + dest_buf_u8[dest_x + 2] = (src_buf_c16[src_x].red * 2106) >> 8; /*To make it rounded*/ + dest_buf_u8[dest_x + 1] = (src_buf_c16[src_x].green * 1037) >> 8; + dest_buf_u8[dest_x + 0] = (src_buf_c16[src_x].blue * 2106) >> 8; + } + dest_buf_u8 += dest_stride; + src_buf_c16 = drawbuf_next_row(src_buf_c16, src_stride); + } + } + } else if (mask_buf == NULL && opa < LV_OPA_MAX) { + if (LV_RESULT_INVALID == LV_DRAW_SW_RGB565_BLEND_NORMAL_TO_RGB888_WITH_OPA(dsc, dest_px_size)) { + uint8_t res[3]; + for (y = 0; y < h; y++) { + for (src_x = 0, dest_x = 0; src_x < w; dest_x += dest_px_size, src_x++) { + res[2] = (src_buf_c16[src_x].red * 2106) >> 8; /*To make it rounded*/ + res[1] = (src_buf_c16[src_x].green * 1037) >> 8; + res[0] = (src_buf_c16[src_x].blue * 2106) >> 8; + lv_color_24_24_mix(res, &dest_buf_u8[dest_x], opa); + } + dest_buf_u8 += dest_stride; + src_buf_c16 = drawbuf_next_row(src_buf_c16, src_stride); + } + } + } else if (mask_buf && opa >= LV_OPA_MAX) { + if (LV_RESULT_INVALID == LV_DRAW_SW_RGB565_BLEND_NORMAL_TO_RGB888_WITH_MASK(dsc, dest_px_size)) { + uint8_t res[3]; + for (y = 0; y < h; y++) { + for (src_x = 0, dest_x = 0; src_x < w; dest_x += dest_px_size, src_x++) { + res[2] = (src_buf_c16[src_x].red * 2106) >> 8; /*To make it rounded*/ + res[1] = (src_buf_c16[src_x].green * 
1037) >> 8; + res[0] = (src_buf_c16[src_x].blue * 2106) >> 8; + lv_color_24_24_mix(res, &dest_buf_u8[dest_x], mask_buf[src_x]); + } + dest_buf_u8 += dest_stride; + src_buf_c16 = drawbuf_next_row(src_buf_c16, src_stride); + mask_buf += mask_stride; + } + } + } else { + if (LV_RESULT_INVALID == LV_DRAW_SW_RGB565_BLEND_NORMAL_TO_RGB888_MIX_MASK_OPA(dsc, dest_px_size)) { + uint8_t res[3]; + for (y = 0; y < h; y++) { + for (src_x = 0, dest_x = 0; src_x < w; dest_x += dest_px_size, src_x++) { + res[2] = (src_buf_c16[src_x].red * 2106) >> 8; /*To make it rounded*/ + res[1] = (src_buf_c16[src_x].green * 1037) >> 8; + res[0] = (src_buf_c16[src_x].blue * 2106) >> 8; + lv_color_24_24_mix(res, &dest_buf_u8[dest_x], LV_OPA_MIX2(opa, mask_buf[src_x])); + } + dest_buf_u8 += dest_stride; + src_buf_c16 = drawbuf_next_row(src_buf_c16, src_stride); + mask_buf += mask_stride; + } + } + } + } else { + lv_color32_t src_argb; + for (y = 0; y < h; y++) { + for (src_x = 0, dest_x = 0; src_x < w; src_x++, dest_x += dest_px_size) { + src_argb.red = (src_buf_c16[src_x].red * 2106) >> 8; + src_argb.green = (src_buf_c16[src_x].green * 1037) >> 8; + src_argb.blue = (src_buf_c16[src_x].blue * 2106) >> 8; + if (mask_buf == NULL) { + src_argb.alpha = opa; + } else { + src_argb.alpha = LV_OPA_MIX2(mask_buf[src_x], opa); + } + blend_non_normal_pixel(&dest_buf_u8[dest_x], src_argb, dsc->blend_mode); + } + if (mask_buf) { + mask_buf += mask_stride; + } + dest_buf_u8 += dest_stride; + src_buf_c16 = drawbuf_next_row(src_buf_c16, src_stride); + } + } +} + +static void LV_ATTRIBUTE_FAST_MEM rgb888_image_blend(_lv_draw_sw_blend_image_dsc_t *dsc, const uint8_t dest_px_size, + uint32_t src_px_size) +{ + int32_t w = dsc->dest_w * dest_px_size; + int32_t h = dsc->dest_h; + lv_opa_t opa = dsc->opa; + uint8_t *dest_buf = dsc->dest_buf; + int32_t dest_stride = dsc->dest_stride; + const uint8_t *src_buf = dsc->src_buf; + int32_t src_stride = dsc->src_stride; + const lv_opa_t *mask_buf = dsc->mask_buf; + int32_t 
mask_stride = dsc->mask_stride; + + int32_t dest_x; + int32_t src_x; + int32_t y; + + if (dsc->blend_mode == LV_BLEND_MODE_NORMAL) { + /*Special case*/ + if (mask_buf == NULL && opa >= LV_OPA_MAX) { + if (LV_RESULT_INVALID == LV_DRAW_SW_RGB888_BLEND_NORMAL_TO_RGB888(dsc, dest_px_size, src_px_size)) { + if (src_px_size == dest_px_size) { + for (y = 0; y < h; y++) { + memcpy(dest_buf, src_buf, w); + dest_buf += dest_stride; + src_buf += src_stride; + } + } else { + for (y = 0; y < h; y++) { + for (dest_x = 0, src_x = 0; dest_x < w; dest_x += dest_px_size, src_x += src_px_size) { + dest_buf[dest_x + 0] = src_buf[src_x + 0]; + dest_buf[dest_x + 1] = src_buf[src_x + 1]; + dest_buf[dest_x + 2] = src_buf[src_x + 2]; + } + dest_buf += dest_stride; + src_buf += src_stride; + } + } + } + } + if (mask_buf == NULL && opa < LV_OPA_MAX) { + if (LV_RESULT_INVALID == LV_DRAW_SW_RGB888_BLEND_NORMAL_TO_RGB888_WITH_OPA(dsc, dest_px_size, src_px_size)) { + for (y = 0; y < h; y++) { + for (dest_x = 0, src_x = 0; dest_x < w; dest_x += dest_px_size, src_x += src_px_size) { + lv_color_24_24_mix(&src_buf[src_x], &dest_buf[dest_x], opa); + } + dest_buf += dest_stride; + src_buf += src_stride; + } + } + } + if (mask_buf && opa >= LV_OPA_MAX) { + if (LV_RESULT_INVALID == LV_DRAW_SW_RGB888_BLEND_NORMAL_TO_RGB888_WITH_MASK(dsc, dest_px_size, src_px_size)) { + uint32_t mask_x; + for (y = 0; y < h; y++) { + for (mask_x = 0, dest_x = 0, src_x = 0; dest_x < w; mask_x++, dest_x += dest_px_size, src_x += src_px_size) { + lv_color_24_24_mix(&src_buf[src_x], &dest_buf[dest_x], mask_buf[mask_x]); + } + dest_buf += dest_stride; + src_buf += src_stride; + mask_buf += mask_stride; + } + } + } + if (mask_buf && opa < LV_OPA_MAX) { + if (LV_RESULT_INVALID == LV_DRAW_SW_RGB888_BLEND_NORMAL_TO_RGB888_MIX_MASK_OPA(dsc, dest_px_size, src_px_size)) { + uint32_t mask_x; + for (y = 0; y < h; y++) { + for (mask_x = 0, dest_x = 0, src_x = 0; dest_x < w; mask_x++, dest_x += dest_px_size, src_x += src_px_size) { + 
lv_color_24_24_mix(&src_buf[src_x], &dest_buf[dest_x], LV_OPA_MIX2(opa, mask_buf[mask_x])); + } + dest_buf += dest_stride; + src_buf += src_stride; + mask_buf += mask_stride; + } + } + } + } else { + lv_color32_t src_argb; + for (y = 0; y < h; y++) { + for (dest_x = 0, src_x = 0; dest_x < w; dest_x += dest_px_size, src_x += src_px_size) { + src_argb.red = src_buf[src_x + 2]; + src_argb.green = src_buf[src_x + 1]; + src_argb.blue = src_buf[src_x + 0]; + if (mask_buf == NULL) { + src_argb.alpha = opa; + } else { + src_argb.alpha = LV_OPA_MIX2(mask_buf[dest_x], opa); + } + + blend_non_normal_pixel(&dest_buf[dest_x], src_argb, dsc->blend_mode); + } + if (mask_buf) { + mask_buf += mask_stride; + } + dest_buf += dest_stride; + src_buf += src_stride; + } + } +} + +static void LV_ATTRIBUTE_FAST_MEM argb8888_image_blend(_lv_draw_sw_blend_image_dsc_t *dsc, uint32_t dest_px_size) +{ + int32_t w = dsc->dest_w; + int32_t h = dsc->dest_h; + lv_opa_t opa = dsc->opa; + uint8_t *dest_buf = dsc->dest_buf; + int32_t dest_stride = dsc->dest_stride; + const lv_color32_t *src_buf_c32 = dsc->src_buf; + int32_t src_stride = dsc->src_stride; + const lv_opa_t *mask_buf = dsc->mask_buf; + int32_t mask_stride = dsc->mask_stride; + + int32_t dest_x; + int32_t src_x; + int32_t y; + + if (dsc->blend_mode == LV_BLEND_MODE_NORMAL) { + if (mask_buf == NULL && opa >= LV_OPA_MAX) { + if (LV_RESULT_INVALID == LV_DRAW_SW_ARGB8888_BLEND_NORMAL_TO_RGB888(dsc, dest_px_size)) { + for (y = 0; y < h; y++) { + for (dest_x = 0, src_x = 0; src_x < w; dest_x += dest_px_size, src_x++) { + lv_color_24_24_mix((const uint8_t *)&src_buf_c32[src_x], &dest_buf[dest_x], src_buf_c32[src_x].alpha); + } + dest_buf += dest_stride; + src_buf_c32 = drawbuf_next_row(src_buf_c32, src_stride); + } + } + } else if (mask_buf == NULL && opa < LV_OPA_MAX) { + if (LV_RESULT_INVALID == LV_DRAW_SW_ARGB8888_BLEND_NORMAL_TO_RGB888_WITH_OPA(dsc, dest_px_size)) { + for (y = 0; y < h; y++) { + for (dest_x = 0, src_x = 0; src_x < w; dest_x 
+= dest_px_size, src_x++) { + lv_color_24_24_mix((const uint8_t *)&src_buf_c32[src_x], &dest_buf[dest_x], LV_OPA_MIX2(src_buf_c32[src_x].alpha, opa)); + } + dest_buf += dest_stride; + src_buf_c32 = drawbuf_next_row(src_buf_c32, src_stride); + } + } + } else if (mask_buf && opa >= LV_OPA_MAX) { + if (LV_RESULT_INVALID == LV_DRAW_SW_ARGB8888_BLEND_NORMAL_TO_RGB888_WITH_MASK(dsc, dest_px_size)) { + for (y = 0; y < h; y++) { + for (dest_x = 0, src_x = 0; src_x < w; dest_x += dest_px_size, src_x++) { + lv_color_24_24_mix((const uint8_t *)&src_buf_c32[src_x], &dest_buf[dest_x], + LV_OPA_MIX2(src_buf_c32[src_x].alpha, mask_buf[src_x])); + } + dest_buf += dest_stride; + src_buf_c32 = drawbuf_next_row(src_buf_c32, src_stride); + mask_buf += mask_stride; + } + } + } else if (mask_buf && opa < LV_OPA_MAX) { + if (LV_RESULT_INVALID == LV_DRAW_SW_ARGB8888_BLEND_NORMAL_TO_RGB888_MIX_MASK_OPA(dsc, dest_px_size)) { + for (y = 0; y < h; y++) { + for (dest_x = 0, src_x = 0; src_x < w; dest_x += dest_px_size, src_x++) { + lv_color_24_24_mix((const uint8_t *)&src_buf_c32[src_x], &dest_buf[dest_x], + LV_OPA_MIX3(src_buf_c32[src_x].alpha, mask_buf[src_x], opa)); + } + dest_buf += dest_stride; + src_buf_c32 = drawbuf_next_row(src_buf_c32, src_stride); + mask_buf += mask_stride; + } + } + } + } else { + lv_color32_t src_argb; + for (y = 0; y < h; y++) { + for (dest_x = 0, src_x = 0; src_x < w; dest_x += dest_px_size, src_x ++) { + src_argb = src_buf_c32[src_x]; + if (mask_buf == NULL) { + src_argb.alpha = LV_OPA_MIX2(src_argb.alpha, opa); + } else { + src_argb.alpha = LV_OPA_MIX3(src_argb.alpha, mask_buf[dest_x], opa); + } + + blend_non_normal_pixel(&dest_buf[dest_x], src_argb, dsc->blend_mode); + } + if (mask_buf) { + mask_buf += mask_stride; + } + dest_buf += dest_stride; + src_buf_c32 = drawbuf_next_row(src_buf_c32, src_stride); + } + } +} + +static inline void LV_ATTRIBUTE_FAST_MEM blend_non_normal_pixel(uint8_t *dest, lv_color32_t src, lv_blend_mode_t mode) +{ + uint8_t res[3] = {0, 
0, 0}; + switch (mode) { + case LV_BLEND_MODE_ADDITIVE: + res[0] = LV_MIN(dest[0] + src.blue, 255); + res[1] = LV_MIN(dest[1] + src.green, 255); + res[2] = LV_MIN(dest[2] + src.red, 255); + break; + case LV_BLEND_MODE_SUBTRACTIVE: + res[0] = LV_MAX(dest[0] - src.blue, 0); + res[1] = LV_MAX(dest[1] - src.green, 0); + res[2] = LV_MAX(dest[2] - src.red, 0); + break; + case LV_BLEND_MODE_MULTIPLY: + res[0] = (dest[0] * src.blue) >> 8; + res[1] = (dest[1] * src.green) >> 8; + res[2] = (dest[2] * src.red) >> 8; + break; + default: + LV_LOG_WARN("Not supported blend mode: %d", mode); + return; + } + lv_color_24_24_mix(res, dest, src.alpha); +} + +static inline void LV_ATTRIBUTE_FAST_MEM lv_color_8_24_mix(const uint8_t src, uint8_t *dest, uint8_t mix) +{ + + if (mix == 0) { + return; + } + + if (mix >= LV_OPA_MAX) { + dest[0] = src; + dest[1] = src; + dest[2] = src; + } else { + lv_opa_t mix_inv = 255 - mix; + dest[0] = (uint32_t)((uint32_t)src * mix + dest[0] * mix_inv) >> 8; + dest[1] = (uint32_t)((uint32_t)src * mix + dest[1] * mix_inv) >> 8; + dest[2] = (uint32_t)((uint32_t)src * mix + dest[2] * mix_inv) >> 8; + } +} + +static inline void LV_ATTRIBUTE_FAST_MEM lv_color_24_24_mix(const uint8_t *src, uint8_t *dest, uint8_t mix) +{ + + if (mix == 0) { + return; + } + + if (mix >= LV_OPA_MAX) { + dest[0] = src[0]; + dest[1] = src[1]; + dest[2] = src[2]; + } else { + lv_opa_t mix_inv = 255 - mix; + dest[0] = (uint32_t)((uint32_t)src[0] * mix + dest[0] * mix_inv) >> 8; + dest[1] = (uint32_t)((uint32_t)src[1] * mix + dest[1] * mix_inv) >> 8; + dest[2] = (uint32_t)((uint32_t)src[2] * mix + dest[2] * mix_inv) >> 8; + } +} + +static inline uint8_t LV_ATTRIBUTE_FAST_MEM get_bit(const uint8_t *buf, int32_t bit_idx) +{ + return (buf[bit_idx / 8] >> (7 - (bit_idx % 8))) & 1; +} + +static inline void *LV_ATTRIBUTE_FAST_MEM drawbuf_next_row(const void *buf, uint32_t stride) +{ + return (void *)((uint8_t *)buf + stride); +} diff --git 
a/components/esp_lvgl_port/test_apps/simd/main/lv_fill_common.h b/components/esp_lvgl_port/test_apps/simd/main/lv_fill_common.h index 5243857e..fb0ddece 100644 --- a/components/esp_lvgl_port/test_apps/simd/main/lv_fill_common.h +++ b/components/esp_lvgl_port/test_apps/simd/main/lv_fill_common.h @@ -42,7 +42,8 @@ typedef struct { void *p_asm_alloc; // pointer to the beginning of the memory allocated for ASM test buf, used in free() void *p_ansi_alloc; // pointer to the beginning of the memory allocated for ANSI test buf, used in free() } buf; - void (*blend_api_func)(_lv_draw_sw_blend_fill_dsc_t *); // pointer to LVGL API function + void (*blend_api_func)(_lv_draw_sw_blend_fill_dsc_t *); // pointer to LVGL API function + void (*blend_api_px_func)(_lv_draw_sw_blend_fill_dsc_t *, uint32_t); // pointer to LVGL API function with dest_px_size argument lv_color_format_t color_format; // LV color format size_t data_type_size; // Used data type size, eg sizeof() size_t active_buf_len; // Length of buffer, where the actual data are stored (not including Canary bytes) @@ -65,7 +66,8 @@ typedef struct { unsigned int benchmark_cycles; // Count of benchmark cycles void *array_align16; // test array with 16 byte alignment - testing most ideal case void *array_align1; // test array with 1 byte alignment - testing wort case - void (*blend_api_func)(_lv_draw_sw_blend_fill_dsc_t *); // pointer to LVGL API function + void (*blend_api_func)(_lv_draw_sw_blend_fill_dsc_t *); // pointer to LVGL API function + void (*blend_api_px_func)(_lv_draw_sw_blend_fill_dsc_t *, uint32_t); // pointer to LVGL API function with dest_px_size argument } bench_test_case_params_t; #ifdef __cplusplus diff --git a/components/esp_lvgl_port/test_apps/simd/main/test_lv_fill_benchmark.c b/components/esp_lvgl_port/test_apps/simd/main/test_lv_fill_benchmark.c index 85935985..f038e679 100644 --- a/components/esp_lvgl_port/test_apps/simd/main/test_lv_fill_benchmark.c +++ 
b/components/esp_lvgl_port/test_apps/simd/main/test_lv_fill_benchmark.c @@ -15,6 +15,7 @@ #include "lv_draw_sw_blend.h" #include "lv_draw_sw_blend_to_argb8888.h" #include "lv_draw_sw_blend_to_rgb565.h" +#include "lv_draw_sw_blend_to_rgb888.h" #define WIDTH 128 #define HEIGHT 128 @@ -115,6 +116,31 @@ TEST_CASE("LV Fill benchmark RGB565", "[fill][benchmark][RGB565]") lv_fill_benchmark_init(&test_params); free(dest_array_align16); } + +TEST_CASE("LV Fill benchmark RGB888", "[fill][benchmark][RGB888]") +{ + uint8_t *dest_array_align16 = (uint8_t *)memalign(16, STRIDE * HEIGHT * 3 + UNALIGN_BYTES); + TEST_ASSERT_NOT_EQUAL(NULL, dest_array_align16); + + // Apply byte unalignment for the worst-case test scenario + uint8_t *dest_array_align1 = dest_array_align16 + UNALIGN_BYTES; + + bench_test_case_params_t test_params = { + .height = HEIGHT, + .width = WIDTH, + .stride = STRIDE * 3, + .cc_height = HEIGHT - 1, + .cc_width = WIDTH - 1, + .benchmark_cycles = BENCHMARK_CYCLES, + .array_align16 = (void *)dest_array_align16, + .array_align1 = (void *)dest_array_align1, + .blend_api_px_func = &lv_draw_sw_blend_color_to_rgb888, + }; + + ESP_LOGI(TAG_LV_FILL_BENCH, "running test for RGB888 color format"); + lv_fill_benchmark_init(&test_params); + free(dest_array_align16); +} // ------------------------------------------------ Static test functions ---------------------------------------------- static void lv_fill_benchmark_init(bench_test_case_params_t *test_params) @@ -143,6 +169,7 @@ static void lv_fill_benchmark_init(bench_test_case_params_t *test_params) // Run benchmark with the most ideal input parameters // Dest array is 16 byte aligned, dest_w and dest_h are dividable by 4 + float cycles = lv_fill_benchmark_run(test_params, &dsc); // Call Benchmark cycle float per_sample = cycles / ((float)(dsc.dest_w * dsc.dest_h)); ESP_LOGI(TAG_LV_FILL_BENCH, " %s ideal case: %.3f cycles for %"PRIi32"x%"PRIi32" matrix, %.3f cycles per sample", asm_ansi_func[i], cycles, dsc.dest_w, 
dsc.dest_h, per_sample); @@ -162,15 +189,29 @@ static void lv_fill_benchmark_init(bench_test_case_params_t *test_params) static float lv_fill_benchmark_run(bench_test_case_params_t *test_params, _lv_draw_sw_blend_fill_dsc_t *dsc) { // Call the DUT function for the first time to init the benchmark test - test_params->blend_api_func(dsc); + if (test_params->blend_api_func != NULL) { + test_params->blend_api_func(dsc); + } else if (test_params->blend_api_px_func != NULL) { + test_params->blend_api_px_func(dsc, 3); + } + const unsigned int start_b = xthal_get_ccount(); - for (int i = 0; i < test_params->benchmark_cycles; i++) { - test_params->blend_api_func(dsc); + + if (test_params->blend_api_func != NULL) { + for (int i = 0; i < test_params->benchmark_cycles; i++) { + test_params->blend_api_func(dsc); + } + } else if (test_params->blend_api_px_func != NULL) { + for (int i = 0; i < test_params->benchmark_cycles; i++) { + test_params->blend_api_px_func(dsc, 3); + } } + const unsigned int end_b = xthal_get_ccount(); const float total_b = end_b - start_b; const float cycles = total_b / (test_params->benchmark_cycles); + return cycles; } diff --git a/components/esp_lvgl_port/test_apps/simd/main/test_lv_fill_functionality.c b/components/esp_lvgl_port/test_apps/simd/main/test_lv_fill_functionality.c index 5bf29558..45ebd219 100644 --- a/components/esp_lvgl_port/test_apps/simd/main/test_lv_fill_functionality.c +++ b/components/esp_lvgl_port/test_apps/simd/main/test_lv_fill_functionality.c @@ -13,6 +13,7 @@ #include "lv_draw_sw_blend.h" #include "lv_draw_sw_blend_to_argb8888.h" #include "lv_draw_sw_blend_to_rgb565.h" +#include "lv_draw_sw_blend_to_rgb888.h" // ------------------------------------------------- Defines ----------------------------------------------------------- @@ -47,14 +48,14 @@ static lv_color_t test_color = { * - generate functionality test combinations, based on the provided test_matrix struct * * @param[in] test_matrix Pointer to structure defining test 
matrix - all the test combinations - * @param[in] test_case Pointer ot structure defining functionality test case + * @param[in] test_case Pointer to structure defining functionality test case */ static void functionality_test_matrix(test_matrix_params_t *test_matrix, func_test_case_params_t *test_case); /** * @brief Fill test buffers for functionality test * - * @param[in] test_case Pointer ot structure defining functionality test case + * @param[in] test_case Pointer to structure defining functionality test case */ static void fill_test_bufs(func_test_case_params_t *test_case); @@ -63,24 +64,31 @@ static void fill_test_bufs(func_test_case_params_t *test_case); * * - function prepares structures for functionality testing and runs the LVGL API * - * @param[in] test_case Pointer ot structure defining functionality test case + * @param[in] test_case Pointer to structure defining functionality test case */ static void lv_fill_functionality(func_test_case_params_t *test_case); /** * @brief Evaluate results for 32bit data length * - * @param[in] test_case Pointer ot structure defining functionality test case + * @param[in] test_case Pointer to structure defining functionality test case */ static void test_eval_32bit_data(func_test_case_params_t *test_case); /** * @brief Evaluate results for 16bit data length * - * @param[in] test_case Pointer ot structure defining functionality test case + * @param[in] test_case Pointer to structure defining functionality test case */ static void test_eval_16bit_data(func_test_case_params_t *test_case); +/** + * @brief Evaluate results for 24bit data length + * + * @param[in] test_case Pointer to structure defining functionality test case + */ +static void test_eval_24bit_data(func_test_case_params_t *test_case); + // ------------------------------------------------ Test cases --------------------------------------------------------- /* @@ -147,6 +155,29 @@ TEST_CASE("Test fill functionality RGB565", "[fill][functionality][RGB565]") 
functionality_test_matrix(&test_matrix, &test_case); } +TEST_CASE("Test fill functionality RGB888", "[fill][functionality][RGB888]") +{ + test_matrix_params_t test_matrix = { + .min_w = 12, // 12 is the lower limit for the esp32s3 asm implementation, otherwise esp32 is executed + .min_h = 1, + .max_w = 32, + .max_h = 3, + .min_unalign_byte = 0, + .max_unalign_byte = 16, + .unalign_step = 1, + .dest_stride_step = 1, + .test_combinations_count = 0, + }; + + func_test_case_params_t test_case = { + .blend_api_px_func = &lv_draw_sw_blend_color_to_rgb888, + .color_format = LV_COLOR_FORMAT_RGB888, + .data_type_size = sizeof(uint8_t) * 3, // 24-bit data length + }; + + ESP_LOGI(TAG_LV_FILL_FUNC, "running test for RGB888 color format"); + functionality_test_matrix(&test_matrix, &test_case); +} // ------------------------------------------------ Static test functions ---------------------------------------------- static void functionality_test_matrix(test_matrix_params_t *test_matrix, func_test_case_params_t *test_case) @@ -195,8 +226,13 @@ static void lv_fill_functionality(func_test_case_params_t *test_case) dsc_ansi.dest_buf = test_case->buf.p_ansi; dsc_ansi.use_asm = false; - test_case->blend_api_func(&dsc_asm); // Call the LVGL API with Assembly code - test_case->blend_api_func(&dsc_ansi); // Call the LVGL API with ANSI code + if (test_case->blend_api_func != NULL) { + test_case->blend_api_func(&dsc_asm); // Call the LVGL API with Assembly code + test_case->blend_api_func(&dsc_ansi); // Call the LVGL API with ANSI code + } else if (test_case->blend_api_px_func != NULL) { + test_case->blend_api_px_func(&dsc_asm, 3); // Call the LVGL API with Assembly code + test_case->blend_api_px_func(&dsc_ansi, 3); // Call the LVGL API with ANSI code + } // Shift array pointers by Canary Bytes amount back test_case->buf.p_asm -= CANARY_BYTES * test_case->data_type_size; @@ -216,6 +252,11 @@ static void lv_fill_functionality(func_test_case_params_t *test_case) break; } + case 
LV_COLOR_FORMAT_RGB888: { + test_eval_24bit_data(test_case); + break; + } + default: TEST_ASSERT_MESSAGE(false, "LV Color format not found"); } @@ -233,6 +274,7 @@ static void fill_test_bufs(func_test_case_params_t *test_case) const unsigned int unalign_byte = test_case->unalign_byte; // Allocate destination arrays for Assembly and ANSI LVGL Blend API + void *mem_asm = memalign(16, (total_buf_len * data_type_size) + unalign_byte); void *mem_ansi = memalign(16, (total_buf_len * data_type_size) + unalign_byte); TEST_ASSERT_NOT_NULL_MESSAGE(mem_asm, "Lack of memory"); @@ -275,7 +317,6 @@ static void test_eval_32bit_data(func_test_case_params_t *test_case) } printf("\n"); #endif - // Canary bytes area must stay 0 TEST_ASSERT_EACH_EQUAL_UINT32_MESSAGE(0, (uint32_t *)test_case->buf.p_ansi, CANARY_BYTES, test_msg_buf); TEST_ASSERT_EACH_EQUAL_UINT32_MESSAGE(0, (uint32_t *)test_case->buf.p_asm, CANARY_BYTES, test_msg_buf); @@ -297,8 +338,6 @@ static void test_eval_16bit_data(func_test_case_params_t *test_case) } printf("\n"); #endif - - // Canary bytes area must stay 0 TEST_ASSERT_EACH_EQUAL_UINT16_MESSAGE(0, (uint16_t *)test_case->buf.p_ansi, CANARY_BYTES, test_msg_buf); TEST_ASSERT_EACH_EQUAL_UINT16_MESSAGE(0, (uint16_t *)test_case->buf.p_asm, CANARY_BYTES, test_msg_buf); @@ -309,3 +348,32 @@ static void test_eval_16bit_data(func_test_case_params_t *test_case) TEST_ASSERT_EACH_EQUAL_UINT16_MESSAGE(0, (uint16_t *)test_case->buf.p_ansi + (test_case->total_buf_len - CANARY_BYTES), CANARY_BYTES, test_msg_buf); TEST_ASSERT_EACH_EQUAL_UINT16_MESSAGE(0, (uint16_t *)test_case->buf.p_asm + (test_case->total_buf_len - CANARY_BYTES), CANARY_BYTES, test_msg_buf); } + +static void test_eval_24bit_data(func_test_case_params_t *test_case) +{ + // Print results, 24bit data +#if DBG_PRINT_OUTPUT + size_t data_type_size = test_case->data_type_size; + for (uint32_t i = 0; i < test_case->total_buf_len; i++) { + uint32_t ansi_value = ((uint8_t *)test_case->buf.p_ansi)[i * data_type_size] + | 
(((uint8_t *)test_case->buf.p_ansi)[i * data_type_size + 1] << 8) + | (((uint8_t *)test_case->buf.p_ansi)[i * data_type_size + 2] << 16); + uint32_t asm_value = ((uint8_t *)test_case->buf.p_asm)[i * data_type_size] + | (((uint8_t *)test_case->buf.p_asm)[i * data_type_size + 1] << 8) + | (((uint8_t *)test_case->buf.p_asm)[i * data_type_size + 2] << 16); + printf("dest_buf[%"PRIi32"] %s ansi = %8"PRIx32" \t asm = %8"PRIx32" \n", i, ((i < 10) ? (" ") : ("")), ansi_value, asm_value); + } + printf("\n"); +#endif + + // Canary bytes area must stay 0 + TEST_ASSERT_EACH_EQUAL_UINT8_MESSAGE(0, (uint8_t *)test_case->buf.p_ansi, CANARY_BYTES * test_case->data_type_size, test_msg_buf); + TEST_ASSERT_EACH_EQUAL_UINT8_MESSAGE(0, (uint8_t *)test_case->buf.p_asm, CANARY_BYTES * test_case->data_type_size, test_msg_buf); + + // dest_buf_asm and dest_buf_ansi must be equal + TEST_ASSERT_EQUAL_UINT8_ARRAY_MESSAGE((uint8_t *)test_case->buf.p_asm + CANARY_BYTES * test_case->data_type_size, (uint8_t *)test_case->buf.p_ansi + CANARY_BYTES * test_case->data_type_size, test_case->active_buf_len * test_case->data_type_size, test_msg_buf); + + // Canary bytes area must stay 0 + TEST_ASSERT_EACH_EQUAL_UINT8_MESSAGE(0, (uint8_t *)test_case->buf.p_ansi + test_case->active_buf_len * test_case->data_type_size + CANARY_BYTES * test_case->data_type_size, CANARY_BYTES * test_case->data_type_size, test_msg_buf); + TEST_ASSERT_EACH_EQUAL_UINT8_MESSAGE(0, (uint8_t *)test_case->buf.p_asm + test_case->active_buf_len * test_case->data_type_size + CANARY_BYTES * test_case->data_type_size, CANARY_BYTES * test_case->data_type_size, test_msg_buf); +}