From 1f6c3291ac6a0b0a9c2c0ad3fdb9fb59bd7ebea0 Mon Sep 17 00:00:00 2001
From: "peter.marcisovsky"
Date: Tue, 17 Dec 2024 15:10:43 +0100
Subject: [PATCH] feat(lvgl_port_simd): RGB565 image blend to RGB565

- RGB565 blend to RGB565 (optimized memcpy for RGB565 type)
- esp32s3 assembly implementation using SIMD instructions
- esp32 assembly fallback
---
 components/esp_lvgl_port/CMakeLists.txt       |   5 +
 .../include/esp_lvgl_port_lv_blend.h          |  20 +
 .../src/lvgl9/simd/lv_macro_memcpy.S          |  60 +++
 .../lv_rgb565_blend_normal_to_rgb565_esp32.S  | 264 +++++++++++++
 ...lv_rgb565_blend_normal_to_rgb565_esp32s3.S | 372 ++++++++++++++++++
 .../esp_lvgl_port/test_apps/simd/README.md    |  13 +-
 .../test_apps/simd/main/CMakeLists.txt        |  12 +-
 .../main/lv_blend/include/lv_draw_sw_blend.h  |   1 +
 .../simd/main/lv_blend/include/lv_string.h    |  79 ++++
 .../simd/main/lv_blend/include/lv_types.h     |   2 +
 .../src/lv_draw_sw_blend_to_argb8888.c        |  10 +-
 .../lv_blend/src/lv_draw_sw_blend_to_rgb565.c |   8 +-
 .../main/lv_blend/src/lv_string_builtin.c     | 188 +++++++++
 .../test_apps/simd/main/lv_fill_common.h      |   2 +-
 .../test_apps/simd/main/lv_image_common.h     | 111 ++++++
 .../simd/main/test_lv_image_benchmark.c       | 171 ++++++++
 .../simd/main/test_lv_image_functionality.c   | 351 +++++++++++++++++
 17 files changed, 1658 insertions(+), 11 deletions(-)
 create mode 100644 components/esp_lvgl_port/src/lvgl9/simd/lv_macro_memcpy.S
 create mode 100644 components/esp_lvgl_port/src/lvgl9/simd/lv_rgb565_blend_normal_to_rgb565_esp32.S
 create mode 100644 components/esp_lvgl_port/src/lvgl9/simd/lv_rgb565_blend_normal_to_rgb565_esp32s3.S
 create mode 100644 components/esp_lvgl_port/test_apps/simd/main/lv_blend/include/lv_string.h
 create mode 100644 components/esp_lvgl_port/test_apps/simd/main/lv_blend/src/lv_string_builtin.c
 create mode 100644 components/esp_lvgl_port/test_apps/simd/main/lv_image_common.h
 create mode 100644 components/esp_lvgl_port/test_apps/simd/main/test_lv_image_benchmark.c
 create mode 100644 components/esp_lvgl_port/test_apps/simd/main/test_lv_image_functionality.c

diff --git a/components/esp_lvgl_port/CMakeLists.txt b/components/esp_lvgl_port/CMakeLists.txt
index 8dc53693..4a84cbf0 100644
--- a/components/esp_lvgl_port/CMakeLists.txt
+++ b/components/esp_lvgl_port/CMakeLists.txt
@@ -85,6 +85,10 @@ if((lvgl_ver VERSION_GREATER_EQUAL "9.1.0") AND (lvgl_ver VERSION_LESS "9.2.0"))
     else()
         file(GLOB_RECURSE ASM_SRCS ${PORT_PATH}/simd/*_esp32.S)     # Select only esp32 related files
     endif()
+
+    # Explicitly add all assembly macro files
+    file(GLOB_RECURSE ASM_MACROS ${PORT_PATH}/simd/lv_macro_*.S)
+    list(APPEND ADD_SRCS ${ASM_MACROS})
     list(APPEND ADD_SRCS ${ASM_SRCS})
 
     # Include component libraries, so lvgl component would see lvgl_port includes
@@ -94,6 +98,7 @@ if((lvgl_ver VERSION_GREATER_EQUAL "9.1.0") AND (lvgl_ver VERSION_LESS "9.2.0"))
         # Force link .S files
         set_property(TARGET ${COMPONENT_LIB} APPEND PROPERTY INTERFACE_LINK_LIBRARIES "-u lv_color_blend_to_argb8888_esp")
         set_property(TARGET ${COMPONENT_LIB} APPEND PROPERTY INTERFACE_LINK_LIBRARIES "-u lv_color_blend_to_rgb565_esp")
+        set_property(TARGET ${COMPONENT_LIB} APPEND PROPERTY INTERFACE_LINK_LIBRARIES "-u lv_rgb565_blend_normal_to_rgb565_esp")
     endif()
 endif()

diff --git a/components/esp_lvgl_port/include/esp_lvgl_port_lv_blend.h b/components/esp_lvgl_port/include/esp_lvgl_port_lv_blend.h
index c00de1c0..999153fe 100644
--- a/components/esp_lvgl_port/include/esp_lvgl_port_lv_blend.h
+++ b/components/esp_lvgl_port/include/esp_lvgl_port_lv_blend.h
@@ -32,6 +32,10 @@ extern "C" {
     _lv_color_blend_to_rgb565_esp(dsc)
 #endif
 
+#ifndef LV_DRAW_SW_RGB565_BLEND_NORMAL_TO_RGB565
+#define LV_DRAW_SW_RGB565_BLEND_NORMAL_TO_RGB565(dsc) \
+    _lv_rgb565_blend_normal_to_rgb565_esp(dsc)
+#endif
 
 /**********************
  *      TYPEDEFS
  **********************/
@@ -83,6 +87,22 @@ static inline lv_result_t _lv_color_blend_to_rgb565_esp(_lv_draw_sw_blend_fill_d
     return lv_color_blend_to_rgb565_esp(&asm_dsc);
 }
 
+extern int lv_rgb565_blend_normal_to_rgb565_esp(asm_dsc_t *asm_dsc);
+
+static inline lv_result_t _lv_rgb565_blend_normal_to_rgb565_esp(_lv_draw_sw_blend_image_dsc_t * dsc)
+{
+    asm_dsc_t asm_dsc = {
+        .dst_buf = dsc->dest_buf,
+        .dst_w = dsc->dest_w,
+        .dst_h = dsc->dest_h,
+        .dst_stride = dsc->dest_stride,
+        .src_buf = dsc->src_buf,
+        .src_stride = dsc->src_stride
+    };
+
+    return lv_rgb565_blend_normal_to_rgb565_esp(&asm_dsc);
+}
+
 #endif // CONFIG_LV_DRAW_SW_ASM_CUSTOM
 
 #ifdef __cplusplus

diff --git a/components/esp_lvgl_port/src/lvgl9/simd/lv_macro_memcpy.S b/components/esp_lvgl_port/src/lvgl9/simd/lv_macro_memcpy.S
new file mode 100644
index 00000000..377f9285
--- /dev/null
+++ b/components/esp_lvgl_port/src/lvgl9/simd/lv_macro_memcpy.S
@@ -0,0 +1,60 @@
+/*
+ * SPDX-FileCopyrightText: 2025 Espressif Systems (Shanghai) CO LTD
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ */
+
+// Memcpy macros for modulo checking
+// After the main loop has run, the bytes remaining outside of the main loop still have to be copied
+// The macros work with both aligned and unaligned (4-byte boundary) memory,
+// but performance is significantly lower for unaligned memory, because of the unaligned-memory-access exception
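+//
+// For reference, a hedged C sketch of the tail handling these macros implement,
+// where len is the remaining byte count and each of its low bits is tested once
+// (an illustration, not the authoritative implementation):
+//
+//     if (len & 8) { memcpy(dest, src, 8); src += 8; dest += 8; }
+//     if (len & 4) { memcpy(dest, src, 4); src += 4; dest += 4; }
+//     if (len & 2) { memcpy(dest, src, 2); src += 2; dest += 2; }
+//     if (len & 1) { *dest++ = *src++; }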
+
+// Macro for checking modulo 8
+    .macro macro_memcpy_mod_8 src_buf, dest_buf, condition, x1, x2, JUMP_TAG
+    // Check bit 3 of \condition; if set, copy 8 bytes
+    bbci    \condition, 3, ._mod_8_check_\JUMP_TAG  // Branch if bit 3 of \condition is clear
+    l32i.n  \x1, \src_buf, 0                        // Load 32 bits from \src_buf to \x1, offset 0
+    l32i.n  \x2, \src_buf, 4                        // Load 32 bits from \src_buf to \x2, offset 4
+    s32i.n  \x1, \dest_buf, 0                       // Save 32 bits from \x1 to \dest_buf, offset 0
+    s32i.n  \x2, \dest_buf, 4                       // Save 32 bits from \x2 to \dest_buf, offset 4
+    addi.n  \src_buf, \src_buf, 8                   // Increment \src_buf pointer by 8
+    addi.n  \dest_buf, \dest_buf, 8                 // Increment \dest_buf pointer by 8
+    ._mod_8_check_\JUMP_TAG:
+.endm   // macro_memcpy_mod_8
+
+
+// Macro for checking modulo 4
+    .macro macro_memcpy_mod_4 src_buf, dest_buf, condition, x1, JUMP_TAG
+    // Check bit 2 of \condition; if set, copy 4 bytes
+    bbci    \condition, 2, ._mod_4_check_\JUMP_TAG  // Branch if bit 2 of \condition is clear
+    l32i.n  \x1, \src_buf, 0                        // Load 32 bits from \src_buf to \x1, offset 0
+    addi.n  \src_buf, \src_buf, 4                   // Increment \src_buf pointer by 4
+    s32i.n  \x1, \dest_buf, 0                       // Save 32 bits from \x1 to \dest_buf, offset 0
+    addi.n  \dest_buf, \dest_buf, 4                 // Increment \dest_buf pointer by 4
+    ._mod_4_check_\JUMP_TAG:
+.endm   // macro_memcpy_mod_4
+
+
+// Macro for checking modulo 2
+    .macro macro_memcpy_mod_2 src_buf, dest_buf, condition, x1, JUMP_TAG
+    // Check bit 1 of \condition; if set, copy 2 bytes
+    bbci    \condition, 1, ._mod_2_check_\JUMP_TAG  // Branch if bit 1 of \condition is clear
+    l16ui   \x1, \src_buf, 0                        // Load 16 bits from \src_buf to \x1, offset 0
+    addi.n  \src_buf, \src_buf, 2                   // Increment \src_buf pointer by 2
+    s16i    \x1, \dest_buf, 0                       // Save 16 bits from \x1 to \dest_buf, offset 0
+    addi.n  \dest_buf, \dest_buf, 2                 // Increment \dest_buf pointer by 2
+    ._mod_2_check_\JUMP_TAG:
+.endm   // macro_memcpy_mod_2
+
+
+// Macro for checking modulo 1
+    .macro macro_memcpy_mod_1 src_buf, dest_buf, condition, x1, JUMP_TAG
+    // Check bit 0 of \condition; if set, copy 1 byte
+    bbci    \condition, 0, ._mod_1_check_\JUMP_TAG  // Branch if bit 0 of \condition is clear
+    l8ui    \x1, \src_buf, 0                        // Load 8 bits from \src_buf to \x1, offset 0
+    addi.n  \src_buf, \src_buf, 1                   // Increment \src_buf pointer by 1
+    s8i     \x1, \dest_buf, 0                       // Save 8 bits from \x1 to \dest_buf, offset 0
+    addi.n  \dest_buf, \dest_buf, 1                 // Increment \dest_buf pointer by 1
+    ._mod_1_check_\JUMP_TAG:
+.endm   // macro_memcpy_mod_1
\ No newline at end of file

diff --git a/components/esp_lvgl_port/src/lvgl9/simd/lv_rgb565_blend_normal_to_rgb565_esp32.S b/components/esp_lvgl_port/src/lvgl9/simd/lv_rgb565_blend_normal_to_rgb565_esp32.S
new file mode 100644
index 00000000..7752aae4
--- /dev/null
+++ b/components/esp_lvgl_port/src/lvgl9/simd/lv_rgb565_blend_normal_to_rgb565_esp32.S
@@ -0,0 +1,264 @@
+/*
+ * SPDX-FileCopyrightText: 2024 Espressif Systems (Shanghai) CO LTD
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ */
+
+#include "lv_macro_memcpy.S"    // Memcpy macros
+
+// This is LVGL RGB565 image blend to RGB565 for the ESP32 processor
+
+    .section .text
+    .align  4
+    .global lv_rgb565_blend_normal_to_rgb565_esp
+    .type   lv_rgb565_blend_normal_to_rgb565_esp,@function
+// The function implements the following C code:
+// void rgb565_image_blend(_lv_draw_sw_blend_image_dsc_t * dsc);
+
+// Input params
+//
+// dsc - a2
+
+// typedef struct {
+//     uint32_t opa;                  l32i    0
+//     void * dst_buf;                l32i    4
+//     uint32_t dst_w;                l32i    8
+//     uint32_t dst_h;                l32i    12
+//     uint32_t dst_stride;           l32i    16
+//     const void * src_buf;          l32i    20
+//     uint32_t src_stride;           l32i    24
+//     const lv_opa_t * mask_buf;     l32i    28
+//     uint32_t mask_stride;          l32i    32
+// } asm_dsc_t;
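+
+// For orientation, a hedged C sketch of the copy this routine performs
+// (an illustration only; strides are in bytes, dst_w is in pixels):
+//
+//     uint8_t *dest = dsc->dst_buf;
+//     const uint8_t *src = dsc->src_buf;
+//     for (uint32_t y = 0; y < dsc->dst_h; y++) {
+//         memcpy(dest, src, dsc->dst_w * sizeof(uint16_t));
+//         dest += dsc->dst_stride;
+//         src  += dsc->src_stride;
+//     }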
+
+lv_rgb565_blend_normal_to_rgb565_esp:
+
+    entry    a1, 32
+    l32i.n   a3, a2, 4                      // a3 - dest_buff
+    l32i.n   a4, a2, 8                      // a4 - dest_w in uint16_t
+    l32i.n   a5, a2, 12                     // a5 - dest_h in uint16_t
+    l32i.n   a6, a2, 16                     // a6 - dest_stride in bytes
+    l32i.n   a7, a2, 20                     // a7 - src_buff
+    l32i.n   a8, a2, 24                     // a8 - src_stride in bytes
+    slli     a11, a4, 1                     // a11 - dest_w_bytes = sizeof(uint16_t) * dest_w
+
+    // No need to convert any colors here, we are copying from rgb565 to rgb565
+
+    // Check dest_w length
+    bltui    a4, 8, _matrix_width_check     // Branch if dest_w (a4) is lower than 8
+
+    // Check memory alignment and input parameter lengths, and decide which implementation to use
+    movi.n   a10, 0x3                       // a10 = 0x3 alignment mask (4-byte alignment)
+    or       a15, a7, a3                    // a15 = src_buff (a7) OR dest_buff (a3)
+    or       a15, a15, a6                   // a15 = a15 OR dest_stride (a6)
+    or       a15, a15, a8                   // a15 = a15 OR src_stride (a8)
+    or       a15, a15, a11                  // a15 = a15 OR dest_w_bytes (a11)
+    and      a15, a15, a10                  // a15 = a15 AND alignment mask (a10)
+    bnez     a15, _alignment_check          // Branch if a15 does not equal zero
+
+//**********************************************************************************************************************
+
+    // The most ideal case - both arrays aligned, both strides and dest_w_bytes are multiples of 4
+
+    // dest_buff (a3)     - 4-byte aligned
+    // src_buff (a7)      - 4-byte aligned
+    // dest_stride (a6)   - 4-byte multiple
+    // src_stride (a8)    - 4-byte multiple
+    // dest_w_bytes (a11) - 4-byte multiple
+
+    srli     a9, a4, 3                      // a9 - loop_len = dest_w / 8
+    // Convert strides to matrix paddings
+    sub      a6, a6, a11                    // dest_matrix_padding (a6) = dest_stride (a6) - dest_w_bytes (a11)
+    sub      a8, a8, a11                    // src_matrix_padding (a8) = src_stride (a8) - dest_w_bytes (a11)
+
+    .outer_loop_align:
+
+    // Run main loop which copies 16 bytes (8 RGB565 pixels) in one loop run
+    loopnez  a9, ._main_loop_aligned
+        l32i.n   a15, a7, 0                 // Load 32 bits from src_buff a7 to a15, offset 0
+        l32i.n   a14, a7, 4                 // Load 32 bits from src_buff a7 to a14, offset 4
+        l32i.n   a13, a7, 8                 // Load 32 bits from src_buff a7 to a13, offset 8
+        l32i.n   a12, a7, 12                // Load 32 bits from src_buff a7 to a12, offset 12
+        s32i.n   a15, a3, 0                 // Save 32 bits from a15 to dest_buff a3, offset 0
+        s32i.n   a14, a3, 4                 // Save 32 bits from a14 to dest_buff a3, offset 4
+        s32i.n   a13, a3, 8                 // Save 32 bits from a13 to dest_buff a3, offset 8
+        s32i.n   a12, a3, 12                // Save 32 bits from a12 to dest_buff a3, offset 12
+        addi.n   a7, a7, 16                 // Increment src_buff pointer a7 by 16
+        addi.n   a3, a3, 16                 // Increment dest_buff pointer a3 by 16
+    ._main_loop_aligned:
+
+    // Finish the remaining bytes out of the main loop
+
+    // Check modulo 8 of the dest_w_bytes (a11); if set, copy 8 bytes (4 RGB565 pixels)
+    // src_buff a7, dest_buff a3, dest_w_bytes a11, copy registers a14 a15
+    macro_memcpy_mod_8 a7, a3, a11, a14, a15, __LINE__
+
+    // Check modulo 4 of the dest_w_bytes (a11); if set, copy 4 bytes (2 RGB565 pixels)
+    // src_buff a7, dest_buff a3, dest_w_bytes a11, copy register a15
+    macro_memcpy_mod_4 a7, a3, a11, a15, __LINE__
+
+    // Check modulo 2 of the dest_w_bytes (a11); if set, copy 2 bytes (1 RGB565 pixel)
+    // src_buff a7, dest_buff a3, dest_w_bytes a11, copy register a15
+    macro_memcpy_mod_2 a7, a3, a11, a15, __LINE__
+
+    // Check modulo 1 of the dest_w_bytes (a11); if set, copy 1 byte (1/2 RGB565 pixel)
+    // src_buff a7, dest_buff a3, dest_w_bytes a11, copy register a15
+    macro_memcpy_mod_1 a7, a3, a11, a15, __LINE__
+
+    add      a3, a3, a6                     // dest_buff (a3) = dest_buff (a3) + dest_matrix_padding (a6)
+    add      a7, a7, a8                     // src_buff (a7) = src_buff (a7) + src_matrix_padding (a8)
+    addi.n   a5, a5, -1                     // Decrease the outer loop
+    bnez     a5, .outer_loop_align
+
+    movi.n   a2, 1                          // Return LV_RESULT_OK = 1
+    retw.n                                  // Return
+
+
+//**********************************************************************************************************************
+
+    // The most general case - at least one array is not aligned, or one parameter is not a multiple of 4
+    _alignment_check:
+
+    // dest_buff (a3)     - 4-byte aligned, or not
+    // src_buff (a7)      - 4-byte aligned, or not
+    // dest_stride (a6)   - 4-byte multiple, or not
+    // src_stride (a8)    - 4-byte multiple, or not
+    // dest_w_bytes (a11) - 4-byte multiple, or not
+
+    // Convert strides to matrix paddings
+    sub      a6, a6, a11                    // dest_matrix_padding (a6) = dest_stride (a6) - dest_w_bytes (a11)
+    sub      a8, a8, a11                    // src_matrix_padding (a8) = src_stride (a8) - dest_w_bytes (a11)
+
+    .outer_loop_unalign:
+
+    extui    a13, a3, 0, 2                  // Get the last two bits of the dest_buff address a3, to a13
+    movi.n   a15, 4                         // Move 4 to a15, for calculation of the destination alignment loop
+    sub      a14, a15, a13                  // Calculate destination alignment loop length (a14 = 4 - a13)
+
+    // In case the dest_buff a3 is already aligned (for example by matrix padding), correct the a14 value
+    // to prevent the destination-aligning loop from running 4 times (i.e. from aligning already-aligned memory)
+    moveqz   a14, a13, a13                  // If a13 is zero, move a13 to a14 (a14 = a13 = 0)
+
+    sub      a10, a11, a14                  // Get the dest_w_bytes left after the aligning loop
+    srli     a9, a10, 4                     // Calculate the main loop length (a9 = dest_w_bytes_local / 16)
+
+    // Run dest_buff aligning loop byte by byte
+    loopnez  a14, ._dest_aligning_loop
+        l8ui     a15, a7, 0                 // Load 8 bits from src_buff a7 to a15, offset 0
+        addi.n   a7, a7, 1                  // Increment src_buff pointer a7 by 1
+        s8i      a15, a3, 0                 // Save 8 bits from a15 to dest_buff a3, offset 0
+        addi.n   a3, a3, 1                  // Increment dest_buff pointer a3 by 1
+    ._dest_aligning_loop:
+
+    // Destination is aligned, source is unaligned
+
+    // For more information about this implementation, see chapter 3.3.2 Shifts and the Shift Amount Register (SAR)
+    // in the Xtensa Instruction Set Architecture (ISA) Reference Manual
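+
+    // Hedged illustration of the funnel-shift technique used below (not the
+    // authoritative semantics): with SAR set from the source address, each SRC
+    // instruction combines two adjacent aligned words so that every 32-bit store
+    // delivers 4 bytes of the original unaligned source stream, conceptually:
+    //
+    //     out = funnel_shift(next_word, prev_word, SAR_BYTE);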
+
+    ssa8l    a7                             // Set SAR_BYTE from the src_buff a7 unalignment
+    extui    a4, a7, 0, 2                   // Get the last 2 bits of the src_buff, a4 = src_buff_unalignment
+    sub      a7, a7, a4                     // "Align" the src_buff a7 to a 4-byte boundary by decreasing its pointer to the nearest aligned boundary
+
+    // First preload for the loopnez cycle
+    l32i.n   a15, a7, 0                     // Load 32 bits from 4-byte aligned src_buff a7 to a15, offset 0
+
+    // Run main loop which copies 16 bytes (8 RGB565 pixels) in one loop run
+    loopnez  a9, ._main_loop_unalign
+        l32i.n   a14, a7, 4                 // Load 32 bits from 4-byte aligned src_buff a7 to a14, offset 4
+        l32i.n   a13, a7, 8                 // Load 32 bits from 4-byte aligned src_buff a7 to a13, offset 8
+        src      a15, a14, a15              // Concatenate a14 and a15 and shift by the SAR_BYTE amount to a15
+        s32i.n   a15, a3, 0                 // Save 32 bits from shift-corrected a15 to dest_buff a3, offset 0
+        l32i.n   a12, a7, 12                // Load 32 bits from 4-byte aligned src_buff a7 to a12, offset 12
+        src      a14, a13, a14              // Concatenate a13 and a14 and shift by the SAR_BYTE amount to a14
+        s32i.n   a14, a3, 4                 // Save 32 bits from shift-corrected a14 to dest_buff a3, offset 4
+        l32i.n   a15, a7, 16                // Load 32 bits from 4-byte aligned src_buff a7 to a15, offset 16
+        src      a13, a12, a13              // Concatenate a12 and a13 and shift by the SAR_BYTE amount to a13
+        s32i.n   a13, a3, 8                 // Save 32 bits from shift-corrected a13 to dest_buff a3, offset 8
+        addi.n   a7, a7, 16                 // Increment src_buff pointer a7 by 16
+        src      a12, a15, a12              // Concatenate a15 and a12 and shift by the SAR_BYTE amount to a12
+        s32i.n   a12, a3, 12                // Save 32 bits from shift-corrected a12 to dest_buff a3, offset 12
+        addi.n   a3, a3, 16                 // Increment dest_buff pointer a3 by 16
+    ._main_loop_unalign:
+
+    // Finish the remaining bytes out of the loop
+    // Check modulo 8 of the dest_w_bytes_local (a10); if set, copy 8 bytes
+    bbci     a10, 3, _mod_8_check           // Branch if bit 3 of dest_w_bytes_local is clear
+    l32i.n   a14, a7, 4                     // Load 32 bits from 4-byte aligned src_buff a7 to a14, offset 4
+    l32i.n   a13, a7, 8                     // Load 32 bits from 4-byte aligned src_buff a7 to a13, offset 8
+    src      a15, a14, a15                  // Concatenate a14 and a15 and shift by the SAR_BYTE amount to a15 (the value in a15 is already prepared from previous steps)
+    s32i.n   a15, a3, 0                     // Save 32 bits from shift-corrected a15 to dest_buff a3, offset 0
+    addi.n   a7, a7, 8                      // Increment src_buff pointer a7 by 8
+    src      a14, a13, a14                  // Concatenate a13 and a14 and shift by the SAR_BYTE amount to a14
+    s32i.n   a14, a3, 4                     // Save 32 bits from shift-corrected a14 to dest_buff a3, offset 4
+    addi.n   a3, a3, 8                      // Increment dest_buff pointer a3 by 8
+    mov      a15, a13                       // Prepare a15 for the next steps (copy a13 to a15)
+    _mod_8_check:
+
+    // Check modulo 4 of the dest_w_bytes_local (a10); if set, copy 4 bytes
+    bbci     a10, 2, _mod_4_check           // Branch if bit 2 of dest_w_bytes_local is clear
+    l32i.n   a14, a7, 4                     // Load 32 bits from 4-byte aligned src_buff a7 to a14, offset 4
+    addi.n   a7, a7, 4                      // Increment src_buff pointer a7 by 4
+    src      a15, a14, a15                  // Concatenate a14 and a15 and shift by the SAR_BYTE amount to a15 (the value in a15 is already prepared from previous steps)
+    s32i.n   a15, a3, 0                     // Save 32 bits from shift-corrected a15 to dest_buff a3, offset 0
+    addi.n   a3, a3, 4                      // Increment dest_buff pointer a3 by 4
+    mov      a15, a14                       // Prepare a15 for the next steps (copy a14 to a15)
+    _mod_4_check:
+
+    extui    a13, a10, 0, 2                 // Get the last 2 bits of the dest_w_bytes_local (a10), a13 = a10[1:0], to find out how many bytes still need to be copied and to increase the src and dest pointers accordingly
+    beqz     a13, _mod_1_2_check            // Branch if a13 equals zero, i.e. there are no bytes left to copy
+    l32i.n   a14, a7, 4                     // Load 32 bits from 4-byte aligned src_buff a7 to a14, offset 4
+    l32i.n   a12, a3, 0                     // Get the dest_buff value: load 32 bits from 4-byte aligned dest_buff a3 to a12, offset 0
+    src      a15, a14, a15                  // Concatenate a14 and a15 and shift by the SAR_BYTE amount to a15 (the value in a15 is already prepared from previous steps)
+    ssa8l    a10                            // Set SAR_BYTE from the dest_w_bytes_local a10 length
+    sll      a15, a15                       // Shift the src word a15 left by the SAR_BYTE amount
+    srl      a12, a12                       // Shift the dest word a12 right by the SAR_BYTE amount
+    ssa8b    a10                            // Set SAR_BYTE from the dest_w_bytes_local a10 length
+    src      a12, a12, a15                  // Concatenate a12 and a15 and shift by the SAR_BYTE amount to a12
+    s32i.n   a12, a3, 0                     // Save 32 bits from shift-corrected a12 to dest_buff a3, offset 0
+    add      a7, a7, a13                    // Increment src_buff pointer a7 by the amount of copied bytes (a13)
+    add      a3, a3, a13                    // Increment dest_buff pointer a3 by the amount of copied bytes (a13)
+    _mod_1_2_check:
+
+    add      a7, a7, a4                     // Correct the src_buff back by src_buff_unalignment (a4), after it was force-aligned to a 4-byte boundary before the main loop
+    add      a3, a3, a6                     // dest_buff (a3) = dest_buff (a3) + dest_matrix_padding (a6)
+    add      a7, a7, a8                     // src_buff (a7) = src_buff (a7) + src_matrix_padding (a8)
+    addi.n   a5, a5, -1                     // Decrease the outer loop
+    bnez     a5, .outer_loop_unalign
+
+    movi.n   a2, 1                          // Return LV_RESULT_OK = 1
+    retw.n                                  // Return
+
+//**********************************************************************************************************************
+
+    // Small matrix width - keep it simple for widths of less than 8 pixels
+    _matrix_width_check:                    // Matrix width is less than 8 pixels
+
+    // Convert strides to matrix paddings
+    sub      a6, a6, a11                    // dest_matrix_padding (a6) = dest_stride (a6) - dest_w_bytes (a11)
+    sub      a8, a8, a11                    // src_matrix_padding (a8) = src_stride (a8) - dest_w_bytes (a11)
+
+    .outer_loop_short_matrix_length:
+
+    // Run main loop which copies 2 bytes (one RGB565 pixel) in one loop run
+    loopnez  a4, ._main_loop_short_matrix_length
+        l8ui     a15, a7, 0                 // Load 8 bits from src_buff a7 to a15, offset 0
+        l8ui     a14, a7, 1                 // Load 8 bits from src_buff a7 to a14, offset 1
+        s8i      a15, a3, 0                 // Save 8 bits from a15 to dest_buff a3, offset 0
+        s8i      a14, a3, 1                 // Save 8 bits from a14 to dest_buff a3, offset 1
+        addi.n   a7, a7, 2                  // Increment src_buff pointer a7 by 2
+        addi.n   a3, a3, 2                  // Increment dest_buff pointer a3 by 2
+    ._main_loop_short_matrix_length:
+
+    // Finish the remaining byte out of the main loop
+
+    // Check modulo 1 of the dest_w_bytes (a11); if set, copy 1 byte (1/2 RGB565 pixel)
+    // src_buff a7, dest_buff a3, dest_w_bytes a11, copy register a15
+    macro_memcpy_mod_1 a7, a3, a11, a15, __LINE__
+
+    add      a3, a3, a6                     // dest_buff (a3) = dest_buff (a3) + dest_matrix_padding (a6)
+    add      a7, a7, a8                     // src_buff (a7) = src_buff (a7) + src_matrix_padding (a8)
+    addi.n   a5, a5, -1                     // Decrease the outer loop
+    bnez     a5, .outer_loop_short_matrix_length
+
+    movi.n   a2, 1                          // Return LV_RESULT_OK = 1
+    retw.n                                  // Return
\ No newline at end of file
diff --git a/components/esp_lvgl_port/src/lvgl9/simd/lv_rgb565_blend_normal_to_rgb565_esp32s3.S b/components/esp_lvgl_port/src/lvgl9/simd/lv_rgb565_blend_normal_to_rgb565_esp32s3.S
new file mode 100644
index 00000000..a0dc9066
--- /dev/null
+++ b/components/esp_lvgl_port/src/lvgl9/simd/lv_rgb565_blend_normal_to_rgb565_esp32s3.S
@@ -0,0 +1,372 @@
+/*
+ * SPDX-FileCopyrightText: 2024 Espressif Systems (Shanghai) CO LTD
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ */
+
+#include "lv_macro_memcpy.S"    // Memcpy macros
+
+// This is LVGL RGB565 image blend to RGB565 for the ESP32-S3 processor
+
+    .section .text
+    .align  4
+    .global lv_rgb565_blend_normal_to_rgb565_esp
+    .type   lv_rgb565_blend_normal_to_rgb565_esp,@function
+// The function implements the following C code:
+// void rgb565_image_blend(_lv_draw_sw_blend_image_dsc_t * dsc);
+
+// Input params
+//
+// dsc - a2
+
+// typedef struct {
+//     uint32_t opa;                  l32i    0
+//     void * dst_buf;                l32i    4
+//     uint32_t dst_w;                l32i    8
+//     uint32_t dst_h;                l32i    12
+//     uint32_t dst_stride;           l32i    16
+//     const void * src_buf;          l32i    20
+//     uint32_t src_stride;           l32i    24
+//     const lv_opa_t * mask_buf;     l32i    28
+//     uint32_t mask_stride;          l32i    32
+// } asm_dsc_t;
+
+lv_rgb565_blend_normal_to_rgb565_esp:
+
+    entry    a1, 32
+    l32i.n   a3, a2, 4                      // a3 - dest_buff
+    l32i.n   a4, a2, 8                      // a4 - dest_w in uint16_t
+    l32i.n   a5, a2, 12                     // a5 - dest_h in uint16_t
+    l32i.n   a6, a2, 16                     // a6 - dest_stride in bytes
+    l32i.n   a7, a2, 20                     // a7 - src_buff
+    l32i.n   a8, a2, 24                     // a8 - src_stride in bytes
+    movi.n   a10, 0xf                       // 0xf alignment mask (16-byte alignment)
+    slli     a11, a4, 1                     // a11 - dest_w_bytes = sizeof(uint16_t) * dest_w
+
+    // No need to convert any colors here, we are copying from rgb565 to rgb565
+
+    // Check dest_w length
+    bltui    a4, 8, _matrix_width_check     // Branch if dest_w (a4) is lower than 8
+
+    // Check dest_buff alignment first
+    and      a15, a10, a3                   // 16-byte alignment mask AND dest_buff pointer a3
+    bnez     a15, _src_unalign_dest_unalign // Branch if a15 does not equal zero
+    // Jump straight to the last implementation, since it is the only one which deals with unaligned destination arrays
+
+    // Check src_buff alignment
+    and      a15, a10, a7                   // 16-byte alignment mask AND src_buff pointer a7
+    bnez     a15, _src_align_dest_unalign   // Branch if a15 does not equal zero
+    // Jump to check whether the second or third implementation can be used (depends on both strides and dest_w)
+
+    // Check dest_stride alignment
+    and      a15, a10, a6                   // 16-byte alignment mask AND dest_stride a6
+    bnez     a15, _src_unalign_dest_unalign // Branch if a15 does not equal zero
+    // Jump straight to the last implementation, since it is the only one which deals with an unaligned destination stride
+
+    // Check src_stride alignment
+    and      a15, a10, a8                   // 16-byte alignment mask AND src_stride a8
+    bnez     a15, _src_align_dest_unalign   // Branch if a15 does not equal zero
+    // Jump to check whether the second or third implementation can be used (depends on dest_w_bytes)
+
+    // Check dest_w_bytes alignment
+    and      a15, a10, a11                  // 16-byte alignment mask AND dest_w_bytes
+    bnez     a15, _src_unalign_dest_unalign // Branch if a15 does not equal zero
+    // Jump straight to the last implementation, since it is the only one which deals with unaligned dest_w_bytes
+
+//**********************************************************************************************************************
+
+    // The most ideal case - both arrays aligned, both strides and dest_w_bytes are multiples of 16
+
+    // dest_buff (a3)     - 16-byte aligned
+    // src_buff (a7)      - 16-byte aligned
+    // dest_stride (a6)   - 16-byte multiple
+    // src_stride (a8)    - 16-byte multiple
+    // dest_w_bytes (a11) - 16-byte multiple
+
+    srli     a9, a4, 4                      // a9 - loop_len = dest_w / 16
+    // Convert strides to matrix paddings
+    sub      a6, a6, a11                    // dest_matrix_padding (a6) = dest_stride (a6) - dest_w_bytes (a11)
+    sub      a8, a8, a11                    // src_matrix_padding (a8) = src_stride (a8) - dest_w_bytes (a11)
+
+    .outer_loop_align:
+
+    // Run main loop which copies 32 bytes (16 RGB565 pixels) in one loop run
+    loopnez  a9, ._main_loop_align
+        ee.vld.128.ip   q0, a7, 16          // Load 16 bytes from src_buff a7 to q0, increase src_buff pointer a7 by 16
+        ee.vld.128.ip   q1, a7, 16          // Load 16 bytes from src_buff a7 to q1, increase src_buff pointer a7 by 16
+        ee.vst.128.ip   q0, a3, 16          // Store 16 bytes from q0 to dest_buff a3, increase dest_buff pointer a3 by 16
+        ee.vst.128.ip   q1, a3, 16          // Store 16 bytes from q1 to dest_buff a3, increase dest_buff pointer a3 by 16
+    ._main_loop_align:
+
+    // Finish the remaining bytes out of the main loop
+
+    // Check modulo 16 of the dest_w_bytes; if set, copy 16 bytes (8 RGB565 pixels)
+    bbci     a11, 4, _align_mod_16_check    // Branch if bit 4 of dest_w_bytes a11 is clear
+    ee.vld.128.ip   q0, a7, 16              // Load 16 bytes from src_buff a7 to q0, increase src_buff pointer a7 by 16
+    ee.vst.128.ip   q0, a3, 16              // Store 16 bytes from q0 to dest_buff a3, increase dest_buff pointer a3 by 16
+    _align_mod_16_check:
+
+    add      a3, a3, a6                     // dest_buff (a3) = dest_buff (a3) + dest_matrix_padding (a6)
+    add      a7, a7, a8                     // src_buff (a7) = src_buff (a7) + src_matrix_padding (a8)
+    addi.n   a5, a5, -1                     // Decrease the outer loop
+    bnez     a5, .outer_loop_align
+
+    movi.n   a2, 1                          // Return LV_RESULT_OK = 1
+    retw.n                                  // Return
+
+
+    _src_align_dest_unalign:
+
+    // Check dest_stride alignment
+    and      a15, a10, a6                   // 16-byte alignment mask AND dest_stride a6
+    bnez     a15, _src_unalign_dest_unalign // Branch if a15 does not equal zero
+
+    // Check dest_w_bytes alignment
+    and      a15, a10, a11                  // 16-byte alignment mask AND dest_w_bytes a11
+    bnez     a15, _src_unalign_dest_unalign // Branch if a15 does not equal zero
+
+    // We don't check src_stride alignment for this implementation, as it can be either aligned or unaligned
+
+//**********************************************************************************************************************
+
+    // Less ideal case - only the destination array is aligned, the source array is unaligned
+    // The source stride can be aligned or unaligned; the destination stride and dest_w_bytes must be aligned
+
+    // dest_buff (a3)     - 16-byte aligned
+    // src_buff (a7)      - unaligned
+    // dest_stride (a6)   - 16-byte multiple
+    // src_stride (a8)    - does not matter if 16-byte multiple
+    // dest_w_bytes (a11) - 16-byte multiple
+
+    // Convert strides to matrix paddings
+    sub      a6, a6, a11                    // dest_matrix_padding (a6) = dest_stride (a6) - dest_w_bytes (a11)
+    sub      a8, a8, a11                    // src_matrix_padding (a8) = src_stride (a8) - dest_w_bytes (a11)
+
+    // Calculate modulo for non-aligned data
+    movi     a15, 48                        // a15 = 48 (the main loop copies 48 bytes)
+    quou     a9, a11, a15                   // a9 = dest_w_bytes (a11) DIV 48 (a15)
+    remu     a12, a11, a15                  // a12 = dest_w_bytes (a11) MOD 48 (a15)
+
+    .outer_loop_src_unalign_dest_align:
+
+    ee.ld.128.usar.ip   q2, a7, 16          // Preload 16 bytes from src_buff a7 to q2, get the SAR_BYTE value, increase src_buff pointer a7 by 16
+    ee.ld.128.usar.ip   q3, a7, 16          // Preload 16 bytes from src_buff a7 to q3, get the SAR_BYTE value, increase src_buff pointer a7 by 16
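+
+    // Hedged illustration of the register rotation in the loop below: q2, q3 and q4
+    // hold a sliding window of the unaligned source; each EE.SRC.Q.LD.IP both loads
+    // the next 16 bytes and funnel-shifts the previous register pair by SAR_BYTE,
+    // so every EE.VST.128.IP can store 16 shift-corrected bytes to the aligned destination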
+
+    // Run main loop which copies 48 bytes (24 RGB565 pixels) in one loop run
+    loopnez  a9, ._main_loop_src_unalign_dest_align
+        ee.src.q.ld.ip  q4, a7, 16, q2, q3  // Load 16 bytes from src_buff a7 to q4, concatenate q2 and q3 and shift to q2 by the SAR_BYTE amount, increase src_buff pointer a7 by 16
+        ee.vst.128.ip   q2, a3, 16          // Store 16 bytes from q2 to aligned dest_buff a3, increase dest_buff pointer a3 by 16
+        ee.src.q.ld.ip  q2, a7, 16, q3, q4  // Load 16 bytes from src_buff a7 to q2, concatenate q3 and q4 and shift to q3 by the SAR_BYTE amount, increase src_buff pointer a7 by 16
+        ee.vst.128.ip   q3, a3, 16          // Store 16 bytes from q3 to aligned dest_buff a3, increase dest_buff pointer a3 by 16
+        ee.src.q.ld.ip  q3, a7, 16, q4, q2  // Load 16 bytes from src_buff a7 to q3, concatenate q4 and q2 and shift to q4 by the SAR_BYTE amount, increase src_buff pointer a7 by 16
+        ee.vst.128.ip   q4, a3, 16          // Store 16 bytes from q4 to aligned dest_buff a3, increase dest_buff pointer a3 by 16
+    ._main_loop_src_unalign_dest_align:
+
+    // Finish the main loop outside of the loop body, using the Q register preloads
+
+    // Check modulo 32 of the loop_len_remainder; if set, copy 32 bytes (16 RGB565 pixels)
+    bbci     a12, 5, _unalign_mod_32_check  // Branch if bit 5 of loop_len_remainder a12 is clear
+    ee.src.q.ld.ip  q4, a7, 0, q2, q3       // Load 16 bytes from src_buff a7 to q4, concatenate q2 and q3 and shift to q2 by the SAR_BYTE amount, don't increase src_buff pointer a7
+    ee.vst.128.ip   q2, a3, 16              // Store 16 bytes from q2 to aligned dest_buff a3, increase dest_buff pointer a3 by 16
+    ee.src.q        q3, q3, q4              // Concatenate q3 and q4 and shift to q3 by the SAR_BYTE amount
+    ee.vst.128.ip   q3, a3, 16              // Store 16 bytes from q3 to aligned dest_buff a3, increase dest_buff pointer a3 by 16
+    j _end_of_row_src_unalign_dest_align
+    _unalign_mod_32_check:
+
+    // Check modulo 16 of the loop_len_remainder; if set, copy 16 bytes (8 RGB565 pixels)
+    bbci     a12, 4, _unalign_mod_16_check  // Branch if bit 4 of loop_len_remainder a12 is clear
+    ee.src.q        q2, q2, q3              // Concatenate q2 and q3 and shift to q2 by the SAR_BYTE amount
+    ee.vst.128.ip   q2, a3, 16              // Store 16 bytes from q2 to aligned dest_buff a3, increase dest_buff pointer a3 by 16
+    addi     a7, a7, -16                    // Correct the src_buff pointer a7, caused by the q register preload
+    j _end_of_row_src_unalign_dest_align
+    _unalign_mod_16_check:
+
+    // Nothing to copy outside of the main loop
+    addi     a7, a7, -32                    // Correct the src_buff pointer a7, caused by the q register preload
+
+    _end_of_row_src_unalign_dest_align:
+
+    add      a3, a3, a6                     // dest_buff (a3) = dest_buff (a3) + dest_matrix_padding (a6)
+    add      a7, a7, a8                     // src_buff (a7) = src_buff (a7) + src_matrix_padding (a8)
+    addi.n   a5, a5, -1                     // Decrease the outer loop
+    bnez     a5, .outer_loop_src_unalign_dest_align
+
+    movi.n   a2, 1                          // Return LV_RESULT_OK = 1
+    retw.n                                  // Return
+
+
+    _src_unalign_dest_unalign:
+
+//**********************************************************************************************************************
+
+    // The most general case - can handle all possible combinations
+
+    // dest_buff (a3)     - unaligned
+    // src_buff (a7)      - unaligned
+    // dest_stride (a6)   - not a 16-byte multiple
+    // src_stride (a8)    - not a 16-byte multiple
+    // dest_w_bytes (a11) - not a 16-byte multiple
+
+    // Convert strides to matrix paddings
+    sub      a6, a6, a11                    // dest_matrix_padding (a6) = dest_stride (a6) - dest_w_bytes (a11)
+    sub      a8, a8, a11                    // src_matrix_padding (a8) = src_stride (a8) - dest_w_bytes (a11)
+
+    .outer_loop_all_unalign:
+
+    // dest_buff alignment check
+    and      a13, a10, a3                   // Alignment mask 0xf (a10) AND dest_buff pointer a3
+    beqz     a13, _dest_buff_aligned        // Branch if a13 = 0 (dest_buff is aligned)
+
+    movi.n   a14, 16                        // a14 = 16
+    sub      a13, a14, a13                  // a13 = 16 - unalignment
+
+    // Check modulo 8 of the unalignment a13; if set, copy 8 bytes (4 RGB565 pixels)
+    // src_buff a7, dest_buff a3, unalignment a13, copy registers a15, a14
+    macro_memcpy_mod_8 a7, a3, a13, a15, a14, __LINE__
+
+    // Check modulo 4 of the unalignment; if set, copy 4 bytes (2 RGB565 pixels)
+    // src_buff a7, dest_buff a3, unalignment a13, copy register a15
+    macro_memcpy_mod_4 a7, a3, a13, a15, __LINE__
+
+    // Check modulo 2 of the unalignment; if set, copy 2 bytes (1 RGB565 pixel)
+    // src_buff a7, dest_buff a3, unalignment a13, copy register a15
+    macro_memcpy_mod_2 a7, a3, a13, a15, __LINE__
+
+    // Check modulo 1 of the unalignment; if set, copy 1 byte (1/2 of an RGB565 pixel)
+    // src_buff a7, dest_buff a3, unalignment a13, copy register a15
+    macro_memcpy_mod_1 a7, a3, a13, a15, __LINE__
+
+    _dest_buff_aligned:
+
+    // Calculate modulo for non-aligned data
+    sub      a11, a11, a13                  // local_dest_w_bytes (a11) = dest_w_bytes (a11) - (16 - unalignment) (a13)
+    movi     a15, 48                        // a15 = 48
+    quou     a9, a11, a15                   // a9 = local_dest_w_bytes (a11) DIV 48 (a15)
+    remu     a12, a11, a15                  // a12 = local_dest_w_bytes (a11) MOD 48 (a15)
+
+    ee.ld.128.usar.ip   q2, a7, 16          // Preload 16 bytes from src_buff a7 to q2, get the SAR_BYTE value, increase src_buff pointer a7 by 16
+    ee.ld.128.usar.ip   q3, a7, 16          // Preload 16 bytes from src_buff a7 to q3, get the SAR_BYTE value, increase src_buff pointer a7 by 16
+
+    // Run main loop which copies 48 bytes (24 RGB565 pixels) in one loop run
+    loopnez  a9, ._main_loop_all_unalign
+        ee.src.q.ld.ip  q4, a7, 16, q2, q3  // Load 16 bytes from src_buff a7 to q4, concatenate q2 and q3 and shift to q2 by the SAR_BYTE amount, increase src_buff pointer a7 by 16
+        ee.vst.128.ip   q2, a3, 16          // Store 16 bytes from q2 to aligned dest_buff a3, increase dest_buff pointer a3 by 16
+        ee.src.q.ld.ip  q2, a7, 16, q3, q4  // Load 16 bytes from src_buff a7 to q2, concatenate q3 and q4 and shift to q3 by the SAR_BYTE amount, increase src_buff pointer a7 by 16
+        ee.vst.128.ip   q3, a3, 16          // Store 16 bytes from q3 to aligned dest_buff a3, increase dest_buff pointer a3 by 16
+        ee.src.q.ld.ip  q3, a7, 16, q4, q2  // Load 16 bytes from src_buff a7 to q3, concatenate q4 and q2 and shift to q4 by the SAR_BYTE amount, increase src_buff pointer a7 by 16
+        ee.vst.128.ip   q4, a3, 16          // Store 16 bytes from q4 to aligned dest_buff a3, increase dest_buff pointer a3 by 16
+    ._main_loop_all_unalign:
+
+    // Finish the main loop outside of the loop body, using the Q register preloads
+
+    // Check modulo 32 and modulo 8 of the loop_len_remainder a12
+    bbci     a12, 5, _all_unalign_mod_32_check        // Branch if bit 5 of loop_len_remainder a12 is clear
+    bbsi     a12, 3, _all_unalign_mod_32_mod_8_check  // Branch if bit 3 of loop_len_remainder a12 is set
+
+    // Copy 32 bytes (16 RGB565 pixels)
+    ee.src.q.ld.ip  q4, a7, 0, q2, q3       // Load 16 bytes from src_buff a7 to q4, concatenate q2 and q3 and shift to q2 by the SAR_BYTE amount, don't increase src_buff pointer a7
+    ee.vst.128.ip   q2, a3, 16              // Store 16 bytes from q2 to aligned dest_buff a3, increase dest_buff pointer a3 by 16
+    ee.src.q        q3, q3, q4              // Concatenate q3 and q4 and shift to q3 by the SAR_BYTE amount
+    ee.vst.128.ip   q3, a3, 16              // Store 16 bytes from q3 to aligned dest_buff a3, increase dest_buff pointer a3 by 16
+    j _skip_mod16
+
+    _all_unalign_mod_32_mod_8_check:
+    // Copy 40 bytes (20 RGB565 pixels)
+    ee.src.q.ld.ip  q4, a7, 16, q2, q3      // Load 16 bytes from src_buff a7 to q4, concatenate q2 and q3 and shift to q2 by the SAR_BYTE amount, increase src_buff pointer a7 by 16
+    ee.vst.128.ip   q2, a3, 16              // Store 16 bytes from q2 to aligned dest_buff a3, increase dest_buff pointer a3 by 16
+    ee.src.q.ld.ip  q2, a7, 0, q3, q4       // Load 16 bytes from src_buff a7 to q2, concatenate q3 and q4 and shift to q3 by the SAR_BYTE amount, don't increase src_buff pointer a7
+    ee.vst.128.ip   q3, a3, 16              // Store 16 bytes from q3 to aligned dest_buff a3, increase dest_buff pointer a3 by 16
+    ee.src.q        q4, q4, q2              // Concatenate q4 and q2 and shift to q4 by the SAR_BYTE amount
+    ee.vst.l.64.ip  q4, a3, 8               // Store the lower 8 bytes from q4 to aligned dest_buff a3, increase dest_buff pointer a3 by 8
+    addi     a7, a7, -8                     // Correct the src_buff pointer a7, caused by the q register preload
+    j _skip_mod16
+
+    _all_unalign_mod_32_check:
+
+    // Check modulo 16 and modulo 8 of the loop_len_remainder a12
+    bbci     a12, 4, _all_unalign_mod_16_check        // Branch if bit 4 of loop_len_remainder a12 is clear
+    bbsi     a12, 3, _all_unalign_mod_16_mod_8_check  // Branch if bit 3 of loop_len_remainder a12 is set
+
+    // Copy 16 bytes (8 RGB565 pixels)
+    ee.src.q        q2, q2, q3              // Concatenate q2 and q3 and shift to q2 by the SAR_BYTE amount
+    ee.vst.128.ip   q2, a3, 16              // Store 16 bytes from q2 to aligned dest_buff a3, increase dest_buff pointer a3 by 16
+    addi     a7, a7, -16                    // Correct the src_buff pointer a7, caused by the q register preload
+    j _skip_mod16
+
+    _all_unalign_mod_16_mod_8_check:
+    // Copy 24 bytes (12 RGB565 pixels)
+    ee.src.q.ld.ip  q4, a7, 0, q2, q3       // Load 16 bytes from src_buff a7 to q4, concatenate q2 and q3 and shift to q2 by the SAR_BYTE amount, don't increase src_buff pointer a7
+    ee.vst.128.ip   q2, a3, 16              // Store 16 bytes from q2 to aligned dest_buff a3, increase dest_buff pointer a3 by 16
+    ee.src.q        q3, q3, q4              // Concatenate q3 and q4 and shift to q3 by the SAR_BYTE amount
+    ee.vst.l.64.ip  q3, a3, 8               // Store the lower 8 bytes from q3 to aligned dest_buff a3, increase dest_buff pointer a3 by 8
+    addi     a7, a7, -8                     // Correct the src_buff pointer a7, caused by the q register preload
+    j _skip_mod16
+    _all_unalign_mod_16_check:
+
+    bbci     a12, 3, _all_unalign_mod_8_check         // Branch if bit 3 of loop_len_remainder a12 is clear
+    // Copy 8 bytes (4 RGB565 pixels)
+    ee.src.q        q2, q2, q3              // Concatenate q2 and q3 and shift to q2 by the SAR_BYTE amount
+    ee.vst.l.64.ip  q2, a3, 8               // Store the lower 8 bytes from q2 to aligned dest_buff a3, increase dest_buff pointer a3 by 8
+    addi     a7, a7, -24                    // Correct the src_buff pointer a7, caused by the q register preload
+    j _skip_mod16
+    _all_unalign_mod_8_check:
+
+    addi     a7, a7, -32                    // Correct the src_buff pointer a7, caused by the q register preload
+
+    _skip_mod16:
+
+    // Check modulo 4 of the loop_len_remainder; if set, copy 4 bytes (2 RGB565 pixels)
+    // src_buff a7, dest_buff a3, loop_len_remainder a12, copy register a15
+    macro_memcpy_mod_4 a7, a3, a12, a15, __LINE__
+
+    // Check modulo 2 of the loop_len_remainder; if set, copy 2 bytes (1 RGB565 pixel)
+    // src_buff a7, dest_buff a3, loop_len_remainder a12, copy register a15
+    macro_memcpy_mod_2 a7, a3, a12, a15, __LINE__
+
+    // Check modulo 1 of the loop_len_remainder; if set, copy 1 byte (1/2 RGB565 pixel)
+    // src_buff a7, dest_buff a3, loop_len_remainder a12, copy register a15
+    macro_memcpy_mod_1 a7, a3, a12, a15, __LINE__
+
+    slli     a11, a4, 1                     // Refresh dest_w_bytes
+    add      a3, a3, a6                     // dest_buff (a3) = dest_buff (a3) + dest_matrix_padding (a6)
+    add      a7, a7, a8                     // src_buff (a7) = src_buff (a7) + src_matrix_padding (a8)
+    addi.n   a5, a5, -1                     // Decrease the outer loop
+    bnez     a5, .outer_loop_all_unalign
+
+    movi.n   a2, 1                          // Return LV_RESULT_OK = 1
+    retw.n                                  // Return
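+
+//**********************************************************************************************************************
+
+    // Hedged C sketch of the small-width path that follows (an illustration only;
+    // the copy is done byte-wise because no alignment can be assumed):
+    //
+    //     for (uint32_t y = 0; y < dest_h; y++) {
+    //         for (uint32_t i = 0; i < dest_w_bytes; i++) { dest[i] = src[i]; }
+    //         dest += dest_stride;
+    //         src  += src_stride;
+    //     }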
+
+    // Small matrix width - keep it simple for widths of less than 8 pixels
+    _matrix_width_check:                    // Matrix width is less than 8 pixels
+
+    // Convert strides to matrix paddings
+    sub      a6, a6, a11                    // dest_matrix_padding (a6) = dest_stride (a6) - dest_w_bytes (a11)
+    sub      a8, a8, a11                    // src_matrix_padding (a8) = src_stride (a8) - dest_w_bytes (a11)
+
+    .outer_loop_short_matrix_length:
+
+    // Run main loop which copies 2 bytes (one RGB565 pixel) in one loop run
+    loopnez  a4, ._main_loop_short_matrix_length
+        l8ui     a15, a7, 0                 // Load 8 bits from src_buff a7 to a15, offset 0
+        l8ui     a14, a7, 1                 // Load 8 bits from src_buff a7 to a14, offset 1
+        s8i      a15, a3, 0                 // Save 8 bits from a15 to dest_buff a3, offset 0
+        s8i      a14, a3, 1                 // Save 8 bits from a14 to dest_buff a3, offset 1
+        addi.n   a7, a7, 2                  // Increment src_buff pointer a7 by 2
+        addi.n   a3, a3, 2                  // Increment dest_buff pointer a3 by 2
+    ._main_loop_short_matrix_length:
+
+    // Finish the remaining byte out of the main loop
+
+    // Check modulo 1 of the dest_w_bytes (a11); if set, copy 1 byte (1/2 RGB565 pixel)
+    // src_buff a7, dest_buff a3, dest_w_bytes a11, copy register a15
+    macro_memcpy_mod_1 a7, a3, a11, a15, __LINE__
+
+    add      a3, a3, a6                     // dest_buff (a3) = dest_buff (a3) + dest_matrix_padding (a6)
+    add      a7, a7, a8                     // src_buff (a7) = src_buff (a7) + src_matrix_padding (a8)
+    addi.n   a5, a5, -1                     // Decrease the outer loop
+    bnez     a5, .outer_loop_short_matrix_length
+
+    movi.n   a2, 1                          // Return LV_RESULT_OK = 1
+    retw.n                                  // Return

diff --git a/components/esp_lvgl_port/test_apps/simd/README.md b/components/esp_lvgl_port/test_apps/simd/README.md
index d319e2e3..90efa55c 100644
--- a/components/esp_lvgl_port/test_apps/simd/README.md
+++ b/components/esp_lvgl_port/test_apps/simd/README.md
@@ -4,7 +4,7 @@ Test app accommodates two types of tests: [`functionality test`](#Functionality-
 Assembly source files could be found in the [`lvgl_port`](../../src/lvgl9/simd/) component.
 Header file with the assembly function prototypes is provided into the LVGL using Kconfig option `LV_DRAW_SW_ASM_CUSTOM_INCLUDE` and can be found in the [`lvgl_port/include`](../../include/esp_lvgl_port_lv_blend.h)
 
-## Benchmark results
+## Benchmark results for LV Fill functions (memset)
 
 | Color format | Matrix size | Memory alignment | ASM version    | ANSI C version |
 | :----------- | :---------- | :--------------- | :------------- | :------------- |
@@ -15,6 +15,15 @@ Assembly source files could be found in the [`lvgl_port`](../../src/lvgl9/simd/)
 * this data was obtained by running [benchmark tests](#benchmark-test) on 128x128 16 byte aligned matrix (ideal case) and 127x127 1 byte aligned matrix (worst case)
 * the values represent cycles per sample to perform simple fill of the matrix on esp32s3
 
+## Benchmark results for LV Image functions (memcpy)
+
+| Color format | Matrix size | Memory alignment | ASM version    | ANSI C version |
+| :----------- | :---------- | :--------------- | :------------- | :------------- |
+| RGB565       | 128x128     | 16 byte          | 0.352          | 3.437          |
+|              | 127x128     | 1 byte           | 0.866          | 5.978          |
+
+* this data was obtained by running [benchmark tests](#benchmark-test) on 128x128 16 byte aligned matrix (ideal case) and 127x128 1 byte aligned matrix (worst case)
+* the values represent cycles per sample to perform memory copy between two matrices on esp32s3
+
 ## Functionality test
 * Tests, whether the HW accelerated assembly version of an LVGL function provides the same results as the ANSI version
 * A top-level flow of the functionality test:
@@ -62,6 +71,8 @@ Here's the test menu, pick your combo:
 (2)     "Test fill functionality RGB565" [fill][functionality][RGB565]
 (3)     "LV Fill benchmark ARGB8888" [fill][benchmark][ARGB8888]
 (4)     "LV Fill benchmark RGB565" [fill][benchmark][RGB565]
+(5)     "LV Image functionality RGB565 blend to RGB565" [image][functionality][RGB565]
+(6)     "LV Image benchmark RGB565 blend to RGB565" [image][benchmark][RGB565]
 
 Enter test for running.
 ```
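+
+A hedged sketch of the override mechanism (the macro itself is taken verbatim from
+`esp_lvgl_port_lv_blend.h`): when `CONFIG_LV_DRAW_SW_ASM_CUSTOM` is enabled, LVGL's
+software-blend code resolves the RGB565-to-RGB565 image blend to the port's assembly through:
+
+```c
+#ifndef LV_DRAW_SW_RGB565_BLEND_NORMAL_TO_RGB565
+#define LV_DRAW_SW_RGB565_BLEND_NORMAL_TO_RGB565(dsc) \
+    _lv_rgb565_blend_normal_to_rgb565_esp(dsc)
+#endif
+```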

diff --git a/components/esp_lvgl_port/test_apps/simd/main/CMakeLists.txt b/components/esp_lvgl_port/test_apps/simd/main/CMakeLists.txt
index 0a6d5da4..20c061ff 100644
--- a/components/esp_lvgl_port/test_apps/simd/main/CMakeLists.txt
+++ b/components/esp_lvgl_port/test_apps/simd/main/CMakeLists.txt
@@ -8,6 +8,9 @@ if(CONFIG_IDF_TARGET_ESP32 OR CONFIG_IDF_TARGET_ESP32S3)
     else()
         file(GLOB_RECURSE ASM_SOURCES ${PORT_PATH}/simd/*_esp32.S)  # Select only esp32 related files
     endif()
+
+    file(GLOB_RECURSE ASM_MACROS ${PORT_PATH}/simd/lv_macro_*.S)    # Explicitly add all assembler macro files
+
 else()
     message(WARNING "This test app is intended only for esp32 and esp32s3")
 endif()
@@ -15,7 +18,14 @@ endif()
 # Hard copy of LV files
 file(GLOB_RECURSE BLEND_SRCS lv_blend/src/*.c)
 
-idf_component_register(SRCS "test_app_main.c" "test_lv_fill_functionality.c" "test_lv_fill_benchmark.c" ${BLEND_SRCS} ${ASM_SOURCES}
+idf_component_register(SRCS "test_app_main.c"
+                            "test_lv_fill_functionality.c"      # memset tests
+                            "test_lv_fill_benchmark.c"
+                            "test_lv_image_functionality.c"     # memcpy tests
+                            "test_lv_image_benchmark.c"
+                            ${BLEND_SRCS}                       # Hard copy of LVGL's blend API, to simplify testing
+                            ${ASM_SOURCES}                      # Assembly src files
+                            ${ASM_MACROS}                       # Assembly macro files
                        INCLUDE_DIRS "lv_blend/include" "../../../include"
                        REQUIRES unity
                        WHOLE_ARCHIVE)

diff --git a/components/esp_lvgl_port/test_apps/simd/main/lv_blend/include/lv_draw_sw_blend.h b/components/esp_lvgl_port/test_apps/simd/main/lv_blend/include/lv_draw_sw_blend.h
index 01c5f769..40ab1e84 100644
--- a/components/esp_lvgl_port/test_apps/simd/main/lv_blend/include/lv_draw_sw_blend.h
+++ b/components/esp_lvgl_port/test_apps/simd/main/lv_blend/include/lv_draw_sw_blend.h
@@ -57,6 +57,7 @@ typedef struct {
     lv_color_format_t src_color_format;
     lv_opa_t opa;
     lv_blend_mode_t blend_mode;
+    bool use_asm;
 } _lv_draw_sw_blend_image_dsc_t;
 
 /**********************

diff --git a/components/esp_lvgl_port/test_apps/simd/main/lv_blend/include/lv_string.h b/components/esp_lvgl_port/test_apps/simd/main/lv_blend/include/lv_string.h
new file mode 100644
index 00000000..9747af98
--- /dev/null
+++ b/components/esp_lvgl_port/test_apps/simd/main/lv_blend/include/lv_string.h
@@ -0,0 +1,79 @@
+/*
+ * SPDX-FileCopyrightText: 2024 Espressif Systems (Shanghai) CO LTD
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * This file is derived from the LVGL project.
+ * See https://github.com/lvgl/lvgl for details.
+ */
+
+/**
+ * @file lv_string.h
+ *
+ */
+
+#ifndef LV_STRING_H
+#define LV_STRING_H
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/*********************
+ *      INCLUDES
+ *********************/
+//#include "../lv_conf_internal.h"
+#include <stddef.h>
+#include <stdint.h>
+#include "lv_types.h"
+
+/*********************
+ *      DEFINES
+ *********************/
+
+/**********************
+ *      TYPEDEFS
+ **********************/
+
+/**********************
+ * GLOBAL PROTOTYPES
+ **********************/
+
+/**
+ * @brief Copies a block of memory from a source address to a destination address.
+ * @param dst Pointer to the destination array where the content is to be copied.
+ * @param src Pointer to the source of data to be copied.
+ * @param len Number of bytes to copy.
+ * @return Pointer to the destination array.
+ * @note The function does not check for any overlapping of the source and destination memory blocks.
+ */
+void * lv_memcpy(void * dst, const void * src, size_t len);
+
+/**
+ * @brief Fills a block of memory with a specified value.
+ * @param dst Pointer to the destination array to fill with the specified value.
+ * @param v Value to be set. The value is passed as an int, but the function fills
+ *          the block of memory using the unsigned char conversion of this value.
+ * @param len Number of bytes to be set to the value.
+ */
+void lv_memset(void * dst, uint8_t v, size_t len);
+
+/**
+ * @brief Move a block of memory from source to destination
+ * @param dst Pointer to the destination array where the content is to be copied.
+ * @param src Pointer to the source of data to be copied.
+ * @param len Number of bytes to copy
+ * @return Pointer to the destination array.
+ */
+void * lv_memmove(void * dst, const void * src, size_t len);
+
+
+/**********************
+ *      MACROS
+ **********************/
+
+#ifdef __cplusplus
+} /*extern "C"*/
+#endif
+
+#endif /*LV_STRING_H*/

diff --git a/components/esp_lvgl_port/test_apps/simd/main/lv_blend/include/lv_types.h b/components/esp_lvgl_port/test_apps/simd/main/lv_blend/include/lv_types.h
index 2e9244fe..f97a51eb 100644
--- a/components/esp_lvgl_port/test_apps/simd/main/lv_blend/include/lv_types.h
+++ b/components/esp_lvgl_port/test_apps/simd/main/lv_blend/include/lv_types.h
@@ -19,6 +19,8 @@ extern "C" {
 #endif
 
+#include <stdint.h>
+
 /**********************
  *      TYPEDEFS
  **********************/

diff --git a/components/esp_lvgl_port/test_apps/simd/main/lv_blend/src/lv_draw_sw_blend_to_argb8888.c b/components/esp_lvgl_port/test_apps/simd/main/lv_blend/src/lv_draw_sw_blend_to_argb8888.c
index f18e3670..2f95bc74 100644
--- a/components/esp_lvgl_port/test_apps/simd/main/lv_blend/src/lv_draw_sw_blend_to_argb8888.c
+++ b/components/esp_lvgl_port/test_apps/simd/main/lv_blend/src/lv_draw_sw_blend_to_argb8888.c
@@ -23,7 +23,7 @@
 #include "lv_draw_sw_blend.h"
 #include "lv_math.h"
 #include "lv_color.h"
-#include "string.h"
+#include "lv_string.h"
 
 #include "esp_lvgl_port_lv_blend.h"
 
@@ -628,7 +628,7 @@ static void LV_ATTRIBUTE_FAST_MEM rgb888_image_blend(_lv_draw_sw_blend_image_dsc
     if (src_px_size == 4) {
         uint32_t line_in_bytes = w * 4;
         for (y = 0; y < h; y++) {
-            memcpy(dest_buf_c32, src_buf, line_in_bytes);   // lv_memcpy
+            lv_memcpy(dest_buf_c32, src_buf, line_in_bytes);
             dest_buf_c32 = drawbuf_next_row(dest_buf_c32, dest_stride);
             src_buf = drawbuf_next_row(src_buf, src_stride);
         }
@@ -870,9 +870,9 @@ static inline lv_color32_t LV_ATTRIBUTE_FAST_MEM lv_color_32_32_mix(lv_color32_t
 
 void lv_color_mix_with_alpha_cache_init(lv_color_mix_alpha_cache_t *cache)
 {
-    memset(&cache->fg_saved, 0x00, sizeof(lv_color32_t));   //lv_memzero
-    memset(&cache->bg_saved, 0x00, sizeof(lv_color32_t));   //lv_memzero
-    memset(&cache->res_saved, 0x00, sizeof(lv_color32_t));  //lv_memzero
+    lv_memset(&cache->fg_saved, 0x00, sizeof(lv_color32_t));    //lv_memzero
+    lv_memset(&cache->bg_saved, 0x00, sizeof(lv_color32_t));    //lv_memzero
+    lv_memset(&cache->res_saved, 0x00, sizeof(lv_color32_t));   //lv_memzero
     cache->res_alpha_saved = 255;
     cache->ratio_saved = 255;
 }

diff --git a/components/esp_lvgl_port/test_apps/simd/main/lv_blend/src/lv_draw_sw_blend_to_rgb565.c b/components/esp_lvgl_port/test_apps/simd/main/lv_blend/src/lv_draw_sw_blend_to_rgb565.c
index 361571ff..dd6e5392 100644
--- a/components/esp_lvgl_port/test_apps/simd/main/lv_blend/src/lv_draw_sw_blend_to_rgb565.c
+++ b/components/esp_lvgl_port/test_apps/simd/main/lv_blend/src/lv_draw_sw_blend_to_rgb565.c
@@ -23,7 +23,7 @@
 #include "lv_draw_sw_blend.h"
 #include "lv_math.h"
 #include "lv_color.h"
-#include "string.h"
+#include "lv_string.h"
 
 #include "esp_lvgl_port_lv_blend.h"
 
@@ -601,10 +601,12 @@ static void LV_ATTRIBUTE_FAST_MEM rgb565_image_blend(_lv_draw_sw_blend_image_dsc
 
     if (dsc->blend_mode == LV_BLEND_MODE_NORMAL) {
         if (mask_buf == NULL && opa >= LV_OPA_MAX) {
-            if (LV_RESULT_INVALID == LV_DRAW_SW_RGB565_BLEND_NORMAL_TO_RGB565(dsc)) {
+            if (dsc->use_asm) {
+                LV_DRAW_SW_RGB565_BLEND_NORMAL_TO_RGB565(dsc);
+            } else {
                 uint32_t line_in_bytes = w * 2;
                 for (y = 0; y < h; y++) {
-                    memcpy(dest_buf_u16, src_buf_u16, line_in_bytes);   // lv_memcpy
+                    lv_memcpy(dest_buf_u16, src_buf_u16, line_in_bytes);
                     dest_buf_u16 = drawbuf_next_row(dest_buf_u16, dest_stride);
                     src_buf_u16 = drawbuf_next_row(src_buf_u16, src_stride);
                 }

diff --git a/components/esp_lvgl_port/test_apps/simd/main/lv_blend/src/lv_string_builtin.c b/components/esp_lvgl_port/test_apps/simd/main/lv_blend/src/lv_string_builtin.c
new file mode 100644
index 00000000..0e604048
--- /dev/null
+++ b/components/esp_lvgl_port/test_apps/simd/main/lv_blend/src/lv_string_builtin.c
@@ -0,0 +1,188 @@
+/*
+ * SPDX-FileCopyrightText: 2025 Espressif Systems (Shanghai) CO LTD
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * This file is derived from the LVGL project.
+ * See https://github.com/lvgl/lvgl for details.
+ */
+
+/**
+ * @file lv_string.c
+ */
+
+/*********************
+ *      INCLUDES
+ *********************/
+//#include "../../lv_conf_internal.h"
+#if LV_USE_STDLIB_STRING == LV_STDLIB_BUILTIN
+#include "lv_assert.h"
+#include "lv_log.h"
+#include "lv_math.h"
+#include "lv_string.h"
+
+/*********************
+ *      DEFINES
+ *********************/
+#ifdef LV_ARCH_64
+    #define MEM_UNIT uint64_t
+    #define ALIGN_MASK 0x7
+#else
+    #define MEM_UNIT uint32_t
+    #define ALIGN_MASK 0x3
+#endif
+
+#define LV_ATTRIBUTE_FAST_MEM
+
+/**********************
+ *      TYPEDEFS
+ **********************/
+
+/**********************
+ *  STATIC PROTOTYPES
+ **********************/
+
+/**********************
+ *  STATIC VARIABLES
+ **********************/
+
+/**********************
+ *      MACROS
+ **********************/
+#if LV_USE_LOG && LV_LOG_TRACE_MEM
+    #define LV_TRACE_MEM(...) LV_LOG_TRACE(__VA_ARGS__)
+#else
+    #define LV_TRACE_MEM(...)
+#endif
+
+#define _COPY(d, s) *d = *s; d++; s++;
+#define _SET(d, v) *d = v; d++;
+#define _REPEAT8(expr) expr expr expr expr expr expr expr expr
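+
+/* For clarity, a hedged expansion example (illustration only): with 32-bit words,
+ * _REPEAT8(_COPY(d32, s32)) unrolls to eight word copies, i.e. 32 bytes per loop pass:
+ *
+ *     *d32 = *s32; d32++; s32++;   // ...repeated 8 times in total
+ */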
+
+/**********************
+ *   GLOBAL FUNCTIONS
+ **********************/
+
+void * LV_ATTRIBUTE_FAST_MEM lv_memcpy(void * dst, const void * src, size_t len)
+{
+    uint8_t * d8 = dst;
+    const uint8_t * s8 = src;
+
+    /*Simplify for small memories*/
+    if(len < 16) {
+        while(len) {
+            *d8 = *s8;
+            d8++;
+            s8++;
+            len--;
+        }
+        return dst;
+    }
+
+    lv_uintptr_t d_align = (lv_uintptr_t)d8 & ALIGN_MASK;
+    lv_uintptr_t s_align = (lv_uintptr_t)s8 & ALIGN_MASK;
+
+    /*Byte copy for unaligned memories*/
+    if(s_align != d_align) {
+        while(len > 32) {
+            _REPEAT8(_COPY(d8, s8));
+            _REPEAT8(_COPY(d8, s8));
+            _REPEAT8(_COPY(d8, s8));
+            _REPEAT8(_COPY(d8, s8));
+            len -= 32;
+        }
+        while(len) {
+            _COPY(d8, s8)
+            len--;
+        }
+        return dst;
+    }
+
+    /*Make the memories aligned*/
+    if(d_align) {
+        d_align = ALIGN_MASK + 1 - d_align;
+        while(d_align && len) {
+            _COPY(d8, s8);
+            d_align--;
+            len--;
+        }
+    }
+
+    uint32_t * d32 = (uint32_t *)d8;
+    const uint32_t * s32 = (const uint32_t *)s8;
+    while(len > 32) {
+        _REPEAT8(_COPY(d32, s32))
+        len -= 32;
+    }
+
+    d8 = (uint8_t *)d32;
+    s8 = (const uint8_t *)s32;
+    while(len) {
+        _COPY(d8, s8)
+        len--;
+    }
+
+    return dst;
+}
+
+void LV_ATTRIBUTE_FAST_MEM lv_memset(void * dst, uint8_t v, size_t len)
+{
+    uint8_t * d8 = (uint8_t *)dst;
+    lv_uintptr_t d_align = (lv_uintptr_t) d8 & ALIGN_MASK;
+
+    /*Make the address aligned*/
+    if(d_align) {
+        d_align = ALIGN_MASK + 1 - d_align;
+        while(d_align && len) {
+            _SET(d8, v);
+            len--;
+            d_align--;
+        }
+    }
+
+    uint32_t v32 = (uint32_t)v + ((uint32_t)v << 8) + ((uint32_t)v << 16) + ((uint32_t)v << 24);
+    uint32_t * d32 = (uint32_t *)d8;
+
+    while(len > 32) {
+        _REPEAT8(_SET(d32, v32));
+        len -= 32;
+    }
+
+    d8 = (uint8_t *)d32;
+    while(len) {
+        _SET(d8, v);
+        len--;
+    }
+}
+
+void * LV_ATTRIBUTE_FAST_MEM lv_memmove(void * dst, const void * src, size_t len)
+{
+    if(dst < src || (char *)dst > ((char *)src + len)) {
+        return lv_memcpy(dst, src, len);
+    }
+
+    if(dst > src) {
+        char * tmp = (char *)dst + len - 1;
+        char * s = (char *)src + len - 1;
+
+        while(len--) {
+            *tmp-- = *s--;
+        }
+    }
+    else {
+        char * tmp = (char *)dst;
+        char * s = (char *)src;
+
+        while(len--) {
+            *tmp++ = *s++;
+        }
+    }
+
+    return dst;
+}
+
+/**********************
+ *   STATIC FUNCTIONS
+ **********************/
+
+#endif /*LV_STDLIB_BUILTIN*/

diff --git a/components/esp_lvgl_port/test_apps/simd/main/lv_fill_common.h b/components/esp_lvgl_port/test_apps/simd/main/lv_fill_common.h
index 5243857e..b208819c 100644
--- a/components/esp_lvgl_port/test_apps/simd/main/lv_fill_common.h
+++ b/components/esp_lvgl_port/test_apps/simd/main/lv_fill_common.h
@@ -64,7 +64,7 @@ typedef struct {
     unsigned int cc_width;          // Corner case test array width
     unsigned int benchmark_cycles;  // Count of benchmark cycles
     void *array_align16;            // test array with 16 byte alignment - testing most ideal case
-    void *array_align1;             // test array with 1 byte alignment - testing wort case
+    void *array_align1;             // test array with 1 byte alignment - testing worst case
     void (*blend_api_func)(_lv_draw_sw_blend_fill_dsc_t *);     // pointer to LVGL API function
 } bench_test_case_params_t;

diff --git a/components/esp_lvgl_port/test_apps/simd/main/lv_image_common.h b/components/esp_lvgl_port/test_apps/simd/main/lv_image_common.h
new file mode 100644
index 00000000..f2c64827
--- /dev/null
+++ b/components/esp_lvgl_port/test_apps/simd/main/lv_image_common.h
@@ -0,0 +1,111 @@
+/*
+ * SPDX-FileCopyrightText: 2025 Espressif Systems (Shanghai) CO LTD
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ */
+
+#pragma once
+
+#include "esp_err.h"
+#include <stddef.h>
+#include "lv_color.h"
+#include "lv_draw_sw_blend.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+// ------------------------------------------------- Macros and Types --------------------------------------------------
+
+/**
+ * @brief Type of blend DUT function
+ */
+typedef enum {
+    OPERATION_FILL,
+    OPERATION_FILL_WITH_OPA,
+} blend_operation_t;
+
+/**
+ * @brief Canary pixels amount depending on data type
+ * @note
+ * - We should use at least 16 bytes of memory for canary pixels, because of the esp32s3 TIE 16-byte wide Q registers
+ * - Canary pixels are multiplied by sizeof(used_data_type) to get the memory length occupied by the canary pixels
+ * - The memory occupied by canary pixels should be a 16-byte multiple, to achieve the 16-byte memory alignment in the functionality test
+ * - For example, ideally, for RGB565 we would need 8 canary pixels -> 8 * sizeof(uint16_t) = 16
+ */
+typedef enum {
+    CANARY_PIXELS_ARGB8888 = 4,     /*!< Canary pixels: 4 * sizeof(uint32_t) = 16 */
+    CANARY_PIXELS_RGB565 = 8,       /*!< Canary pixels: 8 * sizeof(uint16_t) = 16 */
+} canary_pixels_t;
+
+/**
+ * @brief Functionality test combinations for LV Image
+ */
+typedef struct {
+    unsigned int min_w;                     /*!< Minimum width of the test array */
+    unsigned int min_h;                     /*!< Minimum height of the test array */
+    unsigned int max_w;                     /*!< Maximum width of the test array */
+    unsigned int max_h;                     /*!< Maximum height of the test array */
+    unsigned int src_min_unalign_byte;      /*!< Minimum amount of unaligned bytes of the source test array */
+    unsigned int dest_min_unalign_byte;     /*!< Minimum amount of unaligned bytes of the destination test array */
+    unsigned int src_max_unalign_byte;      /*!< Maximum amount of unaligned bytes of the source test array */
+    unsigned int dest_max_unalign_byte;     /*!< Maximum amount of unaligned bytes of the destination test array */
+    unsigned int src_unalign_step;          /*!< Increment step in bytes of the source test array unalignment */
+    unsigned int dest_unalign_step;         /*!< Increment step in bytes of the destination test array unalignment */
+    unsigned int src_stride_step;           /*!< Increment step in the stride of the source test array */
+    unsigned int dest_stride_step;          /*!< Increment step in the stride of the destination test array */
+    unsigned int test_combinations_count;   /*!< Count of test combinations */
+} test_matrix_lv_image_params_t;
+
+
+/**
+ * @brief Functionality test case parameters for LV Image
+ */
+typedef struct {
+    struct {
+        void *p_src;                /*!< pointer to the source test buff (common src buffer for both the ANSI and ASM) */
+        void *p_src_alloc;          /*!< pointer to the beginning of the memory allocated for the source ASM test buf, used in free() */
+        void *p_dest_asm;           /*!< pointer to the destination ASM test buf */
+        void *p_dest_ansi;          /*!< pointer to the destination ANSI test buf */
+        void *p_dest_asm_alloc;     /*!< pointer to the beginning of the memory allocated for the destination ASM test buf, used in free() */
+        void *p_dest_ansi_alloc;    /*!< pointer to the beginning of the memory allocated for the destination ANSI test buf, used in free() */
+    } buf;
+    void (*blend_api_func)(_lv_draw_sw_blend_image_dsc_t *);    /*!< pointer to LVGL API function */
+    lv_color_format_t color_format;         /*!< LV color format */
+    size_t src_data_type_size;              /*!< Used data type size in the source buffer, eg sizeof(src_buff[0]) */
+
+/**
+ * @brief Functionality test combinations for LV Image
+ */
+typedef struct {
+    unsigned int min_w;                     /*!< Minimum width of the test array */
+    unsigned int min_h;                     /*!< Minimum height of the test array */
+    unsigned int max_w;                     /*!< Maximum width of the test array */
+    unsigned int max_h;                     /*!< Maximum height of the test array */
+    unsigned int src_min_unalign_byte;      /*!< Minimum amount of unaligned bytes of the source test array */
+    unsigned int dest_min_unalign_byte;     /*!< Minimum amount of unaligned bytes of the destination test array */
+    unsigned int src_max_unalign_byte;      /*!< Maximum amount of unaligned bytes of the source test array */
+    unsigned int dest_max_unalign_byte;     /*!< Maximum amount of unaligned bytes of the destination test array */
+    unsigned int src_unalign_step;          /*!< Increment step, in bytes, of the source test array unalignment */
+    unsigned int dest_unalign_step;         /*!< Increment step, in bytes, of the destination test array unalignment */
+    unsigned int src_stride_step;           /*!< Increment step of the source test array stride */
+    unsigned int dest_stride_step;          /*!< Increment step of the destination test array stride */
+    unsigned int test_combinations_count;   /*!< Count of test combinations */
+} test_matrix_lv_image_params_t;
+
+
+/**
+ * @brief Functionality test case parameters for LV Image
+ */
+typedef struct {
+    struct {
+        void *p_src;                /*!< pointer to the source test buf (common src buffer for both the ANSI and ASM) */
+        void *p_src_alloc;          /*!< pointer to the beginning of the memory allocated for the source ASM test buf, used in free() */
+        void *p_dest_asm;           /*!< pointer to the destination ASM test buf */
+        void *p_dest_ansi;          /*!< pointer to the destination ANSI test buf */
+        void *p_dest_asm_alloc;     /*!< pointer to the beginning of the memory allocated for the destination ASM test buf, used in free() */
+        void *p_dest_ansi_alloc;    /*!< pointer to the beginning of the memory allocated for the destination ANSI test buf, used in free() */
+    } buf;
+    void (*blend_api_func)(_lv_draw_sw_blend_image_dsc_t *);    /*!< pointer to LVGL API function */
+    lv_color_format_t color_format;     /*!< LV color format */
+    size_t src_data_type_size;          /*!< Used data type size in the source buffer, eg sizeof(src_buf[0]) */
+    size_t dest_data_type_size;         /*!< Used data type size in the destination buffer, eg sizeof(dest_buf[0]) */
+    size_t src_buf_len;                 /*!< Length of the source buffer, including matrix padding (no canary pixels are used for the source buffer) */
+    size_t active_dest_buf_len;         /*!< Length of the destination buffer where the actual data are stored, including matrix padding, not including canary pixels */
+    size_t total_dest_buf_len;          /*!< Total length of the destination buffer (including canary pixels and matrix padding) */
+    size_t canary_pixels;               /*!< Canary pixels must be adjusted according to the used color type, to achieve the aligned memory effect */
+    unsigned int dest_w;                /*!< Destination buffer width */
+    unsigned int dest_h;                /*!< Destination buffer height */
+    unsigned int src_stride;            /*!< Source buffer stride */
+    unsigned int dest_stride;           /*!< Destination buffer stride */
+    unsigned int src_unalign_byte;      /*!< Source buffer memory unalignment */
+    unsigned int dest_unalign_byte;     /*!< Destination buffer memory unalignment */
+    blend_operation_t operation_type;   /*!< Type of fundamental blend operation */
+} func_test_case_lv_image_params_t;
+
+
+/**
+ * @brief Benchmark test case parameters for LV Image
+ */
+typedef struct {
+    unsigned int height;            /*!< Test array height */
+    unsigned int width;             /*!< Test array width */
+    unsigned int dest_stride;       /*!< Destination test array stride */
+    unsigned int src_stride;        /*!< Source test array stride */
+    unsigned int cc_height;         /*!< Corner case test array height */
+    unsigned int cc_width;          /*!< Corner case test array width */
+    unsigned int benchmark_cycles;  /*!< Count of benchmark cycles */
+    void *src_array_align16;        /*!< Source test array with 16 byte alignment - testing most ideal case */
+    void *src_array_align1;         /*!< Source test array with 1 byte alignment - testing worst case */
+    void *dest_array_align16;       /*!< Destination test array with 16 byte alignment - testing most ideal case */
+    void *dest_array_align1;        /*!< Destination test array with 1 byte alignment - testing worst case */
+    void (*blend_api_func)(_lv_draw_sw_blend_image_dsc_t *);    /*!< pointer to LVGL API function */
+} bench_test_case_lv_image_params_t;
+
+#ifdef __cplusplus
+} /*extern "C"*/
+#endif
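Note the unit mismatch between the two structures: the functionality test case stores strides in pixels and multiplies by the data type size when building the LVGL descriptor, while the benchmark structure stores byte strides directly. A quick illustration with hypothetical numbers:

    // A 24-pixel RGB565 row with 2 pixels of padding
    unsigned int dest_stride_px    = 26;                                  // as stored in func_test_case_lv_image_params_t
    unsigned int dest_stride_bytes = dest_stride_px * sizeof(uint16_t);   // 52, as expected by _lv_draw_sw_blend_image_dsc_t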
diff --git a/components/esp_lvgl_port/test_apps/simd/main/test_lv_image_benchmark.c b/components/esp_lvgl_port/test_apps/simd/main/test_lv_image_benchmark.c
new file mode 100644
index 00000000..2372f07a
--- /dev/null
+++ b/components/esp_lvgl_port/test_apps/simd/main/test_lv_image_benchmark.c
@@ -0,0 +1,171 @@
+/*
+ * SPDX-FileCopyrightText: 2025 Espressif Systems (Shanghai) CO LTD
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ */
+
+#include <stdlib.h>
+#include <malloc.h>
+#include <inttypes.h>
+
+#include "unity.h"
+#include "esp_log.h"
+#include "freertos/FreeRTOS.h"      // for xthal_get_ccount()
+#include "lv_image_common.h"
+#include "lv_draw_sw_blend.h"
+#include "lv_draw_sw_blend_to_rgb565.h"
+
+#define COMMON_DIM 128              // Common matrix dimension 128x128 pixels
+#define WIDTH COMMON_DIM
+#define HEIGHT COMMON_DIM
+#define STRIDE WIDTH
+#define UNALIGN_BYTES 3
+#define BENCHMARK_CYCLES 1000
+
+// ------------------------------------------------ Static variables ---------------------------------------------------
+
+static const char *TAG_LV_IMAGE_BENCH = "LV Image Benchmark";
+static const char *asm_ansi_func[] = {"ASM", "ANSI"};
+
+// ------------------------------------------------ Static function headers --------------------------------------------
+
+/**
+ * @brief Initialize the benchmark test
+ */
+static void lv_image_benchmark_init(bench_test_case_lv_image_params_t *test_params);
+
+/**
+ * @brief Run the benchmark test
+ */
+static float lv_image_benchmark_run(bench_test_case_lv_image_params_t *test_params, _lv_draw_sw_blend_image_dsc_t *dsc);
+
+// ------------------------------------------------ Test cases ---------------------------------------------------------
+
+/*
+Benchmark tests
+
+Requires:
+    - Functionality tests to pass first
+
+Purpose:
+    - Test that an acceleration is achieved by the assembly implementation of the LVGL blending API
+
+Procedure:
+    - Initialize input parameters (test array length, width, allocate array...) of the benchmark test
+    - Run the assembly version of the LVGL blending API multiple times (1000 times or so)
+    - First use input test parameters for the most ideal case (16-byte aligned arrays, array widths divisible by 2 for the RGB565 color format)
+    - Then use worst-case input test parameters (1-byte aligned arrays, array widths NOT divisible by 2 for the RGB565 color format)
+    - Count how many CPU cycles it takes to run a function from the LVGL blending API for each case (ideal and worst case)
+    - Run the ANSI version of the LVGL blending API multiple times (1000 times or so) and repeat the two steps above for the ANSI version
+    - Compare the results
+    - Free the test arrays and structures needed for the LVGL blending API
+
+Inducing the most ideal and worst case scenarios:
+    - Most ideal case:
+        - Both the source and the destination buffers should be aligned on 16-byte (Xtensa PIE) or 4-byte (Xtensa base) boundaries
+        - Matrix width (in pixels) should be equal to the main loop length in the assembly src code,
+          typically multiples of 16 bytes (for RGB565 it's either 32 bytes - 16 pixels, or 48 bytes - 24 pixels)
+        - Matrix height does not have any effect on benchmark unit tests, until the matrix is so large that cache limitations start to affect the performance
+        - Matrix strides should be equal to the matrix widths (0 matrix padding), or their multiples (matrix width = matrix padding)
+    - Worst case:
+        - Both the source and the destination buffers should NOT be aligned on 16-byte (Xtensa PIE) or 4-byte (Xtensa base) boundaries;
+          the source buffer unalignment should be different from the destination unalignment, with one unalignment being even, the other being odd
+          The unalignments shall be small numbers (preferably 1 or 2 bytes)
+        - Matrix width should be one pixel smaller than the matrix width for the most ideal case
+        - Matrix height does not have any effect on benchmark unit tests, until the matrix is so large that cache limitations start to affect the performance
+        - Matrix strides should NOT be equal to the matrix widths (non-zero matrix padding)
+*/
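The worst case is induced purely by pointer arithmetic on an over-allocated, 16-byte-aligned block, which is what the test case below does. A minimal sketch of the trick (len_bytes is a placeholder, not a name from the patch):

    uint16_t *buf16 = (uint16_t *)memalign(16, len_bytes + UNALIGN_BYTES);  // aligned base with spare bytes
    uint16_t *buf1  = (uint16_t *)((uint8_t *)buf16 + 1);                   // now (uintptr_t)buf1 % 16 == 1
    // Only buf16 may ever be passed to free(); buf1 is just a view into the same block.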
+// ------------------------------------------------ Test cases stages --------------------------------------------------
+
+TEST_CASE("LV Image benchmark RGB565 blend to RGB565", "[image][benchmark][RGB565]")
+{
+    uint16_t *dest_array_align16 = (uint16_t *)memalign(16, STRIDE * HEIGHT * sizeof(uint16_t) + UNALIGN_BYTES);
+    uint16_t *src_array_align16 = (uint16_t *)memalign(16, STRIDE * HEIGHT * sizeof(uint16_t) + UNALIGN_BYTES);
+    TEST_ASSERT_NOT_EQUAL(NULL, dest_array_align16);
+    TEST_ASSERT_NOT_EQUAL(NULL, src_array_align16);
+
+    // Apply byte unalignment for the worst-case test scenario
+    uint16_t *dest_array_align1 = (uint16_t *)((uint8_t *)dest_array_align16 + UNALIGN_BYTES - 2);
+    uint16_t *src_array_align1 = (uint16_t *)((uint8_t *)src_array_align16 + UNALIGN_BYTES);
+
+    bench_test_case_lv_image_params_t test_params = {
+        .height = HEIGHT,
+        .width = WIDTH,
+        .dest_stride = STRIDE * sizeof(uint16_t),
+        .src_stride = STRIDE * sizeof(uint16_t),
+        .cc_height = HEIGHT,
+        .cc_width = WIDTH - 1,
+        .benchmark_cycles = BENCHMARK_CYCLES,
+        .src_array_align16 = (void *)src_array_align16,
+        .src_array_align1 = (void *)src_array_align1,
+        .dest_array_align16 = (void *)dest_array_align16,
+        .dest_array_align1 = (void *)dest_array_align1,
+        .blend_api_func = &lv_draw_sw_blend_image_to_rgb565,
+    };
+
+    ESP_LOGI(TAG_LV_IMAGE_BENCH, "running test for RGB565 color format");
+    lv_image_benchmark_init(&test_params);
+    free(dest_array_align16);
+    free(src_array_align16);
+}
+// ------------------------------------------------ Static test functions ----------------------------------------------
+
+static void lv_image_benchmark_init(bench_test_case_lv_image_params_t *test_params)
+{
+    // Init structure for LVGL blend API, to call the Assembly API
+    _lv_draw_sw_blend_image_dsc_t dsc = {
+        .dest_buf = test_params->dest_array_align16,
+        .dest_w = test_params->width,
+        .dest_h = test_params->height,
+        .dest_stride = test_params->dest_stride,    // stride * sizeof()
+        .mask_buf = NULL,
+        .src_buf = test_params->src_array_align16,
+        .src_stride = test_params->src_stride,
+        .src_color_format = LV_COLOR_FORMAT_RGB565,
+        .opa = LV_OPA_MAX,
+        .blend_mode = LV_BLEND_MODE_NORMAL,
+        .use_asm = true,
+    };
+
+    // Init structure for LVGL blend API, to run the corner case (worst-case unaligned parameters)
+    _lv_draw_sw_blend_image_dsc_t dsc_cc = dsc;
+    dsc_cc.dest_buf = test_params->dest_array_align1;
+    dsc_cc.dest_w = test_params->cc_width;
+    dsc_cc.dest_h = test_params->cc_height;
+    dsc_cc.src_buf = test_params->src_array_align1;
+
+    // Run the benchmark 2 times:
+    // First run using assembly, second run using ANSI
+    for (int i = 0; i < 2; i++) {
+
+        // Run benchmark with the most ideal input parameters
+        float cycles = lv_image_benchmark_run(test_params, &dsc);          // Call Benchmark cycle
+        float per_sample = cycles / ((float)(dsc.dest_w * dsc.dest_h));
+        ESP_LOGI(TAG_LV_IMAGE_BENCH, " %s ideal case: %.3f cycles for %"PRIi32"x%"PRIi32" matrix, %.3f cycles per sample", asm_ansi_func[i], cycles, dsc.dest_w, dsc.dest_h, per_sample);
+
+        // Run benchmark with the corner case input parameters
+        cycles = lv_image_benchmark_run(test_params, &dsc_cc);             // Call Benchmark cycle
+        per_sample = cycles / ((float)(dsc_cc.dest_w * dsc_cc.dest_h));
+        ESP_LOGI(TAG_LV_IMAGE_BENCH, " %s corner case: %.3f cycles for %"PRIi32"x%"PRIi32" matrix, %.3f cycles per sample\n", asm_ansi_func[i], cycles, dsc_cc.dest_w, dsc_cc.dest_h, per_sample);
+
+        // Change to ANSI
+        dsc.use_asm = false;
+        dsc_cc.use_asm = false;
+    }
+}
+
+static float lv_image_benchmark_run(bench_test_case_lv_image_params_t *test_params, _lv_draw_sw_blend_image_dsc_t *dsc)
+{
+    // Call the DUT function once before measuring, to init the benchmark test and warm the caches
+    test_params->blend_api_func(dsc);
+
+    const unsigned int start_b = xthal_get_ccount();
+    for (int i = 0; i < test_params->benchmark_cycles; i++) {
+        test_params->blend_api_func(dsc);
+    }
+    const unsigned int end_b = xthal_get_ccount();
+
+    const float total_b = end_b - start_b;
+    const float cycles = total_b / (test_params->benchmark_cycles);
+    return cycles;
+}
diff --git a/components/esp_lvgl_port/test_apps/simd/main/test_lv_image_functionality.c b/components/esp_lvgl_port/test_apps/simd/main/test_lv_image_functionality.c
new file mode 100644
index 00000000..17317deb
--- /dev/null
+++ b/components/esp_lvgl_port/test_apps/simd/main/test_lv_image_functionality.c
@@ -0,0 +1,351 @@
+/*
+ * SPDX-FileCopyrightText: 2025 Espressif Systems (Shanghai) CO LTD
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ */
+
+#include <stdio.h>
+#include <string.h>
+#include <malloc.h>
+#include "sdkconfig.h"
+#include "unity.h"
+#include "esp_log.h"
+#include "lv_image_common.h"
+#include "lv_draw_sw_blend.h"
+#include "lv_draw_sw_blend_to_rgb565.h"
+
+// ------------------------------------------------- Defines -----------------------------------------------------------
+
+#define DBG_PRINT_OUTPUT false
+
+// ------------------------------------------------- Macros and Types --------------------------------------------------
+
+#define UPDATE_TEST_CASE(test_case_ptr, dest_w, dest_h, src_stride, dest_stride, src_unalign_byte, dest_unalign_byte) ({ \
+    (test_case_ptr)->src_buf_len = (size_t)(dest_h * src_stride); \
+    (test_case_ptr)->active_dest_buf_len = (size_t)(dest_h * dest_stride); \
+    (test_case_ptr)->total_dest_buf_len = (size_t)((dest_h * dest_stride) + (test_case_ptr->canary_pixels * 2)); \
+    (test_case_ptr)->dest_w = (dest_w); \
+    (test_case_ptr)->dest_h = (dest_h); \
+    (test_case_ptr)->src_stride = (src_stride); \
+    (test_case_ptr)->dest_stride = (dest_stride); \
+    (test_case_ptr)->src_unalign_byte = (src_unalign_byte); \
+    (test_case_ptr)->dest_unalign_byte = (dest_unalign_byte); \
+})
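One concrete combination makes the derived lengths easier to follow (numbers are illustrative only):

    // RGB565: dest_w = 8, dest_h = 2, src_stride = 10, dest_stride = 9, canary_pixels = 8
    // src_buf_len         = 2 * 10     = 20 pixels
    // active_dest_buf_len = 2 * 9      = 18 pixels
    // total_dest_buf_len  = 18 + 2 * 8 = 34 pixels (68 bytes)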
+// ------------------------------------------------ Static variables ---------------------------------------------------
+
+static const char *TAG_LV_IMAGE_FUNC = "LV Image Functionality";
+static char test_msg_buf[200];
+
+static const test_matrix_lv_image_params_t default_test_matrix_image_rgb565_blend_rgb565 = {
+#if CONFIG_IDF_TARGET_ESP32S3
+    .min_w = 8,                     // 8 is the lower limit for the esp32s3 asm implementation, otherwise the esp32 version is executed
+    .min_h = 1,
+    .max_w = 24,
+    .max_h = 2,
+    .src_max_unalign_byte = 16,     // Use 16-byte boundary check for Xtensa PIE
+    .dest_max_unalign_byte = 16,
+    .dest_unalign_step = 1,         // Step 1, as the destination array is always being aligned in the assembly code
+    .src_unalign_step = 3,          // Step 3 (more relaxed), as the source array is used unaligned in the assembly code
+    .src_stride_step = 3,
+    .dest_stride_step = 3,
+#else
+    .min_w = 1,
+    .min_h = 1,
+    .max_w = 16,
+    .max_h = 2,
+    .src_max_unalign_byte = 4,      // Use 4-byte boundary check for Xtensa base
+    .dest_max_unalign_byte = 4,
+    .dest_unalign_step = 1,
+    .src_unalign_step = 1,
+    .src_stride_step = 1,
+    .dest_stride_step = 1,
+#endif
+    .src_min_unalign_byte = 0,
+    .dest_min_unalign_byte = 0,
+    .test_combinations_count = 0,
+};
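The matrix above expands multiplicatively in the nested loops below, which is presumably why the steps are relaxed on the S3 target. For the S3 settings, the unalignment loops alone contribute:

    // per (dest_w, dest_h, src_stride, dest_stride) tuple:
    // n_src_unalign  = 6    (0..16 in steps of 3: 0, 3, 6, 9, 12, 15)
    // n_dest_unalign = 17   (0..16 in steps of 1)
    // -> 6 * 17 = 102 functionality runs for every width/height/stride tuple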
+
+// ------------------------------------------------ Static function headers --------------------------------------------
+
+/**
+ * @brief Generate all the functionality test combinations
+ *
+ * - generate functionality test combinations, based on the provided test_matrix struct
+ *
+ * @param[in] test_matrix Pointer to structure defining test matrix - all the test combinations
+ * @param[in] test_case Pointer to structure defining functionality test case
+ */
+static void functionality_test_matrix(test_matrix_lv_image_params_t *test_matrix, func_test_case_lv_image_params_t *test_case);
+
+/**
+ * @brief Fill test buffers for image functionality test
+ *
+ * @param[in] test_case Pointer to structure defining functionality test case
+ */
+static void fill_test_bufs(func_test_case_lv_image_params_t *test_case);
+
+/**
+ * @brief The actual functionality test
+ *
+ * - function prepares structures for functionality testing and runs the LVGL API
+ *
+ * @param[in] test_case Pointer to structure defining functionality test case
+ */
+static void lv_image_functionality(func_test_case_lv_image_params_t *test_case);
+
+/**
+ * @brief Evaluate results of LV Image functionality for 16bit data length
+ *
+ * @param[in] test_case Pointer to structure defining functionality test case
+ */
+static void test_eval_image_16bit_data(func_test_case_lv_image_params_t *test_case);
+
+// ------------------------------------------------ Test cases ---------------------------------------------------------
+
+/*
+Functionality tests
+
+Purpose:
+    - Test that the assembly version of the LVGL blending API achieves the same results as the ANSI version
+
+Procedure:
+    - Prepare a testing matrix to cover all possible combinations of destination and source array widths,
+      lengths, strides and memory alignments
+    - Run the assembly version of the LVGL blending API
+    - Run the ANSI C version of the LVGL blending API
+    - Compare the results
+    - Repeat the above 3 steps for each test matrix setup
+*/
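The core of each iteration can be reduced to a twin-descriptor pattern (illustrative fragment, fields abbreviated; blend stands for the blend_api_func pointer):

    _lv_draw_sw_blend_image_dsc_t dsc_asm  = { /* identical inputs */ .use_asm = true };
    _lv_draw_sw_blend_image_dsc_t dsc_ansi = dsc_asm;
    dsc_ansi.dest_buf = dest_buf_ansi;    // separate, identically pre-filled destination
    dsc_ansi.use_asm  = false;
    blend(&dsc_asm);
    blend(&dsc_ansi);
    // then require bit-exact equality of the two destination buffers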
+
+// ------------------------------------------------ Test cases stages --------------------------------------------------
+
+TEST_CASE("LV Image functionality RGB565 blend to RGB565", "[image][functionality][RGB565]")
+{
+    test_matrix_lv_image_params_t test_matrix = default_test_matrix_image_rgb565_blend_rgb565;
+
+    func_test_case_lv_image_params_t test_case = {
+        .blend_api_func = &lv_draw_sw_blend_image_to_rgb565,
+        .color_format = LV_COLOR_FORMAT_RGB565,
+        .canary_pixels = CANARY_PIXELS_RGB565,
+        .src_data_type_size = sizeof(uint16_t),
+        .dest_data_type_size = sizeof(uint16_t),
+        .operation_type = OPERATION_FILL,
+    };
+
+    ESP_LOGI(TAG_LV_IMAGE_FUNC, "running test for RGB565 color format");
+    functionality_test_matrix(&test_matrix, &test_case);
+}
+
+// ------------------------------------------------ Static test functions ----------------------------------------------
+
+static void functionality_test_matrix(test_matrix_lv_image_params_t *test_matrix, func_test_case_lv_image_params_t *test_case)
+{
+    // Step destination array width
+    for (int dest_w = test_matrix->min_w; dest_w <= test_matrix->max_w; dest_w++) {
+
+        // Step destination array height
+        for (int dest_h = test_matrix->min_h; dest_h <= test_matrix->max_h; dest_h++) {
+
+            // Step source array stride
+            for (int src_stride = dest_w; src_stride <= dest_w * 2; src_stride += test_matrix->src_stride_step) {
+
+                // Step destination array stride
+                for (int dest_stride = dest_w; dest_stride <= dest_w * 2; dest_stride += test_matrix->dest_stride_step) {
+
+                    // Step source array unalignment
+                    for (int src_unalign_byte = test_matrix->src_min_unalign_byte; src_unalign_byte <= test_matrix->src_max_unalign_byte; src_unalign_byte += test_matrix->src_unalign_step) {
+
+                        // Step destination array unalignment
+                        for (int dest_unalign_byte = test_matrix->dest_min_unalign_byte; dest_unalign_byte <= test_matrix->dest_max_unalign_byte; dest_unalign_byte += test_matrix->dest_unalign_step) {
+
+                            // Call functionality test
+                            UPDATE_TEST_CASE(test_case, dest_w, dest_h, src_stride, dest_stride, src_unalign_byte, dest_unalign_byte);
+                            lv_image_functionality(test_case);
+                            test_matrix->test_combinations_count++;
+                        }
+                    }
+                }
+            }
+        }
+    }
+    ESP_LOGI(TAG_LV_IMAGE_FUNC, "test combinations: %d\n", test_matrix->test_combinations_count);
+}
+
+static void lv_image_functionality(func_test_case_lv_image_params_t *test_case)
+{
+    fill_test_bufs(test_case);
+
+    _lv_draw_sw_blend_image_dsc_t dsc_asm = {
+        .dest_buf = test_case->buf.p_dest_asm,
+        .dest_w = test_case->dest_w,
+        .dest_h = test_case->dest_h,
+        .dest_stride = test_case->dest_stride * test_case->dest_data_type_size,     // dest_stride * sizeof(data_type)
+        .mask_buf = NULL,
+        .mask_stride = 0,
+        .src_buf = test_case->buf.p_src,
+        .src_stride = test_case->src_stride * test_case->src_data_type_size,        // src_stride * sizeof(data_type)
+        .src_color_format = test_case->color_format,
+        .opa = LV_OPA_MAX,
+        .blend_mode = LV_BLEND_MODE_NORMAL,
+        .use_asm = true,
+    };
+
+    // Init structure for LVGL blend API, to call the ANSI API
+    _lv_draw_sw_blend_image_dsc_t dsc_ansi = dsc_asm;
+    dsc_ansi.dest_buf = test_case->buf.p_dest_ansi;
+    dsc_ansi.use_asm = false;
+
+    test_case->blend_api_func(&dsc_asm);     // Call the LVGL API with Assembly code
+    test_case->blend_api_func(&dsc_ansi);    // Call the LVGL API with ANSI code
+
+    // Shift array pointers by (Canary pixels amount * data type length) back
+    test_case->buf.p_dest_asm -= test_case->canary_pixels * test_case->dest_data_type_size;
+    test_case->buf.p_dest_ansi -= test_case->canary_pixels * test_case->dest_data_type_size;
+
+    // Evaluate the results
+    sprintf(test_msg_buf, "Test case: dest_w = %d, dest_h = %d, dest_stride = %d, src_stride = %d, dest_unalign_byte = %d, src_unalign_byte = %d\n",
+            test_case->dest_w, test_case->dest_h, test_case->dest_stride, test_case->src_stride, test_case->dest_unalign_byte, test_case->src_unalign_byte);
+#if DBG_PRINT_OUTPUT
+    printf("%s\n", test_msg_buf);
+#endif
+    switch (test_case->color_format) {
+    case LV_COLOR_FORMAT_RGB565:
+        test_eval_image_16bit_data(test_case);
+        break;
+    default:
+        TEST_ASSERT_MESSAGE(false, "LV Color format not found");
+        break;
+    }
+
+    // Free memory allocated for test buffers
+    free(test_case->buf.p_dest_asm_alloc);
+    free(test_case->buf.p_dest_ansi_alloc);
+    free(test_case->buf.p_src_alloc);
+}
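Note the pointer bookkeeping around the canary region: fill_test_bufs() advances the destination pointers past the leading canaries before blending, and the rewind above is done directly on the void * members, which works because GNU C treats void * arithmetic as byte-sized. The round trip, side by side:

    dest_buf_asm += canary_pixels * dest_data_type_size;                // in fill_test_bufs(): skip leading canaries
    /* ... blend runs on the shifted pointers ... */
    test_case->buf.p_dest_asm -= canary_pixels * dest_data_type_size;   // before evaluation: rewind to the canaries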
+
+static void fill_test_bufs(func_test_case_lv_image_params_t *test_case)
+{
+    const size_t src_data_type_size = test_case->src_data_type_size;        // sizeof() of used data type in the source buffer
+    const size_t dest_data_type_size = test_case->dest_data_type_size;      // sizeof() of used data type in the destination buffer
+    const size_t src_buf_len = test_case->src_buf_len;                      // Total source buffer length, data part of the source buffer including matrix padding (no canary pixels are used for the source buffer)
+    const size_t total_dest_buf_len = test_case->total_dest_buf_len;        // Total destination buffer length, data part of the destination buffer including the canary pixels and matrix padding
+    const size_t active_dest_buf_len = test_case->active_dest_buf_len;      // Length of the data part of the destination buffer including matrix padding
+    const size_t canary_pixels = test_case->canary_pixels;                  // Canary pixels, according to the data type
+    const unsigned int src_unalign_byte = test_case->src_unalign_byte;      // Unalignment bytes for source buffer
+    const unsigned int dest_unalign_byte = test_case->dest_unalign_byte;    // Unalignment bytes for destination buffer
+
+    // Allocate destination arrays and source array for Assembly and ANSI LVGL Blend API
+    void *src_mem_common = memalign(16, (src_buf_len * src_data_type_size) + src_unalign_byte);
+    void *dest_mem_asm = memalign(16, (total_dest_buf_len * dest_data_type_size) + dest_unalign_byte);
+    void *dest_mem_ansi = memalign(16, (total_dest_buf_len * dest_data_type_size) + dest_unalign_byte);
+    TEST_ASSERT_NOT_NULL_MESSAGE(src_mem_common, "Lack of memory");
+    TEST_ASSERT_NOT_NULL_MESSAGE(dest_mem_asm, "Lack of memory");
+    TEST_ASSERT_NOT_NULL_MESSAGE(dest_mem_ansi, "Lack of memory");
+
+    // Save a pointer to the beginning of the allocated memory, which will be used to free()
+    test_case->buf.p_src_alloc = src_mem_common;
+    test_case->buf.p_dest_asm_alloc = dest_mem_asm;
+    test_case->buf.p_dest_ansi_alloc = dest_mem_ansi;
+
+    // Apply destination and source array unalignment
+    uint8_t *src_buf_common = (uint8_t *)src_mem_common + src_unalign_byte;
+    uint8_t *dest_buf_asm = (uint8_t *)dest_mem_asm + dest_unalign_byte;
+    uint8_t *dest_buf_ansi = (uint8_t *)dest_mem_ansi + dest_unalign_byte;
+
+    // Set the whole buffer to 0, including the canary pixels part
+    memset(src_buf_common, 0, src_buf_len * src_data_type_size);
+    memset(dest_buf_asm, 0, total_dest_buf_len * dest_data_type_size);
+    memset(dest_buf_ansi, 0, total_dest_buf_len * dest_data_type_size);
+
+    switch (test_case->operation_type) {
+    case OPERATION_FILL:
+        // Fill the actual part of the destination buffers with known values;
+        // both buffers must hold identical data, because the stride padding area is compared too
+
+        if (test_case->color_format == LV_COLOR_FORMAT_RGB565) {
+            uint16_t *dest_buf_asm_uint16 = (uint16_t *)dest_buf_asm;
+            uint16_t *dest_buf_ansi_uint16 = (uint16_t *)dest_buf_ansi;
+            uint16_t *src_buf_uint16 = (uint16_t *)src_buf_common;
+
+            // Fill destination buffers
+            for (int i = 0; i < active_dest_buf_len; i++) {
+                dest_buf_asm_uint16[canary_pixels + i] = i + ((i & 1) ? 0x6699 : 0x9966);
+                dest_buf_ansi_uint16[canary_pixels + i] = dest_buf_asm_uint16[canary_pixels + i];
+            }
+
+            // Fill source buffer
+            for (int i = 0; i < src_buf_len; i++) {
+                src_buf_uint16[i] = i + ((i & 1) ? 0x55AA : 0xAA55);
+            }
+        }
+
+        break;
+    default:
+        TEST_ASSERT_MESSAGE(false, "LV Operation not found");
+        break;
+    }
+
+    // Shift array pointers by (Canary pixels amount * data type length) forward
+    dest_buf_asm += canary_pixels * dest_data_type_size;
+    dest_buf_ansi += canary_pixels * dest_data_type_size;
+
+    // Save a pointer to the working part of the memory, where the test data are stored
+    test_case->buf.p_src = (void *)src_buf_common;
+    test_case->buf.p_dest_asm = (void *)dest_buf_asm;
+    test_case->buf.p_dest_ansi = (void *)dest_buf_ansi;
+
+#if DBG_PRINT_OUTPUT
+    printf("Destination buffers fill:\n");
+    for (uint32_t i = 0; i < test_case->active_dest_buf_len; i++) {
+        printf("dest_buf[%"PRIi32"] %s ansi = %8"PRIx16" \t asm = %8"PRIx16" \n", i, ((i < 10) ? (" ") : ("")), ((uint16_t *)test_case->buf.p_dest_ansi)[i], ((uint16_t *)test_case->buf.p_dest_asm)[i]);
+    }
+    printf("\n");
+
+    printf("Source buffer fill:\n");
+    for (uint32_t i = 0; i < test_case->src_buf_len; i++) {
+        printf("src_buf[%"PRIi32"] %s = %8"PRIx16" \n", i, ((i < 10) ? (" ") : ("")), ((uint16_t *)test_case->buf.p_src)[i]);
+    }
+    printf("\n");
+#endif
+
+}
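The fill constants appear to be chosen so that consecutive pixels differ in both bytes, so a byte swap or an off-by-one-pixel copy cannot reproduce the expected pattern by accident. The first few generated values:

    // src:  0xAA55, 0x55AB, 0xAA57, 0x55AD, ...   (i + alternating 0xAA55 / 0x55AA)
    // dest: 0x9966, 0x669A, 0x9968, 0x669C, ...   (i + alternating 0x9966 / 0x6699)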
(" ") : ("")), ((uint16_t *)test_case->buf.p_src)[i]); + } + printf("\n"); +#endif + + // Canary pixels area must stay 0 + const size_t canary_pixels = test_case->canary_pixels; + TEST_ASSERT_EACH_EQUAL_UINT16_MESSAGE(0, (uint16_t *)test_case->buf.p_dest_ansi, canary_pixels, test_msg_buf); + TEST_ASSERT_EACH_EQUAL_UINT16_MESSAGE(0, (uint16_t *)test_case->buf.p_dest_asm, canary_pixels, test_msg_buf); + + // dest_buf_asm and dest_buf_ansi must be equal + TEST_ASSERT_EQUAL_UINT16_ARRAY_MESSAGE((uint16_t *)test_case->buf.p_dest_ansi + canary_pixels, (uint16_t *)test_case->buf.p_dest_asm + canary_pixels, test_case->active_dest_buf_len, test_msg_buf); + + // Data part of the destination buffer and source buffer (not considering matrix padding) must be equal + uint16_t *dest_row_begin = (uint16_t *)test_case->buf.p_dest_asm + canary_pixels; + uint16_t *src_row_begin = (uint16_t *)test_case->buf.p_src; + for (int row = 0; row < test_case->dest_h; row++) { + TEST_ASSERT_EQUAL_UINT16_ARRAY_MESSAGE(dest_row_begin, src_row_begin, test_case->dest_w, test_msg_buf); + dest_row_begin += test_case->dest_stride; // Move pointer of the destination buffer to the next row + src_row_begin += test_case->src_stride; // Move pointer of the source buffer to the next row + } + + // Canary pixels area must stay 0 + TEST_ASSERT_EACH_EQUAL_UINT16_MESSAGE(0, (uint16_t *)test_case->buf.p_dest_ansi + (test_case->total_dest_buf_len - canary_pixels), canary_pixels, test_msg_buf); + TEST_ASSERT_EACH_EQUAL_UINT16_MESSAGE(0, (uint16_t *)test_case->buf.p_dest_asm + (test_case->total_dest_buf_len - canary_pixels), canary_pixels, test_msg_buf); +}