diff --git a/.gitignore b/.gitignore
index 485cccfcf..29234d44d 100644
--- a/.gitignore
+++ b/.gitignore
@@ -12,3 +12,6 @@
# Clangd cache
.cache
+
+# Clangd configurations
+.clangd
diff --git a/.gitmodules b/.gitmodules
index 00d892bd3..77bef44d1 100644
--- a/.gitmodules
+++ b/.gitmodules
@@ -7,3 +7,7 @@
path = thirdparty/mimalloc
url = https://github.com/microsoft/mimalloc.git
shallow = true
+[submodule "thirdparty/riscv-gnu-toolchain"]
+ path = thirdparty/riscv-gnu-toolchain
+ url = https://github.com/riscv-collab/riscv-gnu-toolchain.git
+ shallow = true
diff --git a/CMakeLists.txt b/CMakeLists.txt
index 841444541..486346744 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -22,6 +22,7 @@ project(buddy-mlir LANGUAGES CXX C)
set(CMAKE_CXX_STANDARD 17)
set(CMAKE_CXX_STANDARD_REQUIRED YES)
+include(ExternalProject)
#-------------------------------------------------------------------------------
# Options and settings
@@ -41,13 +42,15 @@ if(CMAKE_SOURCE_DIR STREQUAL CMAKE_CURRENT_SOURCE_DIR OR BUDDY_MLIR_OUT_OF_TREE_
message(STATUS "Using MLIRConfig.cmake in: ${MLIR_DIR}")
message(STATUS "Using LLVMConfig.cmake in: ${LLVM_DIR}")
- set(LLVM_MLIR_BINARY_DIR ${MLIR_DIR}/../../../bin)
- set(LLVM_MLIR_LIBRARY_DIR ${MLIR_DIR}/../../../lib)
- set(LLVM_PROJECT_BUILD_DIR ${MLIR_DIR}/../../../)
- if(NOT DEFINED LLVM_PROJECT_SOURCE_DIR)
- get_filename_component(LLVM_PROJECT_SOURCE_DIR ${CMAKE_CURRENT_SOURCE_DIR}/llvm/ ABSOLUTE)
+ # LLVM_MAIN_SRC_DIR is a private variable for the LLVM in-tree build.
+ # To provide compatibility for unifying the one-step and two-step build,
+ # we set LLVM_MAIN_SRC_DIR ourselves here.
+ # This could benefit users who want to specify a custom LLVM source directory,
+ # but also not interfere with normal users who just want to use the buddy-mlir provided LLVM sources.
+ if(NOT DEFINED LLVM_MAIN_SRC_DIR)
+ get_filename_component(LLVM_MAIN_SRC_DIR ${CMAKE_CURRENT_SOURCE_DIR}/llvm/llvm ABSOLUTE)
endif()
- set(LLVM_MLIR_SOURCE_DIR ${LLVM_PROJECT_SOURCE_DIR}/mlir)
+ set(MLIR_MAIN_SRC_DIR ${LLVM_MAIN_SRC_DIR}/../mlir)
list(APPEND CMAKE_MODULE_PATH "${MLIR_CMAKE_DIR}")
list(APPEND CMAKE_MODULE_PATH "${LLVM_CMAKE_DIR}")
@@ -65,16 +68,9 @@ else()
#-------------------------------------------------------------------------------
# MLIR/LLVM Configuration
#-------------------------------------------------------------------------------
-
- # Allow using out-of-tree llvm directory
- set(LLVM_PROJECT_SOURCE_DIR ${LLVM_MAIN_SRC_DIR}/..)
- message(STATUS "Using LLVM Project ${LLVM_PROJECT_SOURCE_DIR}")
-
set(MLIR_MAIN_SRC_DIR ${LLVM_MAIN_SRC_DIR}/../mlir)
set(MLIR_INCLUDE_DIR ${MLIR_MAIN_SRC_DIR}/include)
set(MLIR_GENERATED_INCLUDE_DIR ${LLVM_BINARY_DIR}/tools/mlir/include)
- set(LLVM_MLIR_BINARY_DIR ${CMAKE_BINARY_DIR}/bin)
- set(MLIR_INCLUDE_DIRS "${MLIR_INCLUDE_DIR};${MLIR_GENERATED_INCLUDE_DIR}")
endif()
#-------------------------------------------------------------------------------
@@ -188,6 +184,24 @@ if(BUDDY_MLIR_USE_MIMALLOC)
find_package(mimalloc REQUIRED)
endif()
+#-------------------------------------------------------------------------------
+# The RISC-V toolchain
+#-------------------------------------------------------------------------------
+
+if(BUDDY_MLIR_ENABLE_RISCV_GNU_TOOLCHAIN)
+ set(RISCV_GNU_TOOLCHAIN_DIR "${BUDDY_SOURCE_DIR}/thirdparty/riscv-gnu-toolchain")
+ set(RISCV_GNU_TOOLCHAIN_INSTALL_DIR "${CMAKE_BINARY_DIR}/thirdparty/riscv-gnu-toolchain")
+ ExternalProject_Add(
+ riscv-gnu-toolchain
+ SOURCE_DIR ${RISCV_GNU_TOOLCHAIN_DIR}
+ PREFIX ${RISCV_GNU_TOOLCHAIN_INSTALL_DIR}
+ CONFIGURE_COMMAND ${RISCV_GNU_TOOLCHAIN_DIR}/configure --prefix=${RISCV_GNU_TOOLCHAIN_INSTALL_DIR}
+ BUILD_COMMAND make clean && make linux build-qemu -j
+ BUILD_IN_SOURCE TRUE
+ INSTALL_COMMAND ""
+ )
+endif()
+
#-------------------------------------------------------------------------------
# Initialize Python packages
#-------------------------------------------------------------------------------
@@ -201,6 +215,8 @@ if(BUDDY_MLIR_ENABLE_PYTHON_PACKAGES)
# Create empty __init__.py files to make these directories Python packages
file(WRITE ${BUDDY_MLIR_PYTHON_PACKAGES_DIR}/buddy/__init__.py "")
file(WRITE ${BUDDY_MLIR_PYTHON_PACKAGES_DIR}/buddy/compiler/__init__.py "")
+
+ install(DIRECTORY ${BUDDY_MLIR_PYTHON_PACKAGES_DIR}/buddy DESTINATION python_packages)
endif()
#-------------------------------------------------------------------------------
diff --git a/README.md b/README.md
index cb9a5f1c2..be650591b 100644
--- a/README.md
+++ b/README.md
@@ -104,6 +104,17 @@ If you want to add domain-specific framework support, please add the following c
| -------------- | ------------- | ------------- |
| OpenCV | `-DBUDDY_ENABLE_OPENCV=ON` | Add `-DOpenCV_DIR=` or install OpenCV release version on your local device. |
+To build buddy-mlir with custom LLVM sources:
+
+```
+$ cmake -G Ninja .. \
+ -DMLIR_DIR=PATH/TO/LLVM/lib/cmake/mlir \
+ -DLLVM_DIR=PATH/TO/LLVM/lib/cmake/llvm \
+ -DLLVM_ENABLE_ASSERTIONS=ON \
+ -DCMAKE_BUILD_TYPE=RELEASE \
+ -DLLVM_MAIN_SRC_DIR=PATH/TO/LLVM_SOURCE
+```
+
One-step building strategy
If you only want to use our tools and integrate them more easily into your projects, you can choose to use the one-step build strategy.
@@ -134,7 +145,7 @@ This repository have nix flake support. You can follow the [nix installation ins
nix develop .
```
-This will setup a bash shell with `clang`, `clangd`, `cmake`, `ninja`, and other necessary dependencies to build buddy-mlir from source.
+This will set up a bash shell with `clang`, `ccls`, `cmake`, `ninja`, and other necessary dependencies to build buddy-mlir from source.
- If you want to use the buddy-mlir bintools
diff --git a/backend/include/llvm/IR/CMakeLists.txt b/backend/include/llvm/IR/CMakeLists.txt
index b3447eae6..2de6b999b 100644
--- a/backend/include/llvm/IR/CMakeLists.txt
+++ b/backend/include/llvm/IR/CMakeLists.txt
@@ -1,4 +1,4 @@
-include_directories(${LLVM_PROJECT_SOURCE_DIR}/llvm/include/llvm/IR/)
+include_directories(${LLVM_MAIN_SRC_DIR}/include/llvm/IR/)
set(LLVM_TARGET_DEFINITIONS IntrinsicsBuddyExt.td)
tablegen(LLVM IntrinsicImpl.inc -gen-intrinsic-impl)
diff --git a/backend/llvm/lib/Analysis/CMakeLists.txt b/backend/llvm/lib/Analysis/CMakeLists.txt
index 2a3a65971..117f75d89 100644
--- a/backend/llvm/lib/Analysis/CMakeLists.txt
+++ b/backend/llvm/lib/Analysis/CMakeLists.txt
@@ -1,4 +1,4 @@
-set(LLVM_Analysis_DIR ${LLVM_PROJECT_SOURCE_DIR}/llvm/lib/Analysis)
+set(LLVM_Analysis_DIR ${LLVM_MAIN_SRC_DIR}/lib/Analysis)
add_llvm_component_library(LLVMBuddyAnalysis
diff --git a/backend/llvm/lib/AsmParser/CMakeLists.txt b/backend/llvm/lib/AsmParser/CMakeLists.txt
index b5411d100..d687d1d3b 100644
--- a/backend/llvm/lib/AsmParser/CMakeLists.txt
+++ b/backend/llvm/lib/AsmParser/CMakeLists.txt
@@ -1,6 +1,6 @@
# AsmParser
-set(LLVM_AsmParser_DIR ${LLVM_PROJECT_SOURCE_DIR}/llvm/lib/AsmParser)
+set(LLVM_AsmParser_DIR ${LLVM_MAIN_SRC_DIR}/lib/AsmParser)
add_llvm_component_library(LLVMBuddyAsmParser
${LLVM_AsmParser_DIR}/LLLexer.cpp
diff --git a/backend/llvm/lib/Bitcode/Reader/CMakeLists.txt b/backend/llvm/lib/Bitcode/Reader/CMakeLists.txt
index cf92a543f..7ea904801 100644
--- a/backend/llvm/lib/Bitcode/Reader/CMakeLists.txt
+++ b/backend/llvm/lib/Bitcode/Reader/CMakeLists.txt
@@ -1,4 +1,4 @@
-set(LLVM_Reader_DIR ${LLVM_PROJECT_SOURCE_DIR}/llvm/lib/Bitcode/Reader)
+set(LLVM_Reader_DIR ${LLVM_MAIN_SRC_DIR}/lib/Bitcode/Reader)
add_llvm_component_library(LLVMBuddyBitReader
${LLVM_Reader_DIR}/BitcodeAnalyzer.cpp
diff --git a/backend/llvm/lib/Bitcode/Writer/CMakeLists.txt b/backend/llvm/lib/Bitcode/Writer/CMakeLists.txt
index f19595cea..a8b7f0c27 100644
--- a/backend/llvm/lib/Bitcode/Writer/CMakeLists.txt
+++ b/backend/llvm/lib/Bitcode/Writer/CMakeLists.txt
@@ -1,4 +1,4 @@
-set(LLVM_Writer_DIR ${LLVM_PROJECT_SOURCE_DIR}/llvm/lib/Bitcode/Writer)
+set(LLVM_Writer_DIR ${LLVM_MAIN_SRC_DIR}/lib/Bitcode/Writer)
add_llvm_component_library(LLVMBuddyBitWriter
diff --git a/backend/llvm/lib/CodeGen/AsmPrinter/CMakeLists.txt b/backend/llvm/lib/CodeGen/AsmPrinter/CMakeLists.txt
index fe3273dd5..b942f4f73 100644
--- a/backend/llvm/lib/CodeGen/AsmPrinter/CMakeLists.txt
+++ b/backend/llvm/lib/CodeGen/AsmPrinter/CMakeLists.txt
@@ -1,4 +1,4 @@
-set(LLVM_AsmPrinter_DIR ${LLVM_PROJECT_SOURCE_DIR}/llvm/lib/CodeGen/AsmPrinter)
+set(LLVM_AsmPrinter_DIR ${LLVM_MAIN_SRC_DIR}/lib/CodeGen/AsmPrinter)
add_llvm_component_library(LLVMBuddyAsmPrinter
${LLVM_AsmPrinter_DIR}/AccelTable.cpp
diff --git a/backend/llvm/lib/CodeGen/CMakeLists.txt b/backend/llvm/lib/CodeGen/CMakeLists.txt
index 1794b38fa..7eb38876d 100644
--- a/backend/llvm/lib/CodeGen/CMakeLists.txt
+++ b/backend/llvm/lib/CodeGen/CMakeLists.txt
@@ -1,4 +1,4 @@
-set(LLVM_CodeGen_DIR ${LLVM_PROJECT_SOURCE_DIR}/llvm/lib/CodeGen)
+set(LLVM_CodeGen_DIR ${LLVM_MAIN_SRC_DIR}/lib/CodeGen)
add_llvm_component_library(LLVMBuddyCodeGen
${LLVM_CodeGen_DIR}/AggressiveAntiDepBreaker.cpp
diff --git a/backend/llvm/lib/CodeGen/MIRParser/CMakeLists.txt b/backend/llvm/lib/CodeGen/MIRParser/CMakeLists.txt
index 6275b1ece..1ab94ee93 100644
--- a/backend/llvm/lib/CodeGen/MIRParser/CMakeLists.txt
+++ b/backend/llvm/lib/CodeGen/MIRParser/CMakeLists.txt
@@ -1,4 +1,4 @@
-set(LLVM_MIRParser_DIR ${LLVM_PROJECT_SOURCE_DIR}/llvm/lib/CodeGen/MIRParser)
+set(LLVM_MIRParser_DIR ${LLVM_MAIN_SRC_DIR}/lib/CodeGen/MIRParser)
add_llvm_component_library(LLVMBuddyMIRParser
${LLVM_MIRParser_DIR}/MILexer.cpp
diff --git a/backend/llvm/lib/CodeGen/SelectionDAG/CMakeLists.txt b/backend/llvm/lib/CodeGen/SelectionDAG/CMakeLists.txt
index 4bb3cde98..3b467a4ed 100644
--- a/backend/llvm/lib/CodeGen/SelectionDAG/CMakeLists.txt
+++ b/backend/llvm/lib/CodeGen/SelectionDAG/CMakeLists.txt
@@ -1,4 +1,4 @@
-set(LLVM_SelectionDAG_DIR ${LLVM_PROJECT_SOURCE_DIR}/llvm/lib/CodeGen/SelectionDAG)
+set(LLVM_SelectionDAG_DIR ${LLVM_MAIN_SRC_DIR}/lib/CodeGen/SelectionDAG)
add_llvm_component_library(LLVMBuddySelectionDAG
${LLVM_SelectionDAG_DIR}/DAGCombiner.cpp
diff --git a/backend/llvm/lib/IR/CMakeLists.txt b/backend/llvm/lib/IR/CMakeLists.txt
index e6895a1f8..0d5618473 100644
--- a/backend/llvm/lib/IR/CMakeLists.txt
+++ b/backend/llvm/lib/IR/CMakeLists.txt
@@ -1,4 +1,4 @@
-set(LLVM_IR_DIR ${LLVM_PROJECT_SOURCE_DIR}/llvm/lib/IR)
+set(LLVM_IR_DIR ${LLVM_MAIN_SRC_DIR}/lib/IR)
add_llvm_component_library(LLVMBuddyCore
${LLVM_IR_DIR}/AbstractCallSite.cpp
diff --git a/backend/llvm/lib/IRReader/CMakeLists.txt b/backend/llvm/lib/IRReader/CMakeLists.txt
index 9b315dec3..72e95722a 100644
--- a/backend/llvm/lib/IRReader/CMakeLists.txt
+++ b/backend/llvm/lib/IRReader/CMakeLists.txt
@@ -1,4 +1,4 @@
-set(LLVM_IRReader_DIR ${LLVM_PROJECT_SOURCE_DIR}/llvm/lib/IRReader)
+set(LLVM_IRReader_DIR ${LLVM_MAIN_SRC_DIR}/lib/IRReader)
add_llvm_component_library(LLVMBuddyIRReader
${LLVM_IRReader_DIR}/IRReader.cpp
diff --git a/backend/llvm/lib/Object/CMakeLists.txt b/backend/llvm/lib/Object/CMakeLists.txt
index 8695d55ba..a8425e97c 100644
--- a/backend/llvm/lib/Object/CMakeLists.txt
+++ b/backend/llvm/lib/Object/CMakeLists.txt
@@ -1,4 +1,4 @@
-set(LLVM_Object_DIR ${LLVM_PROJECT_SOURCE_DIR}/llvm/lib/Object)
+set(LLVM_Object_DIR ${LLVM_MAIN_SRC_DIR}/lib/Object)
add_llvm_component_library(LLVMBuddyObject
${LLVM_Object_DIR}/Archive.cpp
diff --git a/backend/llvm/lib/ProfileData/CMakeLists.txt b/backend/llvm/lib/ProfileData/CMakeLists.txt
index 9ae05a36f..742ecf662 100644
--- a/backend/llvm/lib/ProfileData/CMakeLists.txt
+++ b/backend/llvm/lib/ProfileData/CMakeLists.txt
@@ -1,4 +1,4 @@
-set(LLVM_ProfileData_DIR ${LLVM_PROJECT_SOURCE_DIR}/llvm/lib/ProfileData)
+set(LLVM_ProfileData_DIR ${LLVM_MAIN_SRC_DIR}/lib/ProfileData)
add_llvm_component_library(LLVMBuddyProfileData
${LLVM_ProfileData_DIR}/GCOV.cpp
diff --git a/backend/llvm/lib/Remarks/CMakeLists.txt b/backend/llvm/lib/Remarks/CMakeLists.txt
index 4ed877577..5c1c81b7d 100644
--- a/backend/llvm/lib/Remarks/CMakeLists.txt
+++ b/backend/llvm/lib/Remarks/CMakeLists.txt
@@ -1,4 +1,4 @@
-set(LLVM_Remarks_DIR ${LLVM_PROJECT_SOURCE_DIR}/llvm/lib/Remarks)
+set(LLVM_Remarks_DIR ${LLVM_MAIN_SRC_DIR}/lib/Remarks)
add_llvm_component_library(LLVMBuddyRemarks
${LLVM_Remarks_DIR}/BitstreamRemarkParser.cpp
diff --git a/backend/llvm/lib/Target/CMakeLists.txt b/backend/llvm/lib/Target/CMakeLists.txt
index c6298c383..1dd5cd34f 100644
--- a/backend/llvm/lib/Target/CMakeLists.txt
+++ b/backend/llvm/lib/Target/CMakeLists.txt
@@ -2,7 +2,7 @@ list(APPEND LLVM_COMMON_DEPENDS buddy_intrinsics_gen)
list(APPEND LLVM_TABLEGEN_FLAGS -I ${LLVM_MAIN_SRC_DIR}/lib/Target)
-set(LLVM_Target_DIR ${LLVM_PROJECT_SOURCE_DIR}/llvm/lib/Target)
+set(LLVM_Target_DIR ${LLVM_MAIN_SRC_DIR}/lib/Target)
add_llvm_component_library(LLVMBuddyTarget
${LLVM_Target_DIR}/Target.cpp
diff --git a/backend/llvm/lib/Target/RISCV/CMakeLists.txt b/backend/llvm/lib/Target/RISCV/CMakeLists.txt
index 4a66f6529..6bfee7c2f 100644
--- a/backend/llvm/lib/Target/RISCV/CMakeLists.txt
+++ b/backend/llvm/lib/Target/RISCV/CMakeLists.txt
@@ -21,7 +21,7 @@ macro(buddy_add_llvm_target target_name)
set( CURRENT_LLVM_TARGET LLVM${target_name} )
endmacro(buddy_add_llvm_target)
-set(LLVM_TARGET_RISCV_DIR ${LLVM_PROJECT_SOURCE_DIR}/llvm/lib/Target/RISCV)
+set(LLVM_TARGET_RISCV_DIR ${LLVM_MAIN_SRC_DIR}/lib/Target/RISCV)
# ------------------------------------------------------------------------------
# Configure RISC-V Buddy Extension.
diff --git a/backend/llvm/lib/Transforms/IPO/CMakeLists.txt b/backend/llvm/lib/Transforms/IPO/CMakeLists.txt
index 74ff79863..08392abf8 100644
--- a/backend/llvm/lib/Transforms/IPO/CMakeLists.txt
+++ b/backend/llvm/lib/Transforms/IPO/CMakeLists.txt
@@ -1,4 +1,4 @@
-set(LLVM_IPO_DIR ${LLVM_PROJECT_SOURCE_DIR}/llvm/lib/Transforms/IPO)
+set(LLVM_IPO_DIR ${LLVM_MAIN_SRC_DIR}/lib/Transforms/IPO)
add_llvm_component_library(LLVMBuddyIPO
${LLVM_IPO_DIR}/AlwaysInliner.cpp
diff --git a/backend/llvm/lib/Transforms/Scalar/CMakeLists.txt b/backend/llvm/lib/Transforms/Scalar/CMakeLists.txt
index c3c412b9a..6bbcf432e 100644
--- a/backend/llvm/lib/Transforms/Scalar/CMakeLists.txt
+++ b/backend/llvm/lib/Transforms/Scalar/CMakeLists.txt
@@ -1,4 +1,4 @@
-set(LLVM_Scalar_DIR ${LLVM_PROJECT_SOURCE_DIR}/llvm/lib/Transforms/Scalar)
+set(LLVM_Scalar_DIR ${LLVM_MAIN_SRC_DIR}/lib/Transforms/Scalar)
add_llvm_component_library(LLVMBuddyScalarOpts
${LLVM_Scalar_DIR}/ADCE.cpp
diff --git a/backend/llvm/lib/Transforms/Utils/CMakeLists.txt b/backend/llvm/lib/Transforms/Utils/CMakeLists.txt
index 989a672ed..e3313e07b 100644
--- a/backend/llvm/lib/Transforms/Utils/CMakeLists.txt
+++ b/backend/llvm/lib/Transforms/Utils/CMakeLists.txt
@@ -1,4 +1,4 @@
-set(LLVM_Utils_DIR ${LLVM_PROJECT_SOURCE_DIR}/llvm/lib/Transforms/Utils)
+set(LLVM_Utils_DIR ${LLVM_MAIN_SRC_DIR}/lib/Transforms/Utils)
add_llvm_component_library(LLVMBuddyTransformUtils
diff --git a/backend/llvm/lib/Transforms/Vectorize/CMakeLists.txt b/backend/llvm/lib/Transforms/Vectorize/CMakeLists.txt
index e9cece2c4..669aae585 100644
--- a/backend/llvm/lib/Transforms/Vectorize/CMakeLists.txt
+++ b/backend/llvm/lib/Transforms/Vectorize/CMakeLists.txt
@@ -1,4 +1,4 @@
-set(LLVM_Vectorize_DIR ${LLVM_PROJECT_SOURCE_DIR}/llvm/lib/Transforms/Vectorize)
+set(LLVM_Vectorize_DIR ${LLVM_MAIN_SRC_DIR}/lib/Transforms/Vectorize)
add_llvm_component_library(LLVMBuddyVectorize
${LLVM_Vectorize_DIR}/LoadStoreVectorizer.cpp
diff --git a/docs/PythonEnvironment.md b/docs/PythonEnvironment.md
new file mode 100644
index 000000000..77f431e85
--- /dev/null
+++ b/docs/PythonEnvironment.md
@@ -0,0 +1,10 @@
+# Python Virtual Environment Setup Guide for Buddy-mlir
+
+We recommend using anaconda3 to create a Python virtual environment. Install the Python packages listed in buddy-mlir/requirements.txt.
+
+```bash
+$ conda create -n buddy python=3.11
+$ conda activate buddy
+$ cd buddy-mlir
+$ pip install -r requirements.txt
+```
\ No newline at end of file
diff --git a/docs/RVVEnvironment.md b/docs/RVVEnvironment.md
new file mode 100644
index 000000000..ddca0ab8f
--- /dev/null
+++ b/docs/RVVEnvironment.md
@@ -0,0 +1,153 @@
+# Environment Setup Guide for MLIR and RVV Testing and Experiments
+
+This guide provides instructions on setting up an environment to test the RISC-V Vector Extension using the buddy-mlir project.
+The target platform for emulation is QEMU.
+
+## Requirements
+
+Before proceeding any further, make sure you have installed the dependencies below.
+
+* [LLVM dependencies](https://llvm.org/docs/GettingStarted.html#requirements)
+* [GNU Toolchain dependencies](https://github.com/riscv-collab/riscv-gnu-toolchain#prerequisites)
+* [QEMU dependencies](https://wiki.qemu.org/Hosts/Linux)
+
+## Build Steps
+
+> **_NOTE:_** The build process includes several heavy stages. It may take significant time to clone and build all components.
+
+0. Prepare `buddy-mlir` and Submodules
+
+```
+$ git clone https://github.com/buddy-compiler/buddy-mlir.git
+$ cd buddy-mlir
+$ git submodule update --init
+```
+
+1. Build Local LLVM/MLIR
+
+```
+$ cd buddy-mlir
+$ mkdir llvm/build
+$ cd llvm/build
+$ cmake -G Ninja ../llvm \
+ -DLLVM_ENABLE_PROJECTS="mlir;clang;openmp" \
+ -DLLVM_TARGETS_TO_BUILD="host;RISCV" \
+ -DLLVM_ENABLE_ASSERTIONS=ON \
+ -DOPENMP_ENABLE_LIBOMPTARGET=OFF \
+ -DCMAKE_BUILD_TYPE=RELEASE \
+ -DMLIR_ENABLE_BINDINGS_PYTHON=ON \
+ -DPython3_EXECUTABLE=$(which python3)
+$ ninja check-clang check-mlir omp
+$ export BUILD_LOCAL_LLVM_DIR=$PWD
+```
+
+2. Build Local `buddy-mlir`
+
+```
+$ cd buddy-mlir
+$ mkdir build
+$ cd build
+$ cmake -G Ninja .. \
+ -DMLIR_DIR=$PWD/../llvm/build/lib/cmake/mlir \
+ -DLLVM_DIR=$PWD/../llvm/build/lib/cmake/llvm \
+ -DLLVM_ENABLE_ASSERTIONS=ON \
+ -DCMAKE_BUILD_TYPE=RELEASE \
+ -DBUDDY_MLIR_ENABLE_RISCV_GNU_TOOLCHAIN=ON \
+ -DBUDDY_MLIR_ENABLE_PYTHON_PACKAGES=ON \
+ -DPython3_EXECUTABLE=$(which python3)
+$ ninja
+$ ninja check-buddy
+$ export BUILD_RISCV_GNU_TOOLCHAIN_DIR=$PWD/thirdparty/riscv-gnu-toolchain/
+$ export RISCV_GNU_TOOLCHAIN_SYSROOT_DIR=${BUILD_RISCV_GNU_TOOLCHAIN_DIR}/sysroot/
+```
+
+3. Build Cross-Compiled Clang
+
+```
+$ cd buddy-mlir
+$ mkdir llvm/build-cross-clang-rv
+$ cd llvm/build-cross-clang-rv
+$ cmake -G Ninja ../llvm \
+ -DLLVM_ENABLE_PROJECTS="clang" \
+ -DLLVM_TARGETS_TO_BUILD="RISCV" \
+ -DCMAKE_SYSTEM_NAME=Linux \
+ -DCMAKE_C_COMPILER=${BUILD_LOCAL_LLVM_DIR}/bin/clang \
+ -DCMAKE_CXX_COMPILER=${BUILD_LOCAL_LLVM_DIR}/bin/clang++ \
+ -DCMAKE_C_FLAGS="--target=riscv64-unknown-linux-gnu --sysroot=${RISCV_GNU_TOOLCHAIN_SYSROOT_DIR} --gcc-toolchain=${BUILD_RISCV_GNU_TOOLCHAIN_DIR}" \
+ -DCMAKE_CXX_FLAGS="--target=riscv64-unknown-linux-gnu --sysroot=${RISCV_GNU_TOOLCHAIN_SYSROOT_DIR} --gcc-toolchain=${BUILD_RISCV_GNU_TOOLCHAIN_DIR}" \
+ -DLLVM_TABLEGEN=${BUILD_LOCAL_LLVM_DIR}/bin/llvm-tblgen \
+ -DCLANG_TABLEGEN=${BUILD_LOCAL_LLVM_DIR}/bin/clang-tblgen \
+ -DLLVM_DEFAULT_TARGET_TRIPLE=riscv64-unknown-linux-gnu \
+ -DLLVM_TARGET_ARCH=RISCV64 \
+ -DCMAKE_BUILD_TYPE=Release \
+ -DLLVM_ENABLE_ZSTD=Off
+$ ninja clang lli
+```
+
+4. Build Cross-Compiled MLIR
+
+```
+$ cd buddy-mlir
+$ mkdir llvm/build-cross-mlir-rv
+$ cd llvm/build-cross-mlir-rv
+$ cmake -G Ninja ../../llvm/llvm \
+ -DLLVM_ENABLE_PROJECTS="mlir" \
+ -DLLVM_BUILD_EXAMPLES=OFF \
+ -DCMAKE_CROSSCOMPILING=True \
+ -DLLVM_TARGET_ARCH=RISCV64 \
+ -DLLVM_TARGETS_TO_BUILD=RISCV \
+ -DCMAKE_BUILD_TYPE=Release \
+ -DLLVM_ENABLE_ASSERTIONS=ON \
+ -DLLVM_NATIVE_ARCH=RISCV \
+ -DLLVM_HOST_TRIPLE=riscv64-unknown-linux-gnu \
+ -DLLVM_DEFAULT_TARGET_TRIPLE=riscv64-unknown-linux-gnu \
+ -DCMAKE_C_COMPILER=${BUILD_LOCAL_LLVM_DIR}/bin/clang \
+ -DCMAKE_CXX_COMPILER=${BUILD_LOCAL_LLVM_DIR}/bin/clang++ \
+ -DCMAKE_C_FLAGS="--target=riscv64-unknown-linux-gnu --sysroot=${RISCV_GNU_TOOLCHAIN_SYSROOT_DIR} --gcc-toolchain=${BUILD_RISCV_GNU_TOOLCHAIN_DIR}" \
+ -DCMAKE_CXX_FLAGS="--target=riscv64-unknown-linux-gnu --sysroot=${RISCV_GNU_TOOLCHAIN_SYSROOT_DIR} --gcc-toolchain=${BUILD_RISCV_GNU_TOOLCHAIN_DIR}" \
+ -DMLIR_TABLEGEN=${BUILD_LOCAL_LLVM_DIR}/bin/mlir-tblgen \
+ -DLLVM_TABLEGEN=${BUILD_LOCAL_LLVM_DIR}/bin/llvm-tblgen \
+ -DMLIR_LINALG_ODS_YAML_GEN=${BUILD_LOCAL_LLVM_DIR}/bin/mlir-linalg-ods-yaml-gen \
+ -DMLIR_PDLL_TABLEGEN=${BUILD_LOCAL_LLVM_DIR}/bin/mlir-pdll \
+ -DLLVM_ENABLE_ZSTD=Off
+$ ninja
+$ export BUILD_CROSS_MLIR_DIR=$PWD
+```
+
+5. Build Cross-Compiled `buddy-mlir`
+
+```
+$ cd buddy-mlir
+$ mkdir build-cross-rv
+$ cd build-cross-rv
+$ cmake -G Ninja .. \
+ -DCMAKE_SYSTEM_NAME=Linux \
+ -DMLIR_DIR=${BUILD_CROSS_MLIR_DIR}/lib/cmake/mlir \
+ -DLLVM_DIR=${BUILD_CROSS_MLIR_DIR}/lib/cmake/llvm \
+ -DCMAKE_CROSSCOMPILING=True \
+ -DLLVM_TARGETS_TO_BUILD=RISCV \
+ -DCMAKE_BUILD_TYPE=Release \
+ -DLLVM_ENABLE_ASSERTIONS=ON \
+ -DLLVM_NATIVE_ARCH=RISCV \
+ -DLLVM_HOST_TRIPLE=riscv64-unknown-linux-gnu \
+ -DCMAKE_C_COMPILER=${BUILD_LOCAL_LLVM_DIR}/bin/clang \
+ -DCMAKE_CXX_COMPILER=${BUILD_LOCAL_LLVM_DIR}/bin/clang++ \
+ -DCMAKE_C_FLAGS="--target=riscv64-unknown-linux-gnu --sysroot=${RISCV_GNU_TOOLCHAIN_SYSROOT_DIR} --gcc-toolchain=${BUILD_RISCV_GNU_TOOLCHAIN_DIR}" \
+ -DCMAKE_CXX_FLAGS="--target=riscv64-unknown-linux-gnu --sysroot=${RISCV_GNU_TOOLCHAIN_SYSROOT_DIR} --gcc-toolchain=${BUILD_RISCV_GNU_TOOLCHAIN_DIR}" \
+ -DLLVM_ENABLE_ZSTD=Off
+$ ninja StaticMLIRCRunnerUtils StaticMLIRRunnerUtils
+```
+
+## Testing RVV Environment
+
+```
+$ cd buddy-mlir
+$ cd examples/RVVDialect/
+$ make rvv-mul-add-run
+
+// Expected Output:
+Unranked Memref base@ = 0x55555729aaa0 rank = 1 offset = 0 sizes = [20] strides = [1] data =
+[0, 12, 26, 42, 60, 80, 102, 126, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
+```
+
+Congratulations! Your RVV environment is now fully set up. Enjoy exploring and testing!
diff --git a/docs/rvv-enviroment.md b/docs/rvv-enviroment.md
deleted file mode 100644
index f48a8262d..000000000
--- a/docs/rvv-enviroment.md
+++ /dev/null
@@ -1,35 +0,0 @@
-# Setting up environment for testing MLIR RVV dialect
-
-This guide will help to set up environment for testing RISC-V Vector Extension using buddy-mlir project and
-corresponding RVV Dialect. As a target platform QEMU emulator is used.
-
-## Requirements
-
-Before proceed any further make sure that you installed dependencies below
-
-* [LLVM dependecies](https://llvm.org/docs/GettingStarted.html#requirements)
-* [GNU Toolchain dependecies](https://github.com/riscv-collab/riscv-gnu-toolchain#prerequisites)
-* [QEMU dependecies](https://wiki.qemu.org/Hosts/Linux)
-
-## Build steps
-
-1. Clone buddy-mlir project
-``` bash
-git clone git@github.com:buddy-compiler/buddy-mlir.git
-cd buddy-mlir
-git submodule update --init
-```
-> **_NOTE:_** `buddly-mlir` contains `llvm-project` as a submodule. `llvm-project` is large, so cloning will take a while
-
-2. Run a script building environment
-```
-cd buddy-mlir/thirdparty
-./build-rvv-env.sh
-```
-> **_NOTE:_** The scripts consist of multiple heavy stages, so be patient - it will take a while to clone and build
-everything.
-Detailed description of the steps can be found in [the page](https://gist.github.com/zhanghb97/ad44407e169de298911b8a4235e68497)
-
-> **_NOTE:_** By default, the script allows `make` to use all available threads for compilation. It may lead
-to consuming a lot of memory and crashing the compiler. If you face with the issue, try to limit the number of threads
-by passing a corresponding argument to the script. For example, `./build-rvv-env.sh 4`
diff --git a/examples/BuddyBert/CMakeLists.txt b/examples/BuddyBert/CMakeLists.txt
index 93dc7c2da..95c98dfa9 100644
--- a/examples/BuddyBert/CMakeLists.txt
+++ b/examples/BuddyBert/CMakeLists.txt
@@ -7,13 +7,13 @@ add_custom_command(
add_custom_command(
OUTPUT forward.o
- COMMAND ${LLVM_MLIR_BINARY_DIR}/mlir-opt ${BUDDY_EXAMPLES_DIR}/BuddyBert/forward.mlir
+ COMMAND ${LLVM_TOOLS_BINARY_DIR}/mlir-opt ${BUDDY_EXAMPLES_DIR}/BuddyBert/forward.mlir
-pass-pipeline "builtin.module(func.func(tosa-to-linalg-named, tosa-to-linalg, tosa-to-tensor, tosa-to-arith), empty-tensor-to-alloc-tensor, convert-elementwise-to-linalg, arith-bufferize, func.func(linalg-bufferize, tensor-bufferize), func-bufferize)" |
- ${LLVM_MLIR_BINARY_DIR}/mlir-opt
+ ${LLVM_TOOLS_BINARY_DIR}/mlir-opt
-pass-pipeline "builtin.module(func.func(buffer-deallocation-simplification, convert-linalg-to-loops), eliminate-empty-tensors, func.func(llvm-request-c-wrappers),convert-math-to-llvm, convert-math-to-libm, convert-scf-to-cf, convert-arith-to-llvm, expand-strided-metadata, finalize-memref-to-llvm, convert-func-to-llvm, reconcile-unrealized-casts)" |
- ${LLVM_MLIR_BINARY_DIR}/mlir-translate -mlir-to-llvmir |
- ${LLVM_MLIR_BINARY_DIR}/llvm-as |
- ${LLVM_MLIR_BINARY_DIR}/llc -filetype=obj -relocation-model=pic -O0 -o ${BUDDY_BINARY_DIR}/../examples/BuddyBert/forward.o
+ ${LLVM_TOOLS_BINARY_DIR}/mlir-translate -mlir-to-llvmir |
+ ${LLVM_TOOLS_BINARY_DIR}/llvm-as |
+ ${LLVM_TOOLS_BINARY_DIR}/llc -filetype=obj -relocation-model=pic -O0 -o ${BUDDY_BINARY_DIR}/../examples/BuddyBert/forward.o
DEPENDS ${BUDDY_EXAMPLES_DIR}/BuddyBert/forward.mlir
COMMENT "Building forward.o"
VERBATIM)
@@ -22,11 +22,11 @@ add_custom_command(
OUTPUT subgraph0.o
COMMAND ${BUDDY_BINARY_DIR}/buddy-opt ${BUDDY_EXAMPLES_DIR}/BuddyBert/subgraph0.mlir
-pass-pipeline "builtin.module(func.func(tosa-to-linalg-named, tosa-to-linalg, tosa-to-tensor, tosa-to-arith), empty-tensor-to-alloc-tensor, convert-elementwise-to-linalg, func-bufferize-dynamic-offset, arith-bufferize, func.func(linalg-bufferize, tensor-bufferize))" |
- ${LLVM_MLIR_BINARY_DIR}/mlir-opt
+ ${LLVM_TOOLS_BINARY_DIR}/mlir-opt
-pass-pipeline "builtin.module(func.func(buffer-deallocation-simplification, convert-linalg-to-loops), eliminate-empty-tensors, func.func(llvm-request-c-wrappers),convert-math-to-llvm, convert-math-to-libm, convert-scf-to-cf, convert-arith-to-llvm, expand-strided-metadata, finalize-memref-to-llvm, convert-func-to-llvm, reconcile-unrealized-casts)" |
- ${LLVM_MLIR_BINARY_DIR}/mlir-translate -mlir-to-llvmir |
- ${LLVM_MLIR_BINARY_DIR}/llvm-as |
- ${LLVM_MLIR_BINARY_DIR}/llc -filetype=obj -relocation-model=pic -O0 -o ${BUDDY_BINARY_DIR}/../examples/BuddyBert/subgraph0.o
+ ${LLVM_TOOLS_BINARY_DIR}/mlir-translate -mlir-to-llvmir |
+ ${LLVM_TOOLS_BINARY_DIR}/llvm-as |
+ ${LLVM_TOOLS_BINARY_DIR}/llc -filetype=obj -relocation-model=pic -O0 -o ${BUDDY_BINARY_DIR}/../examples/BuddyBert/subgraph0.o
DEPENDS ${BUDDY_EXAMPLES_DIR}/BuddyBert/subgraph0.mlir
COMMENT "Building subgraph0.o"
VERBATIM)
@@ -36,7 +36,7 @@ add_library(BERT STATIC forward.o subgraph0.o)
SET_TARGET_PROPERTIES(BERT PROPERTIES LINKER_LANGUAGE C)
add_executable(buddy-bert-run bert-main.cpp)
-target_link_directories(buddy-bert-run PRIVATE ${LLVM_MLIR_LIBRARY_DIR})
+target_link_directories(buddy-bert-run PRIVATE ${LLVM_LIBRARY_DIR})
set(BUDDY_BERT_LIBS BERT mlir_c_runner_utils)
target_link_libraries(buddy-bert-run ${BUDDY_BERT_LIBS})
diff --git a/examples/BuddyConvolution/.gitignore b/examples/BuddyConvolution/.gitignore
new file mode 100644
index 000000000..df9389428
--- /dev/null
+++ b/examples/BuddyConvolution/.gitignore
@@ -0,0 +1,4 @@
+log.mlir
+log.ll
+log.s
+a.out
diff --git a/examples/BuddyConvolution/conv2d-nhwc-fhwc-opt.mlir b/examples/BuddyConvolution/conv2d-nhwc-fhwc-opt.mlir
new file mode 100644
index 000000000..76d5e4d93
--- /dev/null
+++ b/examples/BuddyConvolution/conv2d-nhwc-fhwc-opt.mlir
@@ -0,0 +1,137 @@
+// RUN: buddy-opt %s \
+// RUN: -convert-vector-to-scf \
+// RUN: -lower-affine \
+// RUN: -arith-bufferize \
+// RUN: -convert-scf-to-cf \
+// RUN: -convert-vector-to-llvm \
+// RUN: -convert-arith-to-llvm \
+// RUN: -finalize-memref-to-llvm \
+// RUN: -convert-func-to-llvm \
+// RUN: -reconcile-unrealized-casts \
+// RUN: | mlir-cpu-runner -O3 -e main -entry-point-result=void \
+// RUN: -shared-libs=%mlir_runner_utils_dir/libmlir_runner_utils%shlibext \
+// RUN: -shared-libs=%mlir_runner_utils_dir/libmlir_c_runner_utils%shlibext \
+// RUN: | FileCheck %s
+
+// Using `8` as the vector size.
+#map = affine_map<(d0) -> (d0 floordiv 8)>
+#map0 = affine_map<(d0, d1, d2, d3) -> (d2)>
+#map1 = affine_map<(d0, d1) -> (d0 + d1)>
+#map2 = affine_map<(d0, d1) -> (d0 + d1 * 8)>
+#map3 = affine_map<(d0) -> (d0 * 8)>
+
+module {
+ func.func private @printMemrefF32(memref<*xf32>)
+ func.func private @rtclock() -> f64
+
+ func.func @conv_2d_nhwc_fhwc(%arg0: memref, %arg1: memref, %arg2: memref) {
+ %f0 = arith.constant 0. : f32
+ %c0 = arith.constant 0 : index
+ %c1 = arith.constant 1 : index
+ %c2 = arith.constant 2 : index
+ %c3 = arith.constant 3 : index
+ %n = memref.dim %arg0, %c0 : memref
+ %h_i = memref.dim %arg0, %c1 : memref
+ %w_i = memref.dim %arg0, %c2 : memref
+ %c = memref.dim %arg0, %c3 : memref
+ %f = memref.dim %arg1, %c0 : memref
+ %h_k = memref.dim %arg1, %c1 : memref
+ %w_k = memref.dim %arg1, %c2 : memref
+ %h_o = memref.dim %arg2, %c1 : memref
+ %w_o = memref.dim %arg2, %c2 : memref
+
+ // Output is NHoWoF
+ affine.for %idx_n = %c0 to %n {
+ affine.for %idx_f = %c0 to %f {
+ affine.for %idx_c = %c0 to %c {
+ affine.for %idx_h_o = %c0 to %h_o {
+ affine.for %idx_h_k = %c0 to %h_k {
+ affine.for %idx_w_k = %c0 to %w_k {
+ affine.for %idx_w_o = %c0 to #map(%w_o) {
+ %kernel_ele = memref.load %arg1[%idx_f, %idx_h_k, %idx_w_k, %idx_c] : memref
+ %kernel_vec = vector.broadcast %kernel_ele : f32 to vector<8xf32>
+ %in_iter_h = affine.apply #map1 (%idx_h_k, %idx_h_o)
+ %in_iter_w = affine.apply #map2 (%idx_w_k, %idx_w_o)
+ %out_iter_w = affine.apply #map3 (%idx_w_o)
+ %input_vec = vector.transfer_read %arg0[%idx_n, %in_iter_h, %in_iter_w, %idx_c], %f0
+ { permutation_map = #map0 } : memref, vector<8xf32>
+ %output_vec = vector.transfer_read %arg2[%idx_n, %idx_h_o, %out_iter_w, %idx_f], %f0
+ { permutation_map = #map0 } : memref, vector<8xf32>
+ %res_vec = vector.fma %kernel_vec, %input_vec, %output_vec : vector<8xf32>
+ vector.transfer_write %res_vec, %arg2[%idx_n, %idx_h_o, %out_iter_w, %idx_f]
+ { permutation_map = #map0 } : vector<8xf32>, memref
+ }
+ }
+ }
+ }
+ }
+ }
+ }
+
+ return
+ }
+
+ func.func @alloc_f32(%arg0: index, %arg1: index, %arg2: index, %arg3: index, %arg4: f32) -> memref {
+ %c0 = arith.constant 0 : index
+ %c1 = arith.constant 1 : index
+ %0 = memref.alloc(%arg0, %arg1, %arg2, %arg3) : memref
+ scf.for %idx0 = %c0 to %arg0 step %c1 {
+ scf.for %idx1 = %c0 to %arg1 step %c1 {
+ scf.for %idx2 = %c0 to %arg2 step %c1 {
+ scf.for %idx3 = %c0 to %arg3 step %c1 {
+ memref.store %arg4, %0[%idx0, %idx1, %idx2, %idx3] : memref
+ }
+ }
+ }
+ }
+ return %0 : memref
+ }
+
+ func.func @main() {
+ %f0 = arith.constant 0.000000e+00 : f32
+ %f2 = arith.constant 2.000000e+00 : f32
+ %f3 = arith.constant 3.000000e+00 : f32
+
+ %c1 = arith.constant 1 : index
+ %c2 = arith.constant 2 : index
+ %c3 = arith.constant 3 : index
+ %c5 = arith.constant 5 : index
+ %c6 = arith.constant 6 : index
+ %c8 = arith.constant 8 : index
+ %c12 = arith.constant 12 : index
+ %c16 = arith.constant 16 : index
+ %c24 = arith.constant 24 : index
+ %c28 = arith.constant 28 : index
+
+ // %v0 = call @alloc_f32(%c1, %c12, %c12, %c6, %f2) : (index, index, index, index, f32) -> memref
+ // %v1 = call @alloc_f32(%c16, %c5, %c5, %c6, %f3) : (index, index, index, index, f32) -> memref
+ // %v2 = call @alloc_f32(%c1, %c8, %c8, %c16, %f0) : (index, index, index, index, f32) -> memref
+
+ %v0 = call @alloc_f32(%c1, %c28, %c28, %c1, %f2) : (index, index, index, index, f32) -> memref
+ %v1 = call @alloc_f32(%c6, %c5, %c5, %c1, %f3) : (index, index, index, index, f32) -> memref
+ %v2 = call @alloc_f32(%c1, %c24, %c24, %c6, %f0) : (index, index, index, index, f32) -> memref
+
+ %t_start = call @rtclock() : () -> f64
+ call @conv_2d_nhwc_fhwc(%v0, %v1, %v2) : (memref, memref, memref) -> ()
+ %t_end = call @rtclock() : () -> f64
+
+ // All the elements of the MemRef are the same,
+ // only check the first line to verify the correctness.
+ // CHECK: Unranked Memref
+ // CHECK: [
+ // CHECK: [
+ // CHECK: [
+ // CHECK: [150{{(, 150)*}}],
+ %print_v2 = memref.cast %v2 : memref to memref<*xf32>
+ call @printMemrefF32(%print_v2) : (memref<*xf32>) -> ()
+
+ %time = arith.subf %t_end, %t_start : f64
+ vector.print %time : f64
+
+ memref.dealloc %v0 : memref
+ memref.dealloc %v1 : memref
+ memref.dealloc %v2 : memref
+
+ return
+ }
+}
diff --git a/examples/BuddyConvolution/conv2d-nhwc-fhwc.mlir b/examples/BuddyConvolution/conv2d-nhwc-fhwc.mlir
new file mode 100644
index 000000000..90759355e
--- /dev/null
+++ b/examples/BuddyConvolution/conv2d-nhwc-fhwc.mlir
@@ -0,0 +1,88 @@
+// RUN: buddy-opt %s \
+// RUN: -convert-linalg-to-loops \
+// RUN: -lower-affine \
+// RUN: -arith-bufferize \
+// RUN: -convert-scf-to-cf \
+// RUN: -convert-vector-to-llvm \
+// RUN: -convert-arith-to-llvm \
+// RUN: -finalize-memref-to-llvm \
+// RUN: -convert-func-to-llvm \
+// RUN: -reconcile-unrealized-casts \
+// RUN: | mlir-cpu-runner -e main -entry-point-result=void \
+// RUN: -shared-libs=%mlir_runner_utils_dir/libmlir_runner_utils%shlibext \
+// RUN: -shared-libs=%mlir_runner_utils_dir/libmlir_c_runner_utils%shlibext \
+// RUN: | FileCheck %s
+
+module {
+ func.func private @printMemrefF32(memref<*xf32>)
+ func.func private @rtclock() -> f64
+
+ func.func @conv_2d_nhwc_fhwc(%arg0: memref<?x?x?x?xf32>, %arg1: memref<?x?x?x?xf32>, %arg2: memref<?x?x?x?xf32>) {
+ linalg.conv_2d_nhwc_fhwc ins (%arg0, %arg1: memref<?x?x?x?xf32>, memref<?x?x?x?xf32>)
+ outs (%arg2: memref<?x?x?x?xf32>)
+ return
+ }
+
+ func.func @alloc_f32(%arg0: index, %arg1: index, %arg2: index, %arg3: index, %arg4: f32) -> memref<?x?x?x?xf32> {
+ %c0 = arith.constant 0 : index
+ %c1 = arith.constant 1 : index
+ %0 = memref.alloc(%arg0, %arg1, %arg2, %arg3) : memref<?x?x?x?xf32>
+ scf.for %idx0 = %c0 to %arg0 step %c1 {
+ scf.for %idx1 = %c0 to %arg1 step %c1 {
+ scf.for %idx2 = %c0 to %arg2 step %c1 {
+ scf.for %idx3 = %c0 to %arg3 step %c1 {
+ memref.store %arg4, %0[%idx0, %idx1, %idx2, %idx3] : memref<?x?x?x?xf32>
+ }
+ }
+ }
+ }
+ return %0 : memref<?x?x?x?xf32>
+ }
+
+ func.func @main() {
+ %f0 = arith.constant 0.000000e+00 : f32
+ %f2 = arith.constant 2.000000e+00 : f32
+ %f3 = arith.constant 3.000000e+00 : f32
+
+ %c1 = arith.constant 1 : index
+ %c2 = arith.constant 2 : index
+ %c3 = arith.constant 3 : index
+ %c5 = arith.constant 5 : index
+ %c6 = arith.constant 6 : index
+ %c8 = arith.constant 8 : index
+ %c12 = arith.constant 12 : index
+ %c16 = arith.constant 16 : index
+ %c24 = arith.constant 24 : index
+ %c28 = arith.constant 28 : index
+
+ // %v0 = call @alloc_f32(%c1, %c12, %c12, %c6, %f2) : (index, index, index, index, f32) -> memref
+ // %v1 = call @alloc_f32(%c16, %c5, %c5, %c6, %f3) : (index, index, index, index, f32) -> memref
+ // %v2 = call @alloc_f32(%c1, %c8, %c8, %c16, %f0) : (index, index, index, index, f32) -> memref
+
+ %v0 = call @alloc_f32(%c1, %c28, %c28, %c1, %f2) : (index, index, index, index, f32) -> memref<?x?x?x?xf32>
+ %v1 = call @alloc_f32(%c6, %c5, %c5, %c1, %f3) : (index, index, index, index, f32) -> memref<?x?x?x?xf32>
+ %v2 = call @alloc_f32(%c1, %c24, %c24, %c6, %f0) : (index, index, index, index, f32) -> memref<?x?x?x?xf32>
+
+ %t_start = call @rtclock() : () -> f64
+ call @conv_2d_nhwc_fhwc(%v0, %v1, %v2) : (memref<?x?x?x?xf32>, memref<?x?x?x?xf32>, memref<?x?x?x?xf32>) -> ()
+ %t_end = call @rtclock() : () -> f64
+
+ // All the elements of the MemRef are the same,
+ // only check the first line to verify the correctness.
+ // CHECK: Unranked Memref
+ // CHECK: [
+ // CHECK: [
+ // CHECK: [
+ // CHECK: [150{{(, 150)*}}],
+ %print_v2 = memref.cast %v2 : memref<?x?x?x?xf32> to memref<*xf32>
+ call @printMemrefF32(%print_v2) : (memref<*xf32>) -> ()
+
+ %time = arith.subf %t_end, %t_start : f64
+ vector.print %time : f64
+
+ memref.dealloc %v0 : memref<?x?x?x?xf32>
+ memref.dealloc %v1 : memref<?x?x?x?xf32>
+ memref.dealloc %v2 : memref<?x?x?x?xf32>
+ return
+ }
+}
diff --git a/examples/BuddyConvolution/conv2d.mlir b/examples/BuddyConvolution/conv2d.mlir
new file mode 100644
index 000000000..c4f1ac2ef
--- /dev/null
+++ b/examples/BuddyConvolution/conv2d.mlir
@@ -0,0 +1,71 @@
+// RUN: buddy-opt %s \
+// RUN: -conv-vectorization \
+// RUN: -convert-linalg-to-loops \
+// RUN: -lower-affine \
+// RUN: -arith-bufferize \
+// RUN: -convert-scf-to-cf \
+// RUN: -convert-vector-to-llvm \
+// RUN: -convert-arith-to-llvm \
+// RUN: -finalize-memref-to-llvm \
+// RUN: -llvm-request-c-wrappers \
+// RUN: -convert-func-to-llvm \
+// RUN: -reconcile-unrealized-casts \
+// RUN: | mlir-cpu-runner -e main -entry-point-result=void \
+// RUN: -shared-libs=%mlir_runner_utils_dir/libmlir_runner_utils%shlibext \
+// RUN: -shared-libs=%mlir_runner_utils_dir/libmlir_c_runner_utils%shlibext \
+// RUN: | FileCheck %s
+
+#map0 = affine_map<(d0, d1) -> (d0 + d1 - 1)>
+
+module {
+ func.func private @printMemrefF32(memref<*xf32>)
+
+ func.func @conv_2d(%arg0: memref<?x?xf32>, %arg1: memref<?x?xf32>, %arg2: memref<?x?xf32>) {
+ linalg.conv_2d ins (%arg0, %arg1: memref<?x?xf32>, memref<?x?xf32>)
+ outs (%arg2: memref<?x?xf32>)
+ return
+ }
+
+ func.func @alloc_f32(%arg0: index, %arg1: index, %arg2: f32) -> memref<?x?xf32> {
+ %c0 = arith.constant 0 : index
+ %c1 = arith.constant 1 : index
+ %0 = memref.alloc(%arg0, %arg1) : memref<?x?xf32>
+ scf.for %arg3 = %c0 to %arg0 step %c1 {
+ scf.for %arg4 = %c0 to %arg1 step %c1 {
+ memref.store %arg2, %0[%arg3, %arg4] : memref<?x?xf32>
+ }
+ }
+ return %0 : memref<?x?xf32>
+ }
+
+ func.func @main() {
+ %c0 = arith.constant 0.000000e+00 : f32
+ %c1 = arith.constant 1.000000e+00 : f32
+ %c2 = arith.constant 2 : index
+ %c3 = arith.constant 3 : index
+
+ %current_v1 = arith.constant 3 : index
+ %current_v2 = arith.constant 8 : index
+ %current_v0 = affine.apply #map0(%current_v2, %current_v1)
+
+ %v0 = call @alloc_f32(%current_v0, %current_v0, %c1) : (index, index, f32) -> memref<?x?xf32>
+ %v1 = call @alloc_f32(%current_v1, %current_v1, %c1) : (index, index, f32) -> memref<?x?xf32>
+ %v2 = call @alloc_f32(%current_v2, %current_v2, %c0) : (index, index, f32) -> memref<?x?xf32>
+
+ call @conv_2d(%v0, %v1, %v2) : (memref<?x?xf32>, memref<?x?xf32>, memref<?x?xf32>) -> ()
+
+ %print_v2 = memref.cast %v2 : memref<?x?xf32> to memref<*xf32>
+
+ // All the elements of the MemRef are the same,
+ // only check the first line to verify the correctness.
+ // CHECK: Unranked Memref base@ = {{.*}} rank = 2 offset = 0 sizes = [8, 8] strides = [8, 1] data =
+ // CHECK-NEXT: [
+ // CHECK-SAME: [9{{(, 9)*}}],
+ call @printMemrefF32(%print_v2) : (memref<*xf32>) -> ()
+
+ memref.dealloc %v0 : memref<?x?xf32>
+ memref.dealloc %v1 : memref<?x?xf32>
+ memref.dealloc %v2 : memref<?x?xf32>
+ return
+ }
+}
diff --git a/examples/BuddyConvolution/makefile b/examples/BuddyConvolution/makefile
new file mode 100644
index 000000000..196264376
--- /dev/null
+++ b/examples/BuddyConvolution/makefile
@@ -0,0 +1,127 @@
+#!/bin/bash
+BUDDY_OPT := ../../build/bin/buddy-opt
+MLIR_OPT := ../../llvm/build/bin/mlir-opt
+CLANG := ../../llvm/build/bin/clang
+MLIR_TRANSLATE := ../../llvm/build/bin/mlir-translate
+MLIR_CPU_RUNNER := ../../llvm/build/bin/mlir-cpu-runner
+LLC := ../../llvm/build/bin/llc
+OPT_FLAG := -O3
+MLIR_LIB := ../../llvm/build/lib/
+
+ifeq ($(shell uname),Linux)
+MLIR_RUNNER_UTILS := ../../llvm/build/lib/libmlir_runner_utils.so
+MLIR_C_RUNNER_UTILS := ../../llvm/build/lib/libmlir_c_runner_utils.so
+MTRIPLE := x86_64-unknown-linux-gnu
+else ifeq ($(shell uname),Darwin)
+MLIR_RUNNER_UTILS := ../../llvm/build/lib/libmlir_runner_utils.dylib
+MLIR_C_RUNNER_UTILS := ../../llvm/build/lib/libmlir_c_runner_utils.dylib
+MTRIPLE := x86_64-apple-darwin
+endif
+
+conv2d-lower:
+ @${BUDDY_OPT} ./conv2d.mlir \
+ -conv-vectorization \
+ -convert-linalg-to-loops \
+ -lower-affine \
+ -arith-bufferize \
+ -convert-scf-to-cf \
+ -convert-vector-to-llvm \
+ -convert-arith-to-llvm \
+ -finalize-memref-to-llvm \
+ -llvm-request-c-wrappers \
+ -convert-func-to-llvm \
+ -reconcile-unrealized-casts \
+ -o ./log.mlir
+
+conv2d-translate:
+ @${BUDDY_OPT} ./conv2d.mlir \
+ -conv-vectorization \
+ -convert-linalg-to-loops \
+ -lower-affine \
+ -arith-bufferize \
+ -convert-scf-to-cf \
+ -convert-vector-to-llvm \
+ -convert-arith-to-llvm \
+ -finalize-memref-to-llvm \
+ -llvm-request-c-wrappers \
+ -convert-func-to-llvm \
+ -reconcile-unrealized-casts | \
+ ${MLIR_TRANSLATE} --mlir-to-llvmir -o log.ll
+
+conv2d-run:
+ @${BUDDY_OPT} ./conv2d.mlir \
+ -conv-vectorization \
+ -convert-linalg-to-loops \
+ -lower-affine \
+ -arith-bufferize \
+ -convert-scf-to-cf \
+ -convert-vector-to-llvm \
+ -convert-arith-to-llvm \
+ -finalize-memref-to-llvm \
+ -llvm-request-c-wrappers \
+ -convert-func-to-llvm \
+ -reconcile-unrealized-casts | \
+ ${MLIR_CPU_RUNNER} ${OPT_FLAG} -e main -entry-point-result=void \
+ -shared-libs=${MLIR_RUNNER_UTILS} -shared-libs=${MLIR_C_RUNNER_UTILS}
+
+conv2d-nhwc-fhwc-run:
+ @${BUDDY_OPT} ./conv2d-nhwc-fhwc.mlir \
+ -convert-linalg-to-loops \
+ -lower-affine \
+ -arith-bufferize \
+ -convert-scf-to-cf \
+ -convert-vector-to-llvm \
+ -convert-arith-to-llvm \
+ -finalize-memref-to-llvm \
+ -convert-func-to-llvm \
+ -reconcile-unrealized-casts | \
+ ${MLIR_CPU_RUNNER} ${OPT_FLAG} -e main -entry-point-result=void \
+ -shared-libs=${MLIR_RUNNER_UTILS} -shared-libs=${MLIR_C_RUNNER_UTILS}
+
+conv2d-nhwc-fhwc-aot:
+ @${BUDDY_OPT} ./conv2d-nhwc-fhwc.mlir \
+ -convert-linalg-to-loops \
+ -lower-affine \
+ -arith-bufferize \
+ -convert-scf-to-cf \
+ -convert-vector-to-llvm \
+ -convert-arith-to-llvm \
+ -finalize-memref-to-llvm \
+ -convert-func-to-llvm \
+ -reconcile-unrealized-casts | \
+ ${MLIR_TRANSLATE} -mlir-to-llvmir -o log.ll
+ ${CLANG} log.ll ${OPT_FLAG} \
+ -L${MLIR_LIB} -lmlir_runner_utils -lmlir_c_runner_utils \
+ -o a.out
+ @LD_LIBRARY_PATH=${MLIR_LIB} ./a.out
+
+conv2d-nhwc-fhwc-opt-run:
+ @${BUDDY_OPT} ./conv2d-nhwc-fhwc-opt.mlir \
+ -convert-vector-to-scf \
+ -lower-affine \
+ -arith-bufferize \
+ -convert-scf-to-cf \
+ -convert-vector-to-llvm \
+ -convert-arith-to-llvm \
+ -finalize-memref-to-llvm \
+ -convert-func-to-llvm \
+ -reconcile-unrealized-casts | \
+ ${MLIR_CPU_RUNNER} -O3 -e main -entry-point-result=void \
+ -shared-libs=${MLIR_RUNNER_UTILS} -shared-libs=${MLIR_C_RUNNER_UTILS}
+
+conv2d-nhwc-fhwc-opt-aot:
+ @${BUDDY_OPT} ./conv2d-nhwc-fhwc-opt.mlir \
+ -convert-vector-to-scf \
+ -lower-affine \
+ -arith-bufferize \
+ -convert-scf-to-cf \
+ -convert-vector-to-llvm \
+ -convert-arith-to-llvm \
+ -finalize-memref-to-llvm \
+ -convert-func-to-llvm \
+ -reconcile-unrealized-casts | \
+ ${MLIR_TRANSLATE} -mlir-to-llvmir -o log.ll
+ ${CLANG} log.ll -O3 \
+ -L${MLIR_LIB} -lmlir_runner_utils -lmlir_c_runner_utils \
+ -o a.out
+ @LD_LIBRARY_PATH=${MLIR_LIB} ./a.out
diff --git a/examples/BuddyGPU/.gitignore b/examples/BuddyGPU/.gitignore
new file mode 100644
index 000000000..0194ea7a6
--- /dev/null
+++ b/examples/BuddyGPU/.gitignore
@@ -0,0 +1,3 @@
+log.mlir
+log.ll
+log.s
diff --git a/examples/BuddyGPU/makefile b/examples/BuddyGPU/makefile
new file mode 100644
index 000000000..677396d1d
--- /dev/null
+++ b/examples/BuddyGPU/makefile
@@ -0,0 +1,8 @@
+#!/bin/bash
+BUDDY_OPT := ../../build/bin/buddy-opt
+
+buddy-gpu-matmul-lower:
+ @${BUDDY_OPT} matmul.mlir \
+ -transform-preload-library="transform-library-paths=transform.mlir" \
+ -transform-interpreter="entry-point=codegen" \
+ -o log.mlir
diff --git a/examples/BuddyGPU/matmul.mlir b/examples/BuddyGPU/matmul.mlir
new file mode 100644
index 000000000..2f0fa226c
--- /dev/null
+++ b/examples/BuddyGPU/matmul.mlir
@@ -0,0 +1,12 @@
+!unit = f32
+!lhs = tensor<5376x2048x!unit>
+!rhs = tensor<2048x5376x!unit>
+!res = tensor<5376x5376x!unit>
+
+func.func @matmul(%arg0: !lhs, %arg1: !rhs) -> !res {
+ %cst = arith.constant 0.000000e+00 : !unit
+ %0 = tensor.empty() : !res
+ %1 = linalg.fill ins(%cst : !unit) outs(%0 : !res) -> !res
+ %2 = linalg.matmul ins(%arg0, %arg1: !lhs, !rhs) outs(%1: !res) -> !res
+ func.return %2 : !res
+}
diff --git a/examples/BuddyGPU/transform.mlir b/examples/BuddyGPU/transform.mlir
new file mode 100644
index 000000000..ef2645199
--- /dev/null
+++ b/examples/BuddyGPU/transform.mlir
@@ -0,0 +1,23 @@
+module attributes { transform.with_named_sequence } {
+ transform.named_sequence @codegen(%arg0: !transform.any_op) {
+ // Match the target operations and assign them to SSA values.
+ %matmul = transform.structured.match ops{["linalg.matmul"]} in %arg0
+ : (!transform.any_op) -> !transform.any_op
+ %fill = transform.structured.match ops{["linalg.fill"]} in %arg0
+ : (!transform.any_op) -> !transform.any_op
+
+ // Perform tiling for the grid.
+ // For the matrix multiplication of 5376x2048 and 2048x5376, the compilation
+ // strategy sets the tile size for grid-based partitioning to 128x256.
+ // This means that each 128x256 matmul tile is computed within a GPU block,
+ // while multiple such blocks are computed in parallel across the grid.
+ // `tile_sizes` specify the dimensions of the tiled matmul result.
+ // `%tiled_op` is the tiled matmul operation within the `scf.forall` loop.
+ // `%forall_op` is the `scf.forall` loop that maintains tile information.
+ %tiled_op, %forall_op = transform.structured.tile_using_forall %matmul
+ tile_sizes [128, 256] (mapping = [#gpu.block<x>, #gpu.block<y>])
+ : (!transform.any_op) -> (!transform.any_op, !transform.any_op)
+
+ transform.yield
+ }
+} // module
diff --git a/examples/BuddyGen/.gitignore b/examples/BuddyGen/.gitignore
new file mode 100644
index 000000000..df9389428
--- /dev/null
+++ b/examples/BuddyGen/.gitignore
@@ -0,0 +1,4 @@
+log.mlir
+log.ll
+log.s
+a.out
diff --git a/examples/BuddyGen/GenMemRef.cpp b/examples/BuddyGen/GenMemRef.cpp
new file mode 100644
index 000000000..8ca2526b7
--- /dev/null
+++ b/examples/BuddyGen/GenMemRef.cpp
@@ -0,0 +1,43 @@
+//===- GenMemRef.cpp ------------------------------------------------------===//
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+//
+//===----------------------------------------------------------------------===//
+
+// $ export LLVM_DIR=$PWD/../../llvm/
+// $ export LLVM_BUILD_DIR=$LLVM_DIR/build
+// $ c++ GenMemRef.cpp \
+ -I $LLVM_DIR/llvm/include/ -I $LLVM_BUILD_DIR/include/ \
+ -I $LLVM_DIR/mlir/include/ -I $LLVM_BUILD_DIR/tools/mlir/include/ \
+ -L$LLVM_BUILD_DIR/lib -lMLIRIR -lMLIRParser -lMLIRSupport -lLLVMCore \
+ -lLLVMSupport -lncurses -ltinfo -lstdc++ -lLLVMDemangle \
+ -o a.out
+// $ ./a.out
+
+#include "mlir/Dialect/MemRef/IR/MemRef.h"
+#include "mlir/IR/BuiltinTypes.h"
+#include "mlir/IR/MLIRContext.h"
+
+int main() {
+ mlir::MLIRContext context;
+ mlir::OpBuilder builder(&context);
+ mlir::Type eleType = builder.getF64Type();
+ // Target memref type:
+ // `memref<?xf64, strided<[1], offset: ?>>`
+ mlir::MemRefType memrefType = mlir::MemRefType::get(
+ {mlir::ShapedType::kDynamic}, eleType,
+ mlir::StridedLayoutAttr::get(
+ &context, /*offset=*/mlir::ShapedType::kDynamic, /*strides=*/{1}));
+ memrefType.dump();
+ return 0;
+}
diff --git a/examples/BuddyLeNet/CMakeLists.txt b/examples/BuddyLeNet/CMakeLists.txt
index 9698f617b..928f1f88c 100644
--- a/examples/BuddyLeNet/CMakeLists.txt
+++ b/examples/BuddyLeNet/CMakeLists.txt
@@ -6,25 +6,26 @@ add_custom_command(
add_custom_command(
OUTPUT forward.o
- COMMAND ${LLVM_MLIR_BINARY_DIR}/mlir-opt ${BUDDY_EXAMPLES_DIR}/BuddyLeNet/forward.mlir
+ COMMAND ${LLVM_TOOLS_BINARY_DIR}/mlir-opt ${BUDDY_EXAMPLES_DIR}/BuddyLeNet/forward.mlir
-pass-pipeline "builtin.module(func.func(tosa-to-linalg-named, tosa-to-linalg, tosa-to-tensor, tosa-to-arith), empty-tensor-to-alloc-tensor, convert-elementwise-to-linalg, arith-bufferize, func.func(linalg-bufferize, tensor-bufferize), func-bufferize)" |
- ${LLVM_MLIR_BINARY_DIR}/mlir-opt
+ ${LLVM_TOOLS_BINARY_DIR}/mlir-opt
-pass-pipeline "builtin.module(func.func(buffer-deallocation-simplification, convert-linalg-to-loops), eliminate-empty-tensors, func.func(llvm-request-c-wrappers),convert-math-to-llvm, convert-math-to-libm, convert-scf-to-cf, convert-arith-to-llvm, expand-strided-metadata, finalize-memref-to-llvm, convert-func-to-llvm, reconcile-unrealized-casts)" |
- ${LLVM_MLIR_BINARY_DIR}/mlir-translate -mlir-to-llvmir |
- ${LLVM_MLIR_BINARY_DIR}/llvm-as |
- ${LLVM_MLIR_BINARY_DIR}/llc -filetype=obj -relocation-model=pic -O0 -o ${BUDDY_BINARY_DIR}/../examples/BuddyLeNet/forward.o
+ ${LLVM_TOOLS_BINARY_DIR}/mlir-translate -mlir-to-llvmir |
+ ${LLVM_TOOLS_BINARY_DIR}/llvm-as |
+ ${LLVM_TOOLS_BINARY_DIR}/llc -filetype=obj -relocation-model=pic -O0 -o ${BUDDY_BINARY_DIR}/../examples/BuddyLeNet/forward.o
DEPENDS ${BUDDY_EXAMPLES_DIR}/BuddyLeNet/forward.mlir
COMMENT "Building forward.o"
VERBATIM)
add_custom_command(
OUTPUT subgraph0.o
- COMMAND ${LLVM_MLIR_BINARY_DIR}/mlir-opt ${BUDDY_EXAMPLES_DIR}/BuddyLeNet/subgraph0.mlir
+ COMMAND ${LLVM_TOOLS_BINARY_DIR}/mlir-opt ${BUDDY_EXAMPLES_DIR}/BuddyLeNet/subgraph0.mlir
-pass-pipeline "builtin.module(func.func(tosa-to-linalg-named, tosa-to-linalg, tosa-to-tensor, tosa-to-arith))" |
${BUDDY_BINARY_DIR}/buddy-opt
-eliminate-empty-tensors
- -convert-tensor-to-linalg
+ -convert-tensor-to-linalg
-linalg-bufferize
+ -batchmatmul-optimize
-convert-linalg-to-affine-loops
-lower-affine
-func-bufferize-dynamic-offset
@@ -42,9 +43,9 @@ add_custom_command(
-convert-arith-to-llvm
-convert-func-to-llvm
-reconcile-unrealized-casts |
- ${LLVM_MLIR_BINARY_DIR}/mlir-translate -mlir-to-llvmir |
- ${LLVM_MLIR_BINARY_DIR}/llvm-as |
- ${LLVM_MLIR_BINARY_DIR}/llc -filetype=obj -relocation-model=pic -O0 -o ${BUDDY_BINARY_DIR}/../examples/BuddyLeNet/subgraph0.o
+ ${LLVM_TOOLS_BINARY_DIR}/mlir-translate -mlir-to-llvmir |
+ ${LLVM_TOOLS_BINARY_DIR}/llvm-as |
+ ${LLVM_TOOLS_BINARY_DIR}/llc -filetype=obj -relocation-model=pic -O0 -o ${BUDDY_BINARY_DIR}/../examples/BuddyLeNet/subgraph0.o
DEPENDS ${BUDDY_EXAMPLES_DIR}/BuddyLeNet/subgraph0.mlir
COMMENT "Building subgraph0.o"
VERBATIM)
@@ -54,7 +55,7 @@ add_library(LENET STATIC subgraph0.o forward.o)
SET_TARGET_PROPERTIES(LENET PROPERTIES LINKER_LANGUAGE C)
add_executable(buddy-lenet-run buddy-lenet-main.cpp)
-target_link_directories(buddy-lenet-run PRIVATE ${LLVM_MLIR_LIBRARY_DIR})
+target_link_directories(buddy-lenet-run PRIVATE ${LLVM_LIBRARY_DIR})
set(BUDDY_LENET_LIBS LENET mlir_c_runner_utils ${OpenCV_LIBS})
target_link_libraries(buddy-lenet-run ${BUDDY_LENET_LIBS})
diff --git a/examples/BuddyLeNet/README.md b/examples/BuddyLeNet/README.md
index 5988edbe7..23ac086cf 100644
--- a/examples/BuddyLeNet/README.md
+++ b/examples/BuddyLeNet/README.md
@@ -24,9 +24,7 @@ $ cmake -G Ninja .. \
-DLLVM_ENABLE_ASSERTIONS=ON \
-DCMAKE_BUILD_TYPE=RELEASE \
-DBUDDY_MLIR_ENABLE_PYTHON_PACKAGES=ON \
- -DPython3_EXECUTABLE=$(which python3) \
- -DBUDDY_ENABLE_OPENCV=ON \
- -DOpenCV_DIR=
+ -DPython3_EXECUTABLE=$(which python3)
$ ninja
$ ninja check-buddy
```
diff --git a/examples/BuddyLeNet/buddy-lenet-main.cpp b/examples/BuddyLeNet/buddy-lenet-main.cpp
index 4e2dc2efe..ca12820ba 100644
--- a/examples/BuddyLeNet/buddy-lenet-main.cpp
+++ b/examples/BuddyLeNet/buddy-lenet-main.cpp
@@ -15,41 +15,24 @@
//===----------------------------------------------------------------------===//
#include
-#include
+#include
#include
+#include
#include
#include
#include
#include
-#include
#include
#include
#include
constexpr size_t ParamsSize = 44426;
-const std::string ImgName = "3.png";
+const std::string ImgName = "8.bmp";
/// Declare LeNet forward function.
extern "C" void _mlir_ciface_forward(MemRef<float, 2> *output,
MemRef<float, 1> *arg0,
- Img *input);
-
-/// Function for preprocessing the image to match model input requirements.
-const cv::Mat imagePreprocessing() {
- // Get the directory of the LeNet example and construct the image path.
- std::string lenetDir = getenv("LENET_EXAMPLE_PATH");
- std::string imgPath = lenetDir + "/images/" + ImgName;
- // Read the image in grayscale mode.
- cv::Mat inputImage = cv::imread(imgPath, cv::IMREAD_GRAYSCALE);
- assert(!inputImage.empty() && "Could not read the image.");
- cv::Mat resizedImage;
- int imageWidth = 28;
- int imageHeight = 28;
- // Resize the image to 28x28 pixels.
- cv::resize(inputImage, resizedImage, cv::Size(imageWidth, imageHeight),
- cv::INTER_LINEAR);
- return resizedImage;
-}
+ dip::Image<float, 4> *input);
/// Print [Log] label in bold blue format.
void printLogLabel() { std::cout << "\033[34;1m[Log] \033[0m"; }
@@ -112,19 +95,16 @@ int main() {
const std::string title = "LeNet Inference Powered by Buddy Compiler";
std::cout << "\033[33;1m" << title << "\033[0m" << std::endl;
- // Preprocess the image to match the input requirements of the model.
- cv::Mat image = imagePreprocessing();
-
- // Define the sizes of the input and output tensors.
- intptr_t sizesInput[4] = {1, 1, 28, 28};
+ // Define the sizes of the output tensors.
intptr_t sizesOutput[2] = {1, 10};
// Create input and output containers for the image and model output.
- Img input(image, sizesInput, true);
+ std::string lenetDir = getenv("LENET_EXAMPLE_PATH");
+ std::string imgPath = lenetDir + "/images/" + ImgName;
+ dip::Image<float, 4> input(imgPath, dip::DIP_GRAYSCALE, true /* norm */);
MemRef<float, 2> output(sizesOutput);
// Load model parameters from the specified file.
- std::string lenetDir = getenv("LENET_EXAMPLE_PATH");
std::string paramsDir = lenetDir + "/arg0.data";
MemRef<float, 1> paramsContainer({ParamsSize});
loadParameters(paramsDir, paramsContainer);
diff --git a/examples/BuddyLeNet/fake-lenet.mlir b/examples/BuddyLeNet/fake-lenet.mlir
index 48d91a7fd..d7d80a533 100644
--- a/examples/BuddyLeNet/fake-lenet.mlir
+++ b/examples/BuddyLeNet/fake-lenet.mlir
@@ -1,5 +1,6 @@
module {
func.func private @printMemrefF32(%ptr : tensor<*xf32>)
+ func.func private @rtclock() -> f64
func.func @forward(%arg0: tensor<44426xf32>, %arg1: tensor<1x1x28x28xf32>) -> tensor<1x10xf32> {
%extracted_slice = tensor.extract_slice %arg0[0] [150] [1] : tensor<44426xf32> to tensor<150xf32>
@@ -81,10 +82,16 @@ module {
%fake_params = arith.constant dense<1.0> : tensor<44426xf32>
%fake_input = arith.constant dense<2.0> : tensor<1x1x28x28xf32>
+ %t_start = call @rtclock() : () -> f64
%fake_output = call @forward(%fake_params, %fake_input) : (tensor<44426xf32>, tensor<1x1x28x28xf32>) -> tensor<1x10xf32>
+ %t_end = call @rtclock() : () -> f64
%tensor_unranked = tensor.cast %fake_output : tensor<1x10xf32> to tensor<*xf32>
call @printMemrefF32(%tensor_unranked) : (tensor<*xf32>) -> ()
+
+ %time = arith.subf %t_end, %t_start : f64
+ vector.print %time : f64
+
return
}
}
diff --git a/examples/BuddyLeNet/images/8.bmp b/examples/BuddyLeNet/images/8.bmp
new file mode 100644
index 000000000..7a9e02a29
Binary files /dev/null and b/examples/BuddyLeNet/images/8.bmp differ
diff --git a/examples/BuddyLeNet/makefile b/examples/BuddyLeNet/makefile
index 6f0664272..fe87b6da1 100644
--- a/examples/BuddyLeNet/makefile
+++ b/examples/BuddyLeNet/makefile
@@ -1,30 +1,33 @@
#!/bin/bash
-BUDDY_OPT := ../../build/bin/buddy-opt
-MLIR_OPT := ../../llvm/build/bin/mlir-opt
-MLIR_TRANSLATE := ../../llvm/build/bin/mlir-translate
-MLIR_CPU_RUNNER := ../../llvm/build/bin/mlir-cpu-runner
-LLC := ../../llvm/build/bin/llc
-OPT_FLAG := -O0
+BUDDY_BUILD_DIR := ../../build/
+LLVM_BUILD_DIR := ../../llvm/build/
+BUDDY_OPT := ${BUDDY_BUILD_DIR}/bin/buddy-opt
+MLIR_OPT := ${LLVM_BUILD_DIR}/bin/mlir-opt
+MLIR_TRANSLATE := ${LLVM_BUILD_DIR}/bin/mlir-translate
+MLIR_CPU_RUNNER := ${LLVM_BUILD_DIR}/bin/mlir-cpu-runner
+LLC := ${LLVM_BUILD_DIR}/bin/llc
+OPT_FLAG := -O3
ifeq ($(shell uname),Linux)
-MLIR_RUNNER_UTILS := ../../llvm/build/lib/libmlir_runner_utils.so
-MLIR_C_RUNNER_UTILS := ../../llvm/build/lib/libmlir_c_runner_utils.so
-MLIR_ASYNC_RUNTIME := ../../llvm/build/lib/libmlir_async_runtime.so
+MLIR_RUNNER_UTILS := ${LLVM_BUILD_DIR}/lib/libmlir_runner_utils.so
+MLIR_C_RUNNER_UTILS := ${LLVM_BUILD_DIR}/lib/libmlir_c_runner_utils.so
+MLIR_ASYNC_RUNTIME := ${LLVM_BUILD_DIR}/lib/libmlir_async_runtime.so
MTRIPLE := x86_64-unknown-linux-gnu
else ifeq ($(shell uname),Darwin)
-MLIR_RUNNER_UTILS := ../../llvm/build/lib/libmlir_runner_utils.dylib
-MLIR_C_RUNNER_UTILS := ../../llvm/build/lib/libmlir_c_runner_utils.dylib
-MLIR_ASYNC_RUNTIME := ./../llvm/build/lib/libmlir_async_runtime.dylib
+MLIR_RUNNER_UTILS := ${LLVM_BUILD_DIR}/lib/libmlir_runner_utils.dylib
+MLIR_C_RUNNER_UTILS := ${LLVM_BUILD_DIR}/lib/libmlir_c_runner_utils.dylib
+MLIR_ASYNC_RUNTIME := ${LLVM_BUILD_DIR}/lib/libmlir_async_runtime.dylib
MTRIPLE := x86_64-apple-darwin
endif
buddy-lenet-lower:
- @${MLIR_OPT} ./fake-lenet.mlir \
+ @${BUDDY_OPT} ./fake-lenet.mlir \
-pass-pipeline "builtin.module(func.func(tosa-to-linalg-named, tosa-to-linalg, tosa-to-tensor, tosa-to-arith))" | \
- ${MLIR_OPT} \
+ ${BUDDY_OPT} \
-eliminate-empty-tensors \
-convert-tensor-to-linalg \
-linalg-bufferize \
+ -batchmatmul-optimize \
-convert-linalg-to-affine-loops \
-lower-affine \
-func-bufferize \
@@ -38,16 +41,15 @@ buddy-lenet-lower:
-convert-arith-to-llvm \
-finalize-memref-to-llvm \
-convert-scf-to-cf \
- -llvm-request-c-wrappers \
-convert-arith-to-llvm \
-convert-func-to-llvm \
-reconcile-unrealized-casts \
-o ./log.mlir
buddy-lenet-translate:
- @${MLIR_OPT} ./fake-lenet.mlir \
+ @${BUDDY_OPT} ./fake-lenet.mlir \
-pass-pipeline "builtin.module(func.func(tosa-to-linalg-named, tosa-to-linalg, tosa-to-tensor, tosa-to-arith))" | \
- ${MLIR_OPT} \
+ ${BUDDY_OPT} \
-eliminate-empty-tensors \
-convert-tensor-to-linalg \
-linalg-bufferize \
@@ -64,7 +66,6 @@ buddy-lenet-translate:
-convert-arith-to-llvm \
-finalize-memref-to-llvm \
-convert-scf-to-cf \
- -llvm-request-c-wrappers \
-convert-arith-to-llvm \
-convert-func-to-llvm \
-reconcile-unrealized-casts | \
@@ -72,9 +73,9 @@ buddy-lenet-translate:
buddy-lenet-run:
- @${MLIR_OPT} ./fake-lenet.mlir \
+ @${BUDDY_OPT} ./fake-lenet.mlir \
-pass-pipeline "builtin.module(func.func(tosa-to-linalg-named, tosa-to-linalg, tosa-to-tensor, tosa-to-arith))" | \
- ${MLIR_OPT} \
+ ${BUDDY_OPT} \
-eliminate-empty-tensors \
-convert-tensor-to-linalg \
-linalg-bufferize \
@@ -91,7 +92,33 @@ buddy-lenet-run:
-convert-arith-to-llvm \
-finalize-memref-to-llvm \
-convert-scf-to-cf \
- -llvm-request-c-wrappers \
+ -convert-arith-to-llvm \
+ -convert-func-to-llvm \
+ -reconcile-unrealized-casts | \
+ ${MLIR_CPU_RUNNER} ${OPT_FLAG} -e main -entry-point-result=void \
+ -shared-libs=${MLIR_RUNNER_UTILS} -shared-libs=${MLIR_C_RUNNER_UTILS}
+
+buddy-lenet-opt-run:
+ @${BUDDY_OPT} ./fake-lenet.mlir \
+ -pass-pipeline "builtin.module(func.func(tosa-to-linalg-named, tosa-to-linalg, tosa-to-tensor, tosa-to-arith))" | \
+ ${BUDDY_OPT} \
+ -eliminate-empty-tensors \
+ -convert-tensor-to-linalg \
+ -linalg-bufferize \
+ -batchmatmul-optimize \
+ -convert-linalg-to-affine-loops \
+ -lower-affine \
+ -func-bufferize \
+ -arith-bufferize \
+ -tensor-bufferize \
+ -buffer-deallocation \
+ -finalizing-bufferize \
+ -convert-vector-to-scf \
+ -expand-strided-metadata \
+ -convert-vector-to-llvm \
+ -convert-arith-to-llvm \
+ -finalize-memref-to-llvm \
+ -convert-scf-to-cf \
-convert-arith-to-llvm \
-convert-func-to-llvm \
-reconcile-unrealized-casts | \
diff --git a/examples/BuddyLlama/CMakeLists.txt b/examples/BuddyLlama/CMakeLists.txt
index 97aa736cb..a6bfc2f74 100644
--- a/examples/BuddyLlama/CMakeLists.txt
+++ b/examples/BuddyLlama/CMakeLists.txt
@@ -6,14 +6,14 @@ add_custom_command(
add_custom_command(
OUTPUT forward.o
- COMMAND ${LLVM_MLIR_BINARY_DIR}/mlir-opt ${BUDDY_EXAMPLES_DIR}/BuddyLlama/forward.mlir
+ COMMAND ${LLVM_TOOLS_BINARY_DIR}/mlir-opt ${BUDDY_EXAMPLES_DIR}/BuddyLlama/forward.mlir
-pass-pipeline "builtin.module(func.func(tosa-to-linalg-named),func.func(tosa-to-linalg),func.func(tosa-to-tensor),func.func(tosa-to-arith))" |
${BUDDY_BINARY_DIR}/buddy-opt
-arith-expand
-eliminate-empty-tensors
-empty-tensor-to-alloc-tensor
-one-shot-bufferize
- -matmul-paralell-vectorization-optimize
+ -matmul-parallel-vectorization-optimize
-batchmatmul-optimize
-convert-linalg-to-affine-loops
-affine-loop-fusion
@@ -40,9 +40,9 @@ add_custom_command(
-convert-math-to-libm
-convert-func-to-llvm
-reconcile-unrealized-casts |
- ${LLVM_MLIR_BINARY_DIR}/mlir-translate -mlir-to-llvmir |
- ${LLVM_MLIR_BINARY_DIR}/llvm-as |
- ${LLVM_MLIR_BINARY_DIR}/llc -filetype=obj -relocation-model=pic -O3
+ ${LLVM_TOOLS_BINARY_DIR}/mlir-translate -mlir-to-llvmir |
+ ${LLVM_TOOLS_BINARY_DIR}/llvm-as |
+ ${LLVM_TOOLS_BINARY_DIR}/llc -filetype=obj -relocation-model=pic -O3
-o ${BUDDY_BINARY_DIR}/../examples/BuddyLlama/forward.o
DEPENDS buddy-opt ${BUDDY_EXAMPLES_DIR}/BuddyLlama/forward.mlir
COMMENT "Building forward.o "
@@ -50,14 +50,14 @@ add_custom_command(
add_custom_command(
OUTPUT subgraph.o
- COMMAND ${LLVM_MLIR_BINARY_DIR}/mlir-opt ${BUDDY_EXAMPLES_DIR}/BuddyLlama/subgraph0.mlir
+ COMMAND ${LLVM_TOOLS_BINARY_DIR}/mlir-opt ${BUDDY_EXAMPLES_DIR}/BuddyLlama/subgraph0.mlir
-pass-pipeline "builtin.module(func.func(tosa-to-linalg-named),func.func(tosa-to-linalg),func.func(tosa-to-tensor),func.func(tosa-to-arith))" |
${BUDDY_BINARY_DIR}/buddy-opt
-arith-expand
-eliminate-empty-tensors
-empty-tensor-to-alloc-tensor
-one-shot-bufferize
- -matmul-paralell-vectorization-optimize
+ -matmul-parallel-vectorization-optimize
-batchmatmul-optimize
-convert-linalg-to-affine-loops
-affine-loop-fusion
@@ -85,9 +85,9 @@ add_custom_command(
-convert-math-to-libm
-convert-func-to-llvm
-reconcile-unrealized-casts |
- ${LLVM_MLIR_BINARY_DIR}/mlir-translate -mlir-to-llvmir |
- ${LLVM_MLIR_BINARY_DIR}/llvm-as |
- ${LLVM_MLIR_BINARY_DIR}/llc -filetype=obj -relocation-model=pic -O3
+ ${LLVM_TOOLS_BINARY_DIR}/mlir-translate -mlir-to-llvmir |
+ ${LLVM_TOOLS_BINARY_DIR}/llvm-as |
+ ${LLVM_TOOLS_BINARY_DIR}/llc -filetype=obj -relocation-model=pic -O3
-o ${BUDDY_BINARY_DIR}/../examples/BuddyLlama/subgraph.o
DEPENDS buddy-opt ${BUDDY_EXAMPLES_DIR}/BuddyLlama/subgraph0.mlir
COMMENT "Building subgraph.o "
@@ -107,7 +107,7 @@ SET_TARGET_PROPERTIES(
LINKER_LANGUAGE C)
add_executable(buddy-llama-run llama-main.cpp)
-target_link_directories(buddy-llama-run PRIVATE ${LLVM_MLIR_LIBRARY_DIR})
+target_link_directories(buddy-llama-run PRIVATE ${LLVM_LIBRARY_DIR})
set(BUDDY_LLAMA_LIBS
LLAMA
diff --git a/examples/BuddyLlama/import-llama2.py b/examples/BuddyLlama/import-llama2.py
index fbd12e5bf..2903d6bd8 100644
--- a/examples/BuddyLlama/import-llama2.py
+++ b/examples/BuddyLlama/import-llama2.py
@@ -1,11 +1,3 @@
-import os
-import torch
-import torch._dynamo as dynamo
-from transformers import LlamaForCausalLM, LlamaTokenizer
-from torch._inductor.decomposition import decompositions as inductor_decomp
-import numpy
-
-from buddy.compiler.frontend import DynamoCompiler
# ===- import-llama2.py --------------------------------------------------------
#
# Licensed under the Apache License, Version 2.0 (the "License");
@@ -25,6 +17,15 @@
# This is the test of llama2 model.
#
# ===---------------------------------------------------------------------------
+
+import os
+import torch
+import torch._dynamo as dynamo
+from transformers import LlamaForCausalLM, LlamaTokenizer
+from torch._inductor.decomposition import decompositions as inductor_decomp
+import numpy
+
+from buddy.compiler.frontend import DynamoCompiler
from buddy.compiler.ops import tosa
from buddy.compiler.graph import GraphDriver
from buddy.compiler.graph.transform import simply_fuse
diff --git a/examples/BuddyLlama/llama_annotation.mlir b/examples/BuddyLlama/llama_annotation.mlir
new file mode 100644
index 000000000..acb735d12
--- /dev/null
+++ b/examples/BuddyLlama/llama_annotation.mlir
@@ -0,0 +1,6012 @@
+#map = affine_map<(d0, d1, d2) -> (d1)>
+#map1 = affine_map<(d0, d1, d2) -> (d0, d2)>
+#map2 = affine_map<(d0, d1, d2) -> (d0, d1)>
+#map3 = affine_map<(d0, d1) -> (d0, d1)>
+#map4 = affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>
+#map5 = affine_map<(d0, d1, d2) -> (d0, d1, d2)>
+#map6 = affine_map<(d0, d1, d2) -> (d0, 0, d1, d2)>
+#map7 = affine_map<(d0, d1) -> (0, d0, d1)>
+module {
+ func.func @subgraph0(%arg0: tensor<32000x4096xf32>, %arg1: tensor<1x40xi64>, %arg2: tensor<4096xf32>, %arg3: tensor<4096x4096xf32>, %arg4: tensor<4096x4096xf32>, %arg5: tensor<4096x4096xf32>, %arg6: tensor<1x1x2048x128xf32>, %arg7: tensor<1x1x2048x128xf32>, %arg8: tensor<4096x4096xf32>, %arg9: tensor<4096xf32>, %arg10: tensor<11008x4096xf32>, %arg11: tensor<11008x4096xf32>, %arg12: tensor<4096x11008xf32>, %arg13: tensor<4096xf32>, %arg14: tensor<4096x4096xf32>, %arg15: tensor<4096x4096xf32>, %arg16: tensor<4096x4096xf32>, %arg17: tensor<1x1x2048x128xf32>, %arg18: tensor<1x1x2048x128xf32>, %arg19: tensor<4096x4096xf32>, %arg20: tensor<4096xf32>, %arg21: tensor<11008x4096xf32>, %arg22: tensor<11008x4096xf32>, %arg23: tensor<4096x11008xf32>, %arg24: tensor<4096xf32>, %arg25: tensor<4096x4096xf32>, %arg26: tensor<4096x4096xf32>, %arg27: tensor<4096x4096xf32>, %arg28: tensor<1x1x2048x128xf32>, %arg29: tensor<1x1x2048x128xf32>, %arg30: tensor<4096x4096xf32>, %arg31: tensor<4096xf32>, %arg32: tensor<11008x4096xf32>, %arg33: tensor<11008x4096xf32>, %arg34: tensor<4096x11008xf32>, %arg35: tensor<4096xf32>, %arg36: tensor<4096x4096xf32>, %arg37: tensor<4096x4096xf32>, %arg38: tensor<4096x4096xf32>, %arg39: tensor<1x1x2048x128xf32>, %arg40: tensor<1x1x2048x128xf32>, %arg41: tensor<4096x4096xf32>, %arg42: tensor<4096xf32>, %arg43: tensor<11008x4096xf32>, %arg44: tensor<11008x4096xf32>, %arg45: tensor<4096x11008xf32>, %arg46: tensor<4096xf32>, %arg47: tensor<4096x4096xf32>, %arg48: tensor<4096x4096xf32>, %arg49: tensor<4096x4096xf32>, %arg50: tensor<1x1x2048x128xf32>, %arg51: tensor<1x1x2048x128xf32>, %arg52: tensor<4096x4096xf32>, %arg53: tensor<4096xf32>, %arg54: tensor<11008x4096xf32>, %arg55: tensor<11008x4096xf32>, %arg56: tensor<4096x11008xf32>, %arg57: tensor<4096xf32>, %arg58: tensor<4096x4096xf32>, %arg59: tensor<4096x4096xf32>, %arg60: tensor<4096x4096xf32>, %arg61: tensor<1x1x2048x128xf32>, %arg62: tensor<1x1x2048x128xf32>, %arg63: tensor<4096x4096xf32>, %arg64: 
tensor<4096xf32>, %arg65: tensor<11008x4096xf32>, %arg66: tensor<11008x4096xf32>, %arg67: tensor<4096x11008xf32>, %arg68: tensor<4096xf32>, %arg69: tensor<4096x4096xf32>, %arg70: tensor<4096x4096xf32>, %arg71: tensor<4096x4096xf32>, %arg72: tensor<1x1x2048x128xf32>, %arg73: tensor<1x1x2048x128xf32>, %arg74: tensor<4096x4096xf32>, %arg75: tensor<4096xf32>, %arg76: tensor<11008x4096xf32>, %arg77: tensor<11008x4096xf32>, %arg78: tensor<4096x11008xf32>, %arg79: tensor<4096xf32>, %arg80: tensor<4096x4096xf32>, %arg81: tensor<4096x4096xf32>, %arg82: tensor<4096x4096xf32>, %arg83: tensor<1x1x2048x128xf32>, %arg84: tensor<1x1x2048x128xf32>, %arg85: tensor<4096x4096xf32>, %arg86: tensor<4096xf32>, %arg87: tensor<11008x4096xf32>, %arg88: tensor<11008x4096xf32>, %arg89: tensor<4096x11008xf32>, %arg90: tensor<4096xf32>, %arg91: tensor<4096x4096xf32>, %arg92: tensor<4096x4096xf32>, %arg93: tensor<4096x4096xf32>, %arg94: tensor<1x1x2048x128xf32>, %arg95: tensor<1x1x2048x128xf32>, %arg96: tensor<4096x4096xf32>, %arg97: tensor<4096xf32>, %arg98: tensor<11008x4096xf32>, %arg99: tensor<11008x4096xf32>, %arg100: tensor<4096x11008xf32>, %arg101: tensor<4096xf32>, %arg102: tensor<4096x4096xf32>, %arg103: tensor<4096x4096xf32>, %arg104: tensor<4096x4096xf32>, %arg105: tensor<1x1x2048x128xf32>, %arg106: tensor<1x1x2048x128xf32>, %arg107: tensor<4096x4096xf32>, %arg108: tensor<4096xf32>, %arg109: tensor<11008x4096xf32>, %arg110: tensor<11008x4096xf32>, %arg111: tensor<4096x11008xf32>, %arg112: tensor<4096xf32>, %arg113: tensor<4096x4096xf32>, %arg114: tensor<4096x4096xf32>, %arg115: tensor<4096x4096xf32>, %arg116: tensor<1x1x2048x128xf32>, %arg117: tensor<1x1x2048x128xf32>, %arg118: tensor<4096x4096xf32>, %arg119: tensor<4096xf32>, %arg120: tensor<11008x4096xf32>, %arg121: tensor<11008x4096xf32>, %arg122: tensor<4096x11008xf32>, %arg123: tensor<4096xf32>, %arg124: tensor<4096x4096xf32>, %arg125: tensor<4096x4096xf32>, %arg126: tensor<4096x4096xf32>, %arg127: tensor<1x1x2048x128xf32>, 
%arg128: tensor<1x1x2048x128xf32>, %arg129: tensor<4096x4096xf32>, %arg130: tensor<4096xf32>, %arg131: tensor<11008x4096xf32>, %arg132: tensor<11008x4096xf32>, %arg133: tensor<4096x11008xf32>, %arg134: tensor<4096xf32>, %arg135: tensor<4096x4096xf32>, %arg136: tensor<4096x4096xf32>, %arg137: tensor<4096x4096xf32>, %arg138: tensor<1x1x2048x128xf32>, %arg139: tensor<1x1x2048x128xf32>, %arg140: tensor<4096x4096xf32>, %arg141: tensor<4096xf32>, %arg142: tensor<11008x4096xf32>, %arg143: tensor<11008x4096xf32>, %arg144: tensor<4096x11008xf32>, %arg145: tensor<4096xf32>, %arg146: tensor<4096x4096xf32>, %arg147: tensor<4096x4096xf32>, %arg148: tensor<4096x4096xf32>, %arg149: tensor<1x1x2048x128xf32>, %arg150: tensor<1x1x2048x128xf32>, %arg151: tensor<4096x4096xf32>, %arg152: tensor<4096xf32>, %arg153: tensor<11008x4096xf32>, %arg154: tensor<11008x4096xf32>, %arg155: tensor<4096x11008xf32>, %arg156: tensor<4096xf32>, %arg157: tensor<4096x4096xf32>, %arg158: tensor<4096x4096xf32>, %arg159: tensor<4096x4096xf32>, %arg160: tensor<1x1x2048x128xf32>, %arg161: tensor<1x1x2048x128xf32>, %arg162: tensor<4096x4096xf32>, %arg163: tensor<4096xf32>, %arg164: tensor<11008x4096xf32>, %arg165: tensor<11008x4096xf32>, %arg166: tensor<4096x11008xf32>, %arg167: tensor<4096xf32>, %arg168: tensor<4096x4096xf32>, %arg169: tensor<4096x4096xf32>, %arg170: tensor<4096x4096xf32>, %arg171: tensor<1x1x2048x128xf32>, %arg172: tensor<1x1x2048x128xf32>, %arg173: tensor<4096x4096xf32>, %arg174: tensor<4096xf32>, %arg175: tensor<11008x4096xf32>, %arg176: tensor<11008x4096xf32>, %arg177: tensor<4096x11008xf32>, %arg178: tensor<4096xf32>, %arg179: tensor<4096x4096xf32>, %arg180: tensor<4096x4096xf32>, %arg181: tensor<4096x4096xf32>, %arg182: tensor<1x1x2048x128xf32>, %arg183: tensor<1x1x2048x128xf32>, %arg184: tensor<4096x4096xf32>, %arg185: tensor<4096xf32>, %arg186: tensor<11008x4096xf32>, %arg187: tensor<11008x4096xf32>, %arg188: tensor<4096x11008xf32>, %arg189: tensor<4096xf32>, %arg190: 
tensor<4096x4096xf32>, %arg191: tensor<4096x4096xf32>, %arg192: tensor<4096x4096xf32>, %arg193: tensor<1x1x2048x128xf32>, %arg194: tensor<1x1x2048x128xf32>, %arg195: tensor<4096x4096xf32>, %arg196: tensor<4096xf32>, %arg197: tensor<11008x4096xf32>, %arg198: tensor<11008x4096xf32>, %arg199: tensor<4096x11008xf32>, %arg200: tensor<4096xf32>, %arg201: tensor<4096x4096xf32>, %arg202: tensor<4096x4096xf32>, %arg203: tensor<4096x4096xf32>, %arg204: tensor<1x1x2048x128xf32>, %arg205: tensor<1x1x2048x128xf32>, %arg206: tensor<4096x4096xf32>, %arg207: tensor<4096xf32>, %arg208: tensor<11008x4096xf32>, %arg209: tensor<11008x4096xf32>, %arg210: tensor<4096x11008xf32>, %arg211: tensor<4096xf32>, %arg212: tensor<4096x4096xf32>, %arg213: tensor<4096x4096xf32>, %arg214: tensor<4096x4096xf32>, %arg215: tensor<1x1x2048x128xf32>, %arg216: tensor<1x1x2048x128xf32>, %arg217: tensor<4096x4096xf32>, %arg218: tensor<4096xf32>, %arg219: tensor<11008x4096xf32>, %arg220: tensor<11008x4096xf32>, %arg221: tensor<4096x11008xf32>, %arg222: tensor<4096xf32>, %arg223: tensor<4096x4096xf32>, %arg224: tensor<4096x4096xf32>, %arg225: tensor<4096x4096xf32>, %arg226: tensor<1x1x2048x128xf32>, %arg227: tensor<1x1x2048x128xf32>, %arg228: tensor<4096x4096xf32>, %arg229: tensor<4096xf32>, %arg230: tensor<11008x4096xf32>, %arg231: tensor<11008x4096xf32>, %arg232: tensor<4096x11008xf32>, %arg233: tensor<4096xf32>, %arg234: tensor<4096x4096xf32>, %arg235: tensor<4096x4096xf32>, %arg236: tensor<4096x4096xf32>, %arg237: tensor<1x1x2048x128xf32>, %arg238: tensor<1x1x2048x128xf32>, %arg239: tensor<4096x4096xf32>, %arg240: tensor<4096xf32>, %arg241: tensor<11008x4096xf32>, %arg242: tensor<11008x4096xf32>, %arg243: tensor<4096x11008xf32>, %arg244: tensor<4096xf32>, %arg245: tensor<4096x4096xf32>, %arg246: tensor<4096x4096xf32>, %arg247: tensor<4096x4096xf32>, %arg248: tensor<1x1x2048x128xf32>, %arg249: tensor<1x1x2048x128xf32>, %arg250: tensor<4096x4096xf32>, %arg251: tensor<4096xf32>, %arg252: 
tensor<11008x4096xf32>, %arg253: tensor<11008x4096xf32>, %arg254: tensor<4096x11008xf32>, %arg255: tensor<4096xf32>, %arg256: tensor<4096x4096xf32>, %arg257: tensor<4096x4096xf32>, %arg258: tensor<4096x4096xf32>, %arg259: tensor<1x1x2048x128xf32>, %arg260: tensor<1x1x2048x128xf32>, %arg261: tensor<4096x4096xf32>, %arg262: tensor<4096xf32>, %arg263: tensor<11008x4096xf32>, %arg264: tensor<11008x4096xf32>, %arg265: tensor<4096x11008xf32>, %arg266: tensor<4096xf32>, %arg267: tensor<4096x4096xf32>, %arg268: tensor<4096x4096xf32>, %arg269: tensor<4096x4096xf32>, %arg270: tensor<1x1x2048x128xf32>, %arg271: tensor<1x1x2048x128xf32>, %arg272: tensor<4096x4096xf32>, %arg273: tensor<4096xf32>, %arg274: tensor<11008x4096xf32>, %arg275: tensor<11008x4096xf32>, %arg276: tensor<4096x11008xf32>, %arg277: tensor<4096xf32>, %arg278: tensor<4096x4096xf32>, %arg279: tensor<4096x4096xf32>, %arg280: tensor<4096x4096xf32>, %arg281: tensor<1x1x2048x128xf32>, %arg282: tensor<1x1x2048x128xf32>, %arg283: tensor<4096x4096xf32>, %arg284: tensor<4096xf32>, %arg285: tensor<11008x4096xf32>, %arg286: tensor<11008x4096xf32>, %arg287: tensor<4096x11008xf32>, %arg288: tensor<4096xf32>, %arg289: tensor<4096x4096xf32>, %arg290: tensor<4096x4096xf32>, %arg291: tensor<4096x4096xf32>, %arg292: tensor<1x1x2048x128xf32>, %arg293: tensor<1x1x2048x128xf32>, %arg294: tensor<4096x4096xf32>, %arg295: tensor<4096xf32>, %arg296: tensor<11008x4096xf32>, %arg297: tensor<11008x4096xf32>, %arg298: tensor<4096x11008xf32>, %arg299: tensor<4096xf32>, %arg300: tensor<4096x4096xf32>, %arg301: tensor<4096x4096xf32>, %arg302: tensor<4096x4096xf32>, %arg303: tensor<1x1x2048x128xf32>, %arg304: tensor<1x1x2048x128xf32>, %arg305: tensor<4096x4096xf32>, %arg306: tensor<4096xf32>, %arg307: tensor<11008x4096xf32>, %arg308: tensor<11008x4096xf32>, %arg309: tensor<4096x11008xf32>, %arg310: tensor<4096xf32>, %arg311: tensor<4096x4096xf32>, %arg312: tensor<4096x4096xf32>, %arg313: tensor<4096x4096xf32>, %arg314: 
tensor<1x1x2048x128xf32>, %arg315: tensor<1x1x2048x128xf32>, %arg316: tensor<4096x4096xf32>, %arg317: tensor<4096xf32>, %arg318: tensor<11008x4096xf32>, %arg319: tensor<11008x4096xf32>, %arg320: tensor<4096x11008xf32>, %arg321: tensor<4096xf32>, %arg322: tensor<4096x4096xf32>, %arg323: tensor<4096x4096xf32>, %arg324: tensor<4096x4096xf32>, %arg325: tensor<1x1x2048x128xf32>, %arg326: tensor<1x1x2048x128xf32>, %arg327: tensor<4096x4096xf32>, %arg328: tensor<4096xf32>, %arg329: tensor<11008x4096xf32>, %arg330: tensor<11008x4096xf32>, %arg331: tensor<4096x11008xf32>, %arg332: tensor<4096xf32>, %arg333: tensor<4096x4096xf32>, %arg334: tensor<4096x4096xf32>, %arg335: tensor<4096x4096xf32>, %arg336: tensor<1x1x2048x128xf32>, %arg337: tensor<1x1x2048x128xf32>, %arg338: tensor<4096x4096xf32>, %arg339: tensor<4096xf32>, %arg340: tensor<11008x4096xf32>, %arg341: tensor<11008x4096xf32>, %arg342: tensor<4096x11008xf32>, %arg343: tensor<4096xf32>, %arg344: tensor<4096x4096xf32>, %arg345: tensor<4096x4096xf32>, %arg346: tensor<4096x4096xf32>, %arg347: tensor<1x1x2048x128xf32>, %arg348: tensor<1x1x2048x128xf32>, %arg349: tensor<4096x4096xf32>, %arg350: tensor<4096xf32>, %arg351: tensor<11008x4096xf32>, %arg352: tensor<11008x4096xf32>, %arg353: tensor<4096x11008xf32>, %arg354: tensor<4096xf32>, %arg355: tensor<32000x4096xf32>) -> (tensor<1x40x4096xf32>, tensor<1x40x32000xf32>) {
+ %0 = "tosa.const"() <{value = dense<[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39]> : tensor<40xi64>}> : () -> tensor<40xi64>
+ %1 = tosa.reshape %0 {new_shape = array} : (tensor<40xi64>) -> tensor<1x40xi64>
+ %2 = tosa.reshape %1 {new_shape = array} : (tensor<1x40xi64>) -> tensor<1x40xi64>
+ %3 = tosa.cast %arg1 : (tensor<1x40xi64>) -> tensor<1x40xi32>
+ %4 = tosa.reshape %arg0 {new_shape = array} : (tensor<32000x4096xf32>) -> tensor<1x32000x4096xf32>
+ %5 = tosa.gather %4, %3 : (tensor<1x32000x4096xf32>, tensor<1x40xi32>) -> tensor<1x40x4096xf32>
+ %6 = tosa.reshape %5 {new_shape = array} : (tensor<1x40x4096xf32>) -> tensor<1x40x4096xf32>
+ %cst = arith.constant dense : tensor<1x40xi1>
+ %cst_0 = arith.constant dense<-3.40282347E+38> : tensor<40x40xf32>
+ %7 = "tosa.const"() <{value = dense<[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39]> : tensor<40xi64>}> : () -> tensor<40xi64>
+ %8 = "tosa.const"() <{value = dense<1> : tensor<40xi64>}> : () -> tensor<40xi64>
+ %9 = tosa.add %7, %8 : (tensor<40xi64>, tensor<40xi64>) -> tensor<40xi64>
+ %10 = tosa.reshape %9 {new_shape = array} : (tensor<40xi64>) -> tensor<40x1xi64>
+ %11 = tensor.empty() : tensor<40x40xi1>
+ %12 = linalg.generic {indexing_maps = [#map, #map1, #map2], iterator_types = ["parallel", "parallel", "reduction"]} ins(%7, %10 : tensor<40xi64>, tensor<40x1xi64>) outs(%11 : tensor<40x40xi1>) {
+ ^bb0(%in: i64, %in_742: i64, %out: i1):
+ %4175 = arith.cmpi slt, %in, %in_742 : i64
+ linalg.yield %4175 : i1
+ } -> tensor<40x40xi1>
+ %cst_1 = arith.constant 0.000000e+00 : f32
+ %13 = tensor.empty() : tensor<40x40xf32>
+ %14 = linalg.generic {indexing_maps = [#map3, #map3, #map3], iterator_types = ["parallel", "parallel"]} ins(%12, %cst_0 : tensor<40x40xi1>, tensor<40x40xf32>) outs(%13 : tensor<40x40xf32>) {
+ ^bb0(%in: i1, %in_742: f32, %out: f32):
+ %4175 = arith.select %in, %cst_1, %in_742 : f32
+ linalg.yield %4175 : f32
+ } -> tensor<40x40xf32>
+ %extracted_slice = tensor.extract_slice %cst[0, 0] [1, 40] [1, 1] : tensor<1x40xi1> to tensor<1x40xi1>
+ %15 = tosa.reshape %extracted_slice {new_shape = array} : (tensor<1x40xi1>) -> tensor<1x1x40xi1>
+ %16 = tosa.reshape %15 {new_shape = array} : (tensor<1x1x40xi1>) -> tensor<1x1x1x40xi1>
+ %extracted_slice_2 = tensor.extract_slice %16[0, 0, 0, 0] [1, 1, 1, 40] [1, 1, 1, 1] : tensor<1x1x1x40xi1> to tensor<1x1x1x40xi1>
+ %17 = "tosa.const"() <{value = dense : tensor<1x1x40x40xi1>}> : () -> tensor<1x1x40x40xi1>
+ %18 = tosa.add %extracted_slice_2, %17 : (tensor<1x1x1x40xi1>, tensor<1x1x40x40xi1>) -> tensor<1x1x40x40xi1>
+ %19 = tosa.cast %18 : (tensor<1x1x40x40xi1>) -> tensor<1x1x40x40xf32>
+ %20 = "tosa.const"() <{value = dense<1.000000e+00> : tensor<1x1x40x40xf32>}> : () -> tensor<1x1x40x40xf32>
+ %21 = tosa.sub %20, %19 : (tensor<1x1x40x40xf32>, tensor<1x1x40x40xf32>) -> tensor<1x1x40x40xf32>
+ %22 = tosa.cast %21 : (tensor<1x1x40x40xf32>) -> tensor<1x1x40x40xi1>
+ %cst_3 = arith.constant -3.40282347E+38 : f32
+ %23 = tensor.empty() : tensor<1x1x40x40xf32>
+ %24 = linalg.generic {indexing_maps = [#map4, #map4, #map4], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%22, %21 : tensor<1x1x40x40xi1>, tensor<1x1x40x40xf32>) outs(%23 : tensor<1x1x40x40xf32>) {
+ ^bb0(%in: i1, %in_742: f32, %out: f32):
+ %4175 = arith.select %in, %cst_3, %in_742 : f32
+ linalg.yield %4175 : f32
+ } -> tensor<1x1x40x40xf32>
+ %25 = tosa.reshape %14 {new_shape = array} : (tensor<40x40xf32>) -> tensor<1x40x40xf32>
+ %26 = tosa.reshape %25 {new_shape = array} : (tensor<1x40x40xf32>) -> tensor<1x1x40x40xf32>
+ %extracted_slice_4 = tensor.extract_slice %26[0, 0, 0, 0] [1, 1, 40, 40] [1, 1, 1, 1] : tensor<1x1x40x40xf32> to tensor<1x1x40x40xf32>
+ %extracted_slice_5 = tensor.extract_slice %extracted_slice_4[0, 0, 0, 0] [1, 1, 40, 40] [1, 1, 1, 1] : tensor<1x1x40x40xf32> to tensor<1x1x40x40xf32>
+ %27 = "tosa.const"() <{value = dense<0.000000e+00> : tensor<1x1x40x40xf32>}> : () -> tensor<1x1x40x40xf32>
+ %28 = tosa.add %extracted_slice_5, %27 : (tensor<1x1x40x40xf32>, tensor<1x1x40x40xf32>) -> tensor<1x1x40x40xf32>
+ %29 = tosa.add %24, %28 : (tensor<1x1x40x40xf32>, tensor<1x1x40x40xf32>) -> tensor<1x1x40x40xf32>
+ // RMSNorm begins
+ %30 = tensor.empty() : tensor<1x40x4096xf32>
+ %c2_i32 = arith.constant 2 : i32
+ %31 = linalg.generic {indexing_maps = [#map5, #map5], iterator_types = ["parallel", "parallel", "parallel"]} ins(%6 : tensor<1x40x4096xf32>) outs(%30 : tensor<1x40x4096xf32>) {
+ ^bb0(%in: f32, %out: f32):
+ %4175 = math.fpowi %in, %c2_i32 : f32, i32
+ linalg.yield %4175 : f32
+ } -> tensor<1x40x4096xf32>
+ %32 = tosa.reduce_sum %31 {axis = 2 : i32} : (tensor<1x40x4096xf32>) -> tensor<1x40x1xf32>
+ %33 = "tosa.const"() <{value = dense<4.096000e+03> : tensor<1xf32>}> : () -> tensor<1xf32>
+ %34 = tosa.reciprocal %33 : (tensor<1xf32>) -> tensor<1xf32>
+ %35 = tosa.mul %34, %32 {shift = 0 : i8} : (tensor<1xf32>, tensor<1x40x1xf32>) -> tensor<1x40x1xf32>
+ %36 = "tosa.const"() <{value = dense<9.99999974E-6> : tensor<1x40x1xf32>}> : () -> tensor<1x40x1xf32>
+ %37 = tosa.add %35, %36 : (tensor<1x40x1xf32>, tensor<1x40x1xf32>) -> tensor<1x40x1xf32>
+ %38 = tosa.rsqrt %37 : (tensor<1x40x1xf32>) -> tensor<1x40x1xf32>
+ %39 = tosa.mul %6, %38 {shift = 0 : i8} : (tensor<1x40x4096xf32>, tensor<1x40x1xf32>) -> tensor<1x40x4096xf32>
+ %40 = tosa.reshape %arg2 {new_shape = array} : (tensor<4096xf32>) -> tensor<1x1x4096xf32>
+  // %41 is the input matrix X after embedding and RMSNorm;
+  // the following three similar code blocks compute Q, K, and V (%46, %51, %56):
+ %41 = tosa.mul %40, %39 {shift = 0 : i8} : (tensor<1x1x4096xf32>, tensor<1x40x4096xf32>) -> tensor<1x40x4096xf32>
+
+ %42 = "tosa.const"() <{value = dense<[1, 0]> : tensor<2xi32>}> : () -> tensor<2xi32>
+ %43 = tosa.transpose %arg3, %42 : (tensor<4096x4096xf32>, tensor<2xi32>) -> tensor<4096x4096xf32>
+ %44 = tosa.reshape %41 {new_shape = array} : (tensor<1x40x4096xf32>) -> tensor<40x4096xf32>
+ %cst_6 = arith.constant dense<0.000000e+00> : tensor<40x4096xf32>
+ %45 = linalg.matmul {cast = #linalg.type_fn} ins(%44, %43 : tensor<40x4096xf32>, tensor<4096x4096xf32>) outs(%cst_6 : tensor<40x4096xf32>) -> tensor<40x4096xf32>
+ %46 = tosa.reshape %45 {new_shape = array} : (tensor<40x4096xf32>) -> tensor<1x40x4096xf32>
+
+ %47 = "tosa.const"() <{value = dense<[1, 0]> : tensor<2xi32>}> : () -> tensor<2xi32>
+ %48 = tosa.transpose %arg4, %47 : (tensor<4096x4096xf32>, tensor<2xi32>) -> tensor<4096x4096xf32>
+ %49 = tosa.reshape %41 {new_shape = array} : (tensor<1x40x4096xf32>) -> tensor<40x4096xf32>
+ %cst_7 = arith.constant dense<0.000000e+00> : tensor<40x4096xf32>
+ %50 = linalg.matmul {cast = #linalg.type_fn} ins(%49, %48 : tensor<40x4096xf32>, tensor<4096x4096xf32>) outs(%cst_7 : tensor<40x4096xf32>) -> tensor<40x4096xf32>
+ %51 = tosa.reshape %50 {new_shape = array} : (tensor<40x4096xf32>) -> tensor<1x40x4096xf32>
+
+ %52 = "tosa.const"() <{value = dense<[1, 0]> : tensor<2xi32>}> : () -> tensor<2xi32>
+ %53 = tosa.transpose %arg5, %52 : (tensor<4096x4096xf32>, tensor<2xi32>) -> tensor<4096x4096xf32>
+ %54 = tosa.reshape %41 {new_shape = array} : (tensor<1x40x4096xf32>) -> tensor<40x4096xf32>
+ %cst_8 = arith.constant dense<0.000000e+00> : tensor<40x4096xf32>
+ %55 = linalg.matmul {cast = #linalg.type_fn} ins(%54, %53 : tensor<40x4096xf32>, tensor<4096x4096xf32>) outs(%cst_8 : tensor<40x4096xf32>) -> tensor<40x4096xf32>
+ %56 = tosa.reshape %55 {new_shape = array} : (tensor<40x4096xf32>) -> tensor<1x40x4096xf32>
+  // completed the calculation of Q, K, V; dimensions are (batch, seq_len, num_heads, head_dims)
+  // transpose the Q, K, V dimensions for RoPE and the dot product
+
+  // begin of RoPE
+ %57 = tosa.reshape %46 {new_shape = array} : (tensor<1x40x4096xf32>) -> tensor<1x40x32x128xf32>
+ %58 = "tosa.const"() <{value = dense<[0, 2, 1, 3]> : tensor<4xi32>}> : () -> tensor<4xi32>
+ %59 = tosa.transpose %57, %58 : (tensor<1x40x32x128xf32>, tensor<4xi32>) -> tensor<1x32x40x128xf32>
+
+ %60 = tosa.reshape %51 {new_shape = array} : (tensor<1x40x4096xf32>) -> tensor<1x40x32x128xf32>
+ %61 = "tosa.const"() <{value = dense<[0, 2, 1, 3]> : tensor<4xi32>}> : () -> tensor<4xi32>
+ %62 = tosa.transpose %60, %61 : (tensor<1x40x32x128xf32>, tensor<4xi32>) -> tensor<1x32x40x128xf32>
+
+ %63 = tosa.reshape %56 {new_shape = array} : (tensor<1x40x4096xf32>) -> tensor<1x40x32x128xf32>
+ %64 = "tosa.const"() <{value = dense<[0, 2, 1, 3]> : tensor<4xi32>}> : () -> tensor<4xi32>
+ %65 = tosa.transpose %63, %64 : (tensor<1x40x32x128xf32>, tensor<4xi32>) -> tensor<1x32x40x128xf32>
+
+ %extracted_slice_9 = tensor.extract_slice %arg6[0, 0, 0, 0] [1, 1, 2048, 128] [1, 1, 1, 1] : tensor<1x1x2048x128xf32> to tensor<1x1x2048x128xf32>
+ %extracted_slice_10 = tensor.extract_slice %extracted_slice_9[0, 0, 0, 0] [1, 1, 2048, 128] [1, 1, 1, 1] : tensor<1x1x2048x128xf32> to tensor<1x1x2048x128xf32>
+ %extracted_slice_11 = tensor.extract_slice %extracted_slice_10[0, 0, 0, 0] [1, 1, 40, 128] [1, 1, 1, 1] : tensor<1x1x2048x128xf32> to tensor<1x1x40x128xf32>
+ %extracted_slice_12 = tensor.extract_slice %arg7[0, 0, 0, 0] [1, 1, 2048, 128] [1, 1, 1, 1] : tensor<1x1x2048x128xf32> to tensor<1x1x2048x128xf32>
+ %extracted_slice_13 = tensor.extract_slice %extracted_slice_12[0, 0, 0, 0] [1, 1, 2048, 128] [1, 1, 1, 1] : tensor<1x1x2048x128xf32> to tensor<1x1x2048x128xf32>
+ %extracted_slice_14 = tensor.extract_slice %extracted_slice_13[0, 0, 0, 0] [1, 1, 40, 128] [1, 1, 1, 1] : tensor<1x1x2048x128xf32> to tensor<1x1x40x128xf32>
+ %66 = tensor.empty() : tensor<1x40x128xf32>
+ %67 = linalg.generic {indexing_maps = [#map6, #map5], iterator_types = ["parallel", "parallel", "parallel"]} ins(%extracted_slice_11 : tensor<1x1x40x128xf32>) outs(%66 : tensor<1x40x128xf32>) {
+ ^bb0(%in: f32, %out: f32):
+ linalg.yield %in : f32
+ } -> tensor<1x40x128xf32>
+ %68 = tensor.empty() : tensor<40x128xf32>
+ %69 = linalg.generic {indexing_maps = [#map7, #map3], iterator_types = ["parallel", "parallel"]} ins(%67 : tensor<1x40x128xf32>) outs(%68 : tensor<40x128xf32>) {
+ ^bb0(%in: f32, %out: f32):
+ linalg.yield %in : f32
+ } -> tensor<40x128xf32>
+ %70 = tensor.empty() : tensor<1x40x128xf32>
+ %71 = linalg.generic {indexing_maps = [#map6, #map5], iterator_types = ["parallel", "parallel", "parallel"]} ins(%extracted_slice_14 : tensor<1x1x40x128xf32>) outs(%70 : tensor<1x40x128xf32>) {
+ ^bb0(%in: f32, %out: f32):
+ linalg.yield %in : f32
+ } -> tensor<1x40x128xf32>
+ %72 = tensor.empty() : tensor<40x128xf32>
+ %73 = linalg.generic {indexing_maps = [#map7, #map3], iterator_types = ["parallel", "parallel"]} ins(%71 : tensor<1x40x128xf32>) outs(%72 : tensor<40x128xf32>) {
+ ^bb0(%in: f32, %out: f32):
+ linalg.yield %in : f32
+ } -> tensor<40x128xf32>
+  // precompute_theta_pos_frequencies function, which is used to calculate the special values of RoPE according to: https://hyper.ai/wiki/29220
+ %74 = tensor.empty() : tensor<1x40x128xf32>
+ %75 = linalg.generic {indexing_maps = [#map2, #map5], iterator_types = ["parallel", "parallel", "parallel"]} ins(%2 : tensor<1x40xi64>) outs(%74 : tensor<1x40x128xf32>) {
+ ^bb0(%in: i64, %out: f32):
+ %4175 = arith.index_cast %in : i64 to index
+ %4176 = linalg.index 2 : index
+ %extracted = tensor.extract %69[%4175, %4176] : tensor<40x128xf32>
+ linalg.yield %extracted : f32
+ } -> tensor<1x40x128xf32>
+ %76 = tosa.reshape %75 {new_shape = array} : (tensor<1x40x128xf32>) -> tensor<1x1x40x128xf32>
+ %77 = tensor.empty() : tensor<1x40x128xf32>
+ %78 = linalg.generic {indexing_maps = [#map2, #map5], iterator_types = ["parallel", "parallel", "parallel"]} ins(%2 : tensor<1x40xi64>) outs(%77 : tensor<1x40x128xf32>) {
+ ^bb0(%in: i64, %out: f32):
+ %4175 = arith.index_cast %in : i64 to index
+ %4176 = linalg.index 2 : index
+ %extracted = tensor.extract %73[%4175, %4176] : tensor<40x128xf32>
+ linalg.yield %extracted : f32
+ } -> tensor<1x40x128xf32>
+ %79 = tosa.reshape %78 {new_shape = array} : (tensor<1x40x128xf32>) -> tensor<1x1x40x128xf32>
+ %80 = tosa.mul %59, %76 {shift = 0 : i8} : (tensor<1x32x40x128xf32>, tensor<1x1x40x128xf32>) -> tensor<1x32x40x128xf32>
+ %extracted_slice_15 = tensor.extract_slice %59[0, 0, 0, 0] [1, 32, 40, 64] [1, 1, 1, 1] : tensor<1x32x40x128xf32> to tensor<1x32x40x64xf32>
+ %extracted_slice_16 = tensor.extract_slice %59[0, 0, 0, 64] [1, 32, 40, 64] [1, 1, 1, 1] : tensor<1x32x40x128xf32> to tensor<1x32x40x64xf32>
+ %81 = tosa.negate %extracted_slice_16 : (tensor<1x32x40x64xf32>) -> tensor<1x32x40x64xf32>
+ %82 = tensor.empty() : tensor<1x32x40x128xf32>
+ %inserted_slice = tensor.insert_slice %81 into %82[0, 0, 0, 0] [1, 32, 40, 64] [1, 1, 1, 1] : tensor<1x32x40x64xf32> into tensor<1x32x40x128xf32>
+ %inserted_slice_17 = tensor.insert_slice %extracted_slice_15 into %inserted_slice[0, 0, 0, 64] [1, 32, 40, 64] [1, 1, 1, 1] : tensor<1x32x40x64xf32> into tensor<1x32x40x128xf32>
+ %83 = tosa.mul %inserted_slice_17, %79 {shift = 0 : i8} : (tensor<1x32x40x128xf32>, tensor<1x1x40x128xf32>) -> tensor<1x32x40x128xf32>
+ %84 = tosa.add %80, %83 : (tensor<1x32x40x128xf32>, tensor<1x32x40x128xf32>) -> tensor<1x32x40x128xf32>
+ %85 = tosa.mul %62, %76 {shift = 0 : i8} : (tensor<1x32x40x128xf32>, tensor<1x1x40x128xf32>) -> tensor<1x32x40x128xf32>
+ %extracted_slice_18 = tensor.extract_slice %62[0, 0, 0, 0] [1, 32, 40, 64] [1, 1, 1, 1] : tensor<1x32x40x128xf32> to tensor<1x32x40x64xf32>
+ %extracted_slice_19 = tensor.extract_slice %62[0, 0, 0, 64] [1, 32, 40, 64] [1, 1, 1, 1] : tensor<1x32x40x128xf32> to tensor<1x32x40x64xf32>
+ %86 = tosa.negate %extracted_slice_19 : (tensor<1x32x40x64xf32>) -> tensor<1x32x40x64xf32>
+ %87 = tensor.empty() : tensor<1x32x40x128xf32>
+ %inserted_slice_20 = tensor.insert_slice %86 into %87[0, 0, 0, 0] [1, 32, 40, 64] [1, 1, 1, 1] : tensor<1x32x40x64xf32> into tensor<1x32x40x128xf32>
+ %inserted_slice_21 = tensor.insert_slice %extracted_slice_18 into %inserted_slice_20[0, 0, 0, 64] [1, 32, 40, 64] [1, 1, 1, 1] : tensor<1x32x40x64xf32> into tensor<1x32x40x128xf32>
+  // end of RoPE; beginning of Softmax(QK/sqrt(d_k)):
+ %88 = tosa.mul %inserted_slice_21, %79 {shift = 0 : i8} : (tensor<1x32x40x128xf32>, tensor<1x1x40x128xf32>) -> tensor<1x32x40x128xf32>
+ %89 = tosa.add %85, %88 : (tensor<1x32x40x128xf32>, tensor<1x32x40x128xf32>) -> tensor<1x32x40x128xf32>
+ %90 = "tosa.const"() <{value = dense<[0, 1, 3, 2]> : tensor<4xi32>}> : () -> tensor<4xi32>
+ %91 = tosa.transpose %89, %90 : (tensor<1x32x40x128xf32>, tensor<4xi32>) -> tensor<1x32x128x40xf32>
+ %92 = "tosa.const"() <{value = dense<0.000000e+00> : tensor<1x32x40x128xf32>}> : () -> tensor<1x32x40x128xf32>
+ %93 = tosa.add %84, %92 : (tensor<1x32x40x128xf32>, tensor<1x32x40x128xf32>) -> tensor<1x32x40x128xf32>
+ %94 = tosa.reshape %93 {new_shape = array} : (tensor<1x32x40x128xf32>) -> tensor<32x40x128xf32>
+ %95 = "tosa.const"() <{value = dense<0.000000e+00> : tensor<1x32x128x40xf32>}> : () -> tensor<1x32x128x40xf32>
+ %96 = tosa.add %91, %95 : (tensor<1x32x128x40xf32>, tensor<1x32x128x40xf32>) -> tensor<1x32x128x40xf32>
+ %97 = tosa.reshape %96 {new_shape = array} : (tensor<1x32x128x40xf32>) -> tensor<32x128x40xf32>
+ %98 = tosa.matmul %94, %97 : (tensor<32x40x128xf32>, tensor<32x128x40xf32>) -> tensor<32x40x40xf32>
+ %99 = tosa.reshape %98 {new_shape = array} : (tensor<32x40x40xf32>) -> tensor<1x32x40x40xf32>
+ %100 = "tosa.const"() <{value = dense<11.3137083> : tensor<1x32x40x40xf32>}> : () -> tensor<1x32x40x40xf32>
+ %101 = tosa.reciprocal %100 : (tensor<1x32x40x40xf32>) -> tensor<1x32x40x40xf32>
+ %102 = tosa.mul %99, %101 {shift = 0 : i8} : (tensor<1x32x40x40xf32>, tensor<1x32x40x40xf32>) -> tensor<1x32x40x40xf32>
+ %103 = tosa.add %102, %29 : (tensor<1x32x40x40xf32>, tensor<1x1x40x40xf32>) -> tensor<1x32x40x40xf32>
+ %104 = tosa.reduce_max %103 {axis = 3 : i32} : (tensor<1x32x40x40xf32>) -> tensor<1x32x40x1xf32>
+ %105 = tosa.sub %103, %104 : (tensor<1x32x40x40xf32>, tensor<1x32x40x1xf32>) -> tensor<1x32x40x40xf32>
+ %106 = tosa.exp %105 : (tensor<1x32x40x40xf32>) -> tensor<1x32x40x40xf32>
+ %107 = tosa.reduce_sum %106 {axis = 3 : i32} : (tensor<1x32x40x40xf32>) -> tensor<1x32x40x1xf32>
+ %108 = tosa.reciprocal %107 : (tensor<1x32x40x1xf32>) -> tensor<1x32x40x1xf32>
+ %109 = tosa.mul %106, %108 {shift = 0 : i8} : (tensor<1x32x40x40xf32>, tensor<1x32x40x1xf32>) -> tensor<1x32x40x40xf32>
+  // end of Softmax(QK/sqrt(d_k)); beginning of the matmul with V
+ %110 = "tosa.const"() <{value = dense<0.000000e+00> : tensor<1x32x40x40xf32>}> : () -> tensor<1x32x40x40xf32>
+ %111 = tosa.add %109, %110 : (tensor<1x32x40x40xf32>, tensor<1x32x40x40xf32>) -> tensor<1x32x40x40xf32>
+ %112 = tosa.reshape %111 {new_shape = array} : (tensor<1x32x40x40xf32>) -> tensor<32x40x40xf32>
+ %113 = "tosa.const"() <{value = dense<0.000000e+00> : tensor<1x32x40x128xf32>}> : () -> tensor<1x32x40x128xf32>
+ %114 = tosa.add %65, %113 : (tensor<1x32x40x128xf32>, tensor<1x32x40x128xf32>) -> tensor<1x32x40x128xf32>
+ %115 = tosa.reshape %114 {new_shape = array} : (tensor<1x32x40x128xf32>) -> tensor<32x40x128xf32>
+ //
+ %116 = tosa.matmul %112, %115 : (tensor<32x40x40xf32>, tensor<32x40x128xf32>) -> tensor<32x40x128xf32>
+  // per-head Softmax(QK/sqrt(d_k))*V is complete (batched over all heads); merge the heads back below.
+ %117 = tosa.reshape %116 {new_shape = array} : (tensor<32x40x128xf32>) -> tensor<1x32x40x128xf32>
+ %118 = "tosa.const"() <{value = dense<[0, 2, 1, 3]> : tensor<4xi32>}> : () -> tensor<4xi32>
+ %119 = tosa.transpose %117, %118 : (tensor<1x32x40x128xf32>, tensor<4xi32>) -> tensor<1x40x32x128xf32>
+ %120 = tosa.identity %119 : (tensor<1x40x32x128xf32>) -> tensor<1x40x32x128xf32>
+ %121 = tosa.reshape %120 {new_shape = array} : (tensor<1x40x32x128xf32>) -> tensor<1x40x4096xf32>
+ %122 = "tosa.const"() <{value = dense<[1, 0]> : tensor<2xi32>}> : () -> tensor<2xi32>
+ %123 = tosa.transpose %arg8, %122 : (tensor<4096x4096xf32>, tensor<2xi32>) -> tensor<4096x4096xf32>
+ %124 = tosa.reshape %121 {new_shape = array} : (tensor<1x40x4096xf32>) -> tensor<40x4096xf32>
+ %cst_22 = arith.constant dense<0.000000e+00> : tensor<40x4096xf32>
+ %125 = linalg.matmul {cast = #linalg.type_fn} ins(%124, %123 : tensor<40x4096xf32>, tensor<4096x4096xf32>) outs(%cst_22 : tensor<40x4096xf32>) -> tensor<40x4096xf32>
+ %126 = tosa.reshape %125 {new_shape = array} : (tensor<40x4096xf32>) -> tensor<1x40x4096xf32>
+ %127 = tosa.add %6, %126 : (tensor<1x40x4096xf32>, tensor<1x40x4096xf32>) -> tensor<1x40x4096xf32>
+  // end of the GQA (Group Query Attention) block; beginning of the FFN block (RMSNorm --> SwiGLU).
+ %128 = tensor.empty() : tensor<1x40x4096xf32>
+ %c2_i32_23 = arith.constant 2 : i32
+ %129 = linalg.generic {indexing_maps = [#map5, #map5], iterator_types = ["parallel", "parallel", "parallel"]} ins(%127 : tensor<1x40x4096xf32>) outs(%128 : tensor<1x40x4096xf32>) {
+ ^bb0(%in: f32, %out: f32):
+ %4175 = math.fpowi %in, %c2_i32_23 : f32, i32
+ linalg.yield %4175 : f32
+ } -> tensor<1x40x4096xf32>
+ %130 = tosa.reduce_sum %129 {axis = 2 : i32} : (tensor<1x40x4096xf32>) -> tensor<1x40x1xf32>
+ %131 = "tosa.const"() <{value = dense<4.096000e+03> : tensor<1xf32>}> : () -> tensor<1xf32>
+ %132 = tosa.reciprocal %131 : (tensor<1xf32>) -> tensor<1xf32>
+ %133 = tosa.mul %132, %130 {shift = 0 : i8} : (tensor<1xf32>, tensor<1x40x1xf32>) -> tensor<1x40x1xf32>
+ %134 = "tosa.const"() <{value = dense<9.99999974E-6> : tensor<1x40x1xf32>}> : () -> tensor<1x40x1xf32>
+ %135 = tosa.add %133, %134 : (tensor<1x40x1xf32>, tensor<1x40x1xf32>) -> tensor<1x40x1xf32>
+ %136 = tosa.rsqrt %135 : (tensor<1x40x1xf32>) -> tensor<1x40x1xf32>
+ %137 = tosa.mul %127, %136 {shift = 0 : i8} : (tensor<1x40x4096xf32>, tensor<1x40x1xf32>) -> tensor<1x40x4096xf32>
+ %138 = tosa.reshape %arg9 {new_shape = array} : (tensor<4096xf32>) -> tensor<1x1x4096xf32>
+ %139 = tosa.mul %138, %137 {shift = 0 : i8} : (tensor<1x1x4096xf32>, tensor<1x40x4096xf32>) -> tensor<1x40x4096xf32>
+ %140 = "tosa.const"() <{value = dense<[1, 0]> : tensor<2xi32>}> : () -> tensor<2xi32>
+ %141 = tosa.transpose %arg10, %140 : (tensor<11008x4096xf32>, tensor<2xi32>) -> tensor<4096x11008xf32>
+ %142 = tosa.reshape %139 {new_shape = array} : (tensor<1x40x4096xf32>) -> tensor<40x4096xf32>
+ %cst_24 = arith.constant dense<0.000000e+00> : tensor<40x11008xf32>
+ %143 = linalg.matmul {cast = #linalg.type_fn} ins(%142, %141 : tensor<40x4096xf32>, tensor<4096x11008xf32>) outs(%cst_24 : tensor<40x11008xf32>) -> tensor<40x11008xf32>
+ %144 = tosa.reshape %143 {new_shape = array} : (tensor<40x11008xf32>) -> tensor<1x40x11008xf32>
+ %145 = tosa.sigmoid %144 : (tensor<1x40x11008xf32>) -> tensor<1x40x11008xf32>
+ %146 = tosa.mul %144, %145 {shift = 0 : i8} : (tensor<1x40x11008xf32>, tensor<1x40x11008xf32>) -> tensor<1x40x11008xf32>
+ %147 = "tosa.const"() <{value = dense<[1, 0]> : tensor<2xi32>}> : () -> tensor<2xi32>
+ %148 = tosa.transpose %arg11, %147 : (tensor<11008x4096xf32>, tensor<2xi32>) -> tensor<4096x11008xf32>
+ %149 = tosa.reshape %139 {new_shape = array} : (tensor<1x40x4096xf32>) -> tensor<40x4096xf32>
+ %cst_25 = arith.constant dense<0.000000e+00> : tensor<40x11008xf32>
+ %150 = linalg.matmul {cast = #linalg.type_fn} ins(%149, %148 : tensor<40x4096xf32>, tensor<4096x11008xf32>) outs(%cst_25 : tensor<40x11008xf32>) -> tensor<40x11008xf32>
+ %151 = tosa.reshape %150 {new_shape = array} : (tensor<40x11008xf32>) -> tensor<1x40x11008xf32>
+ %152 = tosa.mul %146, %151 {shift = 0 : i8} : (tensor<1x40x11008xf32>, tensor<1x40x11008xf32>) -> tensor<1x40x11008xf32>
+ %153 = "tosa.const"() <{value = dense<[1, 0]> : tensor<2xi32>}> : () -> tensor<2xi32>
+ %154 = tosa.transpose %arg12, %153 : (tensor<4096x11008xf32>, tensor<2xi32>) -> tensor<11008x4096xf32>
+ %155 = tosa.reshape %152 {new_shape = array} : (tensor<1x40x11008xf32>) -> tensor<40x11008xf32>
+ %cst_26 = arith.constant dense<0.000000e+00> : tensor<40x4096xf32>
+ %156 = linalg.matmul {cast = #linalg.type_fn} ins(%155, %154 : tensor<40x11008xf32>, tensor<11008x4096xf32>) outs(%cst_26 : tensor<40x4096xf32>) -> tensor<40x4096xf32>
+ %157 = tosa.reshape %156 {new_shape = array} : (tensor<40x4096xf32>) -> tensor<1x40x4096xf32>
+ %158 = tosa.add %127, %157 : (tensor<1x40x4096xf32>, tensor<1x40x4096xf32>) -> tensor<1x40x4096xf32>
 + // End of the previous decoder block; beginning of a new decoder block.
+ %159 = tensor.empty() : tensor<1x40x4096xf32>
+ %c2_i32_27 = arith.constant 2 : i32
+ %160 = linalg.generic {indexing_maps = [#map5, #map5], iterator_types = ["parallel", "parallel", "parallel"]} ins(%158 : tensor<1x40x4096xf32>) outs(%159 : tensor<1x40x4096xf32>) {
+ ^bb0(%in: f32, %out: f32):
+ %4175 = math.fpowi %in, %c2_i32_27 : f32, i32
+ linalg.yield %4175 : f32
+ } -> tensor<1x40x4096xf32>
+ %161 = tosa.reduce_sum %160 {axis = 2 : i32} : (tensor<1x40x4096xf32>) -> tensor<1x40x1xf32>
+ %162 = "tosa.const"() <{value = dense<4.096000e+03> : tensor<1xf32>}> : () -> tensor<1xf32>
+ %163 = tosa.reciprocal %162 : (tensor<1xf32>) -> tensor<1xf32>
+ %164 = tosa.mul %163, %161 {shift = 0 : i8} : (tensor<1xf32>, tensor<1x40x1xf32>) -> tensor<1x40x1xf32>
+ %165 = "tosa.const"() <{value = dense<9.99999974E-6> : tensor<1x40x1xf32>}> : () -> tensor<1x40x1xf32>
+ %166 = tosa.add %164, %165 : (tensor<1x40x1xf32>, tensor<1x40x1xf32>) -> tensor<1x40x1xf32>
+ %167 = tosa.rsqrt %166 : (tensor<1x40x1xf32>) -> tensor<1x40x1xf32>
+ %168 = tosa.mul %158, %167 {shift = 0 : i8} : (tensor<1x40x4096xf32>, tensor<1x40x1xf32>) -> tensor<1x40x4096xf32>
+ %169 = tosa.reshape %arg13 {new_shape = array} : (tensor<4096xf32>) -> tensor<1x1x4096xf32>
 + // %170 is the normalized hidden-state matrix X feeding this decoder block;
 + // it is followed by three consecutive, similar code blocks computing Q, K, and V (%175, %180, %185):
+ %170 = tosa.mul %169, %168 {shift = 0 : i8} : (tensor<1x1x4096xf32>, tensor<1x40x4096xf32>) -> tensor<1x40x4096xf32>
+
+ %171 = "tosa.const"() <{value = dense<[1, 0]> : tensor<2xi32>}> : () -> tensor<2xi32>
+ %172 = tosa.transpose %arg14, %171 : (tensor<4096x4096xf32>, tensor<2xi32>) -> tensor<4096x4096xf32>
+ %173 = tosa.reshape %170 {new_shape = array} : (tensor<1x40x4096xf32>) -> tensor<40x4096xf32>
+ %cst_28 = arith.constant dense<0.000000e+00> : tensor<40x4096xf32>
+ %174 = linalg.matmul {cast = #linalg.type_fn} ins(%173, %172 : tensor<40x4096xf32>, tensor<4096x4096xf32>) outs(%cst_28 : tensor<40x4096xf32>) -> tensor<40x4096xf32>
+ %175 = tosa.reshape %174 {new_shape = array} : (tensor<40x4096xf32>) -> tensor<1x40x4096xf32>
+
+ %176 = "tosa.const"() <{value = dense<[1, 0]> : tensor<2xi32>}> : () -> tensor<2xi32>
+ %177 = tosa.transpose %arg15, %176 : (tensor<4096x4096xf32>, tensor<2xi32>) -> tensor<4096x4096xf32>
+ %178 = tosa.reshape %170 {new_shape = array} : (tensor<1x40x4096xf32>) -> tensor<40x4096xf32>
+ %cst_29 = arith.constant dense<0.000000e+00> : tensor<40x4096xf32>
+ %179 = linalg.matmul {cast = #linalg.type_fn} ins(%178, %177 : tensor<40x4096xf32>, tensor<4096x4096xf32>) outs(%cst_29 : tensor<40x4096xf32>) -> tensor<40x4096xf32>
+ %180 = tosa.reshape %179 {new_shape = array} : (tensor<40x4096xf32>) -> tensor<1x40x4096xf32>
+
+ %181 = "tosa.const"() <{value = dense<[1, 0]> : tensor<2xi32>}> : () -> tensor<2xi32>
+ %182 = tosa.transpose %arg16, %181 : (tensor<4096x4096xf32>, tensor<2xi32>) -> tensor<4096x4096xf32>
+ %183 = tosa.reshape %170 {new_shape = array} : (tensor<1x40x4096xf32>) -> tensor<40x4096xf32>
+ %cst_30 = arith.constant dense<0.000000e+00> : tensor<40x4096xf32>
+ %184 = linalg.matmul {cast = #linalg.type_fn} ins(%183, %182 : tensor<40x4096xf32>, tensor<4096x4096xf32>) outs(%cst_30 : tensor<40x4096xf32>) -> tensor<40x4096xf32>
+ %185 = tosa.reshape %184 {new_shape = array} : (tensor<40x4096xf32>) -> tensor<1x40x4096xf32>
+ // completed the calculation of Q, K, V above.
+ %186 = tosa.reshape %175 {new_shape = array} : (tensor<1x40x4096xf32>) -> tensor<1x40x32x128xf32>
+ %187 = "tosa.const"() <{value = dense<[0, 2, 1, 3]> : tensor<4xi32>}> : () -> tensor<4xi32>
+ %188 = tosa.transpose %186, %187 : (tensor<1x40x32x128xf32>, tensor<4xi32>) -> tensor<1x32x40x128xf32>
+
+ %189 = tosa.reshape %180 {new_shape = array} : (tensor<1x40x4096xf32>) -> tensor<1x40x32x128xf32>
+ %190 = "tosa.const"() <{value = dense<[0, 2, 1, 3]> : tensor<4xi32>}> : () -> tensor<4xi32>
+ %191 = tosa.transpose %189, %190 : (tensor<1x40x32x128xf32>, tensor<4xi32>) -> tensor<1x32x40x128xf32>
+
+ %192 = tosa.reshape %185 {new_shape = array} : (tensor<1x40x4096xf32>) -> tensor<1x40x32x128xf32>
+ %193 = "tosa.const"() <{value = dense<[0, 2, 1, 3]> : tensor<4xi32>}> : () -> tensor<4xi32>
+ %194 = tosa.transpose %192, %193 : (tensor<1x40x32x128xf32>, tensor<4xi32>) -> tensor<1x32x40x128xf32>
+
+ %extracted_slice_31 = tensor.extract_slice %arg17[0, 0, 0, 0] [1, 1, 2048, 128] [1, 1, 1, 1] : tensor<1x1x2048x128xf32> to tensor<1x1x2048x128xf32>
+ %extracted_slice_32 = tensor.extract_slice %extracted_slice_31[0, 0, 0, 0] [1, 1, 2048, 128] [1, 1, 1, 1] : tensor<1x1x2048x128xf32> to tensor<1x1x2048x128xf32>
+ %extracted_slice_33 = tensor.extract_slice %extracted_slice_32[0, 0, 0, 0] [1, 1, 40, 128] [1, 1, 1, 1] : tensor<1x1x2048x128xf32> to tensor<1x1x40x128xf32>
+ %extracted_slice_34 = tensor.extract_slice %arg18[0, 0, 0, 0] [1, 1, 2048, 128] [1, 1, 1, 1] : tensor<1x1x2048x128xf32> to tensor<1x1x2048x128xf32>
+ %extracted_slice_35 = tensor.extract_slice %extracted_slice_34[0, 0, 0, 0] [1, 1, 2048, 128] [1, 1, 1, 1] : tensor<1x1x2048x128xf32> to tensor<1x1x2048x128xf32>
+ %extracted_slice_36 = tensor.extract_slice %extracted_slice_35[0, 0, 0, 0] [1, 1, 40, 128] [1, 1, 1, 1] : tensor<1x1x2048x128xf32> to tensor<1x1x40x128xf32>
+ %195 = tensor.empty() : tensor<1x40x128xf32>
+ %196 = linalg.generic {indexing_maps = [#map6, #map5], iterator_types = ["parallel", "parallel", "parallel"]} ins(%extracted_slice_33 : tensor<1x1x40x128xf32>) outs(%195 : tensor<1x40x128xf32>) {
+ ^bb0(%in: f32, %out: f32):
+ linalg.yield %in : f32
+ } -> tensor<1x40x128xf32>
+ %197 = tensor.empty() : tensor<40x128xf32>
+ // #map2 = affine_map<(d0, d1, d2) -> (d0, d1)>
+ // #map3 = affine_map<(d0, d1) -> (d0, d1)>
+ // #map4 = affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>
+ // #map5 = affine_map<(d0, d1, d2) -> (d0, d1, d2)>
+ // #map6 = affine_map<(d0, d1, d2) -> (d0, 0, d1, d2)>
+ // #map7 = affine_map<(d0, d1) -> (0, d0, d1)>
+ %198 = linalg.generic {indexing_maps = [#map7, #map3], iterator_types = ["parallel", "parallel"]} ins(%196 : tensor<1x40x128xf32>) outs(%197 : tensor<40x128xf32>) {
+ ^bb0(%in: f32, %out: f32):
+ linalg.yield %in : f32
+ } -> tensor<40x128xf32>
+ %199 = tensor.empty() : tensor<1x40x128xf32>
+ %200 = linalg.generic {indexing_maps = [#map6, #map5], iterator_types = ["parallel", "parallel", "parallel"]} ins(%extracted_slice_36 : tensor<1x1x40x128xf32>) outs(%199 : tensor<1x40x128xf32>) {
+ ^bb0(%in: f32, %out: f32):
+ linalg.yield %in : f32
+ } -> tensor<1x40x128xf32>
+ %201 = tensor.empty() : tensor<40x128xf32>
+ %202 = linalg.generic {indexing_maps = [#map7, #map3], iterator_types = ["parallel", "parallel"]} ins(%200 : tensor<1x40x128xf32>) outs(%201 : tensor<40x128xf32>) {
+ ^bb0(%in: f32, %out: f32):
+ linalg.yield %in : f32
+ } -> tensor<40x128xf32>
+ %203 = tensor.empty() : tensor<1x40x128xf32>
+ %204 = linalg.generic {indexing_maps = [#map2, #map5], iterator_types = ["parallel", "parallel", "parallel"]} ins(%2 : tensor<1x40xi64>) outs(%203 : tensor<1x40x128xf32>) {
+ ^bb0(%in: i64, %out: f32):
+ %4175 = arith.index_cast %in : i64 to index
+ %4176 = linalg.index 2 : index
+ %extracted = tensor.extract %198[%4175, %4176] : tensor<40x128xf32>
+ linalg.yield %extracted : f32
+ } -> tensor<1x40x128xf32>
+ %205 = tosa.reshape %204 {new_shape = array} : (tensor<1x40x128xf32>) -> tensor<1x1x40x128xf32>
+ %206 = tensor.empty() : tensor<1x40x128xf32>
+ %207 = linalg.generic {indexing_maps = [#map2, #map5], iterator_types = ["parallel", "parallel", "parallel"]} ins(%2 : tensor<1x40xi64>) outs(%206 : tensor<1x40x128xf32>) {
+ ^bb0(%in: i64, %out: f32):
+ %4175 = arith.index_cast %in : i64 to index
+ %4176 = linalg.index 2 : index
+ %extracted = tensor.extract %202[%4175, %4176] : tensor<40x128xf32>
+ linalg.yield %extracted : f32
+ } -> tensor<1x40x128xf32>
+ %208 = tosa.reshape %207 {new_shape = array} : (tensor<1x40x128xf32>) -> tensor<1x1x40x128xf32>
+ %209 = tosa.mul %188, %205 {shift = 0 : i8} : (tensor<1x32x40x128xf32>, tensor<1x1x40x128xf32>) -> tensor<1x32x40x128xf32>
+ %extracted_slice_37 = tensor.extract_slice %188[0, 0, 0, 0] [1, 32, 40, 64] [1, 1, 1, 1] : tensor<1x32x40x128xf32> to tensor<1x32x40x64xf32>
+ %extracted_slice_38 = tensor.extract_slice %188[0, 0, 0, 64] [1, 32, 40, 64] [1, 1, 1, 1] : tensor<1x32x40x128xf32> to tensor<1x32x40x64xf32>
+ %210 = tosa.negate %extracted_slice_38 : (tensor<1x32x40x64xf32>) -> tensor<1x32x40x64xf32>
+ %211 = tensor.empty() : tensor<1x32x40x128xf32>
+ %inserted_slice_39 = tensor.insert_slice %210 into %211[0, 0, 0, 0] [1, 32, 40, 64] [1, 1, 1, 1] : tensor<1x32x40x64xf32> into tensor<1x32x40x128xf32>
+ %inserted_slice_40 = tensor.insert_slice %extracted_slice_37 into %inserted_slice_39[0, 0, 0, 64] [1, 32, 40, 64] [1, 1, 1, 1] : tensor<1x32x40x64xf32> into tensor<1x32x40x128xf32>
+ %212 = tosa.mul %inserted_slice_40, %208 {shift = 0 : i8} : (tensor<1x32x40x128xf32>, tensor<1x1x40x128xf32>) -> tensor<1x32x40x128xf32>
+ %213 = tosa.add %209, %212 : (tensor<1x32x40x128xf32>, tensor<1x32x40x128xf32>) -> tensor<1x32x40x128xf32>
+
+ %214 = tosa.mul %191, %205 {shift = 0 : i8} : (tensor<1x32x40x128xf32>, tensor<1x1x40x128xf32>) -> tensor<1x32x40x128xf32>
+ %extracted_slice_41 = tensor.extract_slice %191[0, 0, 0, 0] [1, 32, 40, 64] [1, 1, 1, 1] : tensor<1x32x40x128xf32> to tensor<1x32x40x64xf32>
+ %extracted_slice_42 = tensor.extract_slice %191[0, 0, 0, 64] [1, 32, 40, 64] [1, 1, 1, 1] : tensor<1x32x40x128xf32> to tensor<1x32x40x64xf32>
+ %215 = tosa.negate %extracted_slice_42 : (tensor<1x32x40x64xf32>) -> tensor<1x32x40x64xf32>
+ %216 = tensor.empty() : tensor<1x32x40x128xf32>
+ %inserted_slice_43 = tensor.insert_slice %215 into %216[0, 0, 0, 0] [1, 32, 40, 64] [1, 1, 1, 1] : tensor<1x32x40x64xf32> into tensor<1x32x40x128xf32>
+ %inserted_slice_44 = tensor.insert_slice %extracted_slice_41 into %inserted_slice_43[0, 0, 0, 64] [1, 32, 40, 64] [1, 1, 1, 1] : tensor<1x32x40x64xf32> into tensor<1x32x40x128xf32>
+ %217 = tosa.mul %inserted_slice_44, %208 {shift = 0 : i8} : (tensor<1x32x40x128xf32>, tensor<1x1x40x128xf32>) -> tensor<1x32x40x128xf32>
+ %218 = tosa.add %214, %217 : (tensor<1x32x40x128xf32>, tensor<1x32x40x128xf32>) -> tensor<1x32x40x128xf32>
+
+ %219 = "tosa.const"() <{value = dense<[0, 1, 3, 2]> : tensor<4xi32>}> : () -> tensor<4xi32>
+ %220 = tosa.transpose %218, %219 : (tensor<1x32x40x128xf32>, tensor<4xi32>) -> tensor<1x32x128x40xf32>
+ %221 = "tosa.const"() <{value = dense<0.000000e+00> : tensor<1x32x40x128xf32>}> : () -> tensor<1x32x40x128xf32>
+ %222 = tosa.add %213, %221 : (tensor<1x32x40x128xf32>, tensor<1x32x40x128xf32>) -> tensor<1x32x40x128xf32>
+ %223 = tosa.reshape %222 {new_shape = array} : (tensor<1x32x40x128xf32>) -> tensor<32x40x128xf32>
+ %224 = "tosa.const"() <{value = dense<0.000000e+00> : tensor<1x32x128x40xf32>}> : () -> tensor<1x32x128x40xf32>
+ %225 = tosa.add %220, %224 : (tensor<1x32x128x40xf32>, tensor<1x32x128x40xf32>) -> tensor<1x32x128x40xf32>
+ %226 = tosa.reshape %225 {new_shape = array} : (tensor<1x32x128x40xf32>) -> tensor<32x128x40xf32>
+ %227 = tosa.matmul %223, %226 : (tensor<32x40x128xf32>, tensor<32x128x40xf32>) -> tensor<32x40x40xf32>
+ %228 = tosa.reshape %227 {new_shape = array} : (tensor<32x40x40xf32>) -> tensor<1x32x40x40xf32>
+ %229 = "tosa.const"() <{value = dense<11.3137083> : tensor<1x32x40x40xf32>}> : () -> tensor<1x32x40x40xf32>
+ %230 = tosa.reciprocal %229 : (tensor<1x32x40x40xf32>) -> tensor<1x32x40x40xf32>
+ %231 = tosa.mul %228, %230 {shift = 0 : i8} : (tensor<1x32x40x40xf32>, tensor<1x32x40x40xf32>) -> tensor<1x32x40x40xf32>
+ %232 = tosa.add %231, %29 : (tensor<1x32x40x40xf32>, tensor<1x1x40x40xf32>) -> tensor<1x32x40x40xf32>
+ %233 = tosa.reduce_max %232 {axis = 3 : i32} : (tensor<1x32x40x40xf32>) -> tensor<1x32x40x1xf32>
+ %234 = tosa.sub %232, %233 : (tensor<1x32x40x40xf32>, tensor<1x32x40x1xf32>) -> tensor<1x32x40x40xf32>
+ %235 = tosa.exp %234 : (tensor<1x32x40x40xf32>) -> tensor<1x32x40x40xf32>
+ %236 = tosa.reduce_sum %235 {axis = 3 : i32} : (tensor<1x32x40x40xf32>) -> tensor<1x32x40x1xf32>
+ %237 = tosa.reciprocal %236 : (tensor<1x32x40x1xf32>) -> tensor<1x32x40x1xf32>
+ %238 = tosa.mul %235, %237 {shift = 0 : i8} : (tensor<1x32x40x40xf32>, tensor<1x32x40x1xf32>) -> tensor<1x32x40x40xf32>
+ %239 = "tosa.const"() <{value = dense<0.000000e+00> : tensor<1x32x40x40xf32>}> : () -> tensor<1x32x40x40xf32>
+ %240 = tosa.add %238, %239 : (tensor<1x32x40x40xf32>, tensor<1x32x40x40xf32>) -> tensor<1x32x40x40xf32>
+ %241 = tosa.reshape %240 {new_shape = array} : (tensor<1x32x40x40xf32>) -> tensor<32x40x40xf32>
+ %242 = "tosa.const"() <{value = dense<0.000000e+00> : tensor<1x32x40x128xf32>}> : () -> tensor<1x32x40x128xf32>
+ %243 = tosa.add %194, %242 : (tensor<1x32x40x128xf32>, tensor<1x32x40x128xf32>) -> tensor<1x32x40x128xf32>
+ %244 = tosa.reshape %243 {new_shape = array} : (tensor<1x32x40x128xf32>) -> tensor<32x40x128xf32>
+ %245 = tosa.matmul %241, %244 : (tensor<32x40x40xf32>, tensor<32x40x128xf32>) -> tensor<32x40x128xf32>
+ %246 = tosa.reshape %245 {new_shape = array} : (tensor<32x40x128xf32>) -> tensor<1x32x40x128xf32>
+ %247 = "tosa.const"() <{value = dense<[0, 2, 1, 3]> : tensor<4xi32>}> : () -> tensor<4xi32>
+ %248 = tosa.transpose %246, %247 : (tensor<1x32x40x128xf32>, tensor<4xi32>) -> tensor<1x40x32x128xf32>
+ %249 = tosa.identity %248 : (tensor<1x40x32x128xf32>) -> tensor<1x40x32x128xf32>
+ %250 = tosa.reshape %249 {new_shape = array} : (tensor<1x40x32x128xf32>) -> tensor<1x40x4096xf32>
+ %251 = "tosa.const"() <{value = dense<[1, 0]> : tensor<2xi32>}> : () -> tensor<2xi32>
+ %252 = tosa.transpose %arg19, %251 : (tensor<4096x4096xf32>, tensor<2xi32>) -> tensor<4096x4096xf32>
+ %253 = tosa.reshape %250 {new_shape = array} : (tensor<1x40x4096xf32>) -> tensor<40x4096xf32>
+ %cst_45 = arith.constant dense<0.000000e+00> : tensor<40x4096xf32>
+ %254 = linalg.matmul {cast = #linalg.type_fn} ins(%253, %252 : tensor<40x4096xf32>, tensor<4096x4096xf32>) outs(%cst_45 : tensor<40x4096xf32>) -> tensor<40x4096xf32>
+ %255 = tosa.reshape %254 {new_shape = array} : (tensor<40x4096xf32>) -> tensor<1x40x4096xf32>
+ %256 = tosa.add %158, %255 : (tensor<1x40x4096xf32>, tensor<1x40x4096xf32>) -> tensor<1x40x4096xf32>
+ %257 = tensor.empty() : tensor<1x40x4096xf32>
+ %c2_i32_46 = arith.constant 2 : i32
+ %258 = linalg.generic {indexing_maps = [#map5, #map5], iterator_types = ["parallel", "parallel", "parallel"]} ins(%256 : tensor<1x40x4096xf32>) outs(%257 : tensor<1x40x4096xf32>) {
+ ^bb0(%in: f32, %out: f32):
+ %4175 = math.fpowi %in, %c2_i32_46 : f32, i32
+ linalg.yield %4175 : f32
+ } -> tensor<1x40x4096xf32>
+ %259 = tosa.reduce_sum %258 {axis = 2 : i32} : (tensor<1x40x4096xf32>) -> tensor<1x40x1xf32>
+ %260 = "tosa.const"() <{value = dense<4.096000e+03> : tensor<1xf32>}> : () -> tensor<1xf32>
+ %261 = tosa.reciprocal %260 : (tensor<1xf32>) -> tensor<1xf32>
+ %262 = tosa.mul %261, %259 {shift = 0 : i8} : (tensor<1xf32>, tensor<1x40x1xf32>) -> tensor<1x40x1xf32>
+ %263 = "tosa.const"() <{value = dense<9.99999974E-6> : tensor<1x40x1xf32>}> : () -> tensor<1x40x1xf32>
+ %264 = tosa.add %262, %263 : (tensor<1x40x1xf32>, tensor<1x40x1xf32>) -> tensor<1x40x1xf32>
+ %265 = tosa.rsqrt %264 : (tensor<1x40x1xf32>) -> tensor<1x40x1xf32>
+ %266 = tosa.mul %256, %265 {shift = 0 : i8} : (tensor<1x40x4096xf32>, tensor<1x40x1xf32>) -> tensor<1x40x4096xf32>
+ %267 = tosa.reshape %arg20 {new_shape = array} : (tensor<4096xf32>) -> tensor<1x1x4096xf32>
+ %268 = tosa.mul %267, %266 {shift = 0 : i8} : (tensor<1x1x4096xf32>, tensor<1x40x4096xf32>) -> tensor<1x40x4096xf32>
+ %269 = "tosa.const"() <{value = dense<[1, 0]> : tensor<2xi32>}> : () -> tensor<2xi32>
+ %270 = tosa.transpose %arg21, %269 : (tensor<11008x4096xf32>, tensor<2xi32>) -> tensor<4096x11008xf32>
+ %271 = tosa.reshape %268 {new_shape = array} : (tensor<1x40x4096xf32>) -> tensor<40x4096xf32>
+ %cst_47 = arith.constant dense<0.000000e+00> : tensor<40x11008xf32>
+ %272 = linalg.matmul {cast = #linalg.type_fn} ins(%271, %270 : tensor<40x4096xf32>, tensor<4096x11008xf32>) outs(%cst_47 : tensor<40x11008xf32>) -> tensor<40x11008xf32>
+ %273 = tosa.reshape %272 {new_shape = array} : (tensor<40x11008xf32>) -> tensor<1x40x11008xf32>
+ %274 = tosa.sigmoid %273 : (tensor<1x40x11008xf32>) -> tensor<1x40x11008xf32>
+ %275 = tosa.mul %273, %274 {shift = 0 : i8} : (tensor<1x40x11008xf32>, tensor<1x40x11008xf32>) -> tensor<1x40x11008xf32>
+ %276 = "tosa.const"() <{value = dense<[1, 0]> : tensor<2xi32>}> : () -> tensor<2xi32>
+ %277 = tosa.transpose %arg22, %276 : (tensor<11008x4096xf32>, tensor<2xi32>) -> tensor<4096x11008xf32>
+ %278 = tosa.reshape %268 {new_shape = array} : (tensor<1x40x4096xf32>) -> tensor<40x4096xf32>
+ %cst_48 = arith.constant dense<0.000000e+00> : tensor<40x11008xf32>
+ %279 = linalg.matmul {cast = #linalg.type_fn} ins(%278, %277 : tensor<40x4096xf32>, tensor<4096x11008xf32>) outs(%cst_48 : tensor<40x11008xf32>) -> tensor<40x11008xf32>
+ %280 = tosa.reshape %279 {new_shape = array} : (tensor<40x11008xf32>) -> tensor<1x40x11008xf32>
+ %281 = tosa.mul %275, %280 {shift = 0 : i8} : (tensor<1x40x11008xf32>, tensor<1x40x11008xf32>) -> tensor<1x40x11008xf32>
+ %282 = "tosa.const"() <{value = dense<[1, 0]> : tensor<2xi32>}> : () -> tensor<2xi32>
+ %283 = tosa.transpose %arg23, %282 : (tensor<4096x11008xf32>, tensor<2xi32>) -> tensor<11008x4096xf32>
+ %284 = tosa.reshape %281 {new_shape = array} : (tensor<1x40x11008xf32>) -> tensor<40x11008xf32>
+ %cst_49 = arith.constant dense<0.000000e+00> : tensor<40x4096xf32>
+ %285 = linalg.matmul {cast = #linalg.type_fn} ins(%284, %283 : tensor<40x11008xf32>, tensor<11008x4096xf32>) outs(%cst_49 : tensor<40x4096xf32>) -> tensor<40x4096xf32>
+ %286 = tosa.reshape %285 {new_shape = array} : (tensor<40x4096xf32>) -> tensor<1x40x4096xf32>
+ %287 = tosa.add %256, %286 : (tensor<1x40x4096xf32>, tensor<1x40x4096xf32>) -> tensor<1x40x4096xf32>
+ %288 = tensor.empty() : tensor<1x40x4096xf32>
+ %c2_i32_50 = arith.constant 2 : i32
+ %289 = linalg.generic {indexing_maps = [#map5, #map5], iterator_types = ["parallel", "parallel", "parallel"]} ins(%287 : tensor<1x40x4096xf32>) outs(%288 : tensor<1x40x4096xf32>) {
+ ^bb0(%in: f32, %out: f32):
+ %4175 = math.fpowi %in, %c2_i32_50 : f32, i32
+ linalg.yield %4175 : f32
+ } -> tensor<1x40x4096xf32>
+ %290 = tosa.reduce_sum %289 {axis = 2 : i32} : (tensor<1x40x4096xf32>) -> tensor<1x40x1xf32>
+ %291 = "tosa.const"() <{value = dense<4.096000e+03> : tensor<1xf32>}> : () -> tensor<1xf32>
+ %292 = tosa.reciprocal %291 : (tensor<1xf32>) -> tensor<1xf32>
+ %293 = tosa.mul %292, %290 {shift = 0 : i8} : (tensor<1xf32>, tensor<1x40x1xf32>) -> tensor<1x40x1xf32>
+ %294 = "tosa.const"() <{value = dense<9.99999974E-6> : tensor<1x40x1xf32>}> : () -> tensor<1x40x1xf32>
+ %295 = tosa.add %293, %294 : (tensor<1x40x1xf32>, tensor<1x40x1xf32>) -> tensor<1x40x1xf32>
+ %296 = tosa.rsqrt %295 : (tensor<1x40x1xf32>) -> tensor<1x40x1xf32>
+ %297 = tosa.mul %287, %296 {shift = 0 : i8} : (tensor<1x40x4096xf32>, tensor<1x40x1xf32>) -> tensor<1x40x4096xf32>
+ %298 = tosa.reshape %arg24 {new_shape = array} : (tensor<4096xf32>) -> tensor<1x1x4096xf32>
+ %299 = tosa.mul %298, %297 {shift = 0 : i8} : (tensor<1x1x4096xf32>, tensor<1x40x4096xf32>) -> tensor<1x40x4096xf32>
+ %300 = "tosa.const"() <{value = dense<[1, 0]> : tensor<2xi32>}> : () -> tensor<2xi32>
+ %301 = tosa.transpose %arg25, %300 : (tensor<4096x4096xf32>, tensor<2xi32>) -> tensor<4096x4096xf32>
+ %302 = tosa.reshape %299 {new_shape = array} : (tensor<1x40x4096xf32>) -> tensor<40x4096xf32>
+ %cst_51 = arith.constant dense<0.000000e+00> : tensor<40x4096xf32>
+ %303 = linalg.matmul {cast = #linalg.type_fn} ins(%302, %301 : tensor<40x4096xf32>, tensor<4096x4096xf32>) outs(%cst_51 : tensor<40x4096xf32>) -> tensor<40x4096xf32>
+ %304 = tosa.reshape %303 {new_shape = array} : (tensor<40x4096xf32>) -> tensor<1x40x4096xf32>
+ %305 = "tosa.const"() <{value = dense<[1, 0]> : tensor<2xi32>}> : () -> tensor<2xi32>
+ %306 = tosa.transpose %arg26, %305 : (tensor<4096x4096xf32>, tensor<2xi32>) -> tensor<4096x4096xf32>
+ %307 = tosa.reshape %299 {new_shape = array} : (tensor<1x40x4096xf32>) -> tensor<40x4096xf32>
+ %cst_52 = arith.constant dense<0.000000e+00> : tensor<40x4096xf32>
+ %308 = linalg.matmul {cast = #linalg.type_fn} ins(%307, %306 : tensor<40x4096xf32>, tensor<4096x4096xf32>) outs(%cst_52 : tensor<40x4096xf32>) -> tensor<40x4096xf32>
+ %309 = tosa.reshape %308 {new_shape = array} : (tensor<40x4096xf32>) -> tensor<1x40x4096xf32>
+ %310 = "tosa.const"() <{value = dense<[1, 0]> : tensor<2xi32>}> : () -> tensor<2xi32>
+ %311 = tosa.transpose %arg27, %310 : (tensor<4096x4096xf32>, tensor<2xi32>) -> tensor<4096x4096xf32>
+ %312 = tosa.reshape %299 {new_shape = array} : (tensor<1x40x4096xf32>) -> tensor<40x4096xf32>
+ %cst_53 = arith.constant dense<0.000000e+00> : tensor<40x4096xf32>
+ %313 = linalg.matmul {cast = #linalg.type_fn} ins(%312, %311 : tensor<40x4096xf32>, tensor<4096x4096xf32>) outs(%cst_53 : tensor<40x4096xf32>) -> tensor<40x4096xf32>
+ %314 = tosa.reshape %313 {new_shape = array} : (tensor<40x4096xf32>) -> tensor<1x40x4096xf32>
+ %315 = tosa.reshape %304 {new_shape = array} : (tensor<1x40x4096xf32>) -> tensor<1x40x32x128xf32>
+ %316 = "tosa.const"() <{value = dense<[0, 2, 1, 3]> : tensor<4xi32>}> : () -> tensor<4xi32>
+ %317 = tosa.transpose %315, %316 : (tensor<1x40x32x128xf32>, tensor<4xi32>) -> tensor<1x32x40x128xf32>
+ %318 = tosa.reshape %309 {new_shape = array} : (tensor<1x40x4096xf32>) -> tensor<1x40x32x128xf32>
+ %319 = "tosa.const"() <{value = dense<[0, 2, 1, 3]> : tensor<4xi32>}> : () -> tensor<4xi32>
+ %320 = tosa.transpose %318, %319 : (tensor<1x40x32x128xf32>, tensor<4xi32>) -> tensor<1x32x40x128xf32>
+ %321 = tosa.reshape %314 {new_shape = array} : (tensor<1x40x4096xf32>) -> tensor<1x40x32x128xf32>
+ %322 = "tosa.const"() <{value = dense<[0, 2, 1, 3]> : tensor<4xi32>}> : () -> tensor<4xi32>
+ %323 = tosa.transpose %321, %322 : (tensor<1x40x32x128xf32>, tensor<4xi32>) -> tensor<1x32x40x128xf32>
+ %extracted_slice_54 = tensor.extract_slice %arg28[0, 0, 0, 0] [1, 1, 2048, 128] [1, 1, 1, 1] : tensor<1x1x2048x128xf32> to tensor<1x1x2048x128xf32>
+ %extracted_slice_55 = tensor.extract_slice %extracted_slice_54[0, 0, 0, 0] [1, 1, 2048, 128] [1, 1, 1, 1] : tensor<1x1x2048x128xf32> to tensor<1x1x2048x128xf32>
+ %extracted_slice_56 = tensor.extract_slice %extracted_slice_55[0, 0, 0, 0] [1, 1, 40, 128] [1, 1, 1, 1] : tensor<1x1x2048x128xf32> to tensor<1x1x40x128xf32>
+ %extracted_slice_57 = tensor.extract_slice %arg29[0, 0, 0, 0] [1, 1, 2048, 128] [1, 1, 1, 1] : tensor<1x1x2048x128xf32> to tensor<1x1x2048x128xf32>
+ %extracted_slice_58 = tensor.extract_slice %extracted_slice_57[0, 0, 0, 0] [1, 1, 2048, 128] [1, 1, 1, 1] : tensor<1x1x2048x128xf32> to tensor<1x1x2048x128xf32>
+ %extracted_slice_59 = tensor.extract_slice %extracted_slice_58[0, 0, 0, 0] [1, 1, 40, 128] [1, 1, 1, 1] : tensor<1x1x2048x128xf32> to tensor<1x1x40x128xf32>
+ %324 = tensor.empty() : tensor<1x40x128xf32>
+ %325 = linalg.generic {indexing_maps = [#map6, #map5], iterator_types = ["parallel", "parallel", "parallel"]} ins(%extracted_slice_56 : tensor<1x1x40x128xf32>) outs(%324 : tensor<1x40x128xf32>) {
+ ^bb0(%in: f32, %out: f32):
+ linalg.yield %in : f32
+ } -> tensor<1x40x128xf32>
+ %326 = tensor.empty() : tensor<40x128xf32>
+ %327 = linalg.generic {indexing_maps = [#map7, #map3], iterator_types = ["parallel", "parallel"]} ins(%325 : tensor<1x40x128xf32>) outs(%326 : tensor<40x128xf32>) {
+ ^bb0(%in: f32, %out: f32):
+ linalg.yield %in : f32
+ } -> tensor<40x128xf32>
+ %328 = tensor.empty() : tensor<1x40x128xf32>
+ %329 = linalg.generic {indexing_maps = [#map6, #map5], iterator_types = ["parallel", "parallel", "parallel"]} ins(%extracted_slice_59 : tensor<1x1x40x128xf32>) outs(%328 : tensor<1x40x128xf32>) {
+ ^bb0(%in: f32, %out: f32):
+ linalg.yield %in : f32
+ } -> tensor<1x40x128xf32>
+ %330 = tensor.empty() : tensor<40x128xf32>
+ %331 = linalg.generic {indexing_maps = [#map7, #map3], iterator_types = ["parallel", "parallel"]} ins(%329 : tensor<1x40x128xf32>) outs(%330 : tensor<40x128xf32>) {
+ ^bb0(%in: f32, %out: f32):
+ linalg.yield %in : f32
+ } -> tensor<40x128xf32>
+ %332 = tensor.empty() : tensor<1x40x128xf32>
+ %333 = linalg.generic {indexing_maps = [#map2, #map5], iterator_types = ["parallel", "parallel", "parallel"]} ins(%2 : tensor<1x40xi64>) outs(%332 : tensor<1x40x128xf32>) {
+ ^bb0(%in: i64, %out: f32):
+ %4175 = arith.index_cast %in : i64 to index
+ %4176 = linalg.index 2 : index
+ %extracted = tensor.extract %327[%4175, %4176] : tensor<40x128xf32>
+ linalg.yield %extracted : f32
+ } -> tensor<1x40x128xf32>
+ %334 = tosa.reshape %333 {new_shape = array} : (tensor<1x40x128xf32>) -> tensor<1x1x40x128xf32>
+ %335 = tensor.empty() : tensor<1x40x128xf32>
+ %336 = linalg.generic {indexing_maps = [#map2, #map5], iterator_types = ["parallel", "parallel", "parallel"]} ins(%2 : tensor<1x40xi64>) outs(%335 : tensor<1x40x128xf32>) {
+ ^bb0(%in: i64, %out: f32):
+ %4175 = arith.index_cast %in : i64 to index
+ %4176 = linalg.index 2 : index
+ %extracted = tensor.extract %331[%4175, %4176] : tensor<40x128xf32>
+ linalg.yield %extracted : f32
+ } -> tensor<1x40x128xf32>
+ %337 = tosa.reshape %336 {new_shape = array} : (tensor<1x40x128xf32>) -> tensor<1x1x40x128xf32>
+ %338 = tosa.mul %317, %334 {shift = 0 : i8} : (tensor<1x32x40x128xf32>, tensor<1x1x40x128xf32>) -> tensor<1x32x40x128xf32>
+ %extracted_slice_60 = tensor.extract_slice %317[0, 0, 0, 0] [1, 32, 40, 64] [1, 1, 1, 1] : tensor<1x32x40x128xf32> to tensor<1x32x40x64xf32>
+ %extracted_slice_61 = tensor.extract_slice %317[0, 0, 0, 64] [1, 32, 40, 64] [1, 1, 1, 1] : tensor<1x32x40x128xf32> to tensor<1x32x40x64xf32>
+ %339 = tosa.negate %extracted_slice_61 : (tensor<1x32x40x64xf32>) -> tensor<1x32x40x64xf32>
+ %340 = tensor.empty() : tensor<1x32x40x128xf32>
+ %inserted_slice_62 = tensor.insert_slice %339 into %340[0, 0, 0, 0] [1, 32, 40, 64] [1, 1, 1, 1] : tensor<1x32x40x64xf32> into tensor<1x32x40x128xf32>
+ %inserted_slice_63 = tensor.insert_slice %extracted_slice_60 into %inserted_slice_62[0, 0, 0, 64] [1, 32, 40, 64] [1, 1, 1, 1] : tensor<1x32x40x64xf32> into tensor<1x32x40x128xf32>
+ %341 = tosa.mul %inserted_slice_63, %337 {shift = 0 : i8} : (tensor<1x32x40x128xf32>, tensor<1x1x40x128xf32>) -> tensor<1x32x40x128xf32>
+ %342 = tosa.add %338, %341 : (tensor<1x32x40x128xf32>, tensor<1x32x40x128xf32>) -> tensor<1x32x40x128xf32>
+ %343 = tosa.mul %320, %334 {shift = 0 : i8} : (tensor<1x32x40x128xf32>, tensor<1x1x40x128xf32>) -> tensor<1x32x40x128xf32>
+ %extracted_slice_64 = tensor.extract_slice %320[0, 0, 0, 0] [1, 32, 40, 64] [1, 1, 1, 1] : tensor<1x32x40x128xf32> to tensor<1x32x40x64xf32>
+ %extracted_slice_65 = tensor.extract_slice %320[0, 0, 0, 64] [1, 32, 40, 64] [1, 1, 1, 1] : tensor<1x32x40x128xf32> to tensor<1x32x40x64xf32>
+ %344 = tosa.negate %extracted_slice_65 : (tensor<1x32x40x64xf32>) -> tensor<1x32x40x64xf32>
+ %345 = tensor.empty() : tensor<1x32x40x128xf32>
+ %inserted_slice_66 = tensor.insert_slice %344 into %345[0, 0, 0, 0] [1, 32, 40, 64] [1, 1, 1, 1] : tensor<1x32x40x64xf32> into tensor<1x32x40x128xf32>
+ %inserted_slice_67 = tensor.insert_slice %extracted_slice_64 into %inserted_slice_66[0, 0, 0, 64] [1, 32, 40, 64] [1, 1, 1, 1] : tensor<1x32x40x64xf32> into tensor<1x32x40x128xf32>
+ %346 = tosa.mul %inserted_slice_67, %337 {shift = 0 : i8} : (tensor<1x32x40x128xf32>, tensor<1x1x40x128xf32>) -> tensor<1x32x40x128xf32>
+ %347 = tosa.add %343, %346 : (tensor<1x32x40x128xf32>, tensor<1x32x40x128xf32>) -> tensor<1x32x40x128xf32>
+ %348 = "tosa.const"() <{value = dense<[0, 1, 3, 2]> : tensor<4xi32>}> : () -> tensor<4xi32>
+ %349 = tosa.transpose %347, %348 : (tensor<1x32x40x128xf32>, tensor<4xi32>) -> tensor<1x32x128x40xf32>
+ %350 = "tosa.const"() <{value = dense<0.000000e+00> : tensor<1x32x40x128xf32>}> : () -> tensor<1x32x40x128xf32>
+ %351 = tosa.add %342, %350 : (tensor<1x32x40x128xf32>, tensor<1x32x40x128xf32>) -> tensor<1x32x40x128xf32>
+ %352 = tosa.reshape %351 {new_shape = array} : (tensor<1x32x40x128xf32>) -> tensor<32x40x128xf32>
+ %353 = "tosa.const"() <{value = dense<0.000000e+00> : tensor<1x32x128x40xf32>}> : () -> tensor<1x32x128x40xf32>
+ %354 = tosa.add %349, %353 : (tensor<1x32x128x40xf32>, tensor<1x32x128x40xf32>) -> tensor<1x32x128x40xf32>
+ %355 = tosa.reshape %354 {new_shape = array} : (tensor<1x32x128x40xf32>) -> tensor<32x128x40xf32>
+ %356 = tosa.matmul %352, %355 : (tensor<32x40x128xf32>, tensor<32x128x40xf32>) -> tensor<32x40x40xf32>
+ %357 = tosa.reshape %356 {new_shape = array} : (tensor<32x40x40xf32>) -> tensor<1x32x40x40xf32>
+ %358 = "tosa.const"() <{value = dense<11.3137083> : tensor<1x32x40x40xf32>}> : () -> tensor<1x32x40x40xf32>
+ %359 = tosa.reciprocal %358 : (tensor<1x32x40x40xf32>) -> tensor<1x32x40x40xf32>
+ %360 = tosa.mul %357, %359 {shift = 0 : i8} : (tensor<1x32x40x40xf32>, tensor<1x32x40x40xf32>) -> tensor<1x32x40x40xf32>
+ %361 = tosa.add %360, %29 : (tensor<1x32x40x40xf32>, tensor<1x1x40x40xf32>) -> tensor<1x32x40x40xf32>
+ %362 = tosa.reduce_max %361 {axis = 3 : i32} : (tensor<1x32x40x40xf32>) -> tensor<1x32x40x1xf32>
+ %363 = tosa.sub %361, %362 : (tensor<1x32x40x40xf32>, tensor<1x32x40x1xf32>) -> tensor<1x32x40x40xf32>
+ %364 = tosa.exp %363 : (tensor<1x32x40x40xf32>) -> tensor<1x32x40x40xf32>
+ %365 = tosa.reduce_sum %364 {axis = 3 : i32} : (tensor<1x32x40x40xf32>) -> tensor<1x32x40x1xf32>
+ %366 = tosa.reciprocal %365 : (tensor<1x32x40x1xf32>) -> tensor<1x32x40x1xf32>
+ %367 = tosa.mul %364, %366 {shift = 0 : i8} : (tensor<1x32x40x40xf32>, tensor<1x32x40x1xf32>) -> tensor<1x32x40x40xf32>
+ %368 = "tosa.const"() <{value = dense<0.000000e+00> : tensor<1x32x40x40xf32>}> : () -> tensor<1x32x40x40xf32>
+ %369 = tosa.add %367, %368 : (tensor<1x32x40x40xf32>, tensor<1x32x40x40xf32>) -> tensor<1x32x40x40xf32>
+ %370 = tosa.reshape %369 {new_shape = array} : (tensor<1x32x40x40xf32>) -> tensor<32x40x40xf32>
+ %371 = "tosa.const"() <{value = dense<0.000000e+00> : tensor<1x32x40x128xf32>}> : () -> tensor<1x32x40x128xf32>
+ %372 = tosa.add %323, %371 : (tensor<1x32x40x128xf32>, tensor<1x32x40x128xf32>) -> tensor<1x32x40x128xf32>
+ %373 = tosa.reshape %372 {new_shape = array} : (tensor<1x32x40x128xf32>) -> tensor<32x40x128xf32>
+ %374 = tosa.matmul %370, %373 : (tensor<32x40x40xf32>, tensor<32x40x128xf32>) -> tensor<32x40x128xf32>
+ %375 = tosa.reshape %374 {new_shape = array} : (tensor<32x40x128xf32>) -> tensor<1x32x40x128xf32>
+ %376 = "tosa.const"() <{value = dense<[0, 2, 1, 3]> : tensor<4xi32>}> : () -> tensor<4xi32>
+ %377 = tosa.transpose %375, %376 : (tensor<1x32x40x128xf32>, tensor<4xi32>) -> tensor<1x40x32x128xf32>
+ %378 = tosa.identity %377 : (tensor<1x40x32x128xf32>) -> tensor<1x40x32x128xf32>
+ %379 = tosa.reshape %378 {new_shape = array} : (tensor<1x40x32x128xf32>) -> tensor<1x40x4096xf32>
+ %380 = "tosa.const"() <{value = dense<[1, 0]> : tensor<2xi32>}> : () -> tensor<2xi32>
+ %381 = tosa.transpose %arg30, %380 : (tensor<4096x4096xf32>, tensor<2xi32>) -> tensor<4096x4096xf32>
+ %382 = tosa.reshape %379 {new_shape = array} : (tensor<1x40x4096xf32>) -> tensor<40x4096xf32>
+ %cst_68 = arith.constant dense<0.000000e+00> : tensor<40x4096xf32>
+ %383 = linalg.matmul {cast = #linalg.type_fn} ins(%382, %381 : tensor<40x4096xf32>, tensor<4096x4096xf32>) outs(%cst_68 : tensor<40x4096xf32>) -> tensor<40x4096xf32>
+ %384 = tosa.reshape %383 {new_shape = array} : (tensor<40x4096xf32>) -> tensor<1x40x4096xf32>
+ %385 = tosa.add %287, %384 : (tensor<1x40x4096xf32>, tensor<1x40x4096xf32>) -> tensor<1x40x4096xf32>
+ %386 = tensor.empty() : tensor<1x40x4096xf32>
+ %c2_i32_69 = arith.constant 2 : i32
+ %387 = linalg.generic {indexing_maps = [#map5, #map5], iterator_types = ["parallel", "parallel", "parallel"]} ins(%385 : tensor<1x40x4096xf32>) outs(%386 : tensor<1x40x4096xf32>) {
+ ^bb0(%in: f32, %out: f32):
+ %4175 = math.fpowi %in, %c2_i32_69 : f32, i32
+ linalg.yield %4175 : f32
+ } -> tensor<1x40x4096xf32>
+ %388 = tosa.reduce_sum %387 {axis = 2 : i32} : (tensor<1x40x4096xf32>) -> tensor<1x40x1xf32>
+ %389 = "tosa.const"() <{value = dense<4.096000e+03> : tensor<1xf32>}> : () -> tensor<1xf32>
+ %390 = tosa.reciprocal %389 : (tensor<1xf32>) -> tensor<1xf32>
+ %391 = tosa.mul %390, %388 {shift = 0 : i8} : (tensor<1xf32>, tensor<1x40x1xf32>) -> tensor<1x40x1xf32>
+ %392 = "tosa.const"() <{value = dense<9.99999974E-6> : tensor<1x40x1xf32>}> : () -> tensor<1x40x1xf32>
+ %393 = tosa.add %391, %392 : (tensor<1x40x1xf32>, tensor<1x40x1xf32>) -> tensor<1x40x1xf32>
+ %394 = tosa.rsqrt %393 : (tensor<1x40x1xf32>) -> tensor<1x40x1xf32>
+ %395 = tosa.mul %385, %394 {shift = 0 : i8} : (tensor<1x40x4096xf32>, tensor<1x40x1xf32>) -> tensor<1x40x4096xf32>
+ %396 = tosa.reshape %arg31 {new_shape = array} : (tensor<4096xf32>) -> tensor<1x1x4096xf32>
+ %397 = tosa.mul %396, %395 {shift = 0 : i8} : (tensor<1x1x4096xf32>, tensor<1x40x4096xf32>) -> tensor<1x40x4096xf32>
+ %398 = "tosa.const"() <{value = dense<[1, 0]> : tensor<2xi32>}> : () -> tensor<2xi32>
+ %399 = tosa.transpose %arg32, %398 : (tensor<11008x4096xf32>, tensor<2xi32>) -> tensor<4096x11008xf32>
+ %400 = tosa.reshape %397 {new_shape = array} : (tensor<1x40x4096xf32>) -> tensor<40x4096xf32>
+ %cst_70 = arith.constant dense<0.000000e+00> : tensor<40x11008xf32>
+ %401 = linalg.matmul {cast = #linalg.type_fn} ins(%400, %399 : tensor<40x4096xf32>, tensor<4096x11008xf32>) outs(%cst_70 : tensor<40x11008xf32>) -> tensor<40x11008xf32>
+ %402 = tosa.reshape %401 {new_shape = array} : (tensor<40x11008xf32>) -> tensor<1x40x11008xf32>
+ %403 = tosa.sigmoid %402 : (tensor<1x40x11008xf32>) -> tensor<1x40x11008xf32>
+ %404 = tosa.mul %402, %403 {shift = 0 : i8} : (tensor<1x40x11008xf32>, tensor<1x40x11008xf32>) -> tensor<1x40x11008xf32>
+ %405 = "tosa.const"() <{value = dense<[1, 0]> : tensor<2xi32>}> : () -> tensor<2xi32>
+ %406 = tosa.transpose %arg33, %405 : (tensor<11008x4096xf32>, tensor<2xi32>) -> tensor<4096x11008xf32>
+ %407 = tosa.reshape %397 {new_shape = array} : (tensor<1x40x4096xf32>) -> tensor<40x4096xf32>
+ %cst_71 = arith.constant dense<0.000000e+00> : tensor<40x11008xf32>
+ %408 = linalg.matmul {cast = #linalg.type_fn} ins(%407, %406 : tensor<40x4096xf32>, tensor<4096x11008xf32>) outs(%cst_71 : tensor<40x11008xf32>) -> tensor<40x11008xf32>
+ %409 = tosa.reshape %408 {new_shape = array} : (tensor<40x11008xf32>) -> tensor<1x40x11008xf32>
+ %410 = tosa.mul %404, %409 {shift = 0 : i8} : (tensor<1x40x11008xf32>, tensor<1x40x11008xf32>) -> tensor<1x40x11008xf32>
+ %411 = "tosa.const"() <{value = dense<[1, 0]> : tensor<2xi32>}> : () -> tensor<2xi32>
+ %412 = tosa.transpose %arg34, %411 : (tensor<4096x11008xf32>, tensor<2xi32>) -> tensor<11008x4096xf32>
+ %413 = tosa.reshape %410 {new_shape = array} : (tensor<1x40x11008xf32>) -> tensor<40x11008xf32>
+ %cst_72 = arith.constant dense<0.000000e+00> : tensor<40x4096xf32>
+ %414 = linalg.matmul {cast = #linalg.type_fn} ins(%413, %412 : tensor<40x11008xf32>, tensor<11008x4096xf32>) outs(%cst_72 : tensor<40x4096xf32>) -> tensor<40x4096xf32>
+ %415 = tosa.reshape %414 {new_shape = array} : (tensor<40x4096xf32>) -> tensor<1x40x4096xf32>
+ %416 = tosa.add %385, %415 : (tensor<1x40x4096xf32>, tensor<1x40x4096xf32>) -> tensor<1x40x4096xf32>
+ %417 = tensor.empty() : tensor<1x40x4096xf32>
+ %c2_i32_73 = arith.constant 2 : i32
+ %418 = linalg.generic {indexing_maps = [#map5, #map5], iterator_types = ["parallel", "parallel", "parallel"]} ins(%416 : tensor<1x40x4096xf32>) outs(%417 : tensor<1x40x4096xf32>) {
+ ^bb0(%in: f32, %out: f32):
+ %4175 = math.fpowi %in, %c2_i32_73 : f32, i32
+ linalg.yield %4175 : f32
+ } -> tensor<1x40x4096xf32>
+ %419 = tosa.reduce_sum %418 {axis = 2 : i32} : (tensor<1x40x4096xf32>) -> tensor<1x40x1xf32>
+ %420 = "tosa.const"() <{value = dense<4.096000e+03> : tensor<1xf32>}> : () -> tensor<1xf32>
+ %421 = tosa.reciprocal %420 : (tensor<1xf32>) -> tensor<1xf32>
+ %422 = tosa.mul %421, %419 {shift = 0 : i8} : (tensor<1xf32>, tensor<1x40x1xf32>) -> tensor<1x40x1xf32>
+ %423 = "tosa.const"() <{value = dense<9.99999974E-6> : tensor<1x40x1xf32>}> : () -> tensor<1x40x1xf32>
+ %424 = tosa.add %422, %423 : (tensor<1x40x1xf32>, tensor<1x40x1xf32>) -> tensor<1x40x1xf32>
+ %425 = tosa.rsqrt %424 : (tensor<1x40x1xf32>) -> tensor<1x40x1xf32>
+ %426 = tosa.mul %416, %425 {shift = 0 : i8} : (tensor<1x40x4096xf32>, tensor<1x40x1xf32>) -> tensor<1x40x4096xf32>
+ %427 = tosa.reshape %arg35 {new_shape = array} : (tensor<4096xf32>) -> tensor<1x1x4096xf32>
+ %428 = tosa.mul %427, %426 {shift = 0 : i8} : (tensor<1x1x4096xf32>, tensor<1x40x4096xf32>) -> tensor<1x40x4096xf32>
+ %429 = "tosa.const"() <{value = dense<[1, 0]> : tensor<2xi32>}> : () -> tensor<2xi32>
+ %430 = tosa.transpose %arg36, %429 : (tensor<4096x4096xf32>, tensor<2xi32>) -> tensor<4096x4096xf32>
+ %431 = tosa.reshape %428 {new_shape = array} : (tensor<1x40x4096xf32>) -> tensor<40x4096xf32>
+ %cst_74 = arith.constant dense<0.000000e+00> : tensor<40x4096xf32>
+ %432 = linalg.matmul {cast = #linalg.type_fn} ins(%431, %430 : tensor<40x4096xf32>, tensor<4096x4096xf32>) outs(%cst_74 : tensor<40x4096xf32>) -> tensor<40x4096xf32>
+ %433 = tosa.reshape %432 {new_shape = array} : (tensor<40x4096xf32>) -> tensor<1x40x4096xf32>
+ %434 = "tosa.const"() <{value = dense<[1, 0]> : tensor<2xi32>}> : () -> tensor<2xi32>
+ %435 = tosa.transpose %arg37, %434 : (tensor<4096x4096xf32>, tensor<2xi32>) -> tensor<4096x4096xf32>
+ %436 = tosa.reshape %428 {new_shape = array} : (tensor<1x40x4096xf32>) -> tensor<40x4096xf32>
+ %cst_75 = arith.constant dense<0.000000e+00> : tensor<40x4096xf32>
+ %437 = linalg.matmul {cast = #linalg.type_fn} ins(%436, %435 : tensor<40x4096xf32>, tensor<4096x4096xf32>) outs(%cst_75 : tensor<40x4096xf32>) -> tensor<40x4096xf32>
+ %438 = tosa.reshape %437 {new_shape = array} : (tensor<40x4096xf32>) -> tensor<1x40x4096xf32>
+ %439 = "tosa.const"() <{value = dense<[1, 0]> : tensor<2xi32>}> : () -> tensor<2xi32>
+ %440 = tosa.transpose %arg38, %439 : (tensor<4096x4096xf32>, tensor<2xi32>) -> tensor<4096x4096xf32>
+ %441 = tosa.reshape %428 {new_shape = array} : (tensor<1x40x4096xf32>) -> tensor<40x4096xf32>
+ %cst_76 = arith.constant dense<0.000000e+00> : tensor<40x4096xf32>
+ %442 = linalg.matmul {cast = #linalg.type_fn} ins(%441, %440 : tensor<40x4096xf32>, tensor<4096x4096xf32>) outs(%cst_76 : tensor<40x4096xf32>) -> tensor<40x4096xf32>
+ %443 = tosa.reshape %442 {new_shape = array} : (tensor<40x4096xf32>) -> tensor<1x40x4096xf32>
+ %444 = tosa.reshape %433 {new_shape = array} : (tensor<1x40x4096xf32>) -> tensor<1x40x32x128xf32>
+ %445 = "tosa.const"() <{value = dense<[0, 2, 1, 3]> : tensor<4xi32>}> : () -> tensor<4xi32>
+ %446 = tosa.transpose %444, %445 : (tensor<1x40x32x128xf32>, tensor<4xi32>) -> tensor<1x32x40x128xf32>
+ %447 = tosa.reshape %438 {new_shape = array} : (tensor<1x40x4096xf32>) -> tensor<1x40x32x128xf32>
+ %448 = "tosa.const"() <{value = dense<[0, 2, 1, 3]> : tensor<4xi32>}> : () -> tensor<4xi32>
+ %449 = tosa.transpose %447, %448 : (tensor<1x40x32x128xf32>, tensor<4xi32>) -> tensor<1x32x40x128xf32>
+ %450 = tosa.reshape %443 {new_shape = array} : (tensor<1x40x4096xf32>) -> tensor<1x40x32x128xf32>
+ %451 = "tosa.const"() <{value = dense<[0, 2, 1, 3]> : tensor<4xi32>}> : () -> tensor<4xi32>
+ %452 = tosa.transpose %450, %451 : (tensor<1x40x32x128xf32>, tensor<4xi32>) -> tensor<1x32x40x128xf32>
+ %extracted_slice_77 = tensor.extract_slice %arg39[0, 0, 0, 0] [1, 1, 2048, 128] [1, 1, 1, 1] : tensor<1x1x2048x128xf32> to tensor<1x1x2048x128xf32>
+ %extracted_slice_78 = tensor.extract_slice %extracted_slice_77[0, 0, 0, 0] [1, 1, 2048, 128] [1, 1, 1, 1] : tensor<1x1x2048x128xf32> to tensor<1x1x2048x128xf32>
+ %extracted_slice_79 = tensor.extract_slice %extracted_slice_78[0, 0, 0, 0] [1, 1, 40, 128] [1, 1, 1, 1] : tensor<1x1x2048x128xf32> to tensor<1x1x40x128xf32>
+ %extracted_slice_80 = tensor.extract_slice %arg40[0, 0, 0, 0] [1, 1, 2048, 128] [1, 1, 1, 1] : tensor<1x1x2048x128xf32> to tensor<1x1x2048x128xf32>
+ %extracted_slice_81 = tensor.extract_slice %extracted_slice_80[0, 0, 0, 0] [1, 1, 2048, 128] [1, 1, 1, 1] : tensor<1x1x2048x128xf32> to tensor<1x1x2048x128xf32>
+ %extracted_slice_82 = tensor.extract_slice %extracted_slice_81[0, 0, 0, 0] [1, 1, 40, 128] [1, 1, 1, 1] : tensor<1x1x2048x128xf32> to tensor<1x1x40x128xf32>
+ %453 = tensor.empty() : tensor<1x40x128xf32>
+ %454 = linalg.generic {indexing_maps = [#map6, #map5], iterator_types = ["parallel", "parallel", "parallel"]} ins(%extracted_slice_79 : tensor<1x1x40x128xf32>) outs(%453 : tensor<1x40x128xf32>) {
+ ^bb0(%in: f32, %out: f32):
+ linalg.yield %in : f32
+ } -> tensor<1x40x128xf32>
+ %455 = tensor.empty() : tensor<40x128xf32>
+ %456 = linalg.generic {indexing_maps = [#map7, #map3], iterator_types = ["parallel", "parallel"]} ins(%454 : tensor<1x40x128xf32>) outs(%455 : tensor<40x128xf32>) {
+ ^bb0(%in: f32, %out: f32):
+ linalg.yield %in : f32
+ } -> tensor<40x128xf32>
+ %457 = tensor.empty() : tensor<1x40x128xf32>
+ %458 = linalg.generic {indexing_maps = [#map6, #map5], iterator_types = ["parallel", "parallel", "parallel"]} ins(%extracted_slice_82 : tensor<1x1x40x128xf32>) outs(%457 : tensor<1x40x128xf32>) {
+ ^bb0(%in: f32, %out: f32):
+ linalg.yield %in : f32
+ } -> tensor<1x40x128xf32>
+ %459 = tensor.empty() : tensor<40x128xf32>
+ %460 = linalg.generic {indexing_maps = [#map7, #map3], iterator_types = ["parallel", "parallel"]} ins(%458 : tensor<1x40x128xf32>) outs(%459 : tensor<40x128xf32>) {
+ ^bb0(%in: f32, %out: f32):
+ linalg.yield %in : f32
+ } -> tensor<40x128xf32>
+ %461 = tensor.empty() : tensor<1x40x128xf32>
+ %462 = linalg.generic {indexing_maps = [#map2, #map5], iterator_types = ["parallel", "parallel", "parallel"]} ins(%2 : tensor<1x40xi64>) outs(%461 : tensor<1x40x128xf32>) {
+ ^bb0(%in: i64, %out: f32):
+ %4175 = arith.index_cast %in : i64 to index
+ %4176 = linalg.index 2 : index
+ %extracted = tensor.extract %456[%4175, %4176] : tensor<40x128xf32>
+ linalg.yield %extracted : f32
+ } -> tensor<1x40x128xf32>
+ %463 = tosa.reshape %462 {new_shape = array} : (tensor<1x40x128xf32>) -> tensor<1x1x40x128xf32>
+ %464 = tensor.empty() : tensor<1x40x128xf32>
+ %465 = linalg.generic {indexing_maps = [#map2, #map5], iterator_types = ["parallel", "parallel", "parallel"]} ins(%2 : tensor<1x40xi64>) outs(%464 : tensor<1x40x128xf32>) {
+ ^bb0(%in: i64, %out: f32):
+ %4175 = arith.index_cast %in : i64 to index
+ %4176 = linalg.index 2 : index
+ %extracted = tensor.extract %460[%4175, %4176] : tensor<40x128xf32>
+ linalg.yield %extracted : f32
+ } -> tensor<1x40x128xf32>
+ %466 = tosa.reshape %465 {new_shape = array} : (tensor<1x40x128xf32>) -> tensor<1x1x40x128xf32>
+ %467 = tosa.mul %446, %463 {shift = 0 : i8} : (tensor<1x32x40x128xf32>, tensor<1x1x40x128xf32>) -> tensor<1x32x40x128xf32>
+ %extracted_slice_83 = tensor.extract_slice %446[0, 0, 0, 0] [1, 32, 40, 64] [1, 1, 1, 1] : tensor<1x32x40x128xf32> to tensor<1x32x40x64xf32>
+ %extracted_slice_84 = tensor.extract_slice %446[0, 0, 0, 64] [1, 32, 40, 64] [1, 1, 1, 1] : tensor<1x32x40x128xf32> to tensor<1x32x40x64xf32>
+ %468 = tosa.negate %extracted_slice_84 : (tensor<1x32x40x64xf32>) -> tensor<1x32x40x64xf32>
+ %469 = tensor.empty() : tensor<1x32x40x128xf32>
+ %inserted_slice_85 = tensor.insert_slice %468 into %469[0, 0, 0, 0] [1, 32, 40, 64] [1, 1, 1, 1] : tensor<1x32x40x64xf32> into tensor<1x32x40x128xf32>
+ %inserted_slice_86 = tensor.insert_slice %extracted_slice_83 into %inserted_slice_85[0, 0, 0, 64] [1, 32, 40, 64] [1, 1, 1, 1] : tensor<1x32x40x64xf32> into tensor<1x32x40x128xf32>
+ %470 = tosa.mul %inserted_slice_86, %466 {shift = 0 : i8} : (tensor<1x32x40x128xf32>, tensor<1x1x40x128xf32>) -> tensor<1x32x40x128xf32>
+ %471 = tosa.add %467, %470 : (tensor<1x32x40x128xf32>, tensor<1x32x40x128xf32>) -> tensor<1x32x40x128xf32>
+ %472 = tosa.mul %449, %463 {shift = 0 : i8} : (tensor<1x32x40x128xf32>, tensor<1x1x40x128xf32>) -> tensor<1x32x40x128xf32>
+ %extracted_slice_87 = tensor.extract_slice %449[0, 0, 0, 0] [1, 32, 40, 64] [1, 1, 1, 1] : tensor<1x32x40x128xf32> to tensor<1x32x40x64xf32>
+ %extracted_slice_88 = tensor.extract_slice %449[0, 0, 0, 64] [1, 32, 40, 64] [1, 1, 1, 1] : tensor<1x32x40x128xf32> to tensor<1x32x40x64xf32>
+ %473 = tosa.negate %extracted_slice_88 : (tensor<1x32x40x64xf32>) -> tensor<1x32x40x64xf32>
+ %474 = tensor.empty() : tensor<1x32x40x128xf32>
+ %inserted_slice_89 = tensor.insert_slice %473 into %474[0, 0, 0, 0] [1, 32, 40, 64] [1, 1, 1, 1] : tensor<1x32x40x64xf32> into tensor<1x32x40x128xf32>
+ %inserted_slice_90 = tensor.insert_slice %extracted_slice_87 into %inserted_slice_89[0, 0, 0, 64] [1, 32, 40, 64] [1, 1, 1, 1] : tensor<1x32x40x64xf32> into tensor<1x32x40x128xf32>
+ %475 = tosa.mul %inserted_slice_90, %466 {shift = 0 : i8} : (tensor<1x32x40x128xf32>, tensor<1x1x40x128xf32>) -> tensor<1x32x40x128xf32>
+ %476 = tosa.add %472, %475 : (tensor<1x32x40x128xf32>, tensor<1x32x40x128xf32>) -> tensor<1x32x40x128xf32>
+ %477 = "tosa.const"() <{value = dense<[0, 1, 3, 2]> : tensor<4xi32>}> : () -> tensor<4xi32>
+ %478 = tosa.transpose %476, %477 : (tensor<1x32x40x128xf32>, tensor<4xi32>) -> tensor<1x32x128x40xf32>
+ %479 = "tosa.const"() <{value = dense<0.000000e+00> : tensor<1x32x40x128xf32>}> : () -> tensor<1x32x40x128xf32>
+ %480 = tosa.add %471, %479 : (tensor<1x32x40x128xf32>, tensor<1x32x40x128xf32>) -> tensor<1x32x40x128xf32>
+ %481 = tosa.reshape %480 {new_shape = array} : (tensor<1x32x40x128xf32>) -> tensor<32x40x128xf32>
+ %482 = "tosa.const"() <{value = dense<0.000000e+00> : tensor<1x32x128x40xf32>}> : () -> tensor<1x32x128x40xf32>
+ %483 = tosa.add %478, %482 : (tensor<1x32x128x40xf32>, tensor<1x32x128x40xf32>) -> tensor<1x32x128x40xf32>
+ %484 = tosa.reshape %483 {new_shape = array} : (tensor<1x32x128x40xf32>) -> tensor<32x128x40xf32>
+ %485 = tosa.matmul %481, %484 : (tensor<32x40x128xf32>, tensor<32x128x40xf32>) -> tensor<32x40x40xf32>
+ %486 = tosa.reshape %485 {new_shape = array} : (tensor<32x40x40xf32>) -> tensor<1x32x40x40xf32>
+ %487 = "tosa.const"() <{value = dense<11.3137083> : tensor<1x32x40x40xf32>}> : () -> tensor<1x32x40x40xf32>
+ %488 = tosa.reciprocal %487 : (tensor<1x32x40x40xf32>) -> tensor<1x32x40x40xf32>
+ %489 = tosa.mul %486, %488 {shift = 0 : i8} : (tensor<1x32x40x40xf32>, tensor<1x32x40x40xf32>) -> tensor<1x32x40x40xf32>
+ %490 = tosa.add %489, %29 : (tensor<1x32x40x40xf32>, tensor<1x1x40x40xf32>) -> tensor<1x32x40x40xf32>
+ %491 = tosa.reduce_max %490 {axis = 3 : i32} : (tensor<1x32x40x40xf32>) -> tensor<1x32x40x1xf32>
+ %492 = tosa.sub %490, %491 : (tensor<1x32x40x40xf32>, tensor<1x32x40x1xf32>) -> tensor<1x32x40x40xf32>
+ %493 = tosa.exp %492 : (tensor<1x32x40x40xf32>) -> tensor<1x32x40x40xf32>
+ %494 = tosa.reduce_sum %493 {axis = 3 : i32} : (tensor<1x32x40x40xf32>) -> tensor<1x32x40x1xf32>
+ %495 = tosa.reciprocal %494 : (tensor<1x32x40x1xf32>) -> tensor<1x32x40x1xf32>
+ %496 = tosa.mul %493, %495 {shift = 0 : i8} : (tensor<1x32x40x40xf32>, tensor<1x32x40x1xf32>) -> tensor<1x32x40x40xf32>
+ %497 = "tosa.const"() <{value = dense<0.000000e+00> : tensor<1x32x40x40xf32>}> : () -> tensor<1x32x40x40xf32>
+ %498 = tosa.add %496, %497 : (tensor<1x32x40x40xf32>, tensor<1x32x40x40xf32>) -> tensor<1x32x40x40xf32>
+ %499 = tosa.reshape %498 {new_shape = array} : (tensor<1x32x40x40xf32>) -> tensor<32x40x40xf32>
+ %500 = "tosa.const"() <{value = dense<0.000000e+00> : tensor<1x32x40x128xf32>}> : () -> tensor<1x32x40x128xf32>
+ %501 = tosa.add %452, %500 : (tensor<1x32x40x128xf32>, tensor<1x32x40x128xf32>) -> tensor<1x32x40x128xf32>
+ %502 = tosa.reshape %501 {new_shape = array} : (tensor<1x32x40x128xf32>) -> tensor<32x40x128xf32>
+ %503 = tosa.matmul %499, %502 : (tensor<32x40x40xf32>, tensor<32x40x128xf32>) -> tensor<32x40x128xf32>
+ %504 = tosa.reshape %503 {new_shape = array} : (tensor<32x40x128xf32>) -> tensor<1x32x40x128xf32>
+ %505 = "tosa.const"() <{value = dense<[0, 2, 1, 3]> : tensor<4xi32>}> : () -> tensor<4xi32>
+ %506 = tosa.transpose %504, %505 : (tensor<1x32x40x128xf32>, tensor<4xi32>) -> tensor<1x40x32x128xf32>
+ %507 = tosa.identity %506 : (tensor<1x40x32x128xf32>) -> tensor<1x40x32x128xf32>
+ %508 = tosa.reshape %507 {new_shape = array} : (tensor<1x40x32x128xf32>) -> tensor<1x40x4096xf32>
+ %509 = "tosa.const"() <{value = dense<[1, 0]> : tensor<2xi32>}> : () -> tensor<2xi32>
+ %510 = tosa.transpose %arg41, %509 : (tensor<4096x4096xf32>, tensor<2xi32>) -> tensor<4096x4096xf32>
+ %511 = tosa.reshape %508 {new_shape = array} : (tensor<1x40x4096xf32>) -> tensor<40x4096xf32>
+ %cst_91 = arith.constant dense<0.000000e+00> : tensor<40x4096xf32>
+ %512 = linalg.matmul {cast = #linalg.type_fn} ins(%511, %510 : tensor<40x4096xf32>, tensor<4096x4096xf32>) outs(%cst_91 : tensor<40x4096xf32>) -> tensor<40x4096xf32>
+ %513 = tosa.reshape %512 {new_shape = array} : (tensor<40x4096xf32>) -> tensor<1x40x4096xf32>
+ %514 = tosa.add %416, %513 : (tensor<1x40x4096xf32>, tensor<1x40x4096xf32>) -> tensor<1x40x4096xf32>
+ %515 = tensor.empty() : tensor<1x40x4096xf32>
+ %c2_i32_92 = arith.constant 2 : i32
+ %516 = linalg.generic {indexing_maps = [#map5, #map5], iterator_types = ["parallel", "parallel", "parallel"]} ins(%514 : tensor<1x40x4096xf32>) outs(%515 : tensor<1x40x4096xf32>) {
+ ^bb0(%in: f32, %out: f32):
+ %4175 = math.fpowi %in, %c2_i32_92 : f32, i32
+ linalg.yield %4175 : f32
+ } -> tensor<1x40x4096xf32>
+ %517 = tosa.reduce_sum %516 {axis = 2 : i32} : (tensor<1x40x4096xf32>) -> tensor<1x40x1xf32>
+ %518 = "tosa.const"() <{value = dense<4.096000e+03> : tensor<1xf32>}> : () -> tensor<1xf32>
+ %519 = tosa.reciprocal %518 : (tensor<1xf32>) -> tensor<1xf32>
+ %520 = tosa.mul %519, %517 {shift = 0 : i8} : (tensor<1xf32>, tensor<1x40x1xf32>) -> tensor<1x40x1xf32>
+ %521 = "tosa.const"() <{value = dense<9.99999974E-6> : tensor<1x40x1xf32>}> : () -> tensor<1x40x1xf32>
+ %522 = tosa.add %520, %521 : (tensor<1x40x1xf32>, tensor<1x40x1xf32>) -> tensor<1x40x1xf32>
+ %523 = tosa.rsqrt %522 : (tensor<1x40x1xf32>) -> tensor<1x40x1xf32>
+ %524 = tosa.mul %514, %523 {shift = 0 : i8} : (tensor<1x40x4096xf32>, tensor<1x40x1xf32>) -> tensor<1x40x4096xf32>
+ %525 = tosa.reshape %arg42 {new_shape = array} : (tensor<4096xf32>) -> tensor<1x1x4096xf32>
+ %526 = tosa.mul %525, %524 {shift = 0 : i8} : (tensor<1x1x4096xf32>, tensor<1x40x4096xf32>) -> tensor<1x40x4096xf32>
+ %527 = "tosa.const"() <{value = dense<[1, 0]> : tensor<2xi32>}> : () -> tensor<2xi32>
+ %528 = tosa.transpose %arg43, %527 : (tensor<11008x4096xf32>, tensor<2xi32>) -> tensor<4096x11008xf32>
+ %529 = tosa.reshape %526 {new_shape = array} : (tensor<1x40x4096xf32>) -> tensor<40x4096xf32>
+ %cst_93 = arith.constant dense<0.000000e+00> : tensor<40x11008xf32>
+ %530 = linalg.matmul {cast = #linalg.type_fn} ins(%529, %528 : tensor<40x4096xf32>, tensor<4096x11008xf32>) outs(%cst_93 : tensor<40x11008xf32>) -> tensor<40x11008xf32>
+ %531 = tosa.reshape %530 {new_shape = array} : (tensor<40x11008xf32>) -> tensor<1x40x11008xf32>
+ %532 = tosa.sigmoid %531 : (tensor<1x40x11008xf32>) -> tensor<1x40x11008xf32>
+ %533 = tosa.mul %531, %532 {shift = 0 : i8} : (tensor<1x40x11008xf32>, tensor<1x40x11008xf32>) -> tensor<1x40x11008xf32>
+ %534 = "tosa.const"() <{value = dense<[1, 0]> : tensor<2xi32>}> : () -> tensor<2xi32>
+ %535 = tosa.transpose %arg44, %534 : (tensor<11008x4096xf32>, tensor<2xi32>) -> tensor<4096x11008xf32>
+ %536 = tosa.reshape %526 {new_shape = array} : (tensor<1x40x4096xf32>) -> tensor<40x4096xf32>
+ %cst_94 = arith.constant dense<0.000000e+00> : tensor<40x11008xf32>
+ %537 = linalg.matmul {cast = #linalg.type_fn} ins(%536, %535 : tensor<40x4096xf32>, tensor<4096x11008xf32>) outs(%cst_94 : tensor<40x11008xf32>) -> tensor<40x11008xf32>
+ %538 = tosa.reshape %537 {new_shape = array} : (tensor<40x11008xf32>) -> tensor<1x40x11008xf32>
+ %539 = tosa.mul %533, %538 {shift = 0 : i8} : (tensor<1x40x11008xf32>, tensor<1x40x11008xf32>) -> tensor<1x40x11008xf32>
+ %540 = "tosa.const"() <{value = dense<[1, 0]> : tensor<2xi32>}> : () -> tensor<2xi32>
+ %541 = tosa.transpose %arg45, %540 : (tensor<4096x11008xf32>, tensor<2xi32>) -> tensor<11008x4096xf32>
+ %542 = tosa.reshape %539 {new_shape = array} : (tensor<1x40x11008xf32>) -> tensor<40x11008xf32>
+ %cst_95 = arith.constant dense<0.000000e+00> : tensor<40x4096xf32>
+ %543 = linalg.matmul {cast = #linalg.type_fn} ins(%542, %541 : tensor<40x11008xf32>, tensor<11008x4096xf32>) outs(%cst_95 : tensor<40x4096xf32>) -> tensor<40x4096xf32>
+ %544 = tosa.reshape %543 {new_shape = array} : (tensor<40x4096xf32>) -> tensor<1x40x4096xf32>
+ %545 = tosa.add %514, %544 : (tensor<1x40x4096xf32>, tensor<1x40x4096xf32>) -> tensor<1x40x4096xf32>
+ %546 = tensor.empty() : tensor<1x40x4096xf32>
+ %c2_i32_96 = arith.constant 2 : i32
+ %547 = linalg.generic {indexing_maps = [#map5, #map5], iterator_types = ["parallel", "parallel", "parallel"]} ins(%545 : tensor<1x40x4096xf32>) outs(%546 : tensor<1x40x4096xf32>) {
+ ^bb0(%in: f32, %out: f32):
+ %4175 = math.fpowi %in, %c2_i32_96 : f32, i32
+ linalg.yield %4175 : f32
+ } -> tensor<1x40x4096xf32>
+ %548 = tosa.reduce_sum %547 {axis = 2 : i32} : (tensor<1x40x4096xf32>) -> tensor<1x40x1xf32>
+ %549 = "tosa.const"() <{value = dense<4.096000e+03> : tensor<1xf32>}> : () -> tensor<1xf32>
+ %550 = tosa.reciprocal %549 : (tensor<1xf32>) -> tensor<1xf32>
+ %551 = tosa.mul %550, %548 {shift = 0 : i8} : (tensor<1xf32>, tensor<1x40x1xf32>) -> tensor<1x40x1xf32>
+ %552 = "tosa.const"() <{value = dense<9.99999974E-6> : tensor<1x40x1xf32>}> : () -> tensor<1x40x1xf32>
+ %553 = tosa.add %551, %552 : (tensor<1x40x1xf32>, tensor<1x40x1xf32>) -> tensor<1x40x1xf32>
+ %554 = tosa.rsqrt %553 : (tensor<1x40x1xf32>) -> tensor<1x40x1xf32>
+ %555 = tosa.mul %545, %554 {shift = 0 : i8} : (tensor<1x40x4096xf32>, tensor<1x40x1xf32>) -> tensor<1x40x4096xf32>
+ %556 = tosa.reshape %arg46 {new_shape = array} : (tensor<4096xf32>) -> tensor<1x1x4096xf32>
+ %557 = tosa.mul %556, %555 {shift = 0 : i8} : (tensor<1x1x4096xf32>, tensor<1x40x4096xf32>) -> tensor<1x40x4096xf32>
+ %558 = "tosa.const"() <{value = dense<[1, 0]> : tensor<2xi32>}> : () -> tensor<2xi32>
+ %559 = tosa.transpose %arg47, %558 : (tensor<4096x4096xf32>, tensor<2xi32>) -> tensor<4096x4096xf32>
+ %560 = tosa.reshape %557 {new_shape = array} : (tensor<1x40x4096xf32>) -> tensor<40x4096xf32>
+ %cst_97 = arith.constant dense<0.000000e+00> : tensor<40x4096xf32>
+ %561 = linalg.matmul {cast = #linalg.type_fn} ins(%560, %559 : tensor<40x4096xf32>, tensor<4096x4096xf32>) outs(%cst_97 : tensor<40x4096xf32>) -> tensor<40x4096xf32>
+ %562 = tosa.reshape %561 {new_shape = array} : (tensor<40x4096xf32>) -> tensor<1x40x4096xf32>
+ %563 = "tosa.const"() <{value = dense<[1, 0]> : tensor<2xi32>}> : () -> tensor<2xi32>
+ %564 = tosa.transpose %arg48, %563 : (tensor<4096x4096xf32>, tensor<2xi32>) -> tensor<4096x4096xf32>
+ %565 = tosa.reshape %557 {new_shape = array} : (tensor<1x40x4096xf32>) -> tensor<40x4096xf32>
+ %cst_98 = arith.constant dense<0.000000e+00> : tensor<40x4096xf32>
+ %566 = linalg.matmul {cast = #linalg.type_fn} ins(%565, %564 : tensor<40x4096xf32>, tensor<4096x4096xf32>) outs(%cst_98 : tensor<40x4096xf32>) -> tensor<40x4096xf32>
+ %567 = tosa.reshape %566 {new_shape = array} : (tensor<40x4096xf32>) -> tensor<1x40x4096xf32>
+ %568 = "tosa.const"() <{value = dense<[1, 0]> : tensor<2xi32>}> : () -> tensor<2xi32>
+ %569 = tosa.transpose %arg49, %568 : (tensor<4096x4096xf32>, tensor<2xi32>) -> tensor<4096x4096xf32>
+ %570 = tosa.reshape %557 {new_shape = array} : (tensor<1x40x4096xf32>) -> tensor<40x4096xf32>
+ %cst_99 = arith.constant dense<0.000000e+00> : tensor<40x4096xf32>
+ %571 = linalg.matmul {cast = #linalg.type_fn} ins(%570, %569 : tensor<40x4096xf32>, tensor<4096x4096xf32>) outs(%cst_99 : tensor<40x4096xf32>) -> tensor<40x4096xf32>
+ %572 = tosa.reshape %571 {new_shape = array} : (tensor<40x4096xf32>) -> tensor<1x40x4096xf32>
+ %573 = tosa.reshape %562 {new_shape = array} : (tensor<1x40x4096xf32>) -> tensor<1x40x32x128xf32>
+ %574 = "tosa.const"() <{value = dense<[0, 2, 1, 3]> : tensor<4xi32>}> : () -> tensor<4xi32>
+ %575 = tosa.transpose %573, %574 : (tensor<1x40x32x128xf32>, tensor<4xi32>) -> tensor<1x32x40x128xf32>
+ %576 = tosa.reshape %567 {new_shape = array} : (tensor<1x40x4096xf32>) -> tensor<1x40x32x128xf32>
+ %577 = "tosa.const"() <{value = dense<[0, 2, 1, 3]> : tensor<4xi32>}> : () -> tensor<4xi32>
+ %578 = tosa.transpose %576, %577 : (tensor<1x40x32x128xf32>, tensor<4xi32>) -> tensor<1x32x40x128xf32>
+ %579 = tosa.reshape %572 {new_shape = array} : (tensor<1x40x4096xf32>) -> tensor<1x40x32x128xf32>
+ %580 = "tosa.const"() <{value = dense<[0, 2, 1, 3]> : tensor<4xi32>}> : () -> tensor<4xi32>
+ %581 = tosa.transpose %579, %580 : (tensor<1x40x32x128xf32>, tensor<4xi32>) -> tensor<1x32x40x128xf32>
+ %extracted_slice_100 = tensor.extract_slice %arg50[0, 0, 0, 0] [1, 1, 2048, 128] [1, 1, 1, 1] : tensor<1x1x2048x128xf32> to tensor<1x1x2048x128xf32>
+ %extracted_slice_101 = tensor.extract_slice %extracted_slice_100[0, 0, 0, 0] [1, 1, 2048, 128] [1, 1, 1, 1] : tensor<1x1x2048x128xf32> to tensor<1x1x2048x128xf32>
+ %extracted_slice_102 = tensor.extract_slice %extracted_slice_101[0, 0, 0, 0] [1, 1, 40, 128] [1, 1, 1, 1] : tensor<1x1x2048x128xf32> to tensor<1x1x40x128xf32>
+ %extracted_slice_103 = tensor.extract_slice %arg51[0, 0, 0, 0] [1, 1, 2048, 128] [1, 1, 1, 1] : tensor<1x1x2048x128xf32> to tensor<1x1x2048x128xf32>
+ %extracted_slice_104 = tensor.extract_slice %extracted_slice_103[0, 0, 0, 0] [1, 1, 2048, 128] [1, 1, 1, 1] : tensor<1x1x2048x128xf32> to tensor<1x1x2048x128xf32>
+ %extracted_slice_105 = tensor.extract_slice %extracted_slice_104[0, 0, 0, 0] [1, 1, 40, 128] [1, 1, 1, 1] : tensor<1x1x2048x128xf32> to tensor<1x1x40x128xf32>
+ %582 = tensor.empty() : tensor<1x40x128xf32>
+ %583 = linalg.generic {indexing_maps = [#map6, #map5], iterator_types = ["parallel", "parallel", "parallel"]} ins(%extracted_slice_102 : tensor<1x1x40x128xf32>) outs(%582 : tensor<1x40x128xf32>) {
+ ^bb0(%in: f32, %out: f32):
+ linalg.yield %in : f32
+ } -> tensor<1x40x128xf32>
+ %584 = tensor.empty() : tensor<40x128xf32>
+ %585 = linalg.generic {indexing_maps = [#map7, #map3], iterator_types = ["parallel", "parallel"]} ins(%583 : tensor<1x40x128xf32>) outs(%584 : tensor<40x128xf32>) {
+ ^bb0(%in: f32, %out: f32):
+ linalg.yield %in : f32
+ } -> tensor<40x128xf32>
+ %586 = tensor.empty() : tensor<1x40x128xf32>
+ %587 = linalg.generic {indexing_maps = [#map6, #map5], iterator_types = ["parallel", "parallel", "parallel"]} ins(%extracted_slice_105 : tensor<1x1x40x128xf32>) outs(%586 : tensor<1x40x128xf32>) {
+ ^bb0(%in: f32, %out: f32):
+ linalg.yield %in : f32
+ } -> tensor<1x40x128xf32>
+ %588 = tensor.empty() : tensor<40x128xf32>
+ %589 = linalg.generic {indexing_maps = [#map7, #map3], iterator_types = ["parallel", "parallel"]} ins(%587 : tensor<1x40x128xf32>) outs(%588 : tensor<40x128xf32>) {
+ ^bb0(%in: f32, %out: f32):
+ linalg.yield %in : f32
+ } -> tensor<40x128xf32>
+ %590 = tensor.empty() : tensor<1x40x128xf32>
+ %591 = linalg.generic {indexing_maps = [#map2, #map5], iterator_types = ["parallel", "parallel", "parallel"]} ins(%2 : tensor<1x40xi64>) outs(%590 : tensor<1x40x128xf32>) {
+ ^bb0(%in: i64, %out: f32):
+ %4175 = arith.index_cast %in : i64 to index
+ %4176 = linalg.index 2 : index
+ %extracted = tensor.extract %585[%4175, %4176] : tensor<40x128xf32>
+ linalg.yield %extracted : f32
+ } -> tensor<1x40x128xf32>
+ %592 = tosa.reshape %591 {new_shape = array} : (tensor<1x40x128xf32>) -> tensor<1x1x40x128xf32>
+ %593 = tensor.empty() : tensor<1x40x128xf32>
+ %594 = linalg.generic {indexing_maps = [#map2, #map5], iterator_types = ["parallel", "parallel", "parallel"]} ins(%2 : tensor<1x40xi64>) outs(%593 : tensor<1x40x128xf32>) {
+ ^bb0(%in: i64, %out: f32):
+ %4175 = arith.index_cast %in : i64 to index
+ %4176 = linalg.index 2 : index
+ %extracted = tensor.extract %589[%4175, %4176] : tensor<40x128xf32>
+ linalg.yield %extracted : f32
+ } -> tensor<1x40x128xf32>
+ %595 = tosa.reshape %594 {new_shape = array} : (tensor<1x40x128xf32>) -> tensor<1x1x40x128xf32>
+ %596 = tosa.mul %575, %592 {shift = 0 : i8} : (tensor<1x32x40x128xf32>, tensor<1x1x40x128xf32>) -> tensor<1x32x40x128xf32>
+ %extracted_slice_106 = tensor.extract_slice %575[0, 0, 0, 0] [1, 32, 40, 64] [1, 1, 1, 1] : tensor<1x32x40x128xf32> to tensor<1x32x40x64xf32>
+ %extracted_slice_107 = tensor.extract_slice %575[0, 0, 0, 64] [1, 32, 40, 64] [1, 1, 1, 1] : tensor<1x32x40x128xf32> to tensor<1x32x40x64xf32>
+ %597 = tosa.negate %extracted_slice_107 : (tensor<1x32x40x64xf32>) -> tensor<1x32x40x64xf32>
+ %598 = tensor.empty() : tensor<1x32x40x128xf32>
+ %inserted_slice_108 = tensor.insert_slice %597 into %598[0, 0, 0, 0] [1, 32, 40, 64] [1, 1, 1, 1] : tensor<1x32x40x64xf32> into tensor<1x32x40x128xf32>
+ %inserted_slice_109 = tensor.insert_slice %extracted_slice_106 into %inserted_slice_108[0, 0, 0, 64] [1, 32, 40, 64] [1, 1, 1, 1] : tensor<1x32x40x64xf32> into tensor<1x32x40x128xf32>
+ %599 = tosa.mul %inserted_slice_109, %595 {shift = 0 : i8} : (tensor<1x32x40x128xf32>, tensor<1x1x40x128xf32>) -> tensor<1x32x40x128xf32>
+ %600 = tosa.add %596, %599 : (tensor<1x32x40x128xf32>, tensor<1x32x40x128xf32>) -> tensor<1x32x40x128xf32>
+ %601 = tosa.mul %578, %592 {shift = 0 : i8} : (tensor<1x32x40x128xf32>, tensor<1x1x40x128xf32>) -> tensor<1x32x40x128xf32>
+ %extracted_slice_110 = tensor.extract_slice %578[0, 0, 0, 0] [1, 32, 40, 64] [1, 1, 1, 1] : tensor<1x32x40x128xf32> to tensor<1x32x40x64xf32>
+ %extracted_slice_111 = tensor.extract_slice %578[0, 0, 0, 64] [1, 32, 40, 64] [1, 1, 1, 1] : tensor<1x32x40x128xf32> to tensor<1x32x40x64xf32>
+ %602 = tosa.negate %extracted_slice_111 : (tensor<1x32x40x64xf32>) -> tensor<1x32x40x64xf32>
+ %603 = tensor.empty() : tensor<1x32x40x128xf32>
+ %inserted_slice_112 = tensor.insert_slice %602 into %603[0, 0, 0, 0] [1, 32, 40, 64] [1, 1, 1, 1] : tensor<1x32x40x64xf32> into tensor<1x32x40x128xf32>
+ %inserted_slice_113 = tensor.insert_slice %extracted_slice_110 into %inserted_slice_112[0, 0, 0, 64] [1, 32, 40, 64] [1, 1, 1, 1] : tensor<1x32x40x64xf32> into tensor<1x32x40x128xf32>
+ %604 = tosa.mul %inserted_slice_113, %595 {shift = 0 : i8} : (tensor<1x32x40x128xf32>, tensor<1x1x40x128xf32>) -> tensor<1x32x40x128xf32>
+ %605 = tosa.add %601, %604 : (tensor<1x32x40x128xf32>, tensor<1x32x40x128xf32>) -> tensor<1x32x40x128xf32>
+ %606 = "tosa.const"() <{value = dense<[0, 1, 3, 2]> : tensor<4xi32>}> : () -> tensor<4xi32>
+ %607 = tosa.transpose %605, %606 : (tensor<1x32x40x128xf32>, tensor<4xi32>) -> tensor<1x32x128x40xf32>
+ %608 = "tosa.const"() <{value = dense<0.000000e+00> : tensor<1x32x40x128xf32>}> : () -> tensor<1x32x40x128xf32>
+ %609 = tosa.add %600, %608 : (tensor<1x32x40x128xf32>, tensor<1x32x40x128xf32>) -> tensor<1x32x40x128xf32>
+ %610 = tosa.reshape %609 {new_shape = array} : (tensor<1x32x40x128xf32>) -> tensor<32x40x128xf32>
+ %611 = "tosa.const"() <{value = dense<0.000000e+00> : tensor<1x32x128x40xf32>}> : () -> tensor<1x32x128x40xf32>
+ %612 = tosa.add %607, %611 : (tensor<1x32x128x40xf32>, tensor<1x32x128x40xf32>) -> tensor<1x32x128x40xf32>
+ %613 = tosa.reshape %612 {new_shape = array} : (tensor<1x32x128x40xf32>) -> tensor<32x128x40xf32>
+ %614 = tosa.matmul %610, %613 : (tensor<32x40x128xf32>, tensor<32x128x40xf32>) -> tensor<32x40x40xf32>
+ %615 = tosa.reshape %614 {new_shape = array} : (tensor<32x40x40xf32>) -> tensor<1x32x40x40xf32>
+ %616 = "tosa.const"() <{value = dense<11.3137083> : tensor<1x32x40x40xf32>}> : () -> tensor<1x32x40x40xf32>
+ %617 = tosa.reciprocal %616 : (tensor<1x32x40x40xf32>) -> tensor<1x32x40x40xf32>
+ %618 = tosa.mul %615, %617 {shift = 0 : i8} : (tensor<1x32x40x40xf32>, tensor<1x32x40x40xf32>) -> tensor<1x32x40x40xf32>
+ %619 = tosa.add %618, %29 : (tensor<1x32x40x40xf32>, tensor<1x1x40x40xf32>) -> tensor<1x32x40x40xf32>
+ %620 = tosa.reduce_max %619 {axis = 3 : i32} : (tensor<1x32x40x40xf32>) -> tensor<1x32x40x1xf32>
+ %621 = tosa.sub %619, %620 : (tensor<1x32x40x40xf32>, tensor<1x32x40x1xf32>) -> tensor<1x32x40x40xf32>
+ %622 = tosa.exp %621 : (tensor<1x32x40x40xf32>) -> tensor<1x32x40x40xf32>
+ %623 = tosa.reduce_sum %622 {axis = 3 : i32} : (tensor<1x32x40x40xf32>) -> tensor<1x32x40x1xf32>
+ %624 = tosa.reciprocal %623 : (tensor<1x32x40x1xf32>) -> tensor<1x32x40x1xf32>
+ %625 = tosa.mul %622, %624 {shift = 0 : i8} : (tensor<1x32x40x40xf32>, tensor<1x32x40x1xf32>) -> tensor<1x32x40x40xf32>
+ %626 = "tosa.const"() <{value = dense<0.000000e+00> : tensor<1x32x40x40xf32>}> : () -> tensor<1x32x40x40xf32>
+ %627 = tosa.add %625, %626 : (tensor<1x32x40x40xf32>, tensor<1x32x40x40xf32>) -> tensor<1x32x40x40xf32>
+ %628 = tosa.reshape %627 {new_shape = array} : (tensor<1x32x40x40xf32>) -> tensor<32x40x40xf32>
+ %629 = "tosa.const"() <{value = dense<0.000000e+00> : tensor<1x32x40x128xf32>}> : () -> tensor<1x32x40x128xf32>
+ %630 = tosa.add %581, %629 : (tensor<1x32x40x128xf32>, tensor<1x32x40x128xf32>) -> tensor<1x32x40x128xf32>
+ %631 = tosa.reshape %630 {new_shape = array} : (tensor<1x32x40x128xf32>) -> tensor<32x40x128xf32>
+ %632 = tosa.matmul %628, %631 : (tensor<32x40x40xf32>, tensor<32x40x128xf32>) -> tensor<32x40x128xf32>
+ %633 = tosa.reshape %632 {new_shape = array} : (tensor<32x40x128xf32>) -> tensor<1x32x40x128xf32>
+ %634 = "tosa.const"() <{value = dense<[0, 2, 1, 3]> : tensor<4xi32>}> : () -> tensor<4xi32>
+ %635 = tosa.transpose %633, %634 : (tensor<1x32x40x128xf32>, tensor<4xi32>) -> tensor<1x40x32x128xf32>
+ %636 = tosa.identity %635 : (tensor<1x40x32x128xf32>) -> tensor<1x40x32x128xf32>
+ %637 = tosa.reshape %636 {new_shape = array} : (tensor<1x40x32x128xf32>) -> tensor<1x40x4096xf32>
+ %638 = "tosa.const"() <{value = dense<[1, 0]> : tensor<2xi32>}> : () -> tensor<2xi32>
+ %639 = tosa.transpose %arg52, %638 : (tensor<4096x4096xf32>, tensor<2xi32>) -> tensor<4096x4096xf32>
+ %640 = tosa.reshape %637 {new_shape = array} : (tensor<1x40x4096xf32>) -> tensor<40x4096xf32>
+ %cst_114 = arith.constant dense<0.000000e+00> : tensor<40x4096xf32>
+ %641 = linalg.matmul {cast = #linalg.type_fn} ins(%640, %639 : tensor<40x4096xf32>, tensor<4096x4096xf32>) outs(%cst_114 : tensor<40x4096xf32>) -> tensor<40x4096xf32>
+ %642 = tosa.reshape %641 {new_shape = array} : (tensor<40x4096xf32>) -> tensor<1x40x4096xf32>
+ %643 = tosa.add %545, %642 : (tensor<1x40x4096xf32>, tensor<1x40x4096xf32>) -> tensor<1x40x4096xf32>
+ %644 = tensor.empty() : tensor<1x40x4096xf32>
+ %c2_i32_115 = arith.constant 2 : i32
+ %645 = linalg.generic {indexing_maps = [#map5, #map5], iterator_types = ["parallel", "parallel", "parallel"]} ins(%643 : tensor<1x40x4096xf32>) outs(%644 : tensor<1x40x4096xf32>) {
+ ^bb0(%in: f32, %out: f32):
+ %4175 = math.fpowi %in, %c2_i32_115 : f32, i32
+ linalg.yield %4175 : f32
+ } -> tensor<1x40x4096xf32>
+ %646 = tosa.reduce_sum %645 {axis = 2 : i32} : (tensor<1x40x4096xf32>) -> tensor<1x40x1xf32>
+ %647 = "tosa.const"() <{value = dense<4.096000e+03> : tensor<1xf32>}> : () -> tensor<1xf32>
+ %648 = tosa.reciprocal %647 : (tensor<1xf32>) -> tensor<1xf32>
+ %649 = tosa.mul %648, %646 {shift = 0 : i8} : (tensor<1xf32>, tensor<1x40x1xf32>) -> tensor<1x40x1xf32>
+ %650 = "tosa.const"() <{value = dense<9.99999974E-6> : tensor<1x40x1xf32>}> : () -> tensor<1x40x1xf32>
+ %651 = tosa.add %649, %650 : (tensor<1x40x1xf32>, tensor<1x40x1xf32>) -> tensor<1x40x1xf32>
+ %652 = tosa.rsqrt %651 : (tensor<1x40x1xf32>) -> tensor<1x40x1xf32>
+ %653 = tosa.mul %643, %652 {shift = 0 : i8} : (tensor<1x40x4096xf32>, tensor<1x40x1xf32>) -> tensor<1x40x4096xf32>
+ %654 = tosa.reshape %arg53 {new_shape = array} : (tensor<4096xf32>) -> tensor<1x1x4096xf32>
+ %655 = tosa.mul %654, %653 {shift = 0 : i8} : (tensor<1x1x4096xf32>, tensor<1x40x4096xf32>) -> tensor<1x40x4096xf32>
+ %656 = "tosa.const"() <{value = dense<[1, 0]> : tensor<2xi32>}> : () -> tensor<2xi32>
+ %657 = tosa.transpose %arg54, %656 : (tensor<11008x4096xf32>, tensor<2xi32>) -> tensor<4096x11008xf32>
+ %658 = tosa.reshape %655 {new_shape = array} : (tensor<1x40x4096xf32>) -> tensor<40x4096xf32>
+ %cst_116 = arith.constant dense<0.000000e+00> : tensor<40x11008xf32>
+ %659 = linalg.matmul {cast = #linalg.type_fn} ins(%658, %657 : tensor<40x4096xf32>, tensor<4096x11008xf32>) outs(%cst_116 : tensor<40x11008xf32>) -> tensor<40x11008xf32>
+ %660 = tosa.reshape %659 {new_shape = array} : (tensor<40x11008xf32>) -> tensor<1x40x11008xf32>
+ %661 = tosa.sigmoid %660 : (tensor<1x40x11008xf32>) -> tensor<1x40x11008xf32>
+ %662 = tosa.mul %660, %661 {shift = 0 : i8} : (tensor<1x40x11008xf32>, tensor<1x40x11008xf32>) -> tensor<1x40x11008xf32>
+ %663 = "tosa.const"() <{value = dense<[1, 0]> : tensor<2xi32>}> : () -> tensor<2xi32>
+ %664 = tosa.transpose %arg55, %663 : (tensor<11008x4096xf32>, tensor<2xi32>) -> tensor<4096x11008xf32>
+ %665 = tosa.reshape %655 {new_shape = array} : (tensor<1x40x4096xf32>) -> tensor<40x4096xf32>
+ %cst_117 = arith.constant dense<0.000000e+00> : tensor<40x11008xf32>
+ %666 = linalg.matmul {cast = #linalg.type_fn} ins(%665, %664 : tensor<40x4096xf32>, tensor<4096x11008xf32>) outs(%cst_117 : tensor<40x11008xf32>) -> tensor<40x11008xf32>
+ %667 = tosa.reshape %666 {new_shape = array} : (tensor<40x11008xf32>) -> tensor<1x40x11008xf32>
+ %668 = tosa.mul %662, %667 {shift = 0 : i8} : (tensor<1x40x11008xf32>, tensor<1x40x11008xf32>) -> tensor<1x40x11008xf32>
+ %669 = "tosa.const"() <{value = dense<[1, 0]> : tensor<2xi32>}> : () -> tensor<2xi32>
+ %670 = tosa.transpose %arg56, %669 : (tensor<4096x11008xf32>, tensor<2xi32>) -> tensor<11008x4096xf32>
+ %671 = tosa.reshape %668 {new_shape = array} : (tensor<1x40x11008xf32>) -> tensor<40x11008xf32>
+ %cst_118 = arith.constant dense<0.000000e+00> : tensor<40x4096xf32>
+ %672 = linalg.matmul {cast = #linalg.type_fn} ins(%671, %670 : tensor<40x11008xf32>, tensor<11008x4096xf32>) outs(%cst_118 : tensor<40x4096xf32>) -> tensor<40x4096xf32>
+ %673 = tosa.reshape %672 {new_shape = array} : (tensor<40x4096xf32>) -> tensor<1x40x4096xf32>
+ %674 = tosa.add %643, %673 : (tensor<1x40x4096xf32>, tensor<1x40x4096xf32>) -> tensor<1x40x4096xf32>
+ %675 = tensor.empty() : tensor<1x40x4096xf32>
+ %c2_i32_119 = arith.constant 2 : i32
+ %676 = linalg.generic {indexing_maps = [#map5, #map5], iterator_types = ["parallel", "parallel", "parallel"]} ins(%674 : tensor<1x40x4096xf32>) outs(%675 : tensor<1x40x4096xf32>) {
+ ^bb0(%in: f32, %out: f32):
+ %4175 = math.fpowi %in, %c2_i32_119 : f32, i32
+ linalg.yield %4175 : f32
+ } -> tensor<1x40x4096xf32>
+ %677 = tosa.reduce_sum %676 {axis = 2 : i32} : (tensor<1x40x4096xf32>) -> tensor<1x40x1xf32>
+ %678 = "tosa.const"() <{value = dense<4.096000e+03> : tensor<1xf32>}> : () -> tensor<1xf32>
+ %679 = tosa.reciprocal %678 : (tensor<1xf32>) -> tensor<1xf32>
+ %680 = tosa.mul %679, %677 {shift = 0 : i8} : (tensor<1xf32>, tensor<1x40x1xf32>) -> tensor<1x40x1xf32>
+ %681 = "tosa.const"() <{value = dense<9.99999974E-6> : tensor<1x40x1xf32>}> : () -> tensor<1x40x1xf32>
+ %682 = tosa.add %680, %681 : (tensor<1x40x1xf32>, tensor<1x40x1xf32>) -> tensor<1x40x1xf32>
+ %683 = tosa.rsqrt %682 : (tensor<1x40x1xf32>) -> tensor<1x40x1xf32>
+ %684 = tosa.mul %674, %683 {shift = 0 : i8} : (tensor<1x40x4096xf32>, tensor<1x40x1xf32>) -> tensor<1x40x4096xf32>
+ %685 = tosa.reshape %arg57 {new_shape = array} : (tensor<4096xf32>) -> tensor<1x1x4096xf32>
+ %686 = tosa.mul %685, %684 {shift = 0 : i8} : (tensor<1x1x4096xf32>, tensor<1x40x4096xf32>) -> tensor<1x40x4096xf32>
+ %687 = "tosa.const"() <{value = dense<[1, 0]> : tensor<2xi32>}> : () -> tensor<2xi32>
+ %688 = tosa.transpose %arg58, %687 : (tensor<4096x4096xf32>, tensor<2xi32>) -> tensor<4096x4096xf32>
+ %689 = tosa.reshape %686 {new_shape = array} : (tensor<1x40x4096xf32>) -> tensor<40x4096xf32>
+ %cst_120 = arith.constant dense<0.000000e+00> : tensor<40x4096xf32>
+ %690 = linalg.matmul {cast = #linalg.type_fn} ins(%689, %688 : tensor<40x4096xf32>, tensor<4096x4096xf32>) outs(%cst_120 : tensor<40x4096xf32>) -> tensor<40x4096xf32>
+ %691 = tosa.reshape %690 {new_shape = array} : (tensor<40x4096xf32>) -> tensor<1x40x4096xf32>
+ %692 = "tosa.const"() <{value = dense<[1, 0]> : tensor<2xi32>}> : () -> tensor<2xi32>
+ %693 = tosa.transpose %arg59, %692 : (tensor<4096x4096xf32>, tensor<2xi32>) -> tensor<4096x4096xf32>
+ %694 = tosa.reshape %686 {new_shape = array} : (tensor<1x40x4096xf32>) -> tensor<40x4096xf32>
+ %cst_121 = arith.constant dense<0.000000e+00> : tensor<40x4096xf32>
+ %695 = linalg.matmul {cast = #linalg.type_fn} ins(%694, %693 : tensor<40x4096xf32>, tensor<4096x4096xf32>) outs(%cst_121 : tensor<40x4096xf32>) -> tensor<40x4096xf32>
+ %696 = tosa.reshape %695 {new_shape = array} : (tensor<40x4096xf32>) -> tensor<1x40x4096xf32>
+ %697 = "tosa.const"() <{value = dense<[1, 0]> : tensor<2xi32>}> : () -> tensor<2xi32>
+ %698 = tosa.transpose %arg60, %697 : (tensor<4096x4096xf32>, tensor<2xi32>) -> tensor<4096x4096xf32>
+ %699 = tosa.reshape %686 {new_shape = array} : (tensor<1x40x4096xf32>) -> tensor<40x4096xf32>
+ %cst_122 = arith.constant dense<0.000000e+00> : tensor<40x4096xf32>
+ %700 = linalg.matmul {cast = #linalg.type_fn} ins(%699, %698 : tensor<40x4096xf32>, tensor<4096x4096xf32>) outs(%cst_122 : tensor<40x4096xf32>) -> tensor<40x4096xf32>
+ %701 = tosa.reshape %700 {new_shape = array} : (tensor<40x4096xf32>) -> tensor<1x40x4096xf32>
+ %702 = tosa.reshape %691 {new_shape = array} : (tensor<1x40x4096xf32>) -> tensor<1x40x32x128xf32>
+ %703 = "tosa.const"() <{value = dense<[0, 2, 1, 3]> : tensor<4xi32>}> : () -> tensor<4xi32>
+ %704 = tosa.transpose %702, %703 : (tensor<1x40x32x128xf32>, tensor<4xi32>) -> tensor<1x32x40x128xf32>
+ %705 = tosa.reshape %696 {new_shape = array} : (tensor<1x40x4096xf32>) -> tensor<1x40x32x128xf32>
+ %706 = "tosa.const"() <{value = dense<[0, 2, 1, 3]> : tensor<4xi32>}> : () -> tensor<4xi32>
+ %707 = tosa.transpose %705, %706 : (tensor<1x40x32x128xf32>, tensor<4xi32>) -> tensor<1x32x40x128xf32>
+ %708 = tosa.reshape %701 {new_shape = array} : (tensor<1x40x4096xf32>) -> tensor<1x40x32x128xf32>
+ %709 = "tosa.const"() <{value = dense<[0, 2, 1, 3]> : tensor<4xi32>}> : () -> tensor<4xi32>
+ %710 = tosa.transpose %708, %709 : (tensor<1x40x32x128xf32>, tensor<4xi32>) -> tensor<1x32x40x128xf32>
+ %extracted_slice_123 = tensor.extract_slice %arg61[0, 0, 0, 0] [1, 1, 2048, 128] [1, 1, 1, 1] : tensor<1x1x2048x128xf32> to tensor<1x1x2048x128xf32>
+ %extracted_slice_124 = tensor.extract_slice %extracted_slice_123[0, 0, 0, 0] [1, 1, 2048, 128] [1, 1, 1, 1] : tensor<1x1x2048x128xf32> to tensor<1x1x2048x128xf32>
+ %extracted_slice_125 = tensor.extract_slice %extracted_slice_124[0, 0, 0, 0] [1, 1, 40, 128] [1, 1, 1, 1] : tensor<1x1x2048x128xf32> to tensor<1x1x40x128xf32>
+ %extracted_slice_126 = tensor.extract_slice %arg62[0, 0, 0, 0] [1, 1, 2048, 128] [1, 1, 1, 1] : tensor<1x1x2048x128xf32> to tensor<1x1x2048x128xf32>
+ %extracted_slice_127 = tensor.extract_slice %extracted_slice_126[0, 0, 0, 0] [1, 1, 2048, 128] [1, 1, 1, 1] : tensor<1x1x2048x128xf32> to tensor<1x1x2048x128xf32>
+ %extracted_slice_128 = tensor.extract_slice %extracted_slice_127[0, 0, 0, 0] [1, 1, 40, 128] [1, 1, 1, 1] : tensor<1x1x2048x128xf32> to tensor<1x1x40x128xf32>
+ %711 = tensor.empty() : tensor<1x40x128xf32>
+ %712 = linalg.generic {indexing_maps = [#map6, #map5], iterator_types = ["parallel", "parallel", "parallel"]} ins(%extracted_slice_125 : tensor<1x1x40x128xf32>) outs(%711 : tensor<1x40x128xf32>) {
+ ^bb0(%in: f32, %out: f32):
+ linalg.yield %in : f32
+ } -> tensor<1x40x128xf32>
+ %713 = tensor.empty() : tensor<40x128xf32>
+ %714 = linalg.generic {indexing_maps = [#map7, #map3], iterator_types = ["parallel", "parallel"]} ins(%712 : tensor<1x40x128xf32>) outs(%713 : tensor<40x128xf32>) {
+ ^bb0(%in: f32, %out: f32):
+ linalg.yield %in : f32
+ } -> tensor<40x128xf32>
+ %715 = tensor.empty() : tensor<1x40x128xf32>
+ %716 = linalg.generic {indexing_maps = [#map6, #map5], iterator_types = ["parallel", "parallel", "parallel"]} ins(%extracted_slice_128 : tensor<1x1x40x128xf32>) outs(%715 : tensor<1x40x128xf32>) {
+ ^bb0(%in: f32, %out: f32):
+ linalg.yield %in : f32
+ } -> tensor<1x40x128xf32>
+ %717 = tensor.empty() : tensor<40x128xf32>
+ %718 = linalg.generic {indexing_maps = [#map7, #map3], iterator_types = ["parallel", "parallel"]} ins(%716 : tensor<1x40x128xf32>) outs(%717 : tensor<40x128xf32>) {
+ ^bb0(%in: f32, %out: f32):
+ linalg.yield %in : f32
+ } -> tensor<40x128xf32>
+ %719 = tensor.empty() : tensor<1x40x128xf32>
+ %720 = linalg.generic {indexing_maps = [#map2, #map5], iterator_types = ["parallel", "parallel", "parallel"]} ins(%2 : tensor<1x40xi64>) outs(%719 : tensor<1x40x128xf32>) {
+ ^bb0(%in: i64, %out: f32):
+ %4175 = arith.index_cast %in : i64 to index
+ %4176 = linalg.index 2 : index
+ %extracted = tensor.extract %714[%4175, %4176] : tensor<40x128xf32>
+ linalg.yield %extracted : f32
+ } -> tensor<1x40x128xf32>
+ %721 = tosa.reshape %720 {new_shape = array} : (tensor<1x40x128xf32>) -> tensor<1x1x40x128xf32>
+ %722 = tensor.empty() : tensor<1x40x128xf32>
+ %723 = linalg.generic {indexing_maps = [#map2, #map5], iterator_types = ["parallel", "parallel", "parallel"]} ins(%2 : tensor<1x40xi64>) outs(%722 : tensor<1x40x128xf32>) {
+ ^bb0(%in: i64, %out: f32):
+ %4175 = arith.index_cast %in : i64 to index
+ %4176 = linalg.index 2 : index
+ %extracted = tensor.extract %718[%4175, %4176] : tensor<40x128xf32>
+ linalg.yield %extracted : f32
+ } -> tensor<1x40x128xf32>
+ %724 = tosa.reshape %723 {new_shape = array} : (tensor<1x40x128xf32>) -> tensor<1x1x40x128xf32>
+ %725 = tosa.mul %704, %721 {shift = 0 : i8} : (tensor<1x32x40x128xf32>, tensor<1x1x40x128xf32>) -> tensor<1x32x40x128xf32>
+ %extracted_slice_129 = tensor.extract_slice %704[0, 0, 0, 0] [1, 32, 40, 64] [1, 1, 1, 1] : tensor<1x32x40x128xf32> to tensor<1x32x40x64xf32>
+ %extracted_slice_130 = tensor.extract_slice %704[0, 0, 0, 64] [1, 32, 40, 64] [1, 1, 1, 1] : tensor<1x32x40x128xf32> to tensor<1x32x40x64xf32>
+ %726 = tosa.negate %extracted_slice_130 : (tensor<1x32x40x64xf32>) -> tensor<1x32x40x64xf32>
+ %727 = tensor.empty() : tensor<1x32x40x128xf32>
+ %inserted_slice_131 = tensor.insert_slice %726 into %727[0, 0, 0, 0] [1, 32, 40, 64] [1, 1, 1, 1] : tensor<1x32x40x64xf32> into tensor<1x32x40x128xf32>
+ %inserted_slice_132 = tensor.insert_slice %extracted_slice_129 into %inserted_slice_131[0, 0, 0, 64] [1, 32, 40, 64] [1, 1, 1, 1] : tensor<1x32x40x64xf32> into tensor<1x32x40x128xf32>
+ %728 = tosa.mul %inserted_slice_132, %724 {shift = 0 : i8} : (tensor<1x32x40x128xf32>, tensor<1x1x40x128xf32>) -> tensor<1x32x40x128xf32>
+ %729 = tosa.add %725, %728 : (tensor<1x32x40x128xf32>, tensor<1x32x40x128xf32>) -> tensor<1x32x40x128xf32>
+ %730 = tosa.mul %707, %721 {shift = 0 : i8} : (tensor<1x32x40x128xf32>, tensor<1x1x40x128xf32>) -> tensor<1x32x40x128xf32>
+ %extracted_slice_133 = tensor.extract_slice %707[0, 0, 0, 0] [1, 32, 40, 64] [1, 1, 1, 1] : tensor<1x32x40x128xf32> to tensor<1x32x40x64xf32>
+ %extracted_slice_134 = tensor.extract_slice %707[0, 0, 0, 64] [1, 32, 40, 64] [1, 1, 1, 1] : tensor<1x32x40x128xf32> to tensor<1x32x40x64xf32>
+ %731 = tosa.negate %extracted_slice_134 : (tensor<1x32x40x64xf32>) -> tensor<1x32x40x64xf32>
+ %732 = tensor.empty() : tensor<1x32x40x128xf32>
+ %inserted_slice_135 = tensor.insert_slice %731 into %732[0, 0, 0, 0] [1, 32, 40, 64] [1, 1, 1, 1] : tensor<1x32x40x64xf32> into tensor<1x32x40x128xf32>
+ %inserted_slice_136 = tensor.insert_slice %extracted_slice_133 into %inserted_slice_135[0, 0, 0, 64] [1, 32, 40, 64] [1, 1, 1, 1] : tensor<1x32x40x64xf32> into tensor<1x32x40x128xf32>
+ %733 = tosa.mul %inserted_slice_136, %724 {shift = 0 : i8} : (tensor<1x32x40x128xf32>, tensor<1x1x40x128xf32>) -> tensor<1x32x40x128xf32>
+ %734 = tosa.add %730, %733 : (tensor<1x32x40x128xf32>, tensor<1x32x40x128xf32>) -> tensor<1x32x40x128xf32>
+ %735 = "tosa.const"() <{value = dense<[0, 1, 3, 2]> : tensor<4xi32>}> : () -> tensor<4xi32>
+ %736 = tosa.transpose %734, %735 : (tensor<1x32x40x128xf32>, tensor<4xi32>) -> tensor<1x32x128x40xf32>
+ %737 = "tosa.const"() <{value = dense<0.000000e+00> : tensor<1x32x40x128xf32>}> : () -> tensor<1x32x40x128xf32>
+ %738 = tosa.add %729, %737 : (tensor<1x32x40x128xf32>, tensor<1x32x40x128xf32>) -> tensor<1x32x40x128xf32>
+ %739 = tosa.reshape %738 {new_shape = array} : (tensor<1x32x40x128xf32>) -> tensor<32x40x128xf32>
+ %740 = "tosa.const"() <{value = dense<0.000000e+00> : tensor<1x32x128x40xf32>}> : () -> tensor<1x32x128x40xf32>
+ %741 = tosa.add %736, %740 : (tensor<1x32x128x40xf32>, tensor<1x32x128x40xf32>) -> tensor<1x32x128x40xf32>
+ %742 = tosa.reshape %741 {new_shape = array} : (tensor<1x32x128x40xf32>) -> tensor<32x128x40xf32>
+ %743 = tosa.matmul %739, %742 : (tensor<32x40x128xf32>, tensor<32x128x40xf32>) -> tensor<32x40x40xf32>
+ %744 = tosa.reshape %743 {new_shape = array} : (tensor<32x40x40xf32>) -> tensor<1x32x40x40xf32>
+ %745 = "tosa.const"() <{value = dense<11.3137083> : tensor<1x32x40x40xf32>}> : () -> tensor<1x32x40x40xf32>
+ %746 = tosa.reciprocal %745 : (tensor<1x32x40x40xf32>) -> tensor<1x32x40x40xf32>
+ %747 = tosa.mul %744, %746 {shift = 0 : i8} : (tensor<1x32x40x40xf32>, tensor<1x32x40x40xf32>) -> tensor<1x32x40x40xf32>
+ %748 = tosa.add %747, %29 : (tensor<1x32x40x40xf32>, tensor<1x1x40x40xf32>) -> tensor<1x32x40x40xf32>
+ %749 = tosa.reduce_max %748 {axis = 3 : i32} : (tensor<1x32x40x40xf32>) -> tensor<1x32x40x1xf32>
+ %750 = tosa.sub %748, %749 : (tensor<1x32x40x40xf32>, tensor<1x32x40x1xf32>) -> tensor<1x32x40x40xf32>
+ %751 = tosa.exp %750 : (tensor<1x32x40x40xf32>) -> tensor<1x32x40x40xf32>
+ %752 = tosa.reduce_sum %751 {axis = 3 : i32} : (tensor<1x32x40x40xf32>) -> tensor<1x32x40x1xf32>
+ %753 = tosa.reciprocal %752 : (tensor<1x32x40x1xf32>) -> tensor<1x32x40x1xf32>
+ %754 = tosa.mul %751, %753 {shift = 0 : i8} : (tensor<1x32x40x40xf32>, tensor<1x32x40x1xf32>) -> tensor<1x32x40x40xf32>
+ %755 = "tosa.const"() <{value = dense<0.000000e+00> : tensor<1x32x40x40xf32>}> : () -> tensor<1x32x40x40xf32>
+ %756 = tosa.add %754, %755 : (tensor<1x32x40x40xf32>, tensor<1x32x40x40xf32>) -> tensor<1x32x40x40xf32>
+ %757 = tosa.reshape %756 {new_shape = array} : (tensor<1x32x40x40xf32>) -> tensor<32x40x40xf32>
+ %758 = "tosa.const"() <{value = dense<0.000000e+00> : tensor<1x32x40x128xf32>}> : () -> tensor<1x32x40x128xf32>
+ %759 = tosa.add %710, %758 : (tensor<1x32x40x128xf32>, tensor<1x32x40x128xf32>) -> tensor<1x32x40x128xf32>
+ %760 = tosa.reshape %759 {new_shape = array} : (tensor<1x32x40x128xf32>) -> tensor<32x40x128xf32>
+ %761 = tosa.matmul %757, %760 : (tensor<32x40x40xf32>, tensor<32x40x128xf32>) -> tensor<32x40x128xf32>
+ %762 = tosa.reshape %761 {new_shape = array} : (tensor<32x40x128xf32>) -> tensor<1x32x40x128xf32>
+ %763 = "tosa.const"() <{value = dense<[0, 2, 1, 3]> : tensor<4xi32>}> : () -> tensor<4xi32>
+ %764 = tosa.transpose %762, %763 : (tensor<1x32x40x128xf32>, tensor<4xi32>) -> tensor<1x40x32x128xf32>
+ %765 = tosa.identity %764 : (tensor<1x40x32x128xf32>) -> tensor<1x40x32x128xf32>
+ %766 = tosa.reshape %765 {new_shape = array} : (tensor<1x40x32x128xf32>) -> tensor<1x40x4096xf32>
+ %767 = "tosa.const"() <{value = dense<[1, 0]> : tensor<2xi32>}> : () -> tensor<2xi32>
+ %768 = tosa.transpose %arg63, %767 : (tensor<4096x4096xf32>, tensor<2xi32>) -> tensor<4096x4096xf32>
+ %769 = tosa.reshape %766 {new_shape = array} : (tensor<1x40x4096xf32>) -> tensor<40x4096xf32>
+ %cst_137 = arith.constant dense<0.000000e+00> : tensor<40x4096xf32>
+ %770 = linalg.matmul {cast = #linalg.type_fn} ins(%769, %768 : tensor<40x4096xf32>, tensor<4096x4096xf32>) outs(%cst_137 : tensor<40x4096xf32>) -> tensor<40x4096xf32>
+ %771 = tosa.reshape %770 {new_shape = array} : (tensor<40x4096xf32>) -> tensor<1x40x4096xf32>
+ %772 = tosa.add %674, %771 : (tensor<1x40x4096xf32>, tensor<1x40x4096xf32>) -> tensor<1x40x4096xf32>
+ %773 = tensor.empty() : tensor<1x40x4096xf32>
+ %c2_i32_138 = arith.constant 2 : i32
+ %774 = linalg.generic {indexing_maps = [#map5, #map5], iterator_types = ["parallel", "parallel", "parallel"]} ins(%772 : tensor<1x40x4096xf32>) outs(%773 : tensor<1x40x4096xf32>) {
+ ^bb0(%in: f32, %out: f32):
+ %4175 = math.fpowi %in, %c2_i32_138 : f32, i32
+ linalg.yield %4175 : f32
+ } -> tensor<1x40x4096xf32>
+ %775 = tosa.reduce_sum %774 {axis = 2 : i32} : (tensor<1x40x4096xf32>) -> tensor<1x40x1xf32>
+ %776 = "tosa.const"() <{value = dense<4.096000e+03> : tensor<1xf32>}> : () -> tensor<1xf32>
+ %777 = tosa.reciprocal %776 : (tensor<1xf32>) -> tensor<1xf32>
+ %778 = tosa.mul %777, %775 {shift = 0 : i8} : (tensor<1xf32>, tensor<1x40x1xf32>) -> tensor<1x40x1xf32>
+ %779 = "tosa.const"() <{value = dense<9.99999974E-6> : tensor<1x40x1xf32>}> : () -> tensor<1x40x1xf32>
+ %780 = tosa.add %778, %779 : (tensor<1x40x1xf32>, tensor<1x40x1xf32>) -> tensor<1x40x1xf32>
+ %781 = tosa.rsqrt %780 : (tensor<1x40x1xf32>) -> tensor<1x40x1xf32>
+ %782 = tosa.mul %772, %781 {shift = 0 : i8} : (tensor<1x40x4096xf32>, tensor<1x40x1xf32>) -> tensor<1x40x4096xf32>
+ %783 = tosa.reshape %arg64 {new_shape = array} : (tensor<4096xf32>) -> tensor<1x1x4096xf32>
+ %784 = tosa.mul %783, %782 {shift = 0 : i8} : (tensor<1x1x4096xf32>, tensor<1x40x4096xf32>) -> tensor<1x40x4096xf32>
+ %785 = "tosa.const"() <{value = dense<[1, 0]> : tensor<2xi32>}> : () -> tensor<2xi32>
+ %786 = tosa.transpose %arg65, %785 : (tensor<11008x4096xf32>, tensor<2xi32>) -> tensor<4096x11008xf32>
+ %787 = tosa.reshape %784 {new_shape = array} : (tensor<1x40x4096xf32>) -> tensor<40x4096xf32>
+ %cst_139 = arith.constant dense<0.000000e+00> : tensor<40x11008xf32>
+ %788 = linalg.matmul {cast = #linalg.type_fn} ins(%787, %786 : tensor<40x4096xf32>, tensor<4096x11008xf32>) outs(%cst_139 : tensor<40x11008xf32>) -> tensor<40x11008xf32>
+ %789 = tosa.reshape %788 {new_shape = array} : (tensor<40x11008xf32>) -> tensor<1x40x11008xf32>
+ %790 = tosa.sigmoid %789 : (tensor<1x40x11008xf32>) -> tensor<1x40x11008xf32>
+ %791 = tosa.mul %789, %790 {shift = 0 : i8} : (tensor<1x40x11008xf32>, tensor<1x40x11008xf32>) -> tensor<1x40x11008xf32>
+ %792 = "tosa.const"() <{value = dense<[1, 0]> : tensor<2xi32>}> : () -> tensor<2xi32>
+ %793 = tosa.transpose %arg66, %792 : (tensor<11008x4096xf32>, tensor<2xi32>) -> tensor<4096x11008xf32>
+ %794 = tosa.reshape %784 {new_shape = array} : (tensor<1x40x4096xf32>) -> tensor<40x4096xf32>
+ %cst_140 = arith.constant dense<0.000000e+00> : tensor<40x11008xf32>
+ %795 = linalg.matmul {cast = #linalg.type_fn} ins(%794, %793 : tensor<40x4096xf32>, tensor<4096x11008xf32>) outs(%cst_140 : tensor<40x11008xf32>) -> tensor<40x11008xf32>
+ %796 = tosa.reshape %795 {new_shape = array} : (tensor<40x11008xf32>) -> tensor<1x40x11008xf32>
+ %797 = tosa.mul %791, %796 {shift = 0 : i8} : (tensor<1x40x11008xf32>, tensor<1x40x11008xf32>) -> tensor<1x40x11008xf32>
+ %798 = "tosa.const"() <{value = dense<[1, 0]> : tensor<2xi32>}> : () -> tensor<2xi32>
+ %799 = tosa.transpose %arg67, %798 : (tensor<4096x11008xf32>, tensor<2xi32>) -> tensor<11008x4096xf32>
+ %800 = tosa.reshape %797 {new_shape = array} : (tensor<1x40x11008xf32>) -> tensor<40x11008xf32>
+ %cst_141 = arith.constant dense<0.000000e+00> : tensor<40x4096xf32>
+ %801 = linalg.matmul {cast = #linalg.type_fn} ins(%800, %799 : tensor<40x11008xf32>, tensor<11008x4096xf32>) outs(%cst_141 : tensor<40x4096xf32>) -> tensor<40x4096xf32>
+ %802 = tosa.reshape %801 {new_shape = array} : (tensor<40x4096xf32>) -> tensor<1x40x4096xf32>
+ %803 = tosa.add %772, %802 : (tensor<1x40x4096xf32>, tensor<1x40x4096xf32>) -> tensor<1x40x4096xf32>
+ %804 = tensor.empty() : tensor<1x40x4096xf32>
+ %c2_i32_142 = arith.constant 2 : i32
+ %805 = linalg.generic {indexing_maps = [#map5, #map5], iterator_types = ["parallel", "parallel", "parallel"]} ins(%803 : tensor<1x40x4096xf32>) outs(%804 : tensor<1x40x4096xf32>) {
+ ^bb0(%in: f32, %out: f32):
+ %4175 = math.fpowi %in, %c2_i32_142 : f32, i32
+ linalg.yield %4175 : f32
+ } -> tensor<1x40x4096xf32>
+ %806 = tosa.reduce_sum %805 {axis = 2 : i32} : (tensor<1x40x4096xf32>) -> tensor<1x40x1xf32>
+ %807 = "tosa.const"() <{value = dense<4.096000e+03> : tensor<1xf32>}> : () -> tensor<1xf32>
+ %808 = tosa.reciprocal %807 : (tensor<1xf32>) -> tensor<1xf32>
+ %809 = tosa.mul %808, %806 {shift = 0 : i8} : (tensor<1xf32>, tensor<1x40x1xf32>) -> tensor<1x40x1xf32>
+ %810 = "tosa.const"() <{value = dense<9.99999974E-6> : tensor<1x40x1xf32>}> : () -> tensor<1x40x1xf32>
+ %811 = tosa.add %809, %810 : (tensor<1x40x1xf32>, tensor<1x40x1xf32>) -> tensor<1x40x1xf32>
+ %812 = tosa.rsqrt %811 : (tensor<1x40x1xf32>) -> tensor<1x40x1xf32>
+ %813 = tosa.mul %803, %812 {shift = 0 : i8} : (tensor<1x40x4096xf32>, tensor<1x40x1xf32>) -> tensor<1x40x4096xf32>
+ %814 = tosa.reshape %arg68 {new_shape = array} : (tensor<4096xf32>) -> tensor<1x1x4096xf32>
+ %815 = tosa.mul %814, %813 {shift = 0 : i8} : (tensor<1x1x4096xf32>, tensor<1x40x4096xf32>) -> tensor<1x40x4096xf32>
+ %816 = "tosa.const"() <{value = dense<[1, 0]> : tensor<2xi32>}> : () -> tensor<2xi32>
+ %817 = tosa.transpose %arg69, %816 : (tensor<4096x4096xf32>, tensor<2xi32>) -> tensor<4096x4096xf32>
+ %818 = tosa.reshape %815 {new_shape = array} : (tensor<1x40x4096xf32>) -> tensor<40x4096xf32>
+ %cst_143 = arith.constant dense<0.000000e+00> : tensor<40x4096xf32>
+ %819 = linalg.matmul {cast = #linalg.type_fn} ins(%818, %817 : tensor<40x4096xf32>, tensor<4096x4096xf32>) outs(%cst_143 : tensor<40x4096xf32>) -> tensor<40x4096xf32>
+ %820 = tosa.reshape %819 {new_shape = array} : (tensor<40x4096xf32>) -> tensor<1x40x4096xf32>
+ %821 = "tosa.const"() <{value = dense<[1, 0]> : tensor<2xi32>}> : () -> tensor<2xi32>
+ %822 = tosa.transpose %arg70, %821 : (tensor<4096x4096xf32>, tensor<2xi32>) -> tensor<4096x4096xf32>
+ %823 = tosa.reshape %815 {new_shape = array} : (tensor<1x40x4096xf32>) -> tensor<40x4096xf32>
+ %cst_144 = arith.constant dense<0.000000e+00> : tensor<40x4096xf32>
+ %824 = linalg.matmul {cast = #linalg.type_fn} ins(%823, %822 : tensor<40x4096xf32>, tensor<4096x4096xf32>) outs(%cst_144 : tensor<40x4096xf32>) -> tensor<40x4096xf32>
+ %825 = tosa.reshape %824 {new_shape = array} : (tensor<40x4096xf32>) -> tensor<1x40x4096xf32>
+ %826 = "tosa.const"() <{value = dense<[1, 0]> : tensor<2xi32>}> : () -> tensor<2xi32>
+ %827 = tosa.transpose %arg71, %826 : (tensor<4096x4096xf32>, tensor<2xi32>) -> tensor<4096x4096xf32>
+ %828 = tosa.reshape %815 {new_shape = array