diff --git a/.gitignore b/.gitignore
index 485cccfcf9..1ffba60cbc 100644
--- a/.gitignore
+++ b/.gitignore
@@ -12,3 +12,6 @@
# Clangd cache
.cache
+
+# Clangd configurations
+.clangd
\ No newline at end of file
diff --git a/CMakeLists.txt b/CMakeLists.txt
index 83b7981421..cd2379468a 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -42,13 +42,15 @@ if(CMAKE_SOURCE_DIR STREQUAL CMAKE_CURRENT_SOURCE_DIR OR BUDDY_MLIR_OUT_OF_TREE_
message(STATUS "Using MLIRConfig.cmake in: ${MLIR_DIR}")
message(STATUS "Using LLVMConfig.cmake in: ${LLVM_DIR}")
- set(LLVM_MLIR_BINARY_DIR ${MLIR_DIR}/../../../bin)
- set(LLVM_MLIR_LIBRARY_DIR ${MLIR_DIR}/../../../lib)
- set(LLVM_PROJECT_BUILD_DIR ${MLIR_DIR}/../../../)
- if(NOT DEFINED LLVM_PROJECT_SOURCE_DIR)
- get_filename_component(LLVM_PROJECT_SOURCE_DIR ${CMAKE_CURRENT_SOURCE_DIR}/llvm/ ABSOLUTE)
+ # LLVM_MAIN_SRC_DIR is a private variable for the LLVM in-tree build.
+ # To provide compatibility for unifying the one-step and two-step build,
+ # we set LLVM_MAIN_SRC_DIR ourselves here.
+ # This could benefit users who want to specify a custom LLVM source directory,
+ # but also not interfere with normal users who just want to use the buddy-mlir provided LLVM sources.
+ if(NOT DEFINED LLVM_MAIN_SRC_DIR)
+ get_filename_component(LLVM_MAIN_SRC_DIR ${CMAKE_CURRENT_SOURCE_DIR}/llvm/llvm ABSOLUTE)
endif()
- set(LLVM_MLIR_SOURCE_DIR ${LLVM_PROJECT_SOURCE_DIR}/mlir)
+ set(MLIR_MAIN_SRC_DIR ${LLVM_MAIN_SRC_DIR}/../mlir)
list(APPEND CMAKE_MODULE_PATH "${MLIR_CMAKE_DIR}")
list(APPEND CMAKE_MODULE_PATH "${LLVM_CMAKE_DIR}")
@@ -66,16 +68,9 @@ else()
#-------------------------------------------------------------------------------
# MLIR/LLVM Configuration
#-------------------------------------------------------------------------------
-
- # Allow using out-of-tree llvm directory
- set(LLVM_PROJECT_SOURCE_DIR ${LLVM_MAIN_SRC_DIR}/..)
- message(STATUS "Using LLVM Project ${LLVM_PROJECT_SOURCE_DIR}")
-
set(MLIR_MAIN_SRC_DIR ${LLVM_MAIN_SRC_DIR}/../mlir)
set(MLIR_INCLUDE_DIR ${MLIR_MAIN_SRC_DIR}/include)
set(MLIR_GENERATED_INCLUDE_DIR ${LLVM_BINARY_DIR}/tools/mlir/include)
- set(LLVM_MLIR_BINARY_DIR ${CMAKE_BINARY_DIR}/bin)
- set(MLIR_INCLUDE_DIRS "${MLIR_INCLUDE_DIR};${MLIR_GENERATED_INCLUDE_DIR}")
endif()
#-------------------------------------------------------------------------------
@@ -98,12 +93,22 @@ set(CMAKE_ARCHIVE_OUTPUT_DIRECTORY ${BUDDY_LIBRARY_DIR})
set(BUDDY_EXAMPLES OFF CACHE BOOL "Build examples")
set(BUDDY_ENABLE_OPENCV OFF CACHE BOOL "Enable OpenCV support.")
-if(BUDDY_ENABLE_OPENCV)
- add_definitions(-DBUDDY_ENABLE_OPENCV)
- find_package(JPEG REQUIRED)
+ if(BUDDY_ENABLE_OPENCV)
+ add_definitions(-DBUDDY_ENABLE_OPENCV)
+ find_package(JPEG REQUIRED)
+ find_package(PNG REQUIRED)
+ find_package(OpenCV REQUIRED CONFIG)
+ include_directories(${OpenCV_INCLUDE_DIRS})
+ endif()
+
+if(BUDDY_MLIR_ENABLE_DIP_LIB)
+ add_definitions(-DBUDDY_MLIR_ENABLE_DIP_LIB)
+ find_package(PNG REQUIRED)
+endif()
+
+if(BUDDY_ENABLE_PNG)
+ add_definitions(-DBUDDY_ENABLE_PNG)
find_package(PNG REQUIRED)
- find_package(OpenCV REQUIRED CONFIG)
- include_directories(${OpenCV_INCLUDE_DIRS})
endif()
# Generate libraries into `lib` of build directory.
@@ -220,6 +225,8 @@ if(BUDDY_MLIR_ENABLE_PYTHON_PACKAGES)
# Create empty __init__.py files to make these directories Python packages
file(WRITE ${BUDDY_MLIR_PYTHON_PACKAGES_DIR}/buddy/__init__.py "")
file(WRITE ${BUDDY_MLIR_PYTHON_PACKAGES_DIR}/buddy/compiler/__init__.py "")
+
+ install(DIRECTORY ${BUDDY_MLIR_PYTHON_PACKAGES_DIR}/buddy DESTINATION python_packages)
endif()
#-------------------------------------------------------------------------------
diff --git a/README.md b/README.md
index cb9a5f1c24..2e44658b02 100644
--- a/README.md
+++ b/README.md
@@ -96,13 +96,37 @@ $ cmake -G Ninja .. \
-DCMAKE_BUILD_TYPE=RELEASE \
-DBUDDY_MLIR_ENABLE_PYTHON_PACKAGES=ON \
-DPython3_EXECUTABLE=$(which python3)
+$ ninja
+$ ninja check-buddy
+$ export BUDDY_MLIR_BUILD_DIR=$PWD
+$ export LLVM_MLIR_BUILD_DIR=$PWD/../llvm/build
+$ export PYTHONPATH=${LLVM_MLIR_BUILD_DIR}/tools/mlir/python_packages/mlir_core:${BUDDY_MLIR_BUILD_DIR}/python_packages:${PYTHONPATH}
+```
+
+To configure the build environment for using image processing libraries, follow these steps:
+
+```
+$ cmake -G Ninja .. \
+ -DMLIR_DIR=$PWD/../llvm/build/lib/cmake/mlir \
+ -DLLVM_DIR=$PWD/../llvm/build/lib/cmake/llvm \
+ -DLLVM_ENABLE_ASSERTIONS=ON \
+ -DCMAKE_BUILD_TYPE=RELEASE \
+ -DBUDDY_MLIR_ENABLE_DIP_LIB=ON \
+ -DBUDDY_ENABLE_PNG=ON
+$ ninja
+$ ninja check-buddy
```
-If you want to add domain-specific framework support, please add the following cmake options:
+To build buddy-mlir with custom LLVM sources:
-| Framework | Enable Option | Other Options |
-| -------------- | ------------- | ------------- |
-| OpenCV | `-DBUDDY_ENABLE_OPENCV=ON` | Add `-DOpenCV_DIR=` or install OpenCV release version on your local device. |
+```
+$ cmake -G Ninja .. \
+ -DMLIR_DIR=PATH/TO/LLVM/lib/cmake/mlir \
+ -DLLVM_DIR=PATH/TO/LLVM/lib/cmake/llvm \
+ -DLLVM_ENABLE_ASSERTIONS=ON \
+ -DCMAKE_BUILD_TYPE=RELEASE \
+ -DLLVM_MAIN_SRC_DIR=PATH/TO/LLVM_SOURCE
+```
One-step building strategy
@@ -134,7 +158,7 @@ This repository have nix flake support. You can follow the [nix installation ins
nix develop .
```
-This will setup a bash shell with `clang`, `clangd`, `cmake`, `ninja`, and other necessary dependencies to build buddy-mlir from source.
+This will set up a bash shell with `clang`, `ccls`, `cmake`, `ninja`, and other necessary dependencies to build buddy-mlir from source.
- If you want to use the buddy-mlir bintools
diff --git a/backend/include/llvm/IR/CMakeLists.txt b/backend/include/llvm/IR/CMakeLists.txt
index b3447eae61..2de6b999b3 100644
--- a/backend/include/llvm/IR/CMakeLists.txt
+++ b/backend/include/llvm/IR/CMakeLists.txt
@@ -1,4 +1,4 @@
-include_directories(${LLVM_PROJECT_SOURCE_DIR}/llvm/include/llvm/IR/)
+include_directories(${LLVM_MAIN_SRC_DIR}/include/llvm/IR/)
set(LLVM_TARGET_DEFINITIONS IntrinsicsBuddyExt.td)
tablegen(LLVM IntrinsicImpl.inc -gen-intrinsic-impl)
diff --git a/backend/llvm/lib/Analysis/CMakeLists.txt b/backend/llvm/lib/Analysis/CMakeLists.txt
index 2a3a65971b..117f75d89b 100644
--- a/backend/llvm/lib/Analysis/CMakeLists.txt
+++ b/backend/llvm/lib/Analysis/CMakeLists.txt
@@ -1,4 +1,4 @@
-set(LLVM_Analysis_DIR ${LLVM_PROJECT_SOURCE_DIR}/llvm/lib/Analysis)
+set(LLVM_Analysis_DIR ${LLVM_MAIN_SRC_DIR}/lib/Analysis)
add_llvm_component_library(LLVMBuddyAnalysis
diff --git a/backend/llvm/lib/AsmParser/CMakeLists.txt b/backend/llvm/lib/AsmParser/CMakeLists.txt
index b5411d1007..d687d1d3bc 100644
--- a/backend/llvm/lib/AsmParser/CMakeLists.txt
+++ b/backend/llvm/lib/AsmParser/CMakeLists.txt
@@ -1,6 +1,6 @@
# AsmParser
-set(LLVM_AsmParser_DIR ${LLVM_PROJECT_SOURCE_DIR}/llvm/lib/AsmParser)
+set(LLVM_AsmParser_DIR ${LLVM_MAIN_SRC_DIR}/lib/AsmParser)
add_llvm_component_library(LLVMBuddyAsmParser
${LLVM_AsmParser_DIR}/LLLexer.cpp
diff --git a/backend/llvm/lib/Bitcode/Reader/CMakeLists.txt b/backend/llvm/lib/Bitcode/Reader/CMakeLists.txt
index cf92a543fd..7ea9048011 100644
--- a/backend/llvm/lib/Bitcode/Reader/CMakeLists.txt
+++ b/backend/llvm/lib/Bitcode/Reader/CMakeLists.txt
@@ -1,4 +1,4 @@
-set(LLVM_Reader_DIR ${LLVM_PROJECT_SOURCE_DIR}/llvm/lib/Bitcode/Reader)
+set(LLVM_Reader_DIR ${LLVM_MAIN_SRC_DIR}/lib/Bitcode/Reader)
add_llvm_component_library(LLVMBuddyBitReader
${LLVM_Reader_DIR}/BitcodeAnalyzer.cpp
diff --git a/backend/llvm/lib/Bitcode/Writer/CMakeLists.txt b/backend/llvm/lib/Bitcode/Writer/CMakeLists.txt
index f19595cead..a8b7f0c274 100644
--- a/backend/llvm/lib/Bitcode/Writer/CMakeLists.txt
+++ b/backend/llvm/lib/Bitcode/Writer/CMakeLists.txt
@@ -1,4 +1,4 @@
-set(LLVM_Writer_DIR ${LLVM_PROJECT_SOURCE_DIR}/llvm/lib/Bitcode/Writer)
+set(LLVM_Writer_DIR ${LLVM_MAIN_SRC_DIR}/lib/Bitcode/Writer)
add_llvm_component_library(LLVMBuddyBitWriter
diff --git a/backend/llvm/lib/CodeGen/AsmPrinter/CMakeLists.txt b/backend/llvm/lib/CodeGen/AsmPrinter/CMakeLists.txt
index fe3273dd5d..b942f4f734 100644
--- a/backend/llvm/lib/CodeGen/AsmPrinter/CMakeLists.txt
+++ b/backend/llvm/lib/CodeGen/AsmPrinter/CMakeLists.txt
@@ -1,4 +1,4 @@
-set(LLVM_AsmPrinter_DIR ${LLVM_PROJECT_SOURCE_DIR}/llvm/lib/CodeGen/AsmPrinter)
+set(LLVM_AsmPrinter_DIR ${LLVM_MAIN_SRC_DIR}/lib/CodeGen/AsmPrinter)
add_llvm_component_library(LLVMBuddyAsmPrinter
${LLVM_AsmPrinter_DIR}/AccelTable.cpp
diff --git a/backend/llvm/lib/CodeGen/CMakeLists.txt b/backend/llvm/lib/CodeGen/CMakeLists.txt
index 1794b38fa4..7eb38876db 100644
--- a/backend/llvm/lib/CodeGen/CMakeLists.txt
+++ b/backend/llvm/lib/CodeGen/CMakeLists.txt
@@ -1,4 +1,4 @@
-set(LLVM_CodeGen_DIR ${LLVM_PROJECT_SOURCE_DIR}/llvm/lib/CodeGen)
+set(LLVM_CodeGen_DIR ${LLVM_MAIN_SRC_DIR}/lib/CodeGen)
add_llvm_component_library(LLVMBuddyCodeGen
${LLVM_CodeGen_DIR}/AggressiveAntiDepBreaker.cpp
diff --git a/backend/llvm/lib/CodeGen/MIRParser/CMakeLists.txt b/backend/llvm/lib/CodeGen/MIRParser/CMakeLists.txt
index 6275b1ece0..1ab94ee930 100644
--- a/backend/llvm/lib/CodeGen/MIRParser/CMakeLists.txt
+++ b/backend/llvm/lib/CodeGen/MIRParser/CMakeLists.txt
@@ -1,4 +1,4 @@
-set(LLVM_MIRParser_DIR ${LLVM_PROJECT_SOURCE_DIR}/llvm/lib/CodeGen/MIRParser)
+set(LLVM_MIRParser_DIR ${LLVM_MAIN_SRC_DIR}/lib/CodeGen/MIRParser)
add_llvm_component_library(LLVMBuddyMIRParser
${LLVM_MIRParser_DIR}/MILexer.cpp
diff --git a/backend/llvm/lib/CodeGen/SelectionDAG/CMakeLists.txt b/backend/llvm/lib/CodeGen/SelectionDAG/CMakeLists.txt
index 4bb3cde980..3b467a4eda 100644
--- a/backend/llvm/lib/CodeGen/SelectionDAG/CMakeLists.txt
+++ b/backend/llvm/lib/CodeGen/SelectionDAG/CMakeLists.txt
@@ -1,4 +1,4 @@
-set(LLVM_SelectionDAG_DIR ${LLVM_PROJECT_SOURCE_DIR}/llvm/lib/CodeGen/SelectionDAG)
+set(LLVM_SelectionDAG_DIR ${LLVM_MAIN_SRC_DIR}/lib/CodeGen/SelectionDAG)
add_llvm_component_library(LLVMBuddySelectionDAG
${LLVM_SelectionDAG_DIR}/DAGCombiner.cpp
diff --git a/backend/llvm/lib/IR/CMakeLists.txt b/backend/llvm/lib/IR/CMakeLists.txt
index e6895a1f80..0d56184730 100644
--- a/backend/llvm/lib/IR/CMakeLists.txt
+++ b/backend/llvm/lib/IR/CMakeLists.txt
@@ -1,4 +1,4 @@
-set(LLVM_IR_DIR ${LLVM_PROJECT_SOURCE_DIR}/llvm/lib/IR)
+set(LLVM_IR_DIR ${LLVM_MAIN_SRC_DIR}/lib/IR)
add_llvm_component_library(LLVMBuddyCore
${LLVM_IR_DIR}/AbstractCallSite.cpp
diff --git a/backend/llvm/lib/IRReader/CMakeLists.txt b/backend/llvm/lib/IRReader/CMakeLists.txt
index 9b315dec3b..72e95722a8 100644
--- a/backend/llvm/lib/IRReader/CMakeLists.txt
+++ b/backend/llvm/lib/IRReader/CMakeLists.txt
@@ -1,4 +1,4 @@
-set(LLVM_IRReader_DIR ${LLVM_PROJECT_SOURCE_DIR}/llvm/lib/IRReader)
+set(LLVM_IRReader_DIR ${LLVM_MAIN_SRC_DIR}/lib/IRReader)
add_llvm_component_library(LLVMBuddyIRReader
${LLVM_IRReader_DIR}/IRReader.cpp
diff --git a/backend/llvm/lib/Object/CMakeLists.txt b/backend/llvm/lib/Object/CMakeLists.txt
index 8695d55ba9..a8425e97c0 100644
--- a/backend/llvm/lib/Object/CMakeLists.txt
+++ b/backend/llvm/lib/Object/CMakeLists.txt
@@ -1,4 +1,4 @@
-set(LLVM_Object_DIR ${LLVM_PROJECT_SOURCE_DIR}/llvm/lib/Object)
+set(LLVM_Object_DIR ${LLVM_MAIN_SRC_DIR}/lib/Object)
add_llvm_component_library(LLVMBuddyObject
${LLVM_Object_DIR}/Archive.cpp
diff --git a/backend/llvm/lib/ProfileData/CMakeLists.txt b/backend/llvm/lib/ProfileData/CMakeLists.txt
index 9ae05a36fe..742ecf662a 100644
--- a/backend/llvm/lib/ProfileData/CMakeLists.txt
+++ b/backend/llvm/lib/ProfileData/CMakeLists.txt
@@ -1,4 +1,4 @@
-set(LLVM_ProfileData_DIR ${LLVM_PROJECT_SOURCE_DIR}/llvm/lib/ProfileData)
+set(LLVM_ProfileData_DIR ${LLVM_MAIN_SRC_DIR}/lib/ProfileData)
add_llvm_component_library(LLVMBuddyProfileData
${LLVM_ProfileData_DIR}/GCOV.cpp
diff --git a/backend/llvm/lib/Remarks/CMakeLists.txt b/backend/llvm/lib/Remarks/CMakeLists.txt
index 4ed8775770..5c1c81b7d8 100644
--- a/backend/llvm/lib/Remarks/CMakeLists.txt
+++ b/backend/llvm/lib/Remarks/CMakeLists.txt
@@ -1,4 +1,4 @@
-set(LLVM_Remarks_DIR ${LLVM_PROJECT_SOURCE_DIR}/llvm/lib/Remarks)
+set(LLVM_Remarks_DIR ${LLVM_MAIN_SRC_DIR}/lib/Remarks)
add_llvm_component_library(LLVMBuddyRemarks
${LLVM_Remarks_DIR}/BitstreamRemarkParser.cpp
diff --git a/backend/llvm/lib/Target/CMakeLists.txt b/backend/llvm/lib/Target/CMakeLists.txt
index c6298c3837..1dd5cd34f3 100644
--- a/backend/llvm/lib/Target/CMakeLists.txt
+++ b/backend/llvm/lib/Target/CMakeLists.txt
@@ -2,7 +2,7 @@ list(APPEND LLVM_COMMON_DEPENDS buddy_intrinsics_gen)
list(APPEND LLVM_TABLEGEN_FLAGS -I ${LLVM_MAIN_SRC_DIR}/lib/Target)
-set(LLVM_Target_DIR ${LLVM_PROJECT_SOURCE_DIR}/llvm/lib/Target)
+set(LLVM_Target_DIR ${LLVM_MAIN_SRC_DIR}/lib/Target)
add_llvm_component_library(LLVMBuddyTarget
${LLVM_Target_DIR}/Target.cpp
diff --git a/backend/llvm/lib/Target/RISCV/CMakeLists.txt b/backend/llvm/lib/Target/RISCV/CMakeLists.txt
index 4a66f65292..6bfee7c2f9 100644
--- a/backend/llvm/lib/Target/RISCV/CMakeLists.txt
+++ b/backend/llvm/lib/Target/RISCV/CMakeLists.txt
@@ -21,7 +21,7 @@ macro(buddy_add_llvm_target target_name)
set( CURRENT_LLVM_TARGET LLVM${target_name} )
endmacro(buddy_add_llvm_target)
-set(LLVM_TARGET_RISCV_DIR ${LLVM_PROJECT_SOURCE_DIR}/llvm/lib/Target/RISCV)
+set(LLVM_TARGET_RISCV_DIR ${LLVM_MAIN_SRC_DIR}/lib/Target/RISCV)
# ------------------------------------------------------------------------------
# Configure RISC-V Buddy Extension.
diff --git a/backend/llvm/lib/Transforms/IPO/CMakeLists.txt b/backend/llvm/lib/Transforms/IPO/CMakeLists.txt
index 74ff798637..08392abf87 100644
--- a/backend/llvm/lib/Transforms/IPO/CMakeLists.txt
+++ b/backend/llvm/lib/Transforms/IPO/CMakeLists.txt
@@ -1,4 +1,4 @@
-set(LLVM_IPO_DIR ${LLVM_PROJECT_SOURCE_DIR}/llvm/lib/Transforms/IPO)
+set(LLVM_IPO_DIR ${LLVM_MAIN_SRC_DIR}/lib/Transforms/IPO)
add_llvm_component_library(LLVMBuddyIPO
${LLVM_IPO_DIR}/AlwaysInliner.cpp
diff --git a/backend/llvm/lib/Transforms/Scalar/CMakeLists.txt b/backend/llvm/lib/Transforms/Scalar/CMakeLists.txt
index c3c412b9a9..6bbcf432e8 100644
--- a/backend/llvm/lib/Transforms/Scalar/CMakeLists.txt
+++ b/backend/llvm/lib/Transforms/Scalar/CMakeLists.txt
@@ -1,4 +1,4 @@
-set(LLVM_Scalar_DIR ${LLVM_PROJECT_SOURCE_DIR}/llvm/lib/Transforms/Scalar)
+set(LLVM_Scalar_DIR ${LLVM_MAIN_SRC_DIR}/lib/Transforms/Scalar)
add_llvm_component_library(LLVMBuddyScalarOpts
${LLVM_Scalar_DIR}/ADCE.cpp
diff --git a/backend/llvm/lib/Transforms/Utils/CMakeLists.txt b/backend/llvm/lib/Transforms/Utils/CMakeLists.txt
index 989a672edd..e3313e07b2 100644
--- a/backend/llvm/lib/Transforms/Utils/CMakeLists.txt
+++ b/backend/llvm/lib/Transforms/Utils/CMakeLists.txt
@@ -1,4 +1,4 @@
-set(LLVM_Utils_DIR ${LLVM_PROJECT_SOURCE_DIR}/llvm/lib/Transforms/Utils)
+set(LLVM_Utils_DIR ${LLVM_MAIN_SRC_DIR}/lib/Transforms/Utils)
add_llvm_component_library(LLVMBuddyTransformUtils
diff --git a/backend/llvm/lib/Transforms/Vectorize/CMakeLists.txt b/backend/llvm/lib/Transforms/Vectorize/CMakeLists.txt
index e9cece2c46..669aae5850 100644
--- a/backend/llvm/lib/Transforms/Vectorize/CMakeLists.txt
+++ b/backend/llvm/lib/Transforms/Vectorize/CMakeLists.txt
@@ -1,4 +1,4 @@
-set(LLVM_Vectorize_DIR ${LLVM_PROJECT_SOURCE_DIR}/llvm/lib/Transforms/Vectorize)
+set(LLVM_Vectorize_DIR ${LLVM_MAIN_SRC_DIR}/lib/Transforms/Vectorize)
add_llvm_component_library(LLVMBuddyVectorize
${LLVM_Vectorize_DIR}/LoadStoreVectorizer.cpp
diff --git a/docs/PythonEnvironment.md b/docs/PythonEnvironment.md
new file mode 100644
index 0000000000..77f431e85c
--- /dev/null
+++ b/docs/PythonEnvironment.md
@@ -0,0 +1,10 @@
+# Python Virtual Environment Setup Guide for Buddy-mlir
+
+We recommend using anaconda3 to create a Python virtual environment. You should install the Python packages listed in buddy-mlir/requirements.txt.
+
+```bash
+$ conda create -n buddy python=3.11
+$ conda activate buddy
+$ cd buddy-mlir
+$ pip install -r requirements.txt
+```
\ No newline at end of file
diff --git a/docs/RVVEnviroment.md b/docs/RVVEnvironment.md
similarity index 100%
rename from docs/RVVEnviroment.md
rename to docs/RVVEnvironment.md
diff --git a/examples/BuddyBert/CMakeLists.txt b/examples/BuddyBert/CMakeLists.txt
index 93dc7c2daa..95c98dfa96 100644
--- a/examples/BuddyBert/CMakeLists.txt
+++ b/examples/BuddyBert/CMakeLists.txt
@@ -7,13 +7,13 @@ add_custom_command(
add_custom_command(
OUTPUT forward.o
- COMMAND ${LLVM_MLIR_BINARY_DIR}/mlir-opt ${BUDDY_EXAMPLES_DIR}/BuddyBert/forward.mlir
+ COMMAND ${LLVM_TOOLS_BINARY_DIR}/mlir-opt ${BUDDY_EXAMPLES_DIR}/BuddyBert/forward.mlir
-pass-pipeline "builtin.module(func.func(tosa-to-linalg-named, tosa-to-linalg, tosa-to-tensor, tosa-to-arith), empty-tensor-to-alloc-tensor, convert-elementwise-to-linalg, arith-bufferize, func.func(linalg-bufferize, tensor-bufferize), func-bufferize)" |
- ${LLVM_MLIR_BINARY_DIR}/mlir-opt
+ ${LLVM_TOOLS_BINARY_DIR}/mlir-opt
-pass-pipeline "builtin.module(func.func(buffer-deallocation-simplification, convert-linalg-to-loops), eliminate-empty-tensors, func.func(llvm-request-c-wrappers),convert-math-to-llvm, convert-math-to-libm, convert-scf-to-cf, convert-arith-to-llvm, expand-strided-metadata, finalize-memref-to-llvm, convert-func-to-llvm, reconcile-unrealized-casts)" |
- ${LLVM_MLIR_BINARY_DIR}/mlir-translate -mlir-to-llvmir |
- ${LLVM_MLIR_BINARY_DIR}/llvm-as |
- ${LLVM_MLIR_BINARY_DIR}/llc -filetype=obj -relocation-model=pic -O0 -o ${BUDDY_BINARY_DIR}/../examples/BuddyBert/forward.o
+ ${LLVM_TOOLS_BINARY_DIR}/mlir-translate -mlir-to-llvmir |
+ ${LLVM_TOOLS_BINARY_DIR}/llvm-as |
+ ${LLVM_TOOLS_BINARY_DIR}/llc -filetype=obj -relocation-model=pic -O0 -o ${BUDDY_BINARY_DIR}/../examples/BuddyBert/forward.o
DEPENDS ${BUDDY_EXAMPLES_DIR}/BuddyBert/forward.mlir
COMMENT "Building forward.o"
VERBATIM)
@@ -22,11 +22,11 @@ add_custom_command(
OUTPUT subgraph0.o
COMMAND ${BUDDY_BINARY_DIR}/buddy-opt ${BUDDY_EXAMPLES_DIR}/BuddyBert/subgraph0.mlir
-pass-pipeline "builtin.module(func.func(tosa-to-linalg-named, tosa-to-linalg, tosa-to-tensor, tosa-to-arith), empty-tensor-to-alloc-tensor, convert-elementwise-to-linalg, func-bufferize-dynamic-offset, arith-bufferize, func.func(linalg-bufferize, tensor-bufferize))" |
- ${LLVM_MLIR_BINARY_DIR}/mlir-opt
+ ${LLVM_TOOLS_BINARY_DIR}/mlir-opt
-pass-pipeline "builtin.module(func.func(buffer-deallocation-simplification, convert-linalg-to-loops), eliminate-empty-tensors, func.func(llvm-request-c-wrappers),convert-math-to-llvm, convert-math-to-libm, convert-scf-to-cf, convert-arith-to-llvm, expand-strided-metadata, finalize-memref-to-llvm, convert-func-to-llvm, reconcile-unrealized-casts)" |
- ${LLVM_MLIR_BINARY_DIR}/mlir-translate -mlir-to-llvmir |
- ${LLVM_MLIR_BINARY_DIR}/llvm-as |
- ${LLVM_MLIR_BINARY_DIR}/llc -filetype=obj -relocation-model=pic -O0 -o ${BUDDY_BINARY_DIR}/../examples/BuddyBert/subgraph0.o
+ ${LLVM_TOOLS_BINARY_DIR}/mlir-translate -mlir-to-llvmir |
+ ${LLVM_TOOLS_BINARY_DIR}/llvm-as |
+ ${LLVM_TOOLS_BINARY_DIR}/llc -filetype=obj -relocation-model=pic -O0 -o ${BUDDY_BINARY_DIR}/../examples/BuddyBert/subgraph0.o
DEPENDS ${BUDDY_EXAMPLES_DIR}/BuddyBert/subgraph0.mlir
COMMENT "Building subgraph0.o"
VERBATIM)
@@ -36,7 +36,7 @@ add_library(BERT STATIC forward.o subgraph0.o)
SET_TARGET_PROPERTIES(BERT PROPERTIES LINKER_LANGUAGE C)
add_executable(buddy-bert-run bert-main.cpp)
-target_link_directories(buddy-bert-run PRIVATE ${LLVM_MLIR_LIBRARY_DIR})
+target_link_directories(buddy-bert-run PRIVATE ${LLVM_LIBRARY_DIR})
set(BUDDY_BERT_LIBS BERT mlir_c_runner_utils)
target_link_libraries(buddy-bert-run ${BUDDY_BERT_LIBS})
diff --git a/examples/BuddyConvolution/.gitignore b/examples/BuddyConvolution/.gitignore
index 0194ea7a68..df9389428a 100644
--- a/examples/BuddyConvolution/.gitignore
+++ b/examples/BuddyConvolution/.gitignore
@@ -1,3 +1,4 @@
log.mlir
log.ll
log.s
+a.out
diff --git a/examples/BuddyConvolution/conv2d-nhwc-fhwc-opt.mlir b/examples/BuddyConvolution/conv2d-nhwc-fhwc-opt.mlir
new file mode 100644
index 0000000000..76d5e4d932
--- /dev/null
+++ b/examples/BuddyConvolution/conv2d-nhwc-fhwc-opt.mlir
@@ -0,0 +1,137 @@
+// RUN: buddy-opt %s \
+// RUN: -convert-vector-to-scf \
+// RUN: -lower-affine \
+// RUN: -arith-bufferize \
+// RUN: -convert-scf-to-cf \
+// RUN: -convert-vector-to-llvm \
+// RUN: -convert-arith-to-llvm \
+// RUN: -finalize-memref-to-llvm \
+// RUN: -convert-func-to-llvm \
+// RUN: -reconcile-unrealized-casts \
+// RUN: | mlir-cpu-runner -O3 -e main -entry-point-result=void \
+// RUN: -shared-libs=%mlir_runner_utils_dir/libmlir_runner_utils%shlibext \
+// RUN: -shared-libs=%mlir_runner_utils_dir/libmlir_c_runner_utils%shlibext \
+// RUN: | FileCheck %s
+
+// Using `8` as the vector size.
+#map = affine_map<(d0) -> (d0 floordiv 8)>
+#map0 = affine_map<(d0, d1, d2, d3) -> (d2)>
+#map1 = affine_map<(d0, d1) -> (d0 + d1)>
+#map2 = affine_map<(d0, d1) -> (d0 + d1 * 8)>
+#map3 = affine_map<(d0) -> (d0 * 8)>
+
+module {
+ func.func private @printMemrefF32(memref<*xf32>)
+ func.func private @rtclock() -> f64
+
+ func.func @conv_2d_nhwc_fhwc(%arg0: memref, %arg1: memref, %arg2: memref) {
+ %f0 = arith.constant 0. : f32
+ %c0 = arith.constant 0 : index
+ %c1 = arith.constant 1 : index
+ %c2 = arith.constant 2 : index
+ %c3 = arith.constant 3 : index
+ %n = memref.dim %arg0, %c0 : memref
+ %h_i = memref.dim %arg0, %c1 : memref
+ %w_i = memref.dim %arg0, %c2 : memref
+ %c = memref.dim %arg0, %c3 : memref
+ %f = memref.dim %arg1, %c0 : memref
+ %h_k = memref.dim %arg1, %c1 : memref
+ %w_k = memref.dim %arg1, %c2 : memref
+ %h_o = memref.dim %arg2, %c1 : memref
+ %w_o = memref.dim %arg2, %c2 : memref
+
+ // Output is NHoWoF
+ affine.for %idx_n = %c0 to %n {
+ affine.for %idx_f = %c0 to %f {
+ affine.for %idx_c = %c0 to %c {
+ affine.for %idx_h_o = %c0 to %h_o {
+ affine.for %idx_h_k = %c0 to %h_k {
+ affine.for %idx_w_k = %c0 to %w_k {
+ affine.for %idx_w_o = %c0 to #map(%w_o) {
+ %kernel_ele = memref.load %arg1[%idx_f, %idx_h_k, %idx_w_k, %idx_c] : memref
+ %kernel_vec = vector.broadcast %kernel_ele : f32 to vector<8xf32>
+ %in_iter_h = affine.apply #map1 (%idx_h_k, %idx_h_o)
+ %in_iter_w = affine.apply #map2 (%idx_w_k, %idx_w_o)
+ %out_iter_w = affine.apply #map3 (%idx_w_o)
+ %input_vec = vector.transfer_read %arg0[%idx_n, %in_iter_h, %in_iter_w, %idx_c], %f0
+ { permutation_map = #map0 } : memref, vector<8xf32>
+ %output_vec = vector.transfer_read %arg2[%idx_n, %idx_h_o, %out_iter_w, %idx_f], %f0
+ { permutation_map = #map0 } : memref, vector<8xf32>
+ %res_vec = vector.fma %kernel_vec, %input_vec, %output_vec : vector<8xf32>
+ vector.transfer_write %res_vec, %arg2[%idx_n, %idx_h_o, %out_iter_w, %idx_f]
+ { permutation_map = #map0 } : vector<8xf32>, memref
+ }
+ }
+ }
+ }
+ }
+ }
+ }
+
+ return
+ }
+
+ func.func @alloc_f32(%arg0: index, %arg1: index, %arg2: index, %arg3: index, %arg4: f32) -> memref {
+ %c0 = arith.constant 0 : index
+ %c1 = arith.constant 1 : index
+ %0 = memref.alloc(%arg0, %arg1, %arg2, %arg3) : memref
+ scf.for %idx0 = %c0 to %arg0 step %c1 {
+ scf.for %idx1 = %c0 to %arg1 step %c1 {
+ scf.for %idx2 = %c0 to %arg2 step %c1 {
+ scf.for %idx3 = %c0 to %arg3 step %c1 {
+ memref.store %arg4, %0[%idx0, %idx1, %idx2, %idx3] : memref
+ }
+ }
+ }
+ }
+ return %0 : memref
+ }
+
+ func.func @main() {
+ %f0 = arith.constant 0.000000e+00 : f32
+ %f2 = arith.constant 2.000000e+00 : f32
+ %f3 = arith.constant 3.000000e+00 : f32
+
+ %c1 = arith.constant 1 : index
+ %c2 = arith.constant 2 : index
+ %c3 = arith.constant 3 : index
+ %c5 = arith.constant 5 : index
+ %c6 = arith.constant 6 : index
+ %c8 = arith.constant 8 : index
+ %c12 = arith.constant 12 : index
+ %c16 = arith.constant 16 : index
+ %c24 = arith.constant 24 : index
+ %c28 = arith.constant 28 : index
+
+ // %v0 = call @alloc_f32(%c1, %c12, %c12, %c6, %f2) : (index, index, index, index, f32) -> memref
+ // %v1 = call @alloc_f32(%c16, %c5, %c5, %c6, %f3) : (index, index, index, index, f32) -> memref
+ // %v2 = call @alloc_f32(%c1, %c8, %c8, %c16, %f0) : (index, index, index, index, f32) -> memref
+
+ %v0 = call @alloc_f32(%c1, %c28, %c28, %c1, %f2) : (index, index, index, index, f32) -> memref
+ %v1 = call @alloc_f32(%c6, %c5, %c5, %c1, %f3) : (index, index, index, index, f32) -> memref
+ %v2 = call @alloc_f32(%c1, %c24, %c24, %c6, %f0) : (index, index, index, index, f32) -> memref
+
+ %t_start = call @rtclock() : () -> f64
+ call @conv_2d_nhwc_fhwc(%v0, %v1, %v2) : (memref, memref, memref) -> ()
+ %t_end = call @rtclock() : () -> f64
+
+ // All the elements of the MemRef are the same,
+ // only check the first line to verify the correctness.
+ // CHECK: Unranked Memref
+ // CHECK: [
+ // CHECK: [
+ // CHECK: [
+ // CHECK: [150{{(, 150)*}}],
+ %print_v2 = memref.cast %v2 : memref to memref<*xf32>
+ call @printMemrefF32(%print_v2) : (memref<*xf32>) -> ()
+
+ %time = arith.subf %t_end, %t_start : f64
+ vector.print %time : f64
+
+ memref.dealloc %v0 : memref
+ memref.dealloc %v1 : memref
+ memref.dealloc %v2 : memref
+
+ return
+ }
+}
diff --git a/examples/BuddyConvolution/conv2d-nhwc-fhwc.mlir b/examples/BuddyConvolution/conv2d-nhwc-fhwc.mlir
new file mode 100644
index 0000000000..90759355e9
--- /dev/null
+++ b/examples/BuddyConvolution/conv2d-nhwc-fhwc.mlir
@@ -0,0 +1,88 @@
+// RUN: buddy-opt %s \
+// RUN: -convert-linalg-to-loops \
+// RUN: -lower-affine \
+// RUN: -arith-bufferize \
+// RUN: -convert-scf-to-cf \
+// RUN: -convert-vector-to-llvm \
+// RUN: -convert-arith-to-llvm \
+// RUN: -finalize-memref-to-llvm \
+// RUN: -convert-func-to-llvm \
+// RUN: -reconcile-unrealized-casts \
+// RUN: | mlir-cpu-runner -e main -entry-point-result=void \
+// RUN: -shared-libs=%mlir_runner_utils_dir/libmlir_runner_utils%shlibext \
+// RUN: -shared-libs=%mlir_runner_utils_dir/libmlir_c_runner_utils%shlibext \
+// RUN: | FileCheck %s
+
+module {
+ func.func private @printMemrefF32(memref<*xf32>)
+ func.func private @rtclock() -> f64
+
+ func.func @conv_2d_nhwc_fhwc(%arg0: memref, %arg1: memref, %arg2: memref) {
+ linalg.conv_2d_nhwc_fhwc ins (%arg0, %arg1: memref, memref)
+ outs (%arg2: memref)
+ return
+ }
+
+ func.func @alloc_f32(%arg0: index, %arg1: index, %arg2: index, %arg3: index, %arg4: f32) -> memref {
+ %c0 = arith.constant 0 : index
+ %c1 = arith.constant 1 : index
+ %0 = memref.alloc(%arg0, %arg1, %arg2, %arg3) : memref
+ scf.for %idx0 = %c0 to %arg0 step %c1 {
+ scf.for %idx1 = %c0 to %arg1 step %c1 {
+ scf.for %idx2 = %c0 to %arg2 step %c1 {
+ scf.for %idx3 = %c0 to %arg3 step %c1 {
+ memref.store %arg4, %0[%idx0, %idx1, %idx2, %idx3] : memref
+ }
+ }
+ }
+ }
+ return %0 : memref
+ }
+
+ func.func @main() {
+ %f0 = arith.constant 0.000000e+00 : f32
+ %f2 = arith.constant 2.000000e+00 : f32
+ %f3 = arith.constant 3.000000e+00 : f32
+
+ %c1 = arith.constant 1 : index
+ %c2 = arith.constant 2 : index
+ %c3 = arith.constant 3 : index
+ %c5 = arith.constant 5 : index
+ %c6 = arith.constant 6 : index
+ %c8 = arith.constant 8 : index
+ %c12 = arith.constant 12 : index
+ %c16 = arith.constant 16 : index
+ %c24 = arith.constant 24 : index
+ %c28 = arith.constant 28 : index
+
+ // %v0 = call @alloc_f32(%c1, %c12, %c12, %c6, %f2) : (index, index, index, index, f32) -> memref
+ // %v1 = call @alloc_f32(%c16, %c5, %c5, %c6, %f3) : (index, index, index, index, f32) -> memref
+ // %v2 = call @alloc_f32(%c1, %c8, %c8, %c16, %f0) : (index, index, index, index, f32) -> memref
+
+ %v0 = call @alloc_f32(%c1, %c28, %c28, %c1, %f2) : (index, index, index, index, f32) -> memref
+ %v1 = call @alloc_f32(%c6, %c5, %c5, %c1, %f3) : (index, index, index, index, f32) -> memref
+ %v2 = call @alloc_f32(%c1, %c24, %c24, %c6, %f0) : (index, index, index, index, f32) -> memref
+
+ %t_start = call @rtclock() : () -> f64
+ call @conv_2d_nhwc_fhwc(%v0, %v1, %v2) : (memref, memref, memref) -> ()
+ %t_end = call @rtclock() : () -> f64
+
+ // All the elements of the MemRef are the same,
+ // only check the first line to verify the correctness.
+ // CHECK: Unranked Memref
+ // CHECK: [
+ // CHECK: [
+ // CHECK: [
+ // CHECK: [150{{(, 150)*}}],
+ %print_v2 = memref.cast %v2 : memref to memref<*xf32>
+ call @printMemrefF32(%print_v2) : (memref<*xf32>) -> ()
+
+ %time = arith.subf %t_end, %t_start : f64
+ vector.print %time : f64
+
+ memref.dealloc %v0 : memref
+ memref.dealloc %v1 : memref
+ memref.dealloc %v2 : memref
+ return
+ }
+}
diff --git a/examples/BuddyConvolution/makefile b/examples/BuddyConvolution/makefile
index 063832fa0f..1962643766 100644
--- a/examples/BuddyConvolution/makefile
+++ b/examples/BuddyConvolution/makefile
@@ -1,10 +1,12 @@
#!/bin/bash
BUDDY_OPT := ../../build/bin/buddy-opt
MLIR_OPT := ../../llvm/build/bin/mlir-opt
+CLANG := ../../llvm/build/bin/clang
MLIR_TRANSLATE := ../../llvm/build/bin/mlir-translate
MLIR_CPU_RUNNER := ../../llvm/build/bin/mlir-cpu-runner
LLC := ../../llvm/build/bin/llc
-OPT_FLAG := -O0
+OPT_FLAG := -O3
+MLIR_LIB := ../../llvm/build/lib/
ifeq ($(shell uname),Linux)
MLIR_RUNNER_UTILS := ../../llvm/build/lib/libmlir_runner_utils.so
@@ -61,3 +63,65 @@ conv2d-run:
-reconcile-unrealized-casts | \
${MLIR_CPU_RUNNER} ${OPT_FLAG} -e main -entry-point-result=void \
-shared-libs=${MLIR_RUNNER_UTILS} -shared-libs=${MLIR_C_RUNNER_UTILS}
+
+conv2d-nhwc-fhwc-run:
+ @${BUDDY_OPT} ./conv2d-nhwc-fhwc.mlir \
+ -convert-linalg-to-loops \
+ -lower-affine \
+ -arith-bufferize \
+ -convert-scf-to-cf \
+ -convert-vector-to-llvm \
+ -convert-arith-to-llvm \
+ -finalize-memref-to-llvm \
+ -convert-func-to-llvm \
+ -reconcile-unrealized-casts | \
+ ${MLIR_CPU_RUNNER} ${OPT_FLAG} -e main -entry-point-result=void \
+ -shared-libs=${MLIR_RUNNER_UTILS} -shared-libs=${MLIR_C_RUNNER_UTILS}
+
+conv2d-nhwc-fhwc-aot:
+ @${BUDDY_OPT} ./conv2d-nhwc-fhwc.mlir \
+ -convert-linalg-to-loops \
+ -lower-affine \
+ -arith-bufferize \
+ -convert-scf-to-cf \
+ -convert-vector-to-llvm \
+ -convert-arith-to-llvm \
+ -finalize-memref-to-llvm \
+ -convert-func-to-llvm \
+ -reconcile-unrealized-casts | \
+ ${MLIR_TRANSLATE} -mlir-to-llvmir -o log.ll
+ ${CLANG} log.ll ${OPT_FLAG} \
+ -L${MLIR_LIB} -lmlir_runner_utils -lmlir_c_runner_utils \
+ -o a.out
+ @LD_LIBRARY_PATH=${MLIR_LIB} ./a.out
+
+conv2d-nhwc-fhwc-opt-run:
+ @${BUDDY_OPT} ./conv2d-nhwc-fhwc-opt.mlir \
+ -convert-vector-to-scf \
+ -lower-affine \
+ -arith-bufferize \
+ -convert-scf-to-cf \
+ -convert-vector-to-llvm \
+ -convert-arith-to-llvm \
+ -finalize-memref-to-llvm \
+ -convert-func-to-llvm \
+ -reconcile-unrealized-casts | \
+ ${MLIR_CPU_RUNNER} -O3 -e main -entry-point-result=void \
+ -shared-libs=${MLIR_RUNNER_UTILS} -shared-libs=${MLIR_C_RUNNER_UTILS}
+
+conv2d-nhwc-fhwc-opt-aot:
+ @${BUDDY_OPT} ./conv2d-nhwc-fhwc-opt.mlir \
+ -convert-vector-to-scf \
+ -lower-affine \
+ -arith-bufferize \
+ -convert-scf-to-cf \
+ -convert-vector-to-llvm \
+ -convert-arith-to-llvm \
+ -finalize-memref-to-llvm \
+ -convert-func-to-llvm \
+ -reconcile-unrealized-casts | \
+ ${MLIR_TRANSLATE} -mlir-to-llvmir -o log.ll
+ ${CLANG} log.ll -O3 \
+ -L${MLIR_LIB} -lmlir_runner_utils -lmlir_c_runner_utils \
+ -o a.out
+ @LD_LIBRARY_PATH=${MLIR_LIB} ./a.out
diff --git a/examples/BuddyGPU/.gitignore b/examples/BuddyGPU/.gitignore
new file mode 100644
index 0000000000..d82aeb33bb
--- /dev/null
+++ b/examples/BuddyGPU/.gitignore
@@ -0,0 +1,4 @@
+log.mlir
+log.ll
+log.s
+matmul-cubin.mlir
diff --git a/examples/BuddyGPU/README.md b/examples/BuddyGPU/README.md
new file mode 100644
index 0000000000..7c4081e401
--- /dev/null
+++ b/examples/BuddyGPU/README.md
@@ -0,0 +1,40 @@
+# Buddy GPU Example
+This example demonstrates how to use Buddy Compiler to run a simple single-kernel program on a GPU.
+
+## Matmul
+The example program is a simple matrix multiplication kernel. The linalg definition is in the `matmul.mlir` file.
+A transform sequence is in `transform.mlir` to optimize this kernel and prepare it for execution on the GPU.
+The `matmul-cubin.mlir` provides a lowered file, in case the pipeline is not working.
+
+Run the following command to compile and run the program:
+```
+ make buddy-gpu-matmul
+ python run-module-gpu.py --source matmul.mlir --target matmul-cubin.mlir --llvm_dir ../../llvm
+```
+
+The result should be:
+```
+[[502.9141 499.7761 511.35623 ... 500.9083 505.25574 511.03818]
+ [499.57034 494.8066 506.427 ... 492.7868 497.22513 509.95612]
+ [511.2017 516.017 513.631 ... 515.5991 515.6389 521.8318 ]
+ ...
+ [496.2721 496.3155 506.08054 ... 502.36798 505.94202 516.3577 ]
+ [512.06866 505.80127 518.81934 ... 510.64966 510.10333 531.85364]
+ [501.23514 500.17123 505.71808 ... 496.4447 500.5735 514.4204 ]]
+[[503.26013 500.11093 511.70193 ... 501.24622 505.60373 511.38376]
+ [499.89877 495.13043 506.762 ... 493.1151 497.5555 510.29483]
+ [511.54883 516.35547 513.9717 ... 515.944 515.9865 522.1828 ]
+ ...
+ [496.59937 496.63785 506.41483 ... 502.70337 506.27927 516.6994 ]
+ [512.4154 506.1411 519.17175 ... 510.9929 510.45322 532.2152 ]
+ [501.57388 500.5093 506.06213 ... 496.7807 500.91638 514.77124]]
+MLIR equal to NumPy? True
+```
+
+As the tensor core does not support fp32 computation, the operands are converted to tf32; hence the result is not exactly the same as the NumPy result.
+
+### Profiling
+You need to install nsight compute first.
+```
+ncu -o profile-result --set full python run-module-gpu.py --source matmul.mlir --target matmul-cubin.mlir --llvm_dir ../../llvm
+```
\ No newline at end of file
diff --git a/examples/BuddyGPU/makefile b/examples/BuddyGPU/makefile
new file mode 100644
index 0000000000..5dbd9c25cd
--- /dev/null
+++ b/examples/BuddyGPU/makefile
@@ -0,0 +1,22 @@
+#!/bin/bash
+BUDDY_OPT := ../../build/bin/buddy-opt
+MLIR_OPT := ../../llvm/build/bin/mlir-opt
+MLIR_TRANSLATE := ../../llvm/build/bin/mlir-translate
+MLIR_CPU_RUNNER := ../../llvm/build/bin/mlir-cpu-runner
+LLC := ../../llvm/build/bin/llc
+
+buddy-gpu-matmul-lower:
+ @${BUDDY_OPT} matmul.mlir \
+ -transform-preload-library="transform-library-paths=transform.mlir" \
+ -transform-interpreter="entry-point=codegen" \
+ -o log.mlir
+
+buddy-gpu-matmul:
+ @${BUDDY_OPT} matmul.mlir -transform-preload-library="transform-library-paths=transform.mlir" -transform-interpreter="entry-point=codegen" | \
+ ${BUDDY_OPT} --pass-pipeline='builtin.module(func.func(nvgpu-optimize-shared-memory))' | \
+ ${BUDDY_OPT} -arith-expand -eliminate-empty-tensors -empty-tensor-to-alloc-tensor -linalg-bufferize -convert-linalg-to-affine-loops -affine-loop-fusion -affine-parallelize -lower-affine -canonicalize -func-bufferize -arith-bufferize -tensor-bufferize -buffer-deallocation -finalizing-bufferize -canonicalize | \
+ ${BUDDY_OPT} -gpu-launch-sink-index-computations -canonicalize -legalize-shmem-outlining -canonicalize | \
+ ${BUDDY_OPT} -convert-memcpy-to-gpu -gpu-async-region -canonicalize | \
+ ${BUDDY_OPT} -convert-scf-to-cf -memref-expand -finalize-memref-to-llvm -convert-arith-to-llvm --convert-vector-to-llvm -convert-gpu-to-nvvm='has-redux=1' | \
+ ${BUDDY_OPT} -llvm-request-c-wrappers -canonicalize -cse -sccp | \
+ ${MLIR_OPT} --test-lower-to-nvvm="cubin-chip=sm_80 cubin-features=+ptx71 cubin-format=fatbin" -o matmul-cubin.mlir
diff --git a/examples/BuddyGPU/matmul.mlir b/examples/BuddyGPU/matmul.mlir
new file mode 100644
index 0000000000..2f0fa226c1
--- /dev/null
+++ b/examples/BuddyGPU/matmul.mlir
@@ -0,0 +1,12 @@
+!unit = f32
+!lhs = tensor<5376x2048x!unit>
+!rhs = tensor<2048x5376x!unit>
+!res = tensor<5376x5376x!unit>
+
+func.func @matmul(%arg0: !lhs, %arg1: !rhs) -> !res {
+ %cst = arith.constant 0.000000e+00 : !unit
+ %0 = tensor.empty() : !res
+ %1 = linalg.fill ins(%cst : !unit) outs(%0 : !res) -> !res
+ %2 = linalg.matmul ins(%arg0, %arg1: !lhs, !rhs) outs(%1: !res) -> !res
+ func.return %2 : !res
+}
diff --git a/examples/BuddyGPU/run-module-gpu.py b/examples/BuddyGPU/run-module-gpu.py
new file mode 100644
index 0000000000..a7b210b379
--- /dev/null
+++ b/examples/BuddyGPU/run-module-gpu.py
@@ -0,0 +1,145 @@
+# ===- run-module-gpu.py --------------------------------------------------===//
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+# ===----------------------------------------------------------------------===//
+#
+# This file is a script to test whether the specified MLIR module on the GPU
+# calculates the same result as NumPy.
+#
+# ===----------------------------------------------------------------------===//
+
+import mlir.dialects.func as func
+from mlir.passmanager import *
+from mlir.execution_engine import *
+from mlir import runtime as rt
+from mlir.ir import *
+import numpy as np
+import ctypes
+import argparse as ap
+
+
+def to_numpy(element_type: str) -> np.dtype:
+ match element_type:
+ case "f16":
+ return np.float16
+ case "f32":
+ return np.float32
+ case "f64":
+ return np.float64
+ case "i8":
+ return np.int8
+ case "i16":
+ return np.int16
+ case "i32":
+ return np.int32
+ case "i64":
+ return np.int64
+ case "bf16":
+ return np.dtype("bfloat16")
+ case _:
+ raise ValueError(f"Unsupported type: {element_type}")
+
+
+def new_ranked_memref_descriptor(nparray: np.ndarray):
+ if nparray.dtype == "bfloat16":
+ ctp = rt.F16
+ else:
+ ctp = rt.as_ctype(nparray.dtype)
+
+ if nparray.ndim == 0:
+ x = rt.make_zero_d_memref_descriptor(ctp)()
+ x.allocated = nparray.ctypes.data
+ x.aligned = nparray.ctypes.data_as(ctypes.POINTER(ctp))
+ x.offset = ctypes.c_longlong(0)
+ return x
+
+ x = rt.make_nd_memref_descriptor(nparray.ndim, ctp)()
+ nbytes = nparray.nbytes
+ buffer = ctypes.create_string_buffer(nbytes)
+ ctypes.memmove(buffer, nparray.ctypes.data, nbytes)
+ x.allocated = ctypes.cast(buffer, ctypes.c_void_p).value
+ x.aligned = ctypes.cast(buffer, ctypes.POINTER(ctp))
+ x.offset = ctypes.c_longlong(0)
+ x.shape = nparray.ctypes.shape
+
+    # NumPy expresses strides in bytes, whereas MLIR memref descriptors
+    # express strides in numbers of elements, so convert accordingly.
+ strides_ctype_t = ctypes.c_longlong * nparray.ndim
+ x.strides = strides_ctype_t(
+ *[x // nparray.itemsize for x in nparray.strides]
+ )
+ return x
+
+
+def get_memref_descriptors(args: list[Type]):
+ memref_ptrs = []
+ for arg in args:
+ elem_type = to_numpy(str(arg.element_type))
+ np_arg = np.random.rand(*arg.shape).astype(elem_type)
+ memref_ptrs.append(
+ ctypes.pointer(ctypes.pointer(new_ranked_memref_descriptor(np_arg)))
+ )
+ return memref_ptrs
+
+
+def test(source, target, llvm_dir):
+ with Context() as ctx:
+ file = open(source, "r")
+ module: Module = Module.parse(file.read())
+ funcOp: func.FuncOp = (
+ module.operation.regions[0].blocks[0].operations[0]
+ )
+ funcName = str(funcOp.name).replace('"', "")
+ assert isinstance(funcOp, func.FuncOp)
+ args_type: list[Type] = [arg.type for arg in funcOp.arguments]
+ res_type = funcOp.type.results
+
+ file = open(target, "r")
+ # newModule = lower_to_llvm_cpu(module)
+ newModule = Module.parse(file.read())
+ memref_ptrs = get_memref_descriptors(res_type + args_type)
+
+ engine = ExecutionEngine(
+ newModule,
+ shared_libs=[
+ llvm_dir + "/build/lib/libomp.so",
+ llvm_dir + "/build/lib/libmlir_c_runner_utils.so",
+ llvm_dir + "/build/lib/libmlir_async_runtime.so",
+ llvm_dir + "/build/lib/libmlir_runner_utils.so",
+ llvm_dir + "/build/lib/libmlir_cuda_runtime.so",
+ ],
+ opt_level=3,
+ )
+ engine.invoke(funcName, *memref_ptrs)
+ out = rt.ranked_memref_to_numpy(memref_ptrs[0][0])
+ if str(res_type[0].element_type) == "bf16":
+ print("Running on BF16 mode, skipping numpy comparison.")
+ else:
+ print(out)
+ input1 = rt.ranked_memref_to_numpy(memref_ptrs[1][0])
+ input2 = rt.ranked_memref_to_numpy(memref_ptrs[2][0])
+ numpy_out = np.matmul(input1, input2)
+ print(numpy_out)
+ print(
+ f"MLIR equal to NumPy? {np.allclose(out, numpy_out,rtol=1e-03, atol=1e-03)}"
+ )
+
+
+if __name__ == "__main__":
+ parser = ap.ArgumentParser()
+ parser.add_argument("--source", type=str, required=True)
+ parser.add_argument("--target", type=str, required=True)
+ parser.add_argument("--llvm_dir", type=str, required=True)
+ args = parser.parse_args()
+ test(args.source, args.target, args.llvm_dir)
diff --git a/examples/BuddyGPU/transform.mlir b/examples/BuddyGPU/transform.mlir
new file mode 100644
index 0000000000..e2a02a9a97
--- /dev/null
+++ b/examples/BuddyGPU/transform.mlir
@@ -0,0 +1,311 @@
+module attributes { transform.with_named_sequence } {
+ transform.named_sequence @codegen(%arg0: !transform.any_op) {
+ // Match the target operations and assign them to SSA values.
+ %matmul = transform.structured.match ops{["linalg.matmul"]} in %arg0
+ : (!transform.any_op) -> !transform.any_op
+ %fill = transform.structured.match ops{["linalg.fill"]} in %arg0
+ : (!transform.any_op) -> !transform.any_op
+
+ // Perform tiling for the grid.
+ // For the matrix multiplication of 5376x2048 and 2048x5376, the compilation
+ // strategy sets the tile size for grid-based partitioning to 128x256.
+ // This means that each [128, 2048] @ [2048, 256] matmul tile is computed within a GPU block,
+ // while multiple such blocks are computed in parallel across the grid.
+ // `tile_sizes` specify the dimensions of the tiled matmul result.
+ // `%tiled_op` is the tiled matmul operation within the `scf.forall` loop.
+ // `%forall_op` is the `scf.forall` loop that maintains tile information.
+ %tiled_op, %forall_op = transform.structured.tile_using_forall %matmul
+ tile_sizes [128, 256] (mapping = [#gpu.block, #gpu.block])
+ : (!transform.any_op) -> (!transform.any_op, !transform.any_op)
+
+ // Perform canonicalization.
+ %1 = transform.structured.match ops{["func.func"]} in %arg0 : (!transform.any_op) -> !transform.any_op
+ transform.apply_patterns to %1 {
+ transform.apply_patterns.linalg.tiling_canonicalization
+ transform.apply_patterns.scf.for_loop_canonicalization
+ transform.apply_patterns.canonicalization
+ } : !transform.any_op
+ transform.apply_cse to %1 : !transform.any_op
+ %all_loops = transform.structured.match interface{LoopLikeInterface}
+ in %arg0
+ : (!transform.any_op) -> !transform.any_op
+ transform.apply_licm to %all_loops : !transform.any_op
+ transform.apply_patterns to %1 {
+ transform.apply_patterns.linalg.tiling_canonicalization
+ } : !transform.any_op
+
+    // Fuse the fill operation into the scf.forall op.
+ %fused_op, %new_containing_op = transform.structured.fuse_into_containing_op %fill into %forall_op : (!transform.any_op, !transform.any_op) -> (!transform.any_op, !transform.any_op)
+
+ // Further tile the tiled matmul
+ // Tile the third dimension in matmul.
+ // [128, 2048] @ [2048, 256] matmul is further tiled into [128, 16] @ [16, 256] matmul.
+ %tiled_linalg_op, %loops = transform.structured.tile_using_for %tiled_op [0, 0, 16] : (!transform.any_op) -> (!transform.any_op, !transform.any_op)
+
+    // Create pad op and prepare for mapping to GPU.
+    // The computation itself is unchanged; padding only prepares the op for GPU mapping.
+ %padded, %pad, %copy = transform.structured.pad %tiled_linalg_op {copy_back_op = "none", pack_paddings = [1, 1, 1], pad_to_multiple_of = [1, 1, 1], padding_dimensions = [0, 1, 2], padding_values = [0.000000e+00 : f32, 0.000000e+00 : f32, 0.000000e+00 : f32]} : (!transform.any_op) -> (!transform.any_op, !transform.any_op, !transform.any_op)
+
+ // Rewrite tensor.pad into linalg.copy.
+ %3 = transform.get_producer_of_operand %padded[0] : (!transform.any_op) -> !transform.any_op
+ %4 = transform.get_producer_of_operand %padded[1] : (!transform.any_op) -> !transform.any_op
+ %5 = transform.get_producer_of_operand %padded[2] : (!transform.any_op) -> !transform.any_op
+ %6 = transform.structured.rewrite_in_destination_passing_style %3 : (!transform.any_op) -> !transform.any_op
+ %7 = transform.structured.rewrite_in_destination_passing_style %4 : (!transform.any_op) -> !transform.any_op
+ %8 = transform.structured.rewrite_in_destination_passing_style %5 : (!transform.any_op) -> !transform.any_op
+
+ // Tile the linalg.copy op and map it to GPU thread level,
+    // such that the tiled matrices are copied to GPU shared memory.
+    // num_threads is different from the tile_sizes used above:
+    // it specifies the number of tiles instead of the size of each tile.
+    // The first transform tiles the [128, 16] copy into [4, 4] tiles,
+    // and the second transform tiles the [16, 256] copy into [2, 16] tiles.
+ %tiled_op_0, %forall_op_1 = transform.structured.tile_using_forall %6 num_threads [32, 4](mapping = [#gpu.thread, #gpu.thread]) : (!transform.any_op) -> (!transform.any_op, !transform.any_op)
+ %tiled_op_2, %forall_op_3 = transform.structured.tile_using_forall %7 num_threads [8, 16](mapping = [#gpu.thread, #gpu.thread]) : (!transform.any_op) -> (!transform.any_op, !transform.any_op)
+
+ // Tile the linalg.matmul op and map it to GPU warp level.
+ %tiled_op_4, %forall_op_5 = transform.structured.tile_using_forall %padded num_threads [2, 2](mapping = [#gpu.warp, #gpu.warp]) : (!transform.any_op) -> (!transform.any_op, !transform.any_op)
+ // Tile the linalg.fill op and map it to GPU warp level.
+ %tiled_op_6, %forall_op_7 = transform.structured.tile_using_forall %fused_op num_threads [2, 2](mapping = [#gpu.warp, #gpu.warp]) : (!transform.any_op) -> (!transform.any_op, !transform.any_op)
+
+ // Perform canonicalization.
+ %9 = transform.structured.match ops{["func.func"]} in %arg0 : (!transform.any_op) -> !transform.any_op
+ transform.apply_patterns to %9 {
+ transform.apply_patterns.linalg.tiling_canonicalization
+ transform.apply_patterns.scf.for_loop_canonicalization
+ transform.apply_patterns.canonicalization
+ } : !transform.any_op
+ transform.apply_cse to %9 : !transform.any_op
+ %all_loops_2 = transform.structured.match interface{LoopLikeInterface}
+ in %9
+ : (!transform.any_op) -> !transform.any_op
+ transform.apply_licm to %all_loops_2 : !transform.any_op
+ transform.apply_patterns to %9 {
+ transform.apply_patterns.linalg.tiling_canonicalization
+ transform.apply_patterns.vector.lower_masked_transfers
+ } : !transform.any_op
+
+ // Perform vectorization.
+ // Vectorize the linalg.copy, linalg.fill, and linalg.matmul operations.
+ %10 = transform.structured.vectorize_children_and_apply_patterns %9 : (!transform.any_op) -> !transform.any_op
+
+ // Perform canonicalization.
+ transform.apply_patterns to %10 {
+ transform.apply_patterns.linalg.tiling_canonicalization
+ transform.apply_patterns.scf.for_loop_canonicalization
+ transform.apply_patterns.canonicalization
+ } : !transform.any_op
+ transform.apply_cse to %10 : !transform.any_op
+ %all_loops_3 = transform.structured.match interface{LoopLikeInterface}
+ in %10
+ : (!transform.any_op) -> !transform.any_op
+ transform.apply_licm to %all_loops_3 : !transform.any_op
+ transform.apply_patterns to %10 {
+ transform.apply_patterns.linalg.tiling_canonicalization
+ transform.apply_patterns.vector.lower_masked_transfers
+ } : !transform.any_op
+
+ // Match bufferization.alloc_tensors inside the forall op
+ %scf_forall = transform.structured.match ops{["scf.forall"]} attributes{mapping = [#gpu.block, #gpu.block]} in %arg0 : (!transform.any_op) -> !transform.any_op
+ %alloc_tensor_ops = transform.structured.match ops{["bufferization.alloc_tensor"]} in %scf_forall : (!transform.any_op) -> !transform.any_op
+
+ // Bufferize the alloc_tensor ops to memref.alloc ops.
+    // In the memory_space attribute used by the GPU dialect, 0 means global memory, 3 means workgroup (shared) memory, and 5 means private memory.
+ // According to https://discourse.llvm.org/t/rfc-memref-memory-shape-as-attribute/2229
+ %buffer, %new_ops = transform.structured.bufferize_to_allocation %alloc_tensor_ops {memory_space = 3 } : !transform.any_op
+
+ // Eliminate empty tensors and erase unnecessary inputs.
+ transform.structured.eliminate_empty_tensors %arg0 : !transform.any_op
+ %func_eras = transform.structured.match ops{["func.func"]} in %arg0 : (!transform.any_op) -> !transform.any_op
+ transform.apply_patterns to %func_eras {
+ transform.apply_patterns.linalg.erase_unnecessary_inputs
+ } : !transform.any_op
+
+    // Bufferize the remaining operations in a single one-shot pass.
+ %11 = transform.bufferization.one_shot_bufferize %arg0 { bufferize_function_boundaries = true, function_boundary_type_conversion = 1 : i32} : (!transform.any_op) -> !transform.any_op
+
+ // Erase dead alloc and stores.
+ %12 = transform.structured.match ops{["func.func"]} in %11 : (!transform.any_op) -> !transform.any_op
+ transform.memref.erase_dead_alloc_and_stores %12 : (!transform.any_op) -> ()
+
+ // Generate GPU launch.
+ %13 = transform.structured.match ops{["func.func"]} in %11 : (!transform.any_op) -> !transform.any_op
+ %gpu_launch = transform.gpu.map_forall_to_blocks %13 { generate_gpu_launch } : (!transform.any_op) -> !transform.any_op
+
+ // Rewrite bufferized scf.forall ops to distributed gpu.thread_id attribute.
+ %mapped = transform.gpu.map_nested_forall_to_threads %gpu_launch block_dims = [64, 2, 1] warp_size = 32 : (!transform.any_op) -> !transform.any_op
+
+ %15 = transform.structured.match ops{["func.func"]} in %11 : (!transform.any_op) -> !transform.any_op
+
+ // Removes unnecessary GPU barriers from the function.
+ // %15 = transform.buddy.eliminate_gpu_barriers %14 : (!transform.any_op) -> !transform.any_op
+
+ // Perform canonicalization.
+ transform.apply_patterns to %15 {
+ transform.apply_patterns.linalg.tiling_canonicalization
+ transform.apply_patterns.scf.for_loop_canonicalization
+ transform.apply_patterns.canonicalization
+ } : !transform.any_op
+ transform.apply_cse to %15 : !transform.any_op
+ %all_loops_4 = transform.structured.match interface{LoopLikeInterface}
+ in %15
+ : (!transform.any_op) -> !transform.any_op
+ transform.apply_licm to %all_loops_4 : !transform.any_op
+ transform.apply_patterns to %15 {
+ transform.apply_patterns.linalg.tiling_canonicalization
+ transform.apply_patterns.vector.lower_masked_transfers
+ } : !transform.any_op
+
+ // Identify static memory allocations within the given region,
+ // and move them to a higher level (hoisting).
+ transform.buddy.hoist_static_alloc %15 : (!transform.any_op) -> ()
+
+ // Collects patterns for folding memref aliasing ops (memref.subview) into consumer load/store ops (affine.load, memref.load, nvgpu.ldmatrix, vector.load, vector.transfer_read, affine.store, memref.store, etc.) and other ops (e.g., memref.subview).
+ transform.apply_patterns to %15 {
+ transform.apply_patterns.memref.fold_memref_alias_ops
+ } : !transform.any_op
+ // Collects patterns for extracting address computations from operations with memory accesses such that these memory accesses use only a base pointer.
+ transform.apply_patterns to %15 {
+ transform.apply_patterns.memref.extract_address_computations
+ } : !transform.any_op
+ // Perform canonicalization.
+ transform.apply_patterns to %15 {
+ transform.apply_patterns.linalg.tiling_canonicalization
+ transform.apply_patterns.scf.for_loop_canonicalization
+ transform.apply_patterns.canonicalization
+ } : !transform.any_op
+ transform.apply_cse to %15 : !transform.any_op
+ %all_loops_5 = transform.structured.match interface{LoopLikeInterface}
+ in %15
+ : (!transform.any_op) -> !transform.any_op
+ transform.apply_licm to %all_loops_5 : !transform.any_op
+ transform.apply_patterns to %15 {
+ transform.apply_patterns.linalg.tiling_canonicalization
+ transform.apply_patterns.vector.lower_masked_transfers
+ } : !transform.any_op
+
+ // Adds patterns that unroll vectors to a native tile size for GPUs with mma operations
+ transform.apply_patterns to %15 {
+ transform.apply_patterns.buddy.unroll_vectors_gpu_mma_sync
+ } : !transform.any_op
+
+ // Insert a gpu.barrier after a given scf.for loop
+ %16 = transform.structured.match ops{["scf.for"]} in %15 : (!transform.any_op) -> !transform.op<"scf.for">
+ // transform.buddy.synchronize_loop %16 : (!transform.op<"scf.for">) -> ()
+
+
+ transform.apply_patterns to %15 {
+ transform.apply_patterns.memref.fold_memref_alias_ops
+ } : !transform.any_op
+ transform.apply_cse to %15 : !transform.any_op
+
+ // Hoist vector.transfer_read / vector.transfer_write pairs out of immediately enclosing scf::ForOp iteratively
+ // Warning: Deprecated
+ %17 = transform.structured.hoist_redundant_vector_transfers %15 : (!transform.any_op) -> !transform.any_op
+
+ // Perform canonicalization.
+ transform.apply_patterns to %17 {
+ transform.apply_patterns.linalg.tiling_canonicalization
+ transform.apply_patterns.scf.for_loop_canonicalization
+ transform.apply_patterns.canonicalization
+ } : !transform.any_op
+ transform.apply_cse to %17 : !transform.any_op
+ %all_loops_6 = transform.structured.match interface{LoopLikeInterface}
+ in %17
+ : (!transform.any_op) -> !transform.any_op
+ transform.apply_licm to %all_loops_6 : !transform.any_op
+ transform.apply_patterns to %17 {
+ transform.apply_patterns.linalg.tiling_canonicalization
+ transform.apply_patterns.vector.lower_masked_transfers
+ } : !transform.any_op
+
+    // This converts slices of operations containing a vector.contract op into
+    // mma operations, targeting warp-level tensor core operations.
+ transform.buddy.vector.vector_to_mma_conversion %17 {use_mma_sync} : (!transform.any_op) -> ()
+
+ // %18 = transform.buddy.eliminate_gpu_barriers %17 : (!transform.any_op) -> !transform.any_op
+
+ // Perform canonicalization.
+ transform.apply_patterns to %17 {
+ transform.apply_patterns.linalg.tiling_canonicalization
+ transform.apply_patterns.scf.for_loop_canonicalization
+ transform.apply_patterns.canonicalization
+ } : !transform.any_op
+ transform.apply_cse to %17 : !transform.any_op
+ %all_loops_7 = transform.structured.match interface{LoopLikeInterface}
+ in %17
+ : (!transform.any_op) -> !transform.any_op
+ transform.apply_licm to %all_loops_7 : !transform.any_op
+ transform.apply_patterns to %17 {
+ transform.apply_patterns.linalg.tiling_canonicalization
+ transform.apply_patterns.vector.lower_masked_transfers
+ } : !transform.any_op
+
+ %19 = transform.structured.match ops{["gpu.launch"]} in %17 : (!transform.any_op) -> !transform.any_op
+ %fwfa = transform.structured.match ops{["memref.alloc"]} in %19 : (!transform.any_op) -> !transform.op<"memref.alloc">
+
+ // Do multi-buffering/array expansion to remove dependencies on the temporary allocation between consecutive loop iterations.
+ transform.memref.multibuffer %fwfa {factor = 3 : i64, skip_analysis} : (!transform.op<"memref.alloc">) -> !transform.any_op
+
+ transform.apply_patterns to %17 {
+ transform.apply_patterns.vector.transfer_to_scf full_unroll = true
+ } : !transform.any_op
+ transform.apply_patterns to %17 {
+ transform.apply_patterns.linalg.tiling_canonicalization
+ transform.apply_patterns.scf.for_loop_canonicalization
+ transform.apply_patterns.canonicalization
+ } : !transform.any_op
+ transform.apply_cse to %17 : !transform.any_op
+ %all_loops_8 = transform.structured.match interface{LoopLikeInterface}
+ in %17
+ : (!transform.any_op) -> !transform.any_op
+ transform.apply_licm to %all_loops_8 : !transform.any_op
+ transform.apply_patterns to %17 {
+ transform.apply_patterns.linalg.tiling_canonicalization
+ transform.apply_patterns.vector.lower_masked_transfers
+ } : !transform.any_op
+
+ // Convert sync copies to shared memory to async.
+ // transform.buddy.create_async_groups %17 {use_mma_sync} : (!transform.any_op) -> ()
+ transform.apply_patterns to %17 {
+ transform.apply_patterns.linalg.tiling_canonicalization
+ transform.apply_patterns.scf.for_loop_canonicalization
+ transform.apply_patterns.canonicalization
+ transform.apply_patterns.memref.fold_memref_alias_ops
+ } : !transform.any_op
+ %all_loops_9 = transform.structured.match interface{LoopLikeInterface}
+ in %17
+ : (!transform.any_op) -> !transform.any_op
+ transform.apply_licm to %all_loops_9 : !transform.any_op
+ transform.apply_cse to %17 : !transform.any_op
+
+
+ %20 = transform.structured.match ops{["nvgpu.mma.sync"]} in %17 : (!transform.any_op) -> !transform.any_op
+ %21 = transform.get_parent_op %20 {deduplicate, op_name = "scf.for"} : (!transform.any_op) -> !transform.any_op
+ // This applies software pipelining to a given scf.for loop.
+ // The pipelining strategy will look for a copy to shared memory and pipeline it to overlap it with the rest of the loop.
+ // %22 = transform.buddy.pipeline_shared_memory_copies %21 {depth = 3 : i64, use_mma_sync, peel_epilogue} : (!transform.any_op) -> !transform.any_op
+
+ // Perform canonicalization.
+ transform.apply_patterns to %17 {
+ transform.apply_patterns.vector.lower_masks
+ } : !transform.any_op
+ transform.apply_patterns to %17 {
+ transform.apply_patterns.vector.materialize_masks
+ } : !transform.any_op
+ transform.apply_patterns to %17 {
+ transform.apply_patterns.linalg.tiling_canonicalization
+ transform.apply_patterns.scf.for_loop_canonicalization
+ transform.apply_patterns.canonicalization
+ transform.apply_patterns.memref.fold_memref_alias_ops
+ } : !transform.any_op
+
+ %all_loops_10 = transform.structured.match interface{LoopLikeInterface}
+ in %17
+ : (!transform.any_op) -> !transform.any_op
+ transform.apply_licm to %all_loops_10 : !transform.any_op
+ transform.apply_cse to %17 : !transform.any_op
+
+ transform.yield
+ }
+} // module
diff --git a/examples/BuddyGen/.gitignore b/examples/BuddyGen/.gitignore
new file mode 100644
index 0000000000..df9389428a
--- /dev/null
+++ b/examples/BuddyGen/.gitignore
@@ -0,0 +1,4 @@
+log.mlir
+log.ll
+log.s
+a.out
diff --git a/examples/BuddyGen/GenMemRef.cpp b/examples/BuddyGen/GenMemRef.cpp
new file mode 100644
index 0000000000..8ca2526b79
--- /dev/null
+++ b/examples/BuddyGen/GenMemRef.cpp
@@ -0,0 +1,43 @@
+//===- GenMemRef.cpp ------------------------------------------------------===//
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+//
+//===----------------------------------------------------------------------===//
+
+// $ export LLVM_DIR=$PWD/../../llvm/
+// $ export LLVM_BUILD_DIR=$LLVM_DIR/build
+// $ c++ GenMemRef.cpp \
+ -I $LLVM_DIR/llvm/include/ -I $LLVM_BUILD_DIR/include/ \
+ -I $LLVM_DIR/mlir/include/ -I $LLVM_BUILD_DIR/tools/mlir/include/ \
+ -L$LLVM_BUILD_DIR/lib -lMLIRIR -lMLIRParser -lMLIRSupport -lLLVMCore \
+ -lLLVMSupport -lncurses -ltinfo -lstdc++ -lLLVMDemangle \
+ -o a.out
+// $ ./a.out
+
+#include "mlir/Dialect/MemRef/IR/MemRef.h"
+#include "mlir/IR/BuiltinTypes.h"
+#include "mlir/IR/MLIRContext.h"
+
+int main() {
+ mlir::MLIRContext context;
+ mlir::OpBuilder builder(&context);
+ mlir::Type eleType = builder.getF64Type();
+ // Target memref type:
+ // `memref>`
+ mlir::MemRefType memrefType = mlir::MemRefType::get(
+ {mlir::ShapedType::kDynamic}, eleType,
+ mlir::StridedLayoutAttr::get(
+ &context, /*offset=*/mlir::ShapedType::kDynamic, /*strides=*/{1}));
+ memrefType.dump();
+ return 0;
+}
diff --git a/examples/BuddyLeNet/CMakeLists.txt b/examples/BuddyLeNet/CMakeLists.txt
index 9698f617bc..b765218c68 100644
--- a/examples/BuddyLeNet/CMakeLists.txt
+++ b/examples/BuddyLeNet/CMakeLists.txt
@@ -6,25 +6,26 @@ add_custom_command(
add_custom_command(
OUTPUT forward.o
- COMMAND ${LLVM_MLIR_BINARY_DIR}/mlir-opt ${BUDDY_EXAMPLES_DIR}/BuddyLeNet/forward.mlir
+ COMMAND ${LLVM_TOOLS_BINARY_DIR}/mlir-opt ${BUDDY_EXAMPLES_DIR}/BuddyLeNet/forward.mlir
-pass-pipeline "builtin.module(func.func(tosa-to-linalg-named, tosa-to-linalg, tosa-to-tensor, tosa-to-arith), empty-tensor-to-alloc-tensor, convert-elementwise-to-linalg, arith-bufferize, func.func(linalg-bufferize, tensor-bufferize), func-bufferize)" |
- ${LLVM_MLIR_BINARY_DIR}/mlir-opt
+ ${LLVM_TOOLS_BINARY_DIR}/mlir-opt
-pass-pipeline "builtin.module(func.func(buffer-deallocation-simplification, convert-linalg-to-loops), eliminate-empty-tensors, func.func(llvm-request-c-wrappers),convert-math-to-llvm, convert-math-to-libm, convert-scf-to-cf, convert-arith-to-llvm, expand-strided-metadata, finalize-memref-to-llvm, convert-func-to-llvm, reconcile-unrealized-casts)" |
- ${LLVM_MLIR_BINARY_DIR}/mlir-translate -mlir-to-llvmir |
- ${LLVM_MLIR_BINARY_DIR}/llvm-as |
- ${LLVM_MLIR_BINARY_DIR}/llc -filetype=obj -relocation-model=pic -O0 -o ${BUDDY_BINARY_DIR}/../examples/BuddyLeNet/forward.o
+ ${LLVM_TOOLS_BINARY_DIR}/mlir-translate -mlir-to-llvmir |
+ ${LLVM_TOOLS_BINARY_DIR}/llvm-as |
+ ${LLVM_TOOLS_BINARY_DIR}/llc -filetype=obj -relocation-model=pic -O0 -o ${BUDDY_BINARY_DIR}/../examples/BuddyLeNet/forward.o
DEPENDS ${BUDDY_EXAMPLES_DIR}/BuddyLeNet/forward.mlir
COMMENT "Building forward.o"
VERBATIM)
add_custom_command(
OUTPUT subgraph0.o
- COMMAND ${LLVM_MLIR_BINARY_DIR}/mlir-opt ${BUDDY_EXAMPLES_DIR}/BuddyLeNet/subgraph0.mlir
+ COMMAND ${LLVM_TOOLS_BINARY_DIR}/mlir-opt ${BUDDY_EXAMPLES_DIR}/BuddyLeNet/subgraph0.mlir
-pass-pipeline "builtin.module(func.func(tosa-to-linalg-named, tosa-to-linalg, tosa-to-tensor, tosa-to-arith))" |
${BUDDY_BINARY_DIR}/buddy-opt
-eliminate-empty-tensors
- -convert-tensor-to-linalg
+ -convert-tensor-to-linalg
-linalg-bufferize
+ -batchmatmul-optimize
-convert-linalg-to-affine-loops
-lower-affine
-func-bufferize-dynamic-offset
@@ -42,9 +43,9 @@ add_custom_command(
-convert-arith-to-llvm
-convert-func-to-llvm
-reconcile-unrealized-casts |
- ${LLVM_MLIR_BINARY_DIR}/mlir-translate -mlir-to-llvmir |
- ${LLVM_MLIR_BINARY_DIR}/llvm-as |
- ${LLVM_MLIR_BINARY_DIR}/llc -filetype=obj -relocation-model=pic -O0 -o ${BUDDY_BINARY_DIR}/../examples/BuddyLeNet/subgraph0.o
+ ${LLVM_TOOLS_BINARY_DIR}/mlir-translate -mlir-to-llvmir |
+ ${LLVM_TOOLS_BINARY_DIR}/llvm-as |
+ ${LLVM_TOOLS_BINARY_DIR}/llc -filetype=obj -relocation-model=pic -O0 -o ${BUDDY_BINARY_DIR}/../examples/BuddyLeNet/subgraph0.o
DEPENDS ${BUDDY_EXAMPLES_DIR}/BuddyLeNet/subgraph0.mlir
COMMENT "Building subgraph0.o"
VERBATIM)
@@ -54,7 +55,8 @@ add_library(LENET STATIC subgraph0.o forward.o)
SET_TARGET_PROPERTIES(LENET PROPERTIES LINKER_LANGUAGE C)
add_executable(buddy-lenet-run buddy-lenet-main.cpp)
-target_link_directories(buddy-lenet-run PRIVATE ${LLVM_MLIR_LIBRARY_DIR})
+target_link_directories(buddy-lenet-run PRIVATE ${LLVM_LIBRARY_DIR})
+
+set(BUDDY_LENET_LIBS LENET mlir_c_runner_utils ${PNG_LIBRARIES})
-set(BUDDY_LENET_LIBS LENET mlir_c_runner_utils ${OpenCV_LIBS})
target_link_libraries(buddy-lenet-run ${BUDDY_LENET_LIBS})
diff --git a/examples/BuddyLeNet/README.md b/examples/BuddyLeNet/README.md
index 5988edbe7b..b9b0c44a5f 100644
--- a/examples/BuddyLeNet/README.md
+++ b/examples/BuddyLeNet/README.md
@@ -25,8 +25,7 @@ $ cmake -G Ninja .. \
-DCMAKE_BUILD_TYPE=RELEASE \
-DBUDDY_MLIR_ENABLE_PYTHON_PACKAGES=ON \
-DPython3_EXECUTABLE=$(which python3) \
- -DBUDDY_ENABLE_OPENCV=ON \
- -DOpenCV_DIR=
+ -DBUDDY_MLIR_ENABLE_DIP_LIB=ON
$ ninja
$ ninja check-buddy
```
diff --git a/examples/BuddyLeNet/buddy-lenet-main.cpp b/examples/BuddyLeNet/buddy-lenet-main.cpp
index 4e2dc2efe0..2fc8b0fbe3 100644
--- a/examples/BuddyLeNet/buddy-lenet-main.cpp
+++ b/examples/BuddyLeNet/buddy-lenet-main.cpp
@@ -15,41 +15,24 @@
//===----------------------------------------------------------------------===//
#include
-#include
+#include
#include
+#include
#include
#include
#include
#include
-#include
#include
#include
#include
constexpr size_t ParamsSize = 44426;
-const std::string ImgName = "3.png";
+const std::string ImgName = "1-28*28.png";
/// Declare LeNet forward function.
extern "C" void _mlir_ciface_forward(MemRef *output,
MemRef *arg0,
- Img *input);
-
-/// Function for preprocessing the image to match model input requirements.
-const cv::Mat imagePreprocessing() {
- // Get the directory of the LeNet example and construct the image path.
- std::string lenetDir = getenv("LENET_EXAMPLE_PATH");
- std::string imgPath = lenetDir + "/images/" + ImgName;
- // Read the image in grayscale mode.
- cv::Mat inputImage = cv::imread(imgPath, cv::IMREAD_GRAYSCALE);
- assert(!inputImage.empty() && "Could not read the image.");
- cv::Mat resizedImage;
- int imageWidth = 28;
- int imageHeight = 28;
- // Resize the image to 28x28 pixels.
- cv::resize(inputImage, resizedImage, cv::Size(imageWidth, imageHeight),
- cv::INTER_LINEAR);
- return resizedImage;
-}
+ dip::Image *input);
/// Print [Log] label in bold blue format.
void printLogLabel() { std::cout << "\033[34;1m[Log] \033[0m"; }
@@ -112,19 +95,16 @@ int main() {
const std::string title = "LeNet Inference Powered by Buddy Compiler";
std::cout << "\033[33;1m" << title << "\033[0m" << std::endl;
- // Preprocess the image to match the input requirements of the model.
- cv::Mat image = imagePreprocessing();
-
- // Define the sizes of the input and output tensors.
- intptr_t sizesInput[4] = {1, 1, 28, 28};
+ // Define the sizes of the output tensors.
intptr_t sizesOutput[2] = {1, 10};
// Create input and output containers for the image and model output.
- Img input(image, sizesInput, true);
+ std::string lenetDir = getenv("LENET_EXAMPLE_PATH");
+ std::string imgPath = lenetDir + "/images/" + ImgName;
+ dip::Image input(imgPath, dip::DIP_GRAYSCALE, true /* norm */);
MemRef output(sizesOutput);
// Load model parameters from the specified file.
- std::string lenetDir = getenv("LENET_EXAMPLE_PATH");
std::string paramsDir = lenetDir + "/arg0.data";
MemRef paramsContainer({ParamsSize});
loadParameters(paramsDir, paramsContainer);
diff --git a/examples/BuddyLeNet/fake-lenet.mlir b/examples/BuddyLeNet/fake-lenet.mlir
index 48d91a7fd3..d7d80a533a 100644
--- a/examples/BuddyLeNet/fake-lenet.mlir
+++ b/examples/BuddyLeNet/fake-lenet.mlir
@@ -1,5 +1,6 @@
module {
func.func private @printMemrefF32(%ptr : tensor<*xf32>)
+ func.func private @rtclock() -> f64
func.func @forward(%arg0: tensor<44426xf32>, %arg1: tensor<1x1x28x28xf32>) -> tensor<1x10xf32> {
%extracted_slice = tensor.extract_slice %arg0[0] [150] [1] : tensor<44426xf32> to tensor<150xf32>
@@ -81,10 +82,16 @@ module {
%fake_params = arith.constant dense<1.0> : tensor<44426xf32>
%fake_input = arith.constant dense<2.0> : tensor<1x1x28x28xf32>
+ %t_start = call @rtclock() : () -> f64
%fake_output = call @forward(%fake_params, %fake_input) : (tensor<44426xf32>, tensor<1x1x28x28xf32>) -> tensor<1x10xf32>
+ %t_end = call @rtclock() : () -> f64
%tensor_unranked = tensor.cast %fake_output : tensor<1x10xf32> to tensor<*xf32>
call @printMemrefF32(%tensor_unranked) : (tensor<*xf32>) -> ()
+
+ %time = arith.subf %t_end, %t_start : f64
+ vector.print %time : f64
+
return
}
}
diff --git a/examples/BuddyLeNet/images/0-28*28.png b/examples/BuddyLeNet/images/0-28*28.png
new file mode 100644
index 0000000000..a7a3b2a327
Binary files /dev/null and b/examples/BuddyLeNet/images/0-28*28.png differ
diff --git a/examples/BuddyLeNet/images/1-28*28.png b/examples/BuddyLeNet/images/1-28*28.png
new file mode 100644
index 0000000000..0f25e8b026
Binary files /dev/null and b/examples/BuddyLeNet/images/1-28*28.png differ
diff --git a/examples/BuddyLeNet/images/8-16bit-565-28*28.bmp b/examples/BuddyLeNet/images/8-16bit-565-28*28.bmp
new file mode 100644
index 0000000000..d4a43393d3
Binary files /dev/null and b/examples/BuddyLeNet/images/8-16bit-565-28*28.bmp differ
diff --git a/examples/BuddyLeNet/images/8-24bit-28*28.bmp b/examples/BuddyLeNet/images/8-24bit-28*28.bmp
new file mode 100644
index 0000000000..6591e87be8
Binary files /dev/null and b/examples/BuddyLeNet/images/8-24bit-28*28.bmp differ
diff --git a/examples/BuddyLeNet/images/8.bmp b/examples/BuddyLeNet/images/8.bmp
new file mode 100644
index 0000000000..7a9e02a295
Binary files /dev/null and b/examples/BuddyLeNet/images/8.bmp differ
diff --git a/examples/BuddyLeNet/makefile b/examples/BuddyLeNet/makefile
index 6f06642728..fe87b6da1a 100644
--- a/examples/BuddyLeNet/makefile
+++ b/examples/BuddyLeNet/makefile
@@ -1,30 +1,33 @@
#!/bin/bash
-BUDDY_OPT := ../../build/bin/buddy-opt
-MLIR_OPT := ../../llvm/build/bin/mlir-opt
-MLIR_TRANSLATE := ../../llvm/build/bin/mlir-translate
-MLIR_CPU_RUNNER := ../../llvm/build/bin/mlir-cpu-runner
-LLC := ../../llvm/build/bin/llc
-OPT_FLAG := -O0
+BUDDY_BUILD_DIR := ../../build/
+LLVM_BUILD_DIR := ../../llvm/build/
+BUDDY_OPT := ${BUDDY_BUILD_DIR}/bin/buddy-opt
+MLIR_OPT := ${LLVM_BUILD_DIR}/bin/mlir-opt
+MLIR_TRANSLATE := ${LLVM_BUILD_DIR}/bin/mlir-translate
+MLIR_CPU_RUNNER := ${LLVM_BUILD_DIR}/bin/mlir-cpu-runner
+LLC := ${LLVM_BUILD_DIR}/bin/llc
+OPT_FLAG := -O3
ifeq ($(shell uname),Linux)
-MLIR_RUNNER_UTILS := ../../llvm/build/lib/libmlir_runner_utils.so
-MLIR_C_RUNNER_UTILS := ../../llvm/build/lib/libmlir_c_runner_utils.so
-MLIR_ASYNC_RUNTIME := ../../llvm/build/lib/libmlir_async_runtime.so
+MLIR_RUNNER_UTILS := ${LLVM_BUILD_DIR}/lib/libmlir_runner_utils.so
+MLIR_C_RUNNER_UTILS := ${LLVM_BUILD_DIR}/lib/libmlir_c_runner_utils.so
+MLIR_ASYNC_RUNTIME := ${LLVM_BUILD_DIR}/lib/libmlir_async_runtime.so
MTRIPLE := x86_64-unknown-linux-gnu
else ifeq ($(shell uname),Darwin)
-MLIR_RUNNER_UTILS := ../../llvm/build/lib/libmlir_runner_utils.dylib
-MLIR_C_RUNNER_UTILS := ../../llvm/build/lib/libmlir_c_runner_utils.dylib
-MLIR_ASYNC_RUNTIME := ./../llvm/build/lib/libmlir_async_runtime.dylib
+MLIR_RUNNER_UTILS := ${LLVM_BUILD_DIR}/lib/libmlir_runner_utils.dylib
+MLIR_C_RUNNER_UTILS := ${LLVM_BUILD_DIR}/lib/libmlir_c_runner_utils.dylib
+MLIR_ASYNC_RUNTIME := ${LLVM_BUILD_DIR}/lib/libmlir_async_runtime.dylib
MTRIPLE := x86_64-apple-darwin
endif
buddy-lenet-lower:
- @${MLIR_OPT} ./fake-lenet.mlir \
+ @${BUDDY_OPT} ./fake-lenet.mlir \
-pass-pipeline "builtin.module(func.func(tosa-to-linalg-named, tosa-to-linalg, tosa-to-tensor, tosa-to-arith))" | \
- ${MLIR_OPT} \
+ ${BUDDY_OPT} \
-eliminate-empty-tensors \
-convert-tensor-to-linalg \
-linalg-bufferize \
+ -batchmatmul-optimize \
-convert-linalg-to-affine-loops \
-lower-affine \
-func-bufferize \
@@ -38,16 +41,15 @@ buddy-lenet-lower:
-convert-arith-to-llvm \
-finalize-memref-to-llvm \
-convert-scf-to-cf \
- -llvm-request-c-wrappers \
-convert-arith-to-llvm \
-convert-func-to-llvm \
-reconcile-unrealized-casts \
-o ./log.mlir
buddy-lenet-translate:
- @${MLIR_OPT} ./fake-lenet.mlir \
+ @${BUDDY_OPT} ./fake-lenet.mlir \
-pass-pipeline "builtin.module(func.func(tosa-to-linalg-named, tosa-to-linalg, tosa-to-tensor, tosa-to-arith))" | \
- ${MLIR_OPT} \
+ ${BUDDY_OPT} \
-eliminate-empty-tensors \
-convert-tensor-to-linalg \
-linalg-bufferize \
@@ -64,7 +66,6 @@ buddy-lenet-translate:
-convert-arith-to-llvm \
-finalize-memref-to-llvm \
-convert-scf-to-cf \
- -llvm-request-c-wrappers \
-convert-arith-to-llvm \
-convert-func-to-llvm \
-reconcile-unrealized-casts | \
@@ -72,9 +73,9 @@ buddy-lenet-translate:
buddy-lenet-run:
- @${MLIR_OPT} ./fake-lenet.mlir \
+ @${BUDDY_OPT} ./fake-lenet.mlir \
-pass-pipeline "builtin.module(func.func(tosa-to-linalg-named, tosa-to-linalg, tosa-to-tensor, tosa-to-arith))" | \
- ${MLIR_OPT} \
+ ${BUDDY_OPT} \
-eliminate-empty-tensors \
-convert-tensor-to-linalg \
-linalg-bufferize \
@@ -91,7 +92,33 @@ buddy-lenet-run:
-convert-arith-to-llvm \
-finalize-memref-to-llvm \
-convert-scf-to-cf \
- -llvm-request-c-wrappers \
+ -convert-arith-to-llvm \
+ -convert-func-to-llvm \
+ -reconcile-unrealized-casts | \
+ ${MLIR_CPU_RUNNER} ${OPT_FLAG} -e main -entry-point-result=void \
+ -shared-libs=${MLIR_RUNNER_UTILS} -shared-libs=${MLIR_C_RUNNER_UTILS}
+
+buddy-lenet-opt-run:
+ @${BUDDY_OPT} ./fake-lenet.mlir \
+ -pass-pipeline "builtin.module(func.func(tosa-to-linalg-named, tosa-to-linalg, tosa-to-tensor, tosa-to-arith))" | \
+ ${BUDDY_OPT} \
+ -eliminate-empty-tensors \
+ -convert-tensor-to-linalg \
+ -linalg-bufferize \
+ -batchmatmul-optimize \
+ -convert-linalg-to-affine-loops \
+ -lower-affine \
+ -func-bufferize \
+ -arith-bufferize \
+ -tensor-bufferize \
+ -buffer-deallocation \
+ -finalizing-bufferize \
+ -convert-vector-to-scf \
+ -expand-strided-metadata \
+ -convert-vector-to-llvm \
+ -convert-arith-to-llvm \
+ -finalize-memref-to-llvm \
+ -convert-scf-to-cf \
-convert-arith-to-llvm \
-convert-func-to-llvm \
-reconcile-unrealized-casts | \
diff --git a/examples/BuddyLlama/CMakeLists.txt b/examples/BuddyLlama/CMakeLists.txt
index 97aa736cb7..a6bfc2f742 100644
--- a/examples/BuddyLlama/CMakeLists.txt
+++ b/examples/BuddyLlama/CMakeLists.txt
@@ -6,14 +6,14 @@ add_custom_command(
add_custom_command(
OUTPUT forward.o
- COMMAND ${LLVM_MLIR_BINARY_DIR}/mlir-opt ${BUDDY_EXAMPLES_DIR}/BuddyLlama/forward.mlir
+ COMMAND ${LLVM_TOOLS_BINARY_DIR}/mlir-opt ${BUDDY_EXAMPLES_DIR}/BuddyLlama/forward.mlir
-pass-pipeline "builtin.module(func.func(tosa-to-linalg-named),func.func(tosa-to-linalg),func.func(tosa-to-tensor),func.func(tosa-to-arith))" |
${BUDDY_BINARY_DIR}/buddy-opt
-arith-expand
-eliminate-empty-tensors
-empty-tensor-to-alloc-tensor
-one-shot-bufferize
- -matmul-paralell-vectorization-optimize
+ -matmul-parallel-vectorization-optimize
-batchmatmul-optimize
-convert-linalg-to-affine-loops
-affine-loop-fusion
@@ -40,9 +40,9 @@ add_custom_command(
-convert-math-to-libm
-convert-func-to-llvm
-reconcile-unrealized-casts |
- ${LLVM_MLIR_BINARY_DIR}/mlir-translate -mlir-to-llvmir |
- ${LLVM_MLIR_BINARY_DIR}/llvm-as |
- ${LLVM_MLIR_BINARY_DIR}/llc -filetype=obj -relocation-model=pic -O3
+ ${LLVM_TOOLS_BINARY_DIR}/mlir-translate -mlir-to-llvmir |
+ ${LLVM_TOOLS_BINARY_DIR}/llvm-as |
+ ${LLVM_TOOLS_BINARY_DIR}/llc -filetype=obj -relocation-model=pic -O3
-o ${BUDDY_BINARY_DIR}/../examples/BuddyLlama/forward.o
DEPENDS buddy-opt ${BUDDY_EXAMPLES_DIR}/BuddyLlama/forward.mlir
COMMENT "Building forward.o "
@@ -50,14 +50,14 @@ add_custom_command(
add_custom_command(
OUTPUT subgraph.o
- COMMAND ${LLVM_MLIR_BINARY_DIR}/mlir-opt ${BUDDY_EXAMPLES_DIR}/BuddyLlama/subgraph0.mlir
+ COMMAND ${LLVM_TOOLS_BINARY_DIR}/mlir-opt ${BUDDY_EXAMPLES_DIR}/BuddyLlama/subgraph0.mlir
-pass-pipeline "builtin.module(func.func(tosa-to-linalg-named),func.func(tosa-to-linalg),func.func(tosa-to-tensor),func.func(tosa-to-arith))" |
${BUDDY_BINARY_DIR}/buddy-opt
-arith-expand
-eliminate-empty-tensors
-empty-tensor-to-alloc-tensor
-one-shot-bufferize
- -matmul-paralell-vectorization-optimize
+ -matmul-parallel-vectorization-optimize
-batchmatmul-optimize
-convert-linalg-to-affine-loops
-affine-loop-fusion
@@ -85,9 +85,9 @@ add_custom_command(
-convert-math-to-libm
-convert-func-to-llvm
-reconcile-unrealized-casts |
- ${LLVM_MLIR_BINARY_DIR}/mlir-translate -mlir-to-llvmir |
- ${LLVM_MLIR_BINARY_DIR}/llvm-as |
- ${LLVM_MLIR_BINARY_DIR}/llc -filetype=obj -relocation-model=pic -O3
+ ${LLVM_TOOLS_BINARY_DIR}/mlir-translate -mlir-to-llvmir |
+ ${LLVM_TOOLS_BINARY_DIR}/llvm-as |
+ ${LLVM_TOOLS_BINARY_DIR}/llc -filetype=obj -relocation-model=pic -O3
-o ${BUDDY_BINARY_DIR}/../examples/BuddyLlama/subgraph.o
DEPENDS buddy-opt ${BUDDY_EXAMPLES_DIR}/BuddyLlama/subgraph0.mlir
COMMENT "Building subgraph.o "
@@ -107,7 +107,7 @@ SET_TARGET_PROPERTIES(
LINKER_LANGUAGE C)
add_executable(buddy-llama-run llama-main.cpp)
-target_link_directories(buddy-llama-run PRIVATE ${LLVM_MLIR_LIBRARY_DIR})
+target_link_directories(buddy-llama-run PRIVATE ${LLVM_LIBRARY_DIR})
set(BUDDY_LLAMA_LIBS
LLAMA
diff --git a/examples/BuddyLlama/import-llama2.py b/examples/BuddyLlama/import-llama2.py
index fbd12e5bff..2903d6bd81 100644
--- a/examples/BuddyLlama/import-llama2.py
+++ b/examples/BuddyLlama/import-llama2.py
@@ -1,11 +1,3 @@
-import os
-import torch
-import torch._dynamo as dynamo
-from transformers import LlamaForCausalLM, LlamaTokenizer
-from torch._inductor.decomposition import decompositions as inductor_decomp
-import numpy
-
-from buddy.compiler.frontend import DynamoCompiler
# ===- import-llama2.py --------------------------------------------------------
#
# Licensed under the Apache License, Version 2.0 (the "License");
@@ -25,6 +17,15 @@
# This is the test of llama2 model.
#
# ===---------------------------------------------------------------------------
+
+import os
+import torch
+import torch._dynamo as dynamo
+from transformers import LlamaForCausalLM, LlamaTokenizer
+from torch._inductor.decomposition import decompositions as inductor_decomp
+import numpy
+
+from buddy.compiler.frontend import DynamoCompiler
from buddy.compiler.ops import tosa
from buddy.compiler.graph import GraphDriver
from buddy.compiler.graph.transform import simply_fuse
diff --git a/examples/BuddyMatmul/.gitignore b/examples/BuddyMatmul/.gitignore
new file mode 100644
index 0000000000..80a243fa81
--- /dev/null
+++ b/examples/BuddyMatmul/.gitignore
@@ -0,0 +1 @@
+log.*
diff --git a/examples/BuddyMatmul/linalg-batchmatmul-f32.mlir b/examples/BuddyMatmul/linalg-batchmatmul-f32.mlir
new file mode 100644
index 0000000000..58c9142398
--- /dev/null
+++ b/examples/BuddyMatmul/linalg-batchmatmul-f32.mlir
@@ -0,0 +1,82 @@
+// RUN: buddy-opt %s \
+// RUN: -batchmatmul-optimize \
+// RUN: -convert-linalg-to-affine-loops \
+// RUN: -lower-affine \
+// RUN: -convert-vector-to-scf \
+// RUN: -convert-scf-to-cf \
+// RUN: -convert-vector-to-llvm \
+// RUN: -convert-math-to-llvm \
+// RUN: -convert-math-to-libm \
+// RUN: -convert-arith-to-llvm \
+// RUN: -convert-func-to-llvm \
+// RUN: -expand-strided-metadata \
+// RUN: -finalize-memref-to-llvm \
+// RUN: -reconcile-unrealized-casts \
+// RUN: | mlir-cpu-runner -e main -entry-point-result=void \
+// RUN: -shared-libs=%mlir_runner_utils_dir/libmlir_runner_utils%shlibext \
+// RUN: -shared-libs=%mlir_runner_utils_dir/libmlir_c_runner_utils%shlibext \
+// RUN: | FileCheck %s
+
+func.func private @printMemrefF32(memref<*xf32>)
+
+func.func @batch_matmul(%arg0: memref, %arg1: memref, %arg2: memref) {
+ linalg.batch_matmul
+ ins(%arg0, %arg1 : memref, memref)
+ outs(%arg2 : memref)
+ return
+}
+
+func.func @alloc_f32(%arg0: index, %arg1: index, %arg2: index, %arg4: f32) -> memref {
+ %c0 = arith.constant 0 : index
+ %c1 = arith.constant 1 : index
+ %0 = memref.alloc(%arg0, %arg1, %arg2) : memref
+ scf.for %idx0 = %c0 to %arg0 step %c1 {
+ scf.for %idx1 = %c0 to %arg1 step %c1 {
+ scf.for %idx2 = %c0 to %arg2 step %c1 {
+ memref.store %arg4, %0[%idx0, %idx1, %idx2] : memref
+ }
+ }
+ }
+ return %0 : memref
+}
+
+func.func @main(){
+ %c0 = arith.constant 0 : index
+ %c1 = arith.constant 1 : index
+ %c576 = arith.constant 576 : index
+ %c1024 = arith.constant 1024 : index
+ %c1000 = arith.constant 1000 : index
+ %f0 = arith.constant 0.0 : f32
+ %f2 = arith.constant 2.0 : f32
+ %f3 = arith.constant 3.0 : f32
+
+ %m0 = call @alloc_f32(%c1, %c1, %c576, %f2) : (index, index, index, f32) -> memref
+ %m1 = call @alloc_f32(%c1, %c576, %c1024, %f3) : (index, index, index, f32) -> memref
+ %m2 = call @alloc_f32(%c1, %c1, %c1024, %f0) : (index, index, index, f32) -> memref
+
+ call @batch_matmul(%m0, %m1, %m2) : (memref, memref, memref) -> ()
+
+ %printed_m2 = memref.cast %m2 : memref to memref<*xf32>
+
+ // CHECK: Unranked Memref base@ = {{.*}} rank = 3 offset = 0 sizes = [1, 1, 1024] strides = [1024, 1024, 1] data =
+ // CHECK-NEXT: [
+ // CHECK: [
+ // CHECK: [3456{{(, 3456)*}}]
+ call @printMemrefF32(%printed_m2) : (memref<*xf32>) -> ()
+
+ %m3 = call @alloc_f32(%c1, %c1, %c1024, %f2) : (index, index, index, f32) -> memref
+ %m4 = call @alloc_f32(%c1, %c1024, %c1000, %f3) : (index, index, index, f32) -> memref
+ %m5 = call @alloc_f32(%c1, %c1, %c1000, %f0) : (index, index, index, f32) -> memref
+
+ call @batch_matmul(%m3, %m4, %m5) : (memref, memref, memref) -> ()
+
+ %printed_m5 = memref.cast %m5 : memref to memref<*xf32>
+
+ // CHECK: Unranked Memref base@ = {{.*}} rank = 3 offset = 0 sizes = [1, 1, 1000] strides = [1000, 1000, 1] data =
+ // CHECK-NEXT: [
+ // CHECK: [
+ // CHECK: [6144{{(, 6144)*}}]
+ call @printMemrefF32(%printed_m5) : (memref<*xf32>) -> ()
+
+ return
+}
diff --git a/examples/BuddyMatmul/linalg-transposematmulb-f32.mlir b/examples/BuddyMatmul/linalg-transposematmulb-f32.mlir
new file mode 100644
index 0000000000..26a4458c53
--- /dev/null
+++ b/examples/BuddyMatmul/linalg-transposematmulb-f32.mlir
@@ -0,0 +1,75 @@
+// RUN: buddy-opt %s \
+// RUN: -matmul-transpose-b-vectorization \
+// RUN: -convert-linalg-to-affine-loops \
+// RUN: -lower-affine \
+// RUN: -convert-vector-to-scf \
+// RUN: -convert-scf-to-cf \
+// RUN: -convert-vector-to-llvm \
+// RUN: -convert-math-to-llvm \
+// RUN: -convert-math-to-libm \
+// RUN: -convert-arith-to-llvm \
+// RUN: -convert-func-to-llvm \
+// RUN: -expand-strided-metadata \
+// RUN: -finalize-memref-to-llvm \
+// RUN: -reconcile-unrealized-casts \
+// RUN: | mlir-cpu-runner -e main -entry-point-result=void \
+// RUN: -shared-libs=%mlir_runner_utils_dir/libmlir_runner_utils%shlibext \
+// RUN: -shared-libs=%mlir_runner_utils_dir/libmlir_c_runner_utils%shlibext \
+// RUN: | FileCheck %s
+
+func.func private @printMemrefF32(memref<*xf32>)
+
+func.func @test(%a : memref, %b : memref, %c : memref) {
+ linalg.matmul_transpose_b
+ ins(%a, %b: memref, memref)
+ outs(%c: memref)
+ return
+ }
+
+func.func @alloc_f32(%arg0: index, %arg1: index, %arg4: f32) -> memref {
+ %c0 = arith.constant 0 : index
+ %c1 = arith.constant 1 : index
+ %0 = memref.alloc(%arg0, %arg1) : memref
+ scf.for %idx0 = %c0 to %arg0 step %c1 {
+ scf.for %idx1 = %c0 to %arg1 step %c1 {
+ memref.store %arg4, %0[%idx0, %idx1] : memref
+ }
+ }
+ return %0 : memref
+}
+
+func.func @main(){
+ %c32 = arith.constant 32 : index
+ %c1024 = arith.constant 1024 : index
+ %c3 = arith.constant 3 : index
+ %f0 = arith.constant 0.0 : f32
+ %f1 = arith.constant 1.0 : f32
+
+ %m0 = call @alloc_f32(%c32,%c1024, %f1) : (index, index, f32) -> memref
+ %m1 = call @alloc_f32(%c32,%c1024, %f1) : (index, index, f32) -> memref
+ %m2 = call @alloc_f32(%c32,%c32, %f0) : (index, index, f32) -> memref
+
+ call @test(%m0, %m1, %m2) : (memref, memref, memref) -> ()
+
+ %printed_m2 = memref.cast %m2 : memref to memref<*xf32>
+
+ // CHECK: Unranked Memref base@ = {{.*}} rank = 2 offset = 0 sizes = [32, 32] strides = [32, 1] data =
+ // CHECK-NEXT: [
+ // CHECK: [1024{{(, 1024)*}}]
+ call @printMemrefF32(%printed_m2) : (memref<*xf32>) -> ()
+
+ %m3 = call @alloc_f32(%c3,%c3, %f1) : (index, index, f32) -> memref
+ %m4 = call @alloc_f32(%c3,%c3, %f1) : (index, index, f32) -> memref
+ %m5 = call @alloc_f32(%c3,%c3, %f0) : (index, index, f32) -> memref
+
+ call @test(%m3, %m4, %m5) : (memref, memref, memref) -> ()
+
+ %printed_m5 = memref.cast %m5 : memref to memref<*xf32>
+
+ // CHECK: Unranked Memref base@ = {{.*}} rank = 2 offset = 0 sizes = [3, 3] strides = [3, 1] data =
+ // CHECK-NEXT: [
+ // CHECK: [3{{(, 3)*}}]
+ call @printMemrefF32(%printed_m5) : (memref<*xf32>) -> ()
+
+ return
+}
diff --git a/examples/BuddyMatmul/makefile b/examples/BuddyMatmul/makefile
new file mode 100644
index 0000000000..0940d608da
--- /dev/null
+++ b/examples/BuddyMatmul/makefile
@@ -0,0 +1,55 @@
+#!/bin/bash
+BUDDY_BUILD_DIR := ../../build/
+LLVM_BUILD_DIR := ../../llvm/build/
+BUDDY_OPT := ${BUDDY_BUILD_DIR}/bin/buddy-opt
+MLIR_OPT := ${LLVM_BUILD_DIR}/bin/mlir-opt
+MLIR_TRANSLATE := ${LLVM_BUILD_DIR}/bin/mlir-translate
+MLIR_CPU_RUNNER := ${LLVM_BUILD_DIR}/bin/mlir-cpu-runner
+LLC := ${LLVM_BUILD_DIR}/bin/llc
+OPT_FLAG := -O0
+
+ifeq ($(shell uname),Linux)
+MLIR_RUNNER_UTILS := ${LLVM_BUILD_DIR}/lib/libmlir_runner_utils.so
+MLIR_C_RUNNER_UTILS := ${LLVM_BUILD_DIR}/lib/libmlir_c_runner_utils.so
+MTRIPLE := x86_64-unknown-linux-gnu
+else ifeq ($(shell uname),Darwin)
+MLIR_RUNNER_UTILS := ${LLVM_BUILD_DIR}/lib/libmlir_runner_utils.dylib
+MLIR_C_RUNNER_UTILS := ${LLVM_BUILD_DIR}/lib/libmlir_c_runner_utils.dylib
+MTRIPLE := x86_64-apple-darwin
+endif
+
+linalg-batchmatmul-f32-run:
+ @${BUDDY_OPT} ./linalg-batchmatmul-f32.mlir \
+ -batchmatmul-optimize \
+ -convert-linalg-to-affine-loops \
+ -lower-affine \
+ -convert-vector-to-scf \
+ -convert-scf-to-cf \
+ -convert-vector-to-llvm \
+ -convert-math-to-llvm \
+ -convert-math-to-libm \
+ -convert-arith-to-llvm \
+ -convert-func-to-llvm \
+ -expand-strided-metadata \
+ -finalize-memref-to-llvm \
+ -reconcile-unrealized-casts | \
+ ${MLIR_CPU_RUNNER} ${OPT_FLAG} -e main -entry-point-result=void \
+ -shared-libs=${MLIR_RUNNER_UTILS} -shared-libs=${MLIR_C_RUNNER_UTILS}
+
+linalg-matmul-transpose-b-f32-run:
+ @${BUDDY_OPT} ./linalg-transposematmulb-f32.mlir\
+ -matmul-transpose-b-vectorization \
+ -convert-linalg-to-affine-loops \
+ -lower-affine \
+ -convert-vector-to-scf \
+ -convert-scf-to-cf \
+ -convert-vector-to-llvm \
+ -convert-math-to-llvm \
+ -convert-math-to-libm \
+ -convert-arith-to-llvm \
+ -convert-func-to-llvm \
+ -expand-strided-metadata \
+ -finalize-memref-to-llvm \
+ -reconcile-unrealized-casts | \
+ ${MLIR_CPU_RUNNER} ${OPT_FLAG} -e main -entry-point-result=void \
+ -shared-libs=${MLIR_RUNNER_UTILS} -shared-libs=${MLIR_C_RUNNER_UTILS}
diff --git a/examples/BuddyMobileNetV3/CMakeLists.txt b/examples/BuddyMobileNetV3/CMakeLists.txt
index e55cc61711..ef60c7e931 100644
--- a/examples/BuddyMobileNetV3/CMakeLists.txt
+++ b/examples/BuddyMobileNetV3/CMakeLists.txt
@@ -1,6 +1,5 @@
add_custom_command(
OUTPUT ${BUDDY_EXAMPLES_DIR}/BuddyMobileNetV3/arg0.data
- ${BUDDY_EXAMPLES_DIR}/BuddyMobileNetV3/arg1.data
${BUDDY_EXAMPLES_DIR}/BuddyMobileNetV3/forward.mlir
${BUDDY_EXAMPLES_DIR}/BuddyMobileNetV3/subgraph0.mlir
COMMAND python3 ${BUDDY_EXAMPLES_DIR}/BuddyMobileNetV3/buddy-mobilenetv3-import.py
@@ -10,21 +9,21 @@ add_custom_command(
add_custom_command(
OUTPUT forward.o
- COMMAND ${LLVM_MLIR_BINARY_DIR}/mlir-opt ${BUDDY_EXAMPLES_DIR}/BuddyMobileNetV3/forward.mlir
+ COMMAND ${LLVM_TOOLS_BINARY_DIR}/mlir-opt ${BUDDY_EXAMPLES_DIR}/BuddyMobileNetV3/forward.mlir
-pass-pipeline
"builtin.module(func.func(tosa-to-linalg-named, tosa-to-linalg, tosa-to-tensor, tosa-to-arith), \
empty-tensor-to-alloc-tensor, convert-elementwise-to-linalg, arith-bufferize, \
func.func(linalg-bufferize, tensor-bufferize), func-bufferize)" |
- ${LLVM_MLIR_BINARY_DIR}/mlir-opt
+ ${LLVM_TOOLS_BINARY_DIR}/mlir-opt
-pass-pipeline
"builtin.module(func.func(buffer-deallocation-simplification, convert-linalg-to-loops), \
eliminate-empty-tensors, func.func(llvm-request-c-wrappers), \
convert-math-to-llvm, convert-math-to-libm, convert-scf-to-cf, \
convert-arith-to-llvm, expand-strided-metadata, finalize-memref-to-llvm, \
convert-func-to-llvm, reconcile-unrealized-casts)" |
- ${LLVM_MLIR_BINARY_DIR}/mlir-translate -mlir-to-llvmir |
- ${LLVM_MLIR_BINARY_DIR}/llvm-as |
- ${LLVM_MLIR_BINARY_DIR}/llc -filetype=obj -relocation-model=pic -O3
+ ${LLVM_TOOLS_BINARY_DIR}/mlir-translate -mlir-to-llvmir |
+ ${LLVM_TOOLS_BINARY_DIR}/llvm-as |
+ ${LLVM_TOOLS_BINARY_DIR}/llc -filetype=obj -relocation-model=pic -O3
-o ${BUDDY_BINARY_DIR}/../examples/BuddyMobileNetV3/forward.o
DEPENDS ${BUDDY_EXAMPLES_DIR}/BuddyMobileNetV3/forward.mlir
COMMENT "Building forward.o"
@@ -55,9 +54,9 @@ add_custom_command(
-expand-strided-metadata
-finalize-memref-to-llvm
-reconcile-unrealized-casts |
- ${LLVM_MLIR_BINARY_DIR}/mlir-translate -mlir-to-llvmir |
- ${LLVM_MLIR_BINARY_DIR}/llvm-as |
- ${LLVM_MLIR_BINARY_DIR}/llc -filetype=obj -relocation-model=pic -O3
+ ${LLVM_TOOLS_BINARY_DIR}/mlir-translate -mlir-to-llvmir |
+ ${LLVM_TOOLS_BINARY_DIR}/llvm-as |
+ ${LLVM_TOOLS_BINARY_DIR}/llc -filetype=obj -relocation-model=pic -O3
-o ${BUDDY_BINARY_DIR}/../examples/BuddyMobileNetV3/subgraph0.o
DEPENDS ${BUDDY_EXAMPLES_DIR}/BuddyMobileNetV3/subgraph0.mlir
buddy-opt
@@ -69,7 +68,7 @@ add_library(MOBILENETV3 STATIC subgraph0.o forward.o)
SET_TARGET_PROPERTIES(MOBILENETV3 PROPERTIES LINKER_LANGUAGE C)
add_executable(buddy-mobilenetv3-run buddy-mobilenetv3-main.cpp)
-target_link_directories(buddy-mobilenetv3-run PRIVATE ${LLVM_MLIR_LIBRARY_DIR})
+target_link_directories(buddy-mobilenetv3-run PRIVATE ${LLVM_LIBRARY_DIR})
-set(BUDDY_MOBILENETV3_LIBS MOBILENETV3 mlir_c_runner_utils ${OpenCV_LIBS})
+set(BUDDY_MOBILENETV3_LIBS MOBILENETV3 mlir_c_runner_utils BuddyLibDIP ${PNG_LIBRARIES})
target_link_libraries(buddy-mobilenetv3-run ${BUDDY_MOBILENETV3_LIBS})
diff --git a/examples/BuddyMobileNetV3/Labels.txt b/examples/BuddyMobileNetV3/Labels.txt
index fe811239d8..8bdc20a086 100644
--- a/examples/BuddyMobileNetV3/Labels.txt
+++ b/examples/BuddyMobileNetV3/Labels.txt
@@ -1,4 +1,3 @@
-background
tench
goldfish
great white shark
@@ -133,7 +132,7 @@ flamingo
little blue heron
American egret
bittern
-crane
+crane bird
limpkin
European gallinule
American coot
@@ -638,7 +637,7 @@ magnetic compass
mailbag
mailbox
maillot
-maillot
+maillot tank suit
manhole cover
maraca
marimba
diff --git a/examples/BuddyMobileNetV3/README.md b/examples/BuddyMobileNetV3/README.md
index 1146addb69..a55cd74304 100644
--- a/examples/BuddyMobileNetV3/README.md
+++ b/examples/BuddyMobileNetV3/README.md
@@ -16,8 +16,8 @@ $ cmake -G Ninja .. \
-DCMAKE_BUILD_TYPE=RELEASE \
-DBUDDY_MLIR_ENABLE_PYTHON_PACKAGES=ON \
-DPython3_EXECUTABLE=$(which python3) \
- -DBUDDY_ENABLE_OPENCV=ON \
- -DOpenCV_DIR=
+ -DBUDDY_MLIR_ENABLE_DIP_LIB=ON \
+ -DBUDDY_ENABLE_PNG=ON
$ ninja
$ ninja check-buddy
```
diff --git a/examples/BuddyMobileNetV3/buddy-mobilenetv3-import.py b/examples/BuddyMobileNetV3/buddy-mobilenetv3-import.py
index 2403800bf9..704b8fc2e3 100644
--- a/examples/BuddyMobileNetV3/buddy-mobilenetv3-import.py
+++ b/examples/BuddyMobileNetV3/buddy-mobilenetv3-import.py
@@ -38,9 +38,17 @@
"The environment variable 'MOBILENETV3_MODEL_PATH' is not set or is invalid."
)
-model = models.mobilenet_v3_small(weights=models.MobileNet_V3_Small_Weights.IMAGENET1K_V1, pretrained=True)
+model = models.mobilenet_v3_small(
+ weights=models.MobileNet_V3_Small_Weights.IMAGENET1K_V1, pretrained=True
+)
model = model.eval()
+# Remove the num_batches_tracked attribute.
+for layer in model.modules():
+ if isinstance(layer, torch.nn.BatchNorm2d):
+ if hasattr(layer, "num_batches_tracked"):
+ del layer.num_batches_tracked
+
# Initialize Dynamo Compiler with specific configurations as an importer.
dynamo_compiler = DynamoCompiler(
primary_registry=tosa.ops_registry,
@@ -68,11 +76,10 @@
float32_param = np.concatenate(
- [param.detach().numpy().reshape([-1]) for param in params if param.dtype == torch.float32]
+ [
+ param.detach().numpy().reshape([-1])
+ for param in params
+ if param.dtype == torch.float32
+ ]
)
float32_param.tofile(Path(current_path) / "arg0.data")
-
-int64_param = np.concatenate(
- [param.detach().numpy().reshape([-1]) for param in params if param.dtype == torch.int64]
-)
-int64_param.tofile(Path(current_path) / "arg1.data")
diff --git a/examples/BuddyMobileNetV3/buddy-mobilenetv3-main.cpp b/examples/BuddyMobileNetV3/buddy-mobilenetv3-main.cpp
index 68d9d15411..90defb895e 100644
--- a/examples/BuddyMobileNetV3/buddy-mobilenetv3-main.cpp
+++ b/examples/BuddyMobileNetV3/buddy-mobilenetv3-main.cpp
@@ -1,4 +1,4 @@
-//===- MobileNetBenchmark.cpp ---------------------------------------------===//
+//===- buddy-mobilenetv3-main.cpp -----------------------------------------===//
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
@@ -15,13 +15,14 @@
//===----------------------------------------------------------------------===//
#include
-#include
+#include
+#include
#include
+#include
#include
#include
#include
#include
-#include
#include
#include
#include
@@ -31,61 +32,44 @@ const std::string ImgName = "dog.png";
// Declare the mobilenet C interface.
extern "C" void _mlir_ciface_forward(MemRef *output,
- MemRef *arg0,
- MemRef *arg1,
- Img *input);
-
-const cv::Mat imagePreprocessing() {
- // Get the directory of the LeNet example and construct the image path.
- std::string mobilenetDir = getenv("MOBILENETV3_EXAMPLE_PATH");
- std::string imgPath = mobilenetDir + "/images/" + ImgName;
- // Read the image in grayscale mode.
- cv::Mat inputImage = cv::imread(imgPath, cv::IMREAD_GRAYSCALE);
- assert(!inputImage.empty() && "Could not read the image.");
- cv::Mat resizedImage;
- int imageWidth = 224;
- int imageHeight = 224;
- // Resize the image to 224x224 pixels.
- cv::resize(inputImage, resizedImage, cv::Size(imageWidth, imageHeight),
- cv::INTER_LINEAR);
- return resizedImage;
-}
+ MemRef *arg0,
+ MemRef *input);
/// Print [Log] label in bold blue format.
void printLogLabel() { std::cout << "\033[34;1m[Log] \033[0m"; }
-void loadParameters(const std::string &floatParamPath,
- const std::string &int64ParamPath,
- MemRef &floatParam,
- MemRef &int64Param) {
- std::ifstream floatParamFile(floatParamPath, std::ios::in | std::ios::binary);
- if (!floatParamFile.is_open()) {
- std::string errMsg = "Failed to open float param file: " +
- std::filesystem::canonical(floatParamPath).string();
- throw std::runtime_error(errMsg);
- }
- floatParamFile.read(reinterpret_cast(floatParam.getData()),
- floatParam.getSize() * sizeof(float));
- if (floatParamFile.fail()) {
- throw std::runtime_error("Failed to read float param file");
- }
- floatParamFile.close();
-
-
- std::ifstream int64ParamFile(int64ParamPath, std::ios::in | std::ios::binary);
- if (!int64ParamFile.is_open()) {
- std::string errMsg = "Failed to open int64 param file: " +
- std::filesystem::canonical(int64ParamPath).string();
- throw std::runtime_error(errMsg);
+/// Load parameters into data container.
+void loadParameters(const std::string ¶mFilePath,
+ MemRef ¶ms) {
+ const auto loadStart = std::chrono::high_resolution_clock::now();
+ // Open the parameter file in binary mode.
+ std::ifstream paramFile(paramFilePath, std::ios::in | std::ios::binary);
+ if (!paramFile.is_open()) {
+ throw std::runtime_error("[Error] Failed to open params file!");
}
- int64ParamFile.read(reinterpret_cast(int64Param.getData()),
- int64Param.getSize() * sizeof(long long));
- if (int64ParamFile.fail()) {
- throw std::runtime_error("Failed to read int64 param file");
+ printLogLabel();
+ std::cout << "Loading params..." << std::endl;
+ printLogLabel();
+ // Print the canonical path of the parameter file.
+ std::cout << "Params file: " << std::filesystem::canonical(paramFilePath)
+ << std::endl;
+ // Read the parameter data into the provided memory reference.
+ paramFile.read(reinterpret_cast(params.getData()),
+ sizeof(float) * (params.getSize()));
+ if (paramFile.fail()) {
+ throw std::runtime_error("Error occurred while reading params file!");
}
- int64ParamFile.close();
+ paramFile.close();
+ const auto loadEnd = std::chrono::high_resolution_clock::now();
+ const std::chrono::duration loadTime =
+ loadEnd - loadStart;
+ printLogLabel();
+ std::cout << "Params load time: " << (double)(loadTime.count()) / 1000
+ << "s\n"
+ << std::endl;
}
+
// Softmax function.
void softmax(float *input, size_t size) {
size_t i;
@@ -110,8 +94,7 @@ void softmax(float *input, size_t size) {
std::string getLabel(int idx) {
std::string mobilenetDir = getenv("MOBILENETV3_EXAMPLE_PATH");
- std::ifstream in(
- mobilenetDir + "Labels.txt");
+ std::ifstream in(mobilenetDir + "Labels.txt");
assert(in.is_open() && "Could not read the label file.");
std::string label;
for (int i = 0; i < idx; ++i)
@@ -126,27 +109,26 @@ int main() {
const std::string title = "MobileNetV3 Inference Powered by Buddy Compiler";
std::cout << "\033[33;1m" << title << "\033[0m" << std::endl;
- // Preprocess the image to match the input requirements of the model.
- cv::Mat image = imagePreprocessing();
-
// Define the sizes of the input and output tensors.
- intptr_t sizesInput[4] = {1, 3, 224, 224};
intptr_t sizesOutput[2] = {1, 1000};
// Create input and output containers for the image and model output.
- Img input(image, sizesInput, true);
+ std::string mobilenetDir = getenv("MOBILENETV3_EXAMPLE_PATH");
+ std::string imgPath = mobilenetDir + "/images/" + ImgName;
+ dip::Image input(imgPath, dip::DIP_RGB, true /* norm */);
+ MemRef inputResize = dip::Resize4D_NCHW(
+ &input, dip::INTERPOLATION_TYPE::BILINEAR_INTERPOLATION,
+      {1, 3, 224, 224} /* {batch, channels, height, width} (NCHW) */);
+
MemRef output(sizesOutput);
// Load model parameters from the specified file.
- std::string mobilenetDir = getenv("MOBILENETV3_EXAMPLE_PATH");
std::string paramsDir = mobilenetDir + "/arg0.data";
- std::string intDir = mobilenetDir + "/arg1.data";
- MemRef paramsContainerf32({ParamsSize});
- MemRef ParamsContainerInt64({34});
- loadParameters(paramsDir, intDir, paramsContainerf32, ParamsContainerInt64);
+ MemRef paramsContainer({ParamsSize});
+ loadParameters(paramsDir, paramsContainer);
// Call the forward function of the model.
- _mlir_ciface_forward(&output, ¶msContainerf32, &ParamsContainerInt64, &input);
-
+ _mlir_ciface_forward(&output, ¶msContainer, &inputResize);
+
auto out = output.getData();
softmax(out, 1000);
// Find the classification and print the result.
diff --git a/examples/BuddyMobileNetV3/images/curtain-224*224.png b/examples/BuddyMobileNetV3/images/curtain-224*224.png
new file mode 100644
index 0000000000..2fa9c06449
Binary files /dev/null and b/examples/BuddyMobileNetV3/images/curtain-224*224.png differ
diff --git a/examples/BuddyMobileNetV3/images/curtain.png b/examples/BuddyMobileNetV3/images/curtain.png
index 1ae383d359..67a54dbdde 100644
Binary files a/examples/BuddyMobileNetV3/images/curtain.png and b/examples/BuddyMobileNetV3/images/curtain.png differ
diff --git a/examples/BuddyMobileNetV3/images/dog-224*224.png b/examples/BuddyMobileNetV3/images/dog-224*224.png
new file mode 100644
index 0000000000..4c6649714c
Binary files /dev/null and b/examples/BuddyMobileNetV3/images/dog-224*224.png differ
diff --git a/examples/BuddyMobileNetV3/images/dog-32bit_224*224.bmp b/examples/BuddyMobileNetV3/images/dog-32bit_224*224.bmp
new file mode 100644
index 0000000000..201f030d7c
Binary files /dev/null and b/examples/BuddyMobileNetV3/images/dog-32bit_224*224.bmp differ
diff --git a/examples/BuddyMobileNetV3/images/dog.bmp b/examples/BuddyMobileNetV3/images/dog.bmp
new file mode 100644
index 0000000000..12f0e0dd11
Binary files /dev/null and b/examples/BuddyMobileNetV3/images/dog.bmp differ
diff --git a/examples/BuddyMobileNetV3/images/dog.png b/examples/BuddyMobileNetV3/images/dog.png
index 12f0e0dd11..4c6000a1fa 100644
Binary files a/examples/BuddyMobileNetV3/images/dog.png and b/examples/BuddyMobileNetV3/images/dog.png differ
diff --git a/examples/BuddyMobileNetV3/images/ice-cream-224*224.png b/examples/BuddyMobileNetV3/images/ice-cream-224*224.png
new file mode 100644
index 0000000000..1cd06efd4e
Binary files /dev/null and b/examples/BuddyMobileNetV3/images/ice-cream-224*224.png differ
diff --git a/examples/BuddyMobileNetV3/images/ice-cream-24bit-224*224.bmp b/examples/BuddyMobileNetV3/images/ice-cream-24bit-224*224.bmp
new file mode 100644
index 0000000000..75ad4012e0
Binary files /dev/null and b/examples/BuddyMobileNetV3/images/ice-cream-24bit-224*224.bmp differ
diff --git a/examples/BuddyMobileNetV3/images/ice-cream.png b/examples/BuddyMobileNetV3/images/ice-cream.png
index 209d8999d6..9bb408cea7 100644
Binary files a/examples/BuddyMobileNetV3/images/ice-cream.png and b/examples/BuddyMobileNetV3/images/ice-cream.png differ
diff --git a/examples/BuddyMobileNetV3/images/kite.png b/examples/BuddyMobileNetV3/images/kite.png
index 23ffe9613d..51912cddc6 100644
Binary files a/examples/BuddyMobileNetV3/images/kite.png and b/examples/BuddyMobileNetV3/images/kite.png differ
diff --git a/examples/BuddyMobileNetV3/images/traffic-light-24bit-224*224.bmp b/examples/BuddyMobileNetV3/images/traffic-light-24bit-224*224.bmp
new file mode 100644
index 0000000000..948a1ea796
Binary files /dev/null and b/examples/BuddyMobileNetV3/images/traffic-light-24bit-224*224.bmp differ
diff --git a/examples/BuddyMobileNetV3/images/traffic-light-32bit-224*224.bmp b/examples/BuddyMobileNetV3/images/traffic-light-32bit-224*224.bmp
new file mode 100644
index 0000000000..c415c8dc32
Binary files /dev/null and b/examples/BuddyMobileNetV3/images/traffic-light-32bit-224*224.bmp differ
diff --git a/examples/BuddyMobileNetV3/images/traffic-light.png b/examples/BuddyMobileNetV3/images/traffic-light.png
index fa1a1e3f61..3fa00918da 100644
Binary files a/examples/BuddyMobileNetV3/images/traffic-light.png and b/examples/BuddyMobileNetV3/images/traffic-light.png differ
diff --git a/examples/BuddyNext/makefile b/examples/BuddyNext/makefile
index 7e93591409..443907d352 100644
--- a/examples/BuddyNext/makefile
+++ b/examples/BuddyNext/makefile
@@ -164,3 +164,69 @@ next-attention-fusion-run:
-reconcile-unrealized-casts | \
${MLIR_CPU_RUNNER} ${OPT_FLAG} -e main -entry-point-result=void \
-shared-libs=${MLIR_RUNNER_UTILS} -shared-libs=${MLIR_C_RUNNER_UTILS}
+
+next-sigmoid-run:
+ @${MLIR_OPT} ./next-sigmoid.mlir \
+ -pass-pipeline "builtin.module(func.func(tosa-to-linalg-named),func.func(tosa-to-linalg),func.func(tosa-to-tensor),func.func(tosa-to-arith))" | \
+ ${MLIR_OPT} \
+ -arith-expand \
+ -eliminate-empty-tensors \
+ -empty-tensor-to-alloc-tensor \
+ -one-shot-bufferize \
+ -convert-linalg-to-affine-loops \
+ -affine-loop-fusion \
+ -lower-affine \
+ -func-bufferize \
+ -arith-bufferize \
+ -tensor-bufferize \
+ -buffer-deallocation \
+ -finalizing-bufferize \
+ -convert-vector-to-scf \
+ -expand-strided-metadata \
+ -convert-vector-to-llvm \
+ -memref-expand \
+ -arith-expand \
+ -convert-arith-to-llvm \
+ -finalize-memref-to-llvm \
+ -convert-scf-to-cf \
+ -convert-openmp-to-llvm \
+ -convert-arith-to-llvm \
+ -convert-math-to-llvm \
+ -convert-math-to-libm \
+ -convert-func-to-llvm \
+ -reconcile-unrealized-casts | \
+ ${MLIR_CPU_RUNNER} ${OPT_FLAG} -e main -entry-point-result=void \
+ -shared-libs=${MLIR_RUNNER_UTILS} -shared-libs=${MLIR_C_RUNNER_UTILS}
+
+next-rope-run:
+ @${MLIR_OPT} ./next-rope.mlir \
+ -pass-pipeline "builtin.module(func.func(tosa-to-linalg-named),func.func(tosa-to-linalg),func.func(tosa-to-tensor),func.func(tosa-to-arith))" | \
+ ${MLIR_OPT} \
+ -arith-expand \
+ -eliminate-empty-tensors \
+ -empty-tensor-to-alloc-tensor \
+ -one-shot-bufferize \
+ -convert-linalg-to-affine-loops \
+ -affine-loop-fusion \
+ -lower-affine \
+ -func-bufferize \
+ -arith-bufferize \
+ -tensor-bufferize \
+ -buffer-deallocation \
+ -finalizing-bufferize \
+ -convert-vector-to-scf \
+ -expand-strided-metadata \
+ -convert-vector-to-llvm \
+ -memref-expand \
+ -arith-expand \
+ -convert-arith-to-llvm \
+ -finalize-memref-to-llvm \
+ -convert-scf-to-cf \
+ -convert-openmp-to-llvm \
+ -convert-arith-to-llvm \
+ -convert-math-to-llvm \
+ -convert-math-to-libm \
+ -convert-func-to-llvm \
+ -reconcile-unrealized-casts | \
+ ${MLIR_CPU_RUNNER} ${OPT_FLAG} -e main -entry-point-result=void \
+ -shared-libs=${MLIR_RUNNER_UTILS} -shared-libs=${MLIR_C_RUNNER_UTILS}
diff --git a/examples/BuddyNext/next-rope.mlir b/examples/BuddyNext/next-rope.mlir
new file mode 100644
index 0000000000..091b2c220f
--- /dev/null
+++ b/examples/BuddyNext/next-rope.mlir
@@ -0,0 +1,157 @@
+// RUN: buddy-opt %s \
+// RUN: -pass-pipeline "builtin.module(func.func(tosa-to-linalg-named),func.func(tosa-to-linalg),func.func(tosa-to-tensor),func.func(tosa-to-arith))" \
+// RUN: | buddy-opt \
+// RUN: -arith-expand \
+// RUN: -eliminate-empty-tensors \
+// RUN: -empty-tensor-to-alloc-tensor \
+// RUN: -one-shot-bufferize \
+// RUN: -convert-linalg-to-affine-loops \
+// RUN: -affine-loop-fusion \
+// RUN: -lower-affine \
+// RUN: -func-bufferize \
+// RUN: -arith-bufferize \
+// RUN: -tensor-bufferize \
+// RUN: -buffer-deallocation \
+// RUN: -finalizing-bufferize \
+// RUN: -convert-vector-to-scf \
+// RUN: -expand-strided-metadata \
+// RUN: -convert-vector-to-llvm \
+// RUN: -memref-expand \
+// RUN: -arith-expand \
+// RUN: -convert-arith-to-llvm \
+// RUN: -finalize-memref-to-llvm \
+// RUN: -convert-scf-to-cf \
+// RUN: -convert-openmp-to-llvm \
+// RUN: -convert-arith-to-llvm \
+// RUN: -convert-math-to-llvm \
+// RUN: -convert-math-to-libm \
+// RUN: -convert-func-to-llvm \
+// RUN: -reconcile-unrealized-casts \
+// RUN: | mlir-cpu-runner -e main -entry-point-result=void \
+// RUN: -shared-libs=%mlir_runner_utils_dir/libmlir_runner_utils%shlibext \
+// RUN: -shared-libs=%mlir_runner_utils_dir/libmlir_c_runner_utils%shlibext \
+// RUN: | FileCheck %s
+
+func.func private @rtclock() -> f64
+
+#map = affine_map<(d0, d1, d2) -> (d1)>
+#map1 = affine_map<(d0, d1, d2) -> (d0, d2)>
+#map2 = affine_map<(d0, d1, d2) -> (d0, d1)>
+#map3 = affine_map<(d0, d1) -> (d0, d1)>
+#map4 = affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>
+#map5 = affine_map<(d0, d1, d2) -> (d0, d1, d2)>
+#map6 = affine_map<(d0, d1, d2) -> (d0, 0, d1, d2)>
+#map7 = affine_map<(d0, d1) -> (0, d0, d1)>
+
+func.func @kenerl(%arg0 : tensor<1x40x4096xf32>, %arg1 : tensor<1x40x4096xf32>, %arg2 : tensor<1x40x4096xf32>, %arg3 : tensor<1x1x2048x128xf32>, %arg4 : tensor<1x1x2048x128xf32>, %arg5 : tensor<1x40xi64>) {
+ %t_start = call @rtclock() : () -> f64
+
+ %57 = tosa.reshape %arg0 {new_shape = array} : (tensor<1x40x4096xf32>) -> tensor<1x40x32x128xf32>
+ %58 = "tosa.const"() <{value = dense<[0, 2, 1, 3]> : tensor<4xi32>}> : () -> tensor<4xi32>
+ %59 = tosa.transpose %57, %58 : (tensor<1x40x32x128xf32>, tensor<4xi32>) -> tensor<1x32x40x128xf32>
+
+ %60 = tosa.reshape %arg1 {new_shape = array} : (tensor<1x40x4096xf32>) -> tensor<1x40x32x128xf32>
+ %61 = "tosa.const"() <{value = dense<[0, 2, 1, 3]> : tensor<4xi32>}> : () -> tensor<4xi32>
+ %62 = tosa.transpose %60, %61 : (tensor<1x40x32x128xf32>, tensor<4xi32>) -> tensor<1x32x40x128xf32>
+
+ %63 = tosa.reshape %arg2 {new_shape = array} : (tensor<1x40x4096xf32>) -> tensor<1x40x32x128xf32>
+ %64 = "tosa.const"() <{value = dense<[0, 2, 1, 3]> : tensor<4xi32>}> : () -> tensor<4xi32>
+ %65 = tosa.transpose %63, %64 : (tensor<1x40x32x128xf32>, tensor<4xi32>) -> tensor<1x32x40x128xf32>
+
+ %extracted_slice_9 = tensor.extract_slice %arg3[0, 0, 0, 0] [1, 1, 2048, 128] [1, 1, 1, 1] : tensor<1x1x2048x128xf32> to tensor<1x1x2048x128xf32>
+ %extracted_slice_10 = tensor.extract_slice %extracted_slice_9[0, 0, 0, 0] [1, 1, 2048, 128] [1, 1, 1, 1] : tensor<1x1x2048x128xf32> to tensor<1x1x2048x128xf32>
+ %extracted_slice_11 = tensor.extract_slice %extracted_slice_10[0, 0, 0, 0] [1, 1, 40, 128] [1, 1, 1, 1] : tensor<1x1x2048x128xf32> to tensor<1x1x40x128xf32>
+ %extracted_slice_12 = tensor.extract_slice %arg4[0, 0, 0, 0] [1, 1, 2048, 128] [1, 1, 1, 1] : tensor<1x1x2048x128xf32> to tensor<1x1x2048x128xf32>
+ %extracted_slice_13 = tensor.extract_slice %extracted_slice_12[0, 0, 0, 0] [1, 1, 2048, 128] [1, 1, 1, 1] : tensor<1x1x2048x128xf32> to tensor<1x1x2048x128xf32>
+ %extracted_slice_14 = tensor.extract_slice %extracted_slice_13[0, 0, 0, 0] [1, 1, 40, 128] [1, 1, 1, 1] : tensor<1x1x2048x128xf32> to tensor<1x1x40x128xf32>
+ %66 = tensor.empty() : tensor<1x40x128xf32>
+ %67 = linalg.generic {indexing_maps = [#map6, #map5], iterator_types = ["parallel", "parallel", "parallel"]} ins(%extracted_slice_11 : tensor<1x1x40x128xf32>) outs(%66 : tensor<1x40x128xf32>) {
+ ^bb0(%in: f32, %out: f32):
+ linalg.yield %in : f32
+ } -> tensor<1x40x128xf32>
+ %68 = tensor.empty() : tensor<40x128xf32>
+ %69 = linalg.generic {indexing_maps = [#map7, #map3], iterator_types = ["parallel", "parallel"]} ins(%67 : tensor<1x40x128xf32>) outs(%68 : tensor<40x128xf32>) {
+ ^bb0(%in: f32, %out: f32):
+ linalg.yield %in : f32
+ } -> tensor<40x128xf32>
+ %70 = tensor.empty() : tensor<1x40x128xf32>
+ %71 = linalg.generic {indexing_maps = [#map6, #map5], iterator_types = ["parallel", "parallel", "parallel"]} ins(%extracted_slice_14 : tensor<1x1x40x128xf32>) outs(%70 : tensor<1x40x128xf32>) {
+ ^bb0(%in: f32, %out: f32):
+ linalg.yield %in : f32
+ } -> tensor<1x40x128xf32>
+ %72 = tensor.empty() : tensor<40x128xf32>
+ %73 = linalg.generic {indexing_maps = [#map7, #map3], iterator_types = ["parallel", "parallel"]} ins(%71 : tensor<1x40x128xf32>) outs(%72 : tensor<40x128xf32>) {
+ ^bb0(%in: f32, %out: f32):
+ linalg.yield %in : f32
+ } -> tensor<40x128xf32>
+  // precompute_theta_pos_frequencies function, which is used to calculate special values of RoPE according to: https://hyper.ai/wiki/29220
+ %74 = tensor.empty() : tensor<1x40x128xf32>
+ %75 = linalg.generic {indexing_maps = [#map2, #map5], iterator_types = ["parallel", "parallel", "parallel"]} ins(%arg5 : tensor<1x40xi64>) outs(%74 : tensor<1x40x128xf32>) {
+ ^bb0(%in: i64, %out: f32):
+ %4175 = arith.index_cast %in : i64 to index
+ %4176 = linalg.index 2 : index
+ %extracted = tensor.extract %69[%4175, %4176] : tensor<40x128xf32>
+ linalg.yield %extracted : f32
+ } -> tensor<1x40x128xf32>
+ %76 = tosa.reshape %75 {new_shape = array} : (tensor<1x40x128xf32>) -> tensor<1x1x40x128xf32>
+ %77 = tensor.empty() : tensor<1x40x128xf32>
+ %78 = linalg.generic {indexing_maps = [#map2, #map5], iterator_types = ["parallel", "parallel", "parallel"]} ins(%arg5 : tensor<1x40xi64>) outs(%77 : tensor<1x40x128xf32>) {
+ ^bb0(%in: i64, %out: f32):
+ %4175 = arith.index_cast %in : i64 to index
+ %4176 = linalg.index 2 : index
+ %extracted = tensor.extract %73[%4175, %4176] : tensor<40x128xf32>
+ linalg.yield %extracted : f32
+ } -> tensor<1x40x128xf32>
+ %79 = tosa.reshape %78 {new_shape = array} : (tensor<1x40x128xf32>) -> tensor<1x1x40x128xf32>
+ %80 = tosa.mul %59, %76 {shift = 0 : i8} : (tensor<1x32x40x128xf32>, tensor<1x1x40x128xf32>) -> tensor<1x32x40x128xf32>
+ %extracted_slice_15 = tensor.extract_slice %59[0, 0, 0, 0] [1, 32, 40, 64] [1, 1, 1, 1] : tensor<1x32x40x128xf32> to tensor<1x32x40x64xf32>
+ %extracted_slice_16 = tensor.extract_slice %59[0, 0, 0, 64] [1, 32, 40, 64] [1, 1, 1, 1] : tensor<1x32x40x128xf32> to tensor<1x32x40x64xf32>
+ %81 = tosa.negate %extracted_slice_16 : (tensor<1x32x40x64xf32>) -> tensor<1x32x40x64xf32>
+ %82 = tensor.empty() : tensor<1x32x40x128xf32>
+ %inserted_slice = tensor.insert_slice %81 into %82[0, 0, 0, 0] [1, 32, 40, 64] [1, 1, 1, 1] : tensor<1x32x40x64xf32> into tensor<1x32x40x128xf32>
+ %inserted_slice_17 = tensor.insert_slice %extracted_slice_15 into %inserted_slice[0, 0, 0, 64] [1, 32, 40, 64] [1, 1, 1, 1] : tensor<1x32x40x64xf32> into tensor<1x32x40x128xf32>
+ %83 = tosa.mul %inserted_slice_17, %79 {shift = 0 : i8} : (tensor<1x32x40x128xf32>, tensor<1x1x40x128xf32>) -> tensor<1x32x40x128xf32>
+ %84 = tosa.add %80, %83 : (tensor<1x32x40x128xf32>, tensor<1x32x40x128xf32>) -> tensor<1x32x40x128xf32>
+ %85 = tosa.mul %62, %76 {shift = 0 : i8} : (tensor<1x32x40x128xf32>, tensor<1x1x40x128xf32>) -> tensor<1x32x40x128xf32>
+ %extracted_slice_18 = tensor.extract_slice %62[0, 0, 0, 0] [1, 32, 40, 64] [1, 1, 1, 1] : tensor<1x32x40x128xf32> to tensor<1x32x40x64xf32>
+ %extracted_slice_19 = tensor.extract_slice %62[0, 0, 0, 64] [1, 32, 40, 64] [1, 1, 1, 1] : tensor<1x32x40x128xf32> to tensor<1x32x40x64xf32>
+ %86 = tosa.negate %extracted_slice_19 : (tensor<1x32x40x64xf32>) -> tensor<1x32x40x64xf32>
+ %87 = tensor.empty() : tensor<1x32x40x128xf32>
+ %inserted_slice_20 = tensor.insert_slice %86 into %87[0, 0, 0, 0] [1, 32, 40, 64] [1, 1, 1, 1] : tensor<1x32x40x64xf32> into tensor<1x32x40x128xf32>
+ %inserted_slice_21 = tensor.insert_slice %extracted_slice_18 into %inserted_slice_20[0, 0, 0, 64] [1, 32, 40, 64] [1, 1, 1, 1] : tensor<1x32x40x64xf32> into tensor<1x32x40x128xf32>
+
+ %t_end = call @rtclock() : () -> f64
+ %time = arith.subf %t_end, %t_start : f64
+
+ %tensor_unranked = tensor.cast %inserted_slice_21 : tensor<1x32x40x128xf32> to tensor<*xf32>
+
+ // All the elements of the MemRef are the same,
+ // only check the first line to verify the correctness.
+ // CHECK: Unranked Memref base@ = {{.*}} rank = 4 offset = 0 sizes = [1, 32, 40, 128] strides = [163840, 5120, 128, 1] data =
+ // CHECK-NEXT: [
+ // CHECK-SAME: [
+ // CHECK-SAME: [
+ // CHECK-SAME: [-3{{(, [-]?3)*}}],
+
+ // Print results.
+ call @printMemrefF32(%tensor_unranked) : (tensor<*xf32>) -> ()
+ // Print timings.
+ vector.print %time : f64
+
+ return
+}
+
+func.func @main() {
+
+ %c2 = arith.constant dense<2.0> : tensor<1x40x4096xf32>
+ %c3 = arith.constant dense<3.0> : tensor<1x40x4096xf32>
+ %c4 = arith.constant dense<4.0> : tensor<1x40x4096xf32>
+ %c5 = arith.constant dense<5.0> : tensor<1x1x2048x128xf32>
+ %c6 = arith.constant dense<6.0> : tensor<1x1x2048x128xf32>
+ %c7 = arith.constant dense<7> : tensor<1x40xi64>
+
+ call @kenerl(%c2, %c3, %c4, %c5, %c6, %c7) : (tensor<1x40x4096xf32>, tensor<1x40x4096xf32>, tensor<1x40x4096xf32>, tensor<1x1x2048x128xf32>, tensor<1x1x2048x128xf32>, tensor<1x40xi64>) -> ()
+
+ return
+}
+func.func private @printMemrefF32(%ptr : tensor<*xf32>)
diff --git a/examples/BuddyNext/next-sigmoid.mlir b/examples/BuddyNext/next-sigmoid.mlir
new file mode 100644
index 0000000000..f49f2d7943
--- /dev/null
+++ b/examples/BuddyNext/next-sigmoid.mlir
@@ -0,0 +1,70 @@
+// RUN: buddy-opt %s \
+// RUN: -pass-pipeline "builtin.module(func.func(tosa-to-linalg-named),func.func(tosa-to-linalg),func.func(tosa-to-tensor),func.func(tosa-to-arith))" \
+// RUN: | buddy-opt \
+// RUN: -arith-expand \
+// RUN: -eliminate-empty-tensors \
+// RUN: -empty-tensor-to-alloc-tensor \
+// RUN: -one-shot-bufferize \
+// RUN: -convert-linalg-to-affine-loops \
+// RUN: -affine-loop-fusion \
+// RUN: -lower-affine \
+// RUN: -func-bufferize \
+// RUN: -arith-bufferize \
+// RUN: -tensor-bufferize \
+// RUN: -buffer-deallocation \
+// RUN: -finalizing-bufferize \
+// RUN: -convert-vector-to-scf \
+// RUN: -expand-strided-metadata \
+// RUN: -convert-vector-to-llvm \
+// RUN: -memref-expand \
+// RUN: -arith-expand \
+// RUN: -convert-arith-to-llvm \
+// RUN: -finalize-memref-to-llvm \
+// RUN: -convert-scf-to-cf \
+// RUN: -convert-openmp-to-llvm \
+// RUN: -convert-arith-to-llvm \
+// RUN: -convert-math-to-llvm \
+// RUN: -convert-math-to-libm \
+// RUN: -convert-func-to-llvm \
+// RUN: -reconcile-unrealized-casts \
+// RUN: | mlir-cpu-runner -e main -entry-point-result=void \
+// RUN: -shared-libs=%mlir_runner_utils_dir/libmlir_runner_utils%shlibext \
+// RUN: -shared-libs=%mlir_runner_utils_dir/libmlir_c_runner_utils%shlibext \
+// RUN: | FileCheck %s
+
+func.func private @rtclock() -> f64
+
+func.func @kenerl(%arg0 : tensor<1x40x11008xf32>) {
+ %t_start = call @rtclock() : () -> f64
+
+ %sigmoid = tosa.sigmoid %arg0 : (tensor<1x40x11008xf32>) -> tensor<1x40x11008xf32>
+
+ %t_end = call @rtclock() : () -> f64
+ %time = arith.subf %t_end, %t_start : f64
+
+ %tensor_unranked = tensor.cast %sigmoid : tensor<1x40x11008xf32> to tensor<*xf32>
+
+ // All the elements of the MemRef are the same,
+ // only check the first line to verify the correctness.
+ // CHECK: Unranked Memref base@ = {{.*}} rank = 3 offset = 0 sizes = [1, 40, 11008] strides = [440320, 11008, 1] data =
+ // CHECK-NEXT: [
+ // CHECK-SAME: [
+ // CHECK-SAME: [0.952574{{(, 0.952574)*}}],
+
+ // Print results.
+ call @printMemrefF32(%tensor_unranked) : (tensor<*xf32>) -> ()
+ // Print timings.
+ vector.print %time : f64
+
+ return
+}
+
+func.func @main() {
+
+ %c3 = arith.constant dense<3.0> : tensor<1x40x11008xf32>
+
+ call @kenerl(%c3) : (tensor<1x40x11008xf32>) -> ()
+
+ return
+}
+func.func private @printMemrefF32(%ptr : tensor<*xf32>)
diff --git a/examples/BuddyPython/module_gen.py b/examples/BuddyPython/module_gen.py
index e2c722cebf..1f657d2609 100644
--- a/examples/BuddyPython/module_gen.py
+++ b/examples/BuddyPython/module_gen.py
@@ -43,12 +43,11 @@ def foo(x, y):
aot_autograd_decomposition=inductor_decomp,
)
-# Pass the function and input data to the dynamo compiler's importer, the
-# importer will first build a graph. Then, lower the graph to top-level IR.
+# Pass the function and input data to the dynamo compiler's importer, the
+# importer will first build a graph. Then, lower the graph to top-level IR.
# (tosa, linalg, etc.). Finally, accepts the generated module and weight parameters.
-graphs = dynamo_compiler.importer(foo, *(float32_in1, float32_in2))
+graphs = dynamo_compiler.importer(foo, float32_in1, float32_in2)
graph = graphs[0]
-graph.lower_to_top_level_ir(do_params_pack=True)
+graph.lower_to_top_level_ir()
print(graph._imported_module)
-print(dynamo_compiler.imported_params[graph])
diff --git a/examples/BuddyWhisper/CMakeLists.txt b/examples/BuddyWhisper/CMakeLists.txt
index 16518ffb62..756d6db081 100644
--- a/examples/BuddyWhisper/CMakeLists.txt
+++ b/examples/BuddyWhisper/CMakeLists.txt
@@ -6,22 +6,22 @@ add_custom_command(
set(PATTERN_ARG "test-generalize-pad-tensor")
add_custom_command(
OUTPUT forward.o
- COMMAND ${LLVM_MLIR_BINARY_DIR}/mlir-opt ${BUDDY_EXAMPLES_DIR}/BuddyWhisper/forward.mlir
+ COMMAND ${LLVM_TOOLS_BINARY_DIR}/mlir-opt ${BUDDY_EXAMPLES_DIR}/BuddyWhisper/forward.mlir
-pass-pipeline "builtin.module(func.func(tosa-to-linalg-named, tosa-to-linalg, tosa-to-tensor, tosa-to-arith), empty-tensor-to-alloc-tensor, convert-elementwise-to-linalg, arith-bufferize, func.func(linalg-bufferize, tensor-bufferize), func-bufferize)" |
${BUDDY_BINARY_DIR}/buddy-opt
- -pass-pipeline "builtin.module( func.func(buffer-deallocation-simplification, convert-linalg-to-loops),matmul-paralell-vectorization-optimize, batchmatmul-optimize, eliminate-empty-tensors,func-bufferize-dynamic-offset, func.func(llvm-request-c-wrappers),convert-scf-to-openmp, convert-openmp-to-llvm, convert-math-to-llvm, convert-math-to-libm, convert-scf-to-cf, convert-arith-to-llvm, expand-strided-metadata, finalize-memref-to-llvm, convert-func-to-llvm, reconcile-unrealized-casts)" |
- ${LLVM_MLIR_BINARY_DIR}/mlir-translate -mlir-to-llvmir |
- ${LLVM_MLIR_BINARY_DIR}/llvm-as |
- ${LLVM_MLIR_BINARY_DIR}/llc -filetype=obj -relocation-model=pic -O0 -o ${BUDDY_BINARY_DIR}/../examples/BuddyWhisper/forward.o
+ -pass-pipeline "builtin.module( func.func(buffer-deallocation-simplification, convert-linalg-to-loops),matmul-parallel-vectorization-optimize, batchmatmul-optimize, eliminate-empty-tensors,func-bufferize-dynamic-offset, func.func(llvm-request-c-wrappers),convert-scf-to-openmp, convert-openmp-to-llvm, convert-math-to-llvm, convert-math-to-libm, convert-scf-to-cf, convert-arith-to-llvm, expand-strided-metadata, finalize-memref-to-llvm, convert-func-to-llvm, reconcile-unrealized-casts)" |
+ ${LLVM_TOOLS_BINARY_DIR}/mlir-translate -mlir-to-llvmir |
+ ${LLVM_TOOLS_BINARY_DIR}/llvm-as |
+ ${LLVM_TOOLS_BINARY_DIR}/llc -filetype=obj -relocation-model=pic -O0 -o ${BUDDY_BINARY_DIR}/../examples/BuddyWhisper/forward.o
DEPENDS ${BUDDY_EXAMPLES_DIR}/BuddyWhisper/forward.mlir
COMMENT "Building forward.o"
VERBATIM)
add_custom_command(
OUTPUT subgraph0.o
- COMMAND ${LLVM_MLIR_BINARY_DIR}/mlir-opt ${BUDDY_EXAMPLES_DIR}/BuddyWhisper/subgraph0.mlir
+ COMMAND ${LLVM_TOOLS_BINARY_DIR}/mlir-opt ${BUDDY_EXAMPLES_DIR}/BuddyWhisper/subgraph0.mlir
-pass-pipeline "builtin.module(func.func(tosa-to-linalg-named, tosa-to-linalg, tosa-to-tensor, tosa-to-arith))" |
- ${LLVM_MLIR_BINARY_DIR}/mlir-opt
+ ${LLVM_TOOLS_BINARY_DIR}/mlir-opt
-test-linalg-transform-patterns=${PATTERN_ARG} |
${BUDDY_BINARY_DIR}/buddy-opt
-arith-expand
@@ -29,7 +29,7 @@ add_custom_command(
-convert-elementwise-to-linalg
-empty-tensor-to-alloc-tensor
-one-shot-bufferize
- -matmul-paralell-vectorization-optimize
+ -matmul-parallel-vectorization-optimize
-batchmatmul-optimize
-convert-linalg-to-affine-loops
-affine-loop-fusion
@@ -55,9 +55,9 @@ add_custom_command(
-convert-math-to-libm
-convert-func-to-llvm
-reconcile-unrealized-casts |
- ${LLVM_MLIR_BINARY_DIR}/mlir-translate -mlir-to-llvmir |
- ${LLVM_MLIR_BINARY_DIR}/llvm-as |
- ${LLVM_MLIR_BINARY_DIR}/llc -filetype=obj -relocation-model=pic -O3 -o ${BUDDY_BINARY_DIR}/../examples/BuddyWhisper/subgraph0.o
+ ${LLVM_TOOLS_BINARY_DIR}/mlir-translate -mlir-to-llvmir |
+ ${LLVM_TOOLS_BINARY_DIR}/llvm-as |
+ ${LLVM_TOOLS_BINARY_DIR}/llc -filetype=obj -relocation-model=pic -O3 -o ${BUDDY_BINARY_DIR}/../examples/BuddyWhisper/subgraph0.o
DEPENDS ${BUDDY_EXAMPLES_DIR}/BuddyWhisper/subgraph0.mlir
COMMENT "Building subgraph0.o "
VERBATIM)
@@ -75,11 +75,16 @@ SET_TARGET_PROPERTIES(
PROPERTIES
LINKER_LANGUAGE C)
-add_executable(buddy-whisper-run whisper-main.cpp)
-target_link_directories(buddy-whisper-run PRIVATE ${LLVM_MLIR_LIBRARY_DIR})
+set(BUDDY_WHISPER_FILES
+ whisper-main.cpp
+)
+
+add_executable(buddy-whisper-run ${BUDDY_WHISPER_FILES})
+target_link_directories(buddy-whisper-run PRIVATE ${LLVM_LIBRARY_DIR})
set(BUDDY_WHISPER_LIBS
WHISPER
+ BuddyLibDAP
mlir_c_runner_utils
omp
)
diff --git a/examples/BuddyWhisper/README.md b/examples/BuddyWhisper/README.md
index f26a1d845a..680fb34ce9 100644
--- a/examples/BuddyWhisper/README.md
+++ b/examples/BuddyWhisper/README.md
@@ -1,7 +1,7 @@
# Buddy Compiler WHISPER Example
## Introduction
-This example shows how to use Buddy Compiler to compile a WHISPER model to MLIR code then run it. The [model](openai/whisper-base) is a pre-trained model for automatic speech recognition (ASR) and speech translation.
+This example shows how to use Buddy Compiler to compile a WHISPER model to MLIR code then run it. The [model](https://huggingface.co/openai/whisper-base) is a pre-trained model for automatic speech recognition (ASR) and speech translation (ST).
## How to run
@@ -63,16 +63,15 @@ $ export LLVM_MLIR_BUILD_DIR=$PWD/../llvm/build
$ export PYTHONPATH=${LLVM_MLIR_BUILD_DIR}/tools/mlir/python_packages/mlir_core:${BUDDY_MLIR_BUILD_DIR}/python_packages:${PYTHONPATH}
```
-3. Set model and dataset environment variable.
+3. Set model environment variable.
```bash
$ export WHISPER_MODEL_PATH=/path-to-whisper-model/
-$ export AUDIO_DATASET_PATH=/path-to-audio-dataset/
// For example:
$ export WHISPER_MODEL_PATH=/home/xxx/whisper-base
-$ export AUDIO_DATASET_PATH=/home/xxx/librispeech_asr_dummy
```
+Alternatively, you can leave the path blank, and import-whisper.py will automatically download the model for you.
4. Build and run the WHISPER example
@@ -83,4 +82,4 @@ $ cd bin
$ ./buddy-whisper-run
```
-4. Enjoy it!
+5. Enjoy it!
diff --git a/examples/BuddyWhisper/audio.wav b/examples/BuddyWhisper/audio.wav
new file mode 100644
index 0000000000..069c2329ef
Binary files /dev/null and b/examples/BuddyWhisper/audio.wav differ
diff --git a/examples/BuddyWhisper/import-whisper.py b/examples/BuddyWhisper/import-whisper.py
index 7b5d3681fe..449646a676 100644
--- a/examples/BuddyWhisper/import-whisper.py
+++ b/examples/BuddyWhisper/import-whisper.py
@@ -14,7 +14,7 @@
#
# ===---------------------------------------------------------------------------
#
-# This is the example of whisper model.
+# This is an example for whisper model.
#
# ===---------------------------------------------------------------------------
@@ -22,8 +22,7 @@
import torch
import torch._dynamo as dynamo
from torch._inductor.decomposition import decompositions as inductor_decomp
-from transformers import WhisperProcessor, WhisperForConditionalGeneration
-from datasets import load_dataset
+from transformers import WhisperForConditionalGeneration
import numpy
from buddy.compiler.frontend import DynamoCompiler
@@ -34,27 +33,20 @@
# Retrieve the Whisper model path from environment variables.
model_path = os.environ.get("WHISPER_MODEL_PATH")
if model_path is None:
- raise EnvironmentError(
- "The environment variable 'WHISPER_MODEL_PATH' is not set or is invalid."
- )
+ model_path = "openai/whisper-base"
-# Initialize the tokenizer and model from the specified model path.
-processor = WhisperProcessor.from_pretrained(model_path)
+# Initialize the model from the specified model path.
model = WhisperForConditionalGeneration.from_pretrained(model_path)
model.config.use_cache = False
-dataset_path = os.environ.get("AUDIO_DATASET_PATH")
-ds = load_dataset(dataset_path, "clean", split="validation")
-sample = ds[1]["audio"]
-input_features = processor(
- sample["array"], sampling_rate=sample["sampling_rate"], return_tensors="pt"
-).input_features
-
-decoder_input_ids = torch.tensor([[50258] * 448], dtype=torch.long)
+# Generate placeholder for inputs.
+input_features = torch.zeros(size=(1, 80, 3000), dtype=torch.float32)
+decoder_input_ids = torch.zeros(size=(1, 448), dtype=torch.long)
inputs = {
"input_features": input_features,
"decoder_input_ids": decoder_input_ids,
}
+
# Initialize Dynamo Compiler with specific configurations as an importer.
dynamo_compiler = DynamoCompiler(
primary_registry=tosa.ops_registry,
diff --git a/examples/BuddyWhisper/input_features.data b/examples/BuddyWhisper/input_features.data
deleted file mode 100644
index c85c98ddf3..0000000000
Binary files a/examples/BuddyWhisper/input_features.data and /dev/null differ
diff --git a/examples/BuddyWhisper/whisper-main.cpp b/examples/BuddyWhisper/whisper-main.cpp
index 2ba9138544..7d69ea3074 100644
--- a/examples/BuddyWhisper/whisper-main.cpp
+++ b/examples/BuddyWhisper/whisper-main.cpp
@@ -13,21 +13,29 @@
// limitations under the License.
//
//===----------------------------------------------------------------------===//
+//
+// This file implements an example for Whisper Model Inference.
+//
+//===----------------------------------------------------------------------===//
#include
+#include
#include
#include
#include
#include
+#include
#include
#include
#include
+
+using namespace std;
using namespace buddy;
+using namespace dap;
constexpr size_t ParamsSize = 99148800;
constexpr size_t MaxVocabSize = 51865;
constexpr size_t MaxTokenLength = 448;
-constexpr size_t HiddenSize = 512;
/// Declare Whisper forward function.
extern "C" void _mlir_ciface_forward(MemRef *, MemRef *,
@@ -37,14 +45,6 @@ extern "C" void _mlir_ciface_forward(MemRef *, MemRef *,
// Helper Functions
// -----------------------------------------------------------------------------
-/// Capture input message.
-void getUserInput(std::string &inputStr) {
- std::cout << "\nPlease send a message:" << std::endl;
- std::cout << ">>> ";
- getline(std::cin, inputStr);
- std::cout << std::endl;
-}
-
/// Print [Log] label in bold blue format.
void printLogLabel() { std::cout << "\033[34;1m[Log] \033[0m"; }
@@ -83,30 +83,18 @@ void loadParameters(const std::string ¶mFilePath,
<< std::endl;
}
-void loadAudio(const std::string ¶mFilePath, MemRef ¶ms) {
- const auto loadStart = std::chrono::high_resolution_clock::now();
- std::ifstream paramFile(paramFilePath, std::ios::in | std::ios::binary);
- if (!paramFile.is_open()) {
- throw std::runtime_error("[Error] Failed to open input_features file!");
- }
- printLogLabel();
- std::cout << "Loading input_features..." << std::endl;
+/// Conduct audio data preprocess.
+void runPreprocess(dap::Audio &rawAudioContainer,
+ MemRef &audioFeatures) {
printLogLabel();
- std::cout << "input_features file: "
- << std::filesystem::canonical(paramFilePath) << std::endl;
-
- paramFile.read(reinterpret_cast(params.getData()),
- sizeof(float) * (params.getSize()));
-
- if (paramFile.fail()) {
- throw std::runtime_error("Error occurred while reading params file!");
- }
- paramFile.close();
+ std::cout << "Preprocessing audio..." << std::endl;
+ const auto loadStart = std::chrono::high_resolution_clock::now();
+ dap::whisperPreprocess(&rawAudioContainer, &audioFeatures);
const auto loadEnd = std::chrono::high_resolution_clock::now();
const std::chrono::duration loadTime =
loadEnd - loadStart;
printLogLabel();
- std::cout << "input_features load time: " << (double)(loadTime.count()) / 1000
+ std::cout << "Audio preprocess time: " << (double)(loadTime.count()) / 1000
<< "s\n"
<< std::endl;
}
@@ -129,14 +117,13 @@ int main() {
/// Define directories of vacabulary and parameter file.
const std::string vocabDir = "../../examples/BuddyWhisper/vocab.txt";
const std::string paramsDir = "../../examples/BuddyWhisper/arg0.data";
- const std::string input_featuresDir =
- "../../examples/BuddyWhisper/input_features.data";
/// Initialize data containers
// - Result container
// - Output container.
// - Parameters container.
Text outputContainer;
+ Audio rawAudioContainer("../../examples/BuddyWhisper/audio.wav");
MemRef audioInput({1, 80, 3000});
MemRef resultContainer[2] = {
MemRef({1, 1500, 512}, false, 0),
@@ -148,16 +135,17 @@ int main() {
/// Fill data into containers
// - Output: register vocabulary.
// - Parameters: load parameters from the `arg0` file into the container.
+ // - Input: compute audioInput.
outputContainer.loadVocab(vocabDir);
loadParameters(paramsDir, paramsContainer);
- loadAudio(input_featuresDir, audioInput);
+ runPreprocess(rawAudioContainer, audioInput);
/// Run Whisper Inference
// - Perform the forward function.
// - Find and append the generated token.
// - Continue iterating until the terminal condition is met.
- for (int i = 0; i < MaxTokenLength - 1; i++) {
+ for (size_t i = 0; i < MaxTokenLength - 1; i++) {
const auto inferenceStart = std::chrono::high_resolution_clock::now();
// Execute the forward pass of the model.
     _mlir_ciface_forward(resultContainer, &paramsContainer, &audioInput,
diff --git a/examples/ConvOpt/CMakeLists.txt b/examples/ConvOpt/CMakeLists.txt
index 83aa26b686..e01f2b46c6 100644
--- a/examples/ConvOpt/CMakeLists.txt
+++ b/examples/ConvOpt/CMakeLists.txt
@@ -16,14 +16,14 @@ message(STATUS "Spliting size: ${SPLITING_SIZE}")
add_custom_command(OUTPUT conv2d.o
COMMAND ${CMAKE_BINARY_DIR}/bin/buddy-opt ${BUDDY_EXAMPLES_DIR}/ConvOpt/conv2d.mlir -conv-vectorization="strip-mining=${SPLITING_SIZE}" -lower-affine -convert-scf-to-cf -convert-vector-to-llvm -finalize-memref-to-llvm -llvm-request-c-wrappers -convert-func-to-llvm -reconcile-unrealized-casts |
- ${LLVM_MLIR_BINARY_DIR}/mlir-translate --mlir-to-llvmir |
- ${LLVM_MLIR_BINARY_DIR}/llc -mtriple=${BUDDY_TARGET_TRIPLE} -mattr=${BUDDY_OPT_ATTR} --filetype=obj -o ${BUDDY_BINARY_DIR}/../examples/ConvOpt/conv2d.o
+ ${LLVM_TOOLS_BINARY_DIR}/mlir-translate --mlir-to-llvmir |
+ ${LLVM_TOOLS_BINARY_DIR}/llc -mtriple=${BUDDY_TARGET_TRIPLE} -mattr=${BUDDY_OPT_ATTR} --filetype=obj -o ${BUDDY_BINARY_DIR}/../examples/ConvOpt/conv2d.o
DEPENDS buddy-opt)
# add_custom_command(OUTPUT conv2d.o
-# COMMAND ${LLVM_MLIR_BINARY_DIR}/mlir-opt ${BUDDY_EXAMPLES_DIR}/ConvOpt/conv2d.mlir -convert-linalg-to-loops -convert-scf-to-cf -convert-linalg-to-llvm -lower-affine -convert-scf-to-cf --finalize-memref-to-llvm -convert-func-to-llvm='emit-c-wrappers=1' -reconcile-unrealized-casts |
-# ${LLVM_MLIR_BINARY_DIR}/mlir-translate --mlir-to-llvmir |
-# ${LLVM_MLIR_BINARY_DIR}/llc -mtriple=${BUDDY_OPT_TRIPLE} -mattr=${BUDDY_OPT_ATTR} --filetype=obj -o ${BUDDY_BINARY_DIR}/../examples/ConvOpt/conv2d.o
+# COMMAND ${LLVM_TOOLS_BINARY_DIR}/mlir-opt ${BUDDY_EXAMPLES_DIR}/ConvOpt/conv2d.mlir -convert-linalg-to-loops -convert-scf-to-cf -convert-linalg-to-llvm -lower-affine -convert-scf-to-cf --finalize-memref-to-llvm -convert-func-to-llvm='emit-c-wrappers=1' -reconcile-unrealized-casts |
+# ${LLVM_TOOLS_BINARY_DIR}/mlir-translate --mlir-to-llvmir |
+# ${LLVM_TOOLS_BINARY_DIR}/llc -mtriple=${BUDDY_OPT_TRIPLE} -mattr=${BUDDY_OPT_ATTR} --filetype=obj -o ${BUDDY_BINARY_DIR}/../examples/ConvOpt/conv2d.o
# DEPENDS buddy-opt)
add_library(Conv2D STATIC conv2d.o)
diff --git a/examples/DAPDialect/CMakeLists.txt b/examples/DAPDialect/CMakeLists.txt
index b147d56047..96b921ee3a 100644
--- a/examples/DAPDialect/CMakeLists.txt
+++ b/examples/DAPDialect/CMakeLists.txt
@@ -20,6 +20,7 @@ add_executable(buddy-fir FIRLowpass.cpp)
add_dependencies(buddy-fir buddy-opt)
target_link_libraries(buddy-fir
BuddyLibDAP
+ mlir_c_runner_utils
)
#-------------------------------------------------------------------------------
@@ -30,6 +31,7 @@ add_executable(buddy-biquad biquad.cpp)
add_dependencies(buddy-biquad buddy-opt)
target_link_libraries(buddy-biquad
BuddyLibDAP
+ mlir_c_runner_utils
)
#-------------------------------------------------------------------------------
@@ -40,10 +42,30 @@ add_executable(buddy-iir-scalar IIRLowpass.cpp)
add_dependencies(buddy-iir-scalar buddy-opt)
target_link_libraries(buddy-iir-scalar
BuddyLibDAP
+ mlir_c_runner_utils
)
add_executable(buddy-iir-vectorization IIRVectorization.cpp)
add_dependencies(buddy-iir-vectorization buddy-opt)
target_link_libraries(buddy-iir-vectorization
- BuddyLibDAPVectorization
+ BuddyLibDAP
+ mlir_c_runner_utils
+)
+
+#-------------------------------------------------------------------------------
+# Buddy DAP Dialect WhisperPreprocess Operation
+#-------------------------------------------------------------------------------
+
+add_executable(buddy-whisper-preprocess WhisperPreprocess.cpp)
+add_dependencies(buddy-whisper-preprocess buddy-opt)
+target_link_libraries(buddy-whisper-preprocess
+ BuddyLibDAP
+ mlir_c_runner_utils
+)
+
+add_executable(buddy-rfft RFFT.cpp)
+add_dependencies(buddy-rfft buddy-opt)
+target_link_libraries(buddy-rfft
+ BuddyLibDAP
+ mlir_c_runner_utils
)
diff --git a/examples/DAPDialect/FIRLowpass.cpp b/examples/DAPDialect/FIRLowpass.cpp
index cfce56091d..3a8217730a 100644
--- a/examples/DAPDialect/FIRLowpass.cpp
+++ b/examples/DAPDialect/FIRLowpass.cpp
@@ -14,45 +14,76 @@
//
//===----------------------------------------------------------------------===//
//
-// This file implements an end to end example for fir filter in buddy-mlir. It
-// generates coefficients for a filter and apply it on a piece of mono audio,
-// then saves the audio.
-// This file will be linked with the object file generated by mlir to generate
-// the executable file.
+// An end-to-end example of an FIR (Finite Impulse Response) operation in
+// buddy-mlir.
//
//===----------------------------------------------------------------------===//
#include
+#include
#include
using namespace dap;
using namespace std;
-int main(int argc, char *argv[]) {
- string fileName = "../../tests/Interface/core/NASA_Mars.wav";
- ;
- string saveFileName = "FIR_NASA_Mars.wav";
- if (argc >= 2) {
- fileName = argv[1];
- }
- if (argc == 3) {
- saveFileName = argv[2];
- }
- cout << "Usage: FIRLowpass [loadPath] [savePath]" << endl;
- cout << "Current specified path: \n";
- cout << "Load: " << fileName << endl;
- cout << "Save: " << saveFileName << endl;
+// Print [Log] label in bold blue format.
+void printLogLabel() { std::cout << "\033[34;1m[Log] \033[0m"; }
+
+int main() {
+ // Print the title of this example.
+ const std::string title = "FIR Operation Powered by Buddy Compiler";
+ std::cout << "\033[33;1m" << title << "\033[0m" << std::endl;
+
+ // Generate the kernel for a FIR filter operation.
+ // Params:
+ // Input kernel: Stores generated kernel data.
+ // Type: Specifies the window type from the WINDOW_TYPE enum class.
+ // Length: The length of the filter.
+ // Cutoff: The lowpass cutoff frequency.
+ // Argument: Filter-specific arguments, with size limited by the
+ // WINDOW_TYPE.
intptr_t kernelSize = 100;
MemRef kernel(&kernelSize);
- dap::firLowpass(kernel, dap::WINDOW_TYPE::BLACKMANHARRIS7,
- kernelSize, 0.3, nullptr);
- auto aud = dap::Audio(fileName);
- aud.getAudioFile().printSummary();
- dap::Audio output;
- output.fetchMetadata(aud.getAudioFile());
- output.getAudioFile().setAudioBuffer(nullptr);
- dap::fir(&aud.getMemRef(), &kernel, &output.getMemRef());
- cout << "Saving file:" << endl;
- cout << (output.save(saveFileName) ? "OK" : "ERROR") << endl;
+ dap::firLowpass(/*input=*/kernel,
+ /*type=*/dap::WINDOW_TYPE::BLACKMANHARRIS7,
+ /*len=*/kernelSize, /*cutoff=*/0.3,
+ /*args=*/nullptr);
+
+ // Initialize data containers.
+ // Params:
+ // Input container: Stores the raw audio data.
+ // Returns:
+ // Output memory reference: Provides a MemRef for saving the output.
+ Audio inputContainer("../../tests/Interface/core/TestAudio.wav");
+ intptr_t samplesNum = static_cast(inputContainer.getSamplesNum());
+ MemRef outputMemRef(&samplesNum);
+
+ // Apply the FIR filter operation to the audio data.
+ printLogLabel();
+ std::cout << "Running FIR operation..." << std::endl;
+ const auto loadStart = std::chrono::high_resolution_clock::now();
+ dap::fir(&inputContainer, &kernel, &outputMemRef);
+ const auto loadEnd = std::chrono::high_resolution_clock::now();
+ const std::chrono::duration loadTime =
+ loadEnd - loadStart;
+ printLogLabel();
+ std::cout << "Audio processing time: " << (double)(loadTime.count()) / 1000
+ << "s\n"
+ << std::endl;
+
+ // Convert a MemRef object to an Audio object and set the metadata.
+ Audio outputContainer(std::move(outputMemRef));
+ outputContainer.setBitDepth(inputContainer.getBitDepth());
+ outputContainer.setSamplesNum(inputContainer.getSamplesNum());
+ outputContainer.setChannelsNum(inputContainer.getChannelsNum());
+ outputContainer.setSampleRate(inputContainer.getSampleRate());
+
+ // Save the processed data to an audio file.
+ std::string saveFileName = "FIRTestAudio.wav";
+ outputContainer.saveToFile(saveFileName, "wave");
+ printLogLabel();
+ std::cout << "Processed audio data saved in: " << saveFileName << "\n"
+ << std::endl;
+
return 0;
}
diff --git a/examples/DAPDialect/IIRLowpass.cpp b/examples/DAPDialect/IIRLowpass.cpp
index 1b69ec08b0..ec5de06c95 100644
--- a/examples/DAPDialect/IIRLowpass.cpp
+++ b/examples/DAPDialect/IIRLowpass.cpp
@@ -14,52 +14,81 @@
//
//===----------------------------------------------------------------------===//
//
-// This file implements an end to end example for iir filter in buddy-mlir. It
-// generates coefficients for a filter and apply it on a piece of mono audio,
-// then saves the audio.
-// This file will be linked with the object file generated by mlir to generate
-// the executable file.
+// An end-to-end example of the scalar version IIR (Infinite Impulse Response)
+// operation in buddy-mlir.
//
//===----------------------------------------------------------------------===//
#include
+#include
#include
using namespace dap;
using namespace std;
+// Print [Log] label in bold blue format.
+void printLogLabel() { std::cout << "\033[34;1m[Log] \033[0m"; }
+
int main(int argc, char *argv[]) {
- string fileName = "../../tests/Interface/core/NASA_Mars.wav";
- string saveFileName = "IIR_LOWPASS_NASA_Mars.wav";
- if (argc >= 2) {
- fileName = argv[1];
- }
- if (argc == 3) {
- saveFileName = argv[2];
- }
- cout << "Usage: IIRLowpass [loadPath] [savePath]" << endl;
- cout << "Current specified path: \n";
- cout << "Load: " << fileName << endl;
- cout << "Save: " << saveFileName << endl;
- // Order of butterworth filter
+ // Print the title of this example.
+ const std::string title =
+ "Scalar Version IIR Operation Powered by Buddy Compiler";
+ std::cout << "\033[33;1m" << title << "\033[0m" << std::endl;
+
+ // Allocate kernel MemRef for an IIR filter operation.
+ // Params:
+ // Order: The order of the butterworth filter.
+ // Parameter size: Each SOS matrix has 6 parameters.
int order = 8;
- // Each SOS matrix has 6 paramters.
intptr_t kernelSize[2] = {int(order / 2), 6};
MemRef kernel(kernelSize);
- // cutoff frequency = 1000, fs = 48000.
- dap::iirLowpass(kernel, dap::butterworth(order), 1000,
- 48000);
- auto aud = dap::Audio(fileName);
- aud.getAudioFile().printSummary();
- dap::Audio output;
- output.fetchMetadata(aud.getAudioFile());
- output.getAudioFile().setAudioBuffer(nullptr);
+ // Generate the kernel for an IIR filter operation.
+ // Params:
+ // Input kernel: Stores generated kernel data.
+  //     Lowpass filter: Supports Butterworth filter up to order 12 for now.
+ // Lowpass frequency: The lowpass cutoff frequency.
+ // Sampling frequency: The rate at which the input data is sampled.
+ dap::iirLowpass(/*kernel=*/kernel,
+ /*filter=*/dap::butterworth(order),
+ /*frequency=*/1000,
+ /*fs=*/48000);
+
+ // Initialize data containers.
+ // Params:
+ // Input container: Stores the raw audio data.
+ // Returns:
+ // Output memory reference: Provides a MemRef for saving the output.
+ Audio inputContainer("../../tests/Interface/core/TestAudio.wav");
+ intptr_t samplesNum = static_cast(inputContainer.getSamplesNum());
+ MemRef outputMemRef(&samplesNum);
+
+ // Apply scalar version IIR operation to the audio data.
+ printLogLabel();
+ std::cout << "Running scalar version IIR operation..." << std::endl;
+ const auto loadStart = std::chrono::high_resolution_clock::now();
+ dap::IIR(&inputContainer, &kernel, &outputMemRef);
+ const auto loadEnd = std::chrono::high_resolution_clock::now();
+ const std::chrono::duration loadTime =
+ loadEnd - loadStart;
+ printLogLabel();
+ std::cout << "Audio processing time: " << (double)(loadTime.count()) / 1000
+ << "s\n"
+ << std::endl;
- dap::IIR(&aud.getMemRef(), &kernel, &output.getMemRef());
+ // Convert a MemRef object to an Audio object and set the metadata.
+ Audio outputContainer(std::move(outputMemRef));
+ outputContainer.setBitDepth(inputContainer.getBitDepth());
+ outputContainer.setSamplesNum(inputContainer.getSamplesNum());
+ outputContainer.setChannelsNum(inputContainer.getChannelsNum());
+ outputContainer.setSampleRate(inputContainer.getSampleRate());
- cout << "Saving file:" << endl;
- cout << (output.save(saveFileName) ? "OK" : "ERROR") << endl;
+ // Save the processed data to an audio file.
+ std::string saveFileName = "ScalarVersionIIRTestAudio.wav";
+ outputContainer.saveToFile(saveFileName, "wave");
+ printLogLabel();
+ std::cout << "Processed audio data saved in: " << saveFileName << "\n"
+ << std::endl;
return 0;
}
diff --git a/examples/DAPDialect/IIRVectorization.cpp b/examples/DAPDialect/IIRVectorization.cpp
index c7d0c19553..e766c85889 100644
--- a/examples/DAPDialect/IIRVectorization.cpp
+++ b/examples/DAPDialect/IIRVectorization.cpp
@@ -14,53 +14,82 @@
//
//===----------------------------------------------------------------------===//
//
-// This file implements an end to end example for iir filter in buddy-mlir. It
-// generates coefficients for a filter and apply it on a piece of mono audio,
-// then saves the audio.
-// This file will be linked with the object file which use dap vectorization
-// pass to generate the executable file.
+// An end-to-end example of the vectorized IIR (Infinite Impulse Response)
+// operation in buddy-mlir.
//
//===----------------------------------------------------------------------===//
#include
+#include
#include
using namespace dap;
using namespace std;
-int main(int argc, char *argv[]) {
- string fileName = "../../tests/Interface/core/NASA_Mars.wav";
- string saveFileName = "IIR_VECTORIZATION_PASS_NASA_Mars.wav";
- if (argc >= 2) {
- fileName = argv[1];
- }
- if (argc == 3) {
- saveFileName = argv[2];
- }
- cout << "Usage: IIRVectorizationPass [loadPath] [savePath]" << endl;
- cout << "Current specified path: \n";
- cout << "Load: " << fileName << endl;
- cout << "Save: " << saveFileName << endl;
- // Order for butterworth filter.
+// Print [Log] label in bold blue format.
+void printLogLabel() { std::cout << "\033[34;1m[Log] \033[0m"; }
+
+int main() {
+ // Print the title of this example.
+ const std::string title =
+ "Vectorized IIR Operation Powered by Buddy Compiler";
+ std::cout << "\033[33;1m" << title << "\033[0m" << std::endl;
+
+ // Allocate kernel MemRef for an IIR filter operation.
+ // Params:
+ // Order: The order of the butterworth filter.
+ // Parameter size: Each SOS matrix has 6 parameters.
int order = 8;
- // Each SOS matrix has 6 paramters.
intptr_t kernelSize[2] = {int(order / 2), 6};
MemRef kernel(kernelSize);
- // cutoff frequency = 1000, fs = 48000.
- dap::iirLowpass(kernel, dap::butterworth(order), 1000,
- 48000);
- auto aud = dap::Audio(fileName);
- aud.getAudioFile().printSummary();
- dap::Audio output;
- output.fetchMetadata(aud.getAudioFile());
- output.getAudioFile().setAudioBuffer(nullptr);
+ // Generate the kernel for an IIR filter operation.
+ // Params:
+ // Input kernel: Stores generated kernel data.
+  //     Lowpass filter: Supports Butterworth filter up to order 12 for now.
+ // Lowpass frequency: The lowpass cutoff frequency.
+ // Sampling frequency: The rate at which the input data is sampled.
+ dap::iirLowpass(/*kernel=*/kernel,
+ /*filter=*/dap::butterworth(order),
+ /*frequency=*/1000,
+ /*fs=*/48000);
+
+ // Initialize data containers.
+ // Params:
+ // Input container: Stores the raw audio data.
+ // Returns:
+ // Output memory reference: Provides a MemRef for saving the output.
+ Audio inputContainer("../../tests/Interface/core/TestAudio.wav");
+ intptr_t samplesNum = static_cast(inputContainer.getSamplesNum());
+ MemRef outputMemRef(&samplesNum);
- dap::IIR(&aud.getMemRef(), &kernel, &output.getMemRef(),
+ // Apply vectorized IIR operation to the audio data.
+ printLogLabel();
+ std::cout << "Running vectorized IIR operation..." << std::endl;
+ const auto loadStart = std::chrono::high_resolution_clock::now();
+ dap::IIR(&inputContainer, &kernel, &outputMemRef,
/*isVectorization=*/true);
+ const auto loadEnd = std::chrono::high_resolution_clock::now();
+ const std::chrono::duration loadTime =
+ loadEnd - loadStart;
+ printLogLabel();
+ std::cout << "Audio processing time: " << (double)(loadTime.count()) / 1000
+ << "s\n"
+ << std::endl;
+
+ // Convert a MemRef object to an Audio object and set the metadata.
+ Audio outputContainer(std::move(outputMemRef));
+ outputContainer.setBitDepth(inputContainer.getBitDepth());
+ outputContainer.setSamplesNum(inputContainer.getSamplesNum());
+ outputContainer.setChannelsNum(inputContainer.getChannelsNum());
+ outputContainer.setSampleRate(inputContainer.getSampleRate());
- cout << "Saving file:" << endl;
- cout << (output.save(saveFileName) ? "OK" : "ERROR") << endl;
+ // Save the processed data to an audio file.
+ std::string saveFileName = "VectorizedIIRTestAudio.wav";
+ outputContainer.saveToFile(saveFileName, "wave");
+ printLogLabel();
+ std::cout << "Processed audio data saved in: " << saveFileName << "\n"
+ << std::endl;
return 0;
}
diff --git a/examples/DAPDialect/RFFT.cpp b/examples/DAPDialect/RFFT.cpp
new file mode 100644
index 0000000000..993fec95e1
--- /dev/null
+++ b/examples/DAPDialect/RFFT.cpp
@@ -0,0 +1,75 @@
+//===- RFFT.cpp - Example of DAP RFFT Operation ---------------------------===//
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+//
+//===----------------------------------------------------------------------===//
+//
+// An example of the RFFT function from Whisper Preprocessor operation.
+//
+//===----------------------------------------------------------------------===//
+
+#include
+#include
+#include
+#include
+
+#define testLength 840
+
+using namespace dap;
+using namespace std;
+
+// Print [Log] label in bold blue format.
+void printLogLabel() { std::cout << "\033[34;1m[Log] \033[0m"; }
+
+// Write preprocessing results to a text file.
+void printResult(MemRef &outputMemRef) {
+ ofstream fout("whisperPreprocessResultRFFT.txt");
+ // Print title.
+ fout << "-----------------------------------------" << std::endl;
+ fout << "[ Buddy RFFT Result ]" << std::endl;
+ fout << "-----------------------------------------" << std::endl;
+  // Print result data.
+ for (int i = 0; i < testLength; ++i) {
+ fout << outputMemRef[i] << std::endl;
+ }
+ fout.close();
+}
+
+int main() {
+ // Print the title of this example.
+ const std::string title = "RFFT Operation Powered by Buddy Compiler";
+ std::cout << "\033[33;1m" << title << "\033[0m" << std::endl;
+
+ double *inputAlign = new double[testLength];
+ for (int i = 0; i < testLength; ++i) {
+ inputAlign[i] = static_cast(i);
+ }
+ intptr_t inputSizes[1] = {testLength};
+ MemRef inputMemRef(inputAlign, inputSizes);
+
+ printLogLabel();
+ std::cout << "Running RFFT operation" << std::endl;
+ const auto loadStart = std::chrono::high_resolution_clock::now();
+ dap::RFFT(&inputMemRef);
+ const auto loadEnd = std::chrono::high_resolution_clock::now();
+ const std::chrono::duration loadTime =
+ loadEnd - loadStart;
+ printLogLabel();
+ std::cout << "RFFT time: " << (double)(loadTime.count()) / 1000
+ << "s\n"
+ << std::endl;
+
+ printResult(inputMemRef);
+
+ return 0;
+}
diff --git a/examples/DAPDialect/WhisperPreprocess.cpp b/examples/DAPDialect/WhisperPreprocess.cpp
new file mode 100644
index 0000000000..db69ac836e
--- /dev/null
+++ b/examples/DAPDialect/WhisperPreprocess.cpp
@@ -0,0 +1,77 @@
+//===- WhisperPreprocess.cpp ----------------------------------------------===//
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+//
+//===----------------------------------------------------------------------===//
+//
+// An example of the Whisper Preprocessor operation.
+//
+//===----------------------------------------------------------------------===//
+
+#include
+#include
+#include
+#include
+
+using namespace dap;
+using namespace std;
+
+// Print [Log] label in bold blue format.
+void printLogLabel() { std::cout << "\033[34;1m[Log] \033[0m"; }
+
+// Write preprocessing results to a text file.
+void printResult(MemRef &outputMemRef) {
+ ofstream fout("whisperPreprocessResult.txt");
+ // Print title.
+ fout << "-----------------------------------------" << std::endl;
+ fout << "[ Whisper Preprocess Result ]" << std::endl;
+ fout << "-----------------------------------------" << std::endl;
+  // Print result data.
+ for (int i = 0; i < 240000; ++i) {
+ fout << outputMemRef[i] << std::endl;
+ }
+ fout.close();
+}
+
+int main() {
+ // Print the title of this example.
+ const std::string title = "Whisper Preprocess Powered by Buddy Compiler";
+ std::cout << "\033[33;1m" << title << "\033[0m" << std::endl;
+
+ // Initialize data containers.
+ // Params:
+ // Input container: Stores raw audio data.
+ // Returns:
+ // Output memory reference: Features formatted as memref<1x80x3000xf32>.
+ Audio inputContainer("../../examples/BuddyWhisper/audio.wav");
+ float *outputAlign = new float[240000];
+ intptr_t outputSizes[3] = {1, 80, 3000};
+ MemRef outputMemRef(outputAlign, outputSizes);
+
+ // Compute audio features from raw audio data.
+ printLogLabel();
+ std::cout << "Preprocessing audio..." << std::endl;
+ const auto loadStart = std::chrono::high_resolution_clock::now();
+ dap::whisperPreprocess(&inputContainer, &outputMemRef);
+ const auto loadEnd = std::chrono::high_resolution_clock::now();
+ const std::chrono::duration loadTime =
+ loadEnd - loadStart;
+ printLogLabel();
+ std::cout << "Audio preprocess time: " << (double)(loadTime.count()) / 1000
+ << "s\n"
+ << std::endl;
+
+ // printResult(outputMemRef);
+
+ return 0;
+}
diff --git a/examples/DAPDialect/biquad.cpp b/examples/DAPDialect/biquad.cpp
index 14a78084a0..e606c2d0e8 100644
--- a/examples/DAPDialect/biquad.cpp
+++ b/examples/DAPDialect/biquad.cpp
@@ -14,45 +14,70 @@
//
//===----------------------------------------------------------------------===//
//
-// This file implements an end to end example for biquad filter in buddy-mlir.
-// It generates coefficients for a filter and apply it on a piece of mono audio,
-// then saves the audio.
-// This file will be linked with the object file generated by mlir to generate
-// the executable file.
+// An end-to-end example of a biquad operation in buddy-mlir.
//
//===----------------------------------------------------------------------===//
#include
+#include
#include
using namespace dap;
using namespace std;
-int main(int argc, char *argv[]) {
- string fileName = "../../tests/Interface/core/NASA_Mars.wav";
- string saveFileName = "BIQUAD_NASA_Mars.wav";
- if (argc >= 2) {
- fileName = argv[1];
- }
- if (argc == 3) {
- saveFileName = argv[2];
- }
- cout << "Usage: BiquadLowpass [loadPath] [savePath]" << endl;
- cout << "Current specified path: \n";
- cout << "Load: " << fileName << endl;
- cout << "Save: " << saveFileName << endl;
+// Print [Log] label in bold blue format.
+void printLogLabel() { std::cout << "\033[34;1m[Log] \033[0m"; }
+
+int main() {
+ // Print the title of this example.
+ const std::string title = "Biquad Operation Powered by Buddy Compiler";
+ std::cout << "\033[33;1m" << title << "\033[0m" << std::endl;
+
+ // Generate the kernel for a biquad filter operation.
+ // Params:
+ // Input kernel: Stores generated kernel data.
+ // Frequency: Normalized frequency (frequency_Hz / samplerate_Hz).
+ // Quality factor: Defines the filter's bandwidth relative to its
+ // center frequency.
intptr_t kernelSize = 6;
MemRef kernel(&kernelSize);
- dap::biquadLowpass(kernel, 0.3, -1.0);
- auto aud = dap::Audio(fileName);
- aud.getAudioFile().printSummary();
- dap::Audio output;
- output.fetchMetadata(aud.getAudioFile());
- output.getAudioFile().setAudioBuffer(nullptr);
+ dap::biquadLowpass(kernel, /*frequency=*/0.3, /*Q=*/-1.0);
+
+ // Initialize data containers.
+ // Params:
+ // Input container: Stores the raw audio data.
+ // Returns:
+ // Output memory reference: Provides a MemRef for saving the output.
+ Audio inputContainer("../../tests/Interface/core/TestAudio.wav");
+ intptr_t samplesNum = static_cast(inputContainer.getSamplesNum());
+ MemRef outputMemRef(&samplesNum);
+
+ // Apply the biquad filter operation to the audio data.
+ printLogLabel();
+ std::cout << "Running biquad operation..." << std::endl;
+ const auto loadStart = std::chrono::high_resolution_clock::now();
+ dap::biquad(&inputContainer, &kernel, &outputMemRef);
+ const auto loadEnd = std::chrono::high_resolution_clock::now();
+ const std::chrono::duration loadTime =
+ loadEnd - loadStart;
+ printLogLabel();
+ std::cout << "Audio processing time: " << (double)(loadTime.count()) / 1000
+ << "s\n"
+ << std::endl;
+
+ // Convert a MemRef object to an Audio object and set the metadata.
+ Audio outputContainer(std::move(outputMemRef));
+ outputContainer.setBitDepth(inputContainer.getBitDepth());
+ outputContainer.setSamplesNum(inputContainer.getSamplesNum());
+ outputContainer.setChannelsNum(inputContainer.getChannelsNum());
+ outputContainer.setSampleRate(inputContainer.getSampleRate());
- dap::biquad(&aud.getMemRef(), &kernel, &output.getMemRef());
+ // Save the processed data to an audio file.
+ std::string saveFileName = "BiquadTestAudio.wav";
+ outputContainer.saveToFile(saveFileName, "wave");
+ printLogLabel();
+ std::cout << "Processed audio data saved in: " << saveFileName << "\n"
+ << std::endl;
- cout << "Saving file:" << endl;
- cout << (output.save(saveFileName) ? "OK" : "ERROR") << endl;
return 0;
}
diff --git a/examples/DIPDialect/CMakeLists.txt b/examples/DIPDialect/CMakeLists.txt
index 27abb889f2..2f897ad633 100644
--- a/examples/DIPDialect/CMakeLists.txt
+++ b/examples/DIPDialect/CMakeLists.txt
@@ -1,4 +1,4 @@
-set(DIP_LIBS ${JPEG_LIBRARY} ${PNG_LIBRARY} BuddyLibDIP)
+set(DIP_LIBS ${JPEG_LIBRARY} ${PNG_LIBRARIES} BuddyLibDIP)
if(BUDDY_ENABLE_OPENCV)
find_package(OpenCV REQUIRED CONFIG)
@@ -25,3 +25,9 @@ target_link_libraries(rotation2D ${DIP_LIBS})
add_executable(resize2D resize2D.cpp)
target_link_libraries(resize2D ${DIP_LIBS})
+
+add_executable(resize4D_nhwc resize4D_nhwc.cpp)
+target_link_libraries(resize4D_nhwc ${DIP_LIBS})
+
+add_executable(resize4D_nchw resize4D_nchw.cpp)
+target_link_libraries(resize4D_nchw ${DIP_LIBS})
diff --git a/examples/DIPDialect/resize4D_nchw.cpp b/examples/DIPDialect/resize4D_nchw.cpp
new file mode 100644
index 0000000000..95d77cc27d
--- /dev/null
+++ b/examples/DIPDialect/resize4D_nchw.cpp
@@ -0,0 +1,58 @@
+//====- resize4D_nchw.cpp - Example of buddy-opt tool ========================//
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements a 4D resize example with dip.resize_4d operation.
+// The dip.resize_4d operation will be compiled into an object file with the
+// buddy-opt tool.
+// This file will be linked with the object file to generate the executable
+// file.
+//
+//===----------------------------------------------------------------------===//
+#include "buddy/DIP/imgcodecs/loadsave.h"
+#include
+#include
+#include
+#include
+#include
+#include
+
+using namespace std;
+
+void testImplementation(int argc, char *argv[]) {
+  // Read as color image.
+ dip::Image inputBatch(argv[1], dip::DIP_RGB, true);
+
+ // Note : Both values in output image dimensions and scaling ratios must be
+ // positive numbers.
+ MemRef output = dip::Resize4D_NCHW(
+ &inputBatch, dip::INTERPOLATION_TYPE::BILINEAR_INTERPOLATION,
+      {1, 3, 224, 224} /*{batch, channels, rows, cols}*/);
+
+ // Define Img with the output of Resize4D.
+ intptr_t outSizes[3] = {output.getSizes()[2], output.getSizes()[3],
+ output.getSizes()[1]};
+
+ Img outputImageResize4D(output.getData(), outSizes);
+
+ // dip::imwrite(argv[2], outputImageResize4D);
+
+ return;
+}
+
+int main(int argc, char *argv[]) {
+ testImplementation(argc, argv);
+ return 0;
+}
diff --git a/examples/DIPDialect/resize4D_nhwc.cpp b/examples/DIPDialect/resize4D_nhwc.cpp
new file mode 100644
index 0000000000..affb8a8a09
--- /dev/null
+++ b/examples/DIPDialect/resize4D_nhwc.cpp
@@ -0,0 +1,61 @@
+//====- resize4D_nhwc.cpp - Example of buddy-opt tool ========================//
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements a 4D resize example with dip.resize_4d operation.
+// The dip.resize_4d operation will be compiled into an object file with the
+// buddy-opt tool.
+// This file will be linked with the object file to generate the executable
+// file.
+//
+//===----------------------------------------------------------------------===//
+#include "buddy/DIP/imgcodecs/loadsave.h"
+#include
+#include
+#include
+#include
+#include
+
+using namespace std;
+
+void testImplementation(int argc, char *argv[]) {
+  // Read as color image.
+ Img input = dip::imread(argv[1], dip::IMGRD_COLOR);
+
+ intptr_t sizes[4] = {1, input.getSizes()[0], input.getSizes()[1],
+ input.getSizes()[2]};
+ Img inputBatch(input.getData(), sizes);
+
+ // Note : Both values in output image dimensions and scaling ratios must be
+ // positive numbers.
+ MemRef output = dip::Resize4D_NHWC(
+ &inputBatch, dip::INTERPOLATION_TYPE::BILINEAR_INTERPOLATION,
+      {1, 224, 224, 3} /*{batch, rows, cols, channels}*/);
+
+ // Define Img with the output of Resize4D.
+ intptr_t outSizes[3] = {output.getSizes()[1], output.getSizes()[2],
+ output.getSizes()[3]};
+
+ Img outputImageResize4D(output.getData(), outSizes);
+
+ dip::imwrite(argv[2], outputImageResize4D);
+
+ return;
+}
+
+int main(int argc, char *argv[]) {
+ testImplementation(argc, argv);
+ return 0;
+}
diff --git a/examples/MLIRCF/.gitignore b/examples/MLIRCF/.gitignore
new file mode 100644
index 0000000000..790429d34e
--- /dev/null
+++ b/examples/MLIRCF/.gitignore
@@ -0,0 +1,3 @@
+log*
+core
+a.out
diff --git a/examples/MLIRCF/cf-iteration-exit.mlir b/examples/MLIRCF/cf-iteration-exit.mlir
new file mode 100644
index 0000000000..89281c9e34
--- /dev/null
+++ b/examples/MLIRCF/cf-iteration-exit.mlir
@@ -0,0 +1,47 @@
+// RUN: buddy-opt %s \
+// RUN: -convert-vector-to-llvm \
+// RUN: -convert-func-to-llvm \
+// RUN: -reconcile-unrealized-casts \
+// RUN: | mlir-cpu-runner -e main -entry-point-result=void \
+// RUN: -shared-libs=%mlir_runner_utils_dir/libmlir_runner_utils%shlibext \
+// RUN: -shared-libs=%mlir_runner_utils_dir/libmlir_c_runner_utils%shlibext \
+// RUN: | FileCheck %s
+
+// The example is equivalent to the following code.
+// int main() {
+// int val = 0;
+// for (int i = 1; i < 5; i++) {
+// val += 5;
+// if (i == 3) {
+// std::cout << val << std::endl;
+// return 0;
+// }
+// }
+// return 0;
+// }
+
+module {
+ func.func @main() {
+ %c0 = arith.constant 0 : index
+ %c3 = arith.constant 3 : index
+ %c5 = arith.constant 5 : index
+ %c1 = arith.constant 1 : index
+ %cst_0 = arith.constant 0.000000e+00 : f32
+ %cst_5 = arith.constant 5.000000e+00 : f32
+ cf.br ^bb1(%c0, %cst_0 : index, f32)
+ ^bb1(%0: index, %1: f32):
+ %2 = arith.cmpi slt, %0, %c5 : index
+ cf.cond_br %2, ^bb2, ^bb4(%1: f32)
+ ^bb2:
+ %3 = arith.addf %1, %cst_5 : f32
+ %4 = arith.addi %0, %c1 : index
+ cf.br ^bb3 (%4, %3 : index, f32)
+ ^bb3(%iter_idx: index, %iter_var: f32):
+ %eq = arith.cmpi eq, %iter_idx, %c3 : index
+ cf.cond_br %eq, ^bb4(%iter_var: f32), ^bb1(%iter_idx, %iter_var: index, f32)
+ ^bb4(%ret_var: f32):
+ // CHECK: 15
+ vector.print %ret_var : f32
+ return
+ }
+}
diff --git a/examples/MLIRCF/makefile b/examples/MLIRCF/makefile
new file mode 100644
index 0000000000..5837ebf442
--- /dev/null
+++ b/examples/MLIRCF/makefile
@@ -0,0 +1,44 @@
+#!/bin/bash
+BUDDY_OPT := ../../build/bin/buddy-opt
+MLIR_OPT := ../../llvm/build/bin/mlir-opt
+MLIR_TRANSLATE := ../../llvm/build/bin/mlir-translate
+MLIR_CPU_RUNNER := ../../llvm/build/bin/mlir-cpu-runner
+LLC := ../../llvm/build/bin/llc
+OPT_FLAG := -O0
+CLANG := ../../llvm/build/bin/clang
+MLIR_LIB := ../../llvm/build/lib/
+BUDDY_LIB := ../../build/midend/lib/
+
+ifeq ($(shell uname),Linux)
+MLIR_RUNNER_UTILS := ../../llvm/build/lib/libmlir_runner_utils.so
+MLIR_C_RUNNER_UTILS := ../../llvm/build/lib/libmlir_c_runner_utils.so
+MLIR_ASYNC_RUNTIME := ../../llvm/build/lib/libmlir_async_runtime.so
+MTRIPLE := x86_64-unknown-linux-gnu
+else ifeq ($(shell uname),Darwin)
+MLIR_RUNNER_UTILS := ../../llvm/build/lib/libmlir_runner_utils.dylib
+MLIR_C_RUNNER_UTILS := ../../llvm/build/lib/libmlir_c_runner_utils.dylib
+MLIR_ASYNC_RUNTIME := ../../llvm/build/lib/libmlir_async_runtime.dylib
+MTRIPLE := x86_64-apple-darwin
+endif
+
+cf-iteration-exit-lower:
+ @${MLIR_OPT} ./cf-iteration-exit.mlir \
+ -convert-vector-to-llvm \
+ -convert-func-to-llvm \
+ -reconcile-unrealized-casts \
+ -o ./log.mlir
+
+cf-iteration-exit-translate:
+ @${MLIR_OPT} ./cf-iteration-exit.mlir \
+ -convert-vector-to-llvm \
+ -convert-func-to-llvm \
+ -reconcile-unrealized-casts | \
+ ${MLIR_TRANSLATE} --mlir-to-llvmir -o log.ll
+
+cf-iteration-exit-run:
+ @${MLIR_OPT} ./cf-iteration-exit.mlir \
+ -convert-vector-to-llvm \
+ -convert-func-to-llvm \
+ -reconcile-unrealized-casts | \
+ ${MLIR_CPU_RUNNER} ${OPT_FLAG} -e main -entry-point-result=void \
+ -shared-libs=${MLIR_RUNNER_UTILS} -shared-libs=${MLIR_C_RUNNER_UTILS}
diff --git a/examples/MLIRLinalg/linalg-batch-matmul-dync.mlir b/examples/MLIRLinalg/linalg-batch-matmul-dync.mlir
new file mode 100644
index 0000000000..04dea80df6
--- /dev/null
+++ b/examples/MLIRLinalg/linalg-batch-matmul-dync.mlir
@@ -0,0 +1,65 @@
+// RUN: buddy-opt %s -batchmatmul-tile-optimize="vec-size=64 kernel-m=4 kernel-n=2" \
+// RUN: -convert-linalg-to-loops -expand-strided-metadata -lower-affine \
+// RUN: -convert-scf-to-cf -convert-vector-to-llvm -finalize-memref-to-llvm \
+// RUN: -convert-arith-to-llvm -convert-func-to-llvm -reconcile-unrealized-casts | \
+// RUN: mlir-cpu-runner -e main -entry-point-result=void \
+// RUN: -shared-libs=%mlir_runner_utils_dir/libmlir_runner_utils%shlibext \
+// RUN: -shared-libs=%mlir_runner_utils_dir/libmlir_c_runner_utils%shlibext \
+// RUN: | FileCheck %s
+
+module {
+ func.func private @printMemrefF32(memref<*xf32>)
+
+ // Definition for the batch matrix multiplication function
+  func.func @buddy_batchmatmul_f32(%A: memref<?x?x?xf32>, %B: memref<?x?x?xf32>, %C: memref<?x?x?xf32>) {
+    linalg.batch_matmul
+      ins(%A, %B: memref<?x?x?xf32>, memref<?x?x?xf32>)
+      outs(%C: memref<?x?x?xf32>)
+    return
+  }
+
+ func.func @main(){
+ // Set up dims.
+ %cBatch = arith.constant 2:index
+ %cM = arith.constant 2 : index
+ %cN = arith.constant 3 : index
+ %cK = arith.constant 4 : index
+
+ // Set Init Value.
+ %cf1 = arith.constant 1.0 : f32
+ %cf2 = arith.constant 2.0 : f32
+ %c0 = arith.constant 0.0 : f32
+
+    %A = memref.alloc(%cBatch,%cM, %cK) : memref<?x?x?xf32>
+    %B = memref.alloc(%cBatch,%cK, %cN) : memref<?x?x?xf32>
+    %C = memref.alloc(%cBatch,%cM, %cN) : memref<?x?x?xf32>
+
+    linalg.fill
+    ins(%cf1 : f32)
+    outs(%A:memref<?x?x?xf32>)
+
+    linalg.fill
+    ins(%cf2 : f32)
+    outs(%B:memref<?x?x?xf32>)
+
+    linalg.fill
+    ins(%c0 : f32)
+    outs(%C:memref<?x?x?xf32>)
+
+    call @buddy_batchmatmul_f32(%A, %B, %C) : (memref<?x?x?xf32>, memref<?x?x?xf32>, memref<?x?x?xf32>) -> ()
+
+    %print_C = memref.cast %C : memref<?x?x?xf32> to memref<*xf32>
+    call @printMemrefF32(%print_C) : (memref<*xf32>) -> ()
+
+    memref.dealloc %C : memref<?x?x?xf32>
+    memref.dealloc %B : memref<?x?x?xf32>
+    memref.dealloc %A : memref<?x?x?xf32>
+ return
+ }
+}
+
+// CHECK: Unranked Memref base@ = {{.*}} rank = 3 offset = 0 sizes = [2, 2, 3] strides = [6, 3, 1] data =
+// CHECK{LITERAL}: [[[8, 8, 8],
+// CHECK{LITERAL}: [8, 8, 8]],
+// CHECK{LITERAL}: [[8, 8, 8],
+// CHECK{LITERAL}: [8, 8, 8]]]
diff --git a/examples/MLIRLinalg/linalg-conv2d_nhwc_fhwc.mlir b/examples/MLIRLinalg/linalg-conv2d_nhwc_fhwc.mlir
new file mode 100644
index 0000000000..ea81007153
--- /dev/null
+++ b/examples/MLIRLinalg/linalg-conv2d_nhwc_fhwc.mlir
@@ -0,0 +1,82 @@
+// RUN: buddy-opt %s \
+// RUN: -conv-nhwc-fhwc-optimize -convert-linalg-to-loops -lower-affine -convert-scf-to-cf \
+// RUN: -convert-vector-to-llvm -finalize-memref-to-llvm -convert-arith-to-llvm \
+// RUN: -convert-func-to-llvm -reconcile-unrealized-casts \
+// RUN: | mlir-cpu-runner -e main -entry-point-result=void \
+// RUN: -shared-libs=%mlir_runner_utils_dir/libmlir_runner_utils%shlibext \
+// RUN: -shared-libs=%mlir_runner_utils_dir/libmlir_c_runner_utils%shlibext \
+// RUN: | FileCheck %s
+
+module {
+ func.func private @printMemrefF32(memref<*xf32>)
+  func.func @alloc_2d_filled_f32(%arg0: index, %arg1: index, %arg2: index, %arg3: index, %arg4: f32) -> memref<?x?x?x?xf32> {
+    %c0 = arith.constant 0 : index
+    %c1 = arith.constant 1 : index
+    %0 = memref.alloc(%arg0, %arg1, %arg2, %arg3) : memref<?x?x?x?xf32>
+    scf.for %arg5 = %c0 to %arg0 step %c1 {
+      scf.for %arg6 = %c0 to %arg1 step %c1 {
+        scf.for %arg7 = %c0 to %arg2 step %c1 {
+          scf.for %arg8 = %c0 to %arg3 step %c1 {
+            %iarg8=arith.index_cast %arg8 : index to i32
+            %loopf= arith.sitofp %iarg8 : i32 to f32
+            memref.store %loopf, %0[%arg5, %arg6, %arg7, %arg8] : memref<?x?x?x?xf32>
+          }
+        }
+      }
+    }
+    return %0 : memref<?x?x?x?xf32>
+  }
+  func.func @conv_2d_nhwc_fhwc(%arg0: memref<?x?x?x?xf32>, %arg1: memref<?x?x?x?xf32>, %arg2: memref<?x?x?x?xf32>) {
+    linalg.conv_2d_nhwc_fhwc ins(%arg0, %arg1 : memref<?x?x?x?xf32>, memref<?x?x?x?xf32>) outs(%arg2 : memref<?x?x?x?xf32>)
+    return
+  }
+ func.func @main() {
+ // Intput(image, filter) and output value.
+ %cst = arith.constant 0.500000e+00 : f32
+ %cst_0 = arith.constant 0.000000e+00 : f32
+
+ %current_image_n = arith.constant 1 : index
+ %current_image_c = arith.constant 2 : index
+ %current_image_h = arith.constant 4 : index
+ %current_image_w = arith.constant 4 : index
+
+ %current_filter_f = arith.constant 2 : index
+ %current_filter_c = arith.constant 2 : index
+ %current_filter_h = arith.constant 2 : index
+ %current_filter_w = arith.constant 2 : index
+
+ %current_output_n = arith.constant 1 : index
+ %current_output_c = arith.constant 2 : index
+ %current_output_h = arith.constant 3 : index
+ %current_output_w = arith.constant 3 : index
+
+ // Image.
+    %image = call @alloc_2d_filled_f32(%current_image_n,%current_image_h, %current_image_w, %current_image_c, %cst) : (index, index, index, index, f32) -> memref<?x?x?x?xf32>
+    // Filter.
+    %filter = call @alloc_2d_filled_f32(%current_filter_f, %current_filter_h, %current_filter_w,%current_filter_c, %cst) : (index, index, index, index, f32) -> memref<?x?x?x?xf32>
+    // Output.
+    %output = call @alloc_2d_filled_f32(%current_output_n, %current_output_h, %current_output_w,%current_output_c, %cst_0) : (index, index, index, index, f32) -> memref<?x?x?x?xf32>
+
+    call @conv_2d_nhwc_fhwc(%image, %filter, %output) : (memref<?x?x?x?xf32>, memref<?x?x?x?xf32>, memref<?x?x?x?xf32>) -> ()
+
+    %3 = memref.cast %output : memref<?x?x?x?xf32> to memref<*xf32>
+    call @printMemrefF32(%3) : (memref<*xf32>) -> ()
+
+
+    memref.dealloc %output : memref<?x?x?x?xf32>
+    memref.dealloc %image : memref<?x?x?x?xf32>
+    memref.dealloc %filter : memref<?x?x?x?xf32>
+ return
+ }
+}
+
+// CHECK: Unranked Memref base@ = {{.*}} rank = 4 offset = 0 sizes = [1, 3, 3, 2] strides = [18, 6, 2, 1] data =
+// CHECK{LITERAL}: [[[[4, 5],
+// CHECK{LITERAL}: [4, 5],
+// CHECK{LITERAL}: [4, 5]],
+// CHECK{LITERAL}: [[4, 5],
+// CHECK{LITERAL}: [4, 5],
+// CHECK{LITERAL}: [4, 5]],
+// CHECK{LITERAL}: [[4, 5],
+// CHECK{LITERAL}: [4, 5],
+// CHECK{LITERAL}: [4, 5]]]]
diff --git a/examples/MLIRLinalg/linalg-depthwise_conv_2d_nhwc_hwc.mlir b/examples/MLIRLinalg/linalg-depthwise_conv_2d_nhwc_hwc.mlir
new file mode 100644
index 0000000000..905df48bd8
--- /dev/null
+++ b/examples/MLIRLinalg/linalg-depthwise_conv_2d_nhwc_hwc.mlir
@@ -0,0 +1,82 @@
+// RUN: buddy-opt %s \
+// RUN: -depthwise-conv-nhwc-hwc-optimize -convert-linalg-to-loops -lower-affine -convert-scf-to-cf \
+// RUN: -convert-vector-to-llvm -finalize-memref-to-llvm -convert-arith-to-llvm \
+// RUN: -convert-func-to-llvm -reconcile-unrealized-casts \
+// RUN: | mlir-cpu-runner -e main -entry-point-result=void \
+// RUN: -shared-libs=%mlir_runner_utils_dir/libmlir_runner_utils%shlibext \
+// RUN: -shared-libs=%mlir_runner_utils_dir/libmlir_c_runner_utils%shlibext \
+// RUN: | FileCheck %s
+
+module {
+ func.func private @printMemrefF32(memref<*xf32>)
+
+  func.func @depthwise_conv_2d_nhwc_hwc(%arg0: memref<?x?x?x?xf32>, %arg1: memref<?x?x?xf32>, %arg2: memref<?x?x?x?xf32>) {
+    linalg.depthwise_conv_2d_nhwc_hwc
+      {dilations = dense<[1,1]> : tensor<2xi64>, strides = dense<[1,1]> : tensor<2xi64>}
+      ins(%arg0, %arg1 : memref<?x?x?x?xf32>, memref<?x?x?xf32>)
+      outs(%arg2 : memref<?x?x?x?xf32>)
+    return
+  }
+
+ func.func @main() {
+ // Constants for input image, filter, and output sizes.
+ %cst = arith.constant 0.500000e+00 : f32
+ %cst_0 = arith.constant 0.000000e+00 : f32
+ %cf1 = arith.constant 1.0 : f32
+
+ %image_n = arith.constant 1 : index
+ %image_h = arith.constant 4 : index
+ %image_w = arith.constant 4 : index
+ %image_c = arith.constant 2 : index
+
+ %filter_h = arith.constant 1 : index
+ %filter_w = arith.constant 2 : index
+ %filter_c = arith.constant 2 : index
+
+ %output_n = arith.constant 1 : index
+ %output_h = arith.constant 3 : index
+ %output_w = arith.constant 3 : index
+ %output_c = arith.constant 2 : index
+
+    %image = memref.alloc(%image_n,%image_h,%image_w,%image_c) : memref<?x?x?x?xf32>
+    %filter = memref.alloc(%filter_h,%filter_w,%filter_c) : memref<?x?x?xf32>
+    %output = memref.alloc(%output_n,%output_h,%output_w,%output_c) : memref<?x?x?x?xf32>
+
+    // Allocate and fill image, filter, and output.
+    linalg.fill
+    ins(%cf1 : f32)
+    outs(%image:memref<?x?x?x?xf32>)
+
+    linalg.fill
+    ins(%cf1 : f32)
+    outs(%filter:memref<?x?x?xf32>)
+    linalg.fill
+    ins(%cf1 : f32)
+    outs(%output:memref<?x?x?x?xf32>)
+
+    // Call depthwise convolution.
+    call @depthwise_conv_2d_nhwc_hwc(%image, %filter, %output) : (memref<?x?x?x?xf32>, memref<?x?x?xf32>, memref<?x?x?x?xf32>) -> ()
+
+    %output_cast = memref.cast %output : memref<?x?x?x?xf32> to memref<*xf32>
+
+    // Print the output.
+    call @printMemrefF32(%output_cast) : (memref<*xf32>) -> ()
+
+    // Deallocate memory.
+    memref.dealloc %output : memref<?x?x?x?xf32>
+    memref.dealloc %image : memref<?x?x?x?xf32>
+    memref.dealloc %filter : memref<?x?x?xf32>
+ return
+ }
+}
+
+// CHECK: Unranked Memref base@ = {{.*}} rank = 4 offset = 0 sizes = [1, 3, 3, 2] strides = [18, 6, 2, 1] data =
+// CHECK{LITERAL}: [[[[3, 3],
+// CHECK{LITERAL}: [3, 3],
+// CHECK{LITERAL}: [3, 3]],
+// CHECK{LITERAL}: [[3, 3],
+// CHECK{LITERAL}: [3, 3],
+// CHECK{LITERAL}: [3, 3]],
+// CHECK{LITERAL}: [[3, 3],
+// CHECK{LITERAL}: [3, 3],
+// CHECK{LITERAL}: [3, 3]]]]
diff --git a/examples/MLIRLinalg/linalg-matmul-opt-f32.mlir b/examples/MLIRLinalg/linalg-matmul-opt-f32.mlir
index 5111b57dbe..53148b0d0a 100644
--- a/examples/MLIRLinalg/linalg-matmul-opt-f32.mlir
+++ b/examples/MLIRLinalg/linalg-matmul-opt-f32.mlir
@@ -1,4 +1,4 @@
-// RUN: buddy-opt -matmul-paralell-vectorization-optimize -verify-diagnostics -expand-strided-metadata -lower-affine \
+// RUN: buddy-opt -matmul-parallel-vectorization-optimize -verify-diagnostics -expand-strided-metadata -lower-affine \
// RUN: -convert-linalg-to-loops -convert-vector-to-scf -convert-scf-to-cf -convert-vector-to-llvm -finalize-memref-to-llvm \
// RUN: -llvm-request-c-wrappers -convert-func-to-llvm -reconcile-unrealized-casts %s \
// RUN: | mlir-cpu-runner -O0 -e buddy_matmul_f32 -entry-point-result=void \
diff --git a/examples/MLIRLinalg/linalg-matmul-opt-i8.mlir b/examples/MLIRLinalg/linalg-matmul-opt-i8.mlir
index 9a7b72e5e0..26aa92cbe5 100644
--- a/examples/MLIRLinalg/linalg-matmul-opt-i8.mlir
+++ b/examples/MLIRLinalg/linalg-matmul-opt-i8.mlir
@@ -1,4 +1,4 @@
-// RUN: buddy-opt -matmul-paralell-vectorization-optimize -verify-diagnostics -expand-strided-metadata \
+// RUN: buddy-opt -matmul-parallel-vectorization-optimize -verify-diagnostics -expand-strided-metadata \
// RUN: -lower-affine -convert-vector-to-llvm -finalize-memref-to-llvm -convert-scf-to-cf \
// RUN: -convert-linalg-to-loops -convert-scf-to-cf -llvm-request-c-wrappers -convert-func-to-llvm \
// RUN: -reconcile-unrealized-casts %s \
diff --git a/examples/MLIRLinalg/makefile b/examples/MLIRLinalg/makefile
index f214fa7f67..d9a37926f4 100644
--- a/examples/MLIRLinalg/makefile
+++ b/examples/MLIRLinalg/makefile
@@ -60,6 +60,46 @@ linalg-conv2d-tiling-run:
-convert-func-to-llvm -reconcile-unrealized-casts | \
${MLIR_CPU_RUNNER} ${OPT_FLAG} -e main -entry-point-result=void -shared-libs=${MLIR_RUNNER_UTILS} -shared-libs=${MLIR_C_RUNNER_UTILS}
+linalg-conv2d_nhwc_fhwc-optimize-lower:
+ @${BUDDY_OPT} linalg-conv2d_nhwc_fhwc.mlir \
+ -conv-nhwc-fhwc-optimize="vec-size=16" \
+ -o ./log.mlir
+
+linalg-conv2d_nhwc_fhwc-optimize-run:
+ @${BUDDY_OPT} linalg-conv2d_nhwc_fhwc.mlir ${MLIR_OPT_OPTIONS} \
+ -conv-nhwc-fhwc-optimize="vec-size=16" \
+ -lower-affine -convert-scf-to-cf \
+ -convert-vector-to-llvm -finalize-memref-to-llvm -convert-arith-to-llvm \
+ -convert-func-to-llvm -reconcile-unrealized-casts | \
+ ${MLIR_CPU_RUNNER} ${OPT_FLAG} -e main -entry-point-result=void -shared-libs=${MLIR_RUNNER_UTILS} -shared-libs=${MLIR_C_RUNNER_UTILS}
+
+
+linalg-conv2d_nhwc_fhwc-tile-optimize-lower:
+ @${BUDDY_OPT} linalg-conv2d_nhwc_fhwc.mlir \
+ -conv-nhwc-fhwc-tile-optimize="vec-size=16 tiling-height=2 tiling-width=3" \
+ -o ./log.mlir
+
+linalg-conv2d_nhwc_fhwc-tile-optimize-run:
+ @${BUDDY_OPT} linalg-conv2d_nhwc_fhwc.mlir ${MLIR_OPT_OPTIONS} \
+ -conv-nhwc-fhwc-tile-optimize="vec-size=16 tiling-height=2 tiling-width=3" \
+ -lower-affine -convert-scf-to-cf \
+ -convert-vector-to-llvm -finalize-memref-to-llvm -convert-arith-to-llvm \
+ -convert-func-to-llvm -reconcile-unrealized-casts | \
+ ${MLIR_CPU_RUNNER} ${OPT_FLAG} -e main -entry-point-result=void -shared-libs=${MLIR_RUNNER_UTILS} -shared-libs=${MLIR_C_RUNNER_UTILS}
+
+linalg-depthwise_conv_2d_nhwc_hwc-optimize-lower:
+ @${BUDDY_OPT} linalg-depthwise_conv_2d_nhwc_hwc.mlir \
+ -depthwise-conv-nhwc-hwc-optimize="vec-size=16" \
+ -o ./log.mlir
+
+linalg-depthwise_conv_2d_nhwc_hwc-optimize-run:
+ @${BUDDY_OPT} linalg-depthwise_conv_2d_nhwc_hwc.mlir \
+ -depthwise-conv-nhwc-hwc-optimize="vec-size=16" \
+ -convert-linalg-to-loops -lower-affine -convert-scf-to-cf \
+ -convert-vector-to-llvm -finalize-memref-to-llvm -convert-arith-to-llvm \
+ -convert-func-to-llvm -reconcile-unrealized-casts | \
+ ${MLIR_CPU_RUNNER} ${OPT_FLAG} -e main -entry-point-result=void -shared-libs=${MLIR_RUNNER_UTILS} -shared-libs=${MLIR_C_RUNNER_UTILS}
+
linalg-generic-lower:
@${MLIR_OPT} ./linalg-generic.mlir \
-convert-linalg-to-loops -lower-affine -convert-scf-to-cf \
@@ -177,6 +217,46 @@ linalg-batch-matmul-optimize-lower:
-batchmatmul-optimize="vector-size=64" \
-o ./log.mlir
+linalg-batch-matmul-tile-optimize-lower:
+ @${BUDDY_OPT} linalg-batch-matmul-dync.mlir ${MLIR_OPT_OPTIONS} \
+ -batchmatmul-tile-optimize="vec-size=64 kernel-m=4 kernel-n=2" \
+ -o ./log.mlir
+
+linalg-batch-matmul-tile-optimize-run:
+ @${BUDDY_OPT} linalg-batch-matmul-dync.mlir ${MLIR_OPT_OPTIONS} \
+ -batchmatmul-tile-optimize="vec-size=64 kernel-m=4 kernel-n=2" \
+ -convert-linalg-to-loops \
+ -expand-strided-metadata \
+ -lower-affine \
+ -convert-scf-to-cf \
+ -convert-vector-to-llvm \
+ -finalize-memref-to-llvm \
+ -convert-arith-to-llvm \
+ -convert-func-to-llvm \
+ -reconcile-unrealized-casts | \
+ ${MLIR_CPU_RUNNER} ${OPT_FLAG} -e main -entry-point-result=void \
+ -shared-libs=${MLIR_RUNNER_UTILS} -shared-libs=${MLIR_C_RUNNER_UTILS}
+
+linalg-batch-matmul-scf-optimize-lower:
+ @${BUDDY_OPT} linalg-batch-matmul-dync.mlir ${MLIR_OPT_OPTIONS} \
+ -batchmatmul-scf-optimize="vector-size=64" \
+ -o ./log.mlir
+
+linalg-batch-matmul-scf-optimize-run:
+ @${BUDDY_OPT} linalg-batch-matmul-dync.mlir ${MLIR_OPT_OPTIONS} \
+ -batchmatmul-scf-optimize="vector-size=64" \
+ -convert-linalg-to-loops \
+ -expand-strided-metadata \
+ -lower-affine \
+ -convert-scf-to-cf \
+ -convert-vector-to-llvm \
+ -finalize-memref-to-llvm \
+ -convert-arith-to-llvm \
+ -convert-func-to-llvm \
+ -reconcile-unrealized-casts | \
+ ${MLIR_CPU_RUNNER} ${OPT_FLAG} -e main -entry-point-result=void \
+ -shared-libs=${MLIR_RUNNER_UTILS} -shared-libs=${MLIR_C_RUNNER_UTILS}
+
linalg-batch-matmul-optimize-translate:
@${BUDDY_OPT} linalg-batch-matmul-f32.mlir ${MLIR_OPT_OPTIONS} \
-batchmatmul-optimize="vector-size=64" \
@@ -248,7 +328,7 @@ linalg-batch-matmul-i8-optimize-translate:
linalg-matmul-parallized-vectorized-optmize-run:
@${BUDDY_OPT} linalg-matmul-opt-f32.mlir ${MLIR_OPT_OPTIONS} \
- -matmul-paralell-vectorization-optimize="vector-size=128" \
+ -matmul-parallel-vectorization-optimize="vector-size=128" \
-convert-linalg-to-loops \
-expand-strided-metadata \
-lower-affine \
@@ -263,12 +343,12 @@ linalg-matmul-parallized-vectorized-optmize-run:
linalg-matmul-parallized-vectorized-optmize-lower:
@${BUDDY_OPT} linalg-matmul-opt-f32.mlir ${MLIR_OPT_OPTIONS} \
- -matmul-paralell-vectorization-optimize="vector-size=128" \
+ -matmul-parallel-vectorization-optimize="vector-size=128" \
-o ./log.mlir
linalg-matmul-parallized-vectorized-optmize-translate:
@${BUDDY_OPT} linalg-matmul-opt-f32.mlir ${MLIR_OPT_OPTIONS} \
- -matmul-paralell-vectorization-optimize="vector-size=128" \
+ -matmul-parallel-vectorization-optimize="vector-size=128" \
-convert-linalg-to-loops \
-expand-strided-metadata \
-lower-affine \
@@ -282,7 +362,7 @@ linalg-matmul-parallized-vectorized-optmize-translate:
linalg-matmul-i8-parallized-vectorized-optmize-run:
@${BUDDY_OPT} linalg-matmul-opt-i8.mlir ${MLIR_OPT_OPTIONS} \
- -matmul-paralell-vectorization-optimize="vector-size=128" \
+ -matmul-parallel-vectorization-optimize="vector-size=128" \
-convert-linalg-to-loops \
-expand-strided-metadata \
-lower-affine \
@@ -297,12 +377,12 @@ linalg-matmul-i8-parallized-vectorized-optmize-run:
linalg-matmul-i8-parallized-vectorized-optmize-lower:
@${BUDDY_OPT} linalg-matmul-opt-i8.mlir ${MLIR_OPT_OPTIONS} \
- -matmul-paralell-vectorization-optimize="vector-size=128" \
+ -matmul-parallel-vectorization-optimize="vector-size=128" \
-o ./log.mlir
linalg-matmul-i8-parallized-vectorized-optmize-translate:
@${BUDDY_OPT} linalg-matmul-opt-i8.mlir ${MLIR_OPT_OPTIONS} \
- -matmul-paralell-vectorization-optimize="vector-size=128" \
+ -matmul-parallel-vectorization-optimize="vector-size=128" \
-convert-linalg-to-loops \
-expand-strided-metadata \
-lower-affine \
diff --git a/examples/MLIRVector/makefile b/examples/MLIRVector/makefile
index 681335c7fd..ccc9e9af24 100644
--- a/examples/MLIRVector/makefile
+++ b/examples/MLIRVector/makefile
@@ -43,17 +43,20 @@ vector-load-run:
vector-broadcast-lower:
@${MLIR_OPT} ./vector-broadcast.mlir \
+ -convert-vector-to-scf -convert-scf-to-cf \
--convert-vector-to-llvm --finalize-memref-to-llvm --convert-func-to-llvm \
--reconcile-unrealized-casts -o ./log.mlir
vector-broadcast-translate:
@${MLIR_OPT} ./vector-broadcast.mlir \
+ -convert-vector-to-scf -convert-scf-to-cf \
--convert-vector-to-llvm --finalize-memref-to-llvm --convert-func-to-llvm \
--reconcile-unrealized-casts | \
${MLIR_TRANSLATE} --mlir-to-llvmir -o log.ll
vector-broadcast-asm-x86:
@${MLIR_OPT} ./vector-broadcast.mlir \
+ -convert-vector-to-scf -convert-scf-to-cf \
--convert-vector-to-llvm --finalize-memref-to-llvm --convert-func-to-llvm \
--reconcile-unrealized-casts | \
${MLIR_TRANSLATE} --mlir-to-llvmir | \
@@ -62,6 +65,7 @@ vector-broadcast-asm-x86:
vector-broadcast-asm-rv:
@${MLIR_OPT} ./vector-broadcast.mlir \
+ -convert-vector-to-scf -convert-scf-to-cf \
--convert-vector-to-llvm --finalize-memref-to-llvm --convert-func-to-llvm \
--reconcile-unrealized-casts | \
${MLIR_TRANSLATE} --mlir-to-llvmir | \
@@ -72,6 +76,7 @@ vector-broadcast-asm-rv:
run-targets += vector-broadcast-run
vector-broadcast-run:
@${MLIR_OPT} ./vector-broadcast.mlir \
+ -convert-vector-to-scf -convert-scf-to-cf \
--convert-vector-to-llvm --finalize-memref-to-llvm --convert-func-to-llvm \
--reconcile-unrealized-casts | \
${MLIR_CPU_RUNNER} ${OPT_FLAG} -e main -entry-point-result=i32 \
@@ -79,17 +84,20 @@ vector-broadcast-run:
vector-fma-lower:
@${MLIR_OPT} ./vector-fma.mlir \
+ -convert-vector-to-scf -convert-scf-to-cf \
--convert-vector-to-llvm --finalize-memref-to-llvm --convert-func-to-llvm \
--reconcile-unrealized-casts -o ./log.mlir
vector-fma-translate:
@${MLIR_OPT} ./vector-fma.mlir \
+ -convert-vector-to-scf -convert-scf-to-cf \
--convert-vector-to-llvm --finalize-memref-to-llvm --convert-func-to-llvm \
--reconcile-unrealized-casts | \
${MLIR_TRANSLATE} --mlir-to-llvmir -o log.ll
vector-fma-asm-x86:
@${MLIR_OPT} ./vector-fma.mlir \
+ -convert-vector-to-scf -convert-scf-to-cf \
--convert-vector-to-llvm --finalize-memref-to-llvm --convert-func-to-llvm \
--reconcile-unrealized-casts | \
${MLIR_TRANSLATE} --mlir-to-llvmir | \
@@ -98,6 +106,7 @@ vector-fma-asm-x86:
vector-fma-asm-rv:
@${MLIR_OPT} ./vector-fma.mlir \
+ -convert-vector-to-scf -convert-scf-to-cf \
--convert-vector-to-llvm --finalize-memref-to-llvm --convert-func-to-llvm \
--reconcile-unrealized-casts | \
${MLIR_TRANSLATE} --mlir-to-llvmir | \
@@ -108,6 +117,7 @@ vector-fma-asm-rv:
run-targets += vector-fma-run
vector-fma-run:
@${MLIR_OPT} ./vector-fma.mlir \
+ -convert-vector-to-scf -convert-scf-to-cf \
--convert-vector-to-llvm --finalize-memref-to-llvm --convert-func-to-llvm \
--reconcile-unrealized-casts | \
${MLIR_CPU_RUNNER} ${OPT_FLAG} -e main -entry-point-result=i32 \
@@ -115,17 +125,20 @@ vector-fma-run:
vector-long-lower:
@${MLIR_OPT} ./vector-long.mlir \
+ -convert-vector-to-scf -convert-scf-to-cf \
--convert-vector-to-llvm --finalize-memref-to-llvm --convert-func-to-llvm \
--reconcile-unrealized-casts -o ./log.mlir
vector-long-translate:
@${MLIR_OPT} ./vector-long.mlir \
+ -convert-vector-to-scf -convert-scf-to-cf \
--convert-vector-to-llvm --finalize-memref-to-llvm --convert-func-to-llvm \
--reconcile-unrealized-casts | \
${MLIR_TRANSLATE} --mlir-to-llvmir -o log.ll
vector-long-asm-x86:
@${MLIR_OPT} ./vector-long.mlir \
+ -convert-vector-to-scf -convert-scf-to-cf \
--convert-vector-to-llvm --finalize-memref-to-llvm --convert-func-to-llvm \
--reconcile-unrealized-casts | \
${MLIR_TRANSLATE} --mlir-to-llvmir | \
@@ -134,6 +147,7 @@ vector-long-asm-x86:
vector-long-asm-rv:
@${MLIR_OPT} ./vector-long.mlir \
+ -convert-vector-to-scf -convert-scf-to-cf \
--convert-vector-to-llvm --finalize-memref-to-llvm --convert-func-to-llvm \
--reconcile-unrealized-casts | \
${MLIR_TRANSLATE} --mlir-to-llvmir | \
@@ -144,6 +158,7 @@ vector-long-asm-rv:
run-targets += vector-long-run
vector-long-run:
@${MLIR_OPT} ./vector-long.mlir \
+ -convert-vector-to-scf -convert-scf-to-cf \
--convert-vector-to-llvm --finalize-memref-to-llvm --convert-func-to-llvm \
--reconcile-unrealized-casts | \
${MLIR_CPU_RUNNER} ${OPT_FLAG} -e main -entry-point-result=i32 \
@@ -187,6 +202,7 @@ vector-shape-cast-translate:
run-targets += vector-shape-cast-run
vector-shape-cast-run:
@${MLIR_OPT} ./vector-shape-cast.mlir \
+ -convert-vector-to-scf -convert-scf-to-cf \
--convert-vector-to-llvm --finalize-memref-to-llvm --convert-func-to-llvm \
-split-input-file -verify-diagnostics \
--reconcile-unrealized-casts | \
@@ -209,6 +225,7 @@ vector-type-cast-translate:
run-targets += vector-type-cast-run
vector-type-cast-run:
@${MLIR_OPT} ./vector-type-cast.mlir \
+ -convert-vector-to-scf -convert-scf-to-cf \
--convert-vector-to-llvm --finalize-memref-to-llvm --convert-func-to-llvm \
-split-input-file -verify-diagnostics \
--reconcile-unrealized-casts | \
@@ -253,6 +270,7 @@ vector-shuffle-translate:
run-targets += vector-shuffle-run
vector-shuffle-run:
@${MLIR_OPT} ./vector-shuffle.mlir \
+ -convert-vector-to-scf -convert-scf-to-cf \
--convert-vector-to-llvm --finalize-memref-to-llvm --convert-func-to-llvm \
-split-input-file -verify-diagnostics \
--reconcile-unrealized-casts | \
@@ -275,6 +293,7 @@ vector-splat-translate:
run-targets += vector-splat-run
vector-splat-run:
@${MLIR_OPT} ./vector-splat.mlir \
+ -convert-vector-to-scf -convert-scf-to-cf \
--convert-vector-to-llvm --finalize-memref-to-llvm --convert-func-to-llvm \
-split-input-file -verify-diagnostics \
--reconcile-unrealized-casts | \
@@ -297,6 +316,7 @@ vector-insert-translate:
run-targets += vector-insert-run
vector-insert-run:
@${MLIR_OPT} ./vector-insert.mlir \
+ -convert-vector-to-scf -convert-scf-to-cf \
--convert-vector-to-llvm --finalize-memref-to-llvm --convert-func-to-llvm \
-split-input-file -verify-diagnostics \
--reconcile-unrealized-casts | \
@@ -319,6 +339,7 @@ vector-reduction-translate:
run-targets += vector-reduction-run
vector-reduction-run:
@${MLIR_OPT} ./vector-reduction.mlir \
+ -convert-vector-to-scf -convert-scf-to-cf \
--convert-vector-to-llvm --finalize-memref-to-llvm --convert-func-to-llvm \
-split-input-file -verify-diagnostics \
--reconcile-unrealized-casts | \
@@ -341,6 +362,7 @@ vector-outerproduct-translate:
run-targets += vector-outerproduct-run
vector-outerproduct-run:
@${MLIR_OPT} ./vector-outerproduct.mlir \
+ -convert-vector-to-scf -convert-scf-to-cf \
--convert-vector-to-llvm --finalize-memref-to-llvm --convert-func-to-llvm \
-split-input-file -verify-diagnostics \
--reconcile-unrealized-casts | \
@@ -363,6 +385,7 @@ vector-create-mask-translate:
run-targets += vector-create-mask-run
vector-create-mask-run:
@${MLIR_OPT} ./vector-create-mask.mlir \
+ -convert-vector-to-scf -convert-scf-to-cf \
--convert-vector-to-llvm --finalize-memref-to-llvm --convert-func-to-llvm \
--reconcile-unrealized-casts | \
${MLIR_CPU_RUNNER} ${OPT_FLAG} -e main -entry-point-result=i32 \
@@ -384,6 +407,7 @@ vector-extract-translate:
run-targets += vector-extract-run
vector-extract-run:
@${MLIR_OPT} ./vector-extract.mlir \
+ -convert-vector-to-scf -convert-scf-to-cf \
--convert-vector-to-llvm --finalize-memref-to-llvm --convert-func-to-llvm \
--reconcile-unrealized-casts | \
${MLIR_CPU_RUNNER} ${OPT_FLAG} -e main -entry-point-result=i32 \
@@ -405,6 +429,7 @@ vector-maskedload-translate:
run-targets += vector-maskedload-run
vector-maskedload-run:
@${MLIR_OPT} ./vector-maskedload.mlir \
+ -convert-vector-to-scf -convert-scf-to-cf \
--convert-vector-to-llvm --finalize-memref-to-llvm --convert-func-to-llvm \
-split-input-file -verify-diagnostics \
--reconcile-unrealized-casts | \
@@ -427,6 +452,7 @@ vector-maskedstore-translate:
run-targets += vector-maskedstore-run
vector-maskedstore-run:
@${MLIR_OPT} ./vector-maskedstore.mlir \
+ -convert-vector-to-scf -convert-scf-to-cf \
--convert-vector-to-llvm --finalize-memref-to-llvm --convert-func-to-llvm \
-split-input-file -verify-diagnostics \
--reconcile-unrealized-casts | \
@@ -449,6 +475,7 @@ vector-extract-strided-slice-translate:
run-targets += vector-extract-strided-slice-run
vector-extract-strided-slice-run:
@${MLIR_OPT} ./vector-extract-strided-slice.mlir \
+ -convert-vector-to-scf -convert-scf-to-cf \
--convert-vector-to-llvm --finalize-memref-to-llvm --convert-func-to-llvm \
--reconcile-unrealized-casts | \
${MLIR_CPU_RUNNER} ${OPT_FLAG} -e main -entry-point-result=i32 \
@@ -470,6 +497,7 @@ vector-constant-mask-translate:
run-targets += vector-constant-mask-run
vector-constant-mask-run:
@${MLIR_OPT} ./vector-constant-mask.mlir \
+ -convert-vector-to-scf -convert-scf-to-cf \
--convert-vector-to-llvm --finalize-memref-to-llvm --convert-func-to-llvm \
--reconcile-unrealized-casts | \
${MLIR_CPU_RUNNER} ${OPT_FLAG} -e main -entry-point-result=i32 \
@@ -491,6 +519,7 @@ vector-expandload-translate:
run-targets += vector-expandload-run
vector-expandload-run:
@${MLIR_OPT} ./vector-expandload.mlir \
+ -convert-vector-to-scf -convert-scf-to-cf \
--convert-vector-to-llvm --finalize-memref-to-llvm --convert-func-to-llvm \
--reconcile-unrealized-casts | \
${MLIR_CPU_RUNNER} ${OPT_FLAG} -e main -entry-point-result=i32 \
@@ -512,6 +541,7 @@ vector-compressstore-translate:
run-targets += vector-compressstore-run
vector-compressstore-run:
@${MLIR_OPT} ./vector-compressstore.mlir \
+ -convert-vector-to-scf -convert-scf-to-cf \
--convert-vector-to-llvm --finalize-memref-to-llvm --convert-func-to-llvm \
--reconcile-unrealized-casts | \
${MLIR_CPU_RUNNER} ${OPT_FLAG} -e main -entry-point-result=i32 \
@@ -533,6 +563,7 @@ vector-insert-strided-slice-translate:
run-targets += vector-insert-strided-slice-run
vector-insert-strided-slice-run:
@${MLIR_OPT} ./vector-insert-strided-slice.mlir \
+ -convert-vector-to-scf -convert-scf-to-cf \
--convert-vector-to-llvm --finalize-memref-to-llvm --convert-func-to-llvm \
--reconcile-unrealized-casts | \
${MLIR_CPU_RUNNER} ${OPT_FLAG} -e main -entry-point-result=i32 \
@@ -554,6 +585,7 @@ vector-scatter-translate:
run-targets += vector-scatter-run
vector-scatter-run:
@${MLIR_OPT} ./vector-scatter.mlir \
+ -convert-vector-to-scf -convert-scf-to-cf \
--convert-vector-to-llvm --finalize-memref-to-llvm --convert-func-to-llvm \
-split-input-file -verify-diagnostics \
--reconcile-unrealized-casts | \
@@ -576,6 +608,7 @@ vector-gather-translate:
run-targets += vector-gather-run
vector-gather-run:
@${MLIR_OPT} ./vector-gather.mlir \
+ -convert-vector-to-scf -convert-scf-to-cf \
--convert-vector-to-llvm --finalize-memref-to-llvm --convert-func-to-llvm \
-split-input-file -verify-diagnostics \
--reconcile-unrealized-casts | \
@@ -598,7 +631,7 @@ vector-transfer-read-translate:
run-targets += vector-transfer-read-run
vector-transfer-read-run:
@${MLIR_OPT} ./vector-transfer-read.mlir \
- --convert-vector-to-scf --lower-affine --convert-scf-to-cf \
+ --convert-vector-to-scf --lower-affine --convert-scf-to-cf \
--convert-vector-to-llvm --finalize-memref-to-llvm --convert-func-to-llvm \
--reconcile-unrealized-casts | \
${MLIR_CPU_RUNNER} ${OPT_FLAG} -e main -entry-point-result=i32 \
@@ -669,3 +702,27 @@ vector-store-run:
--reconcile-unrealized-casts | \
${MLIR_CPU_RUNNER} ${OPT_FLAG} -e main -entry-point-result=i32 \
-shared-libs=${MLIR_RUNNER_UTILS} -shared-libs=${MLIR_C_RUNNER_UTILS}
+
+vector-iteration-lower:
+ @${MLIR_OPT} ./vector-iteration.mlir \
+ --lower-affine \
+ -convert-vector-to-scf -convert-scf-to-cf \
+ --convert-vector-to-llvm --finalize-memref-to-llvm --convert-func-to-llvm \
+ --reconcile-unrealized-casts -o ./log.mlir
+
+vector-iteration-translate:
+ @${MLIR_OPT} ./vector-iteration.mlir \
+ --lower-affine \
+ -convert-vector-to-scf -convert-scf-to-cf \
+ --convert-vector-to-llvm --finalize-memref-to-llvm --convert-func-to-llvm \
+ --reconcile-unrealized-casts | \
+ ${MLIR_TRANSLATE} --mlir-to-llvmir -o log.ll
+
+vector-iteration-run:
+ @${MLIR_OPT} ./vector-iteration.mlir \
+ --lower-affine \
+ -convert-vector-to-scf -convert-scf-to-cf \
+ --convert-vector-to-llvm --finalize-memref-to-llvm --convert-func-to-llvm \
+ --reconcile-unrealized-casts | \
+ ${MLIR_CPU_RUNNER} ${OPT_FLAG} -e main -entry-point-result=i32 \
+ -shared-libs=${MLIR_RUNNER_UTILS} -shared-libs=${MLIR_C_RUNNER_UTILS}
diff --git a/examples/MLIRVector/vector-iteration.mlir b/examples/MLIRVector/vector-iteration.mlir
new file mode 100644
index 0000000000..7d63f22896
--- /dev/null
+++ b/examples/MLIRVector/vector-iteration.mlir
@@ -0,0 +1,128 @@
+// RUN: buddy-opt %s \
+// RUN: -lower-affine \
+// RUN: -convert-vector-to-scf -convert-scf-to-cf \
+// RUN: -convert-vector-to-llvm -finalize-memref-to-llvm -convert-func-to-llvm \
+// RUN: -reconcile-unrealized-casts \
+// RUN: | mlir-cpu-runner -e main -entry-point-result=i32 \
+// RUN: -shared-libs=%mlir_runner_utils_dir/libmlir_runner_utils%shlibext \
+// RUN: -shared-libs=%mlir_runner_utils_dir/libmlir_c_runner_utils%shlibext \
+// RUN: | FileCheck %s
+
+memref.global "private" @gv : memref<4x4xf32> = dense<[[0. , 1. , 2. , 3. ],
+ [10., 11., 12., 13.],
+ [20., 21., 22., 23.],
+ [30., 31., 32., 33.]]>
+
+memref.global "private" @gv_pat_1 : memref<10xf32> = dense<[0. , 1. , 2. , 3. , 4. , 5. , 6. , 7. , 8. , 9.]>
+memref.global "private" @gv_pat_2 : memref<10xf32> = dense<[0. , 1. , 2. , 3. , 4. , 5. , 6. , 7. , 8. , 9.]>
+
+func.func private @printMemrefF32(memref<*xf32>)
+
+func.func @main() -> i32 {
+ %mem = memref.get_global @gv : memref<4x4xf32>
+ %c0 = arith.constant 0 : index
+ %c1 = arith.constant 1 : index
+ %c2 = arith.constant 2 : index
+ %sum_0 = arith.constant dense<0.000000e+00> : vector<4xf32>
+ %sum = affine.for %i = 0 to 3 iter_args(%sum_iter = %sum_0) -> (vector<4xf32>) {
+ %load_vec1 = vector.load %mem[%c0, %c0] : memref<4x4xf32>, vector<4xf32>
+ %load_vec2 = vector.load %mem[%i, %c0] : memref<4x4xf32>, vector<4xf32>
+ %sum_next = vector.fma %load_vec1, %load_vec2, %sum_iter : vector<4xf32>
+ affine.yield %sum_next : vector<4xf32>
+ }
+ // CHECK: ( 0, 33, 72, 117 )
+ vector.print %sum : vector<4xf32>
+
+ // ---------------------------------------------------------------------------
+ // Iteration Pattern 1
+ // Main Vector Loop + Scalar Remainder + Fixed Vector Type
+ // ---------------------------------------------------------------------------
+
+ // 1. Get the total length of the workload.
+ %mem_pat_1 = memref.get_global @gv_pat_1 : memref<10xf32>
+ %print_mem_pat_1 = memref.cast %mem_pat_1 : memref<10xf32> to memref<*xf32>
+ %vl_total_pat_1 = memref.dim %mem_pat_1, %c0 : memref<10xf32>
+
+ // 2. Set the iteration step (vector size).
+ %vl_step_pat_1 = arith.constant 4 : index
+
+ // 3. Calculate the upper bound for vectorized processing
+ // - Subtract `vl_step` is to avoid overflow at the vectorization tail.
+ // - Add 1 to ensure the final loop runs when the workload length is divisible
+ // by the vector size.
+ %vl_upbound_pat_1_ = arith.subi %vl_total_pat_1, %vl_step_pat_1 : index
+ %vl_upbound_pat_1 = arith.addi %vl_upbound_pat_1_, %c1 : index
+
+ // 4. Perform the vectorization body.
+ %iter_idx_pat_1 = scf.for %i = %c0 to %vl_upbound_pat_1 step %vl_step_pat_1
+ iter_args(%iter_init = %c0) -> (index) {
+ %load_vec1 = vector.load %mem_pat_1[%i] : memref<10xf32>, vector<4xf32>
+ %load_vec2 = vector.load %mem_pat_1[%i] : memref<10xf32>, vector<4xf32>
+ %res = arith.addf %load_vec1, %load_vec2 : vector<4xf32>
+ vector.store %res, %mem_pat_1[%i] : memref<10xf32>, vector<4xf32>
+ %i_next = arith.addi %i, %vl_step_pat_1 : index
+ scf.yield %i_next : index
+ }
+ // CHECK: [0, 2, 4, 6, 8, 10, 12, 14, 8, 9]
+ call @printMemrefF32(%print_mem_pat_1) : (memref<*xf32>) -> ()
+
+ // 5. Process the remainder of the elements with scalar operations.
+ scf.for %i = %iter_idx_pat_1 to %vl_total_pat_1 step %c1 {
+ %ele1 = memref.load %mem_pat_1[%i] : memref<10xf32>
+ %ele2 = memref.load %mem_pat_1[%i] : memref<10xf32>
+ %res = arith.addf %ele1, %ele2 : f32
+ memref.store %res, %mem_pat_1[%i] : memref<10xf32>
+ }
+ // CHECK: [0, 2, 4, 6, 8, 10, 12, 14, 16, 18]
+ call @printMemrefF32(%print_mem_pat_1) : (memref<*xf32>) -> ()
+
+ // ---------------------------------------------------------------------------
+ // Iteration Pattern 2
+ // Main Vector Loop + Masked Vector Remainder + Fixed Vector Type
+ // ---------------------------------------------------------------------------
+
+ // 1. Get the total length of the workload.
+ %mem_pat_2 = memref.get_global @gv_pat_2 : memref<10xf32>
+ %print_mem_pat_2 = memref.cast %mem_pat_2 : memref<10xf32> to memref<*xf32>
+ %vl_total_pat_2 = memref.dim %mem_pat_2, %c0 : memref<10xf32>
+
+ // 2. Set the iteration step (vector size).
+ %vl_step_pat_2 = arith.constant 4 : index
+
+ // 3. Calculate the upper bound for vectorized processing
+ // - Subtract `vl_step` is to avoid overflow at the vectorization tail.
+ // - Add 1 to ensure the final loop runs when the workload length is divisible
+ // by the vector size.
+ %vl_upbound_pat_2_ = arith.subi %vl_total_pat_2, %vl_step_pat_2 : index
+ %vl_upbound_pat_2 = arith.addi %vl_upbound_pat_2_, %c1 : index
+
+ // 4. Perform the vectorization body.
+ %iter_idx_pat_2 = scf.for %i = %c0 to %vl_upbound_pat_2 step %vl_step_pat_2
+ iter_args(%iter_init = %c0) -> (index) {
+ %load_vec1 = vector.load %mem_pat_2[%i] : memref<10xf32>, vector<4xf32>
+ %load_vec2 = vector.load %mem_pat_2[%i] : memref<10xf32>, vector<4xf32>
+ %res = arith.addf %load_vec1, %load_vec2 : vector<4xf32>
+ vector.store %res, %mem_pat_2[%i] : memref<10xf32>, vector<4xf32>
+    %i_next = arith.addi %i, %vl_step_pat_2 : index
+ scf.yield %i_next : index
+ }
+ // CHECK: [0, 2, 4, 6, 8, 10, 12, 14, 8, 9]
+ call @printMemrefF32(%print_mem_pat_2) : (memref<*xf32>) -> ()
+
+ // 5. Compute the tail size and create mask and pass-through vector for the
+ // remaining elements.
+ %tail_size_pat_2 = arith.subi %vl_total_pat_2, %iter_idx_pat_2 : index
+ %mask_pat_2 = vector.create_mask %tail_size_pat_2 : vector<4xi1>
+ %pass_thr_vec = arith.constant dense<0.> : vector<4xf32>
+
+ // 6. Process the remaining elements using masked vector operations.
+ %ele1 = vector.maskedload %mem_pat_2[%iter_idx_pat_2], %mask_pat_2, %pass_thr_vec : memref<10xf32>, vector<4xi1>, vector<4xf32> into vector<4xf32>
+ %ele2 = vector.maskedload %mem_pat_2[%iter_idx_pat_2], %mask_pat_2, %pass_thr_vec : memref<10xf32>, vector<4xi1>, vector<4xf32> into vector<4xf32>
+ %res = arith.addf %ele1, %ele2 : vector<4xf32>
+ vector.maskedstore %mem_pat_2[%iter_idx_pat_2], %mask_pat_2, %res : memref<10xf32>, vector<4xi1>, vector<4xf32>
+ // CHECK: [0, 2, 4, 6, 8, 10, 12, 14, 16, 18]
+ call @printMemrefF32(%print_mem_pat_2) : (memref<*xf32>) -> ()
+
+ %ret = arith.constant 0 : i32
+ return %ret : i32
+}
diff --git a/examples/RVVExperiment/makefile b/examples/RVVExperiment/makefile
index 5a8a28f38d..6cadb07cdc 100644
--- a/examples/RVVExperiment/makefile
+++ b/examples/RVVExperiment/makefile
@@ -110,19 +110,16 @@ rvv-insert-extract-intrinsics-asm:
-mattr=+m,+d,+v -riscv-v-vector-bits-min=256 \
--filetype=asm -o log.s
-# TODO: Fix me.
rvv-c-setvl-translate:
@${LOCAL_CLANG} -march=rv64gcv --target=riscv64-unknown-linux-gnu \
--sysroot=${RISCV_GNU_TOOLCHAIN_SYSROOT} --gcc-toolchain=${RISCV_GNU_TOOLCHAIN} \
./rvv-c-setvl.c -fPIC -S -emit-llvm -o log.ll
-# TODO: Fix me.
rvv-c-setvl-asm:
@${LOCAL_CLANG} -march=rv64gcv --target=riscv64-unknown-linux-gnu \
--sysroot=${RISCV_GNU_TOOLCHAIN_SYSROOT} --gcc-toolchain=${RISCV_GNU_TOOLCHAIN} \
./rvv-c-setvl.c -fPIC -S -o log.s
-# TODO: Fix me.
run-targets += rvv-c-setvl-run
rvv-c-setvl-run:
@${LOCAL_CLANG} -march=rv64gcv --target=riscv64-unknown-linux-gnu \
diff --git a/examples/RVVExperiment/rvv-c-setvl.c b/examples/RVVExperiment/rvv-c-setvl.c
index c8d1ccfbb1..4a8489d55d 100644
--- a/examples/RVVExperiment/rvv-c-setvl.c
+++ b/examples/RVVExperiment/rvv-c-setvl.c
@@ -3,7 +3,7 @@
int main() {
int avl = 70;
- int vl = vsetvl_e32m2(avl);
+ int vl = __riscv_vsetvl_e32m2(avl);
printf("vl: %d\n", vl);
return 0;
diff --git a/examples/VectorExpDialect/makefile b/examples/VectorExpDialect/makefile
index ab85a8a2cc..fc88556419 100644
--- a/examples/VectorExpDialect/makefile
+++ b/examples/VectorExpDialect/makefile
@@ -319,3 +319,24 @@ vector-exp-dynamic-vector-run:
-L${CROSS_MLIR_LIB} -lmlir_runner_utils -lmlir_c_runner_utils \
-o a.out
@LD_LIBRARY_PATH=${CROSS_MLIR_LIB} ${QEMU} -L ${RISCV_GNU_TOOLCHAIN_SYSROOT} -cpu max a.out
+
+vector-exp-iteration-aot:
+ @${BUDDY_OPT} ./vector-exp-iteration.mlir \
+ -lower-vector-exp \
+ -lower-affine \
+ -convert-vector-to-scf \
+ -convert-scf-to-cf \
+ -convert-vector-to-llvm \
+ -convert-index-to-llvm \
+ -convert-arith-to-llvm \
+ -convert-func-to-llvm \
+ -finalize-memref-to-llvm \
+ -reconcile-unrealized-casts | \
+ ${BUDDY_TRANSLATE} -buddy-to-llvmir -o log.ll
+ ${LOCAL_CLANG} -O3 log.ll \
+ -march=rv64gcv --target=riscv64-unknown-linux-gnu -fPIC \
+ --sysroot=${RISCV_GNU_TOOLCHAIN}/sysroot \
+ --gcc-toolchain=${RISCV_GNU_TOOLCHAIN} \
+ -L${CROSS_MLIR_LIB} -lmlir_runner_utils -lmlir_c_runner_utils \
+ -o a.out
+
diff --git a/examples/VectorExpDialect/vector-exp-iteration.mlir b/examples/VectorExpDialect/vector-exp-iteration.mlir
new file mode 100644
index 0000000000..bc879d0103
--- /dev/null
+++ b/examples/VectorExpDialect/vector-exp-iteration.mlir
@@ -0,0 +1,57 @@
+memref.global "private" @gv : memref<10xf32> = dense<[0. , 1. , 2. , 3. , 4. , 5. , 6. , 7. , 8. , 9.]>
+
+func.func private @printMemrefF32(memref<*xf32>)
+
+func.func @main() -> i32 {
+ %c0 = arith.constant 0 : index
+
+ // ---------------------------------------------------------------------------
+ // Iteration Pattern for RVV Dynamic Vector Length
+ // ---------------------------------------------------------------------------
+
+ // 1. Get the total length of the workload.
+ %mem = memref.get_global @gv : memref<10xf32>
+ %print_mem = memref.cast %mem : memref<10xf32> to memref<*xf32>
+ %vl_total = memref.dim %mem, %c0 : memref<10xf32>
+
+ // 2. Set the scale factor, iteration step, and mask.
+ %vs = vector.vscale
+ %factor = arith.constant 2 : index
+ %vl_step = arith.muli %vs, %factor : index
+ %mask = arith.constant dense<1> : vector<[2]xi1>
+ %vl_total_i32 = index.casts %vl_total : index to i32
+ %vl_step_i32 = index.casts %vl_step : index to i32
+
+ // 3. Perform the vectorization.
+ %iter_vl = scf.for %i = %c0 to %vl_total step %vl_step
+ iter_args(%iter_vl_i32 = %vl_total_i32) -> (i32) {
+
+ %load_vec1 = vector_exp.predication %mask, %iter_vl_i32 : vector<[2]xi1>, i32 {
+ %ele = vector.load %mem[%i] : memref<10xf32>, vector<[2]xf32>
+ vector.yield %ele : vector<[2]xf32>
+ } : vector<[2]xf32>
+
+ %load_vec2 = vector_exp.predication %mask, %iter_vl_i32 : vector<[2]xi1>, i32 {
+ %ele = vector.load %mem[%i] : memref<10xf32>, vector<[2]xf32>
+ vector.yield %ele : vector<[2]xf32>
+ } : vector<[2]xf32>
+
+ %res = "llvm.intr.vp.fadd" (%load_vec1, %load_vec2, %mask, %iter_vl_i32) :
+ (vector<[2]xf32>, vector<[2]xf32>, vector<[2]xi1>, i32) -> vector<[2]xf32>
+
+ vector_exp.predication %mask, %iter_vl_i32 : vector<[2]xi1>, i32 {
+ vector.store %res, %mem[%i] : memref<10xf32>, vector<[2]xf32>
+ vector.yield
+ } : () -> ()
+
+ // Update dynamic vector length.
+ %new_vl = arith.subi %iter_vl_i32, %vl_step_i32 : i32
+ scf.yield %new_vl : i32
+ }
+
+ // CHECK: [0, 2, 4, 6, 8, 10, 12, 14, 8, 9]
+ call @printMemrefF32(%print_mem) : (memref<*xf32>) -> ()
+
+ %ret = arith.constant 0 : i32
+ return %ret : i32
+}
diff --git a/examples/lit.cfg.py b/examples/lit.cfg.py
index a1527a03a3..c1c4c05bd6 100644
--- a/examples/lit.cfg.py
+++ b/examples/lit.cfg.py
@@ -43,6 +43,7 @@
'BuddyBert',
'BuddyMobileNetV3',
'BuddyResNet18',
+ 'BuddyGPU',
'ConvOpt',
'DAPDialect',
'DIPDialect',
diff --git a/flake.lock b/flake.lock
index 7bdd046777..bd79922394 100644
--- a/flake.lock
+++ b/flake.lock
@@ -5,11 +5,11 @@
"systems": "systems"
},
"locked": {
- "lastModified": 1694529238,
- "narHash": "sha256-zsNZZGTGnMOf9YpHKJqMSsa0dXbfmxeoJ7xHlrt+xmY=",
+ "lastModified": 1710146030,
+ "narHash": "sha256-SZ5L6eA7HJ/nmkzGG7/ISclqe6oZdOZTNoesiInkXPQ=",
"owner": "numtide",
"repo": "flake-utils",
- "rev": "ff7b65b44d01cf9ba6a71320833626af21126384",
+ "rev": "b1d9ab70662946ef0850d488da1c9019f3a9752a",
"type": "github"
},
"original": {
@@ -20,11 +20,11 @@
},
"nixpkgs": {
"locked": {
- "lastModified": 1699099776,
- "narHash": "sha256-X09iKJ27mGsGambGfkKzqvw5esP1L/Rf8H3u3fCqIiU=",
+ "lastModified": 1722813957,
+ "narHash": "sha256-IAoYyYnED7P8zrBFMnmp7ydaJfwTnwcnqxUElC1I26Y=",
"owner": "NixOS",
"repo": "nixpkgs",
- "rev": "85f1ba3e51676fa8cc604a3d863d729026a6b8eb",
+ "rev": "cb9a96f23c491c081b38eab96d22fa958043c9fa",
"type": "github"
},
"original": {
diff --git a/flake.nix b/flake.nix
index 8f94e2aec0..c3af6d9d59 100644
--- a/flake.nix
+++ b/flake.nix
@@ -9,36 +9,17 @@
outputs = { self, nixpkgs, flake-utils }@inputs:
let
overlay = import ./nix/overlay.nix;
- pkgsForSys = system: import nixpkgs { overlays = [ overlay ]; inherit system; };
in
flake-utils.lib.eachDefaultSystem
(system:
let
- pkgs = pkgsForSys system;
- mkLLVMShell = pkgs.mkShell.override { stdenv = pkgs.llvmPkgs.stdenv; };
+ pkgs = import nixpkgs { overlays = [ overlay ]; inherit system; };
in
{
# Help other use packages in this flake
legacyPackages = pkgs;
- devShells.default = mkLLVMShell {
- buildInputs = with pkgs; [
- # buddy-mlir build tools
- cmake
- ninja
- python3
- llvmPkgs.bintools # For ld.lld
-
- # buddy-mlir libraries
- libjpeg
- libpng
- zlib-ng
- ];
-
- postHook = ''
- export PATH="${pkgs.clang-tools}/bin:$PATH"
- '';
- };
+ devShells.default = pkgs.buddy-mlir.devShell;
formatter = pkgs.nixpkgs-fmt;
}) //
diff --git a/frontend/Interfaces/buddy/Core/Container.h b/frontend/Interfaces/buddy/Core/Container.h
index db8b66c179..6e3ff18d53 100644
--- a/frontend/Interfaces/buddy/Core/Container.h
+++ b/frontend/Interfaces/buddy/Core/Container.h
@@ -132,7 +132,7 @@ MemRef::MemRef(intptr_t sizes[N], T init) : MemRef(sizes) {
template <typename T, size_t N>
MemRef<T, N>::MemRef(intptr_t sizes[N], bool needMalloc, intptr_t offset)
- : offset(offset), aligned(nullptr), allocated(nullptr) {
+ : allocated(nullptr), aligned(nullptr), offset(offset) {
for (size_t i = 0; i < N; i++) {
this->sizes[i] = sizes[i];
}
@@ -152,7 +152,7 @@ MemRef::MemRef(std::vector sizes, T init) : MemRef(sizes) {
template <typename T, size_t N>
MemRef<T, N>::MemRef(std::vector<size_t> sizes, bool needMalloc,
intptr_t offset)
- : offset(offset), aligned(nullptr), allocated(nullptr) {
+ : allocated(nullptr), aligned(nullptr), offset(offset) {
if (sizes.size() != N) {
throw std::runtime_error("Invalid number of dimensions.");
}
diff --git a/frontend/Interfaces/buddy/DAP/AudioContainer.h b/frontend/Interfaces/buddy/DAP/AudioContainer.h
index 9bc9245742..7c3901e733 100644
--- a/frontend/Interfaces/buddy/DAP/AudioContainer.h
+++ b/frontend/Interfaces/buddy/DAP/AudioContainer.h
@@ -14,6 +14,13 @@
//
//===----------------------------------------------------------------------===//
//
+// The audio decoding process in this file references the `AudioFile` library,
+// which is hereby acknowledged.
+// For the license of the `AudioFile` library,
+// please see: https://github.com/adamstark/AudioFile/blob/master/LICENSE
+//
+//===----------------------------------------------------------------------===//
+//
// Audio container descriptor.
//
//===----------------------------------------------------------------------===//
@@ -21,79 +28,592 @@
#ifndef FRONTEND_INTERFACES_BUDDY_DAP_AUDIOCONTAINER
#define FRONTEND_INTERFACES_BUDDY_DAP_AUDIOCONTAINER
-#include "AudioFile.h"
#include "buddy/Core/Container.h"
+#include <algorithm>
+#include <cstdint>
+#include <fstream>
+#include <iostream>
+#include <vector>
namespace dap {
-
-// Audio container.
-// - T represents the type of the elements.
-// - N represents the number of audio channels (Normally would be 1 or 2).
-// If N is smaller than channels from the file, only previous N channels will be
-// manipulated.
-template class Audio {
+template <typename T, size_t N> class Audio : public MemRef<T, N> {
public:
- Audio() : audioFile(), data(nullptr) {}
- explicit Audio(std::string filename) : audioFile(filename), data(nullptr) {}
- void fetchMetadata(const AudioFile &aud);
- bool save(std::string filename);
- AudioFile &getAudioFile() {
- moveToAudioFile();
- return audioFile;
- }
- MemRef &getMemRef() {
- moveToMemRef();
- return *data;
- }
-
-protected:
- void moveToMemRef();
- void moveToAudioFile();
- AudioFile audioFile;
- MemRef *data;
+ // Constructor to initialize the Audio MemRef object with a file name.
+ Audio(std::string filename);
+ // Constructor to convert MemRef object to Audio MemRef object. Member
+ // variables are initialized with default values.
+  Audio(MemRef<T, N> &&memref) noexcept;
+
+ // Retrieve the name of the audio format.
+ std::string getFormatName() const {
+ switch (this->audioFormat) {
+ case AudioFormat::WAV:
+ return "WAV";
+ default:
+ return "Unsupported format";
+ }
+ }
+  // Returns the number of bits per sample.
+  int getBitDepth() const { return static_cast<int>(this->bitsPerSample); }
+  // Returns the number of samples per channel.
+  size_t getSamplesNum() const { return this->numSamples; }
+  // Returns the number of audio channels.
+  int getChannelsNum() const { return static_cast<int>(this->numChannels); }
+  // Returns the sampling rate in samples per second.
+  int getSampleRate() const { return static_cast<int>(this->sampleRate); }
+
+  // Sets the number of bits per sample.
+  void setBitDepth(int bitDepth) {
+    this->bitsPerSample = static_cast<uint16_t>(bitDepth);
+  }
+  // Sets the number of samples per channel.
+  void setSamplesNum(size_t samplesNum) { this->numSamples = samplesNum; }
+  // Sets the number of audio channels.
+  void setChannelsNum(int channelsNum) {
+    this->numChannels = static_cast<uint16_t>(channelsNum);
+  }
+  // Sets the sampling rate in samples per second.
+  void setSampleRate(int sampleRate) {
+    this->sampleRate = static_cast<uint32_t>(sampleRate);
+  }
+
+ // Create an Audio File with file name and format.
+ bool saveToFile(std::string filename, std::string format);
+
+private:
+ // Sample bit depth.
+ uint16_t bitsPerSample;
+ // Number of samples per channel.
+ size_t numSamples;
+ // Number of audio channels.
+ uint16_t numChannels;
+ // Samples per second (Hz).
+ uint32_t sampleRate;
+ // Enum to represent supported audio formats.
+ enum class AudioFormat {
+ ERROR, // Represents an error or unsupported format.
+ WAV, // WAV format.
+ } audioFormat;
+ // Enum to represent byte order of data.
+ enum class Endianness { LittleEndian, BigEndian };
+
+  // Decoders for multiple audio file formats.
+  // Decode a WAV file into MemRef format.
+  bool decodeWaveFile(const std::vector<uint8_t> &fileData);
+
+  // Encoders for multiple audio file formats.
+  // Encode a MemRef into WAV format.
+  bool EncodeWaveFile(std::vector<uint8_t> &fileData);
+
+  // Helper functions for decoding and data manipulation
+  // Find the index of a specified chunk in the audio file.
+  size_t getIndexOfChunk(const std::vector<uint8_t> &fileData,
+                         const std::string &chunkHeaderID, size_t startIndex,
+                         Endianness endianness = Endianness::LittleEndian);
+  // Convert four bytes to a 32-bit integer according to byte order of data.
+  int32_t fourBytesToI32(const std::vector<uint8_t> &fileData,
+                         size_t startIndex,
+                         Endianness endianness = Endianness::LittleEndian);
+  // Convert two bytes to a 16-bit integer according to byte order of data.
+  int16_t twoBytesToI16(const std::vector<uint8_t> &fileData, size_t startIndex,
+                        Endianness endianness = Endianness::LittleEndian);
+  // Normalize 8-bit unsigned integer sample to a range of -1.0 to 1.0.
+  T oneByteToSample(uint8_t data) {
+    return static_cast<T>(data - 128) / static_cast<T>(128.);
+  }
+  // Normalize 16-bit signed integer sample to a range of -1.0 to 1.0.
+  T twoBytesToSample(int16_t data) {
+    return static_cast<T>(data) / static_cast<T>(32768.);
+  }
+
+  // Helper functions for encoding and data manipulation.
+  // Converts each character in the string to a byte.
+  void stringToBytes(std::vector<uint8_t> &fileData, const std::string &str) {
+    for (size_t i = 0; i < str.size(); i++)
+      fileData.push_back(static_cast<uint8_t>(str[i]));
+  }
+  // Converts a 32-bit integer to four bytes according to byte order of data.
+  void i32ToFourBytes(std::vector<uint8_t> &fileData, int32_t num,
+                      Endianness endianness = Endianness::LittleEndian);
+  // Converts a 16-bit integer to two bytes according to byte order of data.
+  void i16ToTwoBytes(std::vector<uint8_t> &fileData, int16_t num,
+                     Endianness endianness = Endianness::LittleEndian);
+  // Converts an audio sample to a 8-bit PCM format (one byte).
+  uint8_t sampleToOneByte(T sample);
+  // Converts an audio sample to a 16-bit PCM format (two bytes).
+  int16_t sampleToI16(T sample);
};
-template bool Audio::save(std::string filename) {
- if (!this->audioFile.samples) {
- auto temp = this->data->release();
- if constexpr (std::is_same_v) {
- for (int i = 0; i < audioFile.numSamples; i++) {
- if (temp[i] != temp[i]) { // To handle NaN values
- temp[i] = 0.9999999;
- } else { // Clamp the values between -1.0 to 1.0
- temp[i] = std::clamp(temp[i], float(-1.0), float(0.9999999));
- }
+// Audio Container Constructor.
+// Constructs an audio container object from the audio file path.
+template <typename T, size_t N> Audio<T, N>::Audio(std::string filePath) {
+ // ---------------------------------------------------------------------------
+ // 1. Read the audio file into a std::vector.
+ // ---------------------------------------------------------------------------
+ // Open the file in binary mode and position the file pointer at the end of
+ // the file.
+ std::ifstream file(filePath, std::ios::binary | std::ios::ate);
+ // Check if the file was successfully opened.
+ if (!file) {
+ throw std::runtime_error("Error: Unable to open file at " + filePath);
+ }
+ // Get the size of the file.
+ size_t dataLength = file.tellg();
+ // Move file pointer to the beginning of the file.
+ file.seekg(0, std::ios::beg);
+ // Create a vector to store the data.
+  std::vector<uint8_t> fileData(dataLength);
+ // Read the data.
+  if (!file.read(reinterpret_cast<char *>(fileData.data()), dataLength)) {
+ throw std::runtime_error("Error: Unable to read data from " + filePath);
+ }
+ // ---------------------------------------------------------------------------
+ // 2. Determine the audio format and decode the audio data into MemRef.
+ // ---------------------------------------------------------------------------
+ std::string header(fileData.begin(), fileData.begin() + 4);
+ // Check the file header to determine the format.
+ if (header == "RIFF") {
+ this->audioFormat = AudioFormat::WAV;
+ bool success = decodeWaveFile(fileData);
+ if (!success) {
+ this->audioFormat = AudioFormat::ERROR;
+ throw std::runtime_error("Failed to decode WAV file from " + filePath);
+ };
+ } else {
+ this->audioFormat = AudioFormat::ERROR;
+ throw std::runtime_error("Unsupported audio format detected in file " +
+ filePath);
+ }
+}
+
+// Constructs an audio container object from a MemRef object. Initializes
+// metadata with default values.
+template <typename T, size_t N>
+Audio<T, N>::Audio(MemRef<T, N> &&memref) noexcept
+    : MemRef<T, N>(std::move(memref)), bitsPerSample(0), numSamples(0),
+      numChannels(0), sampleRate(0) {}
+
+// Create Audio File.
+// Save Audio MemRef to the specified file path using the desired format.
+template <typename T, size_t N>
+bool Audio<T, N>::saveToFile(std::string filePath, std::string format) {
+ // ---------------------------------------------------------------------------
+ // 1. Determine the audio format and encode the MemRef into file data.
+ // ---------------------------------------------------------------------------
+ // Convert the string to lowercase before comparison, ensuring that case
+ // variations are handled without repeating conditions.
+ std::transform(format.begin(), format.end(), format.begin(), ::tolower);
+ // Vector for storing bytes in a specific audio format.
+  std::vector<uint8_t> fileData;
+ // Select encoder.
+ if (format == "wav" || format == "wave") {
+ bool success = EncodeWaveFile(fileData);
+ if (!success) {
+ std::cerr << "Failed to encode WAVE file." << std::endl;
+ return false;
+ }
+ } else {
+ std::cerr << "Unsupported: The encoding method for " << format
+ << " format is not yet supported." << std::endl;
+ return false;
+ }
+ // ---------------------------------------------------------------------------
+ // 2. Write std::vector into audio file.
+ // ---------------------------------------------------------------------------
+ std::ofstream outputFile(filePath, std::ios::binary);
+
+ if (outputFile.is_open()) {
+ for (size_t i = 0; i < fileData.size(); i++) {
+      char value = static_cast<char>(fileData[i]);
+ outputFile.write(&value, sizeof(char));
+ }
+
+ outputFile.close();
+
+ return true;
+ }
+
+ return false;
+}
+
+// WAV Audio File Decoder
+template <typename T, size_t N>
+bool Audio<T, N>::decodeWaveFile(const std::vector<uint8_t> &fileData) {
+ // This container class only cares about the data and key information in the
+ // audio file, so only the format and data chunk are decoded here.
+ // Find the starting indices of critical chunks within the WAV file.
+ size_t indexOfFormatChunk = getIndexOfChunk(fileData, "fmt ", 12);
+ size_t indexOfDataChunk = getIndexOfChunk(fileData, "data", 12);
+
+ // Decode the 'format' chunk to obtain format specifications.
+ // Format sub-chunk:
+ // sub-chunk ID: char[4] | 4 bytes | "fmt "
+ // sub-chunk size: uint32_t | 4 bytes
+ // audio format: uint16_t | 2 bytes | 1 for PCM
+ // number of channels: uint16_t | 2 bytes
+ // sample rate: uint32_t | 4 bytes
+ // byte rate: uint32_t | 4 bytes
+ // block align: uint16_t | 2 bytes
+ // bits per sample: uint16_t | 2 bytes
+ std::string formatChunkID(fileData.begin() + indexOfFormatChunk,
+ fileData.begin() + indexOfFormatChunk + 4);
+ // uint32_t fmtChunkSize = fourBytesToI32(fileData, indexOfFormatChunk + 4);
+ // uint16_t audioFormat = twoBytesToI16(fileData, indexOfFormatChunk + 8);
+ this->numChannels = twoBytesToI16(fileData, indexOfFormatChunk + 10);
+ this->sampleRate = fourBytesToI32(fileData, indexOfFormatChunk + 12);
+ // byteRate = sampleRate * numChannels * bitsPerSample / 8
+ // uint32_t byteRate = fourBytesToI32(fileData, indexOfFormatChunk + 16);
+ // blockAlign = numChannels * bitsPerSample / 8
+ uint16_t blockAlign = twoBytesToI16(fileData, indexOfFormatChunk + 20);
+ this->bitsPerSample = twoBytesToI16(fileData, indexOfFormatChunk + 22);
+  uint16_t numBytesPerSample = static_cast<uint16_t>(this->bitsPerSample) / 8;
+
+ // Decode `data` chunk.
+ // Data sub-chunk:
+ // sub-chunk ID: char[4] | 4 bytes | "data"
+ // sub-chunk size: uint32_t | 4 bytes
+ // data | remains
+ std::string dataChunkID(fileData.begin() + indexOfDataChunk,
+ fileData.begin() + indexOfDataChunk + 4);
+ int32_t dataChunkSize = fourBytesToI32(fileData, indexOfDataChunk + 4);
+ this->numSamples = dataChunkSize / blockAlign;
+ // size_t numSamplesPerChannels = this->numSamples / this->numChannels;
+ size_t samplesStartIndex = indexOfDataChunk + 8;
+
+ // Audio MemRef layout defaults to 1 dimension.
+ // Sample values from multiple channels are stored together.
+ if (N == 1) {
+ this->sizes[0] = this->numSamples;
+ } else if (N == this->numChannels) {
+ // TODO: add conversion from 1 dimension to multi-dimension
+ std::cerr << "Unsupported: The MemRef layout of multi-dimensional channels "
+ "is not yet supported."
+ << std::endl;
+ return false;
+ } else {
+ std::cerr << "Error: dimension mismatch (audio file channel: "
+ << this->numChannels << " MemRef layout channel: " << N << ")"
+ << std::endl;
+ return false;
+ }
+
+ // Allocate memory for MemRef.
+ this->setStrides();
+ size_t size = this->product(this->sizes);
+ this->allocated = (T *)malloc(sizeof(T) * size);
+ this->aligned = this->allocated;
+
+ // Sample data type: 8 bit
+ if (this->bitsPerSample == 8) {
+ size_t memrefIndex = 0;
+ for (size_t i = 0; i < this->numSamples; i++) {
+ for (size_t channel = 0; channel < this->numChannels; channel++) {
+ size_t sampleIndex =
+ samplesStartIndex + (blockAlign * i) + channel * numBytesPerSample;
+ this->aligned[memrefIndex] = oneByteToSample(fileData[sampleIndex]);
+ memrefIndex++;
+ }
+ }
+ }
+ // Sample data type: 16 bit
+ else if (this->bitsPerSample == 16) {
+ size_t memrefIndex = 0;
+ for (size_t i = 0; i < this->numSamples; i++) {
+ for (size_t channel = 0; channel < this->numChannels; channel++) {
+ size_t sampleIndex =
+ samplesStartIndex + (blockAlign * i) + channel * numBytesPerSample;
+ int16_t dataTwoBytes = twoBytesToI16(fileData, sampleIndex);
+ this->aligned[memrefIndex] = twoBytesToSample(dataTwoBytes);
+ memrefIndex++;
+ }
+ }
+ }
+ // Other data types are not currently supported.
+ else {
+ std::cerr << "Unsupported audio data type." << std::endl;
+ return false;
+ }
+
+ return true;
+}
+
+// WAV Audio File Encoder
+template <typename T, size_t N>
+bool Audio<T, N>::EncodeWaveFile(std::vector<uint8_t> &fileData) {
+ // Encode the 'header' chunk.
+ // RIFF chunk descriptor
+ // chunk ID: char[4] | 4 bytes | "RIFF"
+ // chunk size: uint32_t | 4bytes
+ // format: char[4] | 4 bytes | "WAVE"
+ stringToBytes(fileData, "RIFF");
+ int16_t audioFormat = this->bitsPerSample == 32 ? 0 : 1;
+ // Size for 'format' sub-chunk, doesn't include metadata length.
+ int32_t formatChunkSize = audioFormat == 1 ? 16 : 18;
+ // Size for 'data' sub-chunk, doesn't include metadata length.
+ int32_t dataChunkSize =
+ this->numSamples * this->numChannels * this->bitsPerSample / 8;
+ // The file size in bytes include header chunk size(4, not counting RIFF and
+ // WAVE), the format chunk size(formatChunkSize and 8 bytes for metadata), the
+ // data chunk size(dataChunkSize and 8 bytes for metadata).
+ int32_t fileSizeInBytes = 4 + formatChunkSize + 8 + dataChunkSize + 8;
+ i32ToFourBytes(fileData, fileSizeInBytes);
+ stringToBytes(fileData, "WAVE");
+
+ // Encode the 'format' chunk.
+ // Format sub-chunk:
+ // sub-chunk ID: char[4] | 4 bytes | "fmt "
+ // sub-chunk size: uint32_t | 4 bytes
+ // audio format: uint16_t | 2 bytes | 1 for PCM
+ // number of channels: uint16_t | 2 bytes
+ // sample rate: uint32_t | 4 bytes
+ // byte rate: uint32_t | 4 bytes
+ // block align: uint16_t | 2 bytes
+ // bits per sample: uint16_t | 2 bytes
+ stringToBytes(fileData, "fmt ");
+ i32ToFourBytes(fileData, formatChunkSize);
+ i16ToTwoBytes(fileData, audioFormat);
+  i16ToTwoBytes(fileData, static_cast<int16_t>(this->numChannels));
+ i32ToFourBytes(fileData, static_cast