From fad96292e8fca71f588133bd914d1adf1d0b7953 Mon Sep 17 00:00:00 2001 From: Maksim Levental Date: Fri, 18 Oct 2024 17:06:47 -0700 Subject: [PATCH] Direct Driver HAL (#816) --- .github/CODEOWNERS | 3 +- .github/workflows/ci-linux.yml | 74 +- .github/workflows/ci-macos.yml | 5 +- .github/workflows/ci-windows.yml | 20 +- README.md | 125 +- build_tools/build_test_cpp.ps1 | 8 +- build_tools/build_test_cpp.sh | 122 +- .../ci/amdxdna_driver_utils/amdxdna_accel.py | 789 +++++++++++ .../ci/amdxdna_driver_utils/amdxdna_ioctl.py | 217 +++ build_tools/ci/cpu_comparison/run.py | 38 +- build_tools/ci/run_all_runtime_tests.sh | 52 + build_tools/ci/run_matmul_test.sh | 62 +- build_tools/download_peano.ps1 | 5 +- cmake/iree_aie_utils.cmake | 3 + .../AMD-AIE/iree-amd-aie/CMakeLists.txt | 1 + .../iree-amd-aie/PluginRegistration.cpp | 11 +- .../AMD-AIE/iree-amd-aie/Target/AIETarget.cpp | 310 +++-- .../AMD-AIE/iree-amd-aie/Target/AIETarget.h | 10 + .../iree-amd-aie/Target/CMakeLists.txt | 1 + .../AMD-AIE/iree-amd-aie/Target/XCLBinGen.cpp | 135 +- .../AMD-AIE/iree-amd-aie/Target/XCLBinGen.h | 5 +- .../Target/test/amd_aie_target_backend.mlir | 4 +- iree_runtime_plugin.cmake | 2 +- runtime/src/iree-amd-aie/CMakeLists.txt | 10 +- .../driver/xrt-lite/CMakeLists.txt | 48 + .../iree-amd-aie/driver/xrt-lite/allocator.cc | 200 +++ .../iree-amd-aie/driver/xrt-lite/allocator.h | 19 + .../src/iree-amd-aie/driver/xrt-lite/api.h | 44 + .../iree-amd-aie/driver/xrt-lite/buffer.cc | 176 +++ .../src/iree-amd-aie/driver/xrt-lite/buffer.h | 24 + .../driver/xrt-lite/cts/CMakeLists.txt | 110 ++ .../xrt-lite/cts/executable_cache_test.cc | 85 ++ .../xrt-lite/cts/executable_cache_test.mlir | 33 + .../xrt-lite/cts/matmul_dispatch_test.cc | 224 ++++ .../iree-amd-aie/driver/xrt-lite/device.cc | 285 ++++ .../src/iree-amd-aie/driver/xrt-lite/device.h | 33 + .../driver/xrt-lite/direct_command_buffer.cc | 220 ++++ .../driver/xrt-lite/direct_command_buffer.h | 24 + .../iree-amd-aie/driver/xrt-lite/driver.cc | 134 ++ 
.../driver/xrt-lite/executable.cc | 228 ++++ .../iree-amd-aie/driver/xrt-lite/executable.h | 47 + .../driver/xrt-lite/nop_executable_cache.cc | 107 ++ .../driver/xrt-lite/nop_executable_cache.h | 21 + .../driver/xrt-lite/nop_semaphore.cc | 71 + .../driver/xrt-lite/nop_semaphore.h | 19 + .../xrt-lite/registration/CMakeLists.txt | 23 + .../xrt-lite/registration/driver_module.c | 155 +++ .../xrt-lite/registration/driver_module.h | 24 + .../driver/xrt-lite/shim/CMakeLists.txt | 10 + .../driver/xrt-lite/shim/linux/CMakeLists.txt | 8 + .../xrt-lite/shim/linux/kmq/CMakeLists.txt | 34 + .../xrt-lite/shim/linux/kmq/amdxdna_accel.h | 591 +++++++++ .../driver/xrt-lite/shim/linux/kmq/bo.cpp | 474 +++++++ .../driver/xrt-lite/shim/linux/kmq/bo.h | 116 ++ .../driver/xrt-lite/shim/linux/kmq/device.cpp | 277 ++++ .../driver/xrt-lite/shim/linux/kmq/device.h | 70 + .../driver/xrt-lite/shim/linux/kmq/ert.h | 1163 +++++++++++++++++ .../driver/xrt-lite/shim/linux/kmq/fence.cpp | 245 ++++ .../driver/xrt-lite/shim/linux/kmq/fence.h | 57 + .../driver/xrt-lite/shim/linux/kmq/hwctx.cpp | 164 +++ .../driver/xrt-lite/shim/linux/kmq/hwctx.h | 79 ++ .../driver/xrt-lite/shim/linux/kmq/hwq.cpp | 109 ++ .../driver/xrt-lite/shim/linux/kmq/hwq.h | 33 + .../driver/xrt-lite/shim/linux/kmq/kernel.cpp | 126 ++ .../driver/xrt-lite/shim/linux/kmq/kernel.h | 37 + .../xrt-lite/shim/linux/kmq/shim_debug.cpp | 37 + .../xrt-lite/shim/linux/kmq/shim_debug.h | 49 + .../driver/xrt-lite/shim/linux/kmq/xrt_mem.h | 147 +++ .../src/iree-amd-aie/driver/xrt-lite/util.h | 33 + .../iree-amd-aie/driver/xrt/CMakeLists.txt | 2 + .../driver/xrt/cts/CMakeLists.txt | 114 ++ .../driver/xrt/cts/executable_cache_test.cc | 85 ++ .../driver/xrt/cts/executable_cache_test.mlir | 33 + .../driver/xrt/cts/matmul_dispatch_test.cc | 224 ++++ .../src/iree-amd-aie/schemas/CMakeLists.txt | 13 + .../schemas/pdi_executable_def.fbs | 57 + tests/conftest.py | 36 +- 77 files changed, 8432 insertions(+), 357 deletions(-) create mode 100644 
build_tools/ci/amdxdna_driver_utils/amdxdna_accel.py create mode 100644 build_tools/ci/amdxdna_driver_utils/amdxdna_ioctl.py create mode 100755 build_tools/ci/run_all_runtime_tests.sh create mode 100644 runtime/src/iree-amd-aie/driver/xrt-lite/CMakeLists.txt create mode 100644 runtime/src/iree-amd-aie/driver/xrt-lite/allocator.cc create mode 100644 runtime/src/iree-amd-aie/driver/xrt-lite/allocator.h create mode 100644 runtime/src/iree-amd-aie/driver/xrt-lite/api.h create mode 100644 runtime/src/iree-amd-aie/driver/xrt-lite/buffer.cc create mode 100644 runtime/src/iree-amd-aie/driver/xrt-lite/buffer.h create mode 100644 runtime/src/iree-amd-aie/driver/xrt-lite/cts/CMakeLists.txt create mode 100644 runtime/src/iree-amd-aie/driver/xrt-lite/cts/executable_cache_test.cc create mode 100644 runtime/src/iree-amd-aie/driver/xrt-lite/cts/executable_cache_test.mlir create mode 100644 runtime/src/iree-amd-aie/driver/xrt-lite/cts/matmul_dispatch_test.cc create mode 100644 runtime/src/iree-amd-aie/driver/xrt-lite/device.cc create mode 100644 runtime/src/iree-amd-aie/driver/xrt-lite/device.h create mode 100644 runtime/src/iree-amd-aie/driver/xrt-lite/direct_command_buffer.cc create mode 100644 runtime/src/iree-amd-aie/driver/xrt-lite/direct_command_buffer.h create mode 100644 runtime/src/iree-amd-aie/driver/xrt-lite/driver.cc create mode 100644 runtime/src/iree-amd-aie/driver/xrt-lite/executable.cc create mode 100644 runtime/src/iree-amd-aie/driver/xrt-lite/executable.h create mode 100644 runtime/src/iree-amd-aie/driver/xrt-lite/nop_executable_cache.cc create mode 100644 runtime/src/iree-amd-aie/driver/xrt-lite/nop_executable_cache.h create mode 100644 runtime/src/iree-amd-aie/driver/xrt-lite/nop_semaphore.cc create mode 100644 runtime/src/iree-amd-aie/driver/xrt-lite/nop_semaphore.h create mode 100644 runtime/src/iree-amd-aie/driver/xrt-lite/registration/CMakeLists.txt create mode 100644 runtime/src/iree-amd-aie/driver/xrt-lite/registration/driver_module.c create mode 100644 
runtime/src/iree-amd-aie/driver/xrt-lite/registration/driver_module.h create mode 100644 runtime/src/iree-amd-aie/driver/xrt-lite/shim/CMakeLists.txt create mode 100644 runtime/src/iree-amd-aie/driver/xrt-lite/shim/linux/CMakeLists.txt create mode 100644 runtime/src/iree-amd-aie/driver/xrt-lite/shim/linux/kmq/CMakeLists.txt create mode 100644 runtime/src/iree-amd-aie/driver/xrt-lite/shim/linux/kmq/amdxdna_accel.h create mode 100644 runtime/src/iree-amd-aie/driver/xrt-lite/shim/linux/kmq/bo.cpp create mode 100644 runtime/src/iree-amd-aie/driver/xrt-lite/shim/linux/kmq/bo.h create mode 100644 runtime/src/iree-amd-aie/driver/xrt-lite/shim/linux/kmq/device.cpp create mode 100644 runtime/src/iree-amd-aie/driver/xrt-lite/shim/linux/kmq/device.h create mode 100644 runtime/src/iree-amd-aie/driver/xrt-lite/shim/linux/kmq/ert.h create mode 100644 runtime/src/iree-amd-aie/driver/xrt-lite/shim/linux/kmq/fence.cpp create mode 100644 runtime/src/iree-amd-aie/driver/xrt-lite/shim/linux/kmq/fence.h create mode 100644 runtime/src/iree-amd-aie/driver/xrt-lite/shim/linux/kmq/hwctx.cpp create mode 100644 runtime/src/iree-amd-aie/driver/xrt-lite/shim/linux/kmq/hwctx.h create mode 100644 runtime/src/iree-amd-aie/driver/xrt-lite/shim/linux/kmq/hwq.cpp create mode 100644 runtime/src/iree-amd-aie/driver/xrt-lite/shim/linux/kmq/hwq.h create mode 100644 runtime/src/iree-amd-aie/driver/xrt-lite/shim/linux/kmq/kernel.cpp create mode 100644 runtime/src/iree-amd-aie/driver/xrt-lite/shim/linux/kmq/kernel.h create mode 100644 runtime/src/iree-amd-aie/driver/xrt-lite/shim/linux/kmq/shim_debug.cpp create mode 100644 runtime/src/iree-amd-aie/driver/xrt-lite/shim/linux/kmq/shim_debug.h create mode 100755 runtime/src/iree-amd-aie/driver/xrt-lite/shim/linux/kmq/xrt_mem.h create mode 100644 runtime/src/iree-amd-aie/driver/xrt-lite/util.h create mode 100644 runtime/src/iree-amd-aie/driver/xrt/cts/CMakeLists.txt create mode 100644 runtime/src/iree-amd-aie/driver/xrt/cts/executable_cache_test.cc create mode 
100644 runtime/src/iree-amd-aie/driver/xrt/cts/executable_cache_test.mlir create mode 100644 runtime/src/iree-amd-aie/driver/xrt/cts/matmul_dispatch_test.cc create mode 100644 runtime/src/iree-amd-aie/schemas/pdi_executable_def.fbs diff --git a/.github/CODEOWNERS b/.github/CODEOWNERS index 61e98b3de..138afc058 100644 --- a/.github/CODEOWNERS +++ b/.github/CODEOWNERS @@ -15,8 +15,7 @@ /compiler/ @MaheshRavishankar @nirvedhmeshram @yzhang93 @Abhishek-Varma @jtuyls # Runtime -/runtime/ @nirvedhmeshram -/runtime/src/iree-amd-aie/aie_runtime @makslevental +/runtime/ @makslevental # AIE Passes /compiler/plugins/target/AMD-AIE/aie @makslevental diff --git a/.github/workflows/ci-linux.yml b/.github/workflows/ci-linux.yml index 9acdcf2ec..756d622db 100644 --- a/.github/workflows/ci-linux.yml +++ b/.github/workflows/ci-linux.yml @@ -49,13 +49,10 @@ jobs: git remote add origin $REPO_ADDRESS git -c protocol.version=2 fetch --depth 1 origin $BRANCH_NAME git reset --hard FETCH_HEAD - git -c submodule."third_party/torch-mlir".update=none -c submodule."third_party/stablehlo".update=none -c submodule."src/runtime_src/core/common/aiebu".update=none submodule update --init --recursive --depth 1 --single-branch -j 10 - - - name: Install deps - run: | - dnf install -y almalinux-release-devel epel-release - yum remove -y openssl-devel zlib-devel || true - yum install -y protobuf-devel protobuf-compiler tmate + git -c submodule."third_party/torch-mlir".update=none \ + -c submodule."third_party/stablehlo".update=none \ + -c submodule."third_party/XRT".update=none \ + submodule update --init --recursive --depth 1 --single-branch -j 10 - name: Python deps run: | @@ -69,6 +66,11 @@ jobs: key: ${{ env.CACHE_KEY }} restore-keys: linux-build-test-cpp- + - name: Peano dep + run: | + bash build_tools/download_peano.sh + echo "PEANO_INSTALL_DIR=$PWD/llvm-aie" >> $GITHUB_ENV + - name: Build packages run: | export cache_dir="${{ env.CACHE_DIR }}" @@ -147,60 +149,54 @@ jobs: source .venv/bin/activate 
pip install -r tests/requirements.txt + - name: Query device info + run: | + source .venv/bin/activate + echo "aie-metadata" + python build_tools/ci/amdxdna_driver_utils/amdxdna_ioctl.py --aie-metadata + echo "aie-version" + python build_tools/ci/amdxdna_driver_utils/amdxdna_ioctl.py --aie-version + echo "XRT_LITE_N_CORE_ROWS=$(python build_tools/ci/amdxdna_driver_utils/amdxdna_ioctl.py --num-rows)" >> $GITHUB_ENV + echo "XRT_LITE_N_CORE_COLS=$(python build_tools/ci/amdxdna_driver_utils/amdxdna_ioctl.py --num-cols)" >> $GITHUB_ENV + - name : E2E comparison of AIE to llvm-cpu run: | source .venv/bin/activate - source /opt/xilinx/xrt/setup.sh python build_tools/ci/cpu_comparison/run.py \ test_aie_vs_cpu \ $PWD/iree-install \ $PWD/llvm-aie \ - --xrt-dir /opt/xilinx/xrt \ --vitis-dir /opt/Xilinx/Vitis/2024.2 \ - --reset-npu-between-runs -v + --reset-npu-between-runs -v \ + --xrt_lite_n_core_rows=$XRT_LITE_N_CORE_ROWS \ + --xrt_lite_n_core_cols=$XRT_LITE_N_CORE_COLS - name: E2E correctness matmul test run: | - # Without this additional line an error like - # - # [XRT] ERROR: Failed to allocate host memory buffer (mmap(len=10616832, prot=3, flags=8193, offset=4294967296) - # failed (err=11): Resource temporarily unavailable), make sure host bank is enabled (see xbutil configure --host-mem) - # iree-amd-aie/runtime/src/iree-amd-aie/driver/xrt/direct_allocator.cc:179: RESOURCE_EXHAUSTED; could not allocate - # memory for buffer; while invoking C++ function matmul_test.generate_random_matrix; while calling import; - # - # might be observed when too much memory is allocated. This - # error was seen when running a bf16->f32 matmul with m=n=k=2304. - # - # This line was suggested at https://github.com/Xilinx/mlir-air/issues/566 - # - # Note that this is only half of the fix. It is also necessary that - # the machine that CI is running on has permission to run this line. 
- # - # This permission can be adding by adding the line - # ``` - # %github ALL=(ALL) NOPASSWD: /usr/bin/prlimit * - # ``` - # - # to the file /etc/sudoers.d/github, which can be done by running - # ``` - # sudo visudo -f /etc/sudoers.d/github - # ``` - # on the github CI machine. + # https://stackoverflow.com/a/17567422 + # shim_xdna::bo::map_drm_bo does an mmap with MAP_LOCKED + # which can fail if limit is too low sudo prlimit -lunlimited --pid $$ source .venv/bin/activate - source /opt/xilinx/xrt/setup.sh bash build_tools/ci/run_matmul_test.sh \ test_matmuls \ iree-install \ $PWD/llvm-aie \ - /opt/xilinx/xrt \ /opt/Xilinx/Vitis/2024.2 - name: Python tests run: | source .venv/bin/activate - source /opt/xilinx/xrt/setup.sh pytest -v tests \ --capture=tee-sys \ --iree-install-dir=$PWD/iree-install \ - --peano-install-dir=$PWD/llvm-aie + --peano-install-dir=$PWD/llvm-aie \ + --xrt_lite_n_core_rows=$XRT_LITE_N_CORE_ROWS \ + --xrt_lite_n_core_cols=$XRT_LITE_N_CORE_COLS + + - name: XRT-LITE tests + run: | + DEVICE_TEST_DIR="$PWD/iree-install/device_tests" + for t in $(ls $DEVICE_TEST_DIR); do + $DEVICE_TEST_DIR/$t --xrt_lite_n_core_rows=$XRT_LITE_N_CORE_ROWS --xrt_lite_n_core_cols=$XRT_LITE_N_CORE_COLS + done diff --git a/.github/workflows/ci-macos.yml b/.github/workflows/ci-macos.yml index 9d273d462..4871e9745 100644 --- a/.github/workflows/ci-macos.yml +++ b/.github/workflows/ci-macos.yml @@ -62,7 +62,10 @@ jobs: git remote add origin $REPO_ADDRESS git -c protocol.version=2 fetch --depth 1 origin $BRANCH_NAME git reset --hard FETCH_HEAD - git -c submodule."third_party/torch-mlir".update=none -c submodule."third_party/stablehlo".update=none -c submodule."src/runtime_src/core/common/aiebu".update=none submodule update --init --recursive --depth 1 --single-branch -j 10 + git -c submodule."third_party/torch-mlir".update=none \ + -c submodule."third_party/stablehlo".update=none \ + -c submodule."third_party/XRT".update=none \ + submodule update --init --recursive --depth
1 --single-branch -j 10 - uses: actions/setup-python@v4 with: diff --git a/.github/workflows/ci-windows.yml b/.github/workflows/ci-windows.yml index 48777f7f5..86b5f4f8a 100644 --- a/.github/workflows/ci-windows.yml +++ b/.github/workflows/ci-windows.yml @@ -60,7 +60,10 @@ jobs: git remote add origin $REPO_ADDRESS git -c protocol.version=2 fetch --depth 1 origin $BRANCH_NAME git reset --hard FETCH_HEAD - git -c submodule."third_party/torch-mlir".update=none -c submodule."third_party/stablehlo".update=none -c submodule."src/runtime_src/core/common/aiebu".update=none submodule update --init --recursive --depth 1 --single-branch -j 10 + git -c submodule."third_party/torch-mlir".update=none \ + -c submodule."third_party/stablehlo".update=none \ + -c submodule."src/runtime_src/core/common/aiebu".update=none \ + submodule update --init --recursive --depth 1 --single-branch -j 10 - name: Setup Cpp uses: aminya/setup-cpp@v1 @@ -87,6 +90,11 @@ jobs: key: ${{ env.CACHE_KEY }} restore-keys: windows-build-test-cpp- + - name: Peano dep + run: | + .\build_tools\download_peano.ps1 + Add-Content -Path $env:GITHUB_ENV -Value "PEANO_INSTALL_DIR=$PWD\llvm-aie" + - name: Build packages run: | $env:cache_dir = "${{ env.CACHE_DIR }}" @@ -94,7 +102,6 @@ jobs: .\build_tools\build_llvm.ps1 # Remove-Item -Path "$pwd\llvm-build" -Force $env:llvm_install_dir = "$pwd\llvm-install" - echo $env:llvm_install_dir .\build_tools.\build_test_cpp.ps1 - name: Create artifacts @@ -170,6 +177,7 @@ jobs: shell: bash run: | source .venv/Scripts/activate + export DEVICE_HAL=xrt bash build_tools/ci/run_matmul_test.sh \ /c/test_matmuls \ $PWD/iree-install \ @@ -182,7 +190,8 @@ jobs: python build_tools/ci/cpu_comparison/run.py \ /c/test_aie_vs_cpu \ $PWD/iree-install \ - $PWD/llvm-aie -v + $PWD/llvm-aie -v \ + --device-hal=xrt - name: Python tests run: | @@ -191,5 +200,6 @@ jobs: mkdir temp pytest tests -sv ` --basetemp=$PWD\temp ` - --iree-install-dir="$PWD/iree-install" ` - 
--peano-install-dir="$PWD/llvm-aie" + --iree-install-dir="$PWD\iree-install" ` + --peano-install-dir="$PWD\llvm-aie" ` + --device-hal=xrt diff --git a/README.md b/README.md index d42c62e19..c6bbe5c1f 100644 --- a/README.md +++ b/README.md @@ -4,12 +4,7 @@ # AMD AIE Plugin for IREE -This repository contains an early-phase IREE compiler and runtime plugin for interfacing the AMD AIE accelerator to IREE. - -## Architectural Overview - -![image](https://github.com/nod-ai/iree-amd-aie/assets/74956/3fa73139-5fdf-4658-86c3-0705352c4ea0) - +This repository contains an early-phase IREE compiler and runtime plugin for targeting AMD NPUs with IREE. ## Developer Setup @@ -26,32 +21,30 @@ git clone --recursive git@github.com:nod-ai/iree-amd-aie.git git clone --recursive https://github.com/nod-ai/iree-amd-aie.git ``` -or if you want a faster checkout - +or, if you want a faster checkout, ``` git \ -c submodule."third_party/torch-mlir".update=none \ -c submodule."third_party/stablehlo".update=none \ - -c submodule."src/runtime_src/core/common/aiebu".update=none \ + -c submodule."third_party/XRT".update=none \ clone \ --recursive \ --shallow-submodules \ - https://github.com/nod-ai/iree-amd-aie.git + git@github.com:nod-ai/iree-amd-aie.git # https://github.com/nod-ai/iree-amd-aie.git ``` -The above avoids cloning entire repo histories, and skips unused nested submodules. +The above avoids cloning entire repo histories for submodules, and skips a few, currently, unused, +submodules that are nested in IREE. 
## Building (along with IREE) ### Just show me the CMake -To configure and build with XRT runtime enabled - ``` cd iree-amd-aie cmake \ - -B $WHERE_YOU_WOULD_LIKE_TO_BUILD \ + -B $WHERE_YOU_WOULD_LIKE_TO_BUILD \ -S third_party/iree \ -DIREE_CMAKE_PLUGIN_PATHS=$PWD \ -DIREE_BUILD_PYTHON_BINDINGS=ON \ @@ -62,20 +55,20 @@ cmake \ -DIREE_TARGET_BACKEND_DEFAULTS=OFF \ -DIREE_TARGET_BACKEND_LLVM_CPU=ON \ -DIREE_BUILD_TESTS=ON \ - -DIREE_EXTERNAL_HAL_DRIVERS=xrt \ - -DCMAKE_INSTALL_PREFIX=$WHERE_YOU_WOULD_LIKE_TO_INSTALL -cmake --build $WHERE_YOU_WOULD_LIKE_TO_BUILD + -DIREE_EXTERNAL_HAL_DRIVERS=xrt-lite \ + -DCMAKE_INSTALL_PREFIX=$WHERE_YOU_WOULD_LIKE_TO_INSTALL +cmake --build $WHERE_YOU_WOULD_LIKE_TO_BUILD ``` ### Instructions -The bare minimum configure command for IREE with the amd-aie plugin +The bare minimum configure command for IREE with the amd-aie plugin ``` cmake \ - -B $WHERE_YOU_WOULD_LIKE_TO_BUILD \ - -S $IREE_REPO_SRC_DIR \ - -DIREE_CMAKE_PLUGIN_PATHS=$IREE_AMD_AIE_REPO_SRC_DIR \ + -B $WHERE_YOU_WOULD_LIKE_TO_BUILD \ + -S $IREE_REPO_SRC_DIR \ + -DIREE_CMAKE_PLUGIN_PATHS=$IREE_AMD_AIE_REPO_SRC_DIR \ -DIREE_BUILD_PYTHON_BINDINGS=ON ``` @@ -88,7 +81,8 @@ Very likely, you will want to use `ccache` and `lld` (or some other modern linke -DCMAKE_SHARED_LINKER_FLAGS="-fuse-ld=lld" ``` -If you don't plan on using any of IREE's frontends or backends/targets (e.g., you're doing work on this code base itself), you can opt-out of everything (except the `llvm-cpu` backend) with +If you don't plan on using any of IREE's frontends or backends/targets (e.g., you're doing work on this code base itself), +you can opt-out of everything (except the `llvm-cpu` backend) with ``` -DIREE_INPUT_STABLEHLO=OFF \ @@ -111,75 +105,64 @@ If you're "bringing your own LLVM", i.e., you have a prebuilt/compiled distribut -DIREE_BUILD_BUNDLED_LLVM=OFF ``` -In this case you will need to supply `-DLLVM_EXTERNAL_LIT=$SOMEWHERE` (e.g., `pip install lit; SOMEWHERE=$(which lit)`). +In this case you will need `lit` somewhere in your environment and you will need to add to CMake `-DLLVM_EXTERNAL_LIT=$SOMEWHERE` +(e.g., `pip install lit; SOMEWHERE=$(which lit)`).
-Note, getting the right/matching build of LLVM, that works with IREE is tough (besides the commit hash, there are various flags to set). -To enable adventurous users to avail themselves of `-DIREE_BUILD_BUNDLED_LLVM=OFF` we cache/store/save the LLVM distribution for every successful CI run. -These can then be downloaded by checking the artifacts section of any recent CI run's [Summary page](https://github.com/nod-ai/iree-amd-aie/actions/runs/10713474448): +See [Bringing your own LLVM](#bringing-your-own-llvm) below for more information on using prebuilt/compiled distributions of LLVM. -

- -

+## Testing -Lit tests specific to AIE can be run with something like +Lit tests (i.e., compiler tests) specific to AIE can be run with something like ``` -cd $WHERE_YOU_WOULD_LIKE_TO_BUILD -ctest -R amd-aie +cd $WHERE_YOU_WOULD_LIKE_TO_BUILD +ctest -R amd-aie --output-on-failure -j 10 ``` -Other tests which run on hardware and requiring XRT are in the `build_tools` subdirectory. - -## Runtime driver setup +(the `-j 10` runs `10` tests in parallel) -To enable the runtime driver, you need to also enable the XRT HAL +Other tests, which run on device, are in the `build_tools` subdirectory. +See [build_tools/ci/run_all_runtime_tests.sh](build_tools/ci/run_all_runtime_tests.sh) for an example script that shows how to run all the runtime tests. -``` - -DIREE_EXTERNAL_HAL_DRIVERS=xrt -``` +## Pro-tips -Additional IREE-specific flags are explained at [IREE's build instructions](https://iree.dev/building-from-source/getting-started/#quickstart-clone-and-build). To use Ninja instead of Make, and clang++ instead of g++, you can add +### Bringing your own LLVM +When using a pre-built distribution of LLVM, getting the right/matching build, that works with IREE, is tough (besides the commit hash, there are various flags to set). +To enable adventurous users to avail themselves of `-DIREE_BUILD_BUNDLED_LLVM=OFF` we cache/store/save the LLVM distribution for every successful CI run. +These can then be downloaded by checking the artifacts section of any recent CI run's [Summary page](https://github.com/nod-ai/iree-amd-aie/actions/runs/10713474448): -``` - -G Ninja \ - -DCMAKE_CXX_COMPILER=clang++ \ - -DCMAKE_C_COMPILER=clang -``` +

+ +

-### Ubuntu Dependencies +### Debugging HAL -XRT requires a number of packages. Here are the requirements for various operating systems +You can turn on HAL API tracing by adding to CMake: ``` -apt install \ - libcurl4-openssl-dev \ - libdrm-dev \ - libelf-dev \ - libprotobuf-dev \ - libudev-dev \ - pkg-config \ - protobuf-compiler \ - python3-pybind11 \ - systemtap-sdt-dev \ - uuid-dev +-DIREE_ENABLE_RUNTIME_TRACING=ON +-DIREE_TRACING_PROVIDER=console +// optional but recommended +-DIREE_TRACING_CONSOLE_FLUSH=1 ``` -### RH Based Deps +This will show you all the HAL APIs that have `IREE_TRACE_ZONE_BEGIN ... IREE_TRACE_ZONE_END` that are hit during a run/execution (of, e.g., `iree-run-module`). -This is an incomplete list derived by adding what is needed to our development base manylinux (AlmaLinux 8) image. +You can turn on VM tracing by adding to CMake: ``` -yum install \ - libcurl-devel \ - libdrm-devel \ - libudev-devel \ - libuuid-devel \ - ncurses-devel \ - pkgconfig \ - protobuf-compiler \ - protobuf-devel \ - systemtap-sdt-devel \ - uuid-devel +-DIREE_VM_EXECUTION_TRACING_ENABLE=1 +-DIREE_VM_EXECUTION_TRACING_FORCE_ENABLE=1 +// optional +-DIREE_VM_EXECUTION_TRACING_SRC_LOC_ENABLE=1 ``` + +This will show you all of the [VM dispatches](https://github.com/iree-org/iree/blob/0e8a5737dfe49a48a4e9c15ba7a7d24dd2fd7623/runtime/src/iree/vm/bytecode/dispatch.c#L661) that actually occur during a run/execution. +Note, this is roughly equivalent to [passing](https://github.com/nod-ai/iree-amd-aie/blob/737092791dc2428ad71bc172f69804c583b0f60e/build_tools/ci/run_matmul_test.sh#L420) `--compile-to=vm` to `iree-compile`.
+ +## Architectural overview (out of date) + +![image](https://github.com/nod-ai/iree-amd-aie/assets/74956/3fa73139-5fdf-4658-86c3-0705352c4ea0) + diff --git a/build_tools/build_test_cpp.ps1 b/build_tools/build_test_cpp.ps1 index f508ce5ff..e686f40cf 100644 --- a/build_tools/build_test_cpp.ps1 +++ b/build_tools/build_test_cpp.ps1 @@ -84,6 +84,12 @@ $CMAKE_ARGS = @( "-DIREE_BUILD_PYTHON_BINDINGS=ON" ) +$peano_install_dir = "$env:PEANO_INSTALL_DIR" +if ($peano_install_dir -and (Test-Path "$peano_install_dir")) +{ + $CMAKE_ARGS += @("-DPEANO_INSTALL_DIR=$peano_install_dir") +} + if ($llvm_install_dir -and (Test-Path "$llvm_install_dir")) { echo "using existing llvm install @ $llvm_install_dir" @@ -116,7 +122,7 @@ echo "-----" # better have git-bash installed... $env:Path = "C:\Program Files\Git\bin;$env:Path" pushd $build_dir -& bash -l -c "ctest -R amd-aie --output-on-failure -j --repeat until-pass:5" +& bash -l -c "ctest -R amd-aie -E driver --output-on-failure -j --repeat until-pass:5" popd if ($llvm_install_dir -and (Test-Path "$llvm_install_dir")) diff --git a/build_tools/build_test_cpp.sh b/build_tools/build_test_cpp.sh index d04bf8bb4..a1d610f64 100644 --- a/build_tools/build_test_cpp.sh +++ b/build_tools/build_test_cpp.sh @@ -31,7 +31,7 @@ mkdir -p "${cache_dir}/pip" python="$(which python)" echo "Using python: $python" -if [[ "$OSTYPE" == "linux-gnu"* ]]; then +if [[ "$OSTYPE" == "linux"* ]]; then export CMAKE_TOOLCHAIN_FILE="$this_dir/linux_default_toolchain.cmake" export CC=clang export CXX=clang++ @@ -61,54 +61,71 @@ echo '{ }' > $iree_dir/CMakeUserPresets.json cd $iree_dir -CMAKE_ARGS="\ - -GNinja \ - -DCMAKE_BUILD_TYPE=Release \ - -DCMAKE_INSTALL_PREFIX=$install_dir \ - -DCMAKE_INSTALL_LIBDIR=lib \ - -DIREE_ERROR_ON_MISSING_SUBMODULES=OFF \ - -DIREE_ENABLE_ASSERTIONS=ON \ - -DIREE_BUILD_SAMPLES=OFF \ - -DIREE_BUILD_PYTHON_BINDINGS=ON \ - -DIREE_BUILD_BINDINGS_TFLITE=OFF \ - -DIREE_HAL_DRIVER_DEFAULTS=OFF \ - -DIREE_HAL_DRIVER_LOCAL_SYNC=ON \ - 
-DIREE_HAL_DRIVER_LOCAL_TASK=ON \ - -DIREE_TARGET_BACKEND_DEFAULTS=OFF \ - -DIREE_TARGET_BACKEND_LLVM_CPU=ON \ - -DIREE_INPUT_TOSA=OFF \ - -DIREE_INPUT_STABLEHLO=OFF \ - -DIREE_INPUT_TORCH=OFF \ - -DCMAKE_OBJECT_PATH_MAX=4096 \ - -DIREE_CMAKE_PLUGIN_PATHS=$repo_root" - -if [ -d "${llvm_install_dir}" ]; then - CMAKE_ARGS="$CMAKE_ARGS \ - -DIREE_BUILD_BUNDLED_LLVM=OFF \ - -DClang_DIR=$llvm_install_dir/lib/cmake/clang \ - -DLLD_DIR=$llvm_install_dir/lib/cmake/lld \ - -DMLIR_DIR=$llvm_install_dir/lib/cmake/mlir \ - -DLLVM_DIR=$llvm_install_dir/lib/cmake/llvm" +CMAKE_ARGS=( + -GNinja + -DCMAKE_BUILD_TYPE=Release + -DCMAKE_INSTALL_PREFIX="$install_dir" + -DCMAKE_INSTALL_LIBDIR=lib + -DIREE_ERROR_ON_MISSING_SUBMODULES=OFF + -DIREE_ENABLE_ASSERTIONS=ON + -DIREE_BUILD_SAMPLES=OFF + -DIREE_BUILD_PYTHON_BINDINGS=ON + -DIREE_BUILD_BINDINGS_TFLITE=OFF + -DIREE_HAL_DRIVER_DEFAULTS=OFF + -DIREE_HAL_DRIVER_LOCAL_SYNC=ON + -DIREE_HAL_DRIVER_LOCAL_TASK=ON + -DIREE_TARGET_BACKEND_DEFAULTS=OFF + -DIREE_TARGET_BACKEND_LLVM_CPU=ON + -DIREE_INPUT_TOSA=OFF + -DIREE_INPUT_STABLEHLO=OFF + -DIREE_INPUT_TORCH=OFF + -DCMAKE_OBJECT_PATH_MAX=4096 + -DIREE_CMAKE_PLUGIN_PATHS="$repo_root" +) + +PEANO_INSTALL_DIR=${PEANO_INSTALL_DIR:-""} +if [ "$PEANO_INSTALL_DIR" != "" ] && [ -d "$PEANO_INSTALL_DIR" ]; then + CMAKE_ARGS+=(-DPEANO_INSTALL_DIR="$PEANO_INSTALL_DIR") +fi + +if [ -d "$llvm_install_dir" ]; then + CMAKE_ARGS+=( + -DIREE_BUILD_BUNDLED_LLVM=OFF + -DClang_DIR="$llvm_install_dir/lib/cmake/clang" + -DLLD_DIR="$llvm_install_dir/lib/cmake/lld" + -DMLIR_DIR="$llvm_install_dir/lib/cmake/mlir" + -DLLVM_DIR="$llvm_install_dir/lib/cmake/llvm" + ) fi -if [[ "$OSTYPE" == "linux-gnu"* ]]; then - cmake $CMAKE_ARGS \ - -DCMAKE_EXE_LINKER_FLAGS_INIT="-fuse-ld=lld" \ - -DCMAKE_SHARED_LINKER_FLAGS_INIT="-fuse-ld=lld" \ - -DCMAKE_MODULE_LINKER_FLAGS_INIT="-fuse-ld=lld" \ - -DCMAKE_C_COMPILER="${CC}" \ - -DCMAKE_CXX_COMPILER="${CXX}" \ - -DLLVM_TARGET_ARCH=X86 \ - -DLLVM_TARGETS_TO_BUILD=X86 \ - 
-DIREE_EXTERNAL_HAL_DRIVERS=xrt \ - -S $iree_dir -B $build_dir +if [[ "$OSTYPE" == "linux"* ]]; then + CMAKE_ARGS+=( + -DCMAKE_EXE_LINKER_FLAGS_INIT="-fuse-ld=lld" + -DCMAKE_SHARED_LINKER_FLAGS_INIT="-fuse-ld=lld" + -DCMAKE_MODULE_LINKER_FLAGS_INIT="-fuse-ld=lld" + -DCMAKE_C_COMPILER="${CC}" + -DCMAKE_CXX_COMPILER="${CXX}" + -DLLVM_TARGET_ARCH=X86 + -DLLVM_TARGETS_TO_BUILD=X86 + -DIREE_EXTERNAL_HAL_DRIVERS="xrt-lite" + -S + "$iree_dir" + -B + "$build_dir" + ) elif [[ "$OSTYPE" == "darwin"* ]]; then - cmake $CMAKE_ARGS \ - -DLLVM_TARGET_ARCH="X86;ARM" \ - -DLLVM_TARGETS_TO_BUILD="X86;ARM" \ - -S $iree_dir -B $build_dir + CMAKE_ARGS+=( + -DLLVM_TARGET_ARCH="X86;ARM" + -DLLVM_TARGETS_TO_BUILD="X86;ARM" + -S + "$iree_dir" + -B + "$build_dir" + ) fi +cmake "${CMAKE_ARGS[@]}" + echo "Building all" echo "------------" cmake --build "$build_dir" -- -k 0 @@ -123,15 +140,20 @@ cmake --build "$build_dir" --target iree-install-dist echo "CTest" echo "-----" -if [[ "$OSTYPE" == "linux-gnu"* ]]; then - ctest --test-dir "$build_dir" -R amd-aie --output-on-failure -j +if [[ "$OSTYPE" == "linux"* ]]; then + ctest --test-dir "$build_dir" -R amd-aie -E "driver" --output-on-failure -j elif [[ "$OSTYPE" == "darwin"* ]]; then ctest --test-dir "$build_dir" -R amd-aie -E "matmul_pack_peel_air_e2e|matmul_elementwise_pack_peel_air_e2e|conv_fill_spec_pad" --output-on-failure -j --repeat until-pass:5 fi if [ -d "$llvm_install_dir" ]; then - cp "$llvm_install_dir"/bin/lld "$install_dir"/bin - cp "$llvm_install_dir"/bin/FileCheck "$install_dir"/bin - cp "$llvm_install_dir"/bin/not "$install_dir"/bin + cp "$llvm_install_dir/bin/lld" "$install_dir/bin" + cp "$llvm_install_dir/bin/FileCheck" "$install_dir/bin" + cp "$llvm_install_dir/bin/not" "$install_dir/bin" +fi + +cp "$build_dir/tools/testing/e2e/iree-e2e-matmul-test" "$install_dir/bin" +if [[ "$OSTYPE" == "linux"* ]]; then + mkdir -p "$install_dir/device_tests" + cp "$build_dir"/runtime/plugins/AMD-AIE/iree-amd-aie/driver/xrt-lite/cts/*test 
"$install_dir/device_tests" fi -cp "$build_dir"/tools/testing/e2e/iree-e2e-matmul-test "$install_dir"/bin diff --git a/build_tools/ci/amdxdna_driver_utils/amdxdna_accel.py b/build_tools/ci/amdxdna_driver_utils/amdxdna_accel.py new file mode 100644 index 000000000..edcd5f260 --- /dev/null +++ b/build_tools/ci/amdxdna_driver_utils/amdxdna_accel.py @@ -0,0 +1,789 @@ +# generated using clang2py amdxdna_accel.h -o amdxdna_accel.py -k cdefstum +import ctypes + + +class AsDictMixin: + @classmethod + def as_dict(cls, self): + result = {} + if not isinstance(self, AsDictMixin): + # not a structure, assume it's already a python object + return self + if not hasattr(cls, "_fields_"): + return result + for field_tuple in cls._fields_: # noqa + field = field_tuple[0] + if field.startswith("PADDING_"): + continue + value = getattr(self, field) + type_ = type(value) + if hasattr(value, "_length_") and hasattr(value, "_type_"): + # array + if not hasattr(type_, "as_dict"): + value = [v for v in value] + else: + type_ = type_._type_ + value = [type_.as_dict(v) for v in value] + elif hasattr(value, "contents") and hasattr(value, "_type_"): + # pointer + try: + if not hasattr(type_, "as_dict"): + value = value.contents + else: + type_ = type_._type_ + value = type_.as_dict(value.contents) + except ValueError: + # nullptr + value = None + elif isinstance(value, AsDictMixin): + # other structure + value = type_.as_dict(value) + result[field] = value + return result + + +class Structure(ctypes.Structure, AsDictMixin): + def __init__(self, *args, **kwds): + # We don't want to use positional arguments fill PADDING_* fields + + args = dict(zip(self.__class__._field_names_(), args)) + args.update(kwds) + super(Structure, self).__init__(**args) + + @classmethod + def _field_names_(cls): + if hasattr(cls, "_fields_"): + return (f[0] for f in cls._fields_ if not f[0].startswith("PADDING")) + else: + return () + + @classmethod + def get_type(cls, field): + for f in cls._fields_: + if f[0] == 
field: + return f[1] + return None + + @classmethod + def bind(cls, bound_fields): + fields = {} + for name, type_ in cls._fields_: + if hasattr(type_, "restype"): + if name in bound_fields: + if bound_fields[name] is None: + fields[name] = type_() + else: + # use a closure to capture the callback from the loop scope + fields[name] = type_( + (lambda callback: lambda *args: callback(*args))( + bound_fields[name] + ) + ) + del bound_fields[name] + else: + # default callback implementation (does nothing) + try: + default_ = type_(0).restype().value + except TypeError: + default_ = None + fields[name] = type_( + (lambda default_: lambda *args: default_)(default_) + ) + else: + # not a callback function, use default initialization + if name in bound_fields: + fields[name] = bound_fields[name] + del bound_fields[name] + else: + fields[name] = type_() + if len(bound_fields) != 0: + raise ValueError( + "Cannot bind the following unknown callback(s) {}.{}".format( + cls.__name__, bound_fields.keys() + ) + ) + return cls(**fields) + + +class Union(ctypes.Union, AsDictMixin): + pass + + +AMDXDNA_ACCEL_H_ = True # macro +AMDXDNA_DRIVER_MAJOR = 1 # macro +AMDXDNA_DRIVER_MINOR = 0 # macro +AMDXDNA_INVALID_CMD_HANDLE = ~0 # macro +AMDXDNA_INVALID_ADDR = ~0 # macro +AMDXDNA_INVALID_CTX_HANDLE = 0 # macro +AMDXDNA_INVALID_BO_HANDLE = 0 # macro +AMDXDNA_INVALID_FENCE_HANDLE = 0 # macro +SYNC_DIRECT_TO_DEVICE = 0 # macro +SYNC_DIRECT_FROM_DEVICE = 1 # macro + +# values for enumeration 'amdxdna_drm_ioctl_id' +amdxdna_drm_ioctl_id__enumvalues = { + 0: "DRM_AMDXDNA_CREATE_HWCTX", + 1: "DRM_AMDXDNA_DESTROY_HWCTX", + 2: "DRM_AMDXDNA_CONFIG_HWCTX", + 3: "DRM_AMDXDNA_CREATE_BO", + 4: "DRM_AMDXDNA_GET_BO_INFO", + 5: "DRM_AMDXDNA_SYNC_BO", + 6: "DRM_AMDXDNA_EXEC_CMD", + 7: "DRM_AMDXDNA_WAIT_CMD", + 8: "DRM_AMDXDNA_GET_INFO", + 9: "DRM_AMDXDNA_SET_STATE", + 10: "DRM_AMDXDNA_SUBMIT_WAIT", + 11: "DRM_AMDXDNA_SUBMIT_SIGNAL", + 12: "DRM_AMDXDNA_NUM_IOCTLS", +} +DRM_AMDXDNA_CREATE_HWCTX = 0 
+DRM_AMDXDNA_DESTROY_HWCTX = 1 +DRM_AMDXDNA_CONFIG_HWCTX = 2 +DRM_AMDXDNA_CREATE_BO = 3 +DRM_AMDXDNA_GET_BO_INFO = 4 +DRM_AMDXDNA_SYNC_BO = 5 +DRM_AMDXDNA_EXEC_CMD = 6 +DRM_AMDXDNA_WAIT_CMD = 7 +DRM_AMDXDNA_GET_INFO = 8 +DRM_AMDXDNA_SET_STATE = 9 +DRM_AMDXDNA_SUBMIT_WAIT = 10 +DRM_AMDXDNA_SUBMIT_SIGNAL = 11 +DRM_AMDXDNA_NUM_IOCTLS = 12 +amdxdna_drm_ioctl_id = ctypes.c_uint32 # enum + +# values for enumeration 'amdxdna_device_type' +amdxdna_device_type__enumvalues = { + -1: "AMDXDNA_DEV_TYPE_UNKNOWN", + 0: "AMDXDNA_DEV_TYPE_KMQ", + 1: "AMDXDNA_DEV_TYPE_UMQ", +} +AMDXDNA_DEV_TYPE_UNKNOWN = -1 +AMDXDNA_DEV_TYPE_KMQ = 0 +AMDXDNA_DEV_TYPE_UMQ = 1 +amdxdna_device_type = ctypes.c_int32 # enum + + +class struct_amdxdna_qos_info(Structure): + pass + + +struct_amdxdna_qos_info._pack_ = 1 # source:False +struct_amdxdna_qos_info._fields_ = [ + ("gops", ctypes.c_uint32), + ("fps", ctypes.c_uint32), + ("dma_bandwidth", ctypes.c_uint32), + ("latency", ctypes.c_uint32), + ("frame_exec_time", ctypes.c_uint32), + ("priority", ctypes.c_uint32), +] + + +class struct_amdxdna_drm_create_hwctx(Structure): + pass + + +struct_amdxdna_drm_create_hwctx._pack_ = 1 # source:False +struct_amdxdna_drm_create_hwctx._fields_ = [ + ("ext", ctypes.c_uint64), + ("ext_flags", ctypes.c_uint64), + ("qos_p", ctypes.c_uint64), + ("umq_bo", ctypes.c_uint32), + ("log_buf_bo", ctypes.c_uint32), + ("max_opc", ctypes.c_uint32), + ("num_tiles", ctypes.c_uint32), + ("mem_size", ctypes.c_uint32), + ("umq_doorbell", ctypes.c_uint32), + ("handle", ctypes.c_uint32), + ("PADDING_0", ctypes.c_ubyte * 4), +] + + +# DRM_IOCTL_AMDXDNA_CREATE_HWCTX = DRM_IOWR ( DRM_COMMAND_BASE + DRM_AMDXDNA_CREATE_HWCTX , struct_amdxdna_drm_create_hwctx ) # macro +class struct_amdxdna_drm_destroy_hwctx(Structure): + pass + + +struct_amdxdna_drm_destroy_hwctx._pack_ = 1 # source:False +struct_amdxdna_drm_destroy_hwctx._fields_ = [ + ("handle", ctypes.c_uint32), + ("pad", ctypes.c_uint32), +] + + +# DRM_IOCTL_AMDXDNA_DESTROY_HWCTX = 
DRM_IOWR ( DRM_COMMAND_BASE + DRM_AMDXDNA_DESTROY_HWCTX , struct_amdxdna_drm_destroy_hwctx ) # macro +class struct_amdxdna_cu_config(Structure): + pass + + +struct_amdxdna_cu_config._pack_ = 1 # source:False +struct_amdxdna_cu_config._fields_ = [ + ("cu_bo", ctypes.c_uint32), + ("cu_func", ctypes.c_ubyte), + ("pad", ctypes.c_ubyte * 3), +] + + +def struct_amdxdna_hwctx_param_config_cu(num_cus, cu_configs): + assert len(cu_configs) == num_cus + + class struct_amdxdna_hwctx_param_config_cu(Structure): + pass + + struct_amdxdna_hwctx_param_config_cu._pack_ = 1 # source:False + struct_amdxdna_hwctx_param_config_cu._fields_ = [ + ("num_cus", ctypes.c_uint16), + ("pad", ctypes.c_uint16 * 3), + ("cu_configs", struct_amdxdna_cu_config * num_cus), + ] + struc = struct_amdxdna_hwctx_param_config_cu() + struc.num_cus = num_cus + struc.cu_configs = (struct_amdxdna_cu_config * num_cus)(*cu_configs) + return struc + + +# values for enumeration 'amdxdna_drm_config_hwctx_param' +amdxdna_drm_config_hwctx_param__enumvalues = { + 0: "DRM_AMDXDNA_HWCTX_CONFIG_CU", + 1: "DRM_AMDXDNA_HWCTX_ASSIGN_DBG_BUF", + 2: "DRM_AMDXDNA_HWCTX_REMOVE_DBG_BUF", + 3: "DRM_AMDXDNA_HWCTX_CONFIG_NUM", +} +DRM_AMDXDNA_HWCTX_CONFIG_CU = 0 +DRM_AMDXDNA_HWCTX_ASSIGN_DBG_BUF = 1 +DRM_AMDXDNA_HWCTX_REMOVE_DBG_BUF = 2 +DRM_AMDXDNA_HWCTX_CONFIG_NUM = 3 +amdxdna_drm_config_hwctx_param = ctypes.c_uint32 # enum + + +class struct_amdxdna_drm_config_hwctx(Structure): + pass + + +struct_amdxdna_drm_config_hwctx._pack_ = 1 # source:False +struct_amdxdna_drm_config_hwctx._fields_ = [ + ("handle", ctypes.c_uint32), + ("param_type", ctypes.c_uint32), + ("param_val", ctypes.c_uint64), + ("param_val_size", ctypes.c_uint32), + ("pad", ctypes.c_uint32), +] + +# DRM_IOCTL_AMDXDNA_CONFIG_HWCTX = DRM_IOWR ( DRM_COMMAND_BASE + DRM_AMDXDNA_CONFIG_HWCTX , struct_amdxdna_drm_config_hwctx ) # macro + +# values for enumeration 'amdxdna_bo_type' +amdxdna_bo_type__enumvalues = { + 0: "AMDXDNA_BO_INVALID", + 1: "AMDXDNA_BO_SHMEM", + 2: 
"AMDXDNA_BO_DEV_HEAP", + 3: "AMDXDNA_BO_DEV", + 4: "AMDXDNA_BO_CMD", + 5: "AMDXDNA_BO_DMA", +} +AMDXDNA_BO_INVALID = 0 +AMDXDNA_BO_SHMEM = 1 +AMDXDNA_BO_DEV_HEAP = 2 +AMDXDNA_BO_DEV = 3 +AMDXDNA_BO_CMD = 4 +AMDXDNA_BO_DMA = 5 +amdxdna_bo_type = ctypes.c_uint32 # enum + + +class struct_amdxdna_drm_create_bo(Structure): + pass + + +struct_amdxdna_drm_create_bo._pack_ = 1 # source:False +struct_amdxdna_drm_create_bo._fields_ = [ + ("flags", ctypes.c_uint64), + ("type", ctypes.c_uint32), + ("_pad", ctypes.c_uint32), + ("vaddr", ctypes.c_uint64), + ("size", ctypes.c_uint64), + ("handle", ctypes.c_uint32), + ("PADDING_0", ctypes.c_ubyte * 4), +] + + +# DRM_IOCTL_AMDXDNA_CREATE_BO = DRM_IOWR ( DRM_COMMAND_BASE + DRM_AMDXDNA_CREATE_BO , struct_amdxdna_drm_create_bo ) # macro +class struct_amdxdna_drm_get_bo_info(Structure): + pass + + +struct_amdxdna_drm_get_bo_info._pack_ = 1 # source:False +struct_amdxdna_drm_get_bo_info._fields_ = [ + ("ext", ctypes.c_uint64), + ("ext_flags", ctypes.c_uint64), + ("handle", ctypes.c_uint32), + ("_pad", ctypes.c_uint32), + ("map_offset", ctypes.c_uint64), + ("vaddr", ctypes.c_uint64), + ("xdna_addr", ctypes.c_uint64), +] + + +# DRM_IOCTL_AMDXDNA_GET_BO_INFO = DRM_IOWR ( DRM_COMMAND_BASE + DRM_AMDXDNA_GET_BO_INFO , struct_amdxdna_drm_get_bo_info ) # macro +class struct_amdxdna_drm_sync_bo(Structure): + pass + + +struct_amdxdna_drm_sync_bo._pack_ = 1 # source:False +struct_amdxdna_drm_sync_bo._fields_ = [ + ("handle", ctypes.c_uint32), + ("direction", ctypes.c_uint32), + ("offset", ctypes.c_uint64), + ("size", ctypes.c_uint64), +] + +# DRM_IOCTL_AMDXDNA_SYNC_BO = DRM_IOWR ( DRM_COMMAND_BASE + DRM_AMDXDNA_SYNC_BO , struct_amdxdna_drm_sync_bo ) # macro + +# values for enumeration 'amdxdna_cmd_type' +amdxdna_cmd_type__enumvalues = { + 0: "AMDXDNA_CMD_SUBMIT_EXEC_BUF", + 1: "AMDXDNA_CMD_SUBMIT_DEPENDENCY", + 2: "AMDXDNA_CMD_SUBMIT_SIGNAL", +} +AMDXDNA_CMD_SUBMIT_EXEC_BUF = 0 +AMDXDNA_CMD_SUBMIT_DEPENDENCY = 1 +AMDXDNA_CMD_SUBMIT_SIGNAL = 2 
+amdxdna_cmd_type = ctypes.c_uint32 # enum + + +class struct_amdxdna_drm_exec_cmd(Structure): + pass + + +struct_amdxdna_drm_exec_cmd._pack_ = 1 # source:False +struct_amdxdna_drm_exec_cmd._fields_ = [ + ("ext", ctypes.c_uint64), + ("ext_flags", ctypes.c_uint64), + ("hwctx", ctypes.c_uint32), + ("type", ctypes.c_uint32), + ("cmd_handles", ctypes.c_uint64), + ("args", ctypes.c_uint64), + ("cmd_count", ctypes.c_uint32), + ("arg_count", ctypes.c_uint32), + ("seq", ctypes.c_uint64), +] + + +# DRM_IOCTL_AMDXDNA_EXEC_CMD = DRM_IOWR ( DRM_COMMAND_BASE + DRM_AMDXDNA_EXEC_CMD , struct_amdxdna_drm_exec_cmd ) # macro +class struct_amdxdna_drm_wait_cmd(Structure): + pass + + +struct_amdxdna_drm_wait_cmd._pack_ = 1 # source:False +struct_amdxdna_drm_wait_cmd._fields_ = [ + ("hwctx", ctypes.c_uint32), + ("timeout", ctypes.c_uint32), + ("seq", ctypes.c_uint64), +] + + +# DRM_IOCTL_AMDXDNA_WAIT_CMD = DRM_IOWR ( DRM_COMMAND_BASE + DRM_AMDXDNA_WAIT_CMD , struct_amdxdna_drm_wait_cmd ) # macro +class struct_amdxdna_drm_query_aie_status(Structure): + pass + + +struct_amdxdna_drm_query_aie_status._pack_ = 1 # source:False +struct_amdxdna_drm_query_aie_status._fields_ = [ + ("buffer", ctypes.c_uint64), + ("buffer_size", ctypes.c_uint32), + ("cols_filled", ctypes.c_uint32), +] + + +class struct_amdxdna_drm_query_aie_version(Structure): + pass + + +struct_amdxdna_drm_query_aie_version._pack_ = 1 # source:False +struct_amdxdna_drm_query_aie_version._fields_ = [ + ("major", ctypes.c_uint32), + ("minor", ctypes.c_uint32), +] + + +class struct_amdxdna_drm_query_aie_tile_metadata(Structure): + pass + + +struct_amdxdna_drm_query_aie_tile_metadata._pack_ = 1 # source:False +struct_amdxdna_drm_query_aie_tile_metadata._fields_ = [ + ("row_count", ctypes.c_uint16), + ("row_start", ctypes.c_uint16), + ("dma_channel_count", ctypes.c_uint16), + ("lock_count", ctypes.c_uint16), + ("event_reg_count", ctypes.c_uint16), + ("pad", ctypes.c_uint16 * 3), +] + + +class 
struct_amdxdna_drm_query_aie_metadata(Structure): + pass + + +struct_amdxdna_drm_query_aie_metadata._pack_ = 1 # source:False +struct_amdxdna_drm_query_aie_metadata._fields_ = [ + ("col_size", ctypes.c_uint32), + ("cols", ctypes.c_uint16), + ("rows", ctypes.c_uint16), + ("version", struct_amdxdna_drm_query_aie_version), + ("core", struct_amdxdna_drm_query_aie_tile_metadata), + ("mem", struct_amdxdna_drm_query_aie_tile_metadata), + ("shim", struct_amdxdna_drm_query_aie_tile_metadata), +] + + +class struct_amdxdna_drm_query_clock(Structure): + pass + + +struct_amdxdna_drm_query_clock._pack_ = 1 # source:False +struct_amdxdna_drm_query_clock._fields_ = [ + ("name", ctypes.c_ubyte * 16), + ("freq_mhz", ctypes.c_uint32), + ("pad", ctypes.c_uint32), +] + + +class struct_amdxdna_drm_query_clock_metadata(Structure): + _pack_ = 1 # source:False + _fields_ = [ + ("mp_npu_clock", struct_amdxdna_drm_query_clock), + ("h_clock", struct_amdxdna_drm_query_clock), + ] + + +# values for enumeration 'amdxdna_sensor_type' +amdxdna_sensor_type__enumvalues = { + 0: "AMDXDNA_SENSOR_TYPE_POWER", +} +AMDXDNA_SENSOR_TYPE_POWER = 0 +amdxdna_sensor_type = ctypes.c_uint32 # enum + + +class struct_amdxdna_drm_query_sensor(Structure): + pass + + +struct_amdxdna_drm_query_sensor._pack_ = 1 # source:False +struct_amdxdna_drm_query_sensor._fields_ = [ + ("label", ctypes.c_ubyte * 64), + ("input", ctypes.c_uint32), + ("max", ctypes.c_uint32), + ("average", ctypes.c_uint32), + ("highest", ctypes.c_uint32), + ("status", ctypes.c_ubyte * 64), + ("units", ctypes.c_ubyte * 16), + ("unitm", ctypes.c_byte), + ("type", ctypes.c_ubyte), + ("pad", ctypes.c_ubyte * 6), +] + + +class struct_amdxdna_drm_query_hwctx(Structure): + pass + + +struct_amdxdna_drm_query_hwctx._pack_ = 1 # source:False +struct_amdxdna_drm_query_hwctx._fields_ = [ + ("context_id", ctypes.c_uint32), + ("start_col", ctypes.c_uint32), + ("num_col", ctypes.c_uint32), + ("pad", ctypes.c_uint32), + ("pid", ctypes.c_int64), + 
("command_submissions", ctypes.c_uint64), + ("command_completions", ctypes.c_uint64), + ("migrations", ctypes.c_uint64), + ("preemptions", ctypes.c_uint64), + ("errors", ctypes.c_uint64), +] + + +class struct_amdxdna_drm_aie_mem(Structure): + pass + + +struct_amdxdna_drm_aie_mem._pack_ = 1 # source:False +struct_amdxdna_drm_aie_mem._fields_ = [ + ("col", ctypes.c_uint32), + ("row", ctypes.c_uint32), + ("addr", ctypes.c_uint32), + ("size", ctypes.c_uint32), + ("buf_p", ctypes.c_uint64), +] + + +class struct_amdxdna_drm_aie_reg(Structure): + pass + + +struct_amdxdna_drm_aie_reg._pack_ = 1 # source:False +struct_amdxdna_drm_aie_reg._fields_ = [ + ("col", ctypes.c_uint32), + ("row", ctypes.c_uint32), + ("addr", ctypes.c_uint32), + ("val", ctypes.c_uint32), +] + +# values for enumeration 'amdxdna_power_mode_type' +amdxdna_power_mode_type__enumvalues = { + 0: "POWER_MODE_DEFAULT", + 1: "POWER_MODE_LOW", + 2: "POWER_MODE_MEDIUM", + 3: "POWER_MODE_HIGH", +} +POWER_MODE_DEFAULT = 0 +POWER_MODE_LOW = 1 +POWER_MODE_MEDIUM = 2 +POWER_MODE_HIGH = 3 +amdxdna_power_mode_type = ctypes.c_uint32 # enum + + +class struct_amdxdna_drm_get_power_mode(Structure): + pass + + +struct_amdxdna_drm_get_power_mode._pack_ = 1 # source:False +struct_amdxdna_drm_get_power_mode._fields_ = [ + ("power_mode", ctypes.c_ubyte), + ("pad", ctypes.c_ubyte * 7), +] + + +class struct_amdxdna_drm_query_firmware_version(Structure): + pass + + +struct_amdxdna_drm_query_firmware_version._pack_ = 1 # source:False +struct_amdxdna_drm_query_firmware_version._fields_ = [ + ("major", ctypes.c_uint32), + ("minor", ctypes.c_uint32), + ("patch", ctypes.c_uint32), + ("build", ctypes.c_uint32), +] + +# values for enumeration 'amdxdna_drm_get_param' +amdxdna_drm_get_param__enumvalues = { + 0: "DRM_AMDXDNA_QUERY_AIE_STATUS", + 1: "DRM_AMDXDNA_QUERY_AIE_METADATA", + 2: "DRM_AMDXDNA_QUERY_AIE_VERSION", + 3: "DRM_AMDXDNA_QUERY_CLOCK_METADATA", + 4: "DRM_AMDXDNA_QUERY_SENSORS", + 5: "DRM_AMDXDNA_QUERY_HW_CONTEXTS", + 6: 
"DRM_AMDXDNA_READ_AIE_MEM", + 7: "DRM_AMDXDNA_READ_AIE_REG", + 8: "DRM_AMDXDNA_QUERY_FIRMWARE_VERSION", + 9: "DRM_AMDXDNA_GET_POWER_MODE", + 10: "DRM_AMDXDNA_NUM_GET_PARAM", +} +DRM_AMDXDNA_QUERY_AIE_STATUS = 0 +DRM_AMDXDNA_QUERY_AIE_METADATA = 1 +DRM_AMDXDNA_QUERY_AIE_VERSION = 2 +DRM_AMDXDNA_QUERY_CLOCK_METADATA = 3 +DRM_AMDXDNA_QUERY_SENSORS = 4 +DRM_AMDXDNA_QUERY_HW_CONTEXTS = 5 +DRM_AMDXDNA_READ_AIE_MEM = 6 +DRM_AMDXDNA_READ_AIE_REG = 7 +DRM_AMDXDNA_QUERY_FIRMWARE_VERSION = 8 +DRM_AMDXDNA_GET_POWER_MODE = 9 +DRM_AMDXDNA_NUM_GET_PARAM = 10 +amdxdna_drm_get_param = ctypes.c_uint32 # enum + + +class struct_amdxdna_drm_get_info(Structure): + pass + + +struct_amdxdna_drm_get_info._pack_ = 1 # source:False +struct_amdxdna_drm_get_info._fields_ = [ + ("param", ctypes.c_uint32), + ("buffer_size", ctypes.c_uint32), + ("buffer", ctypes.c_uint64), +] + + +# DRM_IOCTL_AMDXDNA_GET_INFO = DRM_IOWR ( DRM_COMMAND_BASE + DRM_AMDXDNA_GET_INFO , struct_amdxdna_drm_get_info ) # macro +class struct_amdxdna_drm_set_power_mode(Structure): + pass + + +struct_amdxdna_drm_set_power_mode._pack_ = 1 # source:False +struct_amdxdna_drm_set_power_mode._fields_ = [ + ("power_mode", ctypes.c_ubyte), + ("pad", ctypes.c_ubyte * 7), +] + +# values for enumeration 'amdxdna_drm_set_param' +amdxdna_drm_set_param__enumvalues = { + 0: "DRM_AMDXDNA_SET_POWER_MODE", + 1: "DRM_AMDXDNA_WRITE_AIE_MEM", + 2: "DRM_AMDXDNA_WRITE_AIE_REG", + 3: "DRM_AMDXDNA_NUM_SET_PARAM", +} +DRM_AMDXDNA_SET_POWER_MODE = 0 +DRM_AMDXDNA_WRITE_AIE_MEM = 1 +DRM_AMDXDNA_WRITE_AIE_REG = 2 +DRM_AMDXDNA_NUM_SET_PARAM = 3 +amdxdna_drm_set_param = ctypes.c_uint32 # enum + + +class struct_amdxdna_drm_set_state(Structure): + pass + + +struct_amdxdna_drm_set_state._pack_ = 1 # source:False +struct_amdxdna_drm_set_state._fields_ = [ + ("param", ctypes.c_uint32), + ("buffer_size", ctypes.c_uint32), + ("buffer", ctypes.c_uint64), +] + + +# DRM_IOCTL_AMDXDNA_SET_STATE = DRM_IOWR ( DRM_COMMAND_BASE + DRM_AMDXDNA_SET_STATE , 
struct_amdxdna_drm_set_state ) # macro +class struct_amdxdna_drm_syncobjs(Structure): + pass + + +struct_amdxdna_drm_syncobjs._pack_ = 1 # source:False +struct_amdxdna_drm_syncobjs._fields_ = [ + ("handles", ctypes.c_uint64), + ("points", ctypes.c_uint64), + ("count", ctypes.c_uint32), + ("pad", ctypes.c_uint32), +] + + +def struct_amdxdna_cmd_chain(command_count): + class struct_amdxdna_cmd_chain(Structure): + pass + + struct_amdxdna_cmd_chain._pack_ = 1 # source:False + struct_amdxdna_cmd_chain._fields_ = [ + ("command_count", ctypes.c_uint32), + ("submit_index", ctypes.c_uint32), + ("error_index", ctypes.c_uint32), + ("reserved", ctypes.c_uint32 * 3), + ("data", ctypes.c_uint64 * command_count), + ] + return struct_amdxdna_cmd_chain + + +def struct_amdxdna_cmd(count): + class struct_amdxdna_cmd(Structure): + pass + + struct_amdxdna_cmd._pack_ = 1 # source:False + struct_amdxdna_cmd._fields_ = [ + ("state", ctypes.c_uint32, 4), + ("unused", ctypes.c_uint32, 6), + ("extra_cu_masks", ctypes.c_uint32, 2), + ("count", ctypes.c_uint32, 11), + ("opcode", ctypes.c_uint32, 5), + ("reserved", ctypes.c_uint32, 4), + ("data", ctypes.c_uint32 * count), + ] + return struct_amdxdna_cmd + + +# DRM_IOCTL_AMDXDNA_SUBMIT_WAIT = DRM_IOWR ( DRM_COMMAND_BASE + DRM_AMDXDNA_SUBMIT_WAIT , struct_amdxdna_drm_syncobjs ) # macro +# DRM_IOCTL_AMDXDNA_SUBMIT_SIGNAL = DRM_IOWR ( DRM_COMMAND_BASE + DRM_AMDXDNA_SUBMIT_SIGNAL , struct_amdxdna_drm_syncobjs ) # macro +__all__ = [ + "AMDXDNA_ACCEL_H_", + "AMDXDNA_BO_CMD", + "AMDXDNA_BO_DEV", + "AMDXDNA_BO_DEV_HEAP", + "AMDXDNA_BO_DMA", + "AMDXDNA_BO_INVALID", + "AMDXDNA_BO_SHMEM", + "AMDXDNA_CMD_SUBMIT_DEPENDENCY", + "AMDXDNA_CMD_SUBMIT_EXEC_BUF", + "AMDXDNA_CMD_SUBMIT_SIGNAL", + "AMDXDNA_DEV_TYPE_KMQ", + "AMDXDNA_DEV_TYPE_UMQ", + "AMDXDNA_DEV_TYPE_UNKNOWN", + "AMDXDNA_DRIVER_MAJOR", + "AMDXDNA_DRIVER_MINOR", + "AMDXDNA_INVALID_ADDR", + "AMDXDNA_INVALID_BO_HANDLE", + "AMDXDNA_INVALID_CMD_HANDLE", + "AMDXDNA_INVALID_CTX_HANDLE", + 
"AMDXDNA_INVALID_FENCE_HANDLE", + "AMDXDNA_SENSOR_TYPE_POWER", + "DRM_AMDXDNA_CONFIG_HWCTX", + "DRM_AMDXDNA_CREATE_BO", + "DRM_AMDXDNA_CREATE_HWCTX", + "DRM_AMDXDNA_DESTROY_HWCTX", + "DRM_AMDXDNA_EXEC_CMD", + "DRM_AMDXDNA_GET_BO_INFO", + "DRM_AMDXDNA_GET_INFO", + "DRM_AMDXDNA_GET_POWER_MODE", + "DRM_AMDXDNA_HWCTX_ASSIGN_DBG_BUF", + "DRM_AMDXDNA_HWCTX_CONFIG_CU", + "DRM_AMDXDNA_HWCTX_CONFIG_NUM", + "DRM_AMDXDNA_HWCTX_REMOVE_DBG_BUF", + "DRM_AMDXDNA_NUM_GET_PARAM", + "DRM_AMDXDNA_NUM_IOCTLS", + "DRM_AMDXDNA_NUM_SET_PARAM", + "DRM_AMDXDNA_QUERY_AIE_METADATA", + "DRM_AMDXDNA_QUERY_AIE_STATUS", + "DRM_AMDXDNA_QUERY_AIE_VERSION", + "DRM_AMDXDNA_QUERY_CLOCK_METADATA", + "DRM_AMDXDNA_QUERY_FIRMWARE_VERSION", + "DRM_AMDXDNA_QUERY_HW_CONTEXTS", + "DRM_AMDXDNA_QUERY_SENSORS", + "DRM_AMDXDNA_READ_AIE_MEM", + "DRM_AMDXDNA_READ_AIE_REG", + "DRM_AMDXDNA_SET_POWER_MODE", + "DRM_AMDXDNA_SET_STATE", + "DRM_AMDXDNA_SUBMIT_SIGNAL", + "DRM_AMDXDNA_SUBMIT_WAIT", + "DRM_AMDXDNA_SYNC_BO", + "DRM_AMDXDNA_WAIT_CMD", + "DRM_AMDXDNA_WRITE_AIE_MEM", + "DRM_AMDXDNA_WRITE_AIE_REG", + "POWER_MODE_DEFAULT", + "POWER_MODE_HIGH", + "POWER_MODE_LOW", + "POWER_MODE_MEDIUM", + "SYNC_DIRECT_FROM_DEVICE", + "SYNC_DIRECT_TO_DEVICE", + "amdxdna_bo_type", + "amdxdna_cmd_type", + "amdxdna_device_type", + "amdxdna_drm_config_hwctx_param", + "amdxdna_drm_get_param", + "amdxdna_drm_ioctl_id", + "amdxdna_drm_set_param", + "amdxdna_power_mode_type", + "amdxdna_sensor_type", + "struct_amdxdna_cu_config", + "struct_amdxdna_drm_aie_mem", + "struct_amdxdna_drm_aie_reg", + "struct_amdxdna_drm_config_hwctx", + "struct_amdxdna_drm_create_bo", + "struct_amdxdna_drm_create_hwctx", + "struct_amdxdna_drm_destroy_hwctx", + "struct_amdxdna_drm_exec_cmd", + "struct_amdxdna_drm_get_bo_info", + "struct_amdxdna_drm_get_info", + "struct_amdxdna_drm_get_power_mode", + "struct_amdxdna_drm_query_aie_metadata", + "struct_amdxdna_drm_query_aie_status", + "struct_amdxdna_drm_query_aie_tile_metadata", + 
"struct_amdxdna_drm_query_aie_version", + "struct_amdxdna_drm_query_clock", + "struct_amdxdna_drm_query_clock_metadata", + "struct_amdxdna_drm_query_firmware_version", + "struct_amdxdna_drm_query_hwctx", + "struct_amdxdna_drm_query_sensor", + "struct_amdxdna_drm_set_power_mode", + "struct_amdxdna_drm_set_state", + "struct_amdxdna_drm_sync_bo", + "struct_amdxdna_drm_syncobjs", + "struct_amdxdna_drm_wait_cmd", + "struct_amdxdna_hwctx_param_config_cu", + "struct_amdxdna_qos_info", + "struct_amdxdna_cmd_chain", + "struct_amdxdna_cmd", +] diff --git a/build_tools/ci/amdxdna_driver_utils/amdxdna_ioctl.py b/build_tools/ci/amdxdna_driver_utils/amdxdna_ioctl.py new file mode 100644 index 000000000..38aafabc0 --- /dev/null +++ b/build_tools/ci/amdxdna_driver_utils/amdxdna_ioctl.py @@ -0,0 +1,217 @@ +import argparse +import array +import ctypes +import ctypes.util +import fcntl +import pathlib +import re +import struct +from argparse import Namespace +from pprint import pformat + +import amdxdna_accel +from amdxdna_accel import ( + struct_amdxdna_drm_query_aie_version, + struct_amdxdna_drm_get_info, + struct_amdxdna_drm_query_aie_metadata, + DRM_AMDXDNA_QUERY_AIE_VERSION, + DRM_AMDXDNA_QUERY_AIE_METADATA, +) + +_IOC_NRBITS = 8 +_IOC_TYPEBITS = 8 +_IOC_SIZEBITS = 14 +_IOC_DIRBITS = 2 + +_IOC_NRMASK = (1 << _IOC_NRBITS) - 1 +_IOC_TYPEMASK = (1 << _IOC_TYPEBITS) - 1 +_IOC_SIZEMASK = (1 << _IOC_SIZEBITS) - 1 +_IOC_DIRMASK = (1 << _IOC_DIRBITS) - 1 + +_IOC_NRSHIFT = 0 +_IOC_TYPESHIFT = _IOC_NRSHIFT + _IOC_NRBITS +_IOC_SIZESHIFT = _IOC_TYPESHIFT + _IOC_TYPEBITS +_IOC_DIRSHIFT = _IOC_SIZESHIFT + _IOC_SIZEBITS + +IOC_NONE = 0 +IOC_WRITE = 1 +IOC_READ = 2 + + +def _IOC(dir, type, nr, size): + assert dir <= _IOC_DIRMASK, dir + assert type <= _IOC_TYPEMASK, type + assert nr <= _IOC_NRMASK, nr + assert size <= _IOC_SIZEMASK, size + return ( + (dir << _IOC_DIRSHIFT) + | (type << _IOC_TYPESHIFT) + | (nr << _IOC_NRSHIFT) + | (size << _IOC_SIZESHIFT) + ) + + +def _IOC_TYPECHECK(t): + if 
isinstance(t, (memoryview, bytearray)): + size = len(t) + elif isinstance(t, struct.Struct): + size = t.size + elif isinstance(t, array.array): + size = t.itemsize * len(t) + else: + size = ctypes.sizeof(t) + assert size <= _IOC_SIZEMASK, size + return size + + +def _IOWR(type, nr, size): + return _IOC(IOC_READ | IOC_WRITE, type, nr, _IOC_TYPECHECK(size)) + + +def get_struct(argp, stype): + return ctypes.cast(ctypes.c_void_p(argp), ctypes.POINTER(stype)).contents + + +def get_void_ptr_to_struct(s): + ptr = ctypes.pointer(s) + return ctypes.cast(ptr, ctypes.c_void_p) + + +def format_struct(s): + return pformat(s.as_dict(s)) + + +# +DRM_IOCTL_BASE = ord("d") +DRM_COMMAND_BASE = 0x40 + + +def DRM_IOWR(nr, type): + return _IOWR(DRM_IOCTL_BASE, nr, type) + + +def ioctls_from_header(): + hdr = ( + (pathlib.Path(__file__).parent / "amdxdna_accel.py") + .read_text() + .replace("\\\n", "") + ) + pattern = r"DRM_IOCTL_AMDXDNA_([A-Z0-9_]+) = DRM_IOWR \( DRM_COMMAND_BASE \+ DRM_AMDXDNA_([A-Z0-9_]+) , struct_amdxdna_drm_([a-z0-9_]+) \)" + matches = re.findall(pattern, hdr, re.MULTILINE) + ioctls = Namespace() + for name, offset, sname in matches: + assert name == offset + offset = f"DRM_AMDXDNA_{name}" + assert hasattr(amdxdna_accel, offset) + offset = getattr(amdxdna_accel, offset) + struc = getattr(amdxdna_accel, "struct_amdxdna_drm_" + sname) + setattr( + ioctls, + f"DRM_IOCTL_AMDXDNA_{name}", + DRM_IOWR(DRM_COMMAND_BASE + offset, struc), + ) + + return ioctls + + +ioctls = ioctls_from_header() + + +def get_aie_version(drv_fd): + version = struct_amdxdna_drm_query_aie_version() + info_params = struct_amdxdna_drm_get_info( + DRM_AMDXDNA_QUERY_AIE_VERSION, + ctypes.sizeof(struct_amdxdna_drm_query_aie_version), + get_void_ptr_to_struct(version).value, + ) + + fcntl.ioctl(drv_fd, ioctls.DRM_IOCTL_AMDXDNA_GET_INFO, info_params) + + return version.major, version.minor + + +def get_aie_metadata(drv_fd): + metadata = struct_amdxdna_drm_query_aie_metadata() + info_params =
struct_amdxdna_drm_get_info( + DRM_AMDXDNA_QUERY_AIE_METADATA, + ctypes.sizeof(struct_amdxdna_drm_query_aie_metadata), + get_void_ptr_to_struct(metadata).value, + ) + + fcntl.ioctl(drv_fd, ioctls.DRM_IOCTL_AMDXDNA_GET_INFO, info_params) + + return format_struct(metadata) + + +def get_core_n_rows(drv_fd): + metadata = struct_amdxdna_drm_query_aie_metadata() + info_params = struct_amdxdna_drm_get_info( + DRM_AMDXDNA_QUERY_AIE_METADATA, + ctypes.sizeof(struct_amdxdna_drm_query_aie_metadata), + get_void_ptr_to_struct(metadata).value, + ) + + fcntl.ioctl(drv_fd, ioctls.DRM_IOCTL_AMDXDNA_GET_INFO, info_params) + return metadata.core.row_count + + +def find_npu_device(): + drvpath = pathlib.Path("/sys/bus/pci/drivers/amdxdna") + for file in drvpath.iterdir(): + if file.is_symlink(): + actual_path = (drvpath / file.readlink()).resolve() + if str(actual_path).startswith("/sys/devices/pci"): + return actual_path + raise RuntimeError("npu device not found") + + +def read_vbnv(npu_device_path): + with open(npu_device_path / "vbnv") as f: + vbnv = f.read() + assert vbnv.startswith("RyzenAI-") + return vbnv.split("-")[-1].strip() + + +def get_core_n_cols(drv_fd, npu_device): + metadata = struct_amdxdna_drm_query_aie_metadata() + info_params = struct_amdxdna_drm_get_info( + DRM_AMDXDNA_QUERY_AIE_METADATA, + ctypes.sizeof(struct_amdxdna_drm_query_aie_metadata), + get_void_ptr_to_struct(metadata).value, + ) + + fcntl.ioctl(drv_fd, ioctls.DRM_IOCTL_AMDXDNA_GET_INFO, info_params) + if npu_device == "npu1": + # phoenix + return metadata.cols - 1 + elif npu_device == "npu4": + # strix + return metadata.cols + + raise NotImplementedError(f"unrecognized {npu_device=}") + + +if __name__ == "__main__": + parser = argparse.ArgumentParser() + parser.add_argument("--npu-device", action="store_true") + parser.add_argument("--num-rows", action="store_true") + parser.add_argument("--num-cols", action="store_true") + parser.add_argument("--aie-metadata", action="store_true") +
parser.add_argument("--aie-version", action="store_true") + args = parser.parse_args() + + drv_path = "/dev/accel/accel0" + drv_fd = open(drv_path, "r+") + npu_device_path = find_npu_device() + npu_device = read_vbnv(npu_device_path) + + if args.npu_device: + print(npu_device) + if args.num_rows: + print(get_core_n_rows(drv_fd)) + if args.num_cols: + print(get_core_n_cols(drv_fd, npu_device)) + if args.aie_metadata: + print(get_aie_metadata(drv_fd)) + if args.aie_version: + print(get_aie_version(drv_fd)) diff --git a/build_tools/ci/cpu_comparison/run.py b/build_tools/ci/cpu_comparison/run.py index fdacf7cbc..96f060d4c 100755 --- a/build_tools/ci/cpu_comparison/run.py +++ b/build_tools/ci/cpu_comparison/run.py @@ -2,18 +2,19 @@ # Copyright 2024 The IREE Authors -import sys import argparse import os import platform import re import subprocess +import sys import time from pathlib import Path from textwrap import dedent import numpy as np +from convolution_template.convolution_generator import ConvolutionMlirGenerator from input_generator import ( generate_inputs, verify_determinism, @@ -22,7 +23,6 @@ np_from_binfile, ) from matmul_template.matmul_generator import generate_matmul_test -from convolution_template.convolution_generator import ConvolutionMlirGenerator from output_comparer import compare @@ -146,6 +146,7 @@ def generate_aie_vmfb( f"--iree-amd-aie-install-dir={config.iree_install_dir}", f"--iree-amd-aie-vitis-install-dir={config.vitis_dir}", f"--iree-hal-dump-executable-files-to={config.output_dir}", + f"--iree-amdaie-device-hal={config.device_hal}", "--iree-scheduling-optimize-bindings=false", "--iree-hal-memoization=false", "--iree-hal-indirect-command-buffers=false", @@ -191,11 +192,16 @@ def generate_aie_output(config, aie_vmfb, input_args, function_name, name, outpu config.iree_run_exe, f"--module={aie_vmfb}", *input_args, - "--device=xrt", + f"--device={config.device_hal}", f"--output=@{aie_bin}", ] if function_name: run_args += 
[f"--function={function_name}"] + if config.xrt_lite_n_core_rows is not None: + run_args += [f"--xrt_lite_n_core_rows={config.xrt_lite_n_core_rows}"] + if config.xrt_lite_n_core_cols is not None: + run_args += [f"--xrt_lite_n_core_cols={config.xrt_lite_n_core_cols}"] + if config.reset_npu_between_runs: shell_out(config.reset_npu_script, verbose=config.verbose) @@ -267,6 +273,9 @@ def __init__( reset_npu_between_runs, do_not_run_aie, additional_aie_compilation_flags, + device_hal, + xrt_lite_n_core_rows, + xrt_lite_n_core_cols, ): self.output_dir = output_dir self.iree_install_dir = iree_install_dir @@ -283,6 +292,9 @@ def __init__( self.reset_npu_between_runs = reset_npu_between_runs self.do_not_run_aie = do_not_run_aie self.additional_aie_compilation_flags = additional_aie_compilation_flags + self.device_hal = device_hal + self.xrt_lite_n_core_rows = xrt_lite_n_core_rows + self.xrt_lite_n_core_cols = xrt_lite_n_core_cols # Try get the xrt and (linux) kernel versions. self.linux_kernel = "undetermined" @@ -846,6 +858,9 @@ def all_tests( do_not_run_aie, test_set, additional_aie_compilation_flags, + device_hal, + xrt_lite_n_core_rows, + xrt_lite_n_core_cols, ): """ There are a few ways to add tests to this script: @@ -887,6 +902,9 @@ def all_tests( reset_npu_between_runs, do_not_run_aie, additional_aie_compilation_flags, + device_hal, + xrt_lite_n_core_rows, + xrt_lite_n_core_cols, ) if verbose: print(config) @@ -941,6 +959,8 @@ def all_tests( parser.add_argument("peano_install_dir", type=abs_path) parser.add_argument("--xrt-dir", type=abs_path) parser.add_argument("--vitis-dir", type=abs_path) + parser.add_argument("--xrt_lite_n_core_rows", type=int) + parser.add_argument("--xrt_lite_n_core_cols", type=int) # TODO(newling) make bool options boolean, not integer (tried but had issues) parser.add_argument( @@ -1023,6 +1043,15 @@ def all_tests( default="", ) + parser.add_argument( + "--device-hal", + default="xrt-lite", + const="xrt-lite", + nargs="?", + 
choices=["xrt", "xrt-lite"], + help="device HAL to use (default: %(default)s)", + ) + args = parser.parse_args() test_set_list = args.test_set.split(",") @@ -1038,4 +1067,7 @@ def all_tests( args.do_not_run_aie, test_set_list, args.additional_aie_compilation_flags, + args.device_hal, + args.xrt_lite_n_core_rows, + args.xrt_lite_n_core_cols, ) diff --git a/build_tools/ci/run_all_runtime_tests.sh b/build_tools/ci/run_all_runtime_tests.sh new file mode 100755 index 000000000..1d439b93f --- /dev/null +++ b/build_tools/ci/run_all_runtime_tests.sh @@ -0,0 +1,52 @@ +#!/bin/bash + +set -eu + +this_dir="$(cd $(dirname $0) && pwd)" +src_dir="$(cd $this_dir/../.. && pwd)" + +if [ -z "${IREE_INSTALL_DIR:-}" ]; then + echo "IREE_INSTALL_DIR needs to be set" + exit 1 +fi + +if [ -z "${PEANO_INSTALL_DIR:-}" ]; then + echo "PEANO_INSTALL_DIR needs to be set" + exit 1 +fi + +if [ -z "${VITIS_DIR:-}" ]; then + echo "VITIS_DIR needs to be set" + exit 1 +fi + +if [ -z "${XILINXD_LICENSE_FILE:-}" ]; then + echo "XILINXD_LICENSE_FILE needs to be set" + exit 1 +fi + +export PYTHONPATH=$IREE_INSTALL_DIR/python_packages/iree_compiler:$IREE_INSTALL_DIR/python_packages/iree_runtime +export XRT_LITE_N_CORE_ROWS=$(python $this_dir/amdxdna_driver_utils/amdxdna_ioctl.py --num-rows) +export XRT_LITE_N_CORE_COLS=$(python $this_dir/amdxdna_driver_utils/amdxdna_ioctl.py --num-cols) +export PATH=$IREE_INSTALL_DIR/bin:$PATH + +$this_dir/cpu_comparison/run.py \ + $this_dir/test_aie_vs_cpu \ + $IREE_INSTALL_DIR \ + $PEANO_INSTALL_DIR \ + --vitis-dir $VITIS_DIR \ + --xrt_lite_n_core_rows=$XRT_LITE_N_CORE_ROWS \ + --xrt_lite_n_core_cols=$XRT_LITE_N_CORE_COLS \ + -v + +$this_dir/run_matmul_test.sh \ + $this_dir/test_matmuls \ + $IREE_INSTALL_DIR \ + $PEANO_INSTALL_DIR \ + $VITIS_DIR + +pytest -rv --capture=tee-sys $src_dir/tests \ + --peano-install-dir=$PEANO_INSTALL_DIR \ + --iree-install-dir=$IREE_INSTALL_DIR \ + --xrt_lite_n_core_rows=$XRT_LITE_N_CORE_ROWS \ + --xrt_lite_n_core_cols=$XRT_LITE_N_CORE_COLS diff
--git a/build_tools/ci/run_matmul_test.sh b/build_tools/ci/run_matmul_test.sh index 50649e8fd..1ed121c44 100755 --- a/build_tools/ci/run_matmul_test.sh +++ b/build_tools/ci/run_matmul_test.sh @@ -28,16 +28,16 @@ if [ "$#" -lt 2 ] || [ "$#" -gt 5 ]; then # The expected parameters are # 1) (required) # 2) (required) - # 4) (optional) + # 3) (optional) + # 4) (optional) # 5) (optional) - # 6) (optional) echo -e "Illegal number of parameters: $#, expected 2-5 parameters." \ "\n The parameters are as follows:" \ "\n 1) (required)" \ "\n 2) (required)" \ "\n 3) (optional)" \ - "\n 4) (optional)" \ - "\n 5) (optional)" \ + "\n 4) (optional)" \ + "\n 5) (optional)" \ "\n Example, dependent on environment variables:" \ "\n ./run_matmul_test.sh " \ "results_dir_tmp \$IREE_INSTALL_DIR " \ @@ -102,23 +102,24 @@ if [ ! -d "${PEANO}" ]; then exit 1 fi -# Parameter 4) +# Parameter 4) if [ -z "${4-}" ]; then - XRT_DIR=/opt/xilinx/xrt + VITIS=/opt/Xilinx/Vitis/2024.2 else - XRT_DIR=`realpath "$4"` -fi -if [ -d "$XRT_DIR" ]; then - source $XRT_DIR/setup.sh + VITIS=`realpath "$4"` fi -# Parameter 5) +# Parameter 5) if [ -z "${5-}" ]; then - VITIS=/opt/Xilinx/Vitis/2024.2 + XRT_DIR=/opt/xilinx/xrt else - VITIS=`realpath "$5"` + XRT_DIR=`realpath "$5"` +fi +if [ -f "$XRT_DIR/setup.sh" ]; then + source $XRT_DIR/setup.sh fi + THIS_DIR="$(cd $(dirname $0) && pwd)" ROOT_DIR="$(cd $THIS_DIR/../.. 
&& pwd)" @@ -147,6 +148,8 @@ cd ${OUTPUT_DIR} export MATMUL_TESTS_RUN=0 export MATMUL_TESTS_FAILS=0 +DEVICE_HAL="${DEVICE_HAL:-xrt-lite}" + ############################################################################### # Define helper function # ############################################################################### @@ -176,8 +179,6 @@ function run_matmul_test() { local target_device="npu1_4col" - local device="xrt" - local peano_install_path="${PEANO}" local amd_aie_install_path="${IREE_INSTALL_DIR}" @@ -274,10 +275,6 @@ function run_matmul_test() { target_backend="$2" shift 2 ;; - --device) - device="$2" - shift 2 - ;; --peano_install_path) peano_install_path="$2" shift 2 @@ -405,6 +402,7 @@ function run_matmul_test() { --iree-amd-aie-enable-chess=${use_chess} \ --iree-amdaie-enable-packet-flow=${enable_packet_flow} \ --iree-hal-dump-executable-files-to=$PWD \ + --iree-amdaie-device-hal=${DEVICE_HAL} \ --iree-hal-memoization=false \ --iree-hal-indirect-command-buffers=false \ --mlir-elide-resource-strings-if-larger=10 \ @@ -443,7 +441,7 @@ function run_matmul_test() { fi fi - # Renable exit on failure: + # Re-enable exit on failure: echo "**** Generating calls .vmfb file for ${name} ****" ${IREE_COMPILE_EXE} "${calls_ir}" \ --iree-hal-target-backends=${target_backend} \ @@ -463,9 +461,16 @@ function run_matmul_test() { COMMAND="${TEST_RUNNER} \ --module=${matmul_vmfb} \ --module=${calls_vmfb} \ - --device=${device} \ + --device=${DEVICE_HAL} \ --max_elements_to_check=${max_elements_to_check}" + if [ -n "${XRT_LITE_N_CORE_ROWS:-}" ]; then + COMMAND="${COMMAND} --xrt_lite_n_core_rows=$XRT_LITE_N_CORE_ROWS" + fi + if [ -n "${XRT_LITE_N_CORE_COLS:-}" ]; then + COMMAND="${COMMAND} --xrt_lite_n_core_cols=$XRT_LITE_N_CORE_COLS" + fi + total_num_runs=$(( num_repeat_runs * num_corruption_repeat_runs)) echo "**** Running '${name}' matmul test ${total_num_runs} times (command ${COMMAND}) ****" for i in $(seq 1 $num_repeat_runs); do @@ -530,7 +535,6 @@ 
run_matmul_test \ --acc_type "f32" \ --target_backend "amd-aie" \ --target_device "npu1_4col" \ - --device "xrt" \ --peano_install_path "${PEANO}" \ --amd_aie_install_path "${IREE_INSTALL_DIR}" \ --vitis_path "${VITIS}" \ @@ -789,7 +793,19 @@ if [ -d "$VITIS" ]; then fi -echo "\n\n" +# note this will not actually show any devices because --xrt_lite_n_core_rows --xrt_lite_n_core_cols are not passed +# which i have omitted to make the conditional slightly more succinct +if [[ $($IREE_INSTALL_DIR/bin/iree-benchmark-module --dump_devices | grep xrt-lite) ]]; then + $IREE_INSTALL_DIR/bin/iree-benchmark-module \ + --module=$OUTPUT_DIR/mm_test1_bf16_f32_m64_n64_k64.vmfb \ + --function=matmul_64x64_64xbf16_ \ + --input=64x64xbf16 \ + --input=64x64xbf16 \ + --device=xrt-lite \ + --benchmark_repetitions=10 \ + --xrt_lite_n_core_rows=$XRT_LITE_N_CORE_ROWS \ + --xrt_lite_n_core_cols=$XRT_LITE_N_CORE_COLS +fi echo "$MATMUL_TESTS_RUN matmul tests run!" if [ $MATMUL_TESTS_FAILS -ne 0 ]; then diff --git a/build_tools/download_peano.ps1 b/build_tools/download_peano.ps1 index 89bd6808f..74a1240d8 100644 --- a/build_tools/download_peano.ps1 +++ b/build_tools/download_peano.ps1 @@ -9,4 +9,7 @@ $ErrorActionPreference = 'Stop' $this_dir = Split-Path -Path $MyInvocation.MyCommand.Path -Parent $RELEASE = (Get-Content -Path "$this_dir/peano_commit.txt") pip download llvm_aie==$RELEASE -f https://github.com/Xilinx/llvm-aie/releases/expanded_assets/nightly -Expand-Archive (Get-ChildItem -Filter llvm*.whl).FullName -DestinationPath $PWD.Path +$peano = (Get-ChildItem -Filter llvm*.whl) +$new_name = ($peano.Basename + ".zip") +Rename-Item -Path $peano.Name -NewName $new_name +Expand-Archive $new_name -DestinationPath $PWD.Path -Force diff --git a/cmake/iree_aie_utils.cmake b/cmake/iree_aie_utils.cmake index 2e0911dfa..aa4c57027 100644 --- a/cmake/iree_aie_utils.cmake +++ b/cmake/iree_aie_utils.cmake @@ -29,6 +29,8 @@ function(replace_string_in_file _file _match_string _replace_string) if(NOT 
(EXISTS ${_file})) message(FATAL_ERROR "file ${_file} does not exist") endif() + set(_lock_file "${_file}.lock") + file(LOCK "${_lock_file}" GUARD FUNCTION) file(READ "${_file}" _file_contents) if(_file_contents STREQUAL "") message(FATAL_ERROR "empty file contents for ${_file}") @@ -38,5 +40,6 @@ function(replace_string_in_file _file _match_string _replace_string) message(FATAL_ERROR "empty replacement contents for ${_file}") endif() file(WRITE "${_file}" "${_file_contents}") + file(LOCK "${_lock_file}" RELEASE) endfunction() diff --git a/compiler/plugins/target/AMD-AIE/iree-amd-aie/CMakeLists.txt b/compiler/plugins/target/AMD-AIE/iree-amd-aie/CMakeLists.txt index b6077f9c5..bd5865430 100644 --- a/compiler/plugins/target/AMD-AIE/iree-amd-aie/CMakeLists.txt +++ b/compiler/plugins/target/AMD-AIE/iree-amd-aie/CMakeLists.txt @@ -26,6 +26,7 @@ iree_cc_library( iree::base::core_headers iree::base::internal::flatcc::building iree-amd-aie::schemas::xrt_executable_def_c_fbs + iree-amd-aie::schemas::pdi_executable_def_c_fbs PUBLIC ) diff --git a/compiler/plugins/target/AMD-AIE/iree-amd-aie/PluginRegistration.cpp b/compiler/plugins/target/AMD-AIE/iree-amd-aie/PluginRegistration.cpp index 39c044d59..9fe88cd64 100644 --- a/compiler/plugins/target/AMD-AIE/iree-amd-aie/PluginRegistration.cpp +++ b/compiler/plugins/target/AMD-AIE/iree-amd-aie/PluginRegistration.cpp @@ -47,8 +47,15 @@ struct AMDAIESession void populateHALTargetDevices(IREE::HAL::TargetDeviceList &targets) override { // #hal.device.target<"xrt", ... - // #hal.executable.target<"amd-aie", ... - targets.add("xrt", [=]() { return AMDAIE::createTarget(options); }); + targets.add("xrt", [=] { + options.deviceHal = AMDAIE::AMDAIEOptions::DeviceHAL::XRT; + return AMDAIE::createTarget(options); + }); + // #hal.device.target<"xrt-lite", ... 
+ targets.add("xrt-lite", [=] { + options.deviceHal = AMDAIE::AMDAIEOptions::DeviceHAL::XRT_LITE; + return AMDAIE::createTarget(options); + }); } void populateHALTargetBackends( diff --git a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Target/AIETarget.cpp b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Target/AIETarget.cpp index 2c643d39b..d1c43f1ce 100644 --- a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Target/AIETarget.cpp +++ b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Target/AIETarget.cpp @@ -7,6 +7,7 @@ #include "iree-amd-aie/Target/AIETarget.h" #include +#include #include "XCLBinGen.h" #include "aie/AIEDialect.h" @@ -18,6 +19,8 @@ #include "air/Dialect/AIRRt/AIRRtDialect.h" #include "iree-amd-aie/IR/AMDAIEDialect.h" #include "iree-amd-aie/Transforms/Passes.h" +#include "iree-amd-aie/schemas/pdi_executable_def_builder.h" +#include "iree-amd-aie/schemas/xrt_executable_def_builder.h" #include "iree/compiler/Codegen/Dialect/Codegen/IR/IREECodegenDialect.h" #include "iree/compiler/Dialect/HAL/Target/TargetRegistry.h" #include "iree/compiler/Dialect/LinalgExt/IR/LinalgExtDialect.h" @@ -43,7 +46,6 @@ #include "mlir/Pass/PassManager.h" #include "mlir/Support/FileUtilities.h" #include "mlir/Target/LLVMIR/Dialect/All.h" -#include "runtime/plugins/AMD-AIE/iree-amd-aie/schemas/xrt_executable_def_builder.h" #define DEBUG_TYPE "aie-target" @@ -56,18 +58,20 @@ static xilinx::AIE::DeviceOp getDeviceOpWithName(ModuleOp moduleOp, moduleOp.walk([&](xilinx::AIE::DeviceOp d) { ++nDeviceOpsVisited; // This attribute should've been set in the dma-to-npu pass. 
- auto maybeName = d->getAttrOfType("runtime_sequence_name"); + StringAttr maybeName = + d->getAttrOfType("runtime_sequence_name"); if (!maybeName) return WalkResult::advance(); - auto name = maybeName.getValue(); + StringRef name = maybeName.getValue(); if (name != targetName) return WalkResult::advance(); deviceOp = d; return WalkResult::interrupt(); }); - if (!deviceOp) + if (!deviceOp) { moduleOp.emitError() << "visited " << nDeviceOpsVisited << " aie.device ops, and failed to find one with name " << targetName; + } return deviceOp; } @@ -84,7 +88,7 @@ static void sanitizeForBootgen(std::string &symbol) { class AIETargetDevice final : public IREE::HAL::TargetDevice { public: - AIETargetDevice(const AMDAIEOptions &options) : options(options) {} + AIETargetDevice(AMDAIEOptions options) : options(std::move(options)) {} IREE::HAL::DeviceTargetAttr getDefaultDeviceTarget( MLIRContext *context, @@ -101,8 +105,17 @@ class AIETargetDevice final : public IREE::HAL::TargetDevice { targetRegistry.getTargetBackend("amd-aie")->getDefaultExecutableTargets( context, "amd-aie", configAttr, executableTargetAttrs); - return IREE::HAL::DeviceTargetAttr::get(context, b.getStringAttr("xrt"), - configAttr, executableTargetAttrs); + switch (options.deviceHal) { + case AMDAIEOptions::DeviceHAL::XRT: + return IREE::HAL::DeviceTargetAttr::get( + context, b.getStringAttr("xrt"), configAttr, executableTargetAttrs); + case AMDAIEOptions::DeviceHAL::XRT_LITE: + return IREE::HAL::DeviceTargetAttr::get( + context, b.getStringAttr("xrt-lite"), configAttr, + executableTargetAttrs); + default: + llvm_unreachable("unsupported device HAL\n"); + } } private: @@ -111,9 +124,19 @@ class AIETargetDevice final : public IREE::HAL::TargetDevice { class AIETargetBackend final : public IREE::HAL::TargetBackend { public: - explicit AIETargetBackend(const AMDAIEOptions &options) : options(options) {} - - std::string getLegacyDefaultDeviceID() const override { return "xrt"; } + explicit 
AIETargetBackend(AMDAIEOptions options) + : options(std::move(options)) {} + + std::string getLegacyDefaultDeviceID() const override { + switch (options.deviceHal) { + case AMDAIEOptions::DeviceHAL::XRT: + return "xrt"; + case AMDAIEOptions::DeviceHAL::XRT_LITE: + return "xrt-lite"; + default:; + llvm::report_fatal_error("unsupported default device\n"); + }; + } void getDefaultExecutableTargets( MLIRContext *context, StringRef deviceID, DictionaryAttr deviceConfigAttr, @@ -139,9 +162,19 @@ class AIETargetBackend final : public IREE::HAL::TargetBackend { addConfig("ukernels", StringAttr::get(context, options.enableAMDAIEUkernels)); auto configAttr = b.getDictionaryAttr(configItems); - return IREE::HAL::ExecutableTargetAttr::get( - context, b.getStringAttr("amd-aie"), - b.getStringAttr("amdaie-xclbin-fb"), configAttr); + + switch (options.deviceHal) { + case AMDAIEOptions::DeviceHAL::XRT: + return IREE::HAL::ExecutableTargetAttr::get( + context, b.getStringAttr("amd-aie"), + b.getStringAttr("amdaie-xclbin-fb"), configAttr); + case AMDAIEOptions::DeviceHAL::XRT_LITE: + return IREE::HAL::ExecutableTargetAttr::get( + context, b.getStringAttr("amd-aie"), + b.getStringAttr("amdaie-pdi-fb"), configAttr); + default:; + llvm::report_fatal_error("unsupported default HAL\n"); + }; } void getDependentDialects(DialectRegistry ®istry) const override { @@ -191,48 +224,92 @@ class AIETargetBackend final : public IREE::HAL::TargetBackend { AMDAIEOptions options; }; +void serializeXCLBinToFb(FlatbufferBuilder &builder, + flatbuffers_string_vec_ref_t entryPointsRef, + SmallVector &asmInstrIndices, + SmallVector &xclbinIndices, + SmallVector xclbinRefs, + SmallVector asmInstrRefs) { + iree_amd_aie_hal_xrt_ExecutableDef_entry_points_add(builder, entryPointsRef); + flatbuffers_int32_vec_ref_t asmInstrIndicesRef = + builder.createInt32Vec(asmInstrIndices); + iree_amd_aie_hal_xrt_ExecutableDef_asm_instr_indices_add(builder, + asmInstrIndicesRef); + flatbuffers_int32_vec_ref_t 
xclbinIndicesRef = + builder.createInt32Vec(xclbinIndices); + iree_amd_aie_hal_xrt_ExecutableDef_xclbin_indices_add(builder, + xclbinIndicesRef); + flatbuffers_vec_ref_t xclbinsRef = + builder.createOffsetVecDestructive(xclbinRefs); + iree_amd_aie_hal_xrt_ExecutableDef_xclbins_add(builder, xclbinsRef); + flatbuffers_vec_ref_t asmInstrsRef = + builder.createOffsetVecDestructive(asmInstrRefs); + iree_amd_aie_hal_xrt_ExecutableDef_asm_instrs_add(builder, asmInstrsRef); + iree_amd_aie_hal_xrt_ExecutableDef_end_as_root(builder); +} + +void serializePDIToFb(FlatbufferBuilder &builder, + flatbuffers_string_vec_ref_t entryPointsRef, + SmallVector &asmInstrIndices, + SmallVector &pdiIndices, + SmallVector pdiRefs, + SmallVector asmInstrRefs) { + iree_amd_aie_hal_xrt_lite_ExecutableDef_entry_points_add(builder, + entryPointsRef); + flatbuffers_int32_vec_ref_t asmInstrIndicesRef = + builder.createInt32Vec(asmInstrIndices); + iree_amd_aie_hal_xrt_lite_ExecutableDef_asm_instr_indices_add( + builder, asmInstrIndicesRef); + flatbuffers_int32_vec_ref_t pdiIndicesRef = + builder.createInt32Vec(pdiIndices); + iree_amd_aie_hal_xrt_lite_ExecutableDef_pdi_indices_add(builder, + pdiIndicesRef); + flatbuffers_vec_ref_t pdisRef = builder.createOffsetVecDestructive(pdiRefs); + iree_amd_aie_hal_xrt_lite_ExecutableDef_pdis_add(builder, pdisRef); + flatbuffers_vec_ref_t asmInstrsRef = + builder.createOffsetVecDestructive(asmInstrRefs); + iree_amd_aie_hal_xrt_lite_ExecutableDef_asm_instrs_add(builder, asmInstrsRef); + iree_amd_aie_hal_xrt_lite_ExecutableDef_end_as_root(builder); +} + LogicalResult AIETargetBackend::serializeExecutable( const SerializationOptions &serOptions, IREE::HAL::ExecutableVariantOp variantOp, OpBuilder &executableBuilder) { ModuleOp moduleOp = variantOp.getInnerModule(); - auto basename = + std::string basename = llvm::join_items("_", serOptions.dumpBaseName, variantOp.getName()); sanitizeForBootgen(basename); - auto maybeWorkDir = [&]() -> FailureOr> { - // If a path 
for intermediates has been specified, assume it is common for - // all executables compiling in parallel, and so create an - // executable-specific subdir to keep this executable's intermediates - // separate. - if (!serOptions.dumpIntermediatesPath.empty()) { - SmallString<128> workDir{serOptions.dumpIntermediatesPath}; - llvm::sys::path::append(workDir, basename); - auto ecode = llvm::sys::fs::create_directories(workDir); - if (ecode) { - return moduleOp.emitError() - << "failed to create working directory " << workDir - << ". Error message : " << ecode.message(); - } - return workDir; - } + FailureOr> maybeWorkDir; + // If a path for intermediates has been specified, assume it is common for + // all executables compiling in parallel, and so create an + // executable-specific subdir to keep this executable's intermediates + // separate. + if (!serOptions.dumpIntermediatesPath.empty()) { + SmallString<128> workDir{serOptions.dumpIntermediatesPath}; + llvm::sys::path::append(workDir, basename); + if (auto ecode = llvm::sys::fs::create_directories(workDir)) { + return moduleOp.emitError() + << "failed to create working directory " << workDir + << ". Error message : " << ecode.message(); + } + maybeWorkDir = workDir; + } else { // No path for intermediates: make a temporary directory for this // executable that is certain to be distinct from the dir of any other // executable. 
SmallString<128> workDirFromScratch; - auto err = llvm::sys::fs::createUniqueDirectory( - /* prefix = */ variantOp.getName(), workDirFromScratch); - - if (err) + if (auto err = llvm::sys::fs::createUniqueDirectory( + /*prefix=*/variantOp.getName(), workDirFromScratch)) { return moduleOp.emitOpError() - << "failed to create working directory for xclbin generation: " + << "failed to create working directory for artifact generation: " << err.message(); + } + maybeWorkDir = workDirFromScratch; + } - return workDirFromScratch; - }(); - - if (failed(maybeWorkDir)) return failure(); - auto workDir = maybeWorkDir.value(); + SmallString<128> workDir = maybeWorkDir.value(); // collect names of kernels as they need to be in kernels.json // generated by `aie2xclbin` SmallVector entryPointNames; @@ -241,7 +318,7 @@ LogicalResult AIETargetBackend::serializeExecutable( // Map to keep track of which ordinal number belongs to which entry point, // typically the order is sequential but that is not gauranteed std::map entryPointOrdinals; - for (auto exportOp : variantOp.getExportOps()) { + for (IREE::HAL::ExecutableExportOp exportOp : variantOp.getExportOps()) { uint64_t ordinal = 0; if (std::optional optionalOrdinal = exportOp.getOrdinal()) { ordinal = optionalOrdinal->getZExtValue(); @@ -269,56 +346,79 @@ LogicalResult AIETargetBackend::serializeExecutable( // error out if we think the name will most likely be too long // for the artifact generation to succeed. We set this cut-off at 50 // characters. 
- if (entryPointName.size() > 50) + if (entryPointName.size() > 50) { return exportOp.emitError() << "entry point name: " << entryPointName << "is too long!"; + } } + uint64_t ordinalCount = entryPointOrdinals.size(); if (entryPointNames.empty()) { return moduleOp.emitOpError("should contain some entry points"); } - std::unique_ptr xclbinIn; - + std::unique_ptr artifactInput; FlatbufferBuilder builder; - iree_amd_aie_hal_xrt_ExecutableDef_start_as_root(builder); - SmallVector xclbinRefs; - SmallVector asmInstrRefs; + switch (options.deviceHal) { + case AMDAIEOptions::DeviceHAL::XRT: + iree_amd_aie_hal_xrt_ExecutableDef_start_as_root(builder); + break; + case AMDAIEOptions::DeviceHAL::XRT_LITE: + iree_amd_aie_hal_xrt_lite_ExecutableDef_start_as_root(builder); + break; + default: + llvm::errs() << "Unsupported device HAL\n"; + return failure(); + } + + SmallVector refs; + SmallVector asmInstrRefs; // Per entry-point data. // Note that the following vectors should all be of the same size and // element at index #i is for entry point with ordinal #i! SmallVector entryPointNamesFb(ordinalCount); - SmallVector xclbinIndices(ordinalCount); + SmallVector indices(ordinalCount); SmallVector asmInstrIndices(ordinalCount); for (size_t i = 0; i < entryPointNames.size(); i++) { uint64_t ordinal = entryPointOrdinals.at(entryPointNames[i]); - entryPointNamesFb[ordinal] = entryPointNames[i]; std::string errorMessage; - - // we add the entry point to the working directory for xclbin artifacts if - // there are multiple entry points so that we dont overwrite the xclbinutil + // we add the entry point to the working directory for artifacts if + // there are multiple entry points so that we don't overwrite the // generated artifacts e.g kernels.json, for different entry points which // will have the same exact names. 
SmallString<128> entryPointWorkDir(workDir); - if (ordinalCount > 1) + if (ordinalCount > 1) { llvm::sys::path::append(entryPointWorkDir, entryPointNamesFb[ordinal]); - auto err = llvm::sys::fs::create_directories(entryPointWorkDir); - if (err) + } + + if (auto err = llvm::sys::fs::create_directories(entryPointWorkDir)) { return moduleOp.emitOpError() - << "failed to create working directory for xclbin generation: " + << "failed to create working directory for artifact generation: " << err.message(); + } llvm::outs().flush(); - SmallString<128> xclbinPath(entryPointWorkDir); - llvm::sys::path::append(xclbinPath, entryPointNamesFb[ordinal] + ".xclbin"); + + SmallString<128> artifactPath(entryPointWorkDir); + switch (options.deviceHal) { + case AMDAIEOptions::DeviceHAL::XRT: + llvm::sys::path::append(artifactPath, entryPointNamesFb[ordinal] + ".xclbin"); + break; + case AMDAIEOptions::DeviceHAL::XRT_LITE: + llvm::sys::path::append(artifactPath, entryPointNamesFb[ordinal] + ".pdi"); + break; + default: + llvm::errs() << "Unsupported device HAL\n"; + return failure(); + } SmallString<128> npuInstPath(entryPointWorkDir); llvm::sys::path::append(npuInstPath, entryPointNamesFb[ordinal] + ".npu.txt"); - // Convert ordinal to hexadecimal string for xclbin kernel id. + // Convert ordinal to hexadecimal string for kernel id. 
std::stringstream ordinalHex; ordinalHex << "0x" << std::hex << ordinal; @@ -342,6 +442,7 @@ LogicalResult AIETargetBackend::serializeExecutable( // TODO(max): this should be an enum // TODO(max): this needs to be pulled from PCIE std::string npuVersion; + std::string targetArch; switch (options.AMDAIETargetDevice) { case AMDAIEDevice::npu1: case AMDAIEDevice::npu1_1col: @@ -349,18 +450,21 @@ LogicalResult AIETargetBackend::serializeExecutable( case AMDAIEDevice::npu1_3col: case AMDAIEDevice::npu1_4col: npuVersion = "npu1"; + targetArch = "AIE2"; break; case AMDAIEDevice::npu4: npuVersion = "npu4"; + targetArch = "AIE2P"; break; default: - llvm::report_fatal_error("unhandled NPU partitioning.\n"); + llvm::errs() << "unhandled NPU partitioning.\n"; + return failure(); } if (failed(aie2xclbin( /*ctx=*/variantOp->getContext(), deviceOps[i], /*outputNPU=*/npuInstPath.str().str(), - /*outputXCLBin=*/xclbinPath.str().str(), + /*artifactPath=*/artifactPath.str().str(), /*printIRBeforeAll=*/options.aie2xclbinPrintIrBeforeAll, /*printIRAfterAll=*/options.aie2xclbinPrintIrAfterAll, /*printIRModuleScope=*/options.aie2xclbinPrintIrModuleScope, @@ -371,17 +475,18 @@ LogicalResult AIETargetBackend::serializeExecutable( /*vitisDir=*/options.vitisInstallDir.empty() ? 
std::nullopt : std::optional{options.vitisInstallDir}, - // TODO(max): not right for strix - /*targetArch=*/"AIE2", + /*targetArch=*/targetArch, /*npuVersion=*/npuVersion, /*peanoDir=*/options.peanoInstallDir, + /*deviceHal=*/options.deviceHal, /*xclBinKernelID=*/ordinalHex.str(), /*xclBinKernelName=*/entryPointNamesFb[ordinal], /*xclBinInstanceName=*/"IREE", /*amdAIEInstallDir=*/options.amdAieInstallDir, /*InputXCLBin=*/std::nullopt, - /*ukernel=*/options.enableAMDAIEUkernels))) + /*ukernel=*/options.enableAMDAIEUkernels))) { return failure(); + } std::ifstream instrFile(static_cast(npuInstPath)); std::string line; @@ -395,40 +500,61 @@ LogicalResult AIETargetBackend::serializeExecutable( } npuInstrs.push_back(a); } - auto npuInstrsVec = builder.createInt32Vec(npuInstrs); + flatbuffers_int32_vec_ref_t npuInstrsVec = + builder.createInt32Vec(npuInstrs); asmInstrIndices[ordinal] = asmInstrRefs.size(); - asmInstrRefs.push_back( - iree_amd_aie_hal_xrt_AsmInstDef_create(builder, npuInstrsVec)); - xclbinIn = openInputFile(xclbinPath, &errorMessage); - if (!xclbinIn) { - moduleOp.emitOpError() << "Failed to open xclbin file: " << errorMessage; + if (options.deviceHal == AMDAIEOptions::DeviceHAL::XRT_LITE) { + asmInstrRefs.push_back( + iree_amd_aie_hal_xrt_lite_AsmInstDef_create(builder, npuInstrsVec)); + } else if (options.deviceHal == AMDAIEOptions::DeviceHAL::XRT) { + asmInstrRefs.push_back( + iree_amd_aie_hal_xrt_AsmInstDef_create(builder, npuInstrsVec)); + } else { + llvm::report_fatal_error("unsupported backend"); } - auto xclbinStringRef = builder.createString(xclbinIn->getBuffer()); - xclbinIndices[ordinal] = xclbinRefs.size(); - xclbinRefs.push_back( - iree_amd_aie_hal_xrt_XclbinDef_create(builder, xclbinStringRef)); - } - // Serialize the executable to flatbuffer format - auto entryPointsRef = builder.createStringVec(entryPointNamesFb); - iree_amd_aie_hal_xrt_ExecutableDef_entry_points_add(builder, entryPointsRef); - - flatbuffers_int32_vec_ref_t 
asmInstrIndicesRef = - builder.createInt32Vec(asmInstrIndices); - iree_amd_aie_hal_xrt_ExecutableDef_asm_instr_indices_add(builder, - asmInstrIndicesRef); - flatbuffers_int32_vec_ref_t xclbinIndicesRef = - builder.createInt32Vec(xclbinIndices); - iree_amd_aie_hal_xrt_ExecutableDef_xclbin_indices_add(builder, - xclbinIndicesRef); - auto xclbinsRef = builder.createOffsetVecDestructive(xclbinRefs); - iree_amd_aie_hal_xrt_ExecutableDef_xclbins_add(builder, xclbinsRef); + artifactInput = openInputFile(artifactPath, &errorMessage); + if (!artifactInput) { + moduleOp.emitOpError() + << "Failed to open artifact file: " << errorMessage; + } + flatbuffers_string_ref_t artifactStringRef = + builder.createString(artifactInput->getBuffer()); + indices[ordinal] = refs.size(); + + switch (options.deviceHal) { + case AMDAIEOptions::DeviceHAL::XRT: + refs.push_back( + iree_amd_aie_hal_xrt_XclbinDef_create(builder, artifactStringRef)); + break; + case AMDAIEOptions::DeviceHAL::XRT_LITE: + refs.push_back(iree_amd_aie_hal_xrt_lite_PdiDef_create( + builder, artifactStringRef)); + break; + default: + llvm::errs() << "Unsupported device HAL\n"; + return failure(); + } + } - auto asmInstrsRef = builder.createOffsetVecDestructive(asmInstrRefs); - iree_amd_aie_hal_xrt_ExecutableDef_asm_instrs_add(builder, asmInstrsRef); + // Serialize the executable to flatbuffer format + flatbuffers_string_vec_ref_t entryPointsRef = + builder.createStringVec(entryPointNamesFb); + switch (options.deviceHal) { + case AMDAIEOptions::DeviceHAL::XRT: + serializeXCLBinToFb(builder, entryPointsRef, asmInstrIndices, indices, + refs, asmInstrRefs); + break; + case AMDAIEOptions::DeviceHAL::XRT_LITE: + serializePDIToFb(builder, entryPointsRef, asmInstrIndices, indices, refs, + asmInstrRefs); + break; + default: + llvm::errs() << "Unsupported device HAL\n"; + return failure(); + } - iree_amd_aie_hal_xrt_ExecutableDef_end_as_root(builder); auto binaryOp = executableBuilder.create( variantOp.getLoc(), 
variantOp.getSymName(), variantOp.getTarget().getFormat(), diff --git a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Target/AIETarget.h b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Target/AIETarget.h index 1f6518909..352c86a03 100644 --- a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Target/AIETarget.h +++ b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Target/AIETarget.h @@ -54,6 +54,9 @@ struct AMDAIEOptions { std::string enableAMDAIEUkernels{"none"}; bool enablePacketFlow{false}; + enum class DeviceHAL { XRT, XRT_LITE }; + DeviceHAL deviceHal{DeviceHAL::XRT_LITE}; + void bindOptions(OptionsBinder &binder) { static llvm::cl::OptionCategory category("AMD AIE Options"); binder.opt( @@ -187,6 +190,13 @@ struct AMDAIEOptions { binder.opt("iree-amdaie-enable-packet-flow", enablePacketFlow, llvm::cl::cat(category), llvm::cl::desc("Enable packet routing data movement.")); + + binder.opt( + "iree-amdaie-device-hal", deviceHal, llvm::cl::cat(category), + llvm::cl::desc("Sets the target device HAL."), + llvm::cl::values(clEnumValN(DeviceHAL::XRT, "xrt", "xrt device HAL"), + clEnumValN(DeviceHAL::XRT_LITE, "xrt-lite", + "xrt-lite device HAL"))); } }; diff --git a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Target/CMakeLists.txt b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Target/CMakeLists.txt index 3c7cd4d64..63a38950a 100644 --- a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Target/CMakeLists.txt +++ b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Target/CMakeLists.txt @@ -34,6 +34,7 @@ iree_cc_library( DEPS ::AIETargets iree-amd-aie::schemas::xrt_executable_def_c_fbs + iree-amd-aie::schemas::pdi_executable_def_c_fbs iree::base::internal::flatcc::building iree::base::internal::flatcc::parsing iree::compiler::Dialect::HAL::Target diff --git a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Target/XCLBinGen.cpp b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Target/XCLBinGen.cpp index 13024aa11..df9191a39 100644 --- 
a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Target/XCLBinGen.cpp +++ b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Target/XCLBinGen.cpp @@ -263,8 +263,10 @@ std::vector makeChessEnv(Path &vitisDir, Path path(::getenv("PATH")); Path lnx64o = aieToolsPath / "lib" / "lnx64.o"; Path dotLib = aieToolsPath / "lnx64" / "tools" / "dot" / "lib"; - Path ldLibraryPath(::getenv("LD_LIBRARY_PATH")); - + Path ldLibraryPath; + if (char *ldLibraryPath_ = ::getenv("LD_LIBRARY_PATH")) { + ldLibraryPath = ldLibraryPath_; + } std::string pathEnv = "PATH=" + chessccPath.string() + std::string{sys::EnvPathSeparator} + path.string(); std::string ldLibEnv = "LD_LIBRARY_PATH=" + lnx64o.string() + @@ -726,6 +728,60 @@ static json::Object makeKernelJSON(const std::string &name, {"instances", json::Array{json::Object{{"name", instance}}}}}; } +static LogicalResult generatePDI(const std::string &Output, + const Path &tempDir) { + std::string errorMessage; + // Create design.bif. + Path designBifFile = tempDir / "design.bif"; + { + auto designBifOut = openOutputFile(designBifFile.string(), &errorMessage); + if (!designBifOut) { + llvm::errs() << "failed to open design.bif because: " << errorMessage; + return failure(); + } + + designBifOut->os() << "all:\n" + << "{\n" + << " id_code = 0x14ca8093\n" + << " extended_id_code = 0x01\n" + << " image\n" + << " {\n" + << " name=aie_image, id=0x1c000000\n" + << " { type=cdo\n" + << " file=" << tempDir.string() + << "/aie_cdo_elfs.bin\n" + << " file=" << tempDir.string() + << "/aie_cdo_init.bin\n" + << " file=" << tempDir.string() + << "/aie_cdo_enable.bin\n" + << " }\n" + << " }\n" + << "}"; + designBifOut->keep(); + } + + // Execute the bootgen command. 
+ { + // first element is empty string because iree_aie_bootgen_main + // is the main of bootgen.exe (and argv[0] is typically the name of the exe) + std::vector flags = { + "", "-arch", "versal", "-image", designBifFile.string(), + "-o", Output, "-w"}; + std::vector cstrings; + cstrings.reserve(flags.size()); + for (const auto &inputFlag : flags) { + cstrings.push_back(const_cast(inputFlag.c_str())); + } + if (iree_aie_bootgen_main(cstrings.size(), + const_cast(&cstrings[0]))) { + llvm::errs() << "failed to execute bootgen"; + return failure(); + } + } + + return success(); +} + static LogicalResult generateXCLBin( const std::string &Output, const Path &tempDir, const std::string &xclBinKernelID, const std::string &xclBinKernelName, @@ -830,58 +886,11 @@ static LogicalResult generateXCLBin( return failure(); } } - // Create design.bif. - Path designBifFile = tempDir / "design.bif"; - { - auto designBifOut = openOutputFile(designBifFile.string(), &errorMessage); - if (!designBifOut) { - llvm::errs() << "failed to open design.bif because: " << errorMessage; - return failure(); - } - designBifOut->os() << "all:\n" - << "{\n" - << " id_code = 0x14ca8093\n" - << " extended_id_code = 0x01\n" - << " image\n" - << " {\n" - << " name=aie_image, id=0x1c000000\n" - << " { type=cdo\n" - << " file=" << tempDir.string() - << "/aie_cdo_elfs.bin\n" - << " file=" << tempDir.string() - << "/aie_cdo_init.bin\n" - << " file=" << tempDir.string() - << "/aie_cdo_enable.bin\n" - << " }\n" - << " }\n" - << "}"; - designBifOut->keep(); + if (failed(generatePDI((tempDir / "design.pdi").string(), tempDir))) { + return failure(); } - // Execute the bootgen command. 
- { - // first element is empty string because iree_aie_bootgen_main - // is the main of bootgen.exe (and argv[0] is typically the name of the exe) - std::vector flags = {"", - "-arch", - "versal", - "-image", - designBifFile.string(), - "-o", - (tempDir / "design.pdi").string(), - "-w"}; - std::vector cstrings; - cstrings.reserve(flags.size()); - for (const auto &inputFlag : flags) { - cstrings.push_back(const_cast(inputFlag.c_str())); - } - if (iree_aie_bootgen_main(cstrings.size(), - const_cast(&cstrings[0]))) { - llvm::errs() << "failed to execute bootgen"; - return failure(); - } - } std::vector flags; // Execute the xclbinutil command. std::string memArg = "MEM_TOPOLOGY:JSON:" + memTopologyJsonFile.string(); @@ -1109,11 +1118,12 @@ LogicalResult emitNpuInstructions(AIE::DeviceOp deviceOp, LogicalResult aie2xclbin( MLIRContext *ctx, AIE::DeviceOp deviceOp, const std::string &outputNPU, - const std::string &outputXCLBin, bool printIRBeforeAll, + const std::string &artifactPath, bool printIRBeforeAll, bool printIRAfterAll, bool printIRModuleScope, bool timing, const std::string &tempDir, bool useChess, bool verbose, const std::optional &vitisDir, const std::string &targetArch, const std::string &npuVersion, const std::string &peanoDir, + const mlir::iree_compiler::AMDAIE::AMDAIEOptions::DeviceHAL deviceHal, const std::string &xclBinKernelID, const std::string &xclBinKernelName, const std::string &xclBinInstanceName, const std::string &amdAIEInstallDir, const std::optional &InputXCLBin, @@ -1148,7 +1158,26 @@ LogicalResult aie2xclbin( return failure(); } - if (failed(generateXCLBin(outputXCLBin, tempDirPath, xclBinKernelID, + Path pdiPath = tempDirPath / "design.pdi"; + if (failed(generatePDI(pdiPath.string(), tempDirPath))) { + llvm::errs() << "Failed to generate PDI\n"; + return failure(); + } + + if (deviceHal == AMDAIEOptions::DeviceHAL::XRT_LITE) { + std::error_code ec; + if (!std::filesystem::copy_file( + pdiPath, artifactPath, + 
std::filesystem::copy_options::overwrite_existing, ec)) { + llvm::errs() << "Failed to copy file because: " << ec.message() << "\n"; + return failure(); + } + return success(); + } + + assert(deviceHal == AMDAIEOptions::DeviceHAL::XRT && + "generating XCLBin for non-XRT HAL"); + if (failed(generateXCLBin(artifactPath, tempDirPath, xclBinKernelID, xclBinKernelName, xclBinInstanceName, amdAIEInstallDir, verbose, InputXCLBin))) { llvm::errs() << "Failed to generate XCLBin\n"; diff --git a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Target/XCLBinGen.h b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Target/XCLBinGen.h index cd7bd2f2a..6083e7293 100644 --- a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Target/XCLBinGen.h +++ b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Target/XCLBinGen.h @@ -7,18 +7,19 @@ #include +#include "AIETarget.h" #include "aie/AIEDialect.h" -#include "mlir/IR/MLIRContext.h" #include "mlir/Support/LogicalResult.h" namespace mlir::iree_compiler::AMDAIE { mlir::LogicalResult aie2xclbin( mlir::MLIRContext *ctx, xilinx::AIE::DeviceOp, const std::string &outputNPU, - const std::string &outputXCLBin, bool printIRBeforeAll, + const std::string &artifactPath, bool printIRBeforeAll, bool printIRAfterAll, bool printIRModuleScope, bool timing, const std::string &tempDir, bool useChess, bool verbose, const std::optional &vitisDir, const std::string &targetArch, const std::string &npuVersion, const std::string &peanoDir, + const mlir::iree_compiler::AMDAIE::AMDAIEOptions::DeviceHAL deviceHal, const std::string &xclBinKernelID, const std::string &xclBinKernelName, const std::string &xclBinInstanceName, const std::string &amdAIEInstallDir, const std::optional &InputXCLBin, diff --git a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Target/test/amd_aie_target_backend.mlir b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Target/test/amd_aie_target_backend.mlir index 3b1e4a2a8..872ad76cd 100644 --- 
a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Target/test/amd_aie_target_backend.mlir +++ b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Target/test/amd_aie_target_backend.mlir @@ -1,8 +1,8 @@ // RUN: iree-compile --iree-hal-target-backends=amd-aie --compile-to=executable-targets %s | FileCheck %s --check-prefix=DEFAULT // RUN: iree-compile --iree-hal-target-backends=amd-aie --compile-to=executable-targets --iree-amdaie-enable-ukernels=all %s | FileCheck %s --check-prefix=ENABLE_UKERNEL -// DEFAULT: hal.executable.variant public @amdaie_xclbin_fb target(<"amd-aie", "amdaie-xclbin-fb", {target_device = "npu1_4col", ukernels = "none"}>) { -// ENABLE_UKERNEL: hal.executable.variant public @amdaie_xclbin_fb target(<"amd-aie", "amdaie-xclbin-fb", {target_device = "npu1_4col", ukernels = "all"}>) { +// DEFAULT: hal.executable.variant public @amdaie_pdi_fb target(<"amd-aie", "amdaie-pdi-fb", {target_device = "npu1_4col", ukernels = "none"}>) { +// ENABLE_UKERNEL: hal.executable.variant public @amdaie_pdi_fb target(<"amd-aie", "amdaie-pdi-fb", {target_device = "npu1_4col", ukernels = "all"}>) { func.func @matmul_small(%lhs : tensor<8x16xi32>, %rhs : tensor<16x32xi32>) -> tensor<8x32xi32> { %empty = tensor.empty() : tensor<8x32xi32> diff --git a/iree_runtime_plugin.cmake b/iree_runtime_plugin.cmake index 15a4d07da..d8138465e 100644 --- a/iree_runtime_plugin.cmake +++ b/iree_runtime_plugin.cmake @@ -23,8 +23,8 @@ endif() if(IREE_AMD_AIE_ENABLE_XRT_DRIVER) include(iree_aie_xrt) - include(iree_aie_bootgen) endif() +include(iree_aie_bootgen) add_subdirectory(${CMAKE_CURRENT_LIST_DIR}/runtime/src AMD-AIE) add_subdirectory(${CMAKE_CURRENT_LIST_DIR}/experimental AMD-AIE-experimental) diff --git a/runtime/src/iree-amd-aie/CMakeLists.txt b/runtime/src/iree-amd-aie/CMakeLists.txt index bfa015081..8b67676cd 100644 --- a/runtime/src/iree-amd-aie/CMakeLists.txt +++ b/runtime/src/iree-amd-aie/CMakeLists.txt @@ -5,13 +5,17 @@ # SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception 
if(IREE_AMD_AIE_ENABLE_XRT_DRIVER) - add_subdirectory(driver/xrt) + add_subdirectory(driver/xrt) endif() -# Flatbuffer schema generation does not require XRT. Moreover the generated +if("xrt-lite" IN_LIST IREE_EXTERNAL_HAL_DRIVERS) + add_subdirectory(driver/xrt-lite) +endif() + +# Flatbuffer schema generation does not require a driver but the generated # flatbuffer header files are used by the compiler to create artefacts # (.vmfb file), and so the schema sub-directory is required even when not -# building the XRT driver code. +# building driver code. add_subdirectory(schemas) # Contains libiree_aie_runtime, i.e., suitably encapsulated calls to aie-rt. diff --git a/runtime/src/iree-amd-aie/driver/xrt-lite/CMakeLists.txt b/runtime/src/iree-amd-aie/driver/xrt-lite/CMakeLists.txt new file mode 100644 index 000000000..0863ea8c3 --- /dev/null +++ b/runtime/src/iree-amd-aie/driver/xrt-lite/CMakeLists.txt @@ -0,0 +1,48 @@ +# Copyright (c) 2024 Advanced Micro Devices, Inc. All Rights Reserved. +# Copyright 2024 The IREE Authors +# +# Licensed under the Apache License v2.0 with LLVM Exceptions. +# See https://llvm.org/LICENSE.txt for license information. 
+# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + +iree_add_all_subdirs() + +iree_register_external_hal_driver( + NAME + xrt-lite + DRIVER_TARGET + iree-amd-aie::driver::xrt-lite::registration + REGISTER_FN + iree_hal_xrt_lite_driver_module_register +) + +iree_cc_library( + NAME + xrt-lite + SRCS + allocator.cc + allocator.h + api.h + buffer.cc + buffer.h + direct_command_buffer.cc + direct_command_buffer.h + device.cc + driver.cc + executable.cc + executable.h + nop_executable_cache.cc + nop_executable_cache.h + nop_semaphore.cc + nop_semaphore.h + util.h + DEPS + iree::base + iree::base::core_headers + iree::hal::utils::deferred_command_buffer + iree::hal::utils::semaphore_base + iree::base::internal::flatcc::parsing + iree-amd-aie::schemas::pdi_executable_def_c_fbs + iree-amd-aie::driver::xrt-lite::shim::linux::kmq::shim-xdna + PUBLIC +) diff --git a/runtime/src/iree-amd-aie/driver/xrt-lite/allocator.cc b/runtime/src/iree-amd-aie/driver/xrt-lite/allocator.cc new file mode 100644 index 000000000..275781e67 --- /dev/null +++ b/runtime/src/iree-amd-aie/driver/xrt-lite/allocator.cc @@ -0,0 +1,200 @@ +// Copyright 2024 The IREE Authors +// +// Licensed under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + +#include "iree-amd-aie/driver/xrt-lite/allocator.h" + +#include "iree-amd-aie/driver/xrt-lite/buffer.h" +#include "iree-amd-aie/driver/xrt-lite/shim/linux/kmq/bo.h" +#include "iree-amd-aie/driver/xrt-lite/shim/linux/kmq/device.h" +#include "iree-amd-aie/driver/xrt-lite/util.h" + +namespace { +extern const iree_hal_allocator_vtable_t iree_hal_xrt_lite_allocator_vtable; +} + +struct iree_hal_xrt_lite_allocator { + iree_hal_resource_t resource; + iree_allocator_t host_allocator; + shim_xdna::device* shim_device; + IREE_STATISTICS(iree_hal_allocator_statistics_t statistics;) + + iree_hal_xrt_lite_allocator(iree_allocator_t host_allocator, + shim_xdna::device* shim_device) + : host_allocator(host_allocator), shim_device(shim_device) { + IREE_TRACE_ZONE_BEGIN(z0); + + iree_hal_resource_initialize(&iree_hal_xrt_lite_allocator_vtable, + &this->resource); + + IREE_TRACE_ZONE_END(z0); + } +}; + +static iree_hal_buffer_compatibility_t +iree_hal_xrt_lite_allocator_query_buffer_compatibility( + iree_hal_allocator_t* base_allocator, iree_hal_buffer_params_t* params, + iree_device_size_t* allocation_size) { + IREE_TRACE_ZONE_BEGIN(z0); + + iree_hal_buffer_compatibility_t compatibility = + IREE_HAL_BUFFER_COMPATIBILITY_ALLOCATABLE; + + if (iree_any_bit_set(params->usage, IREE_HAL_BUFFER_USAGE_TRANSFER)) { + compatibility |= IREE_HAL_BUFFER_COMPATIBILITY_QUEUE_TRANSFER; + } + + // Buffers can only be used on the queue if they are device visible. + if (iree_all_bits_set(params->type, IREE_HAL_MEMORY_TYPE_DEVICE_VISIBLE)) { + if (iree_any_bit_set(params->usage, + IREE_HAL_BUFFER_USAGE_DISPATCH_STORAGE)) { + compatibility |= IREE_HAL_BUFFER_COMPATIBILITY_QUEUE_DISPATCH; + } + } + + params->type &= ~IREE_HAL_MEMORY_TYPE_OPTIMAL; + + // Guard against the corner case where the requested buffer size is 0. 
The + // application is unlikely to do anything when requesting a 0-byte buffer; + // but it can happen in real world use cases. So we should at least not + // crash. + if (*allocation_size == 0) *allocation_size = 4; + // Align allocation sizes to 4 bytes so shaders operating on 32 bit types + // can act safely even on buffer ranges that are not naturally aligned. + *allocation_size = iree_host_align(*allocation_size, 4); + + IREE_TRACE_ZONE_END(z0); + return compatibility; +} + +static iree_status_t iree_hal_xrt_lite_allocator_allocate_buffer( + iree_hal_allocator_t* base_allocator, + const iree_hal_buffer_params_t* params, iree_device_size_t allocation_size, + iree_hal_buffer_t** out_buffer) { + IREE_TRACE_ZONE_BEGIN(z0); + + iree_hal_xrt_lite_allocator* allocator = + IREE_HAL_XRT_LITE_CHECKED_VTABLE_CAST(base_allocator, + iree_hal_xrt_lite_allocator_vtable, + iree_hal_xrt_lite_allocator); + iree_hal_buffer_params_t compat_params = *params; + iree_hal_buffer_compatibility_t compatibility = + iree_hal_xrt_lite_allocator_query_buffer_compatibility( + base_allocator, &compat_params, &allocation_size); + if (!iree_all_bits_set(compatibility, + IREE_HAL_BUFFER_COMPATIBILITY_ALLOCATABLE)) { + IREE_TRACE_ZONE_END(z0); + return iree_make_status( + IREE_STATUS_INVALID_ARGUMENT, + "allocator cannot allocate a buffer with the given parameters"); + } + + uint32_t flags = XCL_BO_FLAGS_HOST_ONLY; + shim_xdna::bo* bo = + allocator->shim_device->alloc_bo(allocation_size, flags).release(); + iree_hal_buffer_t* buffer = nullptr; + iree_status_t status = iree_hal_xrt_lite_buffer_wrap( + bo, reinterpret_cast(allocator), + compat_params.type, compat_params.access, compat_params.usage, + allocation_size, + /*byte_offset=*/0, /*byte_length=*/allocation_size, + iree_hal_buffer_release_callback_null(), allocator->host_allocator, + &buffer); + + if (iree_status_is_ok(status)) { + IREE_STATISTICS(iree_hal_allocator_statistics_record_alloc( + &allocator->statistics, compat_params.type, 
allocation_size)); + *out_buffer = buffer; + } else { + iree_hal_buffer_release(buffer); + } + + IREE_TRACE_ZONE_END(z0); + return status; +} + +static void iree_hal_xrt_lite_allocator_deallocate_buffer( + iree_hal_allocator_t* base_allocator, iree_hal_buffer_t* base_buffer) { + IREE_TRACE_ZONE_BEGIN(z0); + + iree_hal_xrt_lite_allocator* allocator = + IREE_HAL_XRT_LITE_CHECKED_VTABLE_CAST(base_allocator, + iree_hal_xrt_lite_allocator_vtable, + iree_hal_xrt_lite_allocator); + bool was_imported = false; + if (!was_imported) { + IREE_STATISTICS(iree_hal_allocator_statistics_record_free( + &allocator->statistics, iree_hal_buffer_memory_type(base_buffer), + iree_hal_buffer_allocation_size(base_buffer))); + } + iree_hal_buffer_destroy(base_buffer); + + IREE_TRACE_ZONE_END(z0); +} + +iree_status_t iree_hal_xrt_lite_allocator_create( + iree_allocator_t host_allocator, shim_xdna::device* device, + iree_hal_allocator_t** out_allocator) { + IREE_ASSERT_ARGUMENT(out_allocator); + IREE_TRACE_ZONE_BEGIN(z0); + + iree_hal_xrt_lite_allocator* allocator = nullptr; + IREE_RETURN_AND_END_ZONE_IF_ERROR( + z0, iree_allocator_malloc(host_allocator, sizeof(*allocator), + reinterpret_cast(&allocator))); + allocator = + new (allocator) iree_hal_xrt_lite_allocator(host_allocator, device); + iree_status_t status = iree_ok_status(); + + if (iree_status_is_ok(status)) { + *out_allocator = reinterpret_cast(allocator); + } else { + iree_hal_allocator_release( + reinterpret_cast(allocator)); + } + IREE_TRACE_ZONE_END(z0); + return status; +} + +static void iree_hal_xrt_lite_allocator_destroy( + iree_hal_allocator_t* base_allocator) { + IREE_ASSERT_ARGUMENT(base_allocator); + IREE_TRACE_ZONE_BEGIN(z0); + + iree_hal_xrt_lite_allocator* allocator = + IREE_HAL_XRT_LITE_CHECKED_VTABLE_CAST(base_allocator, + iree_hal_xrt_lite_allocator_vtable, + iree_hal_xrt_lite_allocator); + iree_hal_resource_release(&allocator->resource); + iree_allocator_free(allocator->host_allocator, allocator); + + 
IREE_TRACE_ZONE_END(z0); +} + +static iree_allocator_t iree_hal_xrt_lite_allocator_host_allocator( + const iree_hal_allocator_t* base_allocator) { + IREE_TRACE_ZONE_BEGIN(z0); + + const iree_hal_xrt_lite_allocator* allocator = + IREE_HAL_XRT_LITE_CHECKED_VTABLE_CAST(base_allocator, + iree_hal_xrt_lite_allocator_vtable, + const iree_hal_xrt_lite_allocator); + + IREE_TRACE_ZONE_END(z0); + return allocator->host_allocator; +} + +namespace { +const iree_hal_allocator_vtable_t iree_hal_xrt_lite_allocator_vtable = { + .destroy = iree_hal_xrt_lite_allocator_destroy, + .host_allocator = iree_hal_xrt_lite_allocator_host_allocator, + .trim = unimplemented_ok_status, + .query_statistics = unimplemented_ok_void, + .query_buffer_compatibility = + iree_hal_xrt_lite_allocator_query_buffer_compatibility, + .allocate_buffer = iree_hal_xrt_lite_allocator_allocate_buffer, + .deallocate_buffer = iree_hal_xrt_lite_allocator_deallocate_buffer, +}; +} diff --git a/runtime/src/iree-amd-aie/driver/xrt-lite/allocator.h b/runtime/src/iree-amd-aie/driver/xrt-lite/allocator.h new file mode 100644 index 000000000..062ba6505 --- /dev/null +++ b/runtime/src/iree-amd-aie/driver/xrt-lite/allocator.h @@ -0,0 +1,19 @@ +// Copyright 2024 The IREE Authors +// +// Licensed under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + +#ifndef IREE_HAL_DRIVERS_XRT_LITE_ALLOCATOR_H_ +#define IREE_HAL_DRIVERS_XRT_LITE_ALLOCATOR_H_ + +#include "iree-amd-aie/driver/xrt-lite/shim/linux/kmq/device.h" +#include "iree/base/api.h" +#include "iree/hal/api.h" + +// Creates a buffer allocator used for persistent allocations. 
+iree_status_t iree_hal_xrt_lite_allocator_create( + iree_allocator_t host_allocator, shim_xdna::device* device, + iree_hal_allocator_t** out_allocator); + +#endif // IREE_HAL_DRIVERS_XRT_LITE_ALLOCATOR_H_ diff --git a/runtime/src/iree-amd-aie/driver/xrt-lite/api.h b/runtime/src/iree-amd-aie/driver/xrt-lite/api.h new file mode 100644 index 000000000..c969388ba --- /dev/null +++ b/runtime/src/iree-amd-aie/driver/xrt-lite/api.h @@ -0,0 +1,44 @@ +// Copyright 2024 The IREE Authors +// +// Licensed under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + +#ifndef IREE_AMD_AIE_DRIVER_XRT_LITE_API_H_ +#define IREE_AMD_AIE_DRIVER_XRT_LITE_API_H_ + +#include "iree/base/api.h" +#include "iree/hal/api.h" + +struct iree_hal_xrt_lite_device_params { + int32_t n_core_rows; + int32_t n_core_cols; +}; + +IREE_API_EXPORT void iree_hal_xrt_lite_device_options_initialize( + struct iree_hal_xrt_lite_device_params* out_params); + +struct iree_hal_xrt_lite_driver_options { + struct iree_hal_xrt_lite_device_params device_params; +}; + +IREE_API_EXPORT void iree_hal_xrt_lite_driver_options_initialize( + struct iree_hal_xrt_lite_driver_options* out_options); + +// The provided `identifier` will be used by programs to distinguish the device +// type from other HAL implementations. If compiling programs with the IREE +// compiler this must match the value used by IREE::HAL::TargetDevice. +// +// `out_driver` must be released by the caller (see iree_hal_driver_release). 
+IREE_API_EXPORT iree_status_t iree_hal_xrt_lite_driver_create( + iree_string_view_t identifier, + const struct iree_hal_xrt_lite_driver_options* options, + const struct iree_hal_xrt_lite_device_params* device_params, + iree_allocator_t host_allocator, iree_hal_driver_t** out_driver); + +IREE_API_EXPORT iree_status_t iree_hal_xrt_lite_device_create( + iree_string_view_t identifier, + const struct iree_hal_xrt_lite_device_params* params, + iree_allocator_t host_allocator, iree_hal_device_t** out_device); + +#endif // IREE_AMD_AIE_DRIVER_XRT_LITE_API_H_ diff --git a/runtime/src/iree-amd-aie/driver/xrt-lite/buffer.cc b/runtime/src/iree-amd-aie/driver/xrt-lite/buffer.cc new file mode 100644 index 000000000..6c0dff4f1 --- /dev/null +++ b/runtime/src/iree-amd-aie/driver/xrt-lite/buffer.cc @@ -0,0 +1,176 @@ +// Copyright 2024 The IREE Authors +// +// Licensed under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + +#include "iree-amd-aie/driver/xrt-lite/buffer.h" + +#include "iree-amd-aie/driver/xrt-lite/shim/linux/kmq/bo.h" +#include "iree-amd-aie/driver/xrt-lite/util.h" + +namespace { +extern const iree_hal_buffer_vtable_t iree_hal_xrt_lite_buffer_vtable; +} + +struct iree_hal_xrt_lite_buffer { + iree_hal_buffer_t base; + shim_xdna::bo* bo; + iree_hal_buffer_release_callback_t release_callback; +}; + +static iree_status_t iree_hal_xrt_lite_buffer_invalidate_range( + iree_hal_buffer_t* base_buffer, iree_device_size_t local_byte_offset, + iree_device_size_t local_byte_length) { + IREE_TRACE_ZONE_BEGIN(z0); + + iree_hal_xrt_lite_buffer* buffer = IREE_HAL_XRT_LITE_CHECKED_VTABLE_CAST( + base_buffer, iree_hal_xrt_lite_buffer_vtable, iree_hal_xrt_lite_buffer); + if (IREE_UNLIKELY(!buffer->bo)) { + IREE_TRACE_ZONE_END(z0); + return iree_make_status( + IREE_STATUS_FAILED_PRECONDITION, + "buffer does not have device memory attached and cannot be mapped"); + } + 
buffer->bo->sync(shim_xdna::direction::device2host, local_byte_length, + local_byte_offset); + + IREE_TRACE_ZONE_END(z0); + return iree_ok_status(); +} + +static iree_status_t iree_hal_xrt_lite_buffer_map_range( + iree_hal_buffer_t* base_buffer, iree_hal_mapping_mode_t mapping_mode, + iree_hal_memory_access_t memory_access, + iree_device_size_t local_byte_offset, iree_device_size_t local_byte_length, + iree_hal_buffer_mapping_t* mapping) { + IREE_TRACE_ZONE_BEGIN(z0); + + iree_hal_xrt_lite_buffer* buffer = IREE_HAL_XRT_LITE_CHECKED_VTABLE_CAST( + base_buffer, iree_hal_xrt_lite_buffer_vtable, iree_hal_xrt_lite_buffer); + IREE_RETURN_AND_END_ZONE_IF_ERROR( + z0, iree_hal_buffer_validate_memory_type( + iree_hal_buffer_memory_type( + reinterpret_cast(buffer)), + IREE_HAL_MEMORY_TYPE_HOST_VISIBLE)); + IREE_RETURN_AND_END_ZONE_IF_ERROR( + z0, iree_hal_buffer_validate_usage( + iree_hal_buffer_allowed_usage( + reinterpret_cast(buffer)), + mapping_mode == IREE_HAL_MAPPING_MODE_PERSISTENT + ? IREE_HAL_BUFFER_USAGE_MAPPING_PERSISTENT + : IREE_HAL_BUFFER_USAGE_MAPPING_SCOPED)); + + void* host_ptr = buffer->bo->map(); + // Should be guaranteed by previous checks. + IREE_ASSERT(host_ptr != nullptr); + uint8_t* data_ptr = reinterpret_cast(host_ptr) + local_byte_offset; + iree_status_t status = iree_hal_xrt_lite_buffer_invalidate_range( + base_buffer, local_byte_offset, local_byte_length); + // If we mapped for discard, scribble over the bytes. This is not a mandated + // behavior but it will make debugging issues easier. Alternatively for heap + // buffers we could reallocate them such that ASAN yells, but that would + // only work if the entire buffer was discarded. 
+#ifndef NDEBUG + if (iree_any_bit_set(memory_access, IREE_HAL_MEMORY_ACCESS_DISCARD)) { + memset(data_ptr, 0xCD, local_byte_length); + } +#endif // !NDEBUG + mapping->contents = iree_make_byte_span(data_ptr, local_byte_length); + + IREE_TRACE_ZONE_END(z0); + return status; +} + +static iree_status_t iree_hal_xrt_lite_buffer_flush_range( + iree_hal_buffer_t* base_buffer, iree_device_size_t local_byte_offset, + iree_device_size_t local_byte_length) { + IREE_TRACE_ZONE_BEGIN(z0); + + iree_hal_xrt_lite_buffer* buffer = IREE_HAL_XRT_LITE_CHECKED_VTABLE_CAST( + base_buffer, iree_hal_xrt_lite_buffer_vtable, iree_hal_xrt_lite_buffer); + if (IREE_UNLIKELY(!buffer->bo)) { + IREE_TRACE_ZONE_END(z0); + return iree_make_status( + IREE_STATUS_FAILED_PRECONDITION, + "buffer does not have device memory attached and cannot be mapped"); + } + + buffer->bo->sync(shim_xdna::direction::host2device, local_byte_length, + local_byte_offset); + + IREE_TRACE_ZONE_END(z0); + return iree_ok_status(); +} + +static iree_status_t iree_hal_xrt_lite_buffer_unmap_range( + iree_hal_buffer_t* base_buffer, iree_device_size_t local_byte_offset, + iree_device_size_t local_byte_length, iree_hal_buffer_mapping_t* mapping) { + return iree_hal_xrt_lite_buffer_flush_range(base_buffer, local_byte_offset, + local_byte_length); +} + +iree_status_t iree_hal_xrt_lite_buffer_wrap( + shim_xdna::bo* bo, iree_hal_allocator_t* allocator, + iree_hal_memory_type_t memory_type, iree_hal_memory_access_t allowed_access, + iree_hal_buffer_usage_t allowed_usage, iree_device_size_t allocation_size, + iree_device_size_t byte_offset, iree_device_size_t byte_length, + iree_hal_buffer_release_callback_t release_callback, + iree_allocator_t host_allocator, iree_hal_buffer_t** out_buffer) { + IREE_ASSERT_ARGUMENT(out_buffer); + *out_buffer = nullptr; + IREE_TRACE_ZONE_BEGIN(z0); + + iree_hal_xrt_lite_buffer* buffer = nullptr; + IREE_RETURN_AND_END_ZONE_IF_ERROR( + z0, iree_allocator_malloc(host_allocator, sizeof(*buffer), + 
reinterpret_cast(&buffer))); + iree_hal_buffer_initialize(host_allocator, allocator, &buffer->base, + allocation_size, byte_offset, byte_length, + memory_type, allowed_access, allowed_usage, + &iree_hal_xrt_lite_buffer_vtable, &buffer->base); + buffer->release_callback = release_callback; + buffer->bo = bo; + *out_buffer = &buffer->base; + + IREE_TRACE_ZONE_END(z0); + return iree_ok_status(); +} + +static void iree_hal_xrt_lite_buffer_destroy(iree_hal_buffer_t* base_buffer) { + iree_hal_xrt_lite_buffer* buffer = IREE_HAL_XRT_LITE_CHECKED_VTABLE_CAST( + base_buffer, iree_hal_xrt_lite_buffer_vtable, iree_hal_xrt_lite_buffer); + IREE_TRACE_ZONE_BEGIN(z0); + + iree_allocator_t host_allocator = base_buffer->host_allocator; + if (buffer->release_callback.fn) { + buffer->release_callback.fn(buffer->release_callback.user_data, + base_buffer); + } + + delete buffer->bo; + iree_allocator_free(host_allocator, buffer); + + IREE_TRACE_ZONE_END(z0); +} + +shim_xdna::bo* iree_hal_xrt_lite_buffer_handle(iree_hal_buffer_t* base_buffer) { + IREE_TRACE_ZONE_BEGIN(z0); + + iree_hal_xrt_lite_buffer* buffer = IREE_HAL_XRT_LITE_CHECKED_VTABLE_CAST( + base_buffer, iree_hal_xrt_lite_buffer_vtable, iree_hal_xrt_lite_buffer); + + IREE_TRACE_ZONE_END(z0); + return buffer->bo; +} + +namespace { +const iree_hal_buffer_vtable_t iree_hal_xrt_lite_buffer_vtable = { + .recycle = iree_hal_buffer_recycle, + .destroy = iree_hal_xrt_lite_buffer_destroy, + .map_range = iree_hal_xrt_lite_buffer_map_range, + .unmap_range = iree_hal_xrt_lite_buffer_unmap_range, + .invalidate_range = iree_hal_xrt_lite_buffer_invalidate_range, + .flush_range = iree_hal_xrt_lite_buffer_flush_range, +}; +} diff --git a/runtime/src/iree-amd-aie/driver/xrt-lite/buffer.h b/runtime/src/iree-amd-aie/driver/xrt-lite/buffer.h new file mode 100644 index 000000000..b70517d1c --- /dev/null +++ b/runtime/src/iree-amd-aie/driver/xrt-lite/buffer.h @@ -0,0 +1,24 @@ +// Copyright 2024 The IREE Authors +// +// Licensed under the Apache 
License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + +#ifndef IREE_HAL_DRIVERS_XRT_LITE_BUFFER_H_ +#define IREE_HAL_DRIVERS_XRT_LITE_BUFFER_H_ + +#include "iree-amd-aie/driver/xrt-lite/shim/linux/kmq/bo.h" +#include "iree/base/api.h" +#include "iree/hal/api.h" + +iree_status_t iree_hal_xrt_lite_buffer_wrap( + shim_xdna::bo* bo, iree_hal_allocator_t* allocator, + iree_hal_memory_type_t memory_type, iree_hal_memory_access_t allowed_access, + iree_hal_buffer_usage_t allowed_usage, iree_device_size_t allocation_size, + iree_device_size_t byte_offset, iree_device_size_t byte_length, + iree_hal_buffer_release_callback_t release_callback, + iree_allocator_t host_allocator, iree_hal_buffer_t** out_buffer); + +shim_xdna::bo* iree_hal_xrt_lite_buffer_handle(iree_hal_buffer_t* base_buffer); + +#endif // IREE_HAL_DRIVERS_XRT_LITE_BUFFER_H_ diff --git a/runtime/src/iree-amd-aie/driver/xrt-lite/cts/CMakeLists.txt b/runtime/src/iree-amd-aie/driver/xrt-lite/cts/CMakeLists.txt new file mode 100644 index 000000000..03642c0c2 --- /dev/null +++ b/runtime/src/iree-amd-aie/driver/xrt-lite/cts/CMakeLists.txt @@ -0,0 +1,110 @@ +# Copyright 2024 The IREE Authors +# +# Licensed under the Apache License v2.0 with LLVM Exceptions. +# See https://llvm.org/LICENSE.txt for license information. 
+# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + +include(CMakeDependentOption) + +iree_hal_cts_test_suite( + DRIVER_NAME + xrt-lite + DRIVER_REGISTRATION_HDR + "iree-amd-aie/driver/xrt-lite/registration/driver_module.h" + DRIVER_REGISTRATION_FN + "iree_hal_xrt_lite_driver_module_register" + COMPILER_TARGET_BACKEND + "amd-aie" + EXECUTABLE_FORMAT + "\"amdaie-pdi-fb\"" + DEPS + iree-amd-aie::driver::xrt-lite::registration + INCLUDED_TESTS + "allocator" + "buffer_mapping" + "driver" +) + +set(PEANO_INSTALL_DIR "" CACHE PATH "") +set(VITIS_DIR "" CACHE PATH "") +if((NOT PEANO_INSTALL_DIR) AND (NOT VITIS_DIR)) + message(FATAL_ERROR "either PEANO_INSTALL_DIR or VITIS_DIR must be set") +endif() +cmake_dependent_option(USE_CHESS "" "1" "VITIS_DIR" "0") +set(TARGET_DEVICE "npu1_4col" CACHE STRING "") + +iree_bytecode_module( + NAME + xrt_lite_executable_cache_test_module + MODULE_FILE_NAME + xrt_lite_executable_cache_test.bin + SRC + "${CMAKE_CURRENT_LIST_DIR}/executable_cache_test.mlir" + FLAGS + --compile-mode=hal-executable + --iree-hal-dump-executable-files-to=${CMAKE_CURRENT_BINARY_DIR} + --iree-hal-target-backends=amd-aie + --iree-amdaie-lower-to-aie-pipeline=air + --iree-amdaie-target-device=${TARGET_DEVICE} + --iree-amd-aie-peano-install-dir=${PEANO_INSTALL_DIR} + --iree-amd-aie-vitis-install-dir=${VITIS_DIR} + --iree-amd-aie-enable-chess=$ + --iree-amdaie-device-hal=xrt-lite + --iree-amd-aie-show-invoked-commands + --iree-hal-memoization=false + --iree-hal-indirect-command-buffers=false + PUBLIC + TESTONLY +) + +iree_c_embed_data( + NAME + xrt_lite_executables_c + SRCS + xrt_lite_executable_cache_test.bin + C_FILE_OUTPUT + xrt_lite_executables_c.c + H_FILE_OUTPUT + xrt_lite_executables_c.h + IDENTIFIER + iree_cts_testdata_executables_aie_xrt_lite + STRIP_PREFIX + xrt_lite_ + DEPENDS + ::xrt_lite_executable_cache_test_module + FLATTEN + PUBLIC + TESTONLY +) + +iree_cc_test( + NAME + xrt_lite_executable_cache_test + SRCS + executable_cache_test.cc + DEPS 
+ ::xrt_lite_executables_c + iree-amd-aie::driver::xrt-lite::registration + iree::base + iree::hal + iree::hal::cts::cts_test_base + iree::testing::gtest_main +) + +iree_cc_test( + NAME + xrt_lite_dispatch_test + SRCS + matmul_dispatch_test.cc + DEPS + ::xrt_lite_executables_c + iree-amd-aie::driver::xrt-lite::registration + iree::base + iree::hal + iree::hal::cts::cts_test_base + iree::testing::gtest_main + iree::tools::testing::e2e::e2e_test_util +) + +target_include_directories(iree-amd-aie_driver_xrt-lite_cts_xrt_lite_executable_cache_test PRIVATE "${CMAKE_CURRENT_BINARY_DIR}") +target_include_directories(iree-amd-aie_driver_xrt-lite_cts_xrt_lite_dispatch_test PRIVATE "${CMAKE_CURRENT_BINARY_DIR}") diff --git a/runtime/src/iree-amd-aie/driver/xrt-lite/cts/executable_cache_test.cc b/runtime/src/iree-amd-aie/driver/xrt-lite/cts/executable_cache_test.cc new file mode 100644 index 000000000..9aa19a89c --- /dev/null +++ b/runtime/src/iree-amd-aie/driver/xrt-lite/cts/executable_cache_test.cc @@ -0,0 +1,85 @@ +// Copyright 2024 The IREE Authors +// +// Licensed under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + +#include "iree-amd-aie/driver/xrt-lite/registration/driver_module.h" +#include "iree/base/api.h" +#include "iree/base/string_view.h" +#include "iree/hal/api.h" +#include "iree/hal/cts/cts_test_base.h" +#include "iree/testing/gtest.h" +#include "iree/testing/status_matchers.h" +#include "xrt_lite_executables_c.h" + +namespace iree::hal::cts { + +const char* get_test_driver_name() { return "xrt-lite"; } + +iree_status_t register_test_driver(iree_hal_driver_registry_t* registry) { + return iree_hal_xrt_lite_driver_module_register(registry); +} + +const char* get_test_executable_format() { return "amdaie-pdi-fb"; } + +iree_const_byte_span_t get_test_executable_data(iree_string_view_t file_name) { + const struct iree_file_toc_t* toc = + iree_cts_testdata_executables_aie_xrt_lite_create(); + const auto& file = toc[0]; + return iree_make_const_byte_span(file.data, file.size); +} + +class ExecutableCacheTest : public CTSTestBase<> {}; + +TEST_F(ExecutableCacheTest, Create) { + iree_status_t loop_status = iree_ok_status(); + iree_hal_executable_cache_t* executable_cache = nullptr; + IREE_ASSERT_OK(iree_hal_executable_cache_create( + device_, iree_make_cstring_view("default"), + iree_loop_inline(&loop_status), &executable_cache)); + + iree_hal_executable_cache_release(executable_cache); + IREE_ASSERT_OK(loop_status); +} + +TEST_F(ExecutableCacheTest, CantPrepareUnknownFormat) { + iree_status_t loop_status = iree_ok_status(); + iree_hal_executable_cache_t* executable_cache = nullptr; + IREE_ASSERT_OK(iree_hal_executable_cache_create( + device_, iree_make_cstring_view("default"), + iree_loop_inline(&loop_status), &executable_cache)); + + EXPECT_FALSE(iree_hal_executable_cache_can_prepare_format( + executable_cache, /*caching_mode=*/0, iree_make_cstring_view("FOO?"))); + + iree_hal_executable_cache_release(executable_cache); + IREE_ASSERT_OK(loop_status); +} + +TEST_F(ExecutableCacheTest, PrepareExecutable) { + 
iree_status_t loop_status = iree_ok_status(); + iree_hal_executable_cache_t* executable_cache = nullptr; + IREE_ASSERT_OK(iree_hal_executable_cache_create( + device_, iree_make_cstring_view("default"), + iree_loop_inline(&loop_status), &executable_cache)); + + iree_hal_executable_params_t executable_params; + iree_hal_executable_params_initialize(&executable_params); + executable_params.caching_mode = + IREE_HAL_EXECUTABLE_CACHING_MODE_ALIAS_PROVIDED_DATA; + executable_params.executable_format = + iree_make_cstring_view(get_test_executable_format()); + executable_params.executable_data = get_test_executable_data( + iree_make_cstring_view("executable_cache_test.bin")); + + iree_hal_executable_t* executable = nullptr; + IREE_ASSERT_OK(iree_hal_executable_cache_prepare_executable( + executable_cache, &executable_params, &executable)); + + iree_hal_executable_release(executable); + iree_hal_executable_cache_release(executable_cache); + IREE_ASSERT_OK(loop_status); +} + +} // namespace iree::hal::cts diff --git a/runtime/src/iree-amd-aie/driver/xrt-lite/cts/executable_cache_test.mlir b/runtime/src/iree-amd-aie/driver/xrt-lite/cts/executable_cache_test.mlir new file mode 100644 index 000000000..dedbcab6b --- /dev/null +++ b/runtime/src/iree-amd-aie/driver/xrt-lite/cts/executable_cache_test.mlir @@ -0,0 +1,33 @@ +// bootstrapped from https://github.com/nod-ai/iree-amd-aie/blob/9c4c167baf89a279888fba8db75907845946077c/tests/samples/matmul_pack_peel_objectfifo_e2e.mlir + +#pipeline_layout = #hal.pipeline.layout< + bindings = [ + #hal.pipeline.binding, + #hal.pipeline.binding, + #hal.pipeline.binding + ], + flags = Indirect +> +hal.executable.source public @amdaie_fb { + hal.executable.export public @matmul_f32_dispatch_0_matmul_32x32x32_f32 ordinal(0) layout(#pipeline_layout) { + ^bb0(%arg0: !hal.device): + %x, %y, %z = flow.dispatch.workgroup_count_from_slice + hal.return %x, %y, %z : index, index, index + } + builtin.module { + func.func 
@matmul_f32_dispatch_0_matmul_32x32x32_f32() { + %c0_f32 = arith.constant 0.0 : f32 + %c0 = arith.constant 0 : index + %0 = hal.interface.binding.subspan layout(#pipeline_layout) binding(0) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : !flow.dispatch.tensor> + %1 = hal.interface.binding.subspan layout(#pipeline_layout) binding(1) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : !flow.dispatch.tensor> + %2 = hal.interface.binding.subspan layout(#pipeline_layout) binding(2) alignment(64) offset(%c0) flags(Indirect) : !flow.dispatch.tensor> + %3 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [32, 32], strides = [1, 1] : !flow.dispatch.tensor> -> tensor<32x32xf32> + %4 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [32, 32], strides = [1, 1] : !flow.dispatch.tensor> -> tensor<32x32xf32> + %5 = tensor.empty() : tensor<32x32xf32> + %6 = linalg.fill ins(%c0_f32 : f32) outs(%5 : tensor<32x32xf32>) -> tensor<32x32xf32> + %7 = linalg.matmul ins(%3, %4 : tensor<32x32xf32>, tensor<32x32xf32>) outs(%6 : tensor<32x32xf32>) -> tensor<32x32xf32> + flow.dispatch.tensor.store %7, %2, offsets = [0, 0], sizes = [32, 32], strides = [1, 1] : tensor<32x32xf32> -> !flow.dispatch.tensor> + return + } + } +} diff --git a/runtime/src/iree-amd-aie/driver/xrt-lite/cts/matmul_dispatch_test.cc b/runtime/src/iree-amd-aie/driver/xrt-lite/cts/matmul_dispatch_test.cc new file mode 100644 index 000000000..ce7d4ca83 --- /dev/null +++ b/runtime/src/iree-amd-aie/driver/xrt-lite/cts/matmul_dispatch_test.cc @@ -0,0 +1,224 @@ +// Copyright 2024 The IREE Authors +// +// Licensed under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + +#include "iree-amd-aie/driver/xrt-lite/registration/driver_module.h" +#include "iree/base/api.h" +#include "iree/base/string_view.h" +#include "iree/hal/api.h" +#include "iree/hal/buffer_view_util.h" +#include "iree/hal/cts/cts_test_base.h" +#include "iree/testing/gtest.h" +#include "iree/testing/status_matchers.h" +#include "tools/testing/e2e/test_utils.h" +#include "xrt_lite_executables_c.h" + +namespace iree::hal::cts { + +const char* get_test_driver_name() { return "xrt-lite"; } + +iree_status_t register_test_driver(iree_hal_driver_registry_t* registry) { + return iree_hal_xrt_lite_driver_module_register(registry); +} + +const char* get_test_executable_format() { return "amdaie-pdi-fb"; } + +iree_const_byte_span_t get_test_executable_data(iree_string_view_t file_name) { + const struct iree_file_toc_t* toc = + iree_cts_testdata_executables_aie_xrt_lite_create(); + const auto& file = toc[0]; + return iree_make_const_byte_span(file.data, file.size); +} + +class MatMulDispatchTest + : public CTSTestBase<::testing::TestWithParam> { + protected: + void PrepareMatmulExecutable() { + IREE_ASSERT_OK(iree_hal_executable_cache_create( + device_, iree_make_cstring_view("default"), + iree_loop_inline(&loop_status_), &executable_cache_)); + + iree_hal_executable_params_t executable_params; + iree_hal_executable_params_initialize(&executable_params); + executable_params.caching_mode = + IREE_HAL_EXECUTABLE_CACHING_MODE_ALIAS_PROVIDED_DATA; + executable_params.executable_format = + iree_make_cstring_view(get_test_executable_format()); + executable_params.executable_data = get_test_executable_data( + iree_make_cstring_view("xrt-lite_executable_cache_test.bin")); + + IREE_ASSERT_OK(iree_hal_executable_cache_prepare_executable( + executable_cache_, &executable_params, &executable_)); + } + + void CleanupExecutable() { + iree_hal_executable_release(executable_); + iree_hal_executable_cache_release(executable_cache_); + 
IREE_ASSERT_OK(loop_status_); + } + + iree_status_t loop_status_ = iree_ok_status(); + iree_hal_executable_cache_t* executable_cache_ = nullptr; + iree_hal_executable_t* executable_ = nullptr; +}; + +int32_t generate_random_number(iree_hal_element_type_t element_type, + int32_t seed) { + int32_t min = 0; + int32_t max = 0; + iree_test_utils_get_min_max_for_element_type(element_type, &min, &max); + uint32_t range = (max - min + 1); + return (int32_t)iree_test_utils_pseudorandom_range( + reinterpret_cast(&seed), range) + + min; +} + +TEST_F(MatMulDispatchTest, Create) { + iree_hal_command_buffer_t* command_buffer = nullptr; + IREE_ASSERT_OK(iree_hal_command_buffer_create( + device_, IREE_HAL_COMMAND_BUFFER_MODE_ONE_SHOT, + IREE_HAL_COMMAND_CATEGORY_DISPATCH, IREE_HAL_QUEUE_AFFINITY_ANY, + /*binding_capacity=*/0, &command_buffer)); + + EXPECT_TRUE((iree_hal_command_buffer_allowed_categories(command_buffer) & + IREE_HAL_COMMAND_CATEGORY_DISPATCH) == + IREE_HAL_COMMAND_CATEGORY_DISPATCH); + + iree_hal_command_buffer_release(command_buffer); +} + +TEST_F(MatMulDispatchTest, BeginEnd) { + iree_hal_command_buffer_t* command_buffer = nullptr; + IREE_ASSERT_OK(iree_hal_command_buffer_create( + device_, IREE_HAL_COMMAND_BUFFER_MODE_ONE_SHOT, + IREE_HAL_COMMAND_CATEGORY_DISPATCH, IREE_HAL_QUEUE_AFFINITY_ANY, + /*binding_capacity=*/0, &command_buffer)); + + IREE_ASSERT_OK(iree_hal_command_buffer_begin(command_buffer)); + IREE_ASSERT_OK(iree_hal_command_buffer_end(command_buffer)); + + iree_hal_command_buffer_release(command_buffer); +} + +TEST_F(MatMulDispatchTest, SubmitEmpty) { + iree_hal_command_buffer_t* command_buffer = nullptr; + IREE_ASSERT_OK(iree_hal_command_buffer_create( + device_, IREE_HAL_COMMAND_BUFFER_MODE_ONE_SHOT, + IREE_HAL_COMMAND_CATEGORY_DISPATCH, IREE_HAL_QUEUE_AFFINITY_ANY, + /*binding_capacity=*/0, &command_buffer)); + + IREE_ASSERT_OK(iree_hal_command_buffer_begin(command_buffer)); + IREE_ASSERT_OK(iree_hal_command_buffer_end(command_buffer)); + + 
// Dispatches the prepared matmul executable on constant-filled 32x32 f32
// inputs and checks every element of the output.
//
// A is filled with `a` and B with `b`, so every element of C is expected to
// be WIDTH * (a * b).
TEST_P(MatMulDispatchTest, DispatchMatmul) {
  PrepareMatmulExecutable();

  // Create input and output buffers.
  constexpr iree_device_size_t WIDTH = 32;
  constexpr iree_device_size_t M = WIDTH, K = WIDTH, N = WIDTH;
  iree_hal_buffer_t *input_A = nullptr, *input_B = nullptr, *output_C = nullptr;
  // Seed from the clock so that each run exercises different fill values.
  int32_t seed =
      std::chrono::high_resolution_clock::now().time_since_epoch().count() >>
      32;
  int32_t a = generate_random_number(
      iree_hal_element_types_t::IREE_HAL_ELEMENT_TYPE_FLOAT_32, seed);
  int32_t b = generate_random_number(
      iree_hal_element_types_t::IREE_HAL_ELEMENT_TYPE_FLOAT_32, seed + 1);
  CreateFilledDeviceBuffer<float>(M * K * sizeof(float), a, &input_A);
  CreateFilledDeviceBuffer<float>(K * N * sizeof(float), b, &input_B);
  // The output starts at -1 so that an untouched element is detectable.
  CreateFilledDeviceBuffer<float>(M * N * sizeof(float), -1, &output_C);

  iree_hal_buffer_ref_t binding_refs[3];
  iree_hal_buffer_binding_table_t binding_table =
      iree_hal_buffer_binding_table_empty();
  binding_refs[0] = {
      /*binding=*/0,
      /*buffer_slot=*/0,
      /*buffer=*/input_A,
      /*offset=*/0,
      /*length=*/M * K * sizeof(float),
  };
  binding_refs[1] = {
      /*binding=*/0,
      /*buffer_slot=*/0,
      /*buffer=*/input_B,
      /*offset=*/0,
      /*length=*/K * N * sizeof(float),
  };
  binding_refs[2] = {
      /*binding=*/0,
      /*buffer_slot=*/0,
      /*buffer=*/output_C,
      /*offset=*/0,
      /*length=*/M * N * sizeof(float),
  };
  iree_hal_buffer_ref_list_t bindings = {
      /*.count=*/IREE_ARRAYSIZE(binding_refs),
      /*.values=*/binding_refs,
  };

  iree_hal_command_buffer_t* command_buffer = nullptr;
  IREE_ASSERT_OK(iree_hal_command_buffer_create(
      device_, IREE_HAL_COMMAND_BUFFER_MODE_ONE_SHOT,
      IREE_HAL_COMMAND_CATEGORY_DISPATCH, IREE_HAL_QUEUE_AFFINITY_ANY,
      binding_table.count, &command_buffer));
  IREE_ASSERT_OK(iree_hal_command_buffer_begin(command_buffer));

  uint32_t workgroup_count[3] = {1, 1, 1};
  IREE_ASSERT_OK(iree_hal_command_buffer_dispatch(
      command_buffer, executable_, /*entry_point=*/0, workgroup_count,
      iree_const_byte_span_empty(), bindings, IREE_HAL_DISPATCH_FLAG_NONE));

  IREE_ASSERT_OK(iree_hal_command_buffer_execution_barrier(
      command_buffer,
      /*source_stage_mask=*/IREE_HAL_EXECUTION_STAGE_DISPATCH |
          IREE_HAL_EXECUTION_STAGE_TRANSFER |
          IREE_HAL_EXECUTION_STAGE_COMMAND_RETIRE,
      /*target_stage_mask=*/IREE_HAL_EXECUTION_STAGE_COMMAND_ISSUE |
          IREE_HAL_EXECUTION_STAGE_DISPATCH | IREE_HAL_EXECUTION_STAGE_TRANSFER,
      IREE_HAL_EXECUTION_BARRIER_FLAG_NONE, /*memory_barrier_count=*/0,
      /*memory_barriers=*/nullptr,
      /*buffer_barrier_count=*/0, /*buffer_barriers=*/nullptr));

  IREE_ASSERT_OK(iree_hal_command_buffer_end(command_buffer));

  IREE_ASSERT_OK(SubmitCommandBufferAndWait(command_buffer, binding_table));

  // Size (not merely reserve) the host vectors: the device-to-host transfer
  // writes through data() and the comparison loop indexes the elements, both
  // of which require the elements to exist.
  std::vector<float> output_values(M * N);
  IREE_ASSERT_OK(iree_hal_device_transfer_d2h(
      device_, output_C,
      /*source_offset=*/0, output_values.data(), M * N * sizeof(float),
      IREE_HAL_TRANSFER_BUFFER_FLAG_DEFAULT, iree_infinite_timeout()));
  const std::vector<float> correct_output_values(M * N,
                                                 (float)WIDTH * (a * b));
  int n_wrong = 0;
  for (iree_device_size_t i = 0; i < M * N; ++i) {
    if (output_values[i] != correct_output_values[i]) {
      std::cout << "wrong @ i:" << i << ", " << output_values[i]
                << " != " << correct_output_values[i] << "\n";
      n_wrong += 1;
    }
  }
  EXPECT_EQ(n_wrong, 0);

  iree_hal_command_buffer_release(command_buffer);
  iree_hal_buffer_release(output_C);
  iree_hal_buffer_release(input_B);
  iree_hal_buffer_release(input_A);
  CleanupExecutable();
}
b/runtime/src/iree-amd-aie/driver/xrt-lite/device.cc new file mode 100644 index 000000000..323bd4aaa --- /dev/null +++ b/runtime/src/iree-amd-aie/driver/xrt-lite/device.cc @@ -0,0 +1,285 @@ +// Copyright 2024 The IREE Authors +// +// Licensed under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + +#include "iree-amd-aie/driver/xrt-lite/shim/linux/kmq/device.h" + +#include "iree-amd-aie/driver/xrt-lite/allocator.h" +#include "iree-amd-aie/driver/xrt-lite/api.h" +#include "iree-amd-aie/driver/xrt-lite/device.h" +#include "iree-amd-aie/driver/xrt-lite/direct_command_buffer.h" +#include "iree-amd-aie/driver/xrt-lite/nop_executable_cache.h" +#include "iree-amd-aie/driver/xrt-lite/nop_semaphore.h" +#include "iree-amd-aie/driver/xrt-lite/util.h" +#include "iree/hal/utils/deferred_command_buffer.h" +#include "iree/hal/utils/deferred_work_queue.h" + +#define ARENA_BLOCK_SIZE (32 * 1024) + +namespace { +extern const iree_hal_device_vtable_t iree_hal_xrt_lite_device_vtable; +} + +iree_hal_xrt_lite_device::iree_hal_xrt_lite_device( + const iree_hal_xrt_lite_device_params* options, + iree_allocator_t host_allocator) { + IREE_ASSERT_ARGUMENT(options); + IREE_TRACE_ZONE_BEGIN(z0); + + iree_hal_resource_initialize(&iree_hal_xrt_lite_device_vtable, &resource); + this->host_allocator = host_allocator; + shim_device = + new shim_xdna::device(options->n_core_rows, options->n_core_cols); + + iree_status_t status = iree_hal_xrt_lite_allocator_create( + host_allocator, shim_device, &device_allocator); + IREE_ASSERT(iree_status_is_ok(status)); + iree_arena_block_pool_initialize(ARENA_BLOCK_SIZE, host_allocator, + &block_pool); + + IREE_TRACE_ZONE_END(z0); +} + +static iree_status_t iree_hal_xrt_lite_device_create_executable_cache( + iree_hal_device_t* base_device, iree_string_view_t identifier, + iree_loop_t loop, iree_hal_executable_cache_t** out_executable_cache) { + 
IREE_TRACE_ZONE_BEGIN(z0); + + iree_hal_xrt_lite_device* device = IREE_HAL_XRT_LITE_CHECKED_VTABLE_CAST( + base_device, iree_hal_xrt_lite_device_vtable, iree_hal_xrt_lite_device); + + IREE_TRACE_ZONE_END(z0); + return iree_hal_xrt_lite_nop_executable_cache_create( + device->shim_device, identifier, device->host_allocator, + out_executable_cache); +} + +static iree_status_t iree_hal_xrt_lite_device_create_command_buffer( + iree_hal_device_t* base_device, iree_hal_command_buffer_mode_t mode, + iree_hal_command_category_t command_categories, + iree_hal_queue_affinity_t queue_affinity, iree_host_size_t binding_capacity, + iree_hal_command_buffer_t** out_command_buffer) { + IREE_TRACE_ZONE_BEGIN(z0); + + if (!iree_all_bits_set(mode, IREE_HAL_COMMAND_BUFFER_MODE_ONE_SHOT)) { + IREE_TRACE_ZONE_END(z0); + return iree_make_status(IREE_STATUS_UNIMPLEMENTED, + "unimplemented multi-shot command buffer"); + } + + iree_hal_xrt_lite_device* device = IREE_HAL_XRT_LITE_CHECKED_VTABLE_CAST( + base_device, iree_hal_xrt_lite_device_vtable, iree_hal_xrt_lite_device); + + IREE_TRACE_ZONE_END(z0); + return iree_hal_deferred_command_buffer_create( + device->device_allocator, mode, command_categories, binding_capacity, + &device->block_pool, device->host_allocator, out_command_buffer); +} + +static iree_status_t iree_hal_xrt_lite_device_create_semaphore( + iree_hal_device_t* base_device, uint64_t initial_value, + iree_hal_semaphore_flags_t flags, iree_hal_semaphore_t** out_semaphore) { + IREE_TRACE_ZONE_BEGIN(z0); + + iree_hal_xrt_lite_device* device = IREE_HAL_XRT_LITE_CHECKED_VTABLE_CAST( + base_device, iree_hal_xrt_lite_device_vtable, iree_hal_xrt_lite_device); + + IREE_TRACE_ZONE_END(z0); + return iree_hal_xrt_lite_semaphore_create(device->host_allocator, + initial_value, out_semaphore); +} + +static iree_status_t iree_hal_xrt_lite_device_queue_execute( + iree_hal_device_t* base_device, iree_hal_queue_affinity_t queue_affinity, + const iree_hal_semaphore_list_t wait_semaphore_list, + 
const iree_hal_semaphore_list_t signal_semaphore_list, + iree_host_size_t command_buffer_count, + iree_hal_command_buffer_t* const* command_buffers, + iree_hal_buffer_binding_table_t const* binding_tables) { + IREE_TRACE_ZONE_BEGIN(z0); + + iree_hal_xrt_lite_device* device = IREE_HAL_XRT_LITE_CHECKED_VTABLE_CAST( + base_device, iree_hal_xrt_lite_device_vtable, iree_hal_xrt_lite_device); + + for (iree_host_size_t i = 0; i < command_buffer_count; i++) { + iree_hal_command_buffer_t* xrt_command_buffer = nullptr; + iree_hal_command_buffer_mode_t mode = + IREE_HAL_COMMAND_BUFFER_MODE_ONE_SHOT | + IREE_HAL_COMMAND_BUFFER_MODE_ALLOW_INLINE_EXECUTION | + IREE_HAL_COMMAND_BUFFER_MODE_UNVALIDATED; + IREE_RETURN_AND_END_ZONE_IF_ERROR( + z0, iree_hal_xrt_lite_direct_command_buffer_create( + device, mode, IREE_HAL_COMMAND_CATEGORY_ANY, + /*binding_capacity=*/0, &device->block_pool, + device->host_allocator, &xrt_command_buffer)); + IREE_RETURN_AND_END_ZONE_IF_ERROR( + z0, iree_hal_deferred_command_buffer_apply( + command_buffers[i], xrt_command_buffer, + iree_hal_buffer_binding_table_empty())); + } + + IREE_TRACE_ZONE_END(z0); + return iree_ok_status(); +} + +static void iree_hal_xrt_lite_device_replace_device_allocator( + iree_hal_device_t* base_device, iree_hal_allocator_t* new_allocator) { + IREE_TRACE_ZONE_BEGIN(z0); + + iree_hal_allocator_retain(new_allocator); + iree_hal_xrt_lite_device* device = IREE_HAL_XRT_LITE_CHECKED_VTABLE_CAST( + base_device, iree_hal_xrt_lite_device_vtable, iree_hal_xrt_lite_device); + device->device_allocator = new_allocator; + iree_hal_allocator_release(device->device_allocator); + + IREE_TRACE_ZONE_END(z0); +} + +static iree_status_t iree_hal_xrt_lite_device_query_i64( + iree_hal_device_t* base_device, iree_string_view_t category, + iree_string_view_t key, int64_t* out_value) { + IREE_TRACE_ZONE_BEGIN(z0); + + *out_value = 0; + iree_hal_xrt_lite_device* device = IREE_HAL_XRT_LITE_CHECKED_VTABLE_CAST( + base_device, 
iree_hal_xrt_lite_device_vtable, iree_hal_xrt_lite_device); + if (iree_string_view_equal(category, IREE_SV("hal.device.id"))) { + *out_value = + iree_string_view_match_pattern(device->identifier, key) ? 1 : 0; + return iree_ok_status(); + } + + if (iree_string_view_equal(category, IREE_SV("hal.executable.format"))) { + *out_value = iree_string_view_equal(key, IREE_SV("amdaie-pdi-fb")) ? 1 : 0; + return iree_ok_status(); + } + + IREE_TRACE_ZONE_END(z0); + return iree_make_status(IREE_STATUS_UNIMPLEMENTED, "unsupported query"); +} + +static iree_status_t iree_hal_xrt_lite_device_queue_alloca( + iree_hal_device_t* base_device, iree_hal_queue_affinity_t queue_affinity, + const iree_hal_semaphore_list_t wait_semaphore_list, + const iree_hal_semaphore_list_t signal_semaphore_list, + iree_hal_allocator_pool_t pool, iree_hal_buffer_params_t params, + iree_device_size_t allocation_size, + iree_hal_buffer_t** IREE_RESTRICT out_buffer) { + IREE_TRACE_ZONE_BEGIN(z0); + + iree_hal_xrt_lite_device* device = IREE_HAL_XRT_LITE_CHECKED_VTABLE_CAST( + base_device, iree_hal_xrt_lite_device_vtable, iree_hal_xrt_lite_device); + IREE_RETURN_AND_END_ZONE_IF_ERROR( + z0, iree_hal_semaphore_list_wait(wait_semaphore_list, + iree_infinite_timeout())); + IREE_RETURN_AND_END_ZONE_IF_ERROR( + z0, iree_hal_allocator_allocate_buffer(device->device_allocator, params, + allocation_size, out_buffer)); + IREE_RETURN_AND_END_ZONE_IF_ERROR( + z0, iree_hal_semaphore_list_signal(signal_semaphore_list)); + + IREE_TRACE_ZONE_END(z0); + return iree_ok_status(); +} + +static iree_string_view_t iree_hal_xrt_lite_device_id( + iree_hal_device_t* base_device) { + IREE_TRACE_ZONE_BEGIN(z0); + + iree_hal_xrt_lite_device* device = IREE_HAL_XRT_LITE_CHECKED_VTABLE_CAST( + base_device, iree_hal_xrt_lite_device_vtable, iree_hal_xrt_lite_device); + + IREE_TRACE_ZONE_END(z0); + return device->identifier; +} + +static void iree_hal_xrt_lite_device_destroy(iree_hal_device_t* base_device) { + IREE_TRACE_ZONE_BEGIN(z0); + + 
iree_hal_xrt_lite_device* device = IREE_HAL_XRT_LITE_CHECKED_VTABLE_CAST( + base_device, iree_hal_xrt_lite_device_vtable, iree_hal_xrt_lite_device); + + iree_hal_allocator_release(device->device_allocator); + delete device->shim_device; + iree_allocator_free(device->host_allocator, device); + + IREE_TRACE_ZONE_END(z0); +}; + +static iree_allocator_t iree_hal_xrt_lite_device_host_allocator( + iree_hal_device_t* base_device) { + IREE_TRACE_ZONE_BEGIN(z0); + + iree_hal_xrt_lite_device* device = IREE_HAL_XRT_LITE_CHECKED_VTABLE_CAST( + base_device, iree_hal_xrt_lite_device_vtable, iree_hal_xrt_lite_device); + + IREE_TRACE_ZONE_END(z0); + return device->host_allocator; +} + +static iree_hal_allocator_t* iree_hal_xrt_lite_device_device_allocator( + iree_hal_device_t* base_device) { + IREE_TRACE_ZONE_BEGIN(z0); + + iree_hal_xrt_lite_device* device = IREE_HAL_XRT_LITE_CHECKED_VTABLE_CAST( + base_device, iree_hal_xrt_lite_device_vtable, iree_hal_xrt_lite_device); + + IREE_TRACE_ZONE_END(z0); + return device->device_allocator; +} + +void iree_hal_xrt_lite_device_options_initialize( + iree_hal_xrt_lite_device_params* out_options) { + IREE_TRACE_ZONE_BEGIN(z0); + + memset(out_options, 0, sizeof(*out_options)); + + IREE_TRACE_ZONE_END(z0); +} + +iree_status_t iree_hal_xrt_lite_device_create( + iree_string_view_t identifier, + const iree_hal_xrt_lite_device_params* options, + iree_allocator_t host_allocator, iree_hal_device_t** out_device) { + IREE_ASSERT_ARGUMENT(options); + IREE_ASSERT_ARGUMENT(out_device); + *out_device = nullptr; + + IREE_TRACE_ZONE_BEGIN(z0); + + iree_hal_xrt_lite_device* device = nullptr; + iree_host_size_t total_size = iree_sizeof_struct(*device) + identifier.size; + IREE_RETURN_IF_ERROR(iree_allocator_malloc( + host_allocator, total_size, reinterpret_cast(&device))); + device = new (device) iree_hal_xrt_lite_device(options, host_allocator); + iree_string_view_append_to_buffer( + identifier, &device->identifier, + reinterpret_cast(device) + total_size - 
identifier.size); + // TODO(max): device id + *out_device = reinterpret_cast(device); + + IREE_TRACE_ZONE_END(z0); + return iree_ok_status(); +} + +namespace { +const iree_hal_device_vtable_t iree_hal_xrt_lite_device_vtable = { + .destroy = iree_hal_xrt_lite_device_destroy, + .id = iree_hal_xrt_lite_device_id, + .host_allocator = iree_hal_xrt_lite_device_host_allocator, + .device_allocator = iree_hal_xrt_lite_device_device_allocator, + .replace_device_allocator = + iree_hal_xrt_lite_device_replace_device_allocator, + .query_i64 = iree_hal_xrt_lite_device_query_i64, + .create_command_buffer = iree_hal_xrt_lite_device_create_command_buffer, + .create_executable_cache = iree_hal_xrt_lite_device_create_executable_cache, + .create_semaphore = iree_hal_xrt_lite_device_create_semaphore, + .queue_alloca = iree_hal_xrt_lite_device_queue_alloca, + .queue_execute = iree_hal_xrt_lite_device_queue_execute, + .profiling_begin = unimplemented_ok_status, + .profiling_flush = unimplemented_ok_status, + .profiling_end = unimplemented_ok_status, +}; +} diff --git a/runtime/src/iree-amd-aie/driver/xrt-lite/device.h b/runtime/src/iree-amd-aie/driver/xrt-lite/device.h new file mode 100644 index 000000000..ad3141e88 --- /dev/null +++ b/runtime/src/iree-amd-aie/driver/xrt-lite/device.h @@ -0,0 +1,33 @@ +// Copyright 2024 The IREE Authors +// +// Licensed under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
// HAL device state for the xrt-lite driver.
//
// Instances are constructed in place inside a host allocation that also holds
// the identifier characters after the struct (see
// iree_hal_xrt_lite_device_create).
struct iree_hal_xrt_lite_device {
  // HAL resource header; initialized with the device vtable by the
  // constructor.
  iree_hal_resource_t resource;
  // Host allocator stored by the constructor.
  iree_allocator_t host_allocator;
  // TODO(max): not used because "device allocations" are performed through
  // device
  iree_hal_allocator_t* device_allocator;
  // block pool used for command buffer allocations, uses a larger block size
  // since command buffers can contain inlined data
  iree_arena_block_pool_t block_pool;
  // Shim device handle; allocated with `new` by the constructor.
  shim_xdna::device* shim_device;
  // should come last; see the definition of total_size below in
  // iree_hal_xrt_lite_device_create
  iree_string_view_t identifier;

  iree_hal_xrt_lite_device(const iree_hal_xrt_lite_device_params* options,
                           iree_allocator_t host_allocator);
};
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + +#include "iree-amd-aie/driver/xrt-lite/direct_command_buffer.h" + +#include "iree-amd-aie/driver/xrt-lite/buffer.h" +#include "iree-amd-aie/driver/xrt-lite/executable.h" +#include "iree-amd-aie/driver/xrt-lite/shim/linux/kmq/hwq.h" +#include "iree-amd-aie/driver/xrt-lite/shim/linux/kmq/kernel.h" +#include "iree-amd-aie/driver/xrt-lite/util.h" +#include "iree/hal/utils/resource_set.h" + +struct iree_hal_xrt_lite_direct_command_buffer { + iree_hal_command_buffer_t base; + iree_allocator_t host_allocator; + // A resource set to maintain references to all resources used within the + // command buffer. Reset on each begin. + iree_hal_resource_set_t* resource_set; + // Staging arena used for host->device transfers. + iree_arena_allocator_t arena; + + iree_hal_xrt_lite_device* device; +}; + +namespace { +extern const iree_hal_command_buffer_vtable_t + iree_hal_xrt_lite_direct_command_buffer_vtable; +} // namespace + +iree_status_t iree_hal_xrt_lite_direct_command_buffer_create( + iree_hal_xrt_lite_device* device, iree_hal_command_buffer_mode_t mode, + iree_hal_command_category_t command_categories, + iree_host_size_t binding_capacity, iree_arena_block_pool_t* block_pool, + iree_allocator_t host_allocator, + iree_hal_command_buffer_t** out_command_buffer) { + IREE_ASSERT_ARGUMENT(device); + IREE_ASSERT_ARGUMENT(out_command_buffer); + *out_command_buffer = nullptr; + if (binding_capacity > 0) { + // TODO(#10144): support indirect command buffers with binding tables. 
+ return iree_make_status(IREE_STATUS_UNIMPLEMENTED, + "indirect command buffers not yet implemented"); + } + + IREE_TRACE_ZONE_BEGIN(z0); + + iree_hal_xrt_lite_direct_command_buffer* command_buffer = nullptr; + IREE_RETURN_AND_END_ZONE_IF_ERROR( + z0, + iree_allocator_malloc(host_allocator, + sizeof(*command_buffer) + + iree_hal_command_buffer_validation_state_size( + mode, binding_capacity), + reinterpret_cast(&command_buffer))); + iree_hal_command_buffer_initialize( + device->device_allocator, mode, command_categories, + IREE_HAL_QUEUE_AFFINITY_ANY, binding_capacity, + reinterpret_cast(command_buffer) + sizeof(*command_buffer), + &iree_hal_xrt_lite_direct_command_buffer_vtable, &command_buffer->base); + command_buffer->host_allocator = host_allocator; + command_buffer->device = device; + iree_arena_initialize(block_pool, &command_buffer->arena); + iree_status_t status = + iree_hal_resource_set_allocate(block_pool, &command_buffer->resource_set); + if (iree_status_is_ok(status)) { + *out_command_buffer = &command_buffer->base; + } else { + iree_hal_command_buffer_release(&command_buffer->base); + } + + IREE_TRACE_ZONE_END(z0); + + return status; +} + +static void iree_hal_xrt_lite_direct_command_buffer_destroy( + iree_hal_command_buffer_t* base_command_buffer) { + IREE_TRACE_ZONE_BEGIN(z0); + + iree_hal_xrt_lite_direct_command_buffer* command_buffer = + IREE_HAL_XRT_LITE_CHECKED_VTABLE_CAST( + base_command_buffer, iree_hal_xrt_lite_direct_command_buffer_vtable, + iree_hal_xrt_lite_direct_command_buffer); + iree_allocator_t host_allocator = command_buffer->host_allocator; + iree_hal_resource_set_free(command_buffer->resource_set); + iree_arena_deinitialize(&command_buffer->arena); + iree_allocator_free(host_allocator, command_buffer); + + IREE_TRACE_ZONE_END(z0); +} + +static iree_status_t iree_hal_xrt_lite_direct_command_buffer_update_buffer( + iree_hal_command_buffer_t* base_command_buffer, const void* source_buffer, + iree_host_size_t source_offset, 
// Runs one kernel dispatch synchronously on the device.
//
// Steps, in order:
//   1. Look up the kernel parameters for |entry_point| in the executable.
//   2. Copy the kernel's instruction words into a buffer object and sync it
//      host -> device.
//   3. Create a hardware context from the kernel's PDI, open its CU context,
//      and build the exec buffer arguments: opcode, instruction buffer,
//      instruction count, then one buffer object per binding.
//   4. Issue the command on the context's hardware queue and wait for it.
//   5. Sync every bound buffer device -> host.
//
// |workgroup_count|, |constants| and |flags| are not read by this
// implementation, and neither are the offset/length of each binding: the
// whole allocated buffer object is passed as the kernel argument.
static iree_status_t iree_hal_xrt_lite_direct_command_buffer_dispatch(
    iree_hal_command_buffer_t* base_command_buffer,
    iree_hal_executable_t* base_executable, int32_t entry_point,
    const uint32_t workgroup_count[3], iree_const_byte_span_t constants,
    iree_hal_buffer_ref_list_t bindings, iree_hal_dispatch_flags_t flags) {
  IREE_TRACE_ZONE_BEGIN(z0);

  iree_hal_xrt_lite_direct_command_buffer* command_buffer =
      IREE_HAL_XRT_LITE_CHECKED_VTABLE_CAST(
          base_command_buffer, iree_hal_xrt_lite_direct_command_buffer_vtable,
          iree_hal_xrt_lite_direct_command_buffer);
  // Lookup kernel parameters used for side-channeling additional launch
  // information from the compiler.
  iree_hal_xrt_lite_executable* executable =
      iree_hal_xrt_lite_executable_cast(base_executable);
  // NOTE(review): entry_point is used as an index without a bounds check —
  // confirm callers guarantee it is a valid ordinal.
  iree_hal_xrt_lite_kernel_params kernel_params =
      executable->entry_points[entry_point];

  // Record the executable in the command buffer's resource set.
  IREE_RETURN_AND_END_ZONE_IF_ERROR(
      z0, iree_hal_resource_set_insert(command_buffer->resource_set, 1,
                                       &executable));

  // Stage the instruction words in a buffer object.
  size_t ctrl_code_size = kernel_params.asm_inst.size() * sizeof(uint32_t);
  auto bo_ctrl_code = command_buffer->device->shim_device->alloc_bo(
      ctrl_code_size, XCL_BO_FLAGS_CACHEABLE);
  uint32_t* instr_buffer = static_cast<uint32_t*>(bo_ctrl_code->map());
  memcpy(instr_buffer, kernel_params.asm_inst.data(), ctrl_code_size);
  bo_ctrl_code->sync(shim_xdna::direction::host2device);

  shim_xdna::kernel ebuf(command_buffer->device->shim_device->get_pdev(),
                         ERT_START_CU);
  // A new hardware context is created for every dispatch.
  shim_xdna::hw_ctx context =
      command_buffer->device->shim_device->create_hw_context(
          kernel_params.pdi, kernel_params.kernel_name);
  shim_xdna::cuidx_t cu_idx =
      context.open_cu_context(kernel_params.kernel_name);

  ebuf.set_cu_idx(cu_idx);
  // NOTE(review): the meaning of opcode 3 is not visible here — confirm
  // against the shim/firmware definitions.
  unsigned int opcode = 3;
  ebuf.add_arg_64(opcode);
  ebuf.add_arg_bo(*bo_ctrl_code);
  // Instruction count in 32-bit words (not bytes).
  ebuf.add_arg_32(kernel_params.asm_inst.size());

  // Append one kernel argument per binding, in binding order.
  for (iree_host_size_t j = 0; j < bindings.count; ++j) {
    shim_xdna::bo* bo = iree_hal_xrt_lite_buffer_handle(
        iree_hal_buffer_allocated_buffer(bindings.values[j].buffer));
    ebuf.add_arg_bo(*bo);
  }

  // Submit and wait for the command before touching the results.
  shim_xdna::hw_q* hwq = context.get_hw_queue();
  hwq->issue_command(ebuf.get_exec_buf_bo());
  hwq->wait_command(ebuf.get_exec_buf_bo(), 0);

  for (iree_host_size_t j = 0; j < bindings.count; ++j) {
    shim_xdna::bo* bo = iree_hal_xrt_lite_buffer_handle(
        iree_hal_buffer_allocated_buffer(bindings.values[j].buffer));
    // TODO(max): this should be happening automatically via a call to some
    // buffer API that performs the sync (maybe invalidate_range)
    bo->sync(shim_xdna::direction::device2host);
  }

  IREE_TRACE_ZONE_END(z0);
  return iree_ok_status();
}
+iree_status_t iree_hal_xrt_lite_direct_command_buffer_create( + iree_hal_xrt_lite_device* device, iree_hal_command_buffer_mode_t mode, + iree_hal_command_category_t command_categories, + iree_host_size_t binding_capacity, iree_arena_block_pool_t* block_pool, + iree_allocator_t host_allocator, + iree_hal_command_buffer_t** out_command_buffer); + +#endif // IREE_AMD_AIE_DRIVER_XRT_LITE_XRT_LITE_COMMAND_BUFFER_H_ diff --git a/runtime/src/iree-amd-aie/driver/xrt-lite/driver.cc b/runtime/src/iree-amd-aie/driver/xrt-lite/driver.cc new file mode 100644 index 000000000..3dbba529f --- /dev/null +++ b/runtime/src/iree-amd-aie/driver/xrt-lite/driver.cc @@ -0,0 +1,134 @@ +// Copyright 2024 The IREE Authors +// +// Licensed under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + +#include "iree-amd-aie/driver/xrt-lite/api.h" +#include "iree-amd-aie/driver/xrt-lite/util.h" + +#define IREE_HAL_XRT_LITE_DEVICE_ID_DEFAULT 0 + +struct iree_hal_xrt_lite_driver { + iree_hal_resource_t resource; + iree_allocator_t host_allocator; + iree_hal_xrt_lite_driver_options options; + // + trailing identifier string storage + iree_string_view_t identifier; +}; + +namespace { +extern const iree_hal_driver_vtable_t iree_hal_xrt_lite_driver_vtable; +} + +void iree_hal_xrt_lite_driver_options_initialize( + iree_hal_xrt_lite_driver_options* out_options) { + IREE_TRACE_ZONE_BEGIN(z0); + + memset(out_options, 0, sizeof(*out_options)); + iree_hal_xrt_lite_device_options_initialize(&out_options->device_params); + + IREE_TRACE_ZONE_END(z0); +} + +IREE_API_EXPORT iree_status_t iree_hal_xrt_lite_driver_create( + iree_string_view_t identifier, + const iree_hal_xrt_lite_driver_options* options, + const iree_hal_xrt_lite_device_params* device_params, + iree_allocator_t host_allocator, iree_hal_driver_t** out_driver) { + IREE_ASSERT_ARGUMENT(options); + IREE_ASSERT_ARGUMENT(out_driver); + 
*out_driver = nullptr; + IREE_TRACE_ZONE_BEGIN(z0); + + iree_hal_xrt_lite_driver* driver = nullptr; + iree_host_size_t total_size = sizeof(*driver) + identifier.size; + IREE_RETURN_AND_END_ZONE_IF_ERROR( + z0, iree_allocator_malloc(host_allocator, total_size, + reinterpret_cast(&driver))); + iree_hal_resource_initialize(&iree_hal_xrt_lite_driver_vtable, + &driver->resource); + driver->host_allocator = host_allocator; + iree_string_view_append_to_buffer( + identifier, &driver->identifier, + reinterpret_cast(driver) + total_size - identifier.size); + memcpy(&driver->options, options, sizeof(*options)); + memcpy(&driver->options.device_params, device_params, sizeof(*device_params)); + *out_driver = reinterpret_cast(driver); + + IREE_TRACE_ZONE_END(z0); + return iree_ok_status(); +} + +static void iree_hal_xrt_lite_driver_destroy(iree_hal_driver_t* base_driver) { + iree_hal_xrt_lite_driver* driver = IREE_HAL_XRT_LITE_CHECKED_VTABLE_CAST( + base_driver, iree_hal_xrt_lite_driver_vtable, iree_hal_xrt_lite_driver); + iree_allocator_t host_allocator = driver->host_allocator; + IREE_TRACE_ZONE_BEGIN(z0); + + iree_allocator_free(host_allocator, driver); + + IREE_TRACE_ZONE_END(z0); +} + +static iree_status_t iree_hal_xrt_lite_driver_query_available_devices( + iree_hal_driver_t* base_driver, iree_allocator_t host_allocator, + iree_host_size_t* out_device_info_count, + iree_hal_device_info_t** out_device_infos) { + IREE_TRACE_ZONE_BEGIN(z0); + + static const iree_hal_device_info_t device_infos[1] = { + { + .device_id = IREE_HAL_XRT_LITE_DEVICE_ID_DEFAULT, + .name = iree_string_view_literal("default"), + }, + }; + *out_device_info_count = IREE_ARRAYSIZE(device_infos); + + IREE_TRACE_ZONE_END(z0); + return iree_allocator_clone( + host_allocator, + iree_make_const_byte_span(device_infos, sizeof(device_infos)), + reinterpret_cast(out_device_infos)); +} + +static iree_status_t iree_hal_xrt_lite_driver_create_device_by_id( + iree_hal_driver_t* base_driver, iree_hal_device_id_t 
device_id, + iree_host_size_t param_count, const iree_string_pair_t* params, + iree_allocator_t host_allocator, iree_hal_device_t** out_device) { + IREE_TRACE_ZONE_BEGIN(z0); + + iree_hal_xrt_lite_driver* driver = IREE_HAL_XRT_LITE_CHECKED_VTABLE_CAST( + base_driver, iree_hal_xrt_lite_driver_vtable, iree_hal_xrt_lite_driver); + iree_hal_xrt_lite_device_params options = driver->options.device_params; + + IREE_TRACE_ZONE_END(z0); + return iree_hal_xrt_lite_device_create(driver->identifier, &options, + host_allocator, out_device); +} + +static iree_status_t iree_hal_xrt_lite_driver_create_device_by_path( + iree_hal_driver_t* base_driver, iree_string_view_t driver_name, + iree_string_view_t device_path, iree_host_size_t param_count, + const iree_string_pair_t* params, iree_allocator_t host_allocator, + iree_hal_device_t** out_device) { + IREE_TRACE_ZONE_BEGIN(z0); + + iree_hal_xrt_lite_driver* driver = IREE_HAL_XRT_LITE_CHECKED_VTABLE_CAST( + base_driver, iree_hal_xrt_lite_driver_vtable, iree_hal_xrt_lite_driver); + iree_hal_xrt_lite_device_params options = driver->options.device_params; + + IREE_TRACE_ZONE_END(z0); + return iree_hal_xrt_lite_device_create(driver->identifier, &options, + host_allocator, out_device); +} + +namespace { +const iree_hal_driver_vtable_t iree_hal_xrt_lite_driver_vtable = { + .destroy = iree_hal_xrt_lite_driver_destroy, + .query_available_devices = iree_hal_xrt_lite_driver_query_available_devices, + .dump_device_info = unimplemented_ok_status, + .create_device_by_id = iree_hal_xrt_lite_driver_create_device_by_id, + .create_device_by_path = iree_hal_xrt_lite_driver_create_device_by_path, +}; +} diff --git a/runtime/src/iree-amd-aie/driver/xrt-lite/executable.cc b/runtime/src/iree-amd-aie/driver/xrt-lite/executable.cc new file mode 100644 index 000000000..7e846f5ab --- /dev/null +++ b/runtime/src/iree-amd-aie/driver/xrt-lite/executable.cc @@ -0,0 +1,228 @@ +// Copyright 2024 The IREE Authors +// +// Licensed under the Apache License v2.0 with 
LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + +#include "iree-amd-aie/driver/xrt-lite/executable.h" + +#include + +#include "iree-amd-aie/driver/xrt-lite/shim/linux/kmq/device.h" +#include "iree-amd-aie/driver/xrt-lite/util.h" +#include "iree-amd-aie/schemas/pdi_executable_def_reader.h" +#include "iree-amd-aie/schemas/pdi_executable_def_verifier.h" +#include "iree/base/api.h" + +namespace { +extern const iree_hal_executable_vtable_t iree_hal_xrt_lite_executable_vtable; +} // namespace + +iree_hal_xrt_lite_executable* iree_hal_xrt_lite_executable_cast( + iree_hal_executable_t* base_executable) { + return IREE_HAL_XRT_LITE_CHECKED_VTABLE_CAST( + base_executable, iree_hal_xrt_lite_executable_vtable, + iree_hal_xrt_lite_executable); +} + +static iree_status_t +iree_amd_aie_hal_xrt_lite_native_executable_flatbuffer_verify( + iree_const_byte_span_t flatbuffer_data) { + IREE_TRACE_ZONE_BEGIN(z0); + + if (!flatbuffer_data.data || flatbuffer_data.data_length < 16) { + IREE_TRACE_ZONE_END(z0); + return iree_make_status( + IREE_STATUS_INVALID_ARGUMENT, + "flatbuffer data is not present or less than 16 bytes (%zu total)", + flatbuffer_data.data_length); + } + + int verify_ret = iree_amd_aie_hal_xrt_lite_ExecutableDef_verify_as_root( + flatbuffer_data.data, flatbuffer_data.data_length); + if (verify_ret != flatcc_verify_ok) { + IREE_TRACE_ZONE_END(z0); + return iree_make_status(IREE_STATUS_INVALID_ARGUMENT, + "flatbuffer verification failed: %s", + flatcc_verify_error_string(verify_ret)); + } + + iree_amd_aie_hal_xrt_lite_ExecutableDef_table_t executable_def = + iree_amd_aie_hal_xrt_lite_ExecutableDef_as_root(flatbuffer_data.data); + + flatbuffers_string_vec_t entry_points_vec = + iree_amd_aie_hal_xrt_lite_ExecutableDef_entry_points_get(executable_def); + size_t entry_point_count = flatbuffers_string_vec_len(entry_points_vec); + if (entry_point_count == 0) { + IREE_TRACE_ZONE_END(z0); 
+ return iree_make_status(IREE_STATUS_INVALID_ARGUMENT, + "no entry points found in the executable"); + } + for (size_t i = 0; i < entry_point_count; ++i) { + if (!flatbuffers_string_len( + flatbuffers_string_vec_at(entry_points_vec, i))) { + IREE_TRACE_ZONE_END(z0); + return iree_make_status(IREE_STATUS_INVALID_ARGUMENT, + "executable entry point %zu has no name", i); + } + } + + iree_amd_aie_hal_xrt_lite_PdiDef_vec_t pdis = + iree_amd_aie_hal_xrt_lite_ExecutableDef_pdis_get(executable_def); + size_t number_pdi = iree_amd_aie_hal_xrt_lite_PdiDef_vec_len(pdis); + if (number_pdi == 0) { + IREE_TRACE_ZONE_END(z0); + return iree_make_status(IREE_STATUS_INVALID_ARGUMENT, "no pdi present"); + } + + iree_amd_aie_hal_xrt_lite_AsmInstDef_vec_t asm_instr = + iree_amd_aie_hal_xrt_lite_ExecutableDef_asm_instrs_get(executable_def); + size_t number_asm_instr = + iree_amd_aie_hal_xrt_lite_AsmInstDef_vec_len(asm_instr); + if (number_asm_instr != entry_point_count) { + IREE_TRACE_ZONE_END(z0); + return iree_make_status(IREE_STATUS_INVALID_ARGUMENT, + "number of entry points (%zu) and number of asm " + "instructions (%zu) mismatched", + entry_point_count, number_asm_instr); + } + + IREE_TRACE_ZONE_END(z0); + return iree_ok_status(); +} + +iree_status_t iree_hal_xrt_lite_native_executable_create( + shim_xdna::device* shim_device, + const iree_hal_executable_params_t* executable_params, + iree_allocator_t host_allocator, iree_hal_executable_t** out_executable) { + IREE_ASSERT_ARGUMENT(executable_params); + IREE_ASSERT_ARGUMENT(out_executable); + IREE_TRACE_ZONE_BEGIN(z0); + + *out_executable = nullptr; + iree_hal_xrt_lite_executable* executable = nullptr; + + IREE_RETURN_AND_END_ZONE_IF_ERROR( + z0, iree_amd_aie_hal_xrt_lite_native_executable_flatbuffer_verify( + executable_params->executable_data)); + + iree_amd_aie_hal_xrt_lite_ExecutableDef_table_t executable_def = + iree_amd_aie_hal_xrt_lite_ExecutableDef_as_root( + executable_params->executable_data.data); + 
flatbuffers_uint32_vec_t pdi_indices_vec = + iree_amd_aie_hal_xrt_lite_ExecutableDef_pdi_indices_get(executable_def); + flatbuffers_uint32_vec_t asm_instr_indices_vec = + iree_amd_aie_hal_xrt_lite_ExecutableDef_asm_instr_indices_get( + executable_def); + flatbuffers_string_vec_t entry_points_vec = + iree_amd_aie_hal_xrt_lite_ExecutableDef_entry_points_get(executable_def); + iree_amd_aie_hal_xrt_lite_PdiDef_vec_t pdis_vec = + iree_amd_aie_hal_xrt_lite_ExecutableDef_pdis_get(executable_def); + iree_amd_aie_hal_xrt_lite_AsmInstDef_vec_t asm_instrs_vec = + iree_amd_aie_hal_xrt_lite_ExecutableDef_asm_instrs_get(executable_def); + iree_host_size_t entry_point_count = + flatbuffers_string_vec_len(entry_points_vec); + + iree_host_size_t total_entry_point_name_chars = 0; + IREE_TRACE({ + for (iree_host_size_t entry_ordinal = 0; entry_ordinal < entry_point_count; + entry_ordinal++) { + const char* entry_name = + flatbuffers_string_vec_at(entry_points_vec, entry_ordinal); + total_entry_point_name_chars += flatbuffers_string_len(entry_name); + } + }); + + iree_host_size_t total_size = + sizeof(*executable) + + entry_point_count * sizeof(executable->entry_points[0]) + + total_entry_point_name_chars; + IREE_RETURN_AND_END_ZONE_IF_ERROR( + z0, iree_allocator_malloc(host_allocator, total_size, + reinterpret_cast(&executable))); + IREE_TRACE(char* string_table_buffer = reinterpret_cast( + reinterpret_cast(executable) + sizeof(*executable) + + entry_point_count * sizeof(executable->entry_points[0]))); + + iree_hal_resource_initialize(&iree_hal_xrt_lite_executable_vtable, + &executable->resource); + executable->host_allocator = host_allocator; + executable->entry_point_count = entry_point_count; + for (iree_host_size_t entry_ordinal = 0; entry_ordinal < entry_point_count; + entry_ordinal++) { + iree_hal_xrt_lite_kernel_params* params = + &executable->entry_points[entry_ordinal]; + params->kernel_name = + flatbuffers_string_vec_at(entry_points_vec, entry_ordinal); + uint32_t pdi_index 
= + flatbuffers_uint32_vec_at(pdi_indices_vec, entry_ordinal); + iree_amd_aie_hal_xrt_lite_PdiDef_table_t pdi_def = + iree_amd_aie_hal_xrt_lite_PdiDef_vec_at(pdis_vec, pdi_index); + flatbuffers_string_t pdi_fb = + iree_amd_aie_hal_xrt_lite_PdiDef_pdi_get(pdi_def); + + std::vector pdiVector(pdi_fb, + pdi_fb + flatbuffers_string_len(pdi_fb)); + params->pdi = pdiVector; + uint32_t asm_instr_index = + flatbuffers_uint32_vec_at(asm_instr_indices_vec, entry_ordinal); + iree_amd_aie_hal_xrt_lite_AsmInstDef_table_t asminst_def = + iree_amd_aie_hal_xrt_lite_AsmInstDef_vec_at(asm_instrs_vec, + asm_instr_index); + flatbuffers_uint32_vec_t asm_inst = + iree_amd_aie_hal_xrt_lite_AsmInstDef_asm_inst_get(asminst_def); + std::vector asmVector( + asm_inst, asm_inst + flatbuffers_uint32_vec_len(asm_inst)); + params->asm_inst = asmVector; + + IREE_TRACE({ + memcpy(string_table_buffer, params->kernel_name.data(), + params->kernel_name.size()); + string_table_buffer += params->kernel_name.size(); + }); + + IREE_TRACE({ + if (iree_amd_aie_hal_xrt_lite_ExecutableDef_source_locations_is_present( + executable_def)) { + iree_amd_aie_hal_xrt_lite_FileLineLocDef_vec_t source_locs_vec = + iree_amd_aie_hal_xrt_lite_ExecutableDef_source_locations_get( + executable_def); + iree_amd_aie_hal_xrt_lite_FileLineLocDef_table_t source_loc = + iree_amd_aie_hal_xrt_lite_FileLineLocDef_vec_at(source_locs_vec, + entry_ordinal); + flatbuffers_string_t filename = + iree_amd_aie_hal_xrt_lite_FileLineLocDef_filename_get(source_loc); + uint32_t line = + iree_amd_aie_hal_xrt_lite_FileLineLocDef_line_get(source_loc); + params->source_filename = + iree_make_string_view(filename, flatbuffers_string_len(filename)); + params->source_line = line; + } + }); + } + + *out_executable = reinterpret_cast(executable); + + IREE_TRACE_ZONE_END(z0); + return iree_ok_status(); +} + +static void iree_hal_xrt_lite_native_executable_destroy( + iree_hal_executable_t* base_executable) { + IREE_TRACE_ZONE_BEGIN(z0); + + 
iree_hal_xrt_lite_executable* executable = + IREE_HAL_XRT_LITE_CHECKED_VTABLE_CAST(base_executable, + iree_hal_xrt_lite_executable_vtable, + iree_hal_xrt_lite_executable); + iree_allocator_t host_allocator = executable->host_allocator; + iree_allocator_free(host_allocator, executable); + + IREE_TRACE_ZONE_END(z0); +} + +namespace { +const iree_hal_executable_vtable_t iree_hal_xrt_lite_executable_vtable = { + .destroy = iree_hal_xrt_lite_native_executable_destroy, +}; +} // namespace diff --git a/runtime/src/iree-amd-aie/driver/xrt-lite/executable.h b/runtime/src/iree-amd-aie/driver/xrt-lite/executable.h new file mode 100644 index 000000000..1b6d7a58b --- /dev/null +++ b/runtime/src/iree-amd-aie/driver/xrt-lite/executable.h @@ -0,0 +1,47 @@ +// Copyright 2024 The IREE Authors +// +// Licensed under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + +#ifndef IREE_AMD_AIE_DRIVER_XRT_LITE_NATIVE_EXECUTABLE_H_ +#define IREE_AMD_AIE_DRIVER_XRT_LITE_NATIVE_EXECUTABLE_H_ + +#include + +#include "flatbuffers_common_reader.h" +#include "iree-amd-aie/driver/xrt-lite/shim/linux/kmq/bo.h" +#include "iree-amd-aie/driver/xrt-lite/shim/linux/kmq/device.h" +#include "iree-amd-aie/driver/xrt-lite/shim/linux/kmq/hwctx.h" +#include "iree/base/api.h" +#include "iree/base/tracing.h" +#include "iree/hal/api.h" + +struct iree_hal_xrt_lite_kernel_params { + std::vector pdi; + std::vector asm_inst; + std::string kernel_name; + IREE_TRACE(iree_string_view_t source_filename;) + IREE_TRACE(uint32_t source_line;) +}; + +struct iree_hal_xrt_lite_executable { + // Abstract resource used for injecting reference counting and vtable; must be + // at offset 0. 
+ iree_hal_resource_t resource; + iree_allocator_t host_allocator; + iree_host_size_t entry_point_count; + iree_hal_xrt_lite_kernel_params entry_points[16]; +}; + +// `out_executable` must be released by the caller (see +// iree_hal_executable_release). +iree_status_t iree_hal_xrt_lite_native_executable_create( + shim_xdna::device* shim_device, + const iree_hal_executable_params_t* executable_params, + iree_allocator_t host_allocator, iree_hal_executable_t** out_executable); + +iree_hal_xrt_lite_executable* iree_hal_xrt_lite_executable_cast( + iree_hal_executable_t* base_executable); + +#endif // IREE_AMD_AIE_DRIVER_XRT_LITE_NATIVE_EXECUTABLE_H_ diff --git a/runtime/src/iree-amd-aie/driver/xrt-lite/nop_executable_cache.cc b/runtime/src/iree-amd-aie/driver/xrt-lite/nop_executable_cache.cc new file mode 100644 index 000000000..20262e955 --- /dev/null +++ b/runtime/src/iree-amd-aie/driver/xrt-lite/nop_executable_cache.cc @@ -0,0 +1,107 @@ +// Copyright 2024 The IREE Authors +// +// Licensed under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + +#include "iree-amd-aie/driver/xrt-lite/nop_executable_cache.h" + +#include "iree-amd-aie/driver/xrt-lite/executable.h" +#include "iree-amd-aie/driver/xrt-lite/shim/linux/kmq/device.h" +#include "iree-amd-aie/driver/xrt-lite/util.h" +#include "iree/base/api.h" +#include "iree/base/tracing.h" + +namespace { +extern const iree_hal_executable_cache_vtable_t + iree_hal_xrt_lite_nop_executable_cache_vtable; +} // namespace + +struct iree_hal_xrt_lite_nop_executable_cache { + // Abstract resource used for injecting reference counting and vtable; must be + // at offset 0. 
+ iree_hal_resource_t resource; + shim_xdna::device* shim_device; + iree_allocator_t host_allocator; + + iree_hal_xrt_lite_nop_executable_cache(shim_xdna::device* shim_device, + iree_allocator_t host_allocator) + : shim_device(shim_device), host_allocator(host_allocator) { + IREE_TRACE_ZONE_BEGIN(z0); + + iree_hal_resource_initialize(&iree_hal_xrt_lite_nop_executable_cache_vtable, + &resource); + + IREE_TRACE_ZONE_END(z0); + } +}; + +iree_status_t iree_hal_xrt_lite_nop_executable_cache_create( + shim_xdna::device* shim_device, iree_string_view_t identifier, + iree_allocator_t host_allocator, + iree_hal_executable_cache_t** out_executable_cache) { + IREE_ASSERT_ARGUMENT(out_executable_cache); + *out_executable_cache = nullptr; + IREE_TRACE_ZONE_BEGIN(z0); + + iree_hal_xrt_lite_nop_executable_cache* executable_cache = nullptr; + IREE_RETURN_AND_END_ZONE_IF_ERROR( + z0, iree_allocator_malloc(host_allocator, sizeof(*executable_cache), + reinterpret_cast(&executable_cache))); + executable_cache = new (executable_cache) + iree_hal_xrt_lite_nop_executable_cache(shim_device, host_allocator); + *out_executable_cache = + reinterpret_cast(executable_cache); + + IREE_TRACE_ZONE_END(z0); + return iree_ok_status(); +} + +static void iree_hal_xrt_lite_nop_executable_cache_destroy( + iree_hal_executable_cache_t* base_executable_cache) { + IREE_TRACE_ZONE_BEGIN(z0); + + iree_hal_xrt_lite_nop_executable_cache* executable_cache = + IREE_HAL_XRT_LITE_CHECKED_VTABLE_CAST( + base_executable_cache, iree_hal_xrt_lite_nop_executable_cache_vtable, + iree_hal_xrt_lite_nop_executable_cache); + iree_allocator_free(executable_cache->host_allocator, executable_cache); + + IREE_TRACE_ZONE_END(z0); +} + +static bool iree_hal_xrt_lite_nop_executable_cache_can_prepare_format( + iree_hal_executable_cache_t* base_executable_cache, + iree_hal_executable_caching_mode_t caching_mode, + iree_string_view_t executable_format) { + return iree_string_view_equal(executable_format, + 
iree_make_cstring_view("PDIR")); +} + +static iree_status_t iree_hal_xrt_lite_nop_executable_cache_prepare_executable( + iree_hal_executable_cache_t* base_executable_cache, + const iree_hal_executable_params_t* executable_params, + iree_hal_executable_t** out_executable) { + IREE_TRACE_ZONE_BEGIN(z0); + + iree_hal_xrt_lite_nop_executable_cache* executable_cache = + IREE_HAL_XRT_LITE_CHECKED_VTABLE_CAST( + base_executable_cache, iree_hal_xrt_lite_nop_executable_cache_vtable, + iree_hal_xrt_lite_nop_executable_cache); + + IREE_TRACE_ZONE_END(z0); + return iree_hal_xrt_lite_native_executable_create( + executable_cache->shim_device, executable_params, + executable_cache->host_allocator, out_executable); +} + +namespace { +const iree_hal_executable_cache_vtable_t + iree_hal_xrt_lite_nop_executable_cache_vtable = { + .destroy = iree_hal_xrt_lite_nop_executable_cache_destroy, + .can_prepare_format = + iree_hal_xrt_lite_nop_executable_cache_can_prepare_format, + .prepare_executable = + iree_hal_xrt_lite_nop_executable_cache_prepare_executable, +}; +} // namespace diff --git a/runtime/src/iree-amd-aie/driver/xrt-lite/nop_executable_cache.h b/runtime/src/iree-amd-aie/driver/xrt-lite/nop_executable_cache.h new file mode 100644 index 000000000..ed4a998b1 --- /dev/null +++ b/runtime/src/iree-amd-aie/driver/xrt-lite/nop_executable_cache.h @@ -0,0 +1,21 @@ +// Copyright 2024 The IREE Authors +// +// Licensed under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + +#ifndef IREE_AMD_AIE_DRIVER_XRT_LITE_NOP_EXECUTABLE_CACHE_H_ +#define IREE_AMD_AIE_DRIVER_XRT_LITE_NOP_EXECUTABLE_CACHE_H_ + +#include "iree-amd-aie/driver/xrt-lite/shim/linux/kmq/device.h" +#include "iree/base/api.h" +#include "iree/hal/api.h" + +// `out_executable_cache` must be released by the caller (see +// iree_hal_executable_cache_release). 
+iree_status_t iree_hal_xrt_lite_nop_executable_cache_create( + shim_xdna::device* shim_device, iree_string_view_t identifier, + iree_allocator_t host_allocator, + iree_hal_executable_cache_t** out_executable_cache); + +#endif // IREE_AMD_AIE_DRIVER_XRT_LITE_NOP_EXECUTABLE_CACHE_H_ diff --git a/runtime/src/iree-amd-aie/driver/xrt-lite/nop_semaphore.cc b/runtime/src/iree-amd-aie/driver/xrt-lite/nop_semaphore.cc new file mode 100644 index 000000000..aedd01453 --- /dev/null +++ b/runtime/src/iree-amd-aie/driver/xrt-lite/nop_semaphore.cc @@ -0,0 +1,71 @@ +// Copyright 2024 The IREE Authors +// +// Licensed under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + +#include "iree-amd-aie/driver/xrt-lite/nop_semaphore.h" + +#include "iree/base/api.h" +#include "iree/hal/utils/semaphore_base.h" +#include "util.h" + +namespace { +extern const iree_hal_semaphore_vtable_t iree_hal_xrt_lite_semaphore_vtable; +} // namespace + +struct iree_hal_xrt_lite_semaphore { + iree_hal_semaphore_t base; + iree_atomic_int64_t value; + iree_allocator_t host_allocator; + + iree_hal_xrt_lite_semaphore(uint64_t initial_value, + iree_allocator_t host_allocator) + : value(initial_value), host_allocator(host_allocator) { + iree_hal_semaphore_initialize(&iree_hal_xrt_lite_semaphore_vtable, &base); + iree_atomic_store_int64(&value, initial_value, iree_memory_order_release); + } +}; + +iree_status_t iree_hal_xrt_lite_semaphore_create( + iree_allocator_t host_allocator, uint64_t initial_value, + iree_hal_semaphore_t** out_semaphore) { + IREE_ASSERT_ARGUMENT(out_semaphore); + IREE_TRACE_ZONE_BEGIN(z0); + + iree_hal_xrt_lite_semaphore* semaphore = nullptr; + IREE_RETURN_AND_END_ZONE_IF_ERROR( + z0, iree_allocator_malloc(host_allocator, sizeof(*semaphore), + reinterpret_cast(&semaphore))); + semaphore = new (semaphore) + iree_hal_xrt_lite_semaphore(initial_value, host_allocator); + 
*out_semaphore = &semaphore->base; + + IREE_TRACE_ZONE_END(z0); + return iree_ok_status(); +} + +static void iree_hal_xrt_lite_semaphore_destroy( + iree_hal_semaphore_t* base_semaphore) { + IREE_TRACE_ZONE_BEGIN(z0); + + iree_hal_xrt_lite_semaphore* semaphore = + IREE_HAL_XRT_LITE_CHECKED_VTABLE_CAST(base_semaphore, + iree_hal_xrt_lite_semaphore_vtable, + iree_hal_xrt_lite_semaphore); + iree_allocator_t host_allocator = semaphore->host_allocator; + iree_hal_semaphore_deinitialize(&semaphore->base); + iree_allocator_free(host_allocator, semaphore); + + IREE_TRACE_ZONE_END(z0); +} + +namespace { +const iree_hal_semaphore_vtable_t iree_hal_xrt_lite_semaphore_vtable = { + .destroy = iree_hal_xrt_lite_semaphore_destroy, + .query = unimplemented_ok_status, + .signal = unimplemented_ok_status, + .fail = unimplemented_ok_void, + .wait = unimplemented_ok_status, +}; +} // namespace diff --git a/runtime/src/iree-amd-aie/driver/xrt-lite/nop_semaphore.h b/runtime/src/iree-amd-aie/driver/xrt-lite/nop_semaphore.h new file mode 100644 index 000000000..01931b9a1 --- /dev/null +++ b/runtime/src/iree-amd-aie/driver/xrt-lite/nop_semaphore.h @@ -0,0 +1,19 @@ +// Copyright 2024 The IREE Authors +// +// Licensed under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + +#ifndef IREE_AMD_AIE_DRIVER_XRT_LITE_NOP_SEMAPHORE_H_ +#define IREE_AMD_AIE_DRIVER_XRT_LITE_NOP_SEMAPHORE_H_ + +#include + +#include "iree/base/api.h" +#include "iree/hal/api.h" + +iree_status_t iree_hal_xrt_lite_semaphore_create( + iree_allocator_t host_allocator, uint64_t initial_value, + iree_hal_semaphore_t** out_semaphore); + +#endif // IREE_AMD_AIE_DRIVER_XRT_LITE_NOP_SEMAPHORE_H_ diff --git a/runtime/src/iree-amd-aie/driver/xrt-lite/registration/CMakeLists.txt b/runtime/src/iree-amd-aie/driver/xrt-lite/registration/CMakeLists.txt new file mode 100644 index 000000000..c6e5ad98e --- /dev/null +++ b/runtime/src/iree-amd-aie/driver/xrt-lite/registration/CMakeLists.txt @@ -0,0 +1,23 @@ +# Copyright (c) 2024 Advanced Micro Devices, Inc. All Rights Reserved. +# +# Copyright 2024 The IREE Authors +# +# Licensed under the Apache License v2.0 with LLVM Exceptions. +# See https://llvm.org/LICENSE.txt for license information. +# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + +iree_cc_library( + NAME + registration + HDRS + "driver_module.h" + SRCS + "driver_module.c" + DEPS + iree::base + iree::hal + iree-amd-aie::driver::xrt-lite + DEFINES + "IREE_HAVE_HAL_XRT_LITE_DRIVER_MODULE=1" + PUBLIC +) diff --git a/runtime/src/iree-amd-aie/driver/xrt-lite/registration/driver_module.c b/runtime/src/iree-amd-aie/driver/xrt-lite/registration/driver_module.c new file mode 100644 index 000000000..72f4841b1 --- /dev/null +++ b/runtime/src/iree-amd-aie/driver/xrt-lite/registration/driver_module.c @@ -0,0 +1,155 @@ +// Copyright 2024 The IREE Authors +// +// Licensed under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + +#include "iree-amd-aie/driver/xrt-lite/registration/driver_module.h" + +#include "iree-amd-aie/driver/xrt-lite/api.h" +#include "iree/base/api.h" +#include "iree/base/internal/flags.h" + +IREE_FLAG(int32_t, xrt_lite_n_core_rows, 0, + "Number of core rows to use on NPU."); +IREE_FLAG(int32_t, xrt_lite_n_core_cols, 0, + "Number of core cols to use on NPU."); + +static const iree_string_view_t key_xrt_lite_n_core_rows = + iree_string_view_literal("xrt_lite_n_core_rows"); +static const iree_string_view_t key_xrt_lite_n_core_cols = + iree_string_view_literal("xrt_lite_n_core_cols"); + +static iree_status_t iree_hal_xrt_lite_driver_factory_enumerate( + void* self, iree_host_size_t* out_driver_info_count, + const iree_hal_driver_info_t** out_driver_infos) { + IREE_TRACE_ZONE_BEGIN(z0); + + static const iree_hal_driver_info_t default_driver_info = { + .driver_name = IREE_SVL("xrt-lite"), + .full_name = IREE_SVL("XRT-LITE driver (for AIE)"), + }; + *out_driver_info_count = 1; + *out_driver_infos = &default_driver_info; + + IREE_TRACE_ZONE_END(z0); + return iree_ok_status(); +} + +static iree_status_t iree_hal_xrt_lite_driver_parse_flags( + iree_string_pair_builder_t* builder) { + IREE_TRACE_ZONE_BEGIN(z0); + + IREE_RETURN_AND_END_ZONE_IF_ERROR( + z0, iree_string_pair_builder_add_int32(builder, key_xrt_lite_n_core_rows, + FLAG_xrt_lite_n_core_rows)); + IREE_RETURN_AND_END_ZONE_IF_ERROR( + z0, iree_string_pair_builder_add_int32(builder, key_xrt_lite_n_core_cols, + FLAG_xrt_lite_n_core_cols)); + + IREE_TRACE_ZONE_END(z0); + return iree_ok_status(); +} + +static iree_status_t iree_hal_xrt_lite_driver_populate_options( + iree_allocator_t host_allocator, + struct iree_hal_xrt_lite_driver_options* driver_options, + struct iree_hal_xrt_lite_device_params* device_params, + iree_host_size_t pairs_size, iree_string_pair_t* pairs) { + IREE_TRACE_ZONE_BEGIN(z0); + + for (iree_host_size_t i = 0; i < pairs_size; ++i) { + 
iree_string_view_t key = pairs[i].key; + iree_string_view_t value = pairs[i].value; + int32_t ivalue; + + if (iree_string_view_equal(key, key_xrt_lite_n_core_rows)) { + if (!iree_string_view_atoi_int32(value, &ivalue)) { + IREE_TRACE_ZONE_END(z0); + return iree_make_status( + IREE_STATUS_FAILED_PRECONDITION, + "Option 'key_xrt_lite_n_core_rows' expected to be int. Got: '%.*s'", + (int)value.size, value.data); + } + if (ivalue <= 0) { + IREE_TRACE_ZONE_END(z0); + return iree_make_status( + IREE_STATUS_FAILED_PRECONDITION, + "Option 'key_xrt_lite_n_core_rows' expected to be > 0. Got: '%.*s'", + (int)value.size, value.data); + } + device_params->n_core_rows = ivalue; + } else if (iree_string_view_equal(key, key_xrt_lite_n_core_cols)) { + if (!iree_string_view_atoi_int32(value, &ivalue)) { + IREE_TRACE_ZONE_END(z0); + return iree_make_status( + IREE_STATUS_FAILED_PRECONDITION, + "Option 'key_xrt_lite_n_core_cols' expected to be int. Got: '%.*s'", + (int)value.size, value.data); + } + if (ivalue <= 0) { + IREE_TRACE_ZONE_END(z0); + return iree_make_status( + IREE_STATUS_FAILED_PRECONDITION, + "Option 'key_xrt_lite_n_core_cols' expected to be > 0. 
Got: '%.*s'", + (int)value.size, value.data); + } + device_params->n_core_cols = ivalue; + } else { + IREE_TRACE_ZONE_END(z0); + return iree_make_status(IREE_STATUS_FAILED_PRECONDITION, + "Unrecognized options: %.*s", (int)key.size, + key.data); + } + } + + IREE_TRACE_ZONE_END(z0); + return iree_ok_status(); +} + +static iree_status_t iree_hal_xrt_lite_driver_factory_try_create( + void* self, iree_string_view_t driver_name, iree_allocator_t host_allocator, + iree_hal_driver_t** out_driver) { + IREE_TRACE_ZONE_BEGIN(z0); + + if (!iree_string_view_equal(driver_name, IREE_SV("xrt-lite"))) { + IREE_TRACE_ZONE_END(z0); + return iree_make_status(IREE_STATUS_UNAVAILABLE, + "no driver '%.*s' is provided by this factory", + (int)driver_name.size, driver_name.data); + } + + struct iree_hal_xrt_lite_driver_options driver_options; + iree_hal_xrt_lite_driver_options_initialize(&driver_options); + struct iree_hal_xrt_lite_device_params device_params; + iree_hal_xrt_lite_device_options_initialize(&device_params); + + iree_string_pair_builder_t flag_option_builder; + iree_string_pair_builder_initialize(host_allocator, &flag_option_builder); + IREE_RETURN_AND_END_ZONE_IF_ERROR( + z0, iree_hal_xrt_lite_driver_parse_flags(&flag_option_builder)); + + IREE_RETURN_AND_END_ZONE_IF_ERROR( + z0, iree_hal_xrt_lite_driver_populate_options( + host_allocator, &driver_options, &device_params, + iree_string_pair_builder_size(&flag_option_builder), + iree_string_pair_builder_pairs(&flag_option_builder))); + + IREE_RETURN_AND_END_ZONE_IF_ERROR( + z0, iree_hal_xrt_lite_driver_create(driver_name, &driver_options, + &device_params, host_allocator, + out_driver)); + + IREE_TRACE_ZONE_END(z0); + return iree_ok_status(); +} + +IREE_API_EXPORT iree_status_t +iree_hal_xrt_lite_driver_module_register(iree_hal_driver_registry_t* registry) { + static const iree_hal_driver_factory_t factory = { + .self = NULL, + .enumerate = iree_hal_xrt_lite_driver_factory_enumerate, + .try_create = 
iree_hal_xrt_lite_driver_factory_try_create, + }; + return iree_hal_driver_registry_register_factory(registry, &factory); +} diff --git a/runtime/src/iree-amd-aie/driver/xrt-lite/registration/driver_module.h b/runtime/src/iree-amd-aie/driver/xrt-lite/registration/driver_module.h new file mode 100644 index 000000000..c8e81405c --- /dev/null +++ b/runtime/src/iree-amd-aie/driver/xrt-lite/registration/driver_module.h @@ -0,0 +1,24 @@ +// Copyright 2024 The IREE Authors +// +// Licensed under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + +#ifndef IREE_HAL_DRIVER_XRT_LITE_REGISTRATION_DRIVER_MODULE_H_ +#define IREE_HAL_DRIVER_XRT_LITE_REGISTRATION_DRIVER_MODULE_H_ + +#include "iree/base/api.h" +#include "iree/hal/api.h" + +#ifdef __cplusplus +extern "C" { +#endif // __cplusplus + +IREE_API_EXPORT iree_status_t +iree_hal_xrt_lite_driver_module_register(iree_hal_driver_registry_t* registry); + +#ifdef __cplusplus +} // extern "C" +#endif // __cplusplus + +#endif // IREE_HAL_DRIVER_XRT_LITE_REGISTRATION_DRIVER_MODULE_H_ diff --git a/runtime/src/iree-amd-aie/driver/xrt-lite/shim/CMakeLists.txt b/runtime/src/iree-amd-aie/driver/xrt-lite/shim/CMakeLists.txt new file mode 100644 index 000000000..c30c40e27 --- /dev/null +++ b/runtime/src/iree-amd-aie/driver/xrt-lite/shim/CMakeLists.txt @@ -0,0 +1,10 @@ +# Copyright (c) 2024 Advanced Micro Devices, Inc. All Rights Reserved. +# Copyright 2024 The IREE Authors +# +# Licensed under the Apache License v2.0 with LLVM Exceptions. +# See https://llvm.org/LICENSE.txt for license information. 
+# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + +if(UNIX) + add_subdirectory(linux) +endif() diff --git a/runtime/src/iree-amd-aie/driver/xrt-lite/shim/linux/CMakeLists.txt b/runtime/src/iree-amd-aie/driver/xrt-lite/shim/linux/CMakeLists.txt new file mode 100644 index 000000000..067d32f4a --- /dev/null +++ b/runtime/src/iree-amd-aie/driver/xrt-lite/shim/linux/CMakeLists.txt @@ -0,0 +1,8 @@ +# Copyright (c) 2024 Advanced Micro Devices, Inc. All Rights Reserved. +# Copyright 2024 The IREE Authors +# +# Licensed under the Apache License v2.0 with LLVM Exceptions. +# See https://llvm.org/LICENSE.txt for license information. +# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + +add_subdirectory(kmq) diff --git a/runtime/src/iree-amd-aie/driver/xrt-lite/shim/linux/kmq/CMakeLists.txt b/runtime/src/iree-amd-aie/driver/xrt-lite/shim/linux/kmq/CMakeLists.txt new file mode 100644 index 000000000..e0757905f --- /dev/null +++ b/runtime/src/iree-amd-aie/driver/xrt-lite/shim/linux/kmq/CMakeLists.txt @@ -0,0 +1,34 @@ +# Copyright (c) 2024 Advanced Micro Devices, Inc. All Rights Reserved. +# Copyright 2024 The IREE Authors +# +# Licensed under the Apache License v2.0 with LLVM Exceptions. +# See https://llvm.org/LICENSE.txt for license information. 
+# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + +iree_cc_library( + NAME + shim-xdna + SRCS + amdxdna_accel.h + bo.cpp + bo.h + device.cpp + device.h + ert.h + fence.cpp + fence.h + hwctx.cpp + hwctx.h + hwq.cpp + hwq.h + kernel.cpp + kernel.h + shim_debug.cpp + shim_debug.h + DEPS + uuid + LLVMSupport + DEFINES + # SHIM_XDNA_DEBUG is defined for Debug-configuration builds only. + $<$<CONFIG:Debug>:SHIM_XDNA_DEBUG> + PUBLIC +) diff --git a/runtime/src/iree-amd-aie/driver/xrt-lite/shim/linux/kmq/amdxdna_accel.h b/runtime/src/iree-amd-aie/driver/xrt-lite/shim/linux/kmq/amdxdna_accel.h new file mode 100644 index 000000000..cc8ec252f --- /dev/null +++ b/runtime/src/iree-amd-aie/driver/xrt-lite/shim/linux/kmq/amdxdna_accel.h @@ -0,0 +1,591 @@ +/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */ +/* + * Copyright (C) 2022-2024, Advanced Micro Devices, Inc. + */ + +#ifndef AMDXDNA_ACCEL_H_ +#define AMDXDNA_ACCEL_H_ + +#include +#include +#include + +#if defined(__cplusplus) +extern "C" { +#endif + +#define AMDXDNA_DRIVER_MAJOR 1 +#define AMDXDNA_DRIVER_MINOR 0 + +#define AMDXDNA_INVALID_ADDR (~0UL) +#define AMDXDNA_INVALID_CTX_HANDLE 0 +#define AMDXDNA_INVALID_BO_HANDLE 0 +#define AMDXDNA_INVALID_FENCE_HANDLE 0 + +/* + * The interface can grow/extend over time. + * On each struct amdxdna_drm_*, to support potential extension, we defined it + * like this. + * + * Example code: + * + * struct amdxdna_drm_example_data { + * .ext = (uintptr_t)&example_data_ext; + * ... + * }; + * + * There are no extensions yet. Extension structs will be defined in the future. 
+ */ + +enum amdxdna_drm_ioctl_id { + DRM_AMDXDNA_CREATE_HWCTX, + DRM_AMDXDNA_DESTROY_HWCTX, + DRM_AMDXDNA_CONFIG_HWCTX, + DRM_AMDXDNA_CREATE_BO, + DRM_AMDXDNA_GET_BO_INFO, + DRM_AMDXDNA_SYNC_BO, + DRM_AMDXDNA_EXEC_CMD, + DRM_AMDXDNA_WAIT_CMD, + DRM_AMDXDNA_GET_INFO, + DRM_AMDXDNA_SET_STATE, + DRM_AMDXDNA_NUM_IOCTLS +}; + +enum amdxdna_device_type { + AMDXDNA_DEV_TYPE_UNKNOWN = -1, + AMDXDNA_DEV_TYPE_KMQ, + AMDXDNA_DEV_TYPE_UMQ, +}; + +/** + * struct amdxdna_qos_info - QoS information for driver. + * @gops: Giga operations per second. + * @fps: Frames per second. + * @dma_bandwidth: DMA bandwidth. + * @latency: Frame response latency. + * @frame_exec_time: Frame execution time. + * @priority: Request priority. + * + * A user program can provide QoS hints to the driver. + */ +struct amdxdna_qos_info { + __u32 gops; + __u32 fps; + __u32 dma_bandwidth; + __u32 latency; + __u32 frame_exec_time; + __u32 priority; +}; + +/** + * struct amdxdna_drm_create_hwctx - Create hardware context. + * @ext: MBZ. + * @ext_flags: MBZ. + * @qos_p: Address of QoS info. + * @umq_bo: BO handle for user mode queue(UMQ). + * @log_buf_bo: BO handle for log buffer. + * @max_opc: Maximum operations per cycle. + * @num_tiles: Number of AIE tiles. + * @mem_size: Size of AIE tile memory. + * @umq_doorbell: Returned offset of doorbell associated with UMQ. + * @handle: Returned hardware context handle. + */ +struct amdxdna_drm_create_hwctx { + __u64 ext; + __u64 ext_flags; + __u64 qos_p; + __u32 umq_bo; + __u32 log_buf_bo; + __u32 max_opc; + __u32 num_tiles; + __u32 mem_size; + __u32 umq_doorbell; + __u32 handle; +}; + +/** + * struct amdxdna_drm_destroy_hwctx - Destroy hardware context. + * @handle: Hardware context handle. + * @pad: MBZ. 
+ */ +struct amdxdna_drm_destroy_hwctx { + __u32 handle; + __u32 pad; +}; + +/** + * struct amdxdna_cu_config - configuration for one CU + * @cu_bo: CU configuration buffer bo handle + * @cu_func: Functional of a CU + * @pad: MBZ + */ +struct amdxdna_cu_config { + __u32 cu_bo; + __u8 cu_func; + __u8 pad[3]; +}; + +/** + * struct amdxdna_hwctx_param_config_cu - configuration for CUs in hardware + * context + * @num_cus: Number of CUs to configure + * @pad: MBZ + * @cu_configs: Array of CU configurations of struct amdxdna_cu_config + */ +struct amdxdna_hwctx_param_config_cu { + __u16 num_cus; + __u16 pad[3]; + struct amdxdna_cu_config cu_configs[1]; +}; + +enum amdxdna_drm_config_hwctx_param { + DRM_AMDXDNA_HWCTX_CONFIG_CU, + DRM_AMDXDNA_HWCTX_ASSIGN_DBG_BUF, + DRM_AMDXDNA_HWCTX_REMOVE_DBG_BUF, + DRM_AMDXDNA_HWCTX_CONFIG_NUM +}; + +/** + * struct amdxdna_drm_config_hwctx - Configure hardware context. + * @handle: hardware context handle. + * @param_type: Value in enum amdxdna_drm_config_hwctx_param. Specifies the + * structure passed in via param_val. + * @param_val: A structure specified by the param_type struct member. + * @param_val_size: Size of the parameter buffer pointed to by the param_val. + * If param_val is not a pointer, driver can ignore this. + * + * Note: if the param_val is a pointer pointing to a buffer, the maximum size + * of the buffer is 4KiB(PAGE_SIZE). 
+ */ +struct amdxdna_drm_config_hwctx { + __u32 handle; + __u32 param_type; + __u64 param_val; + __u32 param_val_size; + __u32 pad; +}; + +/* + * AMDXDNA_BO_SHMEM: DRM GEM SHMEM bo + * AMDXDNA_BO_DEV_HEAP: Shared host memory to device as heap memory + * AMDXDNA_BO_DEV: Allocated from BO_DEV_HEAP + * AMDXDNA_BO_CMD: User and driver accessible bo + * AMDXDNA_BO_DMA: DRM GEM DMA bo + */ +enum amdxdna_bo_type { + AMDXDNA_BO_INVALID = 0, + AMDXDNA_BO_SHMEM, + AMDXDNA_BO_DEV_HEAP, + AMDXDNA_BO_DEV, + AMDXDNA_BO_CMD, + AMDXDNA_BO_DMA, +}; + +/** + * struct amdxdna_drm_create_bo - Create a buffer object. + * @flags: Buffer flags. MBZ. + * @type: Buffer type. + * @vaddr: User VA of buffer if applied. MBZ. + * @size: Size in bytes. + * @handle: Returned DRM buffer object handle. + */ +struct amdxdna_drm_create_bo { + __u64 flags; + __u32 type; + __u32 _pad; + __u64 vaddr; + __u64 size; + __u32 handle; +}; + +/** + * struct amdxdna_drm_get_bo_info - Get buffer object information. + * @ext: MBZ. + * @ext_flags: MBZ. + * @handle: DRM buffer object handle. + * @map_offset: Returned DRM fake offset for mmap(). + * @vaddr: Returned user VA of buffer. 0 in case user needs mmap(). + * @xdna_addr: Returned XDNA device virtual address. + */ +struct amdxdna_drm_get_bo_info { + __u64 ext; + __u64 ext_flags; + __u32 handle; + __u32 _pad; + __u64 map_offset; + __u64 vaddr; + __u64 xdna_addr; +}; + +/** + * struct amdxdna_drm_sync_bo - Sync buffer object. + * @handle: Buffer object handle. + * @direction: Direction of sync, can be from device or to device. + * @offset: Offset in the buffer to sync. + * @size: Size in bytes. + */ +struct amdxdna_drm_sync_bo { + __u32 handle; +#define SYNC_DIRECT_TO_DEVICE 0U +#define SYNC_DIRECT_FROM_DEVICE 1U + __u32 direction; + __u64 offset; + __u64 size; +}; + +enum amdxdna_cmd_type { + AMDXDNA_CMD_SUBMIT_EXEC_BUF = 0, + AMDXDNA_CMD_SUBMIT_DEPENDENCY, + AMDXDNA_CMD_SUBMIT_SIGNAL, +}; + +/** + * struct amdxdna_drm_exec_cmd - Execute command. 
+ * @ext: MBZ. + * @ext_flags: MBZ. + * @hwctx: Hardware context handle. + * @type: One of the command types in enum amdxdna_cmd_type. + * @cmd_handles: Array of command handles or the command handle itself in case + * of just one. + * @args: Array of arguments for all command handles. + * @cmd_count: Number of command handles in the cmd_handles array. + * @arg_count: Number of arguments in the args array. + * @seq: Returned sequence number for this command. + */ +struct amdxdna_drm_exec_cmd { + __u64 ext; + __u64 ext_flags; + __u32 hwctx; + __u32 type; + __u64 cmd_handles; + __u64 args; + __u32 cmd_count; + __u32 arg_count; + __u64 seq; +}; + +/** + * struct amdxdna_drm_wait_cmd - Wait for command execution. + * + * @hwctx: hardware context handle. + * @timeout: timeout in ms, 0 implies infinite wait. + * @seq: sequence number of the command returned by execute command. + * + * Wait for the command specified by seq to complete. + */ +struct amdxdna_drm_wait_cmd { + __u32 hwctx; + __u32 timeout; + __u64 seq; +}; + +/** + * struct amdxdna_drm_query_aie_status - Query the status of the AIE hardware + * @buffer: The user space buffer that will return the AIE status + * @buffer_size: The size of the user space buffer + * @cols_filled: A bitmap of AIE columns whose data has been returned in the + * buffer. + */ +struct amdxdna_drm_query_aie_status { + __u64 buffer; /* out */ + __u32 buffer_size; /* in */ + __u32 cols_filled; /* out */ +}; + +/** + * struct amdxdna_drm_query_aie_version - Query the version of the AIE hardware + * @major: The major version number + * @minor: The minor version number + */ +struct amdxdna_drm_query_aie_version { + __u32 major; /* out */ + __u32 minor; /* out */ +}; + +/** + * struct amdxdna_drm_query_aie_tile_metadata - Query the metadata of AIE tile + * (core, mem, shim) + * @row_count: The number of rows. + * @row_start: The starting row number. + * @dma_channel_count: The number of dma channels. + * @lock_count: The number of locks. 
+ * @event_reg_count: The number of events. + * @pad: MBZ. + */ +struct amdxdna_drm_query_aie_tile_metadata { + __u16 row_count; + __u16 row_start; + __u16 dma_channel_count; + __u16 lock_count; + __u16 event_reg_count; + __u16 pad[3]; +}; + +/** + * struct amdxdna_drm_query_aie_metadata - Query the metadata of the AIE + * hardware + * @col_size: The size of a column in bytes. + * @cols: The total number of columns. + * @rows: The total number of rows. + * @version: The version of the AIE hardware. + * @core: The metadata for all core tiles. + * @mem: The metadata for all mem tiles. + * @shim: The metadata for all shim tiles. + */ +struct amdxdna_drm_query_aie_metadata { + __u32 col_size; + __u16 cols; + __u16 rows; + struct amdxdna_drm_query_aie_version version; + struct amdxdna_drm_query_aie_tile_metadata core; + struct amdxdna_drm_query_aie_tile_metadata mem; + struct amdxdna_drm_query_aie_tile_metadata shim; +}; + +/** + * struct amdxdna_drm_query_clock - Metadata for a clock + * @name: The clock name. + * @freq_mhz: The clock frequency. + * @pad: MBZ. + */ +struct amdxdna_drm_query_clock { + __u8 name[16]; + __u32 freq_mhz; + __u32 pad; +}; + +/** + * struct amdxdna_drm_query_clock_metadata - Query metadata for clocks + * @mp_npu_clock: The metadata for MP-NPU clock. + * @h_clock: The metadata for H clock. + */ +struct amdxdna_drm_query_clock_metadata { + struct amdxdna_drm_query_clock mp_npu_clock; + struct amdxdna_drm_query_clock h_clock; +}; + +enum amdxdna_sensor_type { AMDXDNA_SENSOR_TYPE_POWER }; + +/** + * struct amdxdna_drm_query_sensor - The data for single sensor. + * @label: The name for a sensor. + * @input: The current value of the sensor. + * @max: The maximum value possible for the sensor. + * @average: The average value of the sensor. + * @highest: The highest recorded sensor value for this driver load for the + * sensor. + * @status: The sensor status. + * @units: The sensor units. 
+ * @unitm: Translates value member variables into the correct unit via (pow(10, + * unitm) * value) + * @type: The sensor type from enum amdxdna_sensor_type + * @pad: MBZ. + */ +struct amdxdna_drm_query_sensor { + __u8 label[64]; + __u32 input; + __u32 max; + __u32 average; + __u32 highest; + __u8 status[64]; + __u8 units[16]; + __s8 unitm; + __u8 type; + __u8 pad[6]; +}; + +/** + * struct amdxdna_drm_query_hwctx - The data for single context. + * @context_id: The ID for this context. + * @start_col: The starting column for the partition assigned to this context. + * @num_col: The number of columns in the partition assigned to this context. + * @pid: The Process ID of the process that created this context. + * @command_submissions: The number of commands submitted to this context. + * @command_completions: The number of commands completed by this context. + * @migrations: The number of times this context has been moved to a different + * partition. + * @preemptions: The number of times this context has been preempted by another + * context in the same partition. + * @pad: MBZ. + */ +struct amdxdna_drm_query_hwctx { + __u32 context_id; + __u32 start_col; + __u32 num_col; + __u32 pad; + __s64 pid; + __u64 command_submissions; + __u64 command_completions; + __u64 migrations; + __u64 preemptions; + __u64 errors; +}; + +/** + * struct amdxdna_drm_aie_mem - The data for AIE memory read/write + * @col: The AIE column index + * @row: The AIE row index + * @addr: The AIE memory address to read/write + * @size: The size of bytes to read/write + * @buf_p: The buffer to store read/write data + * + * This is used for DRM_AMDXDNA_READ_AIE_MEM and DRM_AMDXDNA_WRITE_AIE_MEM + * parameters. 
+ */ +struct amdxdna_drm_aie_mem { + __u32 col; + __u32 row; + __u32 addr; + __u32 size; + __u64 buf_p; +}; + +/** + * struct amdxdna_drm_aie_reg - The data for AIE register read/write + * @col: The AIE column index + * @row: The AIE row index + * @addr: The AIE register address to read/write + * @val: The value to write or returned value from AIE + * + * This is used for DRM_AMDXDNA_READ_AIE_REG and DRM_AMDXDNA_WRITE_AIE_REG + * parameters. + */ +struct amdxdna_drm_aie_reg { + __u32 col; + __u32 row; + __u32 addr; + __u32 val; +}; + +enum amdxdna_power_mode_type { + POWER_MODE_DEFAULT, /**< Fallback to calculated DPM */ + POWER_MODE_LOW, /**< Set frequency to lowest DPM */ + POWER_MODE_MEDIUM, /**< Set frequency to medium DPM */ + POWER_MODE_HIGH, /**< Set frequency to highest DPM */ + POWER_MODE_TURBO, /**< More power, more performance */ +}; + +/** + * struct amdxdna_drm_get_power_mode - Get the power mode of the AIE hardware + * @power_mode: The power mode from enum amdxdna_power_mode_type + * @pad: MBZ. + */ +struct amdxdna_drm_get_power_mode { + __u8 power_mode; + __u8 pad[7]; +}; + +/** + * struct amdxdna_drm_query_firmware_version - Query the version of the firmware + * @major: The major version number + * @minor: The minor version number + * @patch: The patch level version number + * @build: The build ID + */ +struct amdxdna_drm_query_firmware_version { + __u32 major; /* out */ + __u32 minor; /* out */ + __u32 patch; /* out */ + __u32 build; /* out */ +}; + +enum amdxdna_drm_get_param { + DRM_AMDXDNA_QUERY_AIE_STATUS, + DRM_AMDXDNA_QUERY_AIE_METADATA, + DRM_AMDXDNA_QUERY_AIE_VERSION, + DRM_AMDXDNA_QUERY_CLOCK_METADATA, + DRM_AMDXDNA_QUERY_SENSORS, + DRM_AMDXDNA_QUERY_HW_CONTEXTS, + DRM_AMDXDNA_READ_AIE_MEM, + DRM_AMDXDNA_READ_AIE_REG, + DRM_AMDXDNA_QUERY_FIRMWARE_VERSION, + DRM_AMDXDNA_GET_POWER_MODE, + DRM_AMDXDNA_QUERY_TELEMETRY, + DRM_AMDXDNA_NUM_GET_PARAM, +}; + +/** + * struct amdxdna_drm_get_info - Get some information from the AIE hardware. 
+ * @param: Value in enum amdxdna_drm_get_param. Specifies the structure passed + * in the buffer. + * @buffer_size: Size of the input buffer. Size needed/written by the kernel. + * @buffer: A structure specified by the param struct member. + */ +struct amdxdna_drm_get_info { + __u32 param; /* in */ + __u32 buffer_size; /* in/out */ + __u64 buffer; /* in/out */ +}; + +/** + * struct amdxdna_drm_set_power_mode - Set the power mode of the AIE hardware + * @power_mode: The power mode from enum amdxdna_power_mode_type + * @pad: MBZ. + */ +struct amdxdna_drm_set_power_mode { + __u8 power_mode; + __u8 pad[7]; +}; + +enum amdxdna_drm_set_param { + DRM_AMDXDNA_SET_POWER_MODE, + DRM_AMDXDNA_WRITE_AIE_MEM, + DRM_AMDXDNA_WRITE_AIE_REG, + DRM_AMDXDNA_NUM_SET_PARAM, +}; + +/** + * struct amdxdna_drm_set_state - Set the state of some component within the AIE + * hardware. + * @param: Value in enum amdxdna_drm_set_param. Specifies the structure passed + * in the buffer. + * @buffer_size: Size of the input buffer. + * @buffer: A structure specified by the param struct member. 
+ */ +struct amdxdna_drm_set_state { + __u32 param; /* in */ + __u32 buffer_size; /* in */ + __u64 buffer; /* in */ +}; + +#define DRM_IOCTL_AMDXDNA_CREATE_HWCTX \ + DRM_IOWR(DRM_COMMAND_BASE + DRM_AMDXDNA_CREATE_HWCTX, \ + struct amdxdna_drm_create_hwctx) + +#define DRM_IOCTL_AMDXDNA_DESTROY_HWCTX \ + DRM_IOWR(DRM_COMMAND_BASE + DRM_AMDXDNA_DESTROY_HWCTX, \ + struct amdxdna_drm_destroy_hwctx) + +#define DRM_IOCTL_AMDXDNA_CONFIG_HWCTX \ + DRM_IOWR(DRM_COMMAND_BASE + DRM_AMDXDNA_CONFIG_HWCTX, \ + struct amdxdna_drm_config_hwctx) + +#define DRM_IOCTL_AMDXDNA_CREATE_BO \ + DRM_IOWR(DRM_COMMAND_BASE + DRM_AMDXDNA_CREATE_BO, \ + struct amdxdna_drm_create_bo) + +#define DRM_IOCTL_AMDXDNA_GET_BO_INFO \ + DRM_IOWR(DRM_COMMAND_BASE + DRM_AMDXDNA_GET_BO_INFO, \ + struct amdxdna_drm_get_bo_info) + +#define DRM_IOCTL_AMDXDNA_SYNC_BO \ + DRM_IOWR(DRM_COMMAND_BASE + DRM_AMDXDNA_SYNC_BO, struct amdxdna_drm_sync_bo) + +#define DRM_IOCTL_AMDXDNA_EXEC_CMD \ + DRM_IOWR(DRM_COMMAND_BASE + DRM_AMDXDNA_EXEC_CMD, struct amdxdna_drm_exec_cmd) + +#define DRM_IOCTL_AMDXDNA_WAIT_CMD \ + DRM_IOWR(DRM_COMMAND_BASE + DRM_AMDXDNA_WAIT_CMD, struct amdxdna_drm_wait_cmd) + +#define DRM_IOCTL_AMDXDNA_GET_INFO \ + DRM_IOWR(DRM_COMMAND_BASE + DRM_AMDXDNA_GET_INFO, struct amdxdna_drm_get_info) + +#define DRM_IOCTL_AMDXDNA_SET_STATE \ + DRM_IOWR(DRM_COMMAND_BASE + DRM_AMDXDNA_SET_STATE, \ + struct amdxdna_drm_set_state) + +#if defined(__cplusplus) +} /* extern c end */ +#endif + +#endif /* AMDXDNA_ACCEL_H_ */ diff --git a/runtime/src/iree-amd-aie/driver/xrt-lite/shim/linux/kmq/bo.cpp b/runtime/src/iree-amd-aie/driver/xrt-lite/shim/linux/kmq/bo.cpp new file mode 100644 index 000000000..2d266f729 --- /dev/null +++ b/runtime/src/iree-amd-aie/driver/xrt-lite/shim/linux/kmq/bo.cpp @@ -0,0 +1,474 @@ +// SPDX-License-Identifier: Apache-2.0 +// Copyright (C) 2022-2024, Advanced Micro Devices, Inc. All rights reserved. 
+ +#include "bo.h" + +#include +#include +#include +#include + +#include +#include + +#include "shim_debug.h" +#include "xrt_mem.h" + +namespace { + +uint32_t alloc_drm_bo(const shim_xdna::pdev &dev, amdxdna_bo_type type, + size_t size) { + amdxdna_drm_create_bo cbo = { + .type = static_cast(type), + .vaddr = reinterpret_cast(nullptr), + .size = size, + }; + dev.ioctl(DRM_IOCTL_AMDXDNA_CREATE_BO, &cbo); + return cbo.handle; +} + +void free_drm_bo(const shim_xdna::pdev &dev, uint32_t boh) { + drm_gem_close close_bo = {boh, 0}; + dev.ioctl(DRM_IOCTL_GEM_CLOSE, &close_bo); +} + +void get_drm_bo_info(const shim_xdna::pdev &dev, uint32_t boh, + amdxdna_drm_get_bo_info *bo_info) { + bo_info->handle = boh; + dev.ioctl(DRM_IOCTL_AMDXDNA_GET_BO_INFO, bo_info); +} + +// Reserves |size| bytes of anonymous, private, read/write address space and +// returns its base address; reports the mmap() errno via shim_err on failure. +void *map_parent_range(size_t size) { + auto p = ::mmap(nullptr, size, PROT_READ | PROT_WRITE, + MAP_PRIVATE | MAP_ANONYMOUS, -1, 0); + // mmap() signals failure with MAP_FAILED ((void *)-1), never with nullptr. + if (p == MAP_FAILED) shim_xdna::shim_err(errno, "mmap(len=%ld) failed", size); + + return p; +} + +void *map_drm_bo(const shim_xdna::pdev &dev, size_t size, int prot, + uint64_t offset) { + return dev.mmap(nullptr, size, prot, MAP_SHARED | MAP_LOCKED, offset); +} + +void *map_drm_bo(const shim_xdna::pdev &dev, void *addr, size_t size, int prot, + int flags, uint64_t offset) { + return dev.mmap(addr, size, prot, flags, offset); +} + +void unmap_drm_bo(const shim_xdna::pdev &dev, void *addr, size_t size) { + munmap(addr, size); +} + +void attach_dbg_drm_bo(const shim_xdna::pdev &dev, uint32_t boh, + uint32_t ctx_id) { + amdxdna_drm_config_hwctx adbo = { + .handle = ctx_id, + .param_type = DRM_AMDXDNA_HWCTX_ASSIGN_DBG_BUF, + .param_val = boh, + }; + dev.ioctl(DRM_IOCTL_AMDXDNA_CONFIG_HWCTX, &adbo); +} + +void detach_dbg_drm_bo(const shim_xdna::pdev &dev, uint32_t boh, + uint32_t ctx_id) { + amdxdna_drm_config_hwctx adbo = { + .handle = ctx_id, + .param_type = DRM_AMDXDNA_HWCTX_REMOVE_DBG_BUF, + .param_val = boh, + }; + dev.ioctl(DRM_IOCTL_AMDXDNA_CONFIG_HWCTX, &adbo); +} + +int 
export_drm_bo(const shim_xdna::pdev &dev, uint32_t boh) { + drm_prime_handle exp_bo = {boh, DRM_RDWR | DRM_CLOEXEC, -1}; + dev.ioctl(DRM_IOCTL_PRIME_HANDLE_TO_FD, &exp_bo); + return exp_bo.fd; +} + +uint32_t import_drm_bo(const shim_xdna::pdev &dev, + const shim_xdna::shared_handle &share, + amdxdna_bo_type *type, size_t *size) { + int fd = share.get_export_handle(); + drm_prime_handle imp_bo = {AMDXDNA_INVALID_BO_HANDLE, 0, fd}; + dev.ioctl(DRM_IOCTL_PRIME_FD_TO_HANDLE, &imp_bo); + + *type = AMDXDNA_BO_SHMEM; + *size = lseek(fd, 0, SEEK_END); + lseek(fd, 0, SEEK_SET); + + return imp_bo.handle; +} + +bool is_power_of_two(size_t x) { return x > 0 && (x & x - 1) == 0; } + +// Returns (p + align) rounded down to a multiple of |align|. Because `+` binds +// tighter than `&`, the result is always strictly above |p|, even when |p| is +// already aligned. +void *addr_align(void *p, size_t align) { + if (!is_power_of_two(align)) + shim_xdna::shim_err(EINVAL, "Alignment 0x%lx is not power of two", align); + + return reinterpret_cast((uintptr_t)p + align & ~(align - 1)); +} + +amdxdna_bo_type flag_to_type(shim_xcl_bo_flags flags) { + uint32_t boflags = (static_cast(flags.boflags) << 24); + switch (boflags) { + case XCL_BO_FLAGS_NONE: + case XCL_BO_FLAGS_HOST_ONLY: + return AMDXDNA_BO_SHMEM; + case XCL_BO_FLAGS_CACHEABLE: + return AMDXDNA_BO_DEV; + case XCL_BO_FLAGS_EXECBUF: + return AMDXDNA_BO_CMD; + default: + break; + } + return AMDXDNA_BO_INVALID; +} + +// Flush the cache lines covering [base + offset, base + offset + len) for +// non-coherent memory. +inline void clflush_data(const void *base, size_t offset, size_t len) { + static long cacheline_size = 0; + + if (!cacheline_size) { + long sz = sysconf(_SC_LEVEL1_DCACHE_LINESIZE); + if (sz <= 0) + shim_xdna::shim_err(EINVAL, "Invalid cache line size: %ld", sz); + cacheline_size = sz; + } + + const char *cur = (const char *)base; + cur += offset; + uintptr_t lastline = (uintptr_t)(cur + len - 1) | (cacheline_size - 1); + do { + _mm_clflush(cur); + cur += cacheline_size; + } while (cur <= (const char *)lastline); +} + +void sync_drm_bo(const shim_xdna::pdev &dev, uint32_t boh, + shim_xdna::direction dir, size_t offset, size_t len) { + amdxdna_drm_sync_bo 
sbo = { + .handle = boh, + .direction = + (dir == shim_xdna::direction::host2device ? SYNC_DIRECT_TO_DEVICE + : SYNC_DIRECT_FROM_DEVICE), + .offset = offset, + .size = len, + }; + dev.ioctl(DRM_IOCTL_AMDXDNA_SYNC_BO, &sbo); +} + +bool is_driver_sync() { + static int drv_sync = -1; + + if (drv_sync == -1) { + bool ds = std::getenv("Debug.force_driver_sync"); + drv_sync = ds ? 1 : 0; + } + return drv_sync == 1; +} + +} // namespace + +namespace shim_xdna { + +drm_bo::drm_bo(bo &parent, const amdxdna_drm_get_bo_info &bo_info) + : m_parent(parent), + m_handle(bo_info.handle), + m_map_offset(bo_info.map_offset), + m_xdna_addr(bo_info.xdna_addr), + m_vaddr(bo_info.vaddr) {} + +drm_bo::~drm_bo() { + if (m_handle == AMDXDNA_INVALID_BO_HANDLE) return; + free_drm_bo(m_parent.m_pdev, m_handle); +} + +std::string bo::type_to_name() const { + switch (m_type) { + case AMDXDNA_BO_SHMEM: + return {"AMDXDNA_BO_SHMEM"}; + case AMDXDNA_BO_DEV_HEAP: + return {"AMDXDNA_BO_DEV_HEAP"}; + case AMDXDNA_BO_DEV: + if (shim_xcl_bo_flags{m_flags}.use == XRT_BO_USE_DEBUG) + return {"AMDXDNA_BO_DEV_DEBUG"}; + return {"AMDXDNA_BO_DEV"}; + case AMDXDNA_BO_CMD: + return {"AMDXDNA_BO_CMD"}; + default:; + return {"BO_UNKNOWN"}; + } + return {"BO_UNKNOWN"}; +} + +std::string bo::describe() const { + std::string desc = "type="; + desc += type_to_name(); + desc += ", "; + desc += "drm_bo="; + desc += std::to_string(m_drm_bo->m_handle); + desc += ", "; + desc += "size="; + desc += std::to_string(m_aligned_size); + return desc; +} + +void bo::mmap_bo(size_t align) { + size_t a = align; + + if (m_drm_bo->m_map_offset == AMDXDNA_INVALID_ADDR) { + m_aligned = reinterpret_cast(m_drm_bo->m_vaddr); + return; + } + + if (a == 0) { + m_aligned = map_drm_bo(m_pdev, m_aligned_size, PROT_READ | PROT_WRITE, + m_drm_bo->m_map_offset); + return; + } + + /* + * Handle special alignment + * The first mmap() only reserves a range in the user virtual address + * space. 
The second mmap() uses an aligned addr as the first argument in mmap + * syscall. + */ + m_parent_size = align * 2 - 1; + m_parent = map_parent_range(m_parent_size); + auto aligned = addr_align(m_parent, align); + m_aligned = + map_drm_bo(m_pdev, aligned, m_aligned_size, PROT_READ | PROT_WRITE, + MAP_SHARED | MAP_FIXED, m_drm_bo->m_map_offset); +} + +void bo::munmap_bo() { + SHIM_DEBUG("Unmap BO, aligned %p parent %p", m_aligned, m_parent); + if (m_drm_bo->m_map_offset == AMDXDNA_INVALID_ADDR) return; + + unmap_drm_bo(m_pdev, m_aligned, m_aligned_size); + if (m_parent) unmap_drm_bo(m_pdev, m_parent, m_parent_size); +} + +void bo::import_bo() { + uint32_t boh = import_drm_bo(m_pdev, m_import, &m_type, &m_aligned_size); + + amdxdna_drm_get_bo_info bo_info = {}; + get_drm_bo_info(m_pdev, boh, &bo_info); + m_drm_bo = std::make_unique(*this, bo_info); +} + +void bo::free_bo() { m_drm_bo.reset(); } + +bo::bo(const pdev &pdev, uint32_t ctx_id, size_t size, shim_xcl_bo_flags flags, + amdxdna_bo_type type) + : m_pdev(pdev), + m_aligned_size(size), + m_flags(flags), + m_type(type), + m_import(-1), + m_owner_ctx_id(ctx_id) { + size_t align = 0; + + if (m_type == AMDXDNA_BO_DEV_HEAP) + align = 64 * 1024 * 1024; // Device mem heap must align at 64MB boundary. + + uint32_t boh = alloc_drm_bo(m_pdev, m_type, m_aligned_size); + // TODO(max): this is dumb? performs an ioctl right after we just made one? + amdxdna_drm_get_bo_info bo_info = {}; + get_drm_bo_info(m_pdev, boh, &bo_info); + m_drm_bo = std::make_unique(*this, bo_info); + + mmap_bo(align); + + // Newly allocated buffer may contain dirty pages. If used as output buffer, + // the data in cacheline will be flushed onto memory and pollute the output + // from device. We perform a cache flush right after the BO is allocated to + // avoid this issue. 
+ if (m_type == AMDXDNA_BO_SHMEM) { + sync(direction::host2device, size, 0); + } + + attach_to_ctx(); +#ifndef NDEBUG + switch (m_flags.all) { + case 0x0: + SHIM_DEBUG("allocating dev heap"); + break; + case 0x1000000: + // pdi bo + SHIM_DEBUG("allocating pdi bo"); + break; + case 0x20000000: + // XCL_BO_FLAGS_P2P in create_free_bo test + SHIM_DEBUG("allocating XCL_BO_FLAGS_P2P"); + break; + case 0x80000000: + // XCL_BO_FLAGS_EXECBUF in create_free_bo test + SHIM_DEBUG("allocating XCL_BO_FLAGS_EXECBUF"); + break; + case 0x1001000000: + // debug bo + SHIM_DEBUG("allocating debug bo"); + break; + default: + // Format the raw flag bits; the flags struct itself cannot be passed + // for a printf-style conversion. + shim_err(-1, "unknown flags 0x%llx", (unsigned long long)m_flags.all); + } +#endif + + SHIM_DEBUG( + "Allocated KMQ BO (userptr=0x%lx, size=%ld, flags=0x%llx, " + "type=%d, drm_bo=%d)", + m_aligned, m_aligned_size, m_flags, m_type, get_drm_bo_handle()); +} + +// Allocates a BO whose type is derived from |flags| via flag_to_type(). +bo::bo(const pdev &p, uint32_t ctx_id, size_t size, shim_xcl_bo_flags flags) + : bo(p, ctx_id, size, flags, flag_to_type(flags)) { + if (m_type == AMDXDNA_BO_INVALID) + shim_err(EINVAL, "Invalid BO flags: 0x%lx", (unsigned long)flags.all); +} + +// Same as above, taking the flags as a raw 32-bit value. +bo::bo(const pdev &p, uint32_t ctx_id, size_t size, uint32_t flags) + : bo(p, ctx_id, size, shim_xcl_bo_flags{.flags = flags}) { + if (m_type == AMDXDNA_BO_INVALID) + shim_err(EINVAL, "Invalid BO flags: 0x%lx", (unsigned long)flags); +} + +// Imports a BO from the exported handle |ehdl| and maps it. +bo::bo(const pdev &p, int ehdl) : m_pdev(p), m_import(ehdl) { + import_bo(); + mmap_bo(); + SHIM_DEBUG( + "Imported KMQ BO (userptr=0x%lx, size=%ld, flags=0x%llx, type=%d, " + "drm_bo=%d)", + m_aligned, m_aligned_size, m_flags, m_type, get_drm_bo_handle()); +} + +bo::~bo() { + SHIM_DEBUG("Freeing KMQ BO, %s", describe().c_str()); + + munmap_bo(); + detach_from_ctx(); + // If BO is in use, we should block and wait in driver + free_bo(); +} + +bo::bo(const pdev &p, size_t size, amdxdna_bo_type type) + : bo(p, AMDXDNA_INVALID_CTX_HANDLE, size, shim_xcl_bo_flags{}, type) {} + +properties bo::get_properties() const { + return {m_flags, m_aligned_size, get_paddr(), get_drm_bo_handle()}; +} + +size_t 
bo::size() { return get_properties().size; } + +void *bo::map() const { return m_aligned; } + +void bo::unmap(void *addr) {} + +uint64_t bo::get_paddr() const { + if (m_drm_bo->m_xdna_addr != AMDXDNA_INVALID_ADDR) + return m_drm_bo->m_xdna_addr; + return reinterpret_cast(m_aligned); +} + +void bo::set_cmd_id(uint64_t id) { m_cmd_id = id; } + +uint64_t bo::get_cmd_id() const { return m_cmd_id; } + +uint32_t bo::get_drm_bo_handle() const { return m_drm_bo->m_handle; } + +void bo::attach_to_ctx() { + if (m_owner_ctx_id == AMDXDNA_INVALID_CTX_HANDLE) return; + + auto boh = get_drm_bo_handle(); + SHIM_DEBUG("Attaching drm_bo %d to ctx: %d", boh, m_owner_ctx_id); + attach_dbg_drm_bo(m_pdev, boh, m_owner_ctx_id); +} + +void bo::detach_from_ctx() { + if (m_owner_ctx_id == AMDXDNA_INVALID_CTX_HANDLE) return; + + auto boh = get_drm_bo_handle(); + SHIM_DEBUG("Detaching drm_bo %d from ctx: %d", boh, m_owner_ctx_id); + detach_dbg_drm_bo(m_pdev, boh, m_owner_ctx_id); +} + +std::unique_ptr bo::share() const { + auto boh = get_drm_bo_handle(); + auto fd = export_drm_bo(m_pdev, boh); + SHIM_DEBUG("Exported bo %d to fd %d", boh, fd); + return std::make_unique(fd); +} + +amdxdna_bo_type bo::get_type() const { return m_type; } + +void bo::sync(direction dir, size_t size, size_t offset) { + if (is_driver_sync()) { + sync_drm_bo(m_pdev, get_drm_bo_handle(), dir, offset, size); + return; + } + + if (offset + size > m_aligned_size) + shim_err(EINVAL, "Invalid BO offset and size for sync'ing: %ld, %ld", + offset, size); + + switch (m_type) { + case AMDXDNA_BO_SHMEM: + case AMDXDNA_BO_CMD: + clflush_data(m_aligned, offset, size); + break; + case AMDXDNA_BO_DEV: + if (m_owner_ctx_id == AMDXDNA_INVALID_CTX_HANDLE) + clflush_data(m_aligned, offset, size); + else + sync_drm_bo(m_pdev, get_drm_bo_handle(), dir, offset, size); + break; + default: + shim_err(ENOTSUP, "Can't sync bo type %d", m_type); + } +} + +void bo::sync(direction dir) { sync(dir, size(), 0); } + +void bo::bind_at(size_t pos, 
const bo &boh, size_t offset, size_t size) { + std::lock_guard lg(m_args_map_lock); + + if (m_type != AMDXDNA_BO_CMD) + shim_err(EINVAL, "Can't call bind_at() on non-cmd BO"); + + if (!pos) m_args_map.clear(); + + if (boh.get_type() != AMDXDNA_BO_CMD) { + auto h = boh.get_drm_bo_handle(); + m_args_map[pos] = h; + SHIM_DEBUG("Added arg BO %d to cmd BO %d", h, get_drm_bo_handle()); + } else { + const size_t max_args_order = 6; + const size_t max_args = 1 << max_args_order; + size_t key = pos << max_args_order; + uint32_t hs[max_args]; + auto arg_cnt = boh.get_arg_bo_handles(hs, max_args); + std::string bohs; + for (int i = 0; i < arg_cnt; i++) { + m_args_map[key + i] = hs[i]; + bohs += std::to_string(hs[i]) + " "; + } + SHIM_DEBUG("Added arg BO %s to cmd BO %d", bohs.c_str(), + get_drm_bo_handle()); + } +} + +uint32_t bo::get_arg_bo_handles(uint32_t *handles, size_t num) const { + std::lock_guard lg(m_args_map_lock); + + auto sz = m_args_map.size(); + if (sz > num) + shim_err(E2BIG, "There are %ld BO args, provided buffer can hold only %ld", + sz, num); + + for (auto m : m_args_map) *(handles++) = m.second; + + return sz; +} + +} // namespace shim_xdna diff --git a/runtime/src/iree-amd-aie/driver/xrt-lite/shim/linux/kmq/bo.h b/runtime/src/iree-amd-aie/driver/xrt-lite/shim/linux/kmq/bo.h new file mode 100644 index 000000000..8742c8e28 --- /dev/null +++ b/runtime/src/iree-amd-aie/driver/xrt-lite/shim/linux/kmq/bo.h @@ -0,0 +1,116 @@ +// SPDX-License-Identifier: Apache-2.0 +// Copyright (C) 2022-2024, Advanced Micro Devices, Inc. All rights reserved. 
+
+#ifndef _BO_XDNA_H_
+#define _BO_XDNA_H_
+
+#include
+
+#include "amdxdna_accel.h"
+#include "device.h"
+#include "ert.h"
+#include "hwctx.h"
+
+namespace shim_xdna {
+
+// Raw sync-direction values; wrapped by the scoped `direction` enum below.
+enum xclBOSyncDirection {
+  XCL_BO_SYNC_BO_TO_DEVICE = 0,
+  XCL_BO_SYNC_BO_FROM_DEVICE,
+};
+
+// direction - direction of sync operation
+enum class direction {
+  host2device = XCL_BO_SYNC_BO_TO_DEVICE,
+  device2host = XCL_BO_SYNC_BO_FROM_DEVICE,
+};
+
+// properties - buffer details
+struct properties {
+  shim_xcl_bo_flags flags;  // flags of bo
+  uint64_t size;            // size of bo
+  uint64_t paddr;           // physical address
+  uint64_t kmhdl;           // kernel mode handle
+};
+
+// Driver-side record belonging to a `bo`. The handle and address members start
+// out as the corresponding AMDXDNA_INVALID_* sentinels. Non-copyable.
+struct drm_bo {
+  bo &m_parent;  // the bo this record belongs to
+  uint32_t m_handle = AMDXDNA_INVALID_BO_HANDLE;
+  off_t m_map_offset = AMDXDNA_INVALID_ADDR;
+  uint64_t m_xdna_addr = AMDXDNA_INVALID_ADDR;
+  uint64_t m_vaddr = AMDXDNA_INVALID_ADDR;
+
+  drm_bo(bo &parent, const amdxdna_drm_get_bo_info &bo_info);
+  // no copying
+  drm_bo(const drm_bo &) = delete;
+  drm_bo &operator=(const drm_bo &) = delete;
+  ~drm_bo();
+};
+
+// Buffer object of the KMQ shim. Non-copyable.
+struct bo {
+  const pdev &m_pdev;  // device this BO was created on
+  void *m_parent = nullptr;
+  void *m_aligned = nullptr;  // pointer handed out by map()
+  size_t m_parent_size = 0;
+  size_t m_aligned_size = 0;  // reported as the BO size by get_properties()
+  shim_xcl_bo_flags m_flags{};
+  amdxdna_bo_type m_type = AMDXDNA_BO_INVALID;
+  std::unique_ptr m_drm_bo;
+  const shared_handle m_import;
+  // Only for AMDXDNA_BO_CMD type
+  std::map m_args_map;
+  // Guards m_args_map; mutable so const accessors can lock it.
+  mutable std::mutex m_args_map_lock;
+
+  // Command ID in the queue after command submission.
+  // Only valid for cmd BO.
+  // NOTE: the -1 initializer converts to the largest uint64_t value.
+  uint64_t m_cmd_id = -1;
+
+  // Used when exclusively assigned to a HW context. By default, BO is shared
+  // among all HW contexts.
+  uint32_t m_owner_ctx_id = AMDXDNA_INVALID_CTX_HANDLE;
+
+  // Allocate a BO of an explicit type.
+  bo(const pdev &p, uint32_t ctx_id, size_t size, shim_xcl_bo_flags flags,
+     amdxdna_bo_type type);
+  // Allocate a BO whose type is derived from `flags`.
+  bo(const pdev &p, uint32_t ctx_id, size_t size, shim_xcl_bo_flags flags);
+  // Same as above, taking only the 32-bit `flags` member.
+  bo(const pdev &p, uint32_t ctx_id, size_t size, uint32_t flags);
+  // Import a BO from an exported handle.
+  bo(const pdev &p, int ehdl);
+  // Support BO creation from internal
+  bo(const pdev &p, size_t size, amdxdna_bo_type type);
+  ~bo();
+  // no copying
+  bo(const bo &) = delete;
+  bo &operator=(const bo &) = delete;
+
+  // Returns the mapped pointer (m_aligned).
+  void *map() const;
+  // Currently a no-op.
+  void unmap(void *addr);
+  // Sync `size` bytes starting at `offset`.
+  void sync(direction, size_t size, size_t offset);
+  // Sync the whole BO.
+  void sync(direction);
+  properties get_properties() const;
+  size_t size();
+
+  // Export this BO so it can be imported elsewhere.
+  std::unique_ptr share() const;
+  // For cmd BO only
+  void set_cmd_id(uint64_t id);
+  // For cmd BO only
+  uint64_t get_cmd_id() const;
+  uint32_t get_drm_bo_handle() const;
+  amdxdna_bo_type get_type() const;
+  // DRM BO managed by driver.
+  // Records `bh` as argument `pos` of this cmd BO; only valid on cmd BOs.
+  void bind_at(size_t pos, const bo &bh, size_t offset, size_t size);
+  std::string describe() const;
+  // Import DRM BO from m_import shared object
+  void import_bo();
+  // Free DRM BO in driver
+  void free_bo();
+  void mmap_bo(size_t align = 0);
+  void munmap_bo();
+  // Device address when the DRM BO has one, otherwise the host pointer value.
+  uint64_t get_paddr() const;
+  std::string type_to_name() const;
+  // Attach to / detach from the owning HW context; both are no-ops when
+  // m_owner_ctx_id is AMDXDNA_INVALID_CTX_HANDLE.
+  void attach_to_ctx();
+  void detach_from_ctx();
+  // Obtain array of arg BO handles, returns real number of handles
+  uint32_t get_arg_bo_handles(uint32_t *handles, size_t num) const;
+};
+
+}  // namespace shim_xdna
+
+#endif
diff --git a/runtime/src/iree-amd-aie/driver/xrt-lite/shim/linux/kmq/device.cpp b/runtime/src/iree-amd-aie/driver/xrt-lite/shim/linux/kmq/device.cpp
new file mode 100644
index 000000000..8b71d5f38
--- /dev/null
+++ b/runtime/src/iree-amd-aie/driver/xrt-lite/shim/linux/kmq/device.cpp
@@ -0,0 +1,277 @@
+// SPDX-License-Identifier: Apache-2.0
+// Copyright (C) 2022-2024, Advanced Micro Devices, Inc.
- All rights reserved + +#include "device.h" + +#include +#include +#include +#include +#include + +#include +#include +#include + +#include "bo.h" +#include "fence.h" +#include "hwctx.h" +#include "llvm/Support/ErrorHandling.h" +#include "shim_debug.h" +#include "xrt_mem.h" + +namespace { + +int64_t import_fd(pid_t pid, int ehdl) { + if (pid == 0 || getpid() == pid) return ehdl; + +#if defined(SYS_pidfd_open) && defined(SYS_pidfd_getfd) + auto pidfd = syscall(SYS_pidfd_open, pid, 0); + if (pidfd < 0) shim_xdna::shim_err(errno, "pidfd_open failed"); + + int64_t fd = syscall(SYS_pidfd_getfd, pidfd, ehdl, 0); + if (fd < 0) { + if (errno == EPERM) { + shim_xdna::shim_err( + errno, + "pidfd_getfd failed, check that ptrace access mode " + "allows PTRACE_MODE_ATTACH_REALCREDS. For more details please " + "check /etc/sysctl.d/10-ptrace.conf"); + } + + shim_xdna::shim_err(errno, "pidfd_getfd failed"); + } + return fd; +#else + shim_xdna::shim_err( + int(std::errc::not_supported), + "Importing buffer object from different process requires XRT " + " built and installed on a system with 'pidfd' kernel support"); +#endif +} + +std::string ioctl_cmd2name(unsigned long cmd) { + switch (cmd) { + case DRM_IOCTL_AMDXDNA_CREATE_HWCTX: + return "DRM_IOCTL_AMDXDNA_CREATE_HWCTX"; + case DRM_IOCTL_AMDXDNA_DESTROY_HWCTX: + return "DRM_IOCTL_AMDXDNA_DESTROY_HWCTX"; + case DRM_IOCTL_AMDXDNA_CONFIG_HWCTX: + return "DRM_IOCTL_AMDXDNA_CONFIG_HWCTX"; + case DRM_IOCTL_AMDXDNA_CREATE_BO: + return "DRM_IOCTL_AMDXDNA_CREATE_BO"; + case DRM_IOCTL_AMDXDNA_GET_BO_INFO: + return "DRM_IOCTL_AMDXDNA_GET_BO_INFO"; + case DRM_IOCTL_AMDXDNA_SYNC_BO: + return "DRM_IOCTL_AMDXDNA_SYNC_BO"; + case DRM_IOCTL_AMDXDNA_EXEC_CMD: + return "DRM_IOCTL_AMDXDNA_EXEC_CMD"; + case DRM_IOCTL_AMDXDNA_WAIT_CMD: + return "DRM_IOCTL_AMDXDNA_WAIT_CMD"; + case DRM_IOCTL_AMDXDNA_GET_INFO: + return "DRM_IOCTL_AMDXDNA_GET_INFO"; + case DRM_IOCTL_AMDXDNA_SET_STATE: + return "DRM_IOCTL_AMDXDNA_SET_STATE"; + case DRM_IOCTL_GEM_CLOSE: 
+ return "DRM_IOCTL_GEM_CLOSE"; + case DRM_IOCTL_PRIME_HANDLE_TO_FD: + return "DRM_IOCTL_PRIME_HANDLE_TO_FD"; + case DRM_IOCTL_PRIME_FD_TO_HANDLE: + return "DRM_IOCTL_PRIME_FD_TO_HANDLE"; + case DRM_IOCTL_SYNCOBJ_CREATE: + return "DRM_IOCTL_SYNCOBJ_CREATE"; + case DRM_IOCTL_SYNCOBJ_QUERY: + return "DRM_IOCTL_SYNCOBJ_QUERY"; + case DRM_IOCTL_SYNCOBJ_DESTROY: + return "DRM_IOCTL_SYNCOBJ_DESTROY"; + case DRM_IOCTL_SYNCOBJ_HANDLE_TO_FD: + return "DRM_IOCTL_SYNCOBJ_HANDLE_TO_FD"; + case DRM_IOCTL_SYNCOBJ_FD_TO_HANDLE: + return "DRM_IOCTL_SYNCOBJ_FD_TO_HANDLE"; + case DRM_IOCTL_SYNCOBJ_TIMELINE_SIGNAL: + return "DRM_IOCTL_SYNCOBJ_TIMELINE_SIGNAL"; + case DRM_IOCTL_SYNCOBJ_TIMELINE_WAIT: + return "DRM_IOCTL_SYNCOBJ_TIMELINE_WAIT"; + default: + return "UNKNOWN(" + std::to_string(cmd) + ")"; + } + return "UNKNOWN(" + std::to_string(cmd) + ")"; +} + +// Device memory heap needs to be within one 64MB page. The maximum size is +// 64MB. +const size_t dev_mem_size = (64 << 20); +} // namespace + +namespace shim_xdna { + +pdev::pdev() { + const std::lock_guard lock(m_lock); + // TODO(max): hardcoded + m_dev_fd = ::open("/dev/accel/accel0", O_RDWR); + if (m_dev_fd < 0) shim_err(EINVAL, "Failed to open KMQ device"); + SHIM_DEBUG("Device opened, fd=%d", m_dev_fd); + m_dev_heap_bo = + std::make_unique(*this, dev_mem_size, AMDXDNA_BO_DEV_HEAP); + SHIM_DEBUG("Created KMQ pcidev"); +} + +pdev::~pdev() { + SHIM_DEBUG("Destroying KMQ pcidev"); + const std::lock_guard lock(m_lock); + m_dev_heap_bo.reset(); + ::close(m_dev_fd); + SHIM_DEBUG("Device closed, fd=%d", m_dev_fd); + SHIM_DEBUG("Destroyed KMQ pcidev"); +} + +void pdev::ioctl(unsigned long cmd, void *arg) const { + if (::ioctl(m_dev_fd, cmd, arg) == -1) { + shim_err(errno, "%s IOCTL failed", ioctl_cmd2name(cmd).c_str()); + } +} + +void *pdev::mmap(void *addr, size_t len, int prot, int flags, + off_t offset) const { + void *ret = ::mmap(addr, len, prot, flags, m_dev_fd, offset); + if (ret == reinterpret_cast(-1)) + shim_err(errno, 
+             "mmap(addr=%p, len=%ld, prot=%d, flags=%d, offset=%ld) failed",
+             addr, len, prot, flags, offset);
+  return ret;
+}
+
+// Stores the array dimensions; the pdev member is default-constructed.
+device::device(uint32_t n_rows, uint32_t n_cols)
+    : n_rows(n_rows), n_cols(n_cols) {
+  SHIM_DEBUG("Created KMQ device n_rows %d n_cols %d", n_rows, n_cols);
+}
+
+device::~device() { SHIM_DEBUG("Destroying KMQ device"); }
+
+const pdev &device::get_pdev() const { return m_pdev; }
+
+// Builds a hardware context for `pdi` / `cu_name` on this device, passing the
+// device's row and column counts and the caller's QoS map.
+hw_ctx device::create_hw_context(const std::vector &pdi,
+                                 const std::string &cu_name,
+                                 const std::map &qos) {
+  return {*this, pdi, cu_name, n_rows, n_cols, qos};
+}
+
+// Same as above without a QoS map.
+hw_ctx device::create_hw_context(const std::vector &pdi,
+                                 const std::string &cu_name) {
+  return {*this, pdi, cu_name, n_rows, n_cols};
+}
+
+// Allocates a BO assigned to hardware context `ctx_id`.
+std::unique_ptr device::alloc_bo(uint32_t ctx_id, size_t size,
+                                 shim_xcl_bo_flags flags) {
+  return std::make_unique(this->m_pdev, ctx_id, size, flags);
+}
+
+// Allocates a BO that is not assigned to any hardware context.
+std::unique_ptr device::alloc_bo(size_t size, shim_xcl_bo_flags flags) {
+  return alloc_bo(AMDXDNA_INVALID_CTX_HANDLE, size, flags);
+}
+
+// Same as above, taking only the 32-bit `flags` member.
+std::unique_ptr device::alloc_bo(size_t size, uint32_t flags) {
+  return alloc_bo(AMDXDNA_INVALID_CTX_HANDLE, size,
+                  shim_xcl_bo_flags{.flags = flags});
+}
+
+// Imports a BO exported by process `pid`; the handle is first resolved into a
+// descriptor of this process by import_fd().
+// NOTE(review): import_fd() returns int64_t while import_bo(int) takes int, so
+// the value is narrowed implicitly here.
+std::unique_ptr device::import_bo(pid_t pid, int ehdl) {
+  return import_bo(import_fd(pid, ehdl));
+}
+
+// Creates a fence on this device; the access-mode argument is ignored.
+std::unique_ptr device::create_fence(fence_handle::access_mode) {
+  return std::make_unique(*this);
+}
+
+// Imports a fence exported by process `pid`.
+std::unique_ptr device::import_fence(pid_t pid, int ehdl) {
+  return std::make_unique(*this, import_fd(pid, ehdl));
+}
+
+// Imports a BO from a descriptor that is already valid in this process.
+std::unique_ptr device::import_bo(int ehdl) const {
+  return std::make_unique(this->m_pdev, ehdl);
+}
+
+// Reads `size` bytes at `offset` from the AIE tile at (col, row) through the
+// DRM_AMDXDNA_READ_AIE_MEM query and returns them.
+std::vector device::read_aie_mem(uint16_t col, uint16_t row,
+                                 uint32_t offset, uint32_t size) {
+  amdxdna_drm_aie_mem mem{};
+  std::vector store_buf(size);
+  mem.col = col;
+  mem.row = row;
+  mem.addr = offset;
+  mem.size = size;
+  mem.buf_p = reinterpret_cast(store_buf.data());
+  amdxdna_drm_get_info arg = {.param = DRM_AMDXDNA_READ_AIE_MEM,
+                              .buffer_size = sizeof(mem),
+
.buffer = reinterpret_cast(&mem)}; + m_pdev.ioctl(DRM_IOCTL_AMDXDNA_GET_INFO, &arg); + return store_buf; +} + +uint32_t device::read_aie_reg(uint16_t col, uint16_t row, uint32_t reg_addr) { + amdxdna_drm_aie_reg reg{}; + reg.col = col; + reg.row = row; + reg.addr = reg_addr; + reg.val = 0; + amdxdna_drm_get_info arg = {.param = DRM_AMDXDNA_READ_AIE_REG, + .buffer_size = sizeof(reg), + .buffer = reinterpret_cast(®)}; + m_pdev.ioctl(DRM_IOCTL_AMDXDNA_GET_INFO, &arg); + return reg.val; +} + +size_t device::write_aie_mem(uint16_t col, uint16_t row, uint32_t offset, + const std::vector &buf) { + amdxdna_drm_aie_mem mem{}; + uint32_t size = static_cast(buf.size()); + mem.col = col; + mem.row = row; + mem.addr = offset; + mem.size = size; + mem.buf_p = reinterpret_cast(buf.data()); + amdxdna_drm_get_info arg = {.param = DRM_AMDXDNA_WRITE_AIE_MEM, + .buffer_size = sizeof(mem), + .buffer = reinterpret_cast(&mem)}; + m_pdev.ioctl(DRM_IOCTL_AMDXDNA_SET_STATE, &arg); + return size; +} + +void device::write_aie_reg(uint16_t col, uint16_t row, uint32_t reg_addr, + uint32_t reg_val) { + amdxdna_drm_aie_reg reg{}; + reg.col = col; + reg.row = row; + reg.addr = reg_addr; + reg.val = reg_val; + amdxdna_drm_get_info arg = {.param = DRM_AMDXDNA_WRITE_AIE_REG, + .buffer_size = sizeof(reg), + .buffer = reinterpret_cast(®)}; + m_pdev.ioctl(DRM_IOCTL_AMDXDNA_SET_STATE, &arg); +} + +std::string read_sysfs(const std::string &filename) { + std::ifstream file(filename); + std::string line; + if (file.is_open()) { + std::getline(file, line); + file.close(); + } else { + std::cerr << "Error opening file: " << filename << std::endl; + line = ""; + } + return line; +} + +std::filesystem::path find_npu_device() { + const std::filesystem::path drvpath = "/sys/bus/pci/drivers/amdxdna"; + for (auto const &dir_entry : std::filesystem::directory_iterator{drvpath}) + if (dir_entry.is_symlink()) { + std::cout << dir_entry.path() << '\n'; + auto actual_path = drvpath / 
std::filesystem::read_symlink(dir_entry); + auto rel = std::filesystem::relative(actual_path, "/sys/devices"); + if (!rel.empty() && rel.native()[0] != '.') return absolute(actual_path); + } + shim_err(errno, "No npu device found"); +} + +} // namespace shim_xdna diff --git a/runtime/src/iree-amd-aie/driver/xrt-lite/shim/linux/kmq/device.h b/runtime/src/iree-amd-aie/driver/xrt-lite/shim/linux/kmq/device.h new file mode 100644 index 000000000..8ace4e79d --- /dev/null +++ b/runtime/src/iree-amd-aie/driver/xrt-lite/shim/linux/kmq/device.h @@ -0,0 +1,70 @@ +// SPDX-License-Identifier: Apache-2.0 +// Copyright (C) 2022-2024, Advanced Micro Devices, Inc. All rights reserved. + +#ifndef PCIE_DEVICE_LINUX_XDNA_H +#define PCIE_DEVICE_LINUX_XDNA_H + +#include +#include + +#include "fence.h" +#include "xrt_mem.h" + +namespace shim_xdna { +struct pdev; +struct bo; + +struct pdev { + mutable std::mutex m_lock; + mutable int m_dev_fd = -1; + mutable std::unique_ptr m_dev_heap_bo; + + pdev(); + ~pdev(); + + void ioctl(unsigned long cmd, void *arg) const; + void *mmap(void *addr, size_t len, int prot, int flags, off_t offset) const; +}; + +struct device { + enum class access_mode : uint8_t { exclusive = 0, shared = 1 }; + pdev m_pdev; + uint32_t n_rows; + uint32_t n_cols; + + device(uint32_t n_rows, uint32_t n_cols); + ~device(); + + std::unique_ptr import_bo(int ehdl) const; + const pdev &get_pdev() const; + + std::unique_ptr alloc_bo(uint32_t ctx_id, size_t size, + shim_xcl_bo_flags flags); + std::unique_ptr alloc_bo(size_t size, uint32_t flags); + std::unique_ptr alloc_bo(size_t size, shim_xcl_bo_flags flags); + std::unique_ptr import_bo(pid_t, int); + + hw_ctx create_hw_context(const std::vector &pdi, + const std::string &cu_name, + const std::map &qos); + hw_ctx create_hw_context(const std::vector &pdi, + const std::string &cu_name); + + std::vector read_aie_mem(uint16_t col, uint16_t row, uint32_t offset, + uint32_t size); + size_t write_aie_mem(uint16_t col, uint16_t row, 
uint32_t offset,
+                       const std::vector &buf);
+  uint32_t read_aie_reg(uint16_t col, uint16_t row, uint32_t reg_addr);
+  void write_aie_reg(uint16_t col, uint16_t row, uint32_t reg_addr,
+                     uint32_t reg_val);
+
+  // The access-mode argument of create_fence is currently ignored by the
+  // implementation.
+  std::unique_ptr create_fence(fence_handle::access_mode);
+  std::unique_ptr import_fence(pid_t, int);
+};
+
+// Returns the first line of `filename`, or an empty string (after logging to
+// stderr) when the file cannot be opened.
+std::string read_sysfs(const std::string &filename);
+// Returns the absolute path of the first device symlinked under
+// /sys/bus/pci/drivers/amdxdna whose target lies under /sys/devices.
+std::filesystem::path find_npu_device();
+
+}  // namespace shim_xdna
+
+#endif
diff --git a/runtime/src/iree-amd-aie/driver/xrt-lite/shim/linux/kmq/ert.h b/runtime/src/iree-amd-aie/driver/xrt-lite/shim/linux/kmq/ert.h
new file mode 100644
index 000000000..058b68530
--- /dev/null
+++ b/runtime/src/iree-amd-aie/driver/xrt-lite/shim/linux/kmq/ert.h
@@ -0,0 +1,1163 @@
+/*
+ * Copyright (C) 2019-2022, Xilinx Inc
+ *
+ * This file is dual licensed. It may be redistributed and/or modified
+ * under the terms of the Apache 2.0 License OR version 2 of the GNU
+ * General Public License.
+ *
+ * Apache License Verbiage
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ *
+ * GPL license Verbiage:
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License as
+ * published by the Free Software Foundation; either version 2 of the
+ * License, or (at your option) any later version.
This program is
+ * distributed in the hope that it will be useful, but WITHOUT ANY
+ * WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public
+ * License for more details. You should have received a copy of the
+ * GNU General Public License along with this program; if not, write
+ * to the Free Software Foundation, Inc., 59 Temple Place, Suite 330,
+ * Boston, MA 02111-1307 USA
+ *
+ */
+
+/**
+ * DOC: XRT Embedded Runtime definition
+ *
+ * Header file *ert.h* defines data structures used by Embedded Runtime (ERT)
+ * and XRT xclExecBuf() API.
+ */
+
+#ifndef _ERT_H_
+#define _ERT_H_
+
+#if defined(__linux__) && defined(__KERNEL__)
+#include
+#elif defined(__windows__) && defined(_KERNEL_MODE)
+#include
+#elif defined(__cplusplus) && !defined(_KERNEL_MODE)
+#include
+#include
+#else
+#include
+#include
+#include
+#endif
+
+#ifdef _WIN32
+#pragma warning(push)
+#pragma warning(disable : 4200 4201)
+#endif
+
+#if defined(__GNUC__)
+#pragma GCC diagnostic push
+#pragma GCC diagnostic ignored "-Wpedantic"
+#endif
+
+/* Casts from a generic packet pointer to a specific command layout. */
+#define to_cfg_pkg(pkg) ((struct ert_configure_cmd *)(pkg))
+#define to_start_krnl_pkg(pkg) ((struct ert_start_kernel_cmd *)(pkg))
+#define to_copybo_pkg(pkg) ((struct ert_start_copybo_cmd *)(pkg))
+#define to_cfg_sk_pkg(pkg) ((struct ert_configure_sk_cmd *)(pkg))
+#define to_init_krnl_pkg(pkg) ((struct ert_init_kernel_cmd *)(pkg))
+#define to_validate_pkg(pkg) ((struct ert_validate_cmd *)(pkg))
+#define to_abort_pkg(pkg) ((struct ert_abort_cmd *)(pkg))
+
+#define HOST_RW_PATTERN 0xF0F0F0F0
+#define DEVICE_RW_PATTERN 0x0F0F0F0F
+
+/**
+ * struct ert_packet: ERT generic packet format
+ *
+ * @state: [3-0] current state of a command
+ * @custom: [11-4] custom per specific commands
+ * @count: [22-12] number of words in payload (data)
+ * @opcode: [27-23] opcode identifying specific command
+ * @type: [31-28] type of command (currently 0)
+ * @data: count number of words
representing packet payload
+ */
+struct ert_packet {
+  union {
+    struct {
+      uint32_t state : 4;   /* [3-0] */
+      uint32_t custom : 8;  /* [11-4] */
+      uint32_t count : 11;  /* [22-12] */
+      uint32_t opcode : 5;  /* [27-23] */
+      uint32_t type : 4;    /* [31-28] */
+    };
+    uint32_t header;
+  };
+  uint32_t data[1]; /* count number of words */
+};
+
+/**
+ * struct ert_start_kernel_cmd: ERT start kernel command format
+ *
+ * @state: [3-0] current state of a command
+ * @stat_enabled: [4] enables the driver to record timestamps for the various
+ * states the cmd has gone through. The stat data
+ * is appended after cmd data.
+ * @extra_cu_masks: [11-10] extra CU masks in addition to mandatory mask
+ * @count: [22-12] number of words following header for cmd data. Does
+ * not include stat data.
+ * @opcode: [27-23] 0, opcode for start_kernel
+ * @type: [31-28] 0, type of start_kernel
+ *
+ * @cu_mask: first mandatory CU mask
+ * @data: count-1 number of words representing interpreted payload
+ *
+ * The packet payload is comprised of reserved id field, a mandatory CU mask,
+ * and extra_cu_masks per header field, followed by a CU register map of size
+ * (count - (1 + extra_cu_masks)) uint32_t words.
+ */
+struct ert_start_kernel_cmd {
+  union {
+    struct {
+      uint32_t state : 4;          /* [3-0] */
+      uint32_t stat_enabled : 1;   /* [4] */
+      uint32_t unused : 5;         /* [9-5] */
+      uint32_t extra_cu_masks : 2; /* [11-10] */
+      uint32_t count : 11;         /* [22-12] */
+      uint32_t opcode : 5;         /* [27-23] */
+      uint32_t type : 4;           /* [31-28] */
+    };
+    uint32_t header;
+  };
+
+  /* payload */
+  uint32_t cu_mask; /* mandatory cu mask */
+  uint32_t data[1]; /* count-1 number of words */
+};
+
+/**
+ * struct ert_dpu_data - interpretation of data payload for ERT_START_DPU
+ *
+ * @instruction_buffer: address of instruction buffer
+ * @instruction_buffer_size: size of instruction buffer in bytes
+ * @chained: number of following ert_dpu_data elements
+ *
+ * The ert_dpu_data is prepended to data payload of ert_start_kernel_cmd
+ * after any extra cu masks. The payload count of the ert packet is
+ * incremented with the size (words) of ert_dpu_data elements
+ * prepended to the data payload.
+ *
+ * The data payload for ERT_START_DPU is interpreted as fixed instruction
+ * buffer address along with instruction count, followed by regular kernel
+ * arguments.
+ */
+struct ert_dpu_data {
+  uint64_t instruction_buffer;      /* buffer address 2 words */
+  uint32_t instruction_buffer_size; /* size of buffer in bytes */
+  uint32_t chained; /* number of following ert_dpu_data elements */
+};
+
+/**
+ * struct ert_npu_data - interpretation of data payload for ERT_START_NPU
+ *
+ * @instruction_buffer: address of instruction buffer
+ * @instruction_buffer_size: size of instruction buffer in bytes
+ * @instruction_prop_count: WORD length of property name value pairs
+ *
+ * The ert_npu_data is prepended to data payload of ert_start_kernel_cmd
+ * after any extra cu masks. The payload count of the ert packet is
+ * incremented with the size (words) of ert_npu_data elements
+ * prepended to the data payload.
+ *
+ * The data payload for ERT_START_NPU is interpreted as instruction
+ * buffer address, instruction count along with instruction property,
+ * followed by regular kernel arguments.
+ *
+ * When instruction_prop_count is non-zero, it indicates the length
+ * (in 32 bits WORD) of the instruction buffer properties after this
+ * field. This count is reserved for future extension. One example
+ * property is the number of actual columns this instruction used.
+ */
+struct ert_npu_data {
+  uint64_t instruction_buffer;      /* buffer address 2 words */
+  uint32_t instruction_buffer_size; /* size of buffer in bytes */
+  uint32_t
+      instruction_prop_count; /* WORD length of following properties nv pairs */
+};
+
+/**
+ * struct ert_npu_preempt_data - interpretation of data payload for
+ * ERT_START_NPU_PREEMPT
+ *
+ * @instruction_buffer: address of instruction buffer
+ * @save_buffer: address of save instruction buffer
+ * @restore_buffer: address of restore instruction buffer
+ * @instruction_buffer_size: size of instruction buffer in bytes
+ * @save_buffer_size: size of save instruction buffer in bytes
+ * @restore_buffer_size: size of restore instruction buffer in bytes
+ * @instruction_prop_count: number of property name value pairs
+ *
+ * The ert_npu_preempt_data is prepended to data payload of ert_start_kernel_cmd
+ * after any extra cu masks. The payload count of the ert packet is
+ * incremented with the size (words) of ert_npu_preempt_data elements
+ * prepended to the data payload.
+ *
+ * The data payload for ERT_START_NPU_PREEMPT is interpreted as instruction
+ * buffer, save instruction buffer, restore instruction buffer and their
+ * size, along with instruction property, followed by regular kernel arguments.
+ *
+ * When instruction_prop_count is non-zero, it indicates the length
+ * (in 32 bits WORD) of the instruction buffer properties after this
+ * field. This count is reserved for future extension.
One example
+ * property is the number of actual columns this instruction used.
+ */
+struct ert_npu_preempt_data {
+  uint64_t instruction_buffer;      /* buffer address 2 words */
+  uint64_t save_buffer;             /* buffer address 2 words */
+  uint64_t restore_buffer;          /* buffer address 2 words */
+  uint32_t instruction_buffer_size; /* size of buffer in bytes */
+  uint32_t save_buffer_size;        /* size of buffer in bytes */
+  uint32_t restore_buffer_size;     /* size of buffer in bytes */
+  uint32_t instruction_prop_count;  /* DWORD length of following properties nv
+                                       pairs */
+};
+
+/**
+ * struct ert_cmd_chain_data - interpretation of data payload for ERT_CMD_CHAIN
+ *
+ * @command_count: number of commands in chain
+ * @submit_index: index of last successfully submitted command in chain
+ * @error_index: index of failing command if cmd status is not completed
+ * @data[]: address of each command in chain
+ *
+ * This is the payload of an *ert_packet* when the opcode is ERT_CMD_CHAIN
+ */
+struct ert_cmd_chain_data {
+  uint32_t command_count;
+  uint32_t submit_index;
+  uint32_t error_index;
+  uint32_t reserved[3];
+  uint64_t data[];
+};
+
+/*
+ * ert_write_return_code / ert_read_return_code store and load a return code.
+ * By default it lives in the data word at index
+ * (count - 1 - extra_cu_masks); with U30_DEBUG defined it lives in cu_mask.
+ */
+#ifndef U30_DEBUG
+#define ert_write_return_code(cmd, value)                                    \
+  do {                                                                       \
+    struct ert_start_kernel_cmd *skcmd = (struct ert_start_kernel_cmd *)cmd; \
+    int end_idx = skcmd->count - 1 - skcmd->extra_cu_masks;                  \
+    skcmd->data[end_idx] = value;                                            \
+  } while (0)
+
+#define ert_read_return_code(cmd, ret)                                       \
+  do {                                                                       \
+    struct ert_start_kernel_cmd *skcmd = (struct ert_start_kernel_cmd *)cmd; \
+    int end_idx = skcmd->count - 1 - skcmd->extra_cu_masks;                  \
+    ret = skcmd->data[end_idx];                                              \
+  } while (0)
+#else
+/* These are for debug legacy U30 firmware */
+#define ert_write_return_code(cmd, value)                                    \
+  do {                                                                       \
+    struct ert_start_kernel_cmd *skcmd = (struct ert_start_kernel_cmd *)cmd; \
+    skcmd->cu_mask = value;                                                  \
+  } while (0)
+
+#define ert_read_return_code(cmd, ret)                                       \
+  do {                                                                       \
+    struct ert_start_kernel_cmd *skcmd = (struct ert_start_kernel_cmd *)cmd; \
+    ret =
skcmd->cu_mask;                                                    \
+  } while (0)
+#endif
+
+/**
+ * struct ert_init_kernel_cmd: ERT initialize kernel command format
+ * this command initializes CUs by writing CU registers. CUs are
+ * represented by cu_mask and extra_cu_masks.
+ *
+ * @state: [3-0] current state of a command
+ * @update_rtp: [4] command is for runtime update of cu argument
+ * @extra_cu_masks: [11-10] extra CU masks in addition to mandatory mask
+ * @count: [22-12] number of words following header
+ * @opcode: [27-23] 0, opcode for init_kernel
+ * @type: [31-28] 0, type of init_kernel
+ *
+ * @cu_run_timeout the configured CU timeout value in Microseconds
+ * setting to 0 means CU should not timeout
+ * @cu_reset_timeout the configured CU reset timeout value in Microseconds
+ * when the CU times out, it will be reset. this indicates
+ * CU reset should be completed within the timeout value.
+ * if cu_run_timeout is set to 0, this field is undefined.
+ *
+ * @cu_mask: first mandatory CU mask
+ * @data: count-9 number of words representing interpreted payload
+ *
+ * The packet payload is comprised of reserved id field, 8 reserved fields,
+ * a mandatory CU mask, and extra_cu_masks per header field, followed by a
+ * CU register map of size (count - (9 + extra_cu_masks)) uint32_t words.
+ */
+struct ert_init_kernel_cmd {
+  union {
+    struct {
+      uint32_t state : 4;          /* [3-0] */
+      uint32_t update_rtp : 1;     /* [4] */
+      uint32_t unused : 5;         /* [9-5] */
+      uint32_t extra_cu_masks : 2; /* [11-10] */
+      uint32_t count : 11;         /* [22-12] */
+      uint32_t opcode : 5;         /* [27-23] */
+      uint32_t type : 4;           /* [31-28] */
+    };
+    uint32_t header;
+  };
+
+  uint32_t cu_run_timeout;   /* CU timeout value in Microseconds */
+  uint32_t cu_reset_timeout; /* CU reset timeout value in Microseconds */
+  uint32_t reserved[6];      /* reserved for future use */
+
+  /* payload */
+  uint32_t cu_mask; /* mandatory cu mask */
+  uint32_t data[1]; /* count-9 number of words */
+};
+
+#define KDMA_BLOCK_SIZE 64 /* Limited by KDMA CU */
+/* Copy-BO command: header bit-fields followed by source/destination address,
+ * BO handle and size words. */
+struct ert_start_copybo_cmd {
+  uint32_t state : 4;          /* [3-0], must be ERT_CMD_STATE_NEW */
+  uint32_t unused : 6;         /* [9-4] */
+  uint32_t extra_cu_masks : 2; /* [11-10], = 3 */
+  uint32_t count : 11;         /* [22-12], = 16, exclude 'arg' */
+  uint32_t opcode : 5;         /* [27-23], = ERT_START_COPYBO */
+  uint32_t type : 4;           /* [31-28], = ERT_DEFAULT */
+  uint32_t cu_mask[4];         /* mandatory cu masks */
+  uint32_t reserved[4];        /* for scheduler use */
+  uint32_t src_addr_lo;        /* low 32 bit of src addr */
+  uint32_t src_addr_hi;        /* high 32 bit of src addr */
+  uint32_t src_bo_hdl;         /* src bo handle, cleared by driver */
+  uint32_t dst_addr_lo;        /* low 32 bit of dst addr */
+  uint32_t dst_addr_hi;        /* high 32 bit of dst addr */
+  uint32_t dst_bo_hdl;         /* dst bo handle, cleared by driver */
+  uint32_t size;               /* size in bytes low 32 bit */
+  uint32_t size_hi;            /* size in bytes high 32 bit */
+  void *arg;                   /* pointer to aux data for KDS */
+};
+
+/**
+ * struct ert_configure_cmd: ERT configure command format
+ *
+ * @state: [3-0] current state of a command
+ * @count: [22-12] number of words in payload (5 + num_cus)
+ * @opcode: [27-23] 1, opcode for configure
+ * @type: [31-28] 0, type of configure
+ *
+ * @slot_size: command queue slot size
+ * @num_cus: number of compute units in program
+ *
@cu_shift: shift value to convert CU idx to CU addr + * @cu_base_addr: base address to add to CU addr for actual physical address + * + * @ert:1 enable embedded HW scheduler + * @polling:1 poll for command completion + * @cu_dma:1 enable CUDMA custom module for HW scheduler + * @cu_isr:1 enable CUISR custom module for HW scheduler + * @cq_int:1 enable interrupt from host to HW scheduler + * @cdma:1 enable CDMA kernel + * @unused:25 + * @dsa52:1 reserved for internal use + * + * @data: addresses of @num_cus CUs + */ +struct ert_configure_cmd { + union { + struct { + uint32_t state : 4; /* [3-0] */ + uint32_t unused : 8; /* [11-4] */ + uint32_t count : 11; /* [22-12] */ + uint32_t opcode : 5; /* [27-23] */ + uint32_t type : 4; /* [31-27] */ + }; + uint32_t header; + }; + + /* payload */ + uint32_t slot_size; + uint32_t num_cus; + uint32_t cu_shift; + uint32_t cu_base_addr; + + /* features */ + uint32_t ert : 1; + uint32_t polling : 1; + uint32_t cu_dma : 1; + uint32_t cu_isr : 1; + uint32_t cq_int : 1; + uint32_t cdma : 1; + uint32_t dataflow : 1; + /* WORKAROUND: allow xclRegWrite/xclRegRead access shared CU */ + uint32_t rw_shared : 1; + uint32_t kds_30 : 1; + uint32_t dmsg : 1; + uint32_t echo : 1; + uint32_t intr : 1; + uint32_t unusedf : 19; + uint32_t dsa52 : 1; + + /* cu address map size is num_cus */ + uint32_t data[1]; +}; + +/* + * Note: We need to put maximum 128 soft kernel image + * in one config command (1024 DWs including header). + * So each one needs to be smaller than 8 DWs. + * + * This data struct is obsoleted. Only used in legacy ERT firmware. + * Use 'struct config_sk_image_uuid' instead on XGQ based ERT. 
+ * + * @start_cuidx: start index of compute units of each image + * @num_cus: number of compute units of each image + * @sk_name: symbol name of soft kernel of each image + */ +struct config_sk_image { + uint32_t start_cuidx; + uint32_t num_cus; + uint32_t sk_name[5]; +}; + +/* + * Note: We need to put maximum 128 soft kernel image + * in one config command (1024 DWs including header). + * So each one needs to be smaller than 8 DWs. + * + * @start_cuidx: start index of compute units of each image + * @num_cus: number of compute units of each image + * @sk_name: symbol name of soft kernel of each image + * @sk_uuid: xclbin uuid that this soft kernel image belones to + */ +struct config_sk_image_uuid { + uint32_t start_cuidx; + uint32_t num_cus; + uint32_t sk_name[5]; + unsigned char sk_uuid[16]; + uint32_t slot_id; +}; + +/** + * struct ert_configure_sk_cmd: ERT configure soft kernel command format + * + * @state: [3-0] current state of a command + * @count: [22-12] number of words in payload + * @opcode: [27-23] 1, opcode for configure + * @type: [31-27] 0, type of configure + * + * @num_image: number of images + */ +struct ert_configure_sk_cmd { + union { + struct { + uint32_t state : 4; /* [3-0] */ + uint32_t unused : 8; /* [11-4] */ + uint32_t count : 11; /* [22-12] */ + uint32_t opcode : 5; /* [27-23] */ + uint32_t type : 4; /* [31-27] */ + }; + uint32_t header; + }; + + /* payload */ + uint32_t num_image; + struct config_sk_image image[1]; +}; + +/** + * struct ert_unconfigure_sk_cmd: ERT unconfigure soft kernel command format + * + * @state: [3-0] current state of a command + * @count: [22-12] number of words in payload + * @opcode: [27-23] 1, opcode for configure + * @type: [31-27] 0, type of configure + * + * @start_cuidx: start index of compute units + * @num_cus: number of compute units in program + */ +struct ert_unconfigure_sk_cmd { + union { + struct { + uint32_t state : 4; /* [3-0] */ + uint32_t unused : 8; /* [11-4] */ + uint32_t count : 11; /* 
[22-12] */ + uint32_t opcode : 5; /* [27-23] */ + uint32_t type : 4; /* [31-27] */ + }; + uint32_t header; + }; + + /* payload */ + uint32_t start_cuidx; + uint32_t num_cus; +}; + +/** + * struct ert_abort_cmd: ERT abort command format. + * + * @exec_bo_handle: The bo handle of execbuf command to abort + */ +struct ert_abort_cmd { + union { + struct { + uint32_t state : 4; /* [3-0] */ + uint32_t custom : 8; /* [11-4] */ + uint32_t count : 11; /* [22-12] */ + uint32_t opcode : 5; /* [27-23] */ + uint32_t type : 4; /* [31-27] */ + }; + uint32_t header; + }; + + /* payload */ + uint64_t exec_bo_handle; +}; + +/** + * struct ert_validate_cmd: ERT BIST command format. + * + */ +struct ert_validate_cmd { + union { + struct { + uint32_t state : 4; /* [3-0] */ + uint32_t custom : 8; /* [11-4] */ + uint32_t count : 11; /* [22-12] */ + uint32_t opcode : 5; /* [27-23] */ + uint32_t type : 4; /* [31-27] */ + }; + uint32_t header; + }; + uint32_t timestamp; + uint32_t cq_read_single; + uint32_t cq_write_single; + uint32_t cu_read_single; + uint32_t cu_write_single; +}; + +/** + * struct ert_validate_cmd: ERT BIST command format. 
+ * + */ +struct ert_access_valid_cmd { + union { + struct { + uint32_t state : 4; /* [3-0] */ + uint32_t custom : 8; /* [11-4] */ + uint32_t count : 11; /* [22-12] */ + uint32_t opcode : 5; /* [27-23] */ + uint32_t type : 4; /* [31-27] */ + }; + uint32_t header; + }; + uint32_t h2h_access; + uint32_t h2d_access; + uint32_t d2h_access; + uint32_t d2d_access; + uint32_t d2cu_access; + uint32_t wr_count; + uint32_t wr_test; +}; + +/** + * ERT command state + * + * @ERT_CMD_STATE_NEW: Set by host before submitting a command to + * scheduler + * @ERT_CMD_STATE_QUEUED: Internal scheduler state + * @ERT_CMD_STATE_SUBMITTED: Internal scheduler state + * @ERT_CMD_STATE_RUNNING: Internal scheduler state + * @ERT_CMD_STATE_COMPLETED: Set by scheduler when command completes + * @ERT_CMD_STATE_ERROR: Set by scheduler if command failed + * @ERT_CMD_STATE_ABORT: Set by scheduler if command abort + * @ERT_CMD_STATE_TIMEOUT: Set by scheduler if command timeout and reset + * @ERT_CMD_STATE_NORESPONSE: Set by scheduler if command timeout and fail to + * reset + */ +enum ert_cmd_state { + ERT_CMD_STATE_NEW = 1, + ERT_CMD_STATE_QUEUED = 2, + ERT_CMD_STATE_RUNNING = 3, + ERT_CMD_STATE_COMPLETED = 4, + ERT_CMD_STATE_ERROR = 5, + ERT_CMD_STATE_ABORT = 6, + ERT_CMD_STATE_SUBMITTED = 7, + ERT_CMD_STATE_TIMEOUT = 8, + ERT_CMD_STATE_NORESPONSE = 9, + ERT_CMD_STATE_SKERROR = 10, // Check for error return code from Soft Kernel + ERT_CMD_STATE_SKCRASHED = 11, // Soft kernel has crashed + ERT_CMD_STATE_MAX, // Always the last one +}; + +struct cu_cmd_state_timestamps { + uint64_t skc_timestamps[ERT_CMD_STATE_MAX]; // In nano-second +}; + +/** + * Opcode types for commands + * + * @ERT_START_CU: start a workgroup on a CU + * @ERT_START_KERNEL: currently aliased to ERT_START_CU + * @ERT_CONFIGURE: configure command scheduler + * @ERT_EXEC_WRITE: execute a specified CU after writing + * @ERT_CU_STAT: get stats about CU execution + * @ERT_START_COPYBO: start KDMA CU or P2P, may be converted to + * 
ERT_START_CU before cmd reach to scheduler, short-term hack + * @ERT_SK_CONFIG: configure soft kernel + * @ERT_SK_START: start a soft kernel + * @ERT_SK_UNCONFIG: unconfigure a soft kernel + * @ERT_START_KEY_VAL: same as ERT_START_CU but with key-value pair flavor + * @ERT_START_DPU: instruction buffer command format + * @ERT_CMD_CHAIN: command chain + * @ERT_START_NPU: instruction buffer command format on NPU format + * @ERT_START_NPU_PREEMPT: instruction buffer command with preemption format on + * NPU + */ +enum ert_cmd_opcode { + ERT_START_CU = 0, + ERT_START_KERNEL = 0, + ERT_CONFIGURE = 2, + ERT_EXIT = 3, + ERT_ABORT = 4, + ERT_EXEC_WRITE = 5, + ERT_CU_STAT = 6, + ERT_START_COPYBO = 7, + ERT_SK_CONFIG = 8, + ERT_SK_START = 9, + ERT_SK_UNCONFIG = 10, + ERT_INIT_CU = 11, + ERT_START_FA = 12, + ERT_CLK_CALIB = 13, + ERT_MB_VALIDATE = 14, + ERT_START_KEY_VAL = 15, + ERT_ACCESS_TEST_C = 16, + ERT_ACCESS_TEST = 17, + ERT_START_DPU = 18, + ERT_CMD_CHAIN = 19, + ERT_START_NPU = 20, + ERT_START_NPU_PREEMPT = 21, +}; + +/** + * Command types + * + * @ERT_DEFAULT: default command type + * @ERT_KDS_LOCAL: command processed by KDS locally + * @ERT_CTRL: control command uses reserved command queue slot + * @ERT_CU: compute unit command + */ +enum ert_cmd_type { + ERT_DEFAULT = 0, + ERT_KDS_LOCAL = 1, + ERT_CTRL = 2, + ERT_CU = 3, + ERT_SCU = 4, +}; + +/** + * Soft kernel types + * + * @SOFTKERNEL_TYPE_EXEC: executable + */ +enum softkernel_type { + SOFTKERNEL_TYPE_EXEC = 0, +}; + +/* + * Base address GPIO per spec + * | Offset | Description + * ----------------------- + * | 0x00 | ERT_MGMT_PF_base_addr (Not sure where this should be use) + * | 0x08 | ERT_USER_PF_base_addr. 
The base address of ERT peripherals + */ +#if defined(ERT_BUILD_V20) +uint32_t ert_base_addr = 0; +#define ERT_BASE_ADDR 0x01F30008 +#endif + +#if defined(ERT_BUILD_V30) +uint32_t ert_base_addr = 0; +#define ERT_BASE_ADDR 0x01F30008 +#endif + +/** + * Address constants per spec + */ +#define ERT_WORD_SIZE 4 /* 4 bytes */ +#define ERT_CQ_SIZE 0x10000 /* 64K */ +#if defined(ERT_BUILD_U50) +#define ERT_CQ_BASE_ADDR 0x340000 +#define ERT_CSR_ADDR 0x360000 +#elif defined(ERT_BUILD_V20) +#define ERT_CQ_BASE_ADDR (0x000000 + ert_base_addr) +#define ERT_CSR_ADDR (0x010000 + ert_base_addr) +#elif defined(ERT_BUILD_V30) +#define ERT_CQ_BASE_ADDR 0x1F60000 +#define ERT_CSR_ADDR (0x010000 + ert_base_addr) +#else +#define ERT_CQ_BASE_ADDR 0x190000 +#define ERT_CSR_ADDR 0x180000 +#endif + +/** + * The STATUS REGISTER is for communicating completed CQ slot indices + * MicroBlaze write, host reads. MB(W) / HOST(COR) + */ +#define ERT_STATUS_REGISTER_ADDR (ERT_CSR_ADDR) +#define ERT_STATUS_REGISTER_ADDR0 (ERT_CSR_ADDR) +#define ERT_STATUS_REGISTER_ADDR1 (ERT_CSR_ADDR + 0x4) +#define ERT_STATUS_REGISTER_ADDR2 (ERT_CSR_ADDR + 0x8) +#define ERT_STATUS_REGISTER_ADDR3 (ERT_CSR_ADDR + 0xC) + +/** + * The CU DMA REGISTER is for communicating which CQ slot is to be started + * on a specific CU. 
MB selects a free CU on which the command can + * run, then writes the 1<state = ERT_CMD_STATE_NEW; + pkt->extra_cu_masks = 3; + pkt->count = 16; + pkt->opcode = ERT_START_COPYBO; + pkt->type = ERT_DEFAULT; + pkt->cu_mask[0] = 0; + pkt->cu_mask[1] = 0; + pkt->cu_mask[2] = 0; + pkt->cu_mask[3] = 0; + pkt->src_addr_lo = (uint32_t)src_offset; + pkt->src_addr_hi = (src_offset >> 32) & 0xFFFFFFFF; + pkt->src_bo_hdl = src_bo; + pkt->dst_addr_lo = (uint32_t)dst_offset; + pkt->dst_addr_hi = (dst_offset >> 32) & 0xFFFFFFFF; + pkt->dst_bo_hdl = dst_bo; + pkt->size = size; + pkt->size_hi = 0; /* set to 0 explicitly */ + pkt->arg = 0; +} +static inline uint64_t ert_copybo_src_offset(struct ert_start_copybo_cmd *pkt) { + return (uint64_t)pkt->src_addr_hi << 32 | pkt->src_addr_lo; +} +static inline uint64_t ert_copybo_dst_offset(struct ert_start_copybo_cmd *pkt) { + return (uint64_t)pkt->dst_addr_hi << 32 | pkt->dst_addr_lo; +} +static inline uint64_t ert_copybo_size(struct ert_start_copybo_cmd *pkt) { + return pkt->size; +} + +static inline bool ert_valid_opcode(struct ert_packet *pkt) { + struct ert_start_kernel_cmd *skcmd; + struct ert_init_kernel_cmd *ikcmd; + struct ert_start_copybo_cmd *sccmd; + struct ert_configure_cmd *ccmd; + struct ert_configure_sk_cmd *cscmd; + struct ert_cmd_chain_data *ccdata; + bool valid; + + switch (pkt->opcode) { + case ERT_START_CU: + skcmd = to_start_krnl_pkg(pkt); + /* 1 cu mask + 4 registers */ + valid = (skcmd->count >= skcmd->extra_cu_masks + 1 + 4); + break; + case ERT_START_DPU: + skcmd = to_start_krnl_pkg(pkt); + /* 1 mandatory cumask + extra_cu_masks + size (in words) of ert_dpu_data + */ + valid = + (skcmd->count >= 1 + skcmd->extra_cu_masks + + sizeof(struct ert_dpu_data) / sizeof(uint32_t)); + break; + case ERT_CMD_CHAIN: + ccdata = (struct ert_cmd_chain_data *)pkt->data; + /* header count must match number of commands in payload */ + valid = (pkt->count == (ccdata->command_count * sizeof(uint64_t) + + sizeof(struct 
ert_cmd_chain_data)) / + sizeof(uint32_t)); + break; + case ERT_START_NPU: + skcmd = to_start_krnl_pkg(pkt); + /* 1 mandatory cumask + extra_cu_masks + ert_npu_data */ + valid = + (skcmd->count >= 1 + skcmd->extra_cu_masks + + sizeof(struct ert_npu_data) / sizeof(uint32_t)); + break; + case ERT_START_NPU_PREEMPT: + skcmd = to_start_krnl_pkg(pkt); + /* 1 mandatory cumask + extra_cu_masks + ert_npu_preempt_data */ + valid = (skcmd->count >= + 1 + skcmd->extra_cu_masks + + sizeof(struct ert_npu_preempt_data) / sizeof(uint32_t)); + break; + case ERT_START_KEY_VAL: + skcmd = to_start_krnl_pkg(pkt); + /* 1 cu mask */ + valid = (skcmd->count >= skcmd->extra_cu_masks + 1); + break; + case ERT_EXEC_WRITE: + skcmd = to_start_krnl_pkg(pkt); + /* 1 cu mask + 6 registers */ + valid = (skcmd->count >= skcmd->extra_cu_masks + 1 + 6); + break; + case ERT_START_FA: + skcmd = to_start_krnl_pkg(pkt); + /* 1 cu mask */ + valid = (skcmd->count >= skcmd->extra_cu_masks + 1); + break; + case ERT_SK_START: + skcmd = to_start_krnl_pkg(pkt); + /* 1 cu mask + 1 control word */ + valid = (skcmd->count >= skcmd->extra_cu_masks + 1 + 1); + break; + case ERT_CONFIGURE: + ccmd = to_cfg_pkg(pkt); + /* 5 mandatory fields in struct */ + valid = (ccmd->count >= 5 + ccmd->num_cus); + break; + case ERT_START_COPYBO: + sccmd = to_copybo_pkg(pkt); + valid = (sccmd->count == 16); + break; + case ERT_INIT_CU: + ikcmd = to_init_krnl_pkg(pkt); + /* 9 mandatory words in struct + 4 control registers */ + valid = (ikcmd->count >= ikcmd->extra_cu_masks + 9 + 4); + break; + case ERT_SK_CONFIG: + cscmd = to_cfg_sk_pkg(pkt); + valid = (cscmd->count == + sizeof(struct config_sk_image) * cscmd->num_image / 4 + 1); + break; + case ERT_CLK_CALIB: + case ERT_MB_VALIDATE: + case ERT_ACCESS_TEST_C: + case ERT_CU_STAT: /* TODO: Rules to validate? 
*/ + case ERT_EXIT: + case ERT_ABORT: + valid = true; + break; + case ERT_SK_UNCONFIG: /* NOTE: obsolete */ + default: + valid = false; + } + + return valid; +} + +static inline uint64_t get_ert_packet_size_bytes(struct ert_packet *pkt) { + // header plus payload + return sizeof(pkt->header) + pkt->count * sizeof(uint32_t); +} + +static inline struct ert_dpu_data *get_ert_dpu_data( + struct ert_start_kernel_cmd *pkt) { + if (pkt->opcode != ERT_START_DPU) return NULL; + + // past extra cu_masks embedded in the packet data + return (struct ert_dpu_data *)(pkt->data + pkt->extra_cu_masks); +} + +static inline struct ert_dpu_data *get_ert_dpu_data_next( + struct ert_dpu_data *dpu_data) { + if (dpu_data->chained == 0) return NULL; + + return dpu_data + 1; +} + +static inline struct ert_cmd_chain_data *get_ert_cmd_chain_data( + struct ert_packet *pkt) { + if (pkt->opcode != ERT_CMD_CHAIN) return NULL; + + return (struct ert_cmd_chain_data *)pkt->data; +} + +static inline struct ert_npu_data *get_ert_npu_data( + struct ert_start_kernel_cmd *pkt) { + if (pkt->opcode != ERT_START_NPU) return NULL; + + // past extra cu_masks embedded in the packet data + return (struct ert_npu_data *)(pkt->data + pkt->extra_cu_masks); +} + +static inline struct ert_npu_preempt_data *get_ert_npu_preempt_data( + struct ert_start_kernel_cmd *pkt) { + if (pkt->opcode != ERT_START_NPU_PREEMPT) return NULL; + + // past extra cu_masks embedded in the packet data + return (struct ert_npu_preempt_data *)(pkt->data + pkt->extra_cu_masks); +} + +static inline uint32_t *get_ert_regmap_begin(struct ert_start_kernel_cmd *pkt) { + switch (pkt->opcode) { + case ERT_START_DPU: + return pkt->data + pkt->extra_cu_masks + + (get_ert_dpu_data(pkt)->chained + 1) * + sizeof(struct ert_dpu_data) / sizeof(uint32_t); + + case ERT_START_NPU: + return pkt->data + pkt->extra_cu_masks + + sizeof(struct ert_npu_data) / sizeof(uint32_t) + + get_ert_npu_data(pkt)->instruction_prop_count; + + case ERT_START_NPU_PREEMPT: + 
return pkt->data + pkt->extra_cu_masks + + sizeof(struct ert_npu_preempt_data) / sizeof(uint32_t) + + get_ert_npu_preempt_data(pkt)->instruction_prop_count; + + default: + // skip past embedded extra cu_masks + return pkt->data + pkt->extra_cu_masks; + } +} + +static inline uint32_t *get_ert_regmap_end(struct ert_start_kernel_cmd *pkt) { + // pkt->count includes the mandatory cumask which precededs data array + return &pkt->cu_mask + pkt->count; +} + +static inline uint64_t get_ert_regmap_size_bytes( + struct ert_start_kernel_cmd *pkt) { + return (get_ert_regmap_end(pkt) - get_ert_regmap_begin(pkt)) * + sizeof(uint32_t); +} + +#ifdef __linux__ +#define P2ROUNDUP(x, align) (-(-(x) & -(align))) +static inline struct cu_cmd_state_timestamps *ert_start_kernel_timestamps( + struct ert_start_kernel_cmd *pkt) { + uint64_t offset = pkt->count * sizeof(uint32_t) + sizeof(pkt->header); + /* Make sure the offset of timestamps are properly aligned. */ + return ( + struct cu_cmd_state_timestamps *)((char *)pkt + + P2ROUNDUP(offset, sizeof(uint64_t))); +} + +/* Return 0 if this pkt doesn't support timestamp or disabled */ +static inline int get_size_with_timestamps_or_zero(struct ert_packet *pkt) { + struct ert_start_kernel_cmd *skcmd; + int size = 0; + + switch (pkt->opcode) { + case ERT_START_CU: + case ERT_EXEC_WRITE: + case ERT_START_FA: + case ERT_SK_START: + skcmd = to_start_krnl_pkg(pkt); + if (skcmd->stat_enabled) { + size = (char *)ert_start_kernel_timestamps(skcmd) - (char *)pkt; + size += sizeof(struct cu_cmd_state_timestamps); + } + } + + return size; +} +#endif + +#if defined(__GNUC__) +#pragma GCC diagnostic pop +#endif + +#ifdef _WIN32 +#pragma warning(pop) +#endif + +#endif diff --git a/runtime/src/iree-amd-aie/driver/xrt-lite/shim/linux/kmq/fence.cpp b/runtime/src/iree-amd-aie/driver/xrt-lite/shim/linux/kmq/fence.cpp new file mode 100644 index 000000000..06fd948fc --- /dev/null +++ b/runtime/src/iree-amd-aie/driver/xrt-lite/shim/linux/kmq/fence.cpp @@ -0,0 
+1,245 @@ +// SPDX-License-Identifier: Apache-2.0 +// Copyright (C) 2024, Advanced Micro Devices, Inc. All rights reserved. + +#include "fence.h" + +#include + +#include + +#include "hwctx.h" +#include "shim_debug.h" + +namespace { + +uint32_t create_syncobj(const shim_xdna::pdev &dev) { + drm_syncobj_create csobj = {.handle = AMDXDNA_INVALID_FENCE_HANDLE, + .flags = 0}; + dev.ioctl(DRM_IOCTL_SYNCOBJ_CREATE, &csobj); + return csobj.handle; +} + +void destroy_syncobj(const shim_xdna::pdev &dev, uint32_t hdl) { + drm_syncobj_destroy dsobj = {.handle = hdl}; + dev.ioctl(DRM_IOCTL_SYNCOBJ_DESTROY, &dsobj); +} + +uint64_t query_syncobj_timeline(const shim_xdna::pdev &dev, uint32_t sobj_hdl) { + uint64_t point = 0; + drm_syncobj_timeline_array sobjs = { + .handles = reinterpret_cast(&sobj_hdl), + .points = reinterpret_cast(&point), + .count_handles = 1, + .flags = 0}; + dev.ioctl(DRM_IOCTL_SYNCOBJ_QUERY, &sobjs); + return point; +} + +int export_syncobj(const shim_xdna::pdev &dev, uint32_t sobj_hdl) { + drm_syncobj_handle esobj = { + .handle = sobj_hdl, + .flags = 0, + .fd = -1, + }; + dev.ioctl(DRM_IOCTL_SYNCOBJ_HANDLE_TO_FD, &esobj); + return esobj.fd; +} + +uint32_t import_syncobj(const shim_xdna::pdev &dev, int fd) { + drm_syncobj_handle isobj = { + .handle = AMDXDNA_INVALID_FENCE_HANDLE, + .flags = 0, + .fd = fd, + }; + dev.ioctl(DRM_IOCTL_SYNCOBJ_FD_TO_HANDLE, &isobj); + return isobj.handle; +} + +void signal_syncobj(const shim_xdna::pdev &dev, uint32_t sobj_hdl, + uint64_t timepoint) { + drm_syncobj_timeline_array sobjs = { + .handles = reinterpret_cast(&sobj_hdl), + .points = reinterpret_cast(&timepoint), + .count_handles = 1, + .flags = 0}; + dev.ioctl(DRM_IOCTL_SYNCOBJ_TIMELINE_SIGNAL, &sobjs); +} + +void wait_syncobj_done(const shim_xdna::pdev &dev, uint32_t sobj_hdl, + uint64_t timepoint) { + drm_syncobj_timeline_wait wsobj = { + .handles = reinterpret_cast(&sobj_hdl), + .points = reinterpret_cast(&timepoint), + .timeout_nsec = std::numeric_limits::max(), /* 
wait forever */ + .count_handles = 1, + .flags = DRM_SYNCOBJ_WAIT_FLAGS_WAIT_FOR_SUBMIT, + }; + dev.ioctl(DRM_IOCTL_SYNCOBJ_TIMELINE_WAIT, &wsobj); +} + +void wait_syncobj_available(const shim_xdna::pdev &dev, + const uint32_t *sobj_hdls, + const uint64_t *timepoints, uint32_t num) { + drm_syncobj_timeline_wait wsobj = { + .handles = reinterpret_cast(sobj_hdls), + .points = reinterpret_cast(timepoints), + .timeout_nsec = std::numeric_limits::max(), /* wait forever */ + .count_handles = num, + .flags = DRM_SYNCOBJ_WAIT_FLAGS_WAIT_ALL | + DRM_SYNCOBJ_WAIT_FLAGS_WAIT_FOR_SUBMIT | + DRM_SYNCOBJ_WAIT_FLAGS_WAIT_AVAILABLE, + }; + dev.ioctl(DRM_IOCTL_SYNCOBJ_TIMELINE_WAIT, &wsobj); +} + +void submit_wait_syncobjs(const shim_xdna::pdev &dev, + const shim_xdna::hw_ctx *ctx, + const uint32_t *sobj_hdls, const uint64_t *points, + uint32_t num) { + wait_syncobj_available(dev, sobj_hdls, points, num); + + amdxdna_drm_exec_cmd ecmd = { + .hwctx = ctx->m_handle, + .type = AMDXDNA_CMD_SUBMIT_DEPENDENCY, + .cmd_handles = reinterpret_cast(sobj_hdls), + .args = reinterpret_cast(points), + .cmd_count = num, + .arg_count = num, + }; + dev.ioctl(DRM_IOCTL_AMDXDNA_EXEC_CMD, &ecmd); +} + +void submit_signal_syncobj(const shim_xdna::pdev &dev, + const shim_xdna::hw_ctx *ctx, uint32_t sobj_hdl, + uint64_t point) { + amdxdna_drm_exec_cmd ecmd = { + .hwctx = ctx->m_handle, + .type = AMDXDNA_CMD_SUBMIT_SIGNAL, + .cmd_handles = sobj_hdl, + .args = point, + .cmd_count = 1, + .arg_count = 1, + }; + dev.ioctl(DRM_IOCTL_AMDXDNA_EXEC_CMD, &ecmd); +} + +} // namespace + +namespace shim_xdna { + +shared_handle::~shared_handle() { + if (m_fd != -1) close(m_fd); +} + +int shared_handle::get_export_handle() const { return m_fd; } + +fence_handle::fence_handle(const device &device) + : m_pdev(device.get_pdev()), + m_import(std::make_unique(-1)), + m_syncobj_hdl(create_syncobj(m_pdev)) { + SHIM_DEBUG("Fence allocated: %d@%d", m_syncobj_hdl, m_state); +} + +fence_handle::fence_handle(const device &device, 
int ehdl) + : m_pdev(device.get_pdev()), + m_import(std::make_unique(ehdl)), + m_syncobj_hdl(import_syncobj(m_pdev, m_import->get_export_handle())) { + SHIM_DEBUG("Fence imported: %d@%ld", m_syncobj_hdl, m_state); +} + +fence_handle::fence_handle(const fence_handle &f) + : m_pdev(f.m_pdev), + m_import(f.share_handle()), + m_syncobj_hdl(import_syncobj(m_pdev, m_import->get_export_handle())), + m_signaled{f.m_signaled}, + m_state{f.m_state} { + SHIM_DEBUG("Fence cloned: %d@%ld", m_syncobj_hdl, m_state); +} + +fence_handle::~fence_handle() { + SHIM_DEBUG("Fence going away: %d@%ld", m_syncobj_hdl, m_state); + destroy_syncobj(m_pdev, m_syncobj_hdl); +} + +std::unique_ptr fence_handle::share_handle() const { + if (m_state != initial_state) + shim_err(-EINVAL, "Can't share fence_handle not at initial state."); + + return std::make_unique(export_syncobj(m_pdev, m_syncobj_hdl)); +} + +uint64_t fence_handle::get_next_state() const { return m_state + 1; } + +std::unique_ptr fence_handle::clone() const { + return std::make_unique(*this); +} + +uint64_t fence_handle::wait_next_state() const { + std::lock_guard guard(m_lock); + + if (m_state != initial_state && m_signaled) + shim_err(-EINVAL, + "Can't wait on fence_handle that has been signaled before."); + return ++m_state; +} + +// Timeout value is ignored for now. 
+void fence_handle::wait(uint32_t timeout_ms) const { + auto st = signal_next_state(); + SHIM_DEBUG("Waiting for command fence_handle %d@%ld", m_syncobj_hdl, st); + wait_syncobj_done(m_pdev, m_syncobj_hdl, st); +} + +void fence_handle::submit_wait(const hw_ctx *ctx) const { + auto st = signal_next_state(); + SHIM_DEBUG("Submitting wait for command fence_handle %d@%ld", m_syncobj_hdl, + st); + submit_wait_syncobjs(m_pdev, ctx, &m_syncobj_hdl, &st, 1); +} + +uint64_t fence_handle::signal_next_state() const { + std::lock_guard guard(m_lock); + + if (m_state != initial_state && !m_signaled) + shim_err(-EINVAL, "Can't signal fence_handle that has been waited before."); + if (m_state == initial_state) m_signaled = true; + return ++m_state; +} + +void fence_handle::signal() const { + auto st = signal_next_state(); + SHIM_DEBUG("Signaling command fence_handle %d@%ld", m_syncobj_hdl, st); + signal_syncobj(m_pdev, m_syncobj_hdl, st); +} + +void fence_handle::submit_signal(const hw_ctx *ctx) const { + auto st = signal_next_state(); + SHIM_DEBUG("Submitting signal command fence_handle %d@%ld", m_syncobj_hdl, + st); + submit_signal_syncobj(m_pdev, ctx, m_syncobj_hdl, st); +} + +void fence_handle::submit_wait( + const pdev &dev, const hw_ctx *ctx, + const std::vector &fences) { + constexpr int max_fences = 1024; + uint32_t hdls[max_fences]; + uint64_t pts[max_fences]; + int i = 0; + + if (fences.size() > max_fences) + shim_err(-EINVAL, "Too many fences in one submit: %d", fences.size()); + + for (auto f : fences) { + auto fh = static_cast(f); + auto st = fh->wait_next_state(); + SHIM_DEBUG("Waiting for command fence_handle %d@%ld", fh->m_syncobj_hdl, + st); + hdls[i] = fh->m_syncobj_hdl; + pts[i] = st; + i++; + } + submit_wait_syncobjs(dev, ctx, hdls, pts, i); +} + +} // namespace shim_xdna diff --git a/runtime/src/iree-amd-aie/driver/xrt-lite/shim/linux/kmq/fence.h b/runtime/src/iree-amd-aie/driver/xrt-lite/shim/linux/kmq/fence.h new file mode 100644 index 000000000..842b85c2d 
--- /dev/null +++ b/runtime/src/iree-amd-aie/driver/xrt-lite/shim/linux/kmq/fence.h @@ -0,0 +1,57 @@ +// SPDX-License-Identifier: Apache-2.0 +// Copyright (C) 2023-2024, Advanced Micro Devices, Inc. All rights reserved. + +#ifndef _FENCE_XDNA_H_ +#define _FENCE_XDNA_H_ + +#include +#include +#include + +namespace shim_xdna { +struct pdev; +struct device; +struct hw_ctx; + +struct shared_handle { + const int m_fd; + shared_handle(int fd) : m_fd(fd) {} + ~shared_handle(); + int get_export_handle() const; +}; + +struct fence_handle { + using export_handle = int; + const pdev &m_pdev; + const std::unique_ptr m_import; + uint32_t m_syncobj_hdl; + // Protecting below mutables + mutable std::mutex m_lock; + // Set once at first signal + mutable bool m_signaled = false; + // Ever incrementing at each wait/signal + static constexpr uint64_t initial_state = 0; + mutable uint64_t m_state = initial_state; + enum class access_mode : uint8_t { local, shared, process, hybrid }; + + fence_handle(const device &device); + fence_handle(const device &device, int ehdl); + fence_handle(const fence_handle &); + ~fence_handle(); + + std::unique_ptr clone() const; + std::unique_ptr share_handle() const; + void wait(uint32_t timeout_ms) const; + uint64_t get_next_state() const; + void signal() const; + void submit_wait(const hw_ctx *) const; + static void submit_wait(const pdev &dev, const hw_ctx *, + const std::vector &fences); + void submit_signal(const hw_ctx *) const; + uint64_t wait_next_state() const; + uint64_t signal_next_state() const; +}; + +} // namespace shim_xdna + +#endif // _FENCE_XDNA_H_ diff --git a/runtime/src/iree-amd-aie/driver/xrt-lite/shim/linux/kmq/hwctx.cpp b/runtime/src/iree-amd-aie/driver/xrt-lite/shim/linux/kmq/hwctx.cpp new file mode 100644 index 000000000..20a94efd7 --- /dev/null +++ b/runtime/src/iree-amd-aie/driver/xrt-lite/shim/linux/kmq/hwctx.cpp @@ -0,0 +1,164 @@ +// SPDX-License-Identifier: Apache-2.0 +// Copyright (C) 2022-2024, Advanced Micro Devices, 
Inc. All rights reserved. + +#include "hwctx.h" + +#include +#include + +#include "bo.h" +#include "hwq.h" +#include "shim_debug.h" + +namespace shim_xdna { + +hw_ctx::hw_ctx(device &dev, const std::map &qos, + std::unique_ptr q, const std::vector &pdi, + const std::string &cu_name, uint32_t n_rows, uint32_t n_cols) + : m_device(dev), + m_q(std::move(q)), + m_num_rows(n_rows), + m_num_cols(n_cols), + m_doorbell(0), + m_log_buf(nullptr) { + SHIM_DEBUG("Creating HW context..."); + + for (auto &[key, value] : qos) { + if (key == "gops") + m_qos.gops = value; + else if (key == "fps") + m_qos.fps = value; + else if (key == "dma_bandwidth") + m_qos.dma_bandwidth = value; + else if (key == "latency") + m_qos.latency = value; + else if (key == "frame_execution_time") + m_qos.frame_exec_time = value; + else if (key == "priority") + m_qos.priority = value; + } + + // TODO(max): multiple pdis? + m_cu_info.push_back( + {.m_name = cu_name, .m_func = /*functional*/ 0, .m_pdi = pdi}); + + if (m_cu_info.empty()) + shim_err(EINVAL, "No valid DPU kernel found in xclbin"); + // TODO(max): configure this + m_ops_per_cycle = 2048; +} + +hw_ctx::hw_ctx(device &device, const std::vector &pdi, + const std::string &cu_name, uint32_t n_rows, uint32_t n_cols, + const std::map &qos) + : hw_ctx(device, qos, std::make_unique(device), pdi, cu_name, n_rows, + n_cols) { + create_ctx_on_device(); + std::vector cu_conf_param_buf(sizeof(amdxdna_hwctx_param_config_cu) + + m_cu_info.size() * + sizeof(amdxdna_cu_config)); + auto cu_conf_param = reinterpret_cast( + cu_conf_param_buf.data()); + + cu_conf_param->num_cus = m_cu_info.size(); + shim_xcl_bo_flags f = {}; + f.flags = XRT_BO_FLAGS_CACHEABLE; + for (int i = 0; i < m_cu_info.size(); i++) { + cu_info &ci = m_cu_info[i]; + + m_pdi_bos.push_back(alloc_bo(ci.m_pdi.size(), f)); + std::unique_ptr &pdi_bo = m_pdi_bos[i]; + char *pdi_vaddr = reinterpret_cast(pdi_bo->map()); + + // see cu_configs[1] in amdxdna_hwctx_param_config_cu + assert(i < 1 && "only 
1 CU supported"); + amdxdna_cu_config &cf = cu_conf_param->cu_configs[i]; + std::memcpy(pdi_vaddr, ci.m_pdi.data(), ci.m_pdi.size()); + pdi_bo->sync(direction::host2device, pdi_bo->get_properties().size, 0); + cf.cu_bo = pdi_bo->get_drm_bo_handle(); + cf.cu_func = ci.m_func; + } + + amdxdna_drm_config_hwctx arg = {}; + arg.handle = m_handle; + arg.param_type = DRM_AMDXDNA_HWCTX_CONFIG_CU; + arg.param_val = reinterpret_cast(cu_conf_param); + arg.param_val_size = cu_conf_param_buf.size(); + m_device.get_pdev().ioctl(DRM_IOCTL_AMDXDNA_CONFIG_HWCTX, &arg); + + SHIM_DEBUG("Created KMQ HW context (%d)", m_handle); +} + +hw_ctx::~hw_ctx() { + delete_ctx_on_device(); + SHIM_DEBUG("Destroyed HW context (%d)...", m_handle); + SHIM_DEBUG("Destroying KMQ HW context (%d)...", m_handle); +} + +cuidx_t hw_ctx::open_cu_context(const std::string &cu_name) { + for (uint32_t i = 0; i < m_cu_info.size(); i++) { + auto &ci = m_cu_info[i]; + SHIM_DEBUG("ci.m_name %s", ci.m_name.c_str()); + if (ci.m_name == cu_name) return cuidx_t{.index = i}; + } + + shim_err(ENOENT, "CU name (%s) not found", cu_name.c_str()); +} + +std::unique_ptr hw_ctx::alloc_bo(size_t size, shim_xcl_bo_flags flags) { + // const_cast: alloc_bo() is not const yet in device class + // Debug buffer is specific to one context. + if (flags.use == XRT_BO_USE_DEBUG) + return m_device.alloc_bo(m_handle, size, flags); + // Other BOs are shared across all contexts. + return m_device.alloc_bo(AMDXDNA_INVALID_CTX_HANDLE, size, flags); +} + +std::unique_ptr hw_ctx::import_bo(pid_t pid, int ehdl) { + // const_cast: import_bo() is not const yet in device class + return m_device.import_bo(pid, ehdl); +} + +hw_q *hw_ctx::get_hw_queue() const { return m_q.get(); } + +void hw_ctx::create_ctx_on_device() { + amdxdna_drm_create_hwctx arg = {}; + arg.qos_p = reinterpret_cast(&m_qos); + arg.umq_bo = m_q->m_queue_boh; + arg.max_opc = m_ops_per_cycle; + arg.num_tiles = m_num_rows * m_num_cols; + arg.log_buf_bo = + m_log_bo ? 
m_log_bo->get_drm_bo_handle() : AMDXDNA_INVALID_BO_HANDLE; + m_device.get_pdev().ioctl(DRM_IOCTL_AMDXDNA_CREATE_HWCTX, &arg); + + m_handle = arg.handle; + m_doorbell = arg.umq_doorbell; + + m_q->bind_hwctx(this); +} + +void hw_ctx::delete_ctx_on_device() const { + if (m_handle == AMDXDNA_INVALID_CTX_HANDLE) return; + + m_q->unbind_hwctx(); + amdxdna_drm_destroy_hwctx arg = {}; + arg.handle = m_handle; + m_device.get_pdev().ioctl(DRM_IOCTL_AMDXDNA_DESTROY_HWCTX, &arg); + + fini_log_buf(); +} + +void hw_ctx::init_log_buf() { + auto log_buf_size = m_num_cols * 1024; + shim_xcl_bo_flags f; + f.flags = XCL_BO_FLAGS_EXECBUF; + m_log_bo = alloc_bo(log_buf_size, f); + m_log_buf = m_log_bo->map(); + std::memset(m_log_buf, 0, log_buf_size); +} + +void hw_ctx::fini_log_buf() const { + if (m_log_bo) m_log_bo->unmap(m_log_buf); +} + +} // namespace shim_xdna diff --git a/runtime/src/iree-amd-aie/driver/xrt-lite/shim/linux/kmq/hwctx.h b/runtime/src/iree-amd-aie/driver/xrt-lite/shim/linux/kmq/hwctx.h new file mode 100644 index 000000000..7a169e270 --- /dev/null +++ b/runtime/src/iree-amd-aie/driver/xrt-lite/shim/linux/kmq/hwctx.h @@ -0,0 +1,79 @@ +// SPDX-License-Identifier: Apache-2.0 +// Copyright (C) 2022-2024, Advanced Micro Devices, Inc. All rights reserved. 
+ +#ifndef _HWCTX_XDNA_H_ +#define _HWCTX_XDNA_H_ + +#include + +#include "amdxdna_accel.h" +#include "device.h" + +namespace shim_xdna { + +struct hw_q; +struct bo; +struct device; + +struct cu_info { + std::string m_name; + size_t m_func; + std::vector m_pdi; +}; + +struct cuidx_t { + union { + std::uint32_t index; + struct { + std::uint16_t domain_index; // [15-0] + std::uint16_t domain; // [31-16] + }; + }; + + // Ensure consistent use of domain and index types + using domain_type = uint16_t; + using domain_index_type = uint16_t; +}; + +struct hw_ctx { + enum class access_mode : uint8_t { exclusive = 0, shared = 1 }; + device &m_device; + uint32_t m_handle = AMDXDNA_INVALID_CTX_HANDLE; + amdxdna_qos_info m_qos = {}; + std::vector m_cu_info; + std::unique_ptr m_q; + uint32_t m_ops_per_cycle; + uint32_t m_num_rows; + uint32_t m_num_cols; + uint32_t m_doorbell; + std::unique_ptr m_log_bo; + void *m_log_buf; + std::vector> m_pdi_bos; + + hw_ctx(device &dev, const std::map &qos, + std::unique_ptr q, const std::vector &pdi, + const std::string &cu_name, uint32_t n_rows, uint32_t n_cols); + hw_ctx(device &dev, const std::vector &pdi, + const std::string &cu_name, + uint32_t n_rows, uint32_t n_cols, + const std::map &qos = {}); + ~hw_ctx(); + // no copying + hw_ctx(const hw_ctx &) = delete; + hw_ctx &operator=(const hw_ctx &) = delete; + + std::unique_ptr alloc_bo(size_t size, shim_xcl_bo_flags flags); + std::unique_ptr import_bo(pid_t, int); + + cuidx_t open_cu_context(const std::string &cuname); + void create_ctx_on_device(); + void init_log_buf(); + void fini_log_buf() const; + void delete_ctx_on_device() const; + + hw_q *get_hw_queue() const; +}; + +} // namespace shim_xdna + +#endif // _HWCTX_XDNA_H_ diff --git a/runtime/src/iree-amd-aie/driver/xrt-lite/shim/linux/kmq/hwq.cpp b/runtime/src/iree-amd-aie/driver/xrt-lite/shim/linux/kmq/hwq.cpp new file mode 100644 index 000000000..b3bcc6b2b --- /dev/null +++ 
b/runtime/src/iree-amd-aie/driver/xrt-lite/shim/linux/kmq/hwq.cpp @@ -0,0 +1,109 @@ +// SPDX-License-Identifier: Apache-2.0 +// Copyright (C) 2023-2024, Advanced Micro Devices, Inc. All rights reserved. + +#include "hwq.h" + +#include + +#include "bo.h" +#include "ert.h" +#include "fence.h" +#include "shim_debug.h" + +namespace { + +ert_packet *get_chained_command_pkt(shim_xdna::bo *boh) { + ert_packet *cmdpkt = reinterpret_cast(boh->map()); + return cmdpkt->opcode == ERT_CMD_CHAIN ? cmdpkt : nullptr; +} + +int wait_cmd(const shim_xdna::pdev &pdev, const shim_xdna::hw_ctx *ctx, + shim_xdna::bo *cmd, uint32_t timeout_ms) { + int ret = 1; + auto id = cmd->get_cmd_id(); + + SHIM_DEBUG("Waiting for cmd (%ld)...", id); + + amdxdna_drm_wait_cmd wcmd = { + .hwctx = ctx->m_handle, + .timeout = timeout_ms, + .seq = id, + }; + + if (::ioctl(pdev.m_dev_fd, DRM_IOCTL_AMDXDNA_WAIT_CMD, &wcmd) == -1) { + if (errno == ETIME) { + ret = 0; + } else { + shim_xdna::shim_err(errno, "DRM_IOCTL_AMDXDNA_WAIT_CMD IOCTL failed"); + } + } + return ret; +} + +} // namespace + +namespace shim_xdna { + +hw_q::hw_q(const device &device) + : m_hwctx(nullptr), + m_pdev(device.get_pdev()), + m_queue_boh(AMDXDNA_INVALID_BO_HANDLE) { + SHIM_DEBUG("Created KMQ HW queue"); +} + +void hw_q::bind_hwctx(const hw_ctx *ctx) { + m_hwctx = ctx; + SHIM_DEBUG("Bond HW queue to HW context %d", m_hwctx->m_handle); +} + +void hw_q::unbind_hwctx() { + SHIM_DEBUG("Unbond HW queue from HW context %d", m_hwctx->m_handle); + m_hwctx = nullptr; +} + +int hw_q::wait_command(bo *cmd, uint32_t timeout_ms) const { + if (poll_command(cmd)) return 1; + return wait_cmd(m_pdev, m_hwctx, cmd, timeout_ms); +} + +void hw_q::submit_wait(const fence_handle *f) { f->submit_wait(m_hwctx); } + +void hw_q::submit_wait(const std::vector &fences) { + fence_handle::submit_wait(m_pdev, m_hwctx, fences); +} + +void hw_q::submit_signal(const fence_handle *f) { f->submit_signal(m_hwctx); } + +hw_q::~hw_q() { SHIM_DEBUG("Destroying KMQ HW 
queue"); } + +void hw_q::issue_command(bo *cmd_bo) { + // Assuming 1024 max args per cmd bo + const size_t max_arg_bos = 1024; + + uint32_t arg_bo_hdls[max_arg_bos]; + uint32_t cmd_bo_hdl = cmd_bo->get_drm_bo_handle(); + + amdxdna_drm_exec_cmd ecmd = { + .hwctx = m_hwctx->m_handle, + .type = AMDXDNA_CMD_SUBMIT_EXEC_BUF, + .cmd_handles = cmd_bo_hdl, + .args = reinterpret_cast(arg_bo_hdls), + .cmd_count = 1, + .arg_count = cmd_bo->get_arg_bo_handles(arg_bo_hdls, max_arg_bos), + }; + m_pdev.ioctl(DRM_IOCTL_AMDXDNA_EXEC_CMD, &ecmd); + + auto id = ecmd.seq; + cmd_bo->set_cmd_id(id); + SHIM_DEBUG("Submitted command (%ld)", id); +} + +int poll_command(bo *cmd) { + ert_packet *cmdpkt = reinterpret_cast(cmd->map()); + if (cmdpkt->state >= ERT_CMD_STATE_COMPLETED) { + return 1; + } + return 0; +} + +} // namespace shim_xdna diff --git a/runtime/src/iree-amd-aie/driver/xrt-lite/shim/linux/kmq/hwq.h b/runtime/src/iree-amd-aie/driver/xrt-lite/shim/linux/kmq/hwq.h new file mode 100644 index 000000000..5c85f46ab --- /dev/null +++ b/runtime/src/iree-amd-aie/driver/xrt-lite/shim/linux/kmq/hwq.h @@ -0,0 +1,33 @@ +// SPDX-License-Identifier: Apache-2.0 +// Copyright (C) 2023-2024, Advanced Micro Devices, Inc. All rights reserved. 
+ +#ifndef _HWQ_XDNA_H_ +#define _HWQ_XDNA_H_ + +#include "fence.h" +#include "hwctx.h" + +namespace shim_xdna { +struct bo; +struct hw_q { + const hw_ctx *m_hwctx; + const pdev &m_pdev; + uint32_t m_queue_boh; + + hw_q(const device &device); + ~hw_q(); + + int wait_command(bo *, uint32_t timeout_ms) const; + void submit_wait(const fence_handle *); + void submit_wait(const std::vector &); + void submit_signal(const fence_handle *); + void bind_hwctx(const hw_ctx *ctx); + void unbind_hwctx(); + void issue_command(bo *); +}; + +int poll_command(bo *); + +} // namespace shim_xdna + +#endif // _HWQ_XDNA_H_ diff --git a/runtime/src/iree-amd-aie/driver/xrt-lite/shim/linux/kmq/kernel.cpp b/runtime/src/iree-amd-aie/driver/xrt-lite/shim/linux/kmq/kernel.cpp new file mode 100644 index 000000000..a142e281b --- /dev/null +++ b/runtime/src/iree-amd-aie/driver/xrt-lite/shim/linux/kmq/kernel.cpp @@ -0,0 +1,126 @@ +// Copyright 2024 The IREE Authors +// +// Licensed under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + +#include "kernel.h" + +#include +#include + +#include "amdxdna_accel.h" +#include "bo.h" +#include "device.h" +#include "shim_debug.h" + +#define MAX_EXEC_BO_SIZE 4096 + +namespace shim_xdna { +kernel::kernel(const pdev &p, uint32_t op) + : m_exec_buf_bo(std::make_unique(p, AMDXDNA_INVALID_CTX_HANDLE, + MAX_EXEC_BO_SIZE, + XCL_BO_FLAGS_EXECBUF)), + m_cmd_pkt(reinterpret_cast(m_exec_buf_bo->map())), + m_cmd_size(m_exec_buf_bo->size()), + m_op(op), + m_arg_cnt(0), + m_reg_idx(0) { + std::memset(m_cmd_pkt, 0, m_cmd_size); + m_cmd_pkt->state = ERT_CMD_STATE_NEW; + m_cmd_pkt->opcode = m_op; + m_cmd_pkt->type = ERT_CU; + // One word for cu mask + inc_pkt_count(sizeof(int32_t)); +} + +void kernel::set_cu_idx(bo &bo_execbuf, cuidx_t cu_idx) { + ert_start_kernel_cmd *cmd_pkt = + reinterpret_cast(bo_execbuf.map()); + cmd_pkt->cu_mask = 0x1 << cu_idx.index; +} + +void kernel::set_cu_idx(cuidx_t cu_idx) { + m_cmd_pkt->cu_mask = 0x1 << cu_idx.index; +} + +void kernel::add_ctrl_bo(bo &bo_ctrl) { + ert_start_kernel_cmd *cmd_packet = + reinterpret_cast(m_exec_buf_bo->map()); + switch (m_op) { + case ERT_START_CU: + break; + case ERT_START_NPU: { + ert_npu_data *npu_data = get_ert_npu_data(cmd_packet); + npu_data->instruction_buffer = bo_ctrl.get_paddr(); + npu_data->instruction_buffer_size = bo_ctrl.size(); + npu_data->instruction_prop_count = 0; + inc_pkt_count(sizeof(*npu_data)); + break; + } + case ERT_START_DPU: { + ert_dpu_data *dpu_data = get_ert_dpu_data(cmd_packet); + dpu_data->instruction_buffer = bo_ctrl.get_paddr(); + dpu_data->instruction_buffer_size = bo_ctrl.size(); + dpu_data->chained = 0; + inc_pkt_count(sizeof(*dpu_data)); + break; + } + default: + shim_err(-1, "Unknown exec buf op code: %d", m_op); + } +} + +void kernel::add_arg_32(uint32_t val) { + inc_pkt_count(sizeof(val)); + auto args = get_ert_regmap_begin(m_cmd_pkt); + args[m_reg_idx++] = val; + m_arg_cnt++; +} + +void 
kernel::add_arg_64(uint64_t val) { + inc_pkt_count(sizeof(val)); + auto args = get_ert_regmap_begin(m_cmd_pkt); + args[m_reg_idx++] = val; + args[m_reg_idx++] = val >> 32; + m_arg_cnt++; +} + +void kernel::add_arg_bo(bo &bo_arg, const std::string &arg_name) { + // Add to argument list for driver + m_exec_buf_bo->bind_at(m_arg_cnt, bo_arg, 0, bo_arg.size()); + // Add to argument list for control code patching + if (arg_name.empty()) + m_patching_args.emplace_back(std::to_string(m_arg_cnt), bo_arg.get_paddr()); + else + m_patching_args.emplace_back(arg_name, bo_arg.get_paddr()); + // Only increase m_arg_cnt now after it's used by code above. + add_arg_64(bo_arg.get_paddr()); +} + +void kernel::dump() { + std::cout << "Dumping exec buf:"; + int *data = static_cast(m_exec_buf_bo->map()); + std::cout << std::hex; + for (int i = 0; i < m_cmd_pkt->count + 1; i++) { + if (i % 4 == 0) std::cout << "\n"; + std::cout << std::setfill('0') << std::setw(8) << data[i] << " "; + } + std::cout << std::setfill(' ') << std::setw(0) << std::dec << std::endl; + + std::cout << "Dumping patching arguement list:\n"; + for (auto &[arg_name, arg_addr] : m_patching_args) + std::cout << "{ " << arg_name << ", 0x" << std::hex << arg_addr << std::dec + << " }\n"; +} + +void kernel::inc_pkt_count(uint32_t n) const { + m_cmd_pkt->count += n / sizeof(int32_t); + if (m_cmd_size < + sizeof(m_cmd_pkt->header) + m_cmd_pkt->count * sizeof(int32_t)) + shim_err(-1, "Size of exec buf too small: %d", m_cmd_size); +} + +bo *kernel::get_exec_buf_bo() const { return m_exec_buf_bo.get(); } + +} // namespace shim_xdna diff --git a/runtime/src/iree-amd-aie/driver/xrt-lite/shim/linux/kmq/kernel.h b/runtime/src/iree-amd-aie/driver/xrt-lite/shim/linux/kmq/kernel.h new file mode 100644 index 000000000..ddc7a9283 --- /dev/null +++ b/runtime/src/iree-amd-aie/driver/xrt-lite/shim/linux/kmq/kernel.h @@ -0,0 +1,37 @@ +// Copyright 2024 The IREE Authors +// +// Licensed under the Apache License v2.0 with LLVM Exceptions. 
+// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + +#ifndef KERNEL_H +#define KERNEL_H + +#include "bo.h" + +namespace shim_xdna { +struct kernel { + std::unique_ptr m_exec_buf_bo; + ert_start_kernel_cmd *m_cmd_pkt; + size_t m_cmd_size; + uint32_t m_op; + uint32_t m_arg_cnt; + uint32_t m_reg_idx; + std::vector > m_patching_args; + + kernel(const pdev &p, uint32_t op); + + static void set_cu_idx(bo &bo_execbuf, cuidx_t cu_idx); + void set_cu_idx(cuidx_t cu_idx); + bo *get_exec_buf_bo() const; + + void add_ctrl_bo(bo &bo_ctrl); + void add_arg_32(uint32_t val); + void add_arg_64(uint64_t val); + void add_arg_bo(bo &bo_arg, const std::string &arg_name = ""); + void dump(); + void inc_pkt_count(uint32_t n) const; +}; +} // namespace shim_xdna + +#endif // KERNEL_H diff --git a/runtime/src/iree-amd-aie/driver/xrt-lite/shim/linux/kmq/shim_debug.cpp b/runtime/src/iree-amd-aie/driver/xrt-lite/shim/linux/kmq/shim_debug.cpp new file mode 100644 index 000000000..75b14fdfc --- /dev/null +++ b/runtime/src/iree-amd-aie/driver/xrt-lite/shim/linux/kmq/shim_debug.cpp @@ -0,0 +1,37 @@ +// Copyright 2024 The IREE Authors +// +// Licensed under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + +#include "shim_debug.h" + +#include +#include +#include + +static std::recursive_mutex s_debug_mutex; + +struct debug_lock { + std::lock_guard m_lk; + debug_lock(); +}; + +debug_lock::debug_lock() : m_lk(s_debug_mutex) {} + +unsigned long time_ns() { + static auto zero = std::chrono::high_resolution_clock::now(); + auto now = std::chrono::high_resolution_clock::now(); + auto integral_duration = + std::chrono::duration_cast(now - zero).count(); + return static_cast(integral_duration); +} + +void debugf(const char *format, ...) 
{ + debug_lock lk; + va_list args; + va_start(args, format); + vprintf(format, args); + va_end(args); + fflush(stdout); +} diff --git a/runtime/src/iree-amd-aie/driver/xrt-lite/shim/linux/kmq/shim_debug.h b/runtime/src/iree-amd-aie/driver/xrt-lite/shim/linux/kmq/shim_debug.h new file mode 100644 index 000000000..f9e5e1785 --- /dev/null +++ b/runtime/src/iree-amd-aie/driver/xrt-lite/shim/linux/kmq/shim_debug.h @@ -0,0 +1,49 @@ +// SPDX-License-Identifier: Apache-2.0 +// Copyright (C) 2022-2024, Advanced Micro Devices, Inc. All rights reserved. +// +#ifndef SHIM_DEBUG_H +#define SHIM_DEBUG_H + +#include + +#include +#include +#include + +#include "llvm/Support/ErrorHandling.h" + +void debugf(const char *format, ...); + +namespace shim_xdna { + +template +[[noreturn]] void shim_err(int err, const char *fmt, Args &&...args) { + std::string format = std::string(fmt); + format += " (err=%d)"; + int sz = std::snprintf(nullptr, 0, format.c_str(), args..., err) + 1; + if (sz <= 0) llvm::report_fatal_error("could not format error string"); + + auto size = static_cast(sz); + std::unique_ptr buf(new char[size]); + std::snprintf(buf.get(), size, format.c_str(), args..., err); + std::string err_str(buf.get()); + llvm::report_fatal_error(err_str.c_str()); +} + +template +void shim_debug(const char *fmt, Args &&...args) { + std::string format{"shim_xdna: "}; + format += std::string(fmt); + format += "\n"; + debugf(format.c_str(), std::forward(args)...); +} + +} // namespace shim_xdna + +#ifdef SHIM_XDNA_DEBUG +#define SHIM_DEBUG(...) shim_xdna::shim_debug(__VA_ARGS__) +#else +#define SHIM_DEBUG(...) 
+#endif + +#endif // SHIM_DEBUG_H diff --git a/runtime/src/iree-amd-aie/driver/xrt-lite/shim/linux/kmq/xrt_mem.h b/runtime/src/iree-amd-aie/driver/xrt-lite/shim/linux/kmq/xrt_mem.h new file mode 100755 index 000000000..d7286bcd0 --- /dev/null +++ b/runtime/src/iree-amd-aie/driver/xrt-lite/shim/linux/kmq/xrt_mem.h @@ -0,0 +1,147 @@ +/* + * Copyright (C) 2019-2022, Xilinx Inc - All rights reserved. + * Xilinx Runtime (XRT) APIs + * + * Licensed under the Apache License, Version 2.0 (the "License"). You may + * not use this file except in compliance with the License. A copy of the + * License is located at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT + * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the + * License for the specific language governing permissions and limitations + * under the License. + * + * GPL license Verbiage: + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by the Free + * Software Foundation; either version 2 of the License, or (at your option) any + * later version. This program is distributed in the hope that it will be + * useful, but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General + * Public License for more details. 
You should have received a copy of the GNU + * General Public License along with this program; if not, write to the Free + * Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 + * USA + */ + +#ifndef _SHIM_MEM_H_ +#define _SHIM_MEM_H_ + +#ifdef _WIN32 +#pragma warning(push) +#pragma warning(disable : 4201) +#endif + +#if defined(__GNUC__) +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wpedantic" +#endif + +#ifdef __cplusplus +#include +extern "C" { +#else +#if defined(__KERNEL__) +#include +#else +#include +#endif +#endif + +/** + * Encoding of flags passed to xcl buffer allocation APIs + */ +struct shim_xcl_bo_flags { + union { + uint64_t all; // [63-0] + + struct { + uint32_t flags; // [31-0] + uint32_t extension; // [63-32] + }; + + struct { + uint16_t bank; // [15-0] + uint8_t slot; // [23-16] + uint8_t boflags; // [31-24] + + // extension + uint32_t access : 2; // [33-32] + uint32_t dir : 2; // [35-34] + uint32_t use : 1; // [36] + uint32_t unused : 27; // [63-35] + }; + }; +}; + +/** + * XCL BO Flags bits layout + * + * bits 0 ~ 15: DDR BANK index + * bits 24 ~ 31: BO flags + */ +#define XRT_BO_FLAGS_MEMIDX_MASK (0xFFFFFFUL) +#define XCL_BO_FLAGS_NONE (0) +#define XCL_BO_FLAGS_CACHEABLE (1U << 24) +#define XCL_BO_FLAGS_KERNBUF (1U << 25) +#define XCL_BO_FLAGS_SGL (1U << 26) +#define XCL_BO_FLAGS_SVM (1U << 27) +#define XCL_BO_FLAGS_DEV_ONLY (1U << 28) +#define XCL_BO_FLAGS_HOST_ONLY (1U << 29) +#define XCL_BO_FLAGS_P2P (1U << 30) +#define XCL_BO_FLAGS_EXECBUF (1U << 31) + +/** + * Shim level BO Flags for extension + */ +#define XRT_BO_ACCESS_LOCAL 0 +#define XRT_BO_ACCESS_SHARED 1 +#define XRT_BO_ACCESS_PROCESS 2 +#define XRT_BO_ACCESS_HYBRID 3 + +/** + * Shim level BO Flags for direction of data transfer + * as seen from device. 
+ */ +#define XRT_BO_ACCESS_READ (1U << 0) +#define XRT_BO_ACCESS_WRITE (1U << 1) +#define XRT_BO_ACCESS_READ_WRITE (XRT_BO_ACCESS_READ | XRT_BO_ACCESS_WRITE) + +/** + * Shim level BO Flags to distinguish use of BO + * + * The use flag is for internal use only. A debug BO + * is supported only on some platforms to communicate + * data from driver / firmware back to user space. + */ +#define XRT_BO_USE_NORMAL 0 +#define XRT_BO_USE_DEBUG 1 + +/** + * XRT Native BO flags + * + * These flags are simple aliases for use with XRT native BO APIs. + */ +#define XRT_BO_FLAGS_NONE XCL_BO_FLAGS_NONE +#define XRT_BO_FLAGS_CACHEABLE XCL_BO_FLAGS_CACHEABLE +#define XRT_BO_FLAGS_DEV_ONLY XCL_BO_FLAGS_DEV_ONLY +#define XRT_BO_FLAGS_HOST_ONLY XCL_BO_FLAGS_HOST_ONLY +#define XRT_BO_FLAGS_P2P XCL_BO_FLAGS_P2P +#define XRT_BO_FLAGS_SVM XCL_BO_FLAGS_SVM + +#ifdef __cplusplus +} +#endif + +#if defined(__GNUC__) +#pragma GCC diagnostic pop +#endif + +#ifdef _WIN32 +#pragma warning(pop) +#endif + +#endif diff --git a/runtime/src/iree-amd-aie/driver/xrt-lite/util.h b/runtime/src/iree-amd-aie/driver/xrt-lite/util.h new file mode 100644 index 000000000..c50e4a235 --- /dev/null +++ b/runtime/src/iree-amd-aie/driver/xrt-lite/util.h @@ -0,0 +1,33 @@ +// Copyright 2024 The IREE Authors +// +// Licensed under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + +#ifndef IREE_AMD_AIE_DRIVER_XRT_LITE_UTIL_H +#define IREE_AMD_AIE_DRIVER_XRT_LITE_UTIL_H + +#include "iree/base/status.h" + +template +iree_status_t unimplemented(Params...) { + return iree_make_status(IREE_STATUS_UNIMPLEMENTED, "unimplemented"); +} + +template +iree_status_t unimplemented_ok_status(Params...) 
{ + return iree_ok_status(); +} + +template +void unimplemented_ok_void(Params...){} +#ifndef NDEBUG +#define IREE_HAL_XRT_LITE_CHECKED_VTABLE_CAST(base_value, vtable, subvalue_t) \ + (IREE_HAL_ASSERT_TYPE(base_value, &vtable), \ + reinterpret_cast(base_value)) +#else +#define IREE_HAL_XRT_LITE_CHECKED_VTABLE_CAST(base_value, vtable, subvalue_t) \ + (reinterpret_cast(base_value)) +#endif + +#endif // IREE_AMD_AIE_DRIVER_XRT_LITE_UTIL_H diff --git a/runtime/src/iree-amd-aie/driver/xrt/CMakeLists.txt b/runtime/src/iree-amd-aie/driver/xrt/CMakeLists.txt index 81f90689b..9d9cabd44 100644 --- a/runtime/src/iree-amd-aie/driver/xrt/CMakeLists.txt +++ b/runtime/src/iree-amd-aie/driver/xrt/CMakeLists.txt @@ -38,6 +38,7 @@ iree_cc_library( "native_executable.h" "native_executable.cc" "nop_semaphore.cc" + "nop_semaphore.h" "nop_executable_cache.h" "nop_executable_cache.cc" DEPS @@ -48,6 +49,7 @@ iree_cc_library( iree::base::internal::flatcc::parsing iree::hal::utils::deferred_command_buffer iree::hal::utils::file_transfer + iree::hal::utils::semaphore_base iree::hal iree-amd-aie::schemas::xrt_executable_def_c_fbs # hide the target from all exports so it doesn't need to be installed diff --git a/runtime/src/iree-amd-aie/driver/xrt/cts/CMakeLists.txt b/runtime/src/iree-amd-aie/driver/xrt/cts/CMakeLists.txt new file mode 100644 index 000000000..e068c08e3 --- /dev/null +++ b/runtime/src/iree-amd-aie/driver/xrt/cts/CMakeLists.txt @@ -0,0 +1,114 @@ +# Copyright 2024 The IREE Authors +# +# Licensed under the Apache License v2.0 with LLVM Exceptions. +# See https://llvm.org/LICENSE.txt for license information. 
+# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + +include(CMakeDependentOption) + +iree_hal_cts_test_suite( + DRIVER_NAME + xrt + DRIVER_REGISTRATION_HDR + "iree-amd-aie/driver/xrt/registration/driver_module.h" + DRIVER_REGISTRATION_FN + "iree_hal_xrt_driver_module_register" + COMPILER_TARGET_BACKEND + "amd-aie" + EXECUTABLE_FORMAT + "\"amdaie-xclbin-fb\"" + DEPS + iree-amd-aie::driver::xrt::registration + INCLUDED_TESTS + "allocator" + "buffer_mapping" + "driver" +) + +set(PEANO_INSTALL_DIR "" CACHE PATH "") +set(VITIS_DIR "" CACHE PATH "") +if((NOT PEANO_INSTALL_DIR) AND (NOT VITIS_DIR)) + message(FATAL_ERROR "either PEANO_INSTALL_DIR or VITIS_DIR must be set") +endif() +cmake_dependent_option(USE_CHESS "" "1" "VITIS_DIR" "0") +set(TARGET_DEVICE "npu1_4col" CACHE STRING "") + +iree_bytecode_module( + NAME + xrt_executable_cache_test_module + MODULE_FILE_NAME + xrt_executable_cache_test.bin + SRC + "${CMAKE_CURRENT_LIST_DIR}/executable_cache_test.mlir" + FLAGS + --compile-mode=hal-executable + --iree-hal-dump-executable-files-to=${CMAKE_CURRENT_BINARY_DIR} + # on windows iree-aie-xclbinutil for some reason isn't found by iree's findTool + # so set this instead to the bin dir + --iree-amd-aie-install-dir=${CMAKE_BINARY_DIR} + --iree-hal-target-backends=amd-aie + --iree-amdaie-lower-to-aie-pipeline=air + --iree-amdaie-target-device=${TARGET_DEVICE} + --iree-amd-aie-peano-install-dir=${PEANO_INSTALL_DIR} + --iree-amd-aie-vitis-install-dir=${VITIS_DIR} + --iree-amd-aie-enable-chess=$ + --iree-amd-aie-show-invoked-commands + --iree-hal-memoization=false + --iree-hal-indirect-command-buffers=false + DEPS + iree-aie-xclbinutil + PUBLIC + TESTONLY +) + +iree_c_embed_data( + NAME + xrt_executables_c + SRCS + xrt_executable_cache_test.bin + C_FILE_OUTPUT + xrt_executables_c.c + H_FILE_OUTPUT + xrt_executables_c.h + IDENTIFIER + iree_cts_testdata_executables_aie_xrt + STRIP_PREFIX + xrt_ + DEPENDS + ::xrt_executable_cache_test_module + FLATTEN + PUBLIC + TESTONLY 
+) + +iree_cc_test( + NAME + xrt_executable_cache_test + SRCS + executable_cache_test.cc + DEPS + ::xrt_executables_c + iree-amd-aie::driver::xrt::registration + iree::base + iree::hal + iree::hal::cts::cts_test_base + iree::testing::gtest_main +) + +iree_cc_test( + NAME + xrt_dispatch_test + SRCS + matmul_dispatch_test.cc + DEPS + ::xrt_executables_c + iree-amd-aie::driver::xrt::registration + iree::base + iree::hal + iree::hal::cts::cts_test_base + iree::testing::gtest_main + iree::tools::testing::e2e::e2e_test_util +) + +target_include_directories(iree-amd-aie_driver_xrt_cts_xrt_executable_cache_test PRIVATE "${CMAKE_CURRENT_BINARY_DIR}") +target_include_directories(iree-amd-aie_driver_xrt_cts_xrt_dispatch_test PRIVATE "${CMAKE_CURRENT_BINARY_DIR}") diff --git a/runtime/src/iree-amd-aie/driver/xrt/cts/executable_cache_test.cc b/runtime/src/iree-amd-aie/driver/xrt/cts/executable_cache_test.cc new file mode 100644 index 000000000..3e9411cf2 --- /dev/null +++ b/runtime/src/iree-amd-aie/driver/xrt/cts/executable_cache_test.cc @@ -0,0 +1,85 @@ +// Copyright 2024 The IREE Authors +// +// Licensed under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + +#include "iree-amd-aie/driver/xrt/registration/driver_module.h" +#include "iree/base/api.h" +#include "iree/base/string_view.h" +#include "iree/hal/api.h" +#include "iree/hal/cts/cts_test_base.h" +#include "iree/testing/gtest.h" +#include "iree/testing/status_matchers.h" +#include "xrt_executables_c.h" + +namespace iree::hal::cts { + +const char* get_test_driver_name() { return "xrt"; } + +iree_status_t register_test_driver(iree_hal_driver_registry_t* registry) { + return iree_hal_xrt_driver_module_register(registry); +} + +const char* get_test_executable_format() { return "amdaie-xclbin-fb"; } + +iree_const_byte_span_t get_test_executable_data(iree_string_view_t file_name) { + const struct iree_file_toc_t* toc = + iree_cts_testdata_executables_aie_xrt_create(); + const auto& file = toc[0]; + return iree_make_const_byte_span(file.data, file.size); +} + +class ExecutableCacheTest : public CTSTestBase<> {}; + +TEST_F(ExecutableCacheTest, Create) { + iree_status_t loop_status = iree_ok_status(); + iree_hal_executable_cache_t* executable_cache = nullptr; + IREE_ASSERT_OK(iree_hal_executable_cache_create( + device_, iree_make_cstring_view("default"), + iree_loop_inline(&loop_status), &executable_cache)); + + iree_hal_executable_cache_release(executable_cache); + IREE_ASSERT_OK(loop_status); +} + +TEST_F(ExecutableCacheTest, CantPrepareUnknownFormat) { + iree_status_t loop_status = iree_ok_status(); + iree_hal_executable_cache_t* executable_cache = nullptr; + IREE_ASSERT_OK(iree_hal_executable_cache_create( + device_, iree_make_cstring_view("default"), + iree_loop_inline(&loop_status), &executable_cache)); + + EXPECT_FALSE(iree_hal_executable_cache_can_prepare_format( + executable_cache, /*caching_mode=*/0, iree_make_cstring_view("FOO?"))); + + iree_hal_executable_cache_release(executable_cache); + IREE_ASSERT_OK(loop_status); +} + +TEST_F(ExecutableCacheTest, PrepareExecutable) { + iree_status_t loop_status 
= iree_ok_status(); + iree_hal_executable_cache_t* executable_cache = nullptr; + IREE_ASSERT_OK(iree_hal_executable_cache_create( + device_, iree_make_cstring_view("default"), + iree_loop_inline(&loop_status), &executable_cache)); + + iree_hal_executable_params_t executable_params; + iree_hal_executable_params_initialize(&executable_params); + executable_params.caching_mode = + IREE_HAL_EXECUTABLE_CACHING_MODE_ALIAS_PROVIDED_DATA; + executable_params.executable_format = + iree_make_cstring_view(get_test_executable_format()); + executable_params.executable_data = get_test_executable_data( + iree_make_cstring_view("executable_cache_test.bin")); + + iree_hal_executable_t* executable = nullptr; + IREE_ASSERT_OK(iree_hal_executable_cache_prepare_executable( + executable_cache, &executable_params, &executable)); + + iree_hal_executable_release(executable); + iree_hal_executable_cache_release(executable_cache); + IREE_ASSERT_OK(loop_status); +} + +} // namespace iree::hal::cts diff --git a/runtime/src/iree-amd-aie/driver/xrt/cts/executable_cache_test.mlir b/runtime/src/iree-amd-aie/driver/xrt/cts/executable_cache_test.mlir new file mode 100644 index 000000000..dedbcab6b --- /dev/null +++ b/runtime/src/iree-amd-aie/driver/xrt/cts/executable_cache_test.mlir @@ -0,0 +1,33 @@ +// bootstrapped from https://github.com/nod-ai/iree-amd-aie/blob/9c4c167baf89a279888fba8db75907845946077c/tests/samples/matmul_pack_peel_objectfifo_e2e.mlir + +#pipeline_layout = #hal.pipeline.layout< + bindings = [ + #hal.pipeline.binding, + #hal.pipeline.binding, + #hal.pipeline.binding + ], + flags = Indirect +> +hal.executable.source public @amdaie_fb { + hal.executable.export public @matmul_f32_dispatch_0_matmul_32x32x32_f32 ordinal(0) layout(#pipeline_layout) { + ^bb0(%arg0: !hal.device): + %x, %y, %z = flow.dispatch.workgroup_count_from_slice + hal.return %x, %y, %z : index, index, index + } + builtin.module { + func.func @matmul_f32_dispatch_0_matmul_32x32x32_f32() { + %c0_f32 = arith.constant 
0.0 : f32 + %c0 = arith.constant 0 : index + %0 = hal.interface.binding.subspan layout(#pipeline_layout) binding(0) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : !flow.dispatch.tensor> + %1 = hal.interface.binding.subspan layout(#pipeline_layout) binding(1) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : !flow.dispatch.tensor> + %2 = hal.interface.binding.subspan layout(#pipeline_layout) binding(2) alignment(64) offset(%c0) flags(Indirect) : !flow.dispatch.tensor> + %3 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [32, 32], strides = [1, 1] : !flow.dispatch.tensor> -> tensor<32x32xf32> + %4 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [32, 32], strides = [1, 1] : !flow.dispatch.tensor> -> tensor<32x32xf32> + %5 = tensor.empty() : tensor<32x32xf32> + %6 = linalg.fill ins(%c0_f32 : f32) outs(%5 : tensor<32x32xf32>) -> tensor<32x32xf32> + %7 = linalg.matmul ins(%3, %4 : tensor<32x32xf32>, tensor<32x32xf32>) outs(%6 : tensor<32x32xf32>) -> tensor<32x32xf32> + flow.dispatch.tensor.store %7, %2, offsets = [0, 0], sizes = [32, 32], strides = [1, 1] : tensor<32x32xf32> -> !flow.dispatch.tensor> + return + } + } +} diff --git a/runtime/src/iree-amd-aie/driver/xrt/cts/matmul_dispatch_test.cc b/runtime/src/iree-amd-aie/driver/xrt/cts/matmul_dispatch_test.cc new file mode 100644 index 000000000..c48ea13f7 --- /dev/null +++ b/runtime/src/iree-amd-aie/driver/xrt/cts/matmul_dispatch_test.cc @@ -0,0 +1,224 @@ +// Copyright 2024 The IREE Authors +// +// Licensed under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+#include "iree-amd-aie/driver/xrt/registration/driver_module.h"
+#include "iree/base/api.h"
+#include "iree/base/string_view.h"
+#include "iree/hal/api.h"
+#include "iree/hal/buffer_view_util.h"
+#include "iree/hal/cts/cts_test_base.h"
+#include "iree/testing/gtest.h"
+#include "iree/testing/status_matchers.h"
+#include "tools/testing/e2e/test_utils.h"
+#include "xrt_executables_c.h"
+
+namespace iree::hal::cts {
+
+const char* get_test_driver_name() { return "xrt"; }
+
+iree_status_t register_test_driver(iree_hal_driver_registry_t* registry) {
+  return iree_hal_xrt_driver_module_register(registry);
+}
+
+const char* get_test_executable_format() { return "amdaie-xclbin-fb"; }
+
+iree_const_byte_span_t get_test_executable_data(iree_string_view_t file_name) {
+  const struct iree_file_toc_t* toc =
+      iree_cts_testdata_executables_aie_xrt_create();
+  const auto& file = toc[0];  // file_name is ignored; entry 0 is always used.
+  return iree_make_const_byte_span(file.data, file.size);
+}
+
+class MatMulDispatchTest
+    : public CTSTestBase<::testing::TestWithParam<RecordingType>> {
+ protected:
+  void PrepareMatmulExecutable() {
+    IREE_ASSERT_OK(iree_hal_executable_cache_create(
+        device_, iree_make_cstring_view("default"),
+        iree_loop_inline(&loop_status_), &executable_cache_));
+
+    iree_hal_executable_params_t executable_params;
+    iree_hal_executable_params_initialize(&executable_params);
+    executable_params.caching_mode =
+        IREE_HAL_EXECUTABLE_CACHING_MODE_ALIAS_PROVIDED_DATA;
+    executable_params.executable_format =
+        iree_make_cstring_view(get_test_executable_format());
+    executable_params.executable_data = get_test_executable_data(
+        iree_make_cstring_view("xrt_executable_cache_test.bin"));
+
+    IREE_ASSERT_OK(iree_hal_executable_cache_prepare_executable(
+        executable_cache_, &executable_params, &executable_));
+  }
+
+  void CleanupExecutable() {
+    iree_hal_executable_release(executable_);
+    iree_hal_executable_cache_release(executable_cache_);
+    IREE_ASSERT_OK(loop_status_);
+  }
+
+  iree_status_t loop_status_ = iree_ok_status();
+  iree_hal_executable_cache_t* executable_cache_ = nullptr;
+  iree_hal_executable_t* executable_ = nullptr;
+};
+
+int32_t generate_random_number(iree_hal_element_type_t element_type,
+                               int32_t seed) {
+  int32_t min = 0;
+  int32_t max = 0;
+  iree_test_utils_get_min_max_for_element_type(element_type, &min, &max);
+  uint32_t range = (max - min + 1);
+  return (int32_t)iree_test_utils_pseudorandom_range(
+             reinterpret_cast<uint32_t*>(&seed), range) +
+         min;
+}
+
+TEST_F(MatMulDispatchTest, Create) {
+  iree_hal_command_buffer_t* command_buffer = nullptr;
+  IREE_ASSERT_OK(iree_hal_command_buffer_create(
+      device_, IREE_HAL_COMMAND_BUFFER_MODE_ONE_SHOT,
+      IREE_HAL_COMMAND_CATEGORY_DISPATCH, IREE_HAL_QUEUE_AFFINITY_ANY,
+      /*binding_capacity=*/0, &command_buffer));
+
+  EXPECT_TRUE((iree_hal_command_buffer_allowed_categories(command_buffer) &
+               IREE_HAL_COMMAND_CATEGORY_DISPATCH) ==
+              IREE_HAL_COMMAND_CATEGORY_DISPATCH);
+
+  iree_hal_command_buffer_release(command_buffer);
+}
+
+TEST_F(MatMulDispatchTest, BeginEnd) {
+  iree_hal_command_buffer_t* command_buffer = nullptr;
+  IREE_ASSERT_OK(iree_hal_command_buffer_create(
+      device_, IREE_HAL_COMMAND_BUFFER_MODE_ONE_SHOT,
+      IREE_HAL_COMMAND_CATEGORY_DISPATCH, IREE_HAL_QUEUE_AFFINITY_ANY,
+      /*binding_capacity=*/0, &command_buffer));
+
+  IREE_ASSERT_OK(iree_hal_command_buffer_begin(command_buffer));
+  IREE_ASSERT_OK(iree_hal_command_buffer_end(command_buffer));
+
+  iree_hal_command_buffer_release(command_buffer);
+}
+
+TEST_F(MatMulDispatchTest, SubmitEmpty) {
+  iree_hal_command_buffer_t* command_buffer = nullptr;
+  IREE_ASSERT_OK(iree_hal_command_buffer_create(
+      device_, IREE_HAL_COMMAND_BUFFER_MODE_ONE_SHOT,
+      IREE_HAL_COMMAND_CATEGORY_DISPATCH, IREE_HAL_QUEUE_AFFINITY_ANY,
+      /*binding_capacity=*/0, &command_buffer));
+
+  IREE_ASSERT_OK(iree_hal_command_buffer_begin(command_buffer));
+  IREE_ASSERT_OK(iree_hal_command_buffer_end(command_buffer));
+
+  IREE_ASSERT_OK(SubmitCommandBufferAndWait(command_buffer));
+
+  iree_hal_command_buffer_release(command_buffer);
+}
+
+TEST_P(MatMulDispatchTest, DispatchMatmul) {
+  PrepareMatmulExecutable();
+
+  // Create the A and B inputs and the C output, each filled with one value.
+  constexpr iree_device_size_t WIDTH = 32;
+  constexpr iree_device_size_t M = WIDTH, K = WIDTH, N = WIDTH;
+  iree_hal_buffer_t *input_A = nullptr, *input_B = nullptr, *output_C = nullptr;
+  int32_t seed =
+      std::chrono::high_resolution_clock::now().time_since_epoch().count() >>
+      32;
+  int32_t a = generate_random_number(
+      iree_hal_element_types_t::IREE_HAL_ELEMENT_TYPE_FLOAT_32, seed);
+  int32_t b = generate_random_number(
+      iree_hal_element_types_t::IREE_HAL_ELEMENT_TYPE_FLOAT_32, seed + 1);
+  CreateFilledDeviceBuffer<float>(M * K * sizeof(float), a, &input_A);
+  CreateFilledDeviceBuffer<float>(K * N * sizeof(float), b, &input_B);
+  CreateFilledDeviceBuffer<float>(M * N * sizeof(float), -1, &output_C);
+
+  iree_hal_buffer_ref_t binding_refs[3];
+  iree_hal_buffer_binding_table_t binding_table =
+      iree_hal_buffer_binding_table_empty();
+  binding_refs[0] = {
+      /*binding=*/0,
+      /*buffer_slot=*/0,
+      /*buffer=*/input_A,
+      /*offset=*/0,
+      /*length=*/M * K * sizeof(float),
+  };
+  binding_refs[1] = {
+      /*binding=*/0,
+      /*buffer_slot=*/0,
+      /*buffer=*/input_B,
+      /*offset=*/0,
+      /*length=*/K * N * sizeof(float),
+  };
+  binding_refs[2] = {
+      /*binding=*/0,
+      /*buffer_slot=*/0,
+      /*buffer=*/output_C,
+      /*offset=*/0,
+      /*length=*/M * N * sizeof(float),
+  };
+  iree_hal_buffer_ref_list_t bindings = {
+      /*.count=*/IREE_ARRAYSIZE(binding_refs),
+      /*.values=*/binding_refs,
+  };
+
+  iree_hal_command_buffer_t* command_buffer = nullptr;
+  IREE_ASSERT_OK(iree_hal_command_buffer_create(
+      device_, IREE_HAL_COMMAND_BUFFER_MODE_ONE_SHOT,
+      IREE_HAL_COMMAND_CATEGORY_DISPATCH, IREE_HAL_QUEUE_AFFINITY_ANY,
+      binding_table.count, &command_buffer));
+  IREE_ASSERT_OK(iree_hal_command_buffer_begin(command_buffer));
+
+  uint32_t workgroup_count[3] = {1, 1, 1};
+  IREE_ASSERT_OK(iree_hal_command_buffer_dispatch(
+      command_buffer, executable_, /*entry_point=*/0, workgroup_count,
+      iree_const_byte_span_empty(), bindings, IREE_HAL_DISPATCH_FLAG_NONE));
+
+  IREE_ASSERT_OK(iree_hal_command_buffer_execution_barrier(
+      command_buffer,
+      /*source_stage_mask=*/IREE_HAL_EXECUTION_STAGE_DISPATCH |
+          IREE_HAL_EXECUTION_STAGE_TRANSFER |
+          IREE_HAL_EXECUTION_STAGE_COMMAND_RETIRE,
+      /*target_stage_mask=*/IREE_HAL_EXECUTION_STAGE_COMMAND_ISSUE |
+          IREE_HAL_EXECUTION_STAGE_DISPATCH | IREE_HAL_EXECUTION_STAGE_TRANSFER,
+      IREE_HAL_EXECUTION_BARRIER_FLAG_NONE, /*memory_barrier_count=*/0,
+      /*memory_barriers=*/nullptr,
+      /*buffer_barrier_count=*/0, /*buffer_barriers=*/nullptr));
+
+  IREE_ASSERT_OK(iree_hal_command_buffer_end(command_buffer));
+
+  IREE_ASSERT_OK(SubmitCommandBufferAndWait(command_buffer, binding_table));
+
+  std::vector<float> output_values;
+  output_values.resize(M * N);  // d2h writes M*N floats through data().
+  IREE_ASSERT_OK(iree_hal_device_transfer_d2h(
+      device_, output_C,
+      /*source_offset=*/0, output_values.data(), M * N * sizeof(float),
+      IREE_HAL_TRANSFER_BUFFER_FLAG_DEFAULT, iree_infinite_timeout()));
+  std::vector<float> correct_output_values;
+  correct_output_values.resize(M * N);  // fill_n writes M*N elements.
+  std::fill_n(correct_output_values.data(), M * N, (float)WIDTH * (a * b));
+  int n_wrong = 0;
+  for (iree_device_size_t i = 0; i < M * N; ++i) {
+    if (output_values[i] != correct_output_values[i]) {
+      std::cout << "wrong @ i:" << i << ", " << output_values[i]
+                << " != " << correct_output_values[i] << "\n";
+      n_wrong += 1;
+    }
+  }
+  EXPECT_EQ(n_wrong, 0);
+
+  iree_hal_command_buffer_release(command_buffer);
+  iree_hal_buffer_release(output_C);
+  iree_hal_buffer_release(input_B);
+  iree_hal_buffer_release(input_A);
+  CleanupExecutable();
+}
+
+INSTANTIATE_TEST_SUITE_P(MatMulDispatchTest, MatMulDispatchTest,
+                         ::testing::Values(RecordingType::kDirect),
+                         GenerateTestName());
+
+}  // namespace iree::hal::cts
diff --git a/runtime/src/iree-amd-aie/schemas/CMakeLists.txt
b/runtime/src/iree-amd-aie/schemas/CMakeLists.txt index 48c2885fc..15c818aff 100644 --- a/runtime/src/iree-amd-aie/schemas/CMakeLists.txt +++ b/runtime/src/iree-amd-aie/schemas/CMakeLists.txt @@ -12,3 +12,16 @@ flatbuffer_c_library( "--json" PUBLIC ) + +flatbuffer_c_library( + NAME + pdi_executable_def_c_fbs + SRCS + "pdi_executable_def.fbs" + FLATCC_ARGS + "--reader" + "--builder" + "--verifier" + "--json" + PUBLIC +) diff --git a/runtime/src/iree-amd-aie/schemas/pdi_executable_def.fbs b/runtime/src/iree-amd-aie/schemas/pdi_executable_def.fbs new file mode 100644 index 000000000..8d4e49c13 --- /dev/null +++ b/runtime/src/iree-amd-aie/schemas/pdi_executable_def.fbs @@ -0,0 +1,57 @@ +// Copyright 2024 The IREE Authors +// +// Licensed under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + +namespace iree.amd.aie.hal.xrt_lite; + +file_identifier "PDIR"; +file_extension "pdir"; + +// Source code location denoted by a file name and line within that file. +table FileLineLocDef { + filename:string; + line:int32; +} + +// Assembly instructions. +table AsmInstDef { + asm_inst:[uint32]; +} + +// PDIs. +table PdiDef { + pdi:string; +} + +table ExecutableDef { + // A map of entry point ordinals to string names as used in PDI(s) + entry_points:[string]; + + // A map of entry point ordinals to the indices of the containing PDIs (the following field). + // This list has the same size as the entry_points list. + // This list currently is just a range [0, number of entry points) but will change when we start doing + // kernel merging in the backend. + pdi_indices:[uint32]; + + + // PDI strings of the entry points. + pdis: [PdiDef]; + + // A map of entry point ordinals to the indices of the containing asm_instrs (the following field). + // This list has the same size as the entry_points list.
+ // This list currently is just a range [0, number of entry points) but can change if kernels decide to + // share the instruction streams. + asm_instr_indices:[uint32]; + + // Assembly instructions stream for LX6 processor to run for each kernel + // The number of kernels and by extension the number of asm instruction streams + // are equal to the number of entry points. We access each kernel + // by giving the entry point name to the pdi and getting a kernel object from it. + asm_instrs:[AsmInstDef]; + + source_locations:[FileLineLocDef]; +} + +root_type ExecutableDef; diff --git a/tests/conftest.py b/tests/conftest.py index 3bc6d4daa..69d06af45 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -3,6 +3,8 @@ import numpy as np import pytest + +from iree._runtime_libs._runtime import parse_flags from ml_dtypes import bfloat16 from iree.compiler import ir @@ -47,14 +49,35 @@ def pytest_addoption(parser): parser.addoption("--output-dir", type=abs_path) parser.addoption("--vitis-dir", type=abs_path) parser.addoption("--iree-aie-debug", action="store_true") + parser.addoption( + "--device-hal", + default="xrt-lite", + const="xrt-lite", + nargs="?", + choices=["xrt", "xrt-lite"], + ) + parser.addoption("--xrt_lite_n_core_rows", type=int) + parser.addoption("--xrt_lite_n_core_cols", type=int) @pytest.fixture(scope="session") -def global_cl_args(request): - _initializeGlobalCL( +def global_cl_args(request, pytestconfig): + compiler_flags = [ "--iree-hal-memoization=false", "--iree-hal-indirect-command-buffers=false", - ) + ] + _initializeGlobalCL(*compiler_flags) + + runtime_flags = [] + if pytestconfig.option.xrt_lite_n_core_rows is not None: + runtime_flags += [ + f"--xrt_lite_n_core_rows={pytestconfig.option.xrt_lite_n_core_rows}" + ] + if pytestconfig.option.xrt_lite_n_core_cols is not None: + runtime_flags += [ + f"--xrt_lite_n_core_cols={pytestconfig.option.xrt_lite_n_core_cols}" + ] + parse_flags(*runtime_flags) @pytest.fixture @@ -80,6 +103,7 @@ def
iree_session(request, pytestconfig, global_cl_args) -> Session: f"--iree-amd-aie-install-dir={pytestconfig.option.iree_install_dir}", f"--iree-amd-aie-enable-chess={use_chess}", f"--iree-amdaie-enable-packet-flow={enable_packet_flow}", + f"--iree-amdaie-device-hal={pytestconfig.option.device_hal}", ] if pytestconfig.option.vitis_dir: flags += [f"--iree-amd-aie-vitis-install-dir={pytestconfig.option.vitis_dir}"] @@ -98,7 +122,7 @@ def iree_session(request, pytestconfig, global_cl_args) -> Session: @pytest.fixture -def session_module(iree_session, tmp_path) -> ir.Module: +def session_module(iree_session) -> ir.Module: with ir.Location.unknown(iree_session.context): module_op = ir.Module.create() with ir.InsertionPoint(module_op.body): @@ -106,8 +130,8 @@ def session_module(iree_session, tmp_path) -> ir.Module: @pytest.fixture(scope="session") -def device(device="xrt") -> ir.Module: - yield get_driver(device).create_default_device() +def device(pytestconfig, global_cl_args) -> ir.Module: + yield get_driver(pytestconfig.option.device_hal).create_default_device() @contextmanager