diff --git a/aie_kernels/aie2/scale.cc b/aie_kernels/aie2/scale.cc index c212ed2514..f3d1133dc1 100755 --- a/aie_kernels/aie2/scale.cc +++ b/aie_kernels/aie2/scale.cc @@ -19,6 +19,7 @@ #include +// Scalar scale template template void scale_scalar(T *a, T *c, T factor, const int32_t N) { event0(); @@ -28,20 +29,43 @@ void scale_scalar(T *a, T *c, T factor, const int32_t N) { event1(); } +// Vectorized scale template // Assume N is multiple of 16 template -void scale_vectorized(T *a, T *c, T factor, const int32_t N) { - constexpr int vec_factor = 16; +void scale_vectorized(T *a, T *c, int32_t factor, const int32_t N) { event0(); + constexpr int vec_factor = 32; T *__restrict pA1 = a; T *__restrict pC1 = c; const int F = N / vec_factor; + T fac = factor; for (int i = 0; i < F; i++) chess_prepare_for_pipelining chess_loop_range(16, ) { aie::vector A0 = aie::load_v(pA1); pA1 += vec_factor; + aie::accum cout = aie::mul(A0, fac); + aie::store_v(pC1, cout.template to_vector(0)); + pC1 += vec_factor; + } + event1(); +} + +// Vectorized scale tempalte for int32_t (acc64 used) +// Assume N is multiple of 16 +template <> +void scale_vectorized(int32_t *a, int32_t *c, int32_t factor, + const int32_t N) { + event0(); + constexpr int vec_factor = 32; + int32_t *__restrict pA1 = a; + int32_t *__restrict pC1 = c; + const int F = N / vec_factor; + for (int i = 0; i < F; i++) + chess_prepare_for_pipelining chess_loop_range(16, ) { + aie::vector A0 = aie::load_v(pA1); + pA1 += vec_factor; aie::accum cout = aie::mul(A0, factor); - aie::store_v(pC1, cout.to_vector(0)); + aie::store_v(pC1, cout.template to_vector(0)); pC1 += vec_factor; } event1(); @@ -49,14 +73,26 @@ void scale_vectorized(T *a, T *c, T factor, const int32_t N) { extern "C" { -void vector_scalar_mul_aie(int32_t *a_in, int32_t *c_out, int32_t *factor, - int32_t N) { +// 16-bit datatype +void vector_scalar_mul_int32_scalar(int32_t *a_in, int32_t *c_out, + int32_t *factor, int32_t N) { + scale_scalar(a_in, c_out, *factor, N); +} + +void vector_scalar_mul_int32_vector(int32_t *a_in, int32_t *c_out, + int32_t *factor, int32_t N) { scale_vectorized(a_in, c_out, *factor, N); } -void vector_scalar_mul_aie_scalar(int32_t *a_in, int32_t *c_out, - int32_t *factor, int32_t N) { - scale_scalar(a_in, c_out, *factor, N); +// 32-bit datatype +void vector_scalar_mul_int16_scalar(int16_t *a_in, int16_t *c_out, + int32_t *factor, int32_t N) { + scale_scalar(a_in, c_out, *factor, N); +} + +void vector_scalar_mul_int16_vector(int16_t *a_in, int16_t *c_out, + int32_t *factor, int32_t N) { + scale_vectorized(a_in, c_out, *factor, N); } } // extern "C" diff --git a/docs/conferenceDescriptions/asplos24TutorialDescription.md b/docs/conferenceDescriptions/asplos24TutorialDescription.md index f7f26d5b8a..b7e1d85a79 100644 --- a/docs/conferenceDescriptions/asplos24TutorialDescription.md +++ b/docs/conferenceDescriptions/asplos24TutorialDescription.md @@ -16,7 +16,7 @@ This tutorial will cover the following key topics: Date: Saturday April 27th 2024 (morning) Location: Hilton La Jolla Torrey Pines, San Diego, California (with ASPLOS’24) -Prerequisite: please bring your laptop, so that you can ssh into our Ryzen AI enabled miniPCs for the hands-on excersizes. +Prerequisite: please bring your laptop, so that you can ssh into our Ryzen AI enabled miniPCs for the hands-on exercises. 
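A minimal NumPy reference model of the arithmetic the new int16 kernels in `scale.cc` above perform (a sketch for illustration, not the AIE API code itself; the widen-multiply-truncate behavior is assumed from the `aie::mul` accumulator and the shift of 0 passed to `to_vector`):

```python
import numpy as np

def scale_ref(a: np.ndarray, factor: int) -> np.ndarray:
    # Widen to 32 bits (like the acc32 accumulator), multiply by the scalar
    # factor, then narrow back to int16 with a shift of 0 (plain truncation).
    acc = a.astype(np.int32) * np.int32(factor)
    return acc.astype(np.int16)

a = np.arange(1, 33, dtype=np.int16)
assert np.array_equal(scale_ref(a, 3), a * 3)
```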
### Contents and Timeline (tentative) diff --git a/programming_examples/basic/dma_transpose/CMakeLists.txt b/programming_examples/basic/dma_transpose/CMakeLists.txt new file mode 100644 index 0000000000..3986c4a075 --- /dev/null +++ b/programming_examples/basic/dma_transpose/CMakeLists.txt @@ -0,0 +1,75 @@ +# This file is licensed under the Apache License v2.0 with LLVM Exceptions. +# See https://llvm.org/LICENSE.txt for license information. +# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +# +# (c) Copyright 2023 Advanced Micro Devices, Inc. + +# parameters +# -DBOOST_ROOT: Path to Boost install +# -DXRT_INC_DIR: Full path to src/runtime_src/core/include in XRT cloned repo +# -DXRT_LIB_DIR: Path to xrt_coreutil.lib +# -DTARGET_NAME: Target name to be built + +# cmake needs this line +cmake_minimum_required(VERSION 3.1) + +set(CMAKE_CXX_STANDARD 23) +set(CMAKE_CXX_STANDARD_REQUIRED YES) + +find_program(WSL NAMES powershell.exe) + +if (NOT WSL) + set(CMAKE_C_COMPILER gcc-13) + set(CMAKE_CXX_COMPILER g++-13) + set(BOOST_ROOT /usr/include/boost CACHE STRING "Path to Boost install") + set(XRT_INC_DIR /opt/xilinx/xrt/include CACHE STRING "Path to XRT cloned repo") + set(XRT_LIB_DIR /opt/xilinx/xrt/lib CACHE STRING "Path to xrt_coreutil.lib") +else() + set(BOOST_ROOT C:/Technical/thirdParty/boost_1_83_0 CACHE STRING "Path to Boost install") + set(XRT_INC_DIR C:/Technical/XRT/src/runtime_src/core/include CACHE STRING "Path to XRT cloned repo") + set(XRT_LIB_DIR C:/Technical/xrtIPUfromDLL CACHE STRING "Path to xrt_coreutil.lib") +endif() + +set(TARGET_NAME test CACHE STRING "Target to be built") + +SET (ProjectName proj_${TARGET_NAME}) +SET (currentTarget ${TARGET_NAME}) + +if ( WSL ) + set(CMAKE_RUNTIME_OUTPUT_DIRECTORY_RELEASE ${CMAKE_BINARY_DIR}) +endif () + +project(${ProjectName}) + +# Find packages +find_package(Boost REQUIRED) + +add_executable(${currentTarget} + ${CMAKE_CURRENT_SOURCE_DIR}/../../../runtime_lib/test_lib/test_utils.cpp + test.cpp +) + +target_compile_definitions(${currentTarget} PUBLIC DISABLE_ABI_CHECK=1) + +target_include_directories (${currentTarget} PUBLIC + ${XRT_INC_DIR} + ${Boost_INCLUDE_DIRS} + ${CMAKE_CURRENT_SOURCE_DIR}/../../../runtime_lib/test_lib +) + +target_link_directories(${currentTarget} PUBLIC + ${XRT_LIB_DIR} + ${Boost_LIBRARY_DIRS} +) + +if (NOT WSL) + target_link_libraries(${currentTarget} PUBLIC + xrt_coreutil + boost_program_options + boost_filesystem + ) +else() + target_link_libraries(${currentTarget} PUBLIC + xrt_coreutil + ) +endif() diff --git a/programming_examples/basic/dma_transpose/Makefile b/programming_examples/basic/dma_transpose/Makefile new file mode 100644 index 0000000000..58f3e707c0 --- /dev/null +++ b/programming_examples/basic/dma_transpose/Makefile @@ -0,0 +1,51 @@ +##===- Makefile -----------------------------------------------------------===## +# +# This file licensed under the Apache License v2.0 with LLVM Exceptions. +# See https://llvm.org/LICENSE.txt for license information. 
+# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +# +##===----------------------------------------------------------------------===## + +include ../../makefile-common + +SHELL := /bin/bash + +all: build/final.xclbin build/insts.txt + +targetname = dmaTranspose +M ?= 64 +K ?= 32 + +build/aie.mlir: aie2.py + mkdir -p ${@D} + python3 $< ${M} ${K} > $@ + +.PHONY: inst/insts.txt +inst/insts.txt: aie2.py + rm -rf inst + mkdir -p inst + python3 $< ${LENGTH} > inst/aie.mlir + pushd inst && aiecc.py --aie-only-generate-ipu --ipu-insts-name=insts.txt aie.mlir && popd + ${powershell} ./build/${targetname}.exe -x build/final.xclbin -i inst/insts.txt -k MLIR_AIE -l ${LENGTH} + +build/final.xclbin: build/aie.mlir + mkdir -p ${@D} + cd ${@D} && aiecc.py --aie-generate-cdo --no-compile-host --xclbin-name=${@F} \ + --aie-generate-ipu --ipu-insts-name=insts.txt $(<:%=../%) + +${targetname}.exe: test.cpp + rm -rf _build + mkdir -p _build + cd _build && ${powershell} cmake .. -DTARGET_NAME=${targetname} + cd _build && ${powershell} cmake --build . --config Release +ifeq "${powershell}" "powershell.exe" + cp _build/${targetname}.exe $@ +else + cp _build/${targetname} $@ +endif + +run: ${targetname}.exe build/final.xclbin build/insts.txt + ${powershell} ./$< -x build/final.xclbin -i build/insts.txt -k MLIR_AIE --M ${M} --K ${K} + +clean: + rm -rf build _build inst ${targetname}.exe diff --git a/programming_examples/basic/dma_transpose/README.md b/programming_examples/basic/dma_transpose/README.md new file mode 100644 index 0000000000..32dd7ac3d3 --- /dev/null +++ b/programming_examples/basic/dma_transpose/README.md @@ -0,0 +1,25 @@ + + +# 2-D Array Transpose using AIE DMAs + +This reference design can be run on a Ryzen™ AI NPU. + +In the [design](./aie2.py) a 2-D array in row-major layout is read from external memory to `ComputeTile2` with a transposed layout, +by using an implicit copy via the compute tile's Data Movement Accelerator (DMA). The data is read from and written to external memory through Shim tile (`col`, 0). + +The implicit copy is performed using the `object_fifo_link` operation that specifies how input data arriving via `of_in` should be sent further via `of_out` by specifically leveraging the compute tile's DMA. This operation and its functionality are described in more depth in [Section-2b](../../../programming_guide/section-2/section-2b/README.md/#object-fifo-link) of the programming guide. + + +To compile and run the design for NPU: +``` +make +make run +``` \ No newline at end of file diff --git a/programming_examples/basic/dma_transpose/aie2.py b/programming_examples/basic/dma_transpose/aie2.py new file mode 100755 index 0000000000..ee16ce6f62 --- /dev/null +++ b/programming_examples/basic/dma_transpose/aie2.py @@ -0,0 +1,66 @@ +# +# This file is licensed under the Apache License v2.0 with LLVM Exceptions. +# See https://llvm.org/LICENSE.txt for license information. +# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +# +# (c) Copyright 2023 AMD Inc. 
+ +import sys + +from aie.dialects.aie import * +from aie.dialects.aiex import * +from aie.dialects.scf import * +from aie.extras.dialects.ext import memref, arith +from aie.extras.context import mlir_mod_ctx + +N = 4096 +M = 64 +K = 64 + +if len(sys.argv) == 3: + M = int(sys.argv[1]) + K = int(sys.argv[2]) + N = M * K + + +def my_passthrough(): + with mlir_mod_ctx() as ctx: + + @device(AIEDevice.ipu) + def device_body(): + memRef_ty = T.memref(M, K, T.i32()) + + # Tile declarations + ShimTile = tile(0, 0) + ComputeTile2 = tile(0, 2) + + # AIE-array data movement with object fifos + of_in = object_fifo("in", ShimTile, ComputeTile2, 2, memRef_ty) + of_out = object_fifo("out", ComputeTile2, ShimTile, 2, memRef_ty) + object_fifo_link(of_in, of_out) + + # Set up compute tiles + + # Compute tile 2 + @core(ComputeTile2) + def core_body(): + for _ in for_(sys.maxsize): + yield_([]) + + # To/from AIE-array data movement + tensor_ty = T.memref(N, T.i32()) + + @FuncOp.from_py_func(tensor_ty, tensor_ty, tensor_ty) + def sequence(A, B, C): + ipu_dma_memcpy_nd(metadata="out", bd_id=0, mem=C, sizes=[1, 1, 1, N]) + # The strides below are configured to read across all rows in the same column + # Stride of K in dim/wrap 2 skips an entire row to read a full column + ipu_dma_memcpy_nd( + metadata="in", bd_id=1, mem=A, sizes=[1, K, M, 1], strides=[1, 1, K] + ) + ipu_sync(column=0, row=0, direction=0, channel=0) + + print(ctx.module) + + +my_passthrough() diff --git a/programming_examples/basic/dma_transpose/run.lit b/programming_examples/basic/dma_transpose/run.lit new file mode 100644 index 0000000000..e86aece967 --- /dev/null +++ b/programming_examples/basic/dma_transpose/run.lit @@ -0,0 +1,10 @@ +// (c) Copyright 2023 Advanced Micro Devices, Inc. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +// REQUIRES: ryzen_ai, chess +// +// RUN: %python %S/aie2.py 64 32 > ./aie.mlir +// RUN: %python aiecc.py --aie-generate-cdo --aie-generate-ipu --no-compile-host --xclbin-name=aie.xclbin --ipu-insts-name=insts.txt ./aie.mlir +// RUN: clang %S/test.cpp -o test.exe -std=c++11 -Wall %xrt_flags -lrt -lstdc++ -lboost_program_options -lboost_filesystem +// RUN: %run_on_ipu ./test.exe -x aie.xclbin -k MLIR_AIE -i insts.txt --M 64 --K 32 | FileCheck %s +// CHECK: PASS! diff --git a/programming_examples/basic/dma_transpose/test.cpp b/programming_examples/basic/dma_transpose/test.cpp new file mode 100644 index 0000000000..fa9a918669 --- /dev/null +++ b/programming_examples/basic/dma_transpose/test.cpp @@ -0,0 +1,214 @@ +//===- test.cpp -------------------------------------------000---*- C++ -*-===// +// +// This file is licensed under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +// Copyright (C) 2023, Advanced Micro Devices, Inc. 
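To see that the `sizes`/`strides` used in the `sequence` above implement a transpose, here is a small NumPy sketch (an assumption for illustration: dimensions are listed outermost to innermost, as the in-line comments in `aie2.py` suggest) that replays the access pattern on a row-major `M`x`K` array:

```python
import numpy as np

M, K = 64, 32
A = np.arange(M * K, dtype=np.int32)  # row-major M x K input: element (i, j) sits at i*K + j

# sizes=[1, K, M, 1] with strides=[1, 1, K]: for each of the K columns,
# read M elements spaced K apart, i.e. walk down one full column at a time.
pattern = np.array([A[j + i * K] for j in range(K) for i in range(M)])

assert np.array_equal(pattern, A.reshape(M, K).T.flatten())
```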
+// +//===----------------------------------------------------------------------===// + +#include +#include +#include +#include +#include +#include +#include +#include + +#include "xrt/xrt_bo.h" +#include "xrt/xrt_device.h" +#include "xrt/xrt_kernel.h" + +namespace po = boost::program_options; + +void check_arg_file_exists(po::variables_map &vm_in, std::string name) { + if (!vm_in.count(name)) { + throw std::runtime_error("Error: no " + name + " file was provided\n"); + } else { + std::ifstream test(vm_in[name].as()); + if (!test) { + throw std::runtime_error("The " + name + " file " + + vm_in[name].as() + + " does not exist.\n"); + } + } +} + +std::vector load_instr_sequence(std::string instr_path) { + std::ifstream instr_file(instr_path); + std::string line; + std::vector instr_v; + while (std::getline(instr_file, line)) { + std::istringstream iss(line); + uint32_t a; + if (!(iss >> std::hex >> a)) { + throw std::runtime_error("Unable to parse instruction file\n"); + } + instr_v.push_back(a); + } + return instr_v; +} + +int main(int argc, const char *argv[]) { + // Program arguments parsing + po::options_description desc("Allowed options"); + + desc.add_options()("help,h", "produce help message")( + "xclbin,x", po::value()->required(), + "the input xclbin path")( + "kernel,k", po::value()->required(), + "the kernel name in the XCLBIN (for instance PP_PRE_FD)")( + "verbosity,v", po::value()->default_value(0), + "the verbosity of the output")( + "instr,i", po::value()->required(), + "path of file containing userspace instructions to be sent to the LX6")( + "M", po::value()->default_value(64), + "M, number of rows in the input matrix")( + "K", po::value()->default_value(64), + "K, number of columns in the input matrix"); + + po::variables_map vm; + + try { + po::store(po::parse_command_line(argc, argv, desc), vm); + po::notify(vm); + + if (vm.count("help")) { + std::cout << desc << std::endl; + return 1; + } + } catch (const std::exception &ex) { + std::cerr << ex.what() << "\n\n"; + std::cerr << "Usage:\n" << desc << std::endl; + return 1; + } + + check_arg_file_exists(vm, "xclbin"); + check_arg_file_exists(vm, "instr"); + + std::vector instr_v = + load_instr_sequence(vm["instr"].as()); + + int verbosity = vm["verbosity"].as(); + if (verbosity >= 1) + std::cout << "Sequence instr count: " << instr_v.size() << std::endl; + + uint32_t M = vm["M"].as(); + uint32_t K = vm["K"].as(); + uint32_t N = M * K; + + if ((N % 1024)) { + std::cerr + << "Length (M * K) must be a multiple of 1024. 
Change M and K inputs" + << std::endl; + return 1; + } + + // Start the XRT test code + // Get a device handle + unsigned int device_index = 0; + auto device = xrt::device(device_index); + + // Load the xclbin + if (verbosity >= 1) + std::cout << "Loading xclbin: " << vm["xclbin"].as() + << std::endl; + auto xclbin = xrt::xclbin(vm["xclbin"].as()); + + if (verbosity >= 1) + std::cout << "Kernel opcode: " << vm["kernel"].as() + << std::endl; + std::string Node = vm["kernel"].as(); + + // Get the kernel from the xclbin + auto xkernels = xclbin.get_kernels(); + auto xkernel = *std::find_if(xkernels.begin(), xkernels.end(), + [Node](xrt::xclbin::kernel &k) { + auto name = k.get_name(); + std::cout << "Name: " << name << std::endl; + return name.rfind(Node, 0) == 0; + }); + auto kernelName = xkernel.get_name(); + + if (verbosity >= 1) + std::cout << "Registering xclbin: " << vm["xclbin"].as() + << "\n"; + + device.register_xclbin(xclbin); + + // get a hardware context + if (verbosity >= 1) + std::cout << "Getting hardware context." << std::endl; + xrt::hw_context context(device, xclbin.get_uuid()); + + // get a kernel handle + if (verbosity >= 1) + std::cout << "Getting handle to kernel:" << kernelName << std::endl; + auto kernel = xrt::kernel(context, kernelName); + + auto bo_instr = xrt::bo(device, instr_v.size() * sizeof(int), + XCL_BO_FLAGS_CACHEABLE, kernel.group_id(0)); + auto bo_inA = xrt::bo(device, N * sizeof(int32_t), XRT_BO_FLAGS_HOST_ONLY, + kernel.group_id(2)); + auto bo_inB = xrt::bo(device, N * sizeof(int32_t), XRT_BO_FLAGS_HOST_ONLY, + kernel.group_id(3)); + auto bo_out = xrt::bo(device, N * sizeof(int32_t), XRT_BO_FLAGS_HOST_ONLY, + kernel.group_id(4)); + + if (verbosity >= 1) + std::cout << "Writing data into buffer objects." << std::endl; + + int32_t *bufInA = bo_inA.map(); + std::vector srcVecA; + for (int i = 0; i < N; i++) + srcVecA.push_back(i + 1); + memcpy(bufInA, srcVecA.data(), (srcVecA.size() * sizeof(uint32_t))); + + void *bufInstr = bo_instr.map(); + memcpy(bufInstr, instr_v.data(), instr_v.size() * sizeof(int)); + + bo_instr.sync(XCL_BO_SYNC_BO_TO_DEVICE); + bo_inA.sync(XCL_BO_SYNC_BO_TO_DEVICE); + + if (verbosity >= 1) + std::cout << "Running Kernel." << std::endl; + auto run = kernel(bo_instr, instr_v.size(), bo_inA, bo_inB, bo_out); + run.wait(); + + bo_out.sync(XCL_BO_SYNC_BO_FROM_DEVICE); + + uint32_t *bufOut = bo_out.map(); + int errors = 0; + + std::vector refVecA(N); + + // Doing a transpose on the source vector to produce a ref vector + for (uint32_t i = 0; i < M; i++) { + for (uint32_t j = 0; j < K; j++) { + uint32_t src_index = i * K + j; + uint32_t dst_index = j * M + i; + refVecA[dst_index] = srcVecA[src_index]; + } + } + + for (uint32_t i = 0; i < N; i++) { + uint32_t ref = refVecA[i]; + if (*(bufOut + i) != ref) { + std::cout << "ref = " << ref << " result = " << *(bufOut + i) << "\n"; + errors++; + } + } + + if (!errors) { + std::cout << std::endl << "PASS!" << std::endl << std::endl; + return 0; + } else { + std::cout << std::endl + << errors << " mismatches." << std::endl + << std::endl; + std::cout << std::endl << "fail." 
<< std::endl << std::endl; + return 1; + } +} diff --git a/programming_examples/basic/matrix_multiplication/makefile-common b/programming_examples/basic/matrix_multiplication/makefile-common index fd6a438ea0..5e41886d79 100644 --- a/programming_examples/basic/matrix_multiplication/makefile-common +++ b/programming_examples/basic/matrix_multiplication/makefile-common @@ -38,6 +38,8 @@ M?=512 K?=512 N?=512 +trace_size=16384 + mlir_target?=build/aie_${M}x${K}x${N}.mlir xclbin_target?=build/final_${M}x${K}x${N}.xclbin insts_target?=build/insts_${M}x${K}x${N}.txt @@ -83,14 +85,19 @@ run: ${targetname}.exe ${xclbin_target} ${insts_target} #sign export XRT_HACK_UNSECURE_LOADING_XCLBIN=1 && \ ${powershell} ./$< -x ${xclbin_target} -i ${insts_target} -k MLIR_AIE -M $M -K $K -N $N ${runargs} -.PHONY: clean -clean: - rm -rf build _build ${targetname}.exe +trace: ${targetname}.exe ${xclbin_target} ${insts_target} # sign + export XRT_HACK_UNSECURE_LOADING_XCLBIN=1 && \ + ${powershell} ./$< -x ${xclbin_target} -i ${insts_target} -k MLIR_AIE -M $M -K $K -N $N -v 1 --warmup 0 --iters 1 -t ${trace_size} + ../../../utils/parse_trace.py --filename trace.txt --mlir ${mlir_target} --colshift 1 > parse_trace_mm.json .PHONY: parse_trace parse_trace: - ../../../utils/parse_eventIR.py --filename trace.txt --mlir ./build/aie.mlir --colshift 1 > trace_eventIR.json + ../../../utils/parse_trace.py --filename trace.txt --mlir ${mlir_target} --colshift 1 > parse_trace_mm.json + +.PHONY: clean +clean: clean_trace + rm -rf build _build ${targetname}.exe .PHONY: clean_trace clean_trace: - rm -rf tmpTrace trace_eventIR.json + rm -rf tmpTrace parse*.json diff --git a/programming_examples/basic/vector_exp/README.md b/programming_examples/basic/vector_exp/README.md index 8ab0602545..7b6fe0eb23 100644 --- a/programming_examples/basic/vector_exp/README.md +++ b/programming_examples/basic/vector_exp/README.md @@ -30,12 +30,18 @@ The design also uses a single file from the AIE runtime, in order to initialize ### C++ Testbench -To compile the design and C++ testbench: +To compile the design: ``` make ``` +To compile the C++ testbench: + +``` +make testExp.exe +``` + To run the design: ``` diff --git a/programming_examples/basic/vector_reduce_add/Makefile b/programming_examples/basic/vector_reduce_add/Makefile index ad4724dc45..ea201d5753 100644 --- a/programming_examples/basic/vector_reduce_add/Makefile +++ b/programming_examples/basic/vector_reduce_add/Makefile @@ -19,7 +19,7 @@ all: build/final.xclbin build/insts.txt VPATH := ../../../aie_kernels/aie2 -build/%.o: %.cc +build/%.cc.o: %.cc mkdir -p ${@D} cd ${@D} && xchesscc_wrapper ${CHESSCCWRAP2_FLAGS} -c $(<:%=../%) -o ${@F} @@ -27,7 +27,7 @@ build/aie.mlir: aie2.py mkdir -p ${@D} python3 $< ${devicename} ${col} > $@ -build/final.xclbin: build/aie.mlir build/reduce_add.o +build/final.xclbin: build/aie.mlir build/reduce_add.cc.o mkdir -p ${@D} cd ${@D} && aiecc.py --aie-generate-cdo --no-compile-host --xclbin-name=${@F} \ --aie-generate-ipu --ipu-insts-name=insts.txt $(<:%=../%) diff --git a/programming_examples/basic/vector_reduce_add/README.md b/programming_examples/basic/vector_reduce_add/README.md index 4e53dce2cf..7548165a1a 100644 --- a/programming_examples/basic/vector_reduce_add/README.md +++ b/programming_examples/basic/vector_reduce_add/README.md @@ -32,7 +32,7 @@ To compile the design and C++ testbench: ``` make -make build/reduce_add.exe +make reduce_add.exe ``` To run the design: diff --git a/programming_examples/basic/vector_reduce_max/Makefile 
b/programming_examples/basic/vector_reduce_max/Makefile index 3ca11ea293..3ef597e472 100755 --- a/programming_examples/basic/vector_reduce_max/Makefile +++ b/programming_examples/basic/vector_reduce_max/Makefile @@ -19,7 +19,7 @@ all: build/final.xclbin build/insts.txt VPATH := ../../../aie_kernels/aie2 -build/%.o: %.cc +build/%.cc.o: %.cc mkdir -p ${@D} cd ${@D} && xchesscc_wrapper ${CHESSCCWRAP2_FLAGS} -c $(<:%=../%) -o ${@F} @@ -27,7 +27,7 @@ build/aie.mlir: aie2.py mkdir -p ${@D} python3 $< ${devicename} ${col} > $@ -build/final.xclbin: build/aie.mlir build/reduce_max.o +build/final.xclbin: build/aie.mlir build/reduce_max.cc.o mkdir -p ${@D} cd ${@D} && aiecc.py --aie-generate-cdo --no-compile-host --xclbin-name=${@F} \ --aie-generate-ipu --ipu-insts-name=insts.txt $(<:%=../%) diff --git a/programming_examples/basic/vector_reduce_max/README.md b/programming_examples/basic/vector_reduce_max/README.md index 6bfa382fb4..ac2756f2dc 100644 --- a/programming_examples/basic/vector_reduce_max/README.md +++ b/programming_examples/basic/vector_reduce_max/README.md @@ -32,7 +32,7 @@ To compile the design and C++ testbench: ``` make -make build/reduce_max.exe +make reduce_max.exe ``` To run the design: diff --git a/programming_examples/basic/vector_reduce_min/Makefile b/programming_examples/basic/vector_reduce_min/Makefile index 0ade6ed0fd..b0b724e4a3 100755 --- a/programming_examples/basic/vector_reduce_min/Makefile +++ b/programming_examples/basic/vector_reduce_min/Makefile @@ -19,7 +19,7 @@ all: build/final.xclbin build/insts.txt VPATH := ../../../aie_kernels/aie2 -build/%.o: %.cc +build/%.cc.o: %.cc mkdir -p ${@D} cd ${@D} && xchesscc_wrapper ${CHESSCCWRAP2_FLAGS} -c $(<:%=../%) -o ${@F} @@ -27,7 +27,7 @@ build/aie.mlir: aie2.py mkdir -p ${@D} python3 $< ${devicename} ${col} > $@ -build/final.xclbin: build/aie.mlir build/reduce_min.o +build/final.xclbin: build/aie.mlir build/reduce_min.cc.o mkdir -p ${@D} cd ${@D} && aiecc.py --aie-generate-cdo --no-compile-host --xclbin-name=${@F} \ --aie-generate-ipu --ipu-insts-name=insts.txt $(<:%=../%) diff --git a/programming_examples/basic/vector_reduce_min/README.md b/programming_examples/basic/vector_reduce_min/README.md index 1d5e9677d1..feb6328142 100644 --- a/programming_examples/basic/vector_reduce_min/README.md +++ b/programming_examples/basic/vector_reduce_min/README.md @@ -32,7 +32,7 @@ To compile the design and C++ testbench: ``` make -make build/reduce_min.exe +make reduce_min.exe ``` To run the design: diff --git a/programming_examples/basic/vector_scalar_add/README.md b/programming_examples/basic/vector_scalar_add/README.md index 3cf2a7cfcd..b1cb33333f 100644 --- a/programming_examples/basic/vector_scalar_add/README.md +++ b/programming_examples/basic/vector_scalar_add/README.md @@ -30,7 +30,7 @@ To compile the design and C++ testbench: ``` make -make build/vectorScalarAdd.exe +make vectorScalarAdd.exe ``` To run the design: diff --git a/programming_examples/basic/vector_scalar_add/aie2.py b/programming_examples/basic/vector_scalar_add/aie2.py index 7c99acd401..88cda90226 100644 --- a/programming_examples/basic/vector_scalar_add/aie2.py +++ b/programming_examples/basic/vector_scalar_add/aie2.py @@ -13,63 +13,67 @@ def my_vector_bias_add(): - with mlir_mod_ctx() as ctx: - @device(AIEDevice.ipu) - def device_body(): - memRef_16_ty = T.memref(16, T.i32()) - memRef_8_ty = T.memref(8, T.i32()) + @device(AIEDevice.ipu) + def device_body(): + memRef_16_ty = T.memref(16, T.i32()) + memRef_8_ty = T.memref(8, T.i32()) - # Tile declarations - ShimTile 
= tile(0, 0) - MemTile = tile(0, 1) - ComputeTile2 = tile(0, 2) + # Tile declarations + ShimTile = tile(0, 0) + MemTile = tile(0, 1) + ComputeTile2 = tile(0, 2) - # AIE-array data movement with object fifos - # Input - of_in0 = object_fifo("in0", ShimTile, MemTile, 2, memRef_16_ty) - of_in1 = object_fifo("in1", MemTile, ComputeTile2, 2, memRef_8_ty) - object_fifo_link(of_in0, of_in1) + # AIE-array data movement with object fifos + # Input + of_in0 = object_fifo("in0", ShimTile, MemTile, 2, memRef_16_ty) + of_in1 = object_fifo("in1", MemTile, ComputeTile2, 2, memRef_8_ty) + object_fifo_link(of_in0, of_in1) - # Output - of_out0 = object_fifo("out0", MemTile, ShimTile, 2, memRef_16_ty) - of_out1 = object_fifo("out1", ComputeTile2, MemTile, 2, memRef_8_ty) - object_fifo_link(of_out1, of_out0) + # Output + of_out0 = object_fifo("out0", MemTile, ShimTile, 2, memRef_16_ty) + of_out1 = object_fifo("out1", ComputeTile2, MemTile, 2, memRef_8_ty) + object_fifo_link(of_out1, of_out0) - # Set up compute tiles + # Set up compute tiles - # Compute tile 2 - @core(ComputeTile2) - def core_body(): - # Effective while(1) - for _ in for_(8): - elem_in = of_in1.acquire(ObjectFifoPort.Consume, 1) - elem_out = of_out1.acquire(ObjectFifoPort.Produce, 1) - for i in for_(8): - v0 = memref.load(elem_in, [i]) - v1 = arith.addi(v0, arith.constant(1, T.i32())) - memref.store(v1, elem_out, [i]) - yield_([]) - of_in1.release(ObjectFifoPort.Consume, 1) - of_out1.release(ObjectFifoPort.Produce, 1) + # Compute tile 2 + @core(ComputeTile2) + def core_body(): + # Effective while(1) + for _ in for_(8): + elem_in = of_in1.acquire(ObjectFifoPort.Consume, 1) + elem_out = of_out1.acquire(ObjectFifoPort.Produce, 1) + for i in for_(8): + v0 = memref.load(elem_in, [i]) + v1 = arith.addi(v0, arith.constant(1, T.i32())) + memref.store(v1, elem_out, [i]) yield_([]) + of_in1.release(ObjectFifoPort.Consume, 1) + of_out1.release(ObjectFifoPort.Produce, 1) + yield_([]) - # To/from AIE-array data movement + # To/from AIE-array data movement - memRef_64_ty = T.memref(64, T.i32()) - memRef_32_ty = T.memref(32, T.i32()) + memRef_64_ty = T.memref(64, T.i32()) + memRef_32_ty = T.memref(32, T.i32()) - @FuncOp.from_py_func(memRef_64_ty, memRef_32_ty, memRef_64_ty) - def sequence(inTensor, notUsed, outTensor): - ipu_dma_memcpy_nd( - metadata="out0", bd_id=0, mem=outTensor, sizes=[1, 1, 1, 64] - ) - ipu_dma_memcpy_nd( - metadata="in0", bd_id=1, mem=inTensor, sizes=[1, 1, 1, 64] - ) - ipu_sync(column=0, row=0, direction=0, channel=0) + @FuncOp.from_py_func(memRef_64_ty, memRef_32_ty, memRef_64_ty) + def sequence(inTensor, notUsed, outTensor): + ipu_dma_memcpy_nd( + metadata="out0", bd_id=0, mem=outTensor, sizes=[1, 1, 1, 64] + ) + ipu_dma_memcpy_nd( + metadata="in0", bd_id=1, mem=inTensor, sizes=[1, 1, 1, 64] + ) + ipu_sync(column=0, row=0, direction=0, channel=0) - print(ctx.module) - -my_vector_bias_add() +# Declares that subsequent code is in mlir-aie context +with mlir_mod_ctx() as ctx: + my_vector_bias_add() + res = ctx.module.operation.verify() + if res == True: + print(ctx.module) + else: + print(res) diff --git a/programming_examples/basic/vector_scalar_mul/CMakeLists.txt b/programming_examples/basic/vector_scalar_mul/CMakeLists.txt index 4d1000b813..48ff56aaaf 100644 --- a/programming_examples/basic/vector_scalar_mul/CMakeLists.txt +++ b/programming_examples/basic/vector_scalar_mul/CMakeLists.txt @@ -33,8 +33,6 @@ endif() set(VECTORSCALARMUL_SIZE 4096 CACHE STRING "vector size") set(TARGET_NAME test CACHE STRING "Target to be built") 
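The restructuring of `aie2.py` above separates the design function from the `mlir_mod_ctx` block so the generated module can be verified before it is printed. A minimal sketch of that pattern (assuming the mlir-aie Python bindings are installed; `my_design` is a placeholder name):

```python
from aie.extras.context import mlir_mod_ctx

def my_design():
    # ... @device / tile / object_fifo declarations would go here ...
    pass

with mlir_mod_ctx() as ctx:
    my_design()
    res = ctx.module.operation.verify()  # catches structural errors, e.g. invalid tile coordinates
    if res == True:
        print(ctx.module)
    else:
        print(res)
```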
-message(STATUS "NOLF NOLF VECTORSCALARMUL_SIZE: ${VECTORSCALARMUL_SIZE}") - SET (ProjectName ${TARGET_NAME}) SET (currentTarget ${TARGET_NAME}) diff --git a/programming_examples/basic/vector_scalar_mul/Makefile b/programming_examples/basic/vector_scalar_mul/Makefile old mode 100755 new mode 100644 index 8099ac5b83..7b28e874d5 --- a/programming_examples/basic/vector_scalar_mul/Makefile +++ b/programming_examples/basic/vector_scalar_mul/Makefile @@ -11,9 +11,7 @@ include ../../makefile-common VPATH := ../../../aie_kernels/aie2 targetname = vectorScalar -#data_size = 4096 -data_size = 512 -#data_size = 1024 +data_size = 4096 trace_size = 8192 all: build/final_${data_size}.xclbin build/insts_${data_size}.txt @@ -38,7 +36,7 @@ build/final_${data_size}.xclbin: build/aie_${data_size}.mlir build/scale.o cd ${@D} && aiecc.py --aie-generate-cdo --no-compile-host --xclbin-name=${@F} \ --aie-generate-ipu --ipu-insts-name=insts_${data_size}.txt $(<:%=../%) -build/final_trace_${data_size}.xclbin: build/aie_trace.mlir build/scale.o +build/final_trace_${data_size}.xclbin: build/aie_trace_${data_size}.mlir build/scale.o mkdir -p ${@D} cd ${@D} && aiecc.py --aie-generate-cdo --no-compile-host --xclbin-name=${@F} \ --aie-generate-ipu --ipu-insts-name=insts_${data_size}.txt $(<:%=../%) @@ -62,16 +60,16 @@ run_py: build/final_${data_size}.xclbin build/insts_${data_size}.txt trace: ${targetname}_${data_size}.exe build/final_trace_${data_size}.xclbin build/insts_${data_size}.txt ${powershell} ./$< -x build/final_trace_${data_size}.xclbin -i build/insts_${data_size}.txt -k MLIR_AIE -t ${trace_size} - ../../utils/parse_eventIR.py --filename trace.txt --mlir build/aie_trace__${data_size}.mlir --colshift 1 > parse_eventIR_vs.json + ../../utils/parse_trace.py --filename trace.txt --mlir build/aie_trace_${data_size}.mlir --colshift 1 > trace_vs.json trace_py: build/final_trace_${data_size}.xclbin build/insts_${data_size}.txt ${powershell} python3 test.py -x build/final_trace_${data_size}.xclbin -i build/insts_${data_size}.txt -k MLIR_AIE -t ${trace_size} -s ${data_size} - ../../utils/parse_eventIR.py --filename trace.txt --mlir build/aie_trace_${data_size}.mlir --colshift 1 > parse_eventIR_vs.json + ../../utils/parse_trace.py --filename trace.txt --mlir build/aie_trace_${data_size}.mlir --colshift 1 > trace_vs.json clean_trace: - rm -rf tmpTrace trace.txt + rm -rf tmpTrace trace.txt parse*json trace*json clean: clean_trace - rm -rf build _build ${targetname}_*.exe + rm -rf build _build ${targetname}*.exe diff --git a/programming_examples/basic/vector_scalar_mul/README.md b/programming_examples/basic/vector_scalar_mul/README.md index b1e78561d4..2ee29e2e19 100644 --- a/programming_examples/basic/vector_scalar_mul/README.md +++ b/programming_examples/basic/vector_scalar_mul/README.md @@ -78,7 +78,7 @@ To compile the design and C++ testbench: ``` make -make build/vectorScalar.exe +make vectorScalar.exe ``` To run the design: diff --git a/programming_examples/basic/vector_scalar_mul/aie2.py b/programming_examples/basic/vector_scalar_mul/aie2.py old mode 100755 new mode 100644 index d6ca3d0813..7dea893dba --- a/programming_examples/basic/vector_scalar_mul/aie2.py +++ b/programming_examples/basic/vector_scalar_mul/aie2.py @@ -16,8 +16,10 @@ def my_vector_scalar(vector_size, trace_size): + word_size_in = 2 N = vector_size - N_in_bytes = N * 4 + N_in_i32s = N * word_size_in // 4 + N_in_bytes = N_in_i32s * 4 N_div_n = 4 # chop input vector into 4 sub-vectors n = N // N_div_n @@ -27,17 +29,18 @@ def 
my_vector_scalar(vector_size, trace_size): @device(AIEDevice.ipu) def device_body(): - memRef_ty = T.memref(n, T.i32()) + memRef_ty = T.memref(n, T.i16()) memRef_ty2 = T.memref(1, T.i32()) # AIE Core Function declarations scale_scalar = external_func( - "vector_scalar_mul_aie_scalar", + "vector_scalar_mul_int16_scalar", inputs=[memRef_ty, memRef_ty, memRef_ty2, T.i32()], ) scale = external_func( - "vector_scalar_mul_aie", inputs=[memRef_ty, memRef_ty, memRef_ty2, T.i32()] + "vector_scalar_mul_int16_vector", + inputs=[memRef_ty, memRef_ty, memRef_ty2, T.i32()], ) # Tile declarations @@ -78,7 +81,7 @@ def core_body(): yield_([]) # To/from AIE-array data movement - tensor_ty = T.memref(N, T.i32()) + tensor_ty = T.memref(N_in_i32s, T.i32()) scalar_ty = T.memref(1, T.i32()) @FuncOp.from_py_func(tensor_ty, scalar_ty, tensor_ty) @@ -92,15 +95,17 @@ def sequence(A, F, C): size=trace_size, offset=N_in_bytes, ) - ipu_dma_memcpy_nd(metadata="out", bd_id=0, mem=C, sizes=[1, 1, 1, N]) - ipu_dma_memcpy_nd(metadata="in", bd_id=1, mem=A, sizes=[1, 1, 1, N]) + ipu_dma_memcpy_nd( + metadata="out", bd_id=0, mem=C, sizes=[1, 1, 1, N_in_i32s] + ) + ipu_dma_memcpy_nd(metadata="in", bd_id=1, mem=A, sizes=[1, 1, 1, N_in_i32s]) ipu_dma_memcpy_nd(metadata="infactor", bd_id=2, mem=F, sizes=[1, 1, 1, 1]) ipu_sync(column=0, row=0, direction=0, channel=0) try: vector_size = int(sys.argv[1]) - if vector_size % 64 != 0 or vector_size <= 512: + if vector_size % 64 != 0 or vector_size < 512: print("Vector size must be a multiple of 64 and greater than or equal to 512") raise ValueError trace_size = 0 if (len(sys.argv) != 3) else int(sys.argv[2]) diff --git a/programming_examples/basic/vector_scalar_mul/test.cpp b/programming_examples/basic/vector_scalar_mul/test.cpp index e147d91fa4..fe81d3ba9e 100644 --- a/programming_examples/basic/vector_scalar_mul/test.cpp +++ b/programming_examples/basic/vector_scalar_mul/test.cpp @@ -22,7 +22,8 @@ // Configure this to match your buffer data type // ------------------------------------------------------ // using DATATYPE = std::uint8_t; -using DATATYPE = std::uint32_t; +// using DATATYPE = std::uint32_t; +using DATATYPE = std::uint16_t; #endif const int scaleFactor = 3; @@ -67,7 +68,7 @@ int main(int argc, const char *argv[]) { XCL_BO_FLAGS_CACHEABLE, kernel.group_id(0)); auto bo_inA = xrt::bo(device, IN_SIZE, XRT_BO_FLAGS_HOST_ONLY, kernel.group_id(2)); - auto bo_inFactor = xrt::bo(device, 1 * sizeof(DATATYPE), + auto bo_inFactor = xrt::bo(device, 1 * sizeof(int32_t), XRT_BO_FLAGS_HOST_ONLY, kernel.group_id(3)); auto bo_outC = xrt::bo(device, OUT_SIZE, XRT_BO_FLAGS_HOST_ONLY, kernel.group_id(4)); @@ -85,8 +86,8 @@ int main(int argc, const char *argv[]) { bufInA[i] = i + 1; // Initialize buffer bo_inFactor - DATATYPE *bufInFactor = bo_inFactor.map(); - *bufInFactor = scaleFactor; + int32_t *bufInFactor = bo_inFactor.map(); + *bufInFactor = (DATATYPE)scaleFactor; // Zero out buffer bo_outC DATATYPE *bufOut = bo_outC.map(); diff --git a/programming_examples/basic/vector_scalar_mul/test.py b/programming_examples/basic/vector_scalar_mul/test.py index e0ada9be1e..996bb90c78 100644 --- a/programming_examples/basic/vector_scalar_mul/test.py +++ b/programming_examples/basic/vector_scalar_mul/test.py @@ -33,9 +33,9 @@ def main(opts): INOUT1_VOLUME = int(1) # Input only, 1 uint32_t scale factor INOUT2_VOLUME = int(opts.size) # Output only, 64x uint32_t in this example - INOUT0_DATATYPE = np.int32 + INOUT0_DATATYPE = np.int16 INOUT1_DATATYPE = np.int32 - INOUT2_DATATYPE = np.int32 + 
INOUT2_DATATYPE = np.int16 INOUT0_SIZE = INOUT0_VOLUME * INOUT0_DATATYPE().itemsize INOUT1_SIZE = INOUT1_VOLUME * INOUT1_DATATYPE().itemsize @@ -90,7 +90,7 @@ def main(opts): bo_inout2.sync(xrt.xclBOSyncDirection.XCL_BO_SYNC_BO_FROM_DEVICE) # Copy output results and verify they are correct - entire_buffer = bo_inout2.read(OUT_SIZE, 0).view(np.uint32) + entire_buffer = bo_inout2.read(OUT_SIZE, 0).view(np.uint16) output_buffer = entire_buffer[:INOUT2_VOLUME] if opts.verify: if opts.verbosity >= 1: diff --git a/programming_examples/utils/README.md b/programming_examples/utils/README.md index 8a73c46207..2cb70cb7c5 100644 --- a/programming_examples/utils/README.md +++ b/programming_examples/utils/README.md @@ -15,8 +15,8 @@ These utilities are helpful in the current programming examples context and incl - [Open CV Utilities](#open-cv-utilities-opencvutilsh) ([OpenCVUtils.h](./OpenCVUtils.h)) - [Clean microcode shell script](#clean-microcode-shell-script-clean_microcodesh) ([clean_microcode.sh](./clean_microcode.sh)) +- [Trace parser](#trace-parser-parse_tracepy) ([parse_trace.py](./parse_trace.py)) - [Trace parser - eventIR based](#trace-parser---eventir-based-parse_eventirpy) ([parse_eventIR.py](./parse_eventIR.py)) -- [Trace parser, custom](#trace-parser-custom-parse_tracepy) ([parse_trace.py](./parse_trace.py)) ## Open CV Utilities ([OpenCVUtils.h](./OpenCVUtils.h)) OpenCV utilities used in vision processing pipelines to help read and/or initialize images and video. Currently supported functions include the following. Please view header for more specific function information. @@ -32,8 +32,24 @@ OpenCV utilities used in vision processing pipelines to help read and/or initial ## Clean microcode shell script ([clean_microcode.sh](./clean_microcode.sh)) Shell script to do in-place cleanup of microcode files (e.g. core_*.lst). When viewing microcode, it's helpful for some of the extra information like hardware and software breakpoints to be removed so it's easier to see back-to-back lines of microcode. +## Trace parser ([parse_trace.py](./parse_trace.py)) +The text file generated by the host code (`test.cpp` or `test.py`) is formatted as 32-bit hex values, one per line. This python script parses the raw trace packet data and creates a waveform json file for viewing on Perfetto http://ui.perfetto.dev. The script syntax is: + +```bash +parse_trace.py --filename trace.txt --mlir build/aie_trace.mlir --colshift 1 > parse_eventIR_vs.json +``` + +* **--filename** : Input trace packet text file. This is generated during the running of our python host code +* **--mlir** : MLIR source. This is needed to parse what events and tiles we are monitoring to generate labels for our waveform visualizer. +* **--colshift** : runtime column shift. This specifies how much the actual design was shifted from the default position when it was scheduled and called. The reason we need this is because even if our design is configured for column 0, the actual loading and execution of the design may place it in column 1, 2, 3 etc. We account for this shift since the parser needs to match the actual column location of the generated trace data. Usually 1 is the right value. **NOTE** - the underlying tools currently default to column 1 to avoid using column 0 on Ryzen AI since that column does not have a shimDMA and is therefore avoided at the moment. 
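The trace text file consumed by both parsers is simply one 32-bit hex word per line. A minimal sketch of loading it (assuming a `trace.txt` produced by one of the trace targets in this patch) before handing it to a parser:

```python
def read_trace_words(path="trace.txt"):
    # One 32-bit hex value per line, as written out by test.cpp / test.py.
    words = []
    with open(path) as f:
        for line in f:
            line = line.strip()
            if line:
                words.append(int(line, 16))
    return words

words = read_trace_words()
print(f"{len(words)} trace words" + (f", first: 0x{words[0]:08x}" if words else ""))
```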
+ + ## Trace parser - eventIR based ([parse_eventIR.py](./parse_eventIR.py)) -The text file generated by the host code (`test.cpp` or `test.py`) are formatted as 32-bit hex values, one per line. This python script executes a number of steps in order to transform it from trace packet text file into a waveform json file. The script syntax is: +The text file generated by the host code (`test.cpp` or `test.py`) is formatted as 32-bit hex values, one per line. This python script executes a number of steps in order to transform the trace packet text file into a waveform json file. + +**NOTE** - There seem to be some inconsistencies in the results generated by this parser. As of now, it is used only to compare against the existing `hwfrontend` tool. + +The script syntax is: ```bash parse_eventIR.py --filename trace.txt --mlir build/aie_trace.mlir --colshift 1 > parse_eventIR_vs.json @@ -90,9 +106,3 @@ to ``` which reduces the timer from 11,091,042 cycles to 381,175 seems to fix it. -## Trace parser, custom ([parse_trace.py](./parse_trace.py)) -This is our custom trace packet parser based on the trace packet spec, but it's currently a work in progress as some inconsisteancies with the generated waveform have cropped up. It is run in the same way as `parse_eventIR.py` but does not generate the intermediate directory `tmpTrace` or the other intermediate files use by `parse_eventIR.py`. - -```bash -parse_eventIR.py --filename trace.txt --mlir build/aie_trace.mlir --colshift 1 > parse_eventIR_vs.json -``` diff --git a/programming_examples/utils/parse_eventIR.py b/programming_examples/utils/parse_eventIR.py index b7c989ca3c..9b11975e17 100755 --- a/programming_examples/utils/parse_eventIR.py +++ b/programming_examples/utils/parse_eventIR.py @@ -18,7 +18,7 @@ rowoffset = 1 # TODO tmeporary workaround to figure out row offset for AIE2 for tiles DEBUG = False -verbose = False +verbose = True eventIRFile = "eventIR.txt" tmpTraceDirName = "tmpTrace" @@ -733,6 +733,8 @@ def lookup_event_name_by_type(trace_type, code): if trace_type == 0: if code == 0x1: event = "True" + elif code == 23: # 0x17: + event = "MemoryStall" elif code == 24: # 0x18: event = "StreamStall" elif code == 26: # 0x1A: diff --git a/programming_examples/vision/color_detect/Makefile b/programming_examples/vision/color_detect/Makefile index c8feea4cb6..9376fcd770 100755 --- a/programming_examples/vision/color_detect/Makefile +++ b/programming_examples/vision/color_detect/Makefile @@ -42,7 +42,7 @@ build/final_${COLORDETECT_WIDTH}.xclbin: build/aie2_lineBased_8b_${COLORDETECT_W cd ${@D} && aiecc.py --aie-generate-cdo --aie-generate-ipu --no-compile-host \ --xclbin-name=${@F} --ipu-insts-name=insts.txt $(<:%=../%) -build/${targetname}.exe: test.cpp +${targetname}.exe: test.cpp mkdir -p ${@D} rm -rf _build mkdir -p _build @@ -55,7 +55,7 @@ else cp _build/${targetname} $@ endif -run: build/${targetname}.exe build/final_${COLORDETECT_WIDTH}.xclbin build/insts.txt +run: ${targetname}.exe build/final_${COLORDETECT_WIDTH}.xclbin build/insts.txt ${powershell} ./$< -x build/final_${COLORDETECT_WIDTH}.xclbin -i build/insts.txt -k MLIR_AIE clean: diff --git a/programming_examples/vision/vision_passthrough/README.md b/programming_examples/vision/vision_passthrough/README.md index 31d4add65f..ebb86bc0f2 100644 --- a/programming_examples/vision/vision_passthrough/README.md +++ b/programming_examples/vision/vision_passthrough/README.md @@ -15,7 +15,7 @@ Single tile applies a pass through kernel on data from local memory. 
There are t To compile desing in Windows: ``` make -make build/passThrough.exe +make passThrough.exe ``` To run the design: diff --git a/programming_guide/README.md b/programming_guide/README.md index 87f6414e5a..df0471ebc0 100644 --- a/programming_guide/README.md +++ b/programming_guide/README.md @@ -16,9 +16,9 @@ The AI Engine (AIE) array is a spatial compute architecture: a modular and scala Programming the AIE-array configures all its spatial building blocks: the compute cores' program memory, the data movers' buffer descriptors, interconnect with switches, etc. This guide introduces our Interface Representation for hands-ON (IRON) close-to-metal programming of the AIE-array. IRON is an open access toolkit enabling performance engineers to build fast and efficient, often specialized designs through a set of Python language bindings around mlir-aie, our MLIR-based representation of the AIE-array. mlir-aie provides the foundation from which complex and performant AI Engine designs can be defined and is supported by simulation and hardware implementation infrastructure. -> **NOTE:** For those interested in better understanding how AI Engine designs are defined at the MLIR level, take a look through the [MLIR tutorial](../tutorials/) material. mlir-aie also serves as a lower layer for other higher-level abstraction MLIR layers such as [mlir-air](https://github.com/Xilinx/mlir-air). +> **NOTE:** For those interested in better understanding how AI Engine designs are defined at the MLIR level, take a look through the [MLIR tutorial](../mlir_tutorials/) material. mlir-aie also serves as a lower layer for other higher-level abstraction MLIR layers such as [mlir-air](https://github.com/Xilinx/mlir-air). -This IRON AIE programming guide first introduces the language bindings for AIE-array's structural elements ([section 1](./section-1/README.md)). After explaining how to set up explicit data movement ([section 2](./section-2/README.md)) to transport the necessary data, you can run your first program on the AIE compute core ([section 3](./section-3/README.md)). [Section 4](./section-4/README.md) adds tracing for performance analysis and explains how to exploit the compute dense vector operations. More vector design examples, basic and larger (ML or computer vision) are given in sections [5](./section-5/README.md) and [6](./section-6/README.md). Finally, the [quick reference](./quick_reference.md) summarizes the most important API elements. +This IRON AIE programming guide first introduces the language bindings for AIE-array's structural elements ([section 1](./section-1/README.md)). After explaining how to set up explicit data movement ([section 2](./section-2/README.md)) to transport the necessary data, you can run your first program on the AIE compute core ([section 3](./section-3/README.md)). [Section 4](./section-4/README.md) adds tracing for performance analysis and explains how to exploit the compute dense vector operations. More vector design examples, basic and larger (ML or computer vision), are given in sections [5](./section-5/README.md) and [6](./section-6/README.md). Finally, the [quick reference](./quick_reference.md) summarizes the most important API elements. ## Outline
Section 0 - Getting Set Up for IRON diff --git a/programming_guide/assets/ComputeTile.png b/programming_guide/assets/ComputeTile.png new file mode 100644 index 0000000000..065fed189f Binary files /dev/null and b/programming_guide/assets/ComputeTile.png differ diff --git a/programming_guide/assets/ComputeTile_2.png b/programming_guide/assets/ComputeTile_2.png new file mode 100644 index 0000000000..6141e4edd7 Binary files /dev/null and b/programming_guide/assets/ComputeTile_2.png differ diff --git a/programming_guide/assets/aie-ml_shift_adder_path.png b/programming_guide/assets/aie-ml_shift_adder_path.png new file mode 100644 index 0000000000..5723cd6b6f Binary files /dev/null and b/programming_guide/assets/aie-ml_shift_adder_path.png differ diff --git a/programming_guide/assets/aie-ml_srs_ups.png b/programming_guide/assets/aie-ml_srs_ups.png new file mode 100644 index 0000000000..1d45408bfe Binary files /dev/null and b/programming_guide/assets/aie-ml_srs_ups.png differ diff --git a/programming_guide/assets/aie-ml_vector_unit.png b/programming_guide/assets/aie-ml_vector_unit.png new file mode 100644 index 0000000000..2151f8fecc Binary files /dev/null and b/programming_guide/assets/aie-ml_vector_unit.png differ diff --git a/programming_guide/assets/aie_compute_details1.png b/programming_guide/assets/aie_compute_details1.png new file mode 100755 index 0000000000..3bcc968d94 Binary files /dev/null and b/programming_guide/assets/aie_compute_details1.png differ diff --git a/programming_guide/assets/aie_vector_scalar_ml_opt1.png b/programming_guide/assets/aie_vector_scalar_ml_opt1.png new file mode 100755 index 0000000000..712fa28814 Binary files /dev/null and b/programming_guide/assets/aie_vector_scalar_ml_opt1.png differ diff --git a/programming_guide/assets/trace_vector_scalar_add1.png b/programming_guide/assets/trace_vector_scalar_add1.png new file mode 100644 index 0000000000..fd6e9e8e42 Binary files /dev/null and b/programming_guide/assets/trace_vector_scalar_add1.png differ diff --git a/programming_guide/quick_reference.md b/programming_guide/quick_reference.md index b3c1c9e926..4e0c5d11b7 100644 --- a/programming_guide/quick_reference.md +++ b/programming_guide/quick_reference.md @@ -65,6 +65,7 @@ | Lock stall |0x1A| 26 | | Core Port Running 1 |0x4F| 79 | | Core Port Running 0 |0x4B| 75 | + * A more exhaustive list of events for core tile, core memory, memtile and shim tile can be found in [this header file](https://github.com/Xilinx/aie-rt/blob/main-aie/driver/src/events/xaie_events_aie.h) ## AI Engine documentation * [Summary Documentation Links in UG1076](https://docs.amd.com/r/en-US/ug1076-ai-engine-environment/Documentation) @@ -73,5 +74,8 @@ * [AIE2 Architecture Manual - AM020](https://docs.amd.com/r/en-US/am020-versal-aie-ml/Overview) * [AIE2 Register Reference - AM025](https://docs.amd.com/r/en-US/am025-versal-aie-ml-register-reference/Overview) * [AIE API User Guide - v2023.2](https://www.xilinx.com/htmldocs/xilinx2023_2/aiengine_intrinsics/intrinsics/index.html) +* [AIE1 Intrinsics User Guide - v2023.2](https://www.xilinx.com/htmldocs/xilinx2023_2/aiengine_intrinsics/intrinsics/index.html) +* [AIE2 Intrinsics User Guide - v2023.2](https://www.xilinx.com/htmldocs/xilinx2023_2/aiengine_ml_intrinsics/intrinsics/index.html) - +## AIE Detailed References +* [AIE2 - Table of supported data types and vector sizes (AIE API)](https://www.xilinx.com/htmldocs/xilinx2023_2/aiengine_api/aie_api/doc/group__group__basic__types.html) diff --git a/programming_guide/section-0/README.md 
b/programming_guide/section-0/README.md index f71d3d368c..c4b5558927 100644 --- a/programming_guide/section-0/README.md +++ b/programming_guide/section-0/README.md @@ -14,11 +14,11 @@ This programming guide focuses on application programming for the NPU found in R ## Recommended Hardware -* [Phoenix Point Mini PC : Minisforum UM790 Pro : AMD Ryzen™ 9 7940HS](https://store.minisforum.com/products/minisforum-um790-pro?variant=43865372786933) +* [Phoenix Point Mini PC: Minisforum UM790 Pro : AMD Ryzen™ 9 7940HS](https://store.minisforum.com/products/minisforum-um790-pro?variant=43865372786933) -* [Hawk Point Mini PC : GMKtec NucBox K8 : AMD Ryzen™ 7 8845HS](https://www.gmktec.com/products/amd-ryzen-7-8845hs-mini-pc-nucbox-k8?spm=..product_fe40bedf-d378-40fc-a60a-54d18f1dbc53.header_1.1&variant=71118fa4-6acb-4d6e-abcb-1d3cf00e6438) +* [Hawk Point Mini PC: GMKtec NucBox K8 : AMD Ryzen™ 7 8845HS](https://www.gmktec.com/products/amd-ryzen-7-8845hs-mini-pc-nucbox-k8?spm=..product_fe40bedf-d378-40fc-a60a-54d18f1dbc53.header_1.1&variant=71118fa4-6acb-4d6e-abcb-1d3cf00e6438) -* [Phoenix Point Laptop : ASUS Vivobook Pro 15 M6500XV-EB96 : AMD Ryzen™ 9 7940HS](https://www.asus.com/us/laptops/for-creators/vivobook/asus-vivobook-pro-15-oled-m6500/) +* [Phoenix Point Laptop: ASUS Vivobook Pro 15 M6500XV-EB96 : AMD Ryzen™ 9 7940HS](https://www.asus.com/us/laptops/for-creators/vivobook/asus-vivobook-pro-15-oled-m6500/) ## AMD Ryzen™ AI Initial Setup diff --git a/programming_guide/section-1/README.md b/programming_guide/section-1/README.md index a70c2e4dd5..3444d9e392 100644 --- a/programming_guide/section-1/README.md +++ b/programming_guide/section-1/README.md @@ -10,24 +10,24 @@ # Section 1 - Basic AI Engine building blocks -When we program the AIE-array, we need to declare and configure its structural building blocks: compute tiles for vector processing, memory tiles as larger level-2 shared scratchpads, and shim tiles supporting data movement to external memory. In this programming guide, we will be utilizing the IRON python bindings for MLIR-AIE components to describe our design at the tile level of granularity. Later on, when we focus on kernel programming, we will explore vector programming in C/C++. But let's first look at a basic python source file (named [aie2.py](./aie2.py)) for an IRON design. +When we program the AIE-array, we need to declare and configure its structural building blocks: compute tiles for vector processing, memory tiles as larger level-2 shared scratchpads, and shim tiles supporting data movement to external memory. In this programming guide, we will be utilizing the IRON Python bindings for MLIR-AIE components to describe our design at the tile level of granularity. Later on, when we focus on kernel programming, we will explore vector programming in C/C++. But let's first look at a basic Python source file (named [aie2.py](./aie2.py)) for an IRON design. -## Walkthrough of python source file (aie2.py) -At the top of this python source, we include modules that define the IRON AIE language bindings `aie.dialects.aie` and the mlir-aie context `aie.extras.context` which binds to MLIR definitions for AI Engines. +## Walkthrough of Python source file (aie2.py) +At the top of this Python source, we include modules that define the IRON AIE language bindings `aie.dialects.aie` and the mlir-aie context `aie.extras.context` which binds to MLIR definitions for AI Engines. 
``` from aie.dialects.aie import * # primary mlir-aie dialect definitions from aie.extras.context import mlir_mod_ctx # mlir-aie context ``` -Then we declare a structural design function that will expand into mlir code when it will get called from within an mlir-aie context (see last part of this subsection). +Then we declare a structural design function that will expand into MLIR code when it will get called from within an mlir-aie context (see last part of this subsection). ``` # AI Engine structural design function def mlir_aie_design(): <... AI Engine device, blocks and connections ...> ``` -Let's look at how we declare the AI Engine device, blocks and connections. We start off by declaring our AIE device via `@device(AIEDevice.npu)` or `@device(AIEDevice.xcvc1902)`. The blocks and connections themselves will then be declared inside the `def device_body():`. Here, we instantiate our AI Engine blocks, which in this first example are simply AIE compute tiles. +Let's look at how we declare the AI Engine device, blocks and connections. We start off by declaring our AIE device via `@device(AIEDevice.npu)` or `@device(AIEDevice.xcvc1902)`. The blocks and connections themselves will then be declared inside the `def device_body():`. Here, we instantiate our AI Engine blocks, which in this first example are AIE compute tiles. -The arguments for the tile declaration are the tile coordinates (column, row) and we assign it a variable tile name in our python program. +The arguments for the tile declaration are the tile coordinates (column, row). We assign each declared tile to a variable in our Python program. > **NOTE:** The actual tile coordinates used on the device when the program is run may deviate from the ones declared here. For example, on the NPU on Ryzen™ AI (`@device(AIEDevice.npu)`), these coordinates tend to be relative coordinates as the runtime scheduler may assign it to a different available column during runtime. @@ -37,20 +37,20 @@ The arguments for the tile declaration are the tile coordinates (column, row) an def device_body(): # Tile declarations - ComputeTile = tile(1, 3) - ComputeTile = tile(2, 3) - ComputeTile = tile(2, 4) + ComputeTile1 = tile(1, 3) + ComputeTile2 = tile(2, 3) + ComputeTile3 = tile(2, 4) ``` -Once we are done declaring our blocks (and connections) within our design function, we move onto the main body of our program where we call the function and output our design in MLIR. This is done by first declaring the MLIR context via the `with mlir_mod_ctx() as ctx:` line. This indicates that subsequent indented python code is in the MLIR context and we follow this by calling our previosly defined design function `mlir_aie_design()`. This means all the code within the design function is understood to be in the MLIR context and contains the IRON custom python binding definitions of the more detailed mlir block definitions. The final line is `print(ctx.module)` which takes the code defined in our MLIR context and prints it stdout. This will then convert our python binded code to its MLIR equivalent and print it to stdout. +Once we are done declaring our blocks (and connections) within our design function, we move onto the main body of our program where we call the function and output our design in MLIR. This is done by first declaring the MLIR context via the `with mlir_mod_ctx() as ctx:` line. This indicates that subsequent indented Python code is in the MLIR context and we follow this by calling our previosly defined design function `mlir_aie_design()`. 
This means all the code within the design function is understood to be in the MLIR context and contains the IRON custom Python binding definitions of the more detailed MLIR block definitions. The final line is `print(ctx.module)` which takes the code defined in our MLIR context and prints it stdout. This will then convert our Python-bound code to its MLIR equivalent and print it to stdout. ``` # Declares that subsequent code is in mlir-aie context with mlir_mod_ctx() as ctx: mlir_aie_design() # Call design function within the mlir-aie context - print(ctx.module) # Print the python-to-mlir conversion to stdout + print(ctx.module) # Print the Python-to-MLIR conversion to stdout ``` ## Other Tile Types -Next to the compute tiles, an AIE-array also contains data movers for accessing L3 memory (also called shim DMAs) and larger L2 scratchpads (called mem tiles) which are available since the AIE-ML generation - see [the introduction of this programming guide](../README.md). Declaring these other types of structural blocks follows the same syntax but requires physical layout details for the specific target device. Shim DMAs typically occupy row 0, while mem tiles (when available) often reside on the following row(s). The following code segment declares all the different tile types found in a single NPU column. +Next to the compute tiles, an AIE-array also contains data movers for accessing L3 memory (also called shim DMAs) and larger L2 scratchpads (called mem tiles) which are available since the AIE-ML generation - see [the introduction of this programming guide](../README.md). Declaring these other types of structural blocks follows the same syntax but requires physical layout details for the specific target device. Shim DMAs typically occupy row 0, while mem tiles (when available) often reside on row 1. The following code segment declares all the different tile types found in a single NPU column. ``` # Device declaration - here using aie2 device NPU @@ -67,13 +67,13 @@ Next to the compute tiles, an AIE-array also contains data movers for accessing ``` ## Exercises -1. To run our python program from the command line, we type `python3 aie2.py` which converts our python structural design into MLIR source code. This works from the command line if our design environment already contains the mlir-aie python binded dialect module. We included this in the [Makefile](./Makefile) so go ahead and run `make` now. Then take a look at the generated MLIR source under `build/aie.mlir`. +1. To run our Python program from the command line, we type `python3 aie2.py` which converts our Python structural design into MLIR source code. This works from the command line if our design environment already contains the mlir-aie Python-bound dialect module. We included this in the [Makefile](./Makefile), so go ahead and run `make` now. Then take a look at the generated MLIR source under `build/aie.mlir`. -2. Run `make clean` to remove the generated files. Then introduce an error to the python source such as misspelling `tile` to `tilex` and then run `make` again. What messages do you see? +2. Run `make clean` to remove the generated files. Then introduce an error to the Python source such as misspelling `tile` to `tilex` and then run `make` again. What messages do you see? -3. Run `make clean` again. Now change the error by renaming `tilex` back to `tile` but change the coordinates to (-1,3) which is an inavlid location. Run `make` again. What messages do you see now? +3. Run `make clean` again. 
Now change the error by renaming `tilex` back to `tile`, but change the coordinates to (-1,3) which is an invalid location. Run `make` again. What messages do you see now? -4. No error is generated but our code is invalid. Take a look at the generated MLIR code under `build/aie.mlir`. This generated output is invalid MLIR syntax and running our mlir-aie tools on this MLIR source will generate an error. We do, however, have some additional python structural syntax checks that can be enabled if we use the function `ctx.module.operation.verify()`. This verifies that our python binded code has valid operation within the mlir-aie context. +4. No error is generated but our code is invalid. Take a look at the generated MLIR code under `build/aie.mlir`. This generated output is invalid MLIR syntax and running our mlir-aie tools on this MLIR source will generate an error. We do, however, have some additional Python structural syntax checks that can be enabled if we use the function `ctx.module.operation.verify()`. This verifies that our Python-bound code has valid operation within the mlir-aie context. Qualify the `print(ctx.module)` call with a check on `ctx.module.operation.verify()` using a code block like the following: ``` @@ -83,7 +83,7 @@ Next to the compute tiles, an AIE-array also contains data movers for accessing else: print(res) ``` - Make this change and run `make` again. What message do you see now? + Make this change and run `make` again. What message do you see now? ----- [[Prev - Section 0](../section-0/)] [[Top](..)] [[Next - Section 2](../section-2/)] diff --git a/programming_guide/section-2/README.md b/programming_guide/section-2/README.md index f22c164946..0679fb2cb1 100644 --- a/programming_guide/section-2/README.md +++ b/programming_guide/section-2/README.md @@ -16,7 +16,7 @@ In this section of the programming guide, we introduce the Object FIFO high-leve 3. understand the design decisions which led to current limitations and/or restrictions in the Object FIFO design, 4. know where to find more in-depth material of the Object FIFO implementation and lower-level lowering. -To understand the need for a data movement abstraction we must first understand the hardware architecture with which we are working. The AIE array is a [spatial compute architecture](../README.md) with explicit data movement requirements. Each compute unit of the array works on data that is stored within its L1 memory module and that data needs to be explicitly moved there as part of the AIE's array global data movement configuration. This configuration involves several specialized hardware resources which handle the data movement over the entire array in such a way that data arrives at its destination without loss. The Object FIFO provides users with a way to specify the data movement in a more human comprehensible and accessible manner without sacrificing some of the more advanced control possibilities which the hardware provides. +To understand the need for a data movement abstraction we must first understand the hardware architecture with which we are working. The AIE array is a [spatial compute architecture](../README.md) with explicit data movement requirements. Each compute unit of the array works on data that is stored within its L1 memory module and that data needs to be explicitly moved there as part of the AIE's array global data movement configuration. 
This configuration involves several specialized hardware resources which handle the data movement over the entire array in such a way that data arrives at its destination without loss. The Object FIFO provides users with a way to specify the data movement in a more human-comprehensible and accessible manner, without sacrificing some of the more advanced control possibilities which the hardware provides. > **NOTE:** For more in-depth, low-level material on Object FIFO programming in MLIR please see the MLIR-AIE [tutorials](../mlir_tutorials). diff --git a/programming_guide/section-2/section-2a/README.md b/programming_guide/section-2/section-2a/README.md index 61b367145d..2eb790472e 100644 --- a/programming_guide/section-2/section-2a/README.md +++ b/programming_guide/section-2/section-2a/README.md @@ -28,11 +28,11 @@ class object_fifo: dimensionsFromStreamPerConsumer=None, ) ``` -We will now go over each of the inputs, what they represents and why they are required by the abstraction. We will first focus on the mandatory inputs and in a later section of the guide on the default valued ones (see Data Layout Transformations in [section-2c](../section-2c/README.md#data-layout-transformations)). +We will now go over each of the inputs, what they represent and why they are required by the abstraction. We will first focus on the mandatory inputs and in a later section of the guide on the default-valued ones (see Data Layout Transformations in [section-2c](../section-2c/README.md#data-layout-transformations)). -First of all, an Object FIFO has a unique `name` which is required for the lowering steps. It functions as an ordered buffer that has `depth`-many objects of specified `datatype`. Currently, all objects in an Object FIFO have to be of the same datatype. The `datatype` is a tensor-like attribute where the size of the tensor and the type of the individual elements are specified at the same time (i.e. `<16xi32>`). The `depth` can be either an integer or an array of integers. The latter is used to support a specific dependency that can arise when working with multiple Object FIFOs and it is further explained in the Key Object FIFO Patterns [section](../section-2b/02_Broadcast/README.md#object-fifo-broadcast-pattern). +First of all, an Object FIFO has a unique `name` which is required for the lowering steps. The Object FIFO functions as an ordered buffer that has a count of `depth` objects of specified `datatype`. Currently, all objects in an Object FIFO have to be of the same datatype. The `datatype` is a tensor-like attribute where the size of the tensor and the type of the individual elements are specified at the same time (i.e. `<16xi32>`). The `depth` can be either an integer or an array of integers. The latter is explained further down in this section. -An Object FIFO is created between a producer, or source tile, and a consumer, or destination tile. The tiles are where producer and consumer processes accessing the Object FIFO will be executed. Below, you can see an example of an Object FIFO created between producer tile A and consumer tile B: +An Object FIFO is created between a producer, or source tile, and a consumer, or destination tile. The tiles are where producer and consumer processes accessing the Object FIFO will be executed. These processes are also referred to as the "actors" of the Object FIFO, based on dataflow theory terminology.
Below, you can see an example of an Object FIFO created between producer tile A and consumer tile B: ```python A = tile(1, 3) B = tile(2, 4) @@ -42,17 +42,17 @@ The created Object FIFO is stored in the `of0` variable and is named `objfifo0`. -As you will see in the Key Object FIFO Patterns [section](../section-2b/README.md#key-object-fifo-patterns), an Object FIFO can have multiple consumer tiles, which describes a broadcast connection from the source tile to all of the consumer tiles. As such, the `consumerTiles` input can be either a single tile or an array of tiles. This is not the case for the `producerTile` input as currently the Object FIFO does not support multiple producers. +As you will see in the ["Key Object FIFO Patterns" section](../section-2b/README.md#key-object-fifo-patterns), an Object FIFO can have multiple consumer tiles, which describes a broadcast connection from the source tile to all of the consumer tiles. As such, the `consumerTiles` input can be either a single tile or an array of tiles. This is not the case for the `producerTile` input, as currently the Object FIFO does not support multiple producers. ### Accessing the objects of an Object FIFO -An Object FIFO can be accessed by the processes running on the producer and consumer tiles registered to it. Before a process can have access to the objects it has to acquire them from the Object FIFO. This is because the Object FIFO is a synchronized communication primitive that leverages the synchronization mechanism available in the target hardware architecture to ensure that two processes can't access the same object at the same time. Once a process has finished working with an object and has no further use for it, it should release it so that another process will be able to acquire and access it. The patterns in which a producer or a consumer process acquires and releases objects from an Object FIFO are called `access patterns`. We can specifically refer to the acquire and release patterns as well. +An Object FIFO can be accessed by the processes running on the producer and consumer tiles registered to it. Before a process can have access to the objects, it has to acquire them from the Object FIFO. This is because the Object FIFO is a synchronized communication primitive that leverages the synchronization mechanism available in the target hardware architecture to ensure that two processes cannot access the same object at the same time. Once a process has finished working with an object and has no further use for it, it must release it so that another process will be able to acquire and access it. The patterns in which a producer or a consumer process acquires and releases objects from an Object FIFO are called "access patterns". We can specifically refer to the acquire and release patterns as well. To acquire one or multiple objects users should use the acquire function of the `object_fifo` class: ```python def acquire(self, port, num_elem) ``` -Based on the `num_elem` input representing the number of acquired elements, the acquire function will either directly return an object, or an array of objects that can be accessed in an array-like fashion. +Based on the `num_elem` input representing the number of acquired elements, the acquire function will either directly return an object, or an array of objects. The Object FIFO is an ordered primitive and the API keeps track, for each process, of which object it will have access to next when acquiring, based on how many objects it has already acquired and released.
Specifically, the first time a process acquires an object it will have access to the first object of the Object FIFO, and after releasing it and acquiring a new one, it'll have access to the second object, and so on until the last object, after which the order starts from the first one again. When acquiring multiple objects and accessing them in the returned array, the object at index 0 will always be the oldest object that that process has access to, which may not be the first object in the pool of that Object FIFO. @@ -60,11 +60,11 @@ To release one or multiple objects users should use the release function of the ```python def release(self, port, num_elem) ``` -A process may release one, some or all of the objects it has acquired. The release function will release objects from oldest to youngest in acquired order. If a process does not release all of the objects it has acquired, then the next time it acquires objects the oldest objects will be those that were not released. This functionality is intended to achieve the behaviour of a sliding window through the Object FIFO primitive. This is described further in the Key Object FIFO Patterns [section](../section-2b/01_Reuse/README.md#object-fifo-reuse-pattern). +A process may release one, some or all of the objects it has acquired. The release function will release objects from oldest to youngest in acquired order. If a process does not release all of the objects it has acquired, then the next time it acquires objects the oldest objects will be those that were not released. This functionality is intended to achieve the behaviour of a sliding window through the Object FIFO primitive. This is described further in the ["Key Object FIFO Patterns" section](../section-2b/01_Reuse/README.md#object-fifo-reuse-pattern). -When acquiring the objects of an Object FIFO using the acquire function it is important to note that any unreleased objects from a previous acquire will also be returned by the most recent acquire call. Unreleased objects will not be reacquired in the sense that the synchronization mechanism used under the hood has already been set in place such that the process already has the sole access rights to the unreleased objects from the previous acquire. As such, two acquire calls back-to-back without a release call in-between will result in the same objects being returned by both acquire calls. This decision was made to facilitate the understanding of releasing objects between calls to the acquire function as well as to ensure a proper lowering through the Object FIFO primitive. A code example of this behaviour is available in the Key Object FIFO Patterns [section](../section-2b/01_Reuse/README.md#object-fifo-reuse-pattern). +When acquiring the objects of an Object FIFO using the acquire function it is important to note that any unreleased objects from a previous acquire will also be returned by the most recent acquire call. Unreleased objects will not be reacquired in the sense that the synchronization mechanism used under the hood has already been set in place such that the process already has the sole access rights to the unreleased objects from the previous acquire. As such, two acquire calls back-to-back without a release call in-between will result in the same objects being returned by both acquire calls. This decision was made to facilitate the understanding of releasing objects between calls to the acquire function as well as to ensure a proper lowering through the Object FIFO primitive. 
A code example of this behaviour is available in the ["Key Object FIFO Patterns" section](../section-2b/01_Reuse/README.md#object-fifo-reuse-pattern). -The `port` input of both the acquire and the release functions represents whether that process is a producer or a consumer process and it is an important hint for the Object FIFO lowering to properly leverage the underlying synchronization mechanism. Its value may be either `ObjectFifoPort.Produce` or `ObjectFifoPort.Consume`. However, an important thing to note is that the terms producer and consumers are used mainly as a means to provide a logical reference for a human user to keep track of what process is at what end of the data movement, but it does not restrict the behaviour of that process, i.e., a producer process may simply access an object to read it and does not require to modify it. +The `port` input of both the acquire and the release functions represents whether that process is a producer or a consumer process and it is an important indication for the Object FIFO lowering to properly leverage the underlying synchronization mechanism. Its value may be either `ObjectFifoPort.Produce` or `ObjectFifoPort.Consume`. However, an important thing to note is that the terms producer and consumer are used mainly as a means to provide a logical reference for a human user to keep track of what process is at what end of the data movement, but it does not restrict the behaviour of that process, i.e., a producer process may simply access an object to read it and is not required to modify it. Below you can see an example of two processes that are iterating over the objects of the Object FIFO `of0` that we initialized in the previous section, one running on the producer tile and the other on the consumer tile. To do this, the producer process runs a loop of three iterations, equal to the depth of `of0`, and during each iteration it acquires one object from `of0`, calls a `test_func` function on the acquired object, and releases the object. The consumer process only runs once and acquires all three objects from `of0` at once and stores them in the `elems` array, from which it can access each object individually in any order. It then calls a `test_func2` function three times and in each call it gives as input one of the objects it acquired, before releasing all three objects at the end. ```python @@ -89,15 +89,15 @@ def core_body(): of0.release(ObjectFifoPort.Consume, 3) ``` -The code above can be viewed as in the figure below where each of the 4 drawings can be seen as the state of the system during an iteration of execution. In the first three iterations, the producer process, drawn in blue, on tile A progressively acquires the elements of `of0` one by one. Once the third element has been released in the forth iteration the consumer process, drawn in green, on tile B is able to acquire all three objects at once. +The figure below illustrates this code: Each of the 4 drawings represents the state of the system during one iteration of execution. In the first three iterations, the producer process on tile A, drawn in blue, progressively acquires the elements of `of0` one by one. Once the third element has been released in the fourth iteration, the consumer process on tile B, drawn in green, is able to acquire all three objects at once. Examples of designs that use these features are available in Section 2e: [01_single_double_buffer](../section-2e/01_single_double_buffer/) and [02_external_mem_to_core](../section-2e/02_external_mem_to_core/).
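As a minimal sketch of the back-to-back acquire behaviour described above (reusing `of0`, tile B, and `test_func2` from the earlier example; the acquire counts shown here are purely illustrative):
```python
@core(B)
def core_body():
    # First acquire: B gains access to the two oldest objects of of0
    elems = of0.acquire(ObjectFifoPort.Consume, 2)
    # Second acquire without an intervening release: the same two
    # unreleased objects are returned again, no new objects are taken
    elems_again = of0.acquire(ObjectFifoPort.Consume, 2)
    call(test_func2, [elems_again[0]])
    call(test_func2, [elems_again[1]])
    # Release both objects so the producer can acquire them again
    of0.release(ObjectFifoPort.Consume, 2)
```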
-### Object FIFOs with same producer / consumer +### Object FIFOs with the same producer / consumer -An Object FIFO can be created with the same tile as both its producer and consumer tile. This is mostly done in order to ensure proper synchronization within the process itself, as opposed to synchronization across multiple processes running on different tiles, as we've seen in examples up until this point. Composing two kernels with access to a shared buffer is an application that leverages this property of the Object FIFO, as showcased in the code snippet below, where `test_func` and `test_func2` are composed using `of0`: +An Object FIFO can be created with the same tile as both its producer and consumer tile. This is mostly done to ensure proper synchronization within the process itself, as opposed to synchronization across multiple processes running on different tiles, as we have seen in examples up until this point. Composing two kernels with access to a shared buffer is an application that leverages this property of the Object FIFO, as showcased in the code snippet below, where `test_func` and `test_func2` are composed using `of0`: ```python A = tile(1, 3) of0 = object_fifo("objfifo0", A, A, 3, T.memref(256, T.i32())) @@ -115,5 +115,76 @@ def core_body(): yield_([]) ``` +### Specifying the Object FIFO Depth as an Array + +As was mentioned in the beginning of this section, the AIE architecture is a spatial architecture that requires explicit data movement. As such, while the Object FIFO's conceptual design is that of an ordered buffer between two or more AIE tiles, in reality its conceptual depth is spread out over multiple resource pools that may be located at different levels of the memory hierarchy and on different tiles. + +A more in-depth, yet still abstract, view of the Object FIFO's depth is that the producer and each consumer have their own working resource pool available in their local memory modules which they can use to send and receive data in relation to the data movement described by the Object FIFO. The Object FIFO primitive and its lowering typically allocate the depth of each of these pools such that the resulting behaviour matches that of the conceptual depth. + +The user does however have the possibility to manually choose the depth of these pools. This feature is available because, while the Object FIFO primitive tries to offer a unified representation of the data movement across the AIE array, it also aims to provide performance programmers with the tools to more finely control it. + +For example, in the code snippet below `of0` describes the data movement between producer A and consumer B: +```python +A = tile(1, 3) +B = tile(2, 4) +of0 = object_fifo("objfifo0", A, B, 3, T.memref(256, T.i32())) +``` +The conceptual depth of the Object FIFO is `3`. 
The reasoning behind this choice of depth can be understood by looking at the acquire and release patterns of the two actors: +```python +@core(A) +def core_body(): + for _ in range_(9): + elem0 = of0.acquire(ObjectFifoPort.Produce, 1) + call(produce_func, [elem0]) + of0.release(ObjectFifoPort.Produce, 1) + yield_([]) + +@core(B) +def core_body(): + for _ in range_(9): + elems = of0.acquire(ObjectFifoPort.Consume, 2) + call(consume_func, [elems[0], elems[1]]) + of0.release(ObjectFifoPort.Consume, 2) + yield_([]) +``` +Each iteration: +* producer A acquires one object to produce into, calls the kernel function `produce_func` to store new data in it for B to consume, and releases the object, +* consumer B acquires two objects to consume, reads the data and applies kernel function `consume_func`, then releases both objects. + +A conceptual depth of `2` would have sufficed for this system to function without deadlocking. However, with a depth of `3`, A and B can execute concurrently, i.e., while B consumes two objects and applies the kernel function, A has one object available into which it can produce at the same time. + +The equivalent of this conceptual depth of `3` using an array of depths would be: +```python +of0 = object_fifo("objfifo0", A, B, [1, 2], T.memref(256, T.i32())) +``` +where `1` is the number of resources available locally to producer A and `2` is the number available to consumer B. + +> **NOTE:** For a correct lowering, this feature should be used in situations where the producers and consumers of the Object FIFO are running on different tiles. + +The feature of specifying the depths of the resource pools for different actors of the Object FIFO is used to support a specific dependency that can arise when working with multiple Object FIFOs and it is further explained in the ["Key Object FIFO Patterns" section](../section-2b/02_Broadcast/README.md#object-fifo-broadcast-pattern). + +### Advanced Topic: Data Movement Accelerators + +**The following topic is not required to understand the rest of this guide.** + +This part of the guide introduces a few lower-level concepts in the AIE hardware and takes a closer look at the individual resource pools on each tile and the reasoning behind their depths. + +Every tile in the AIE array has its own dedicated Data Movement Accelerator (or "DMA"). The DMAs are responsible for moving data from the tile's memory module to the AXI stream interconnect, or from the stream to the memory module. In the case of compute tiles, both the compute core and the tile's DMA are able to access the tile's memory module. Because of this, there is a need for a **synchronization mechanism** that will allow the compute core and the DMA to signal to each other when data is available for the other party to read or write in order to avoid data corruption. This is very similar to the concept of the Object FIFO where producers and consumers must first acquire objects before they can access them, and release them when they are done so they may be acquired by the other party. + +The figure below showcases a high-level view of a compute tile, where the compute core and the DMA are both reading and writing data to a location `buff` in the local memory module: + + + +The intent of this high-level view is to showcase that the DMA is able to interact with memory buffers while the compute core is simultaneously accessing them. The DMA can send data from a buffer onto the AXI stream, and receive data from the stream to write into a buffer which the core is processing.
Because this concurrency can lead to data races, a ping-pong buffer (also called double buffer) is often used instead of a single buffer. This is showcased in the figure below where the `buff` has been extended to a `buff_ping` and `buff_pong`: + + + +> **NOTE:** It is possible to directly configure the DMAs without the use of the Object FIFO primitive to set up data movement between tiles. This is described in [Section 2f](../section-2f/README.md). + +## Exercises +1. In the previous [subsection](./README.md/#specifying-the-object-fifo-depth-as-an-array) it was explained that the conceptual depth of `3` for `of0` could be represented as an array of depths `[1, 2]`. With the advanced knowledge on the topic of DMAs, do you think those depths suffice for the compute cores on tiles A and B to run concurrently with their local DMAs? + +1. How would you update the depths? + ----- [[Up](..)] [[Next - Section 2b](../section-2b/)] diff --git a/programming_guide/section-2/section-2b/01_Reuse/README.md b/programming_guide/section-2/section-2b/01_Reuse/README.md index 883f7567a9..5ffb69d2d4 100644 --- a/programming_guide/section-2/section-2b/01_Reuse/README.md +++ b/programming_guide/section-2/section-2b/01_Reuse/README.md @@ -10,9 +10,9 @@ # Object FIFO Reuse Pattern -During the previous [section](../../section-2a/README.md#accessing-the-objects-of-an-object-fifo) it was mentioned that the Object FIFO acquire and release functions can be paired together to achieve the behaviour of a sliding window with data reuse. Specifically, this communication pattern occurs when a producer or a consumer of an Object FIFO releases less objects than it had previously acquired. As acquiring from an Object FIFO does not destroy the data, unreleased objects can be reused without requiring new copies of the data. +In the previous [section](../../section-2a/README.md#accessing-the-objects-of-an-object-fifo) it was mentioned that the Object FIFO acquire and release functions can be paired together to achieve the behaviour of a sliding window with data reuse. Specifically, this communication pattern occurs when a producer or a consumer of an Object FIFO releases fewer objects than it had previously acquired. As acquiring from an Object FIFO does not destroy the data, unreleased objects can continue to be used without requiring new copies of the data. -It is important to note that each new acquire function will return a new object or array of objects that a process can access, which includes unreleased objects from previous acquires. The process should always use the result of the most recent acquire call to access unreleased objects to ensure a proper lowering through the Object FIFO primitive. +It is important to note that each new acquire function will return a new object or array of objects that a process can access, which **includes unreleased objects from previous acquire calls**. The process should always use the result of the **most recent** acquire call to access unreleased objects to ensure a proper lowering through the Object FIFO primitive. In the example below `of0` is created between producer A and consumer B with a depth of 3 objects: object0, object1, and object2. The process running on the core of tile B is showcased in the next figure and explained in-depth below. ```python @@ -43,13 +43,13 @@ def core_body(): The figure below represents the status of the system in each of the marked situations 1 through 4: 1. Consumer B first acquires 2 elements from `of0` in the variable `elems`.
As this is the first time that B acquires, it will have access to object0 and object1. B then applies `test_func2` on the two acquired elements. Finally, B releases a single object, the oldest acquired one, and keeps object1. -2. B acquires 2 elements in variable `elems_2`. It already has access to object1 which remains unreleased from 1, but also to the newly acquired object2. B again applies the function after which it only releases a single object and keeps object2. +2. B acquires 2 elements in variable `elems_2`. It now has access to object1 (which remains acquired from the first acquire call at step 1), and also to the newly acquired object2. B again applies the function, after which it only releases a single object and keeps object2. 3. B acquires 2 objects in `elems_3` and has access to object2 and object0. B releases a single object and keeps object0. -4. B acquires 2 objects in `elems_4` and has access to object0 and object1 thus returning to the situation at the beginning of 1. +4. B acquires 2 objects in `elems_4` and has access to object0 and object1 thus returning to the situation at the beginning of step 1. -The situations above can be fused into a for loop with 4 iterations. By continuously releasing one less element than it acquired every iteration, the consumer process running on tile B is implementing the behaviour of a sliding window with 2 objects that slides down by 1 each new iteration. +The situations above can be fused into a `for`-loop with 4 iterations. By continuously releasing one less element than it acquired every iteration, the consumer process running on tile B is implementing the behaviour of a sliding window with 2 objects that slides down by 1 in each iteration. ```python A = tile(1, 3) B = tile(2, 4) diff --git a/programming_guide/section-2/section-2b/02_Broadcast/README.md b/programming_guide/section-2/section-2b/02_Broadcast/README.md index 3410764df7..1381619588 100644 --- a/programming_guide/section-2/section-2b/02_Broadcast/README.md +++ b/programming_guide/section-2/section-2b/02_Broadcast/README.md @@ -27,9 +27,9 @@ of0 = object_fifo("objfifo0", A, [B, C, D], 3, T.memref(256, T.i32())) The `depth` input of an Object FIFO can also be specified as an array of integers, which describe the number of objects that are available to each tile (the producer tile plus each consumer tile) when accessing the Object FIFO. For the previous example, each of the four tiles has a resource pool of 3 objects available to perform the data movement of `of_0`. -> **NOTE:** This functionality of the Object FIFO primitive exposes what is actually going on at the hardware level when the data movement is established for a broadcast. The object pool of the Object FIFO is not a single structure but rather composed of several pools of objects that are allocated in the memory module of each tile involved in the data movement. Specifying the `depth` as an array of integers allows the user full control to set the sizes of the pools on each individual tile. +> **NOTE:** This functionality of the Object FIFO primitive exposes what is actually going on at the hardware level when the data movement is established for a broadcast. The object pool of the Object FIFO is not a single structure but rather composed of several pools of objects that are allocated in the memory module of each tile involved in the data movement. Specifying the `depth` as an array of integers allows the user full control to set the sizes of the pools on each individual tile. 
Please see [Section 2a](../../section-2a/README.md/#specifying-the-object-fifo-depth-as-an-array) for more details. -The main advantage of this feature comes to light during a situation like the one showcased in the example below, which we refer to as a broadcast with a skip-connection. In the example below two Object FIFOs are created: `of0` is a broadcast from producer tile A to consumer tiles B and C, while `of1` is a 1-to-1 data movement from producer tile B to consumer tile C. We refer to `of1` as a skip-connection because it is a dependency between the two consumer tiles of the same broadcast connection. +The main advantage of this feature comes to light during a situation like the one showcased in the example below, which we refer to as a broadcast with a skip-connection. In the example below two Object FIFOs are created: `of0` is a broadcast from producer tile A to consumer tiles B and C, while `of1` is a 1-to-1 data movement from producer tile B to consumer tile C. We refer to `of0` as a skip-connection because it skips over B in the A → B → C chain when connecting A → C. ```python A = tile(1, 3) B = tile(2, 3) @@ -62,7 +62,7 @@ def core_body(): ``` We can see that C requires one object from both `of0` and `of1` before it can proceed with its execution. However, B also requires an object from `of0` before it can produce the data for `of1`. Because C is waiting on B, the two tiles do not have the same rate of consumption from the broadcast connection and this results in the production rate of A being impacted. -To further represent this we can take the slightly lower lever view that the consumer tiles each have a pool of objects allocated for their Object FIFOs. To simplify things, only the pools used by the consumers are shown (for example, for `of1` only the pool on the side of consumer tile C is visible). Currently, all the pools have a depth of `1`. +To further represent this we can take the slightly lower-level view that the consumer tiles each have a pool of objects allocated for their Object FIFOs. To simplify things, only the pools used by the consumers are shown (for example, for `of1` only the pool on the side of consumer tile C is visible). Currently, all the pools have a depth of `1`. To avoid having the production of A impacted by the skip-connection, an additional object is required by C for `of0`. It can be used as buffering space for data coming from `of0` while waiting for the data from B via `of1`. To achieve this `of0` is created with an array of integers for its `depth`: diff --git a/programming_guide/section-2/section-2b/03_Link_Distribute_Join/README.md b/programming_guide/section-2/section-2b/03_Link_Distribute_Join/README.md index a937f4c17b..d550f214f1 100644 --- a/programming_guide/section-2/section-2b/03_Link_Distribute_Join/README.md +++ b/programming_guide/section-2/section-2b/03_Link_Distribute_Join/README.md @@ -12,7 +12,7 @@ ### Object FIFO Link -By design an Object FIFO handles both the configuration of the data movement between the producer and consumer tiles as well as the allocation of objects over the memory modules of the tiles. In order to put data consumed from one Object FIFO into another Object FIFO the user could explicitly do this in the core code of a shared tile between the two FIFOs. 
However, if the goal is to simply copy data from one Object FIFO to the other without modifying it, doing it in the manner described above results in allocating more objects than necessary, i.e., the data being copied to the second Object FIFO is already available in the first one. Additionally, Shim tiles and Mem tiles do not have a core on which the copy can be done explicitly. +By design, an Object FIFO handles both the configuration of the data movement between the producer and consumer tiles, as well as the allocation of objects over the memory modules of the tiles. In order to put data consumed from one Object FIFO into another Object FIFO, the user could explicitly do this in the core code of a shared tile between the two FIFOs. However, if the goal is to simply copy data from one Object FIFO to the other without modifying it, doing it in the manner described above results in allocating more objects than necessary, i.e., the data being copied to the second Object FIFO is already available in the first one. Additionally, Shim tiles and Mem tiles do not have a core on which the copy can be done explicitly. Instead of an explicit copy, the Object FIFO API provides an implicit copy via an `object_fifo_link`, which can be initialized using its class constructor (defined in [aie.py](../../../../python/dialects/aie.py)): ```python @@ -23,7 +23,7 @@ class object_fifo_link(ObjectFifoLinkOp): fifoOuts, ) ``` -A link allows the user to specify a set of input Object FIFOs via the `fifoIns` input and a set of output ones via the `fifoOuts` input. Each Object FIFO may be specified either using its `name` or its variable. Both inputs can be either a single Object FIFO or an array of them. It is required that there exists at least one shared tile between the consumer tiles of `fifoIns` and the producer tiles of `fifoOuts` for a link to be valid. This is because the implicit copy of data will be done using the Data Movement Accelerators (DMAs) of that tile. +A link allows the user to specify a set of input Object FIFOs via the `fifoIns` input and a set of output ones via the `fifoOuts` input. Each Object FIFO may be specified either using its `name` or its Python object. Both inputs can be either a single Object FIFO or an array of them. It is required that there exists at least one shared tile between the consumer tiles of `fifoIns` and the producer tiles of `fifoOuts` for a link to be valid. This is because the implicit copy of data will be done using the Data Movement Accelerators (DMAs) of that tile. Below is an example of a link created between two FIFOs `of0` and `of1`, where tile B is the shared tile between them: ```python @@ -41,7 +41,7 @@ A full design example that uses this features is available in Section 2e: [03_ex ### Link & Distribute -By using the link with one input Object FIFO and multiple output Object FIFOs a user can describe a distribute pattern where parts of data in every object from the producer tile are distributed to each output FIFO. The `datatype` of the output FIFOs should be of a smaller size than the input one, and the sum of the sizes of the output FIFOs should equal to the size of the `datatype` of the input FIFO. +By using the link with one input Object FIFO and multiple output Object FIFOs, a user can describe a distribute pattern where parts of data in every object from the producer tile are distributed to each output FIFO. 
The `datatype` of the output FIFOs should be of a smaller size than the input one, and the sum of the sizes of the output FIFOs should equal the size of the `datatype` of the input FIFO. Currently, the Object FIFO lowering uses the order in which the output FIFOs are specified in the `fifoOuts` to know which part of the input object should go to each output FIFO. To achieve the distribute, the lowering will use one output port of the shared tile to establish a connection per output FIFO, as in the figure below: @@ -59,11 +59,11 @@ of2 = object_fifo("objfifo2", B, D, 2, T.memref(128, T.i32())) object_fifo_link(of0, [of1, of2]) ``` -A full design example that uses this features is available in Section 2e: [04_distribute_L2](../../section-2e/04_distribute_L2/). +A full design example that uses this feature is available in Section 2e: [04_distribute_L2](../../section-2e/04_distribute_L2/). ### Link & Join -The join pattern is the opposite of the distribute pattern in that the link will have multiple input Object FIFOs and a single output Object FIFO. With this pattern the user can combine the smaller inputs from multiple sources into a single bigger output data movement. The `datatype` of the input FIFOs should be of a smaller size than the output one, and the sum of the sizes of the input FIFOs should equal to the size of the `datatype` of the output FIFO. +The join pattern is the opposite of the distribute pattern in that the link will have multiple input Object FIFOs and a single output Object FIFO. With this pattern the user can combine the smaller inputs from multiple sources into a single bigger output data movement. The `datatype` of the input FIFOs should be of a smaller size than the output one, and the sum of the sizes of the input FIFOs should be equal to the size of the `datatype` of the output FIFO. Similarly, the order in `fifoIns` specifies which input object will make up which part of the larger objects of the output Object FIFO. To achieve the join, the lowering will use one input port of the shared tile to establish a connection per input FIFO, as in the figure below: @@ -81,7 +81,7 @@ of2 = object_fifo("objfifo2", D, B, 2, T.memref(128, T.i32())) object_fifo_link([of1, of2], of0) ``` -A full design example that uses this features is available in Section 2e: [05_join_L2](../../section-2e/05_join_L2/). +A full design example that uses these features is available in Section 2e: [05_join_L2](../../section-2e/05_join_L2/). ----- [[Prev](../02_Broadcast/)] [[Up](..)] [[Next - Section 2c](../../section-2c/)] diff --git a/programming_guide/section-2/section-2b/README.md b/programming_guide/section-2/section-2b/README.md index eab39a310c..ac883aef17 100644 --- a/programming_guide/section-2/section-2b/README.md +++ b/programming_guide/section-2/section-2b/README.md @@ -10,7 +10,7 @@ # Section 2b - Key Object FIFO Patterns -The Object FIFO primitive supports several data movement patterns through its inputs and its member functions. We will now describe each of the currently supported patterns in three subsections and provide links to more in-depth practical code examples that showcase each of them. +The Object FIFO primitive supports several data movement patterns. We will now describe each of the currently supported patterns in three subsections and provide links to more in-depth practical code examples that showcase each of them.
Object FIFO Reuse Pattern @@ -20,7 +20,7 @@ The Object FIFO primitive supports several data movement patterns through its in * Broadcast data from one producer to multiple consumers
-
Object FIFO Distribute & Join Patterns with Object FIFO Link +
Object FIFO Distribute & Join Patterns with Object FIFO Link + * Implicit copy of data from one Object FIFO to another via an Object FIFO Link * Distribute different pieces of the input data to multiple consumers diff --git a/programming_guide/section-2/section-2c/README.md b/programming_guide/section-2/section-2c/README.md index 802cc94000..d1a7ff98ad 100644 --- a/programming_guide/section-2/section-2c/README.md +++ b/programming_guide/section-2/section-2c/README.md @@ -10,7 +10,7 @@ # Section 2c - Data Layout Transformations -While the Object FIFO primitive aims to reduce the complexity tied to data movement configuration on the AI Engine array, it also gives the user control over some advanced features of the underlying architecture. One such feature is the ability to do data layout transformations on the fly using the tile's dedicated hardware: the Data Movement Accelerators (DMAs). This is available on AIE-ML devices. +While the Object FIFO primitive aims to reduce the complexity tied to data movement configuration on the AI Engine array, it also gives the user control over some advanced features of the underlying architecture. One such feature is the ability to do data layout transformations on the fly using the tile's dedicated hardware: the Data Movement Accelerators (DMAs). **This is available on AIE-ML devices.** Tile DMAs interact directly with the memory modules of their tiles and are responsible for pushing and retrieving data to and from the AXI stream interconnect. When data is pushed onto the stream, the user can program the DMA's n-dimensional address generation scheme such that the data's layout when pushed may be different than how it is stored in the tile's local memory. In the same way, a user can also specify in what layout a DMA should store the data retrieved from the AXI stream. @@ -35,11 +35,11 @@ A data layout transformation is presented as a tuple of pairs, where each pair r ```c [<stride_2, size_2>, <stride_1, size_1>, <stride_0, size_0>] ``` -Transformations can be expressed in up to three dimensions on each compute and Shim tile, and in up to four dimensions on Mem tiles. The first pair of this array gives the outer-most dimension's stride and size `<stride_2, size_2>`, while the last pair of the array gives the inner-most dimension's stride and size `<stride_0, size_0>`. All strides are expressed in multiples of the element width. +Transformations can be expressed in up to three dimensions on each compute and Shim tile, and in up to four dimensions on Mem tiles. The first pair of this array gives the outer-most dimension's stride and size `<stride_2, size_2>`, while the last pair of the array gives the inner-most dimension's stride and size `<stride_0, size_0>`. All strides are expressed in **multiples of the element width**. > **NOTE:** Only for 4B data types the inner-most dimension's stride must be 1 by design. -Data layout transformations can be viewed as a way to specify to the hardware which location in the data to access next and as such it is possible to model the access pattern using a series of nested loops. For example, the transformation above can be expressed as: +Data layout transformations can be viewed as a way to specify to the hardware which location in the data to access next and as such it is possible to model the access pattern using a series of nested loops.
For example, the transformation using the strides and sizes from above can be expressed as: ```c int *buffer; for(int i = 0; i < size_2; i++) @@ -69,7 +69,7 @@ for(int i = 0; i < 8; i++) # size_2 ### Data Layout Transformations with the Object FIFO -Reminder that the Object FIFO class constructor has two default valued inputs: `dimensionsToStream` and `dimensionsFromStreamPerConsumer`. +Remember that the Object FIFO class constructor has two default-valued inputs: `dimensionsToStream` and `dimensionsFromStreamPerConsumer`. ```python class object_fifo: def __init__( @@ -84,9 +84,9 @@ class object_fifo: ) ``` -The Object FIFO directly lowers to `AIE_DMABDOp` operations described above that can leverage data layout transformations expressed as pairs of strides and sizes. It uses the `dimensionsToStream` input in relation to the `producerTile` to describe in what layout that tile's DMA should push the objects onto the stream. Similarly, the `dimensionsFromStreamPerConsumer` input describes to the DMA's of each individual tile in the `consumerTiles` in what layout to retrieve the objects from the stream. +Our compiler directly lowers Object FIFOs that make use of the aforementioned data layout transformations to `AIE_DMABDOp`. You can use the `dimensionsToStream` input to describe in which order the `producerTile`'s DMA should push the objects onto the stream. Similarly, the `dimensionsFromStreamPerConsumer` input describes to the DMAs of each individual tile in the `consumerTiles` in what layout to retrieve the objects from the stream. -As an example, the Object FIFO in the code below contains objects with datatype `<4x8xi8>`. Using the `dimensionsToStream` input it performs a data layout transformation on the producer tile side that, for every row out of two, selects one element out of two up to three elements. +As an example, the Object FIFO in the code below contains objects with datatype `<4x8xi8>`. Using the `dimensionsToStream` input it performs a data layout transformation on the producer tile side that pushes elements from memory onto the stream as follows: For every even length-8 row, select the first three even-indexed elements. ```python A = tile(1, 1) B = tile(1, 3) @@ -113,7 +113,7 @@ for(int i = 0; i < 2; i++) # size_1 + j * 2 # stride_0 ) ``` -and further represented as in the image below, +and further represented as in the image below: diff --git a/programming_guide/section-2/section-2d/README.md b/programming_guide/section-2/section-2d/README.md index e9234177f5..d9008649c2 100644 --- a/programming_guide/section-2/section-2d/README.md +++ b/programming_guide/section-2/section-2d/README.md @@ -18,7 +18,7 @@ ShimTile = tile(0, 0) MemTile = tile(0, 1) ComputeTile = tile(0, 2) ``` -For our scale out design we will keep a using a single Shim tile and a single Mem tile but we will increase the number of compute tiles to three. We can do so cleanly and efficiently in the following way: +For our scale out design we will keep using a single Shim tile and a single Mem tile, but we will increase the number of compute tiles to three. We can do so cleanly and efficiently in the following way: ```python n_cores = 3 @@ -28,7 +28,7 @@ ComputeTiles = [tile(0, 2 + i) for i in range(n_cores)] ``` Each compute tile can now be accessed by indexing into the `ComputeTiles` array. -Once the tiles have been declared the next step is to setup the data movement using Object FIFOs. The simple design has a total of four double-buffered Object FIFOs and two object_fifo_links. 
The Object FIFOs move objects of datatype `<48xi32>`. `of_in` brings data from the Shim tile to the Mem tile and is linked to `of_in0` which brings data from the Mem tile to the compute tile. For the output side, `of_out0` brings data from the compute tile to the Mem tile where it is linked to `of_out` to bring the data out through the Shim tile. The corresponding code is shown below: +Once the tiles have been declared, the next step is to set up the data movement using Object FIFOs. The simple design has a total of four double-buffered Object FIFOs and two `object_fifo_links`. The Object FIFOs move objects of datatype `<48xi32>`. `of_in` brings data from the Shim tile to the Mem tile and is linked to `of_in0` which brings data from the Mem tile to the compute tile. For the output side, `of_out0` brings data from the compute tile to the Mem tile where it is linked to `of_out` to bring the data out through the Shim tile. The corresponding code is shown below: ```python data_size = 48 buffer_depth = 2 @@ -106,7 +106,7 @@ def core_body(): of_out0.release(ObjectFifoPort.Produce, 1) yield_([]) ``` -Once again we apply the same logic and use a for loop over our three cores to write the code which will be executed on the three compute tiles. Each tile will index the `inX_fifos` and `outX_fifos` maps to retrieve the Object FIFOs it will acquire and release from. This process results in the following code: +Once again we apply the same logic and use a `for`-loop over our three cores to write the code which will be executed on the three compute tiles. Each tile will index the `inX_fifos` and `outX_fifos` maps to retrieve the Object FIFOs it will acquire and release from. This process results in the following code: ```python for i in range(n_cores): # Compute tile i diff --git a/programming_guide/section-2/section-2e/03_external_mem_to_core_L2/README.md b/programming_guide/section-2/section-2e/03_external_mem_to_core_L2/README.md index 335681a30a..31560688cb 100644 --- a/programming_guide/section-2/section-2e/03_external_mem_to_core_L2/README.md +++ b/programming_guide/section-2/section-2e/03_external_mem_to_core_L2/README.md @@ -10,7 +10,7 @@ # External Memory to Core through L2 -The design in [ext_to_coreL2.py](./ext_to_core.py) is very similar to the one in the previous [example](../02_external_mem_to_core/) with the difference being that in this design we first bring the `24xi32` data from external memory to the `MemTile` with `of_in0`. We then use `of_in1` to bring smaller `8xi32` slices of the data from the `MemTile` to `ComputeTile2`. Two fifos then bring the data first to the `MemTile` via `of_out1` as `8xi32` tensors, then to the `ShimTile` via `of_out0` as `24xi32` ones. All fifos use double buffers. +The design in [ext_to_coreL2.py](./ext_to_core.py) is very similar to the one in the previous [example](../02_external_mem_to_core/) with the difference being that in this design we first bring the `24xi32` data from external memory to the `MemTile` with `of_in0`. We then use `of_in1` to bring smaller `8xi32` slices of the data from the `MemTile` to `ComputeTile2`. Two FIFOs then bring the data first to the `MemTile` via `of_out1` as `8xi32` tensors, then to the `ShimTile` via `of_out0` as `24xi32` ones. All FIFOs use double buffers. 
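A rough sketch of how the FIFOs and link described above might be declared is shown below (the tile coordinates, the FIFO name strings, and the depth of `2` used for double buffering are illustrative assumptions; the actual design is in the linked file):
```python
ShimTile = tile(0, 0)
MemTile = tile(0, 1)
ComputeTile2 = tile(0, 2)

# Input side: external memory -> MemTile as 24xi32, then MemTile -> compute tile as 8xi32 slices
of_in0 = object_fifo("in0", ShimTile, MemTile, 2, T.memref(24, T.i32()))
of_in1 = object_fifo("in1", MemTile, ComputeTile2, 2, T.memref(8, T.i32()))
object_fifo_link(of_in0, of_in1)

# Output side: compute tile -> MemTile as 8xi32, then MemTile -> external memory as 24xi32
of_out1 = object_fifo("out1", ComputeTile2, MemTile, 2, T.memref(8, T.i32()))
of_out0 = object_fifo("out0", MemTile, ShimTile, 2, T.memref(24, T.i32()))
object_fifo_link(of_out1, of_out0)
```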
diff --git a/programming_guide/section-2/section-2e/04_distribute_L2/README.md b/programming_guide/section-2/section-2e/04_distribute_L2/README.md index 4f20703842..5c2e93c276 100644 --- a/programming_guide/section-2/section-2e/04_distribute_L2/README.md +++ b/programming_guide/section-2/section-2e/04_distribute_L2/README.md @@ -10,7 +10,7 @@ # Distribute from L2 -The design in [distribute_L2.py](./distribute_L2.py) uses an Object FIFO `of_in` to bring data from external memory via the `ShimTile` to the `MemTile` as `24xi32` tensors. From there three Object FIFOs distribute smaller `8xi32` parts of the data to each of the three compute tiles. Each tile receives a different part of the larger data based on the order of the Object FIFOs in the `object_fifo_link`. +The design in [distribute_L2.py](./distribute_L2.py) uses an Object FIFO `of_in` to bring data from external memory via the `ShimTile` to the `MemTile` as `24xi32` tensors. From there, three Object FIFOs distribute smaller `8xi32` parts of the data to each of the three compute tiles. Each tile receives a different part of the larger data based on the order of the Object FIFOs in the `object_fifo_link`. @@ -24,7 +24,7 @@ The design in [distribute_L2.py](./distribute_L2.py) uses an Object FIFO `of_in` object_fifo_link(of_in, [of_in0, of_in1, of_in2]) ``` -All compute tiles are running the same process of acquring one object from their respective input Object FIFOs to consume, add `1` to all of its entries, and release the object. The [join design](../05_join_L2/) shows how the data is sent back out to external memory and tested. +All compute tiles are running the same process of acquiring one object from their respective input Object FIFOs to consume, adding `1` to all of its entries, and releasing the object. The [join design](../05_join_L2/) shows how the data is sent back out to external memory and tested. Other examples containing this data movement pattern are available in the [programming_examples/matrix_multiplication/](../../../../programming_examples/basic/matrix_multiplication/). diff --git a/programming_guide/section-2/section-2e/05_join_L2/README.md b/programming_guide/section-2/section-2e/05_join_L2/README.md index cee2f75708..64f1ef902a 100644 --- a/programming_guide/section-2/section-2e/05_join_L2/README.md +++ b/programming_guide/section-2/section-2e/05_join_L2/README.md @@ -24,7 +24,7 @@ The design in [join_L2.py](./join_L2.py) uses three Object FIFOs from each of th object_fifo_link([of_out0, of_out1, of_out2], of_out) ``` -All compute tiles are running the same process of acquring one object from their respective input Object FIFOs to produce, write `1` to all of its entries, and release the object. +All compute tiles are running the same process of acquiring one object from their respective input Object FIFOs to produce, writing `1` to all of its entries, and releasing the object. This design is combined with the previous [distribute](../04_distribute_L2/distribute_L2.py) design to achieve a full data movement from external memory to the AIE array and back. The resulting code is available in [distribute_and_join_L2.py](./distribute_and_join_L2.py).
It is possible to build, run and test it with the following commands: ``` diff --git a/programming_guide/section-2/section-2f/README.md b/programming_guide/section-2/section-2f/README.md index 88f9f28d13..9f06315a1a 100644 --- a/programming_guide/section-2/section-2f/README.md +++ b/programming_guide/section-2/section-2f/README.md @@ -10,16 +10,16 @@ # Section 2f - Data Movement Without Object FIFOs -Not all data movement patterns can be described with Object FIFOs. This section goes into detail about how a user can express data movement using the Data Movement Accelerators (or `DMA`) on AIE tiles. +Not all data movement patterns can be described with Object FIFOs. This **advanced** section goes into detail about how a user can express data movement using the Data Movement Accelerators (or `DMA`) on AIE tiles. To better understand the code and concepts introduced in this section it is recommended to first read the [Advanced Topic of Section - 2a on DMAs](../section-2a/README.md/#advanced-topic--data-movement-accelerators). -The AIE architecture currently has three different types of tiles: compute tiles referred to as `tile`, memory tiles reffered to as `Mem tile`, and external memory interface tiles referred to as `Shim tile`. Each of these tiles has its own attributes regarding compute capabilities and memory capacity, but the base design of their DMAs is the same. The different types of DMAs can be intialized using the constructors in [aie.py](../../../python/dialects/aie.py): +The AIE architecture currently has three different types of tiles: compute tiles, referred to as "tile", memory tiles referred to as "Mem tiles", and external memory interface tiles referred to as "Shim tiles". Each of these tiles has its own attributes regarding compute capabilities and memory capacity, but the base design of their DMAs is the same. The different types of DMAs can be initialized using the constructors in [aie.py](../../../python/dialects/aie.py): ```python @mem(tile) # compute tile DMA @shim_dma(tile) # Shim tile DMA @memtile_dma(tile) # Mem tile DMA ``` -The DMA hardware component has a certain number of input and output `channels`, and each one has a direction and a port index. Input channels are denoted with the keyword `SS2M` and output ones with `M2SS`. Port indices vary per tile, for example compute tiles have two input and two output ports, same as Shim tiles, whereas Mem tiles have six input and six output ports. +The DMA hardware component has a certain number of input and output `channels`, and each one has a direction and a port index. Input channels are denoted with the keyword `S2MM` and output ones with `MM2S`. Port indices vary per tile. For example, compute and Shim tiles have two input and two output ports, whereas Mem tiles have six input and six output ports. A channel in any tile's DMA can be initialized using the unified `dma` constructor: ```python @@ -36,7 +36,7 @@ def dma( ) ``` -The data movement on each channel is described by a chain of Buffer Descriptors (or `BD`), where each BD describes what data is being moved and configures its synchornization mechanism. The `dma` constructor already creates space for one such BD as can be seen by its `num_blocks=1` default valued input. +The data movement on each channel is described by a chain of Buffer Descriptors (or "BDs"), where each BD describes what data is being moved and configures its synchronization mechanism.
The `dma` constructor already creates space for one such BD as can be seen by its `num_blocks=1` default valued input. The code snippet below shows how to configure the DMA on `tile_a` such that data coming in on input channel 0 is written into `buff_in`: ```python @@ -54,9 +54,9 @@ def mem_body(): dma_bd(buff_in) use_lock(cons_lock, Release) ``` -The locks `prod_lock` and `cons_lock` follow AIE2 architecture semantics. Their task is to mark synchronization points in the tile's and its DMA's execution: for example, if the tile is currently using `buff_in` it will only release the `prod_lock` when it is done and that is when the DMA will be allowed to overwrite the data in `buff_in` with new input. Similarly, the tile's core can query the `cons_lock` to know when the new data is ready to be read (i.e., when the DMA releases the lock so the core can acquire it). +The locks `prod_lock` and `cons_lock` follow AIE-ML architecture semantics. Their task is to mark synchronization points in the tile's and its DMA's execution: for example, if the tile is currently using `buff_in`, it will only release the `prod_lock` when it is done, and that is when the DMA will be allowed to overwrite the data in `buff_in` with new input. Similarly, the tile's core can query the `cons_lock` to know when the new data is ready to be read (i.e., when the DMA releases the lock so the core can acquire it). -In the previous code the channel only had one BD in its chain. To add additional BDs to the chain, users can use the following constructor, which takes as input what would be the previous BD in the chain it should be added to: +In the previous code, the channel only had one BD in its chain. To add additional BDs to the chain, users can use the following constructor, which takes as input what would be the previous BD in the chain it should be added to: ```python @another_bd(dma_bd) ``` @@ -102,11 +102,11 @@ def flow( ) ``` The `flow` is established between channels of two DMAs (other endpoints are available, but they are beyond the scope of this section) and as such it requires: -* their `source` and `dest` tiles, -* their `source_bundle` and `dest_bundle`, which represent the type of endpoints (for our scope, these will be `WireBundle.DMA`), -* and their `source_channel` and `dest_channel`, which represent the index of the channel. +* its `source` and `dest` tiles, +* its `source_bundle` and `dest_bundle`, which represent the type of endpoints (for our scope, these will be `WireBundle.DMA`), +* and its `source_channel` and `dest_channel`, which represent the index of the channel. -For example, to create a flow between tile `tile_a` and tile `tile_b` where `tile_a` is sending data on its output channel 0 to `tile_b`'s input channel 1, the user can write: +For example, to create a flow between tile `tile_a` and tile `tile_b`, where `tile_a` is sending data on its output channel 0 to `tile_b`'s input channel 1, the user can write: ```python aie.flow(tile_a, WireBundle.DMA, 0, tile_b, WireBundle.DMA, 1) ``` diff --git a/programming_guide/section-2/section-2g/README.md b/programming_guide/section-2/section-2g/README.md index 8b9338e22a..db52d0b827 100644 --- a/programming_guide/section-2/section-2g/README.md +++ b/programming_guide/section-2/section-2g/README.md @@ -10,6 +10,10 @@ # Section 2g - Runtime Data Movement +In the preceding sections, we looked at how we can describe data movement between tiles *within* the AIE-array. However, to do anything useful, we need to get data from outside the array, i.e. 
from the "host", into the AIE-array and back. On NPU devices, we can achieve this with the operations described in this section. + +The operations that will be described in this section must be placed in a separate `sequence` function. The arguments to this function describe buffers that will be available on the host side; the body of the function describes how those buffers are moved into the AIE-array. [Section 3](../../../programming_examples/) contains an example. + ### Guide to Managing Runtime Data Movement to/from Host Memory In high-performance computing applications, efficiently managing data movement and synchronization is crucial. This guide provides a comprehensive overview of how to utilize the `npu_dma_memcpy_nd` and `npu_sync` functions to manage data movement at runtime from/to host memory to/from the AIE array (for example in the Ryzen™ AI NPU). @@ -24,12 +28,12 @@ npu_dma_memcpy_nd(metadata, bd_id, mem, offsets=None, sizes=None, strides=None) ``` - **`metadata`**: This string is a reference to the metadata generated by the object FIFO that records a Shim Tile and one of its DMA channels allocated for the host-side memory transfer. In order to associate the memcpy operation with an object FIFO, this metadata string needs to match the object FIFO name string. - **`bd_id`**: Identifier integer for the particular Buffer Descriptor control registers used for this memcpy. A buffer descriptor contains all information needed for a DMA transfer described in the parameters below. -- **`mem`**: Reference to the memory object involved in the transfer found in the argument list of the sequence function call containing the operation. +- **`mem`**: Reference to a host buffer, given as an argument to the sequence function, that this transfer will read from or write to. - **`offsets`** (optional): Start points for data transfer in each dimension. There is a maximum of four offset dimensions. - **`sizes`**: The extent of data to be transferred across each dimension. There is a maximum of four size dimensions. - **`strides`** (optional): Interval steps between data points in each dimension, useful for striding-across and reshaping data. There is a maximum of three stride dimensions that can be expressed because dimension 0 is an implicit stride of 1 4B element. -It is important to note that dimension 0 of the **`sizes`** and all **`strides`** are expressed in a 4B granularity. Higher dimensions of the **`sizes`** are integers to repeat the lower dimensions. The **`offsets`** are expressed in multiples of the **`sizes`**, however the dimension 0 offset is in a 4B granularity. +It is important to note that dimension 0 of the **`sizes`** and all **`strides`** are expressed in a 4B granularity. Higher dimensions of the **`sizes`** are integers to repeat the lower dimensions. The **`offsets`** are expressed in multiples of the **`sizes`**, however the dimension 0 offset is in a 4B granularity. The strides and wraps express data transformations analogously to those described in [Section 2C](../section-2c). **Example Usage**: ```python diff --git a/programming_guide/section-3/README.md b/programming_guide/section-3/README.md index 120c8d42af..76860cb78d 100644 --- a/programming_guide/section-3/README.md +++ b/programming_guide/section-3/README.md @@ -14,7 +14,7 @@ This section creates a first program that will run on the AIE-array. As shown in the figure on the right, we will have to create both binaries for the AIE-array (device) and CPU (host) parts. 
For the AIE-array, a structural description and kernel code is compiled into the AIE-array binaries: an XCLBIN file ("final.xclbin") and an instruction sequence ("inst.txt"). The host code ("test.exe") loads these AIE-array binaries and contains the test functionality. -For the AIE-array structural description we will combine what you learned in [section-1](../section-1) for defining a basic structural design in python with the data movement part from [section-2](../section-2). +For the AIE-array structural description, we will combine what you learned in [section-1](../section-1) for defining a basic structural design in Python with the data movement part from [section-2](../section-2). For the AIE kernel code, we will start with non-vectorized code that will run on the scalar processor part of an AIE. [section-4](../section-4) will introduce how to vectorize a compute kernel to harvest the compute density of the AIE. @@ -29,7 +29,7 @@ This design is also available in the [programming_examples](../../programming_ex -The [aie2.py](../../programming_examples/basic/vector_scalar_mul/aie2.py) AIE-array structural description (see [section-1](../section-1) deploys both a compute core (green) for the multiplication in the operations and a shimDMA (purple) for data movement both input vector a and output vector c residing in external memory. +The [aie2.py](../../programming_examples/basic/vector_scalar_mul/aie2.py) AIE-array structural description (see [section-1](../section-1)) deploys both a compute core (green) for the multiplication and a shimDMA (purple) for data movement of both input vector a and output vector c residing in external memory. ```python # Device declaration - here using aie2 device NPU @@ -41,7 +41,7 @@ def device_body(): ComputeTile2 = tile(0, 2) ``` -We also need to declare that the compute core will run an external function: a kernel written in C++ that will be linked into the design as pre-compiled kernel (more details in the next subsection). With as goal to get our initial design running on the AIE-array, we will run a generic version of the vector scalar multiply run on the scalar processor of the AIE. +We also need to declare that the compute core will run an external function: a kernel written in C++ that will be linked into the design as a pre-compiled kernel (more details in the next subsection). To get our initial design running on the AIE-array, we will run a generic version of the vector scalar multiply on the scalar processor of the AIE. ```python # Type declarations @@ -105,7 +105,7 @@ This access and execute pattern runs on the AIE compute core `ComputeTile2` and ## Kernel Code -We can program the AIE compute core using C++ code and compile it with xchesscc into an kernel object file. In this section, a generic implementation of the vector scalar multiplication that can run on the scalar processor part of the AIE will provide our initial implementation. The `vector_scalar_mul_aie_scalar` function processes one data element at a time, taking advantage of AIE scalar datapath to load, multiply and store data elements. +We can program the AIE compute core using C++ code and compile it with `xchesscc` into a kernel object file. In this section, we will use a generic implementation of the vector scalar multiplication that can run on the scalar processor part of the AIE. The `vector_scalar_mul_aie_scalar` function processes one data element at a time, taking advantage of the AIE scalar datapath to load, multiply and store data elements.
```c void vector_scalar_mul_aie_scalar(int32_t *a_in, int32_t *c_out, @@ -122,9 +122,9 @@ Note that since the scalar factor is communicated through an object, it is provi ## Host Code -The host code is acts as environment setup and testbench for the Vector Scalar Multiplication design example. The code is responsible for loading the compiled XCLBIN file, configuring the AIE module, providing input data, and kick off the execution the AIE design on the NPU. After running, it verifies the memcpy results and optionally outputs trace data. Both a C++ [test.cpp](./test.cpp) and Python [test.py](./test.py) variant of this code are available. +The host code acts as environment setup and testbench for the Vector Scalar Multiplication design example. The code is responsible for loading the compiled XCLBIN file, configuring the AIE module, providing input data, and kicking off the execution of the AIE design on the NPU. After running, it verifies the results and optionally outputs trace data. Both a C++ [test.cpp](./test.cpp) and Python [test.py](./test.py) variant of this code are available. -For convenience a set of test utilities support common elements of command line parsing, the XRT-based environment setup and with testbench functionality: [test_utils.h](../../runtime_lib/test_lib/test_utils.h) or [test.py](../../python/utils/test.py). +For convenience, a set of test utilities support common elements of command line parsing, the XRT-based environment setup, and testbench functionality: [test_utils.h](../../runtime_lib/test_lib/test_utils.h) or [test.py](../../python/utils/test.py). The host code contains following elements: diff --git a/programming_guide/section-4/README.md b/programming_guide/section-4/README.md index 58e1720a28..e4a787afae 100644 --- a/programming_guide/section-4/README.md +++ b/programming_guide/section-4/README.md @@ -16,7 +16,7 @@ It's helpful to first examine perfomance measurement before we delve into vector * [Section 4a - Timers](./section-4a) * [Section 4b - Trace](./section-4b) -* [Section 4c - Kernel vectorization](./section-4c) +* [Section 4c - Kernel vectorization and optimization](./section-4c) ----- [[Prev - Section 3](../section-3/)] [[Top](..)] [[Next - Section 5](../section-5/)] \ No newline at end of file diff --git a/programming_guide/section-4/section-4a/README.md b/programming_guide/section-4/section-4a/README.md index 981acbf7f4..e373fcfc48 100644 --- a/programming_guide/section-4/section-4a/README.md +++ b/programming_guide/section-4/section-4a/README.md @@ -13,7 +13,7 @@ * [Section 4 - Vector Programming & Peformance Measurement](../../section-4) * Section 4a - Timers * [Section 4b - Trace](../section-4b) - * [Section 4c - Kernel Vectorization](../section-4c) + * [Section 4c - Kernel Vectorization and Optimization](../section-4c) ----- diff --git a/programming_guide/section-4/section-4a/aie2.py b/programming_guide/section-4/section-4a/aie2.py index a952c6e7d7..1aa8d9606e 100644 --- a/programming_guide/section-4/section-4a/aie2.py +++ b/programming_guide/section-4/section-4a/aie2.py @@ -72,4 +72,8 @@ def sequence(inTensor, unused, outTensor): # Declares that subsequent code is in mlir-aie context with mlir_mod_ctx() as ctx: my_first_aie_program() # Call design function within the mlir-aie context - print(ctx.module) # Print the python-to-mlir conversion + res = ctx.module.operation.verify() # Verify mlir context + if res == True: + print(ctx.module) # Print the python-to-mlir conversion + else: + print(res) diff --git
a/programming_guide/section-4/section-4a/answers/aie2.py b/programming_guide/section-4/section-4a/answers/aie2.py index 7a588d86c1..2aa1b79e01 100644 --- a/programming_guide/section-4/section-4a/answers/aie2.py +++ b/programming_guide/section-4/section-4a/answers/aie2.py @@ -72,4 +72,8 @@ def sequence(inTensor, unused, outTensor): # Declares that subsequent code is in mlir-aie context with mlir_mod_ctx() as ctx: my_first_aie_program() # Call design function within the mlir-aie context - print(ctx.module) # Print the python-to-mlir conversion + res = ctx.module.operation.verify() # Verify mlir context + if res == True: + print(ctx.module) # Print the python-to-mlir conversion + else: + print(res) diff --git a/programming_guide/section-4/section-4b/Makefile b/programming_guide/section-4/section-4b/Makefile index e8120a379c..df9442283e 100644 --- a/programming_guide/section-4/section-4b/Makefile +++ b/programming_guide/section-4/section-4b/Makefile @@ -12,6 +12,8 @@ all: build/final.xclbin targetname = myFirstProgram +trace_size = 8192 + build/aie.mlir: aie2.py mkdir -p ${@D} python3 $< > $@ @@ -32,18 +34,23 @@ else cp _build/${targetname} $@ endif -run-g: ${targetname}.exe build/final.xclbin build/insts.txt - ${powershell} ./$< -x build/final.xclbin -i build/insts.txt -k MLIR_AIE -t 8192 +run: ${targetname}.exe build/final.xclbin build/insts.txt + ${powershell} ./$< -x build/final.xclbin -i build/insts.txt -k MLIR_AIE run_py: build/final.xclbin build/insts.txt ${powershell} python3 test.py -x build/final.xclbin -i build/insts.txt -k MLIR_AIE -trace: - ../../../programming_examples/utils/parse_eventIR.py --filename trace.txt --mlir build/aie.mlir --colshift 1 > parse_eventIR_vs.json +trace: ${targetname}.exe build/final.xclbin build/insts.txt + ${powershell} ./$< -x build/final.xclbin -i build/insts.txt -k MLIR_AIE -t ${trace_size} + ../../../programming_examples/utils/parse_trace.py --filename trace.txt --mlir build/aie.mlir --colshift 1 > trace_4b.json + +trace_py: build/final.xclbin build/insts.txt + ${powershell} python3 test.py -x build/final.xclbin -i build/insts.txt -k MLIR_AIE -t ${trace_size} + ../../../programming_examples/utils/parse_trace.py --filename trace.txt --mlir build/aie.mlir --colshift 1 > trace_4b.json clean_trace: - rm -rf tmpTrace trace.txt + rm -rf tmpTrace trace.txt trace*json clean: clean_trace rm -rf build _build ${targetname}.exe diff --git a/programming_guide/section-4/section-4b/README.md b/programming_guide/section-4/section-4b/README.md index 71a3b8fdac..e57029ae0c 100644 --- a/programming_guide/section-4/section-4b/README.md +++ b/programming_guide/section-4/section-4b/README.md @@ -13,7 +13,7 @@ * [Section 4 - Vector Programming & Peformance Measurement](../../section-4) * [Section 4a - Timers](../section-4a) * Section 4b - Trace - * [Section 4c - Kernel Vectorization](../section-4c) + * [Section 4c - Kernel Vectorization and Optimization](../section-4c) ----- @@ -102,7 +102,7 @@ packetflow(1) { ## 2. Configure host code to read trace data and write it to a text file -Once the trace units are configured and enabled, we want the host code to read the trace data from DDR and write it out to a text file for post-run processing. +Once the trace units are configured and enabled, we want the host code to read the trace data from DDR and write it out to a text file for post-run processing. 
To give a better sense of how this comes together, this section provides example design source files and a Makefile whose kernel is based on the [Vector Scalar Add example](../../../programming_examples/basic/vector_scalar_add/). ### AIE structural design code ([aie2.py](./aie2.py)) In order to write the DDR data to a text file, we need to decide where we want the DDR data to first be stored and then read from that location, before writing to a text file. This starts inside the [aie2.py](./aie2.py) file where we use the `configure_simple_tracing_aie2` function call to configure the trace units and program the shimDMA to write to one of the 3 inout buffers. There are many ways to configure our structural design to write this data out but one pattern is the following: `inout0` is for input data, `inout1` is for output data, and `inout2` is for output trace data as illustrated below: @@ -124,37 +124,38 @@ As described in [python/utils](../../../python/utils) for `trace.py`, we configu | 1 | inout1 | | 2 | inout2 | -An example of this is in the [Vector Scalar Multiply example](../../../programming_examples/basic/vector_scalar_mul/aie2.py), where it uses the 2nd pattern above (input A, input B, output C + trace). In the vector scalar multiply case, A is used for the input vector and B for the scalar factor. Since we're sharing the trace data with the output buffer on `inout2`, we set `ddr_id=2`. In addition, we set the offset to be the output data buffer size since the trace data is appended after the data (`offset=N_in_bytes`). +An example of this is in the Vector Scalar Multiply example ([aie2.py](../../../programming_examples/basic/vector_scalar_mul/aie2.py)), where it uses the 2nd pattern above (input A, input B, output C + trace). In the vector scalar multiply case, A is used for the input vector and B for the scalar factor. Since we're sharing the trace data with the output buffer on `inout2`, we set `ddr_id=2`. In addition, we set the offset to be the output data buffer size since the trace data is appended after the data (`offset=N_in_bytes`). For our local design ([aie2.py](./aie.py)), we have variation of the 2nd pattern but the second inout buffer is unused (input A, unused, output C + trace). `ddr_id=2` is still used since our output buffer is mapped to `inout2` and our trace data offset is specified as `C_sz_in_bytes`. Once [aie2.py](./aie2.py) is configured to output trace data through one of the 3 inout buffers with matching `ddr_id` config and `offset`, we turn our attention to the host code to read the DDR data and write it to a file. -**NOTE**: In the [Vector Scalar Multiply example](../../../programming_examples/basic/vector_scalar_mul/aie2.py) and associated [Makefile](../../../programming_examples/basic/vector_scalar_mul/Makefile), we provide a Makefile target `run` for standard build and `trace` for trace-enabld build. The trace-enabled build passes the trace buffer size as an argument to [aie2.py](./aie2.py) which conditionally enables the trace `flow` and calls `configure_simple_tracing_aie2` as long as `trace_size` is > 0. +**NOTE**: In our example design ([aie2.py](./aie2.py) and the associated [Makefile](./Makefile)), we provide a Makefile target `run` for a standard build and `trace` for a trace-enabled build. The trace-enabled build passes the trace buffer size as an argument to [aie2.py](./aie2.py) which conditionally enables the trace `flow` and calls `configure_simple_tracing_aie2` as long as `trace_size` is > 0.
This is also true for the [Vector Scalar Multiply example](../../../programming_examples/basic/vector_scalar_mul). ### (2a) C/C++ Host code ([test.cpp](./test.cpp)) The main changes needed for [test.cpp](./test.cpp) is the increase in the output buffer size to account for the trace buffer size, being careful to read only the output buffer portion when verifying correctness of the results. We also need to be sure to pass the correct buffer offset which points to the trace buffer data when calling `write_out_trace`. -You can see in the Vector Scalar Multiply example [test.cpp](../../../programming_examples/basic/vector_scalar_mul/test.cpp) that trace_size is set based on an input argument of `-t $(trace_size)` which is defined and passed in the [Makefile](../../../programming_examples/basic/vector_scalar_mul/Makefile). The `trace` target from the [Makefile](../../../programming_examples/basic/vector_scalar_mul/Makefile) is shown below. +You can see in [test.cpp](./test.cpp) that trace_size is set based on an input argument of `-t $(trace_size)` which is defined and passed in the [Makefile](./Makefile). The `trace` target from the [Makefile](./Makefile) is shown below. ```Makefile -trace: ${targetname}_${data_size}.exe build/final_trace_${data_size}.xclbin build/insts_${data_size}.txt - ${powershell} ./$< -x build/final_trace_${data_size}.xclbin -i build/insts_${data_size}.txt -k MLIR_AIE -t ${trace_size} - ../../utils/parse_eventIR.py --filename trace.txt --mlir build/aie_trace__${data_size}.mlir --colshift 1 > parse_eventIR_vs.json +trace: ${targetname}.exe build/final.xclbin build/insts.txt + ${powershell} ./$< -x build/final.xclbin -i build/insts.txt -k MLIR_AIE -t 8192 + ../../../programming_examples/utils/parse_trace.py --filename trace.txt --mlir build/aie.mlir --colshift 1 > trace_4b.json ``` -Following the invocation of the executable, we call the `parse_eventIR.py` python script which we will cover in more detail in step 3. -Within the Vector Scalar Multiply example [test.cpp](../../../programming_examples/basic/vector_scalar_mul/test.cpp), we redefine OUT_SIZE to be the sum of output buffer size (in bytes) and the trace buffer size. +Following the invocation of the executable, we call the `parse_trace.py` Python script which we will cover in more detail in step 3. +Within [test.cpp](./test.cpp), we redefine OUT_SIZE to be the sum of the output buffer size (in bytes) and the trace buffer size. ```c++ - int OUT_SIZE = OUT_VOLUME * sizeof(DATATYPE) + trace_size; + int OUT_SIZE = INOUT2_SIZE + trace_size; ``` -All subsuquent references to the output buffer size should use `OUT_SIZE`. The exception is when we want to verify the output results which should be bounded by the original output buffer size, in this case `IN_VOLUME`. +All subsequent references to the output buffer size should use `OUT_SIZE`. The exception is when we want to verify the output results, which should be bounded by the original output buffer size, in this case `INOUT2_VOLUME`. Finally, the function to write the trace output to a file as defined in `aie.utils.trace` is `write_out_trace` and we need to pass it the pointer in the output buffer where the trace data begins, the trace buffer size and the trace file name (default is `trace.txt`).
```c++ - test_utils::write_out_trace(((char *)bufOut) + IN_SIZE, trace_size, - vm["trace_file"].as()); + test_utils::write_out_trace( + ((char *)bufInOut2) + INOUT2_SIZE, + trace_size, vm["trace_file"].as()); ``` ### (2b) Python Host code ([test.py](./test.py)) -In the [Makefile](../../../programming_examples/basic/vector_scalar_mul/Makefile), we also have a `trace_py` target which calls the python host code `test.py`. Here in addition to the `-t ${trace_size}`, we also define the `-s ${data_size}` which is the data size (in uint32) for our Vector Scalar Multiply kernel. +In the [Makefile](./Makefile), we also have a `trace_py` target which calls the Python host code `test.py`. Here, in addition to the `-t ${trace_size}`, we also define the `-s ${data_size}` which is the data size (in uint32) for our Vector Scalar Multiply kernel. ```Makefile trace_py: build/final_trace_${data_size}.xclbin build/insts_${data_size}.txt ${powershell} python3 test.py -x build/final_trace_${data_size}.xclbin -i build/insts_${data_size}.txt -k MLIR_AIE -t ${trace_size} -s ${data_size} @@ -177,10 +178,9 @@ Finally, we read `trace buffer` from the entire_buffer starting a the offset of ``` ## 3. Parse text file to generate a waveform json file -Once the packet trace text file is generated (`trace.txt`), we use a python-based trace parser ([parse_eventIR.py](../../../programming_examples/utils/parse_eventIR.py)) to interpret the trace values and generate a waveform json file for visualization (with Perfetto). +Once the packet trace text file is generated (`trace.txt`), we use a Python-based trace parser ([parse_trace.py](../../../programming_examples/utils/parse_trace.py)) to interpret the trace values and generate a waveform json file for visualization (with Perfetto). ```Makefile - ../../utils/parse_eventIR.py --filename trace.txt --mlir build/aie_trace__${data_size}.mlir --colshift 1 > parse_eventIR_vs.json -json + ../../../programming_examples/utils/parse_trace.py --filename trace.txt --mlir build/aie_trace.mlir --colshift 1 > trace_vs.json ``` This leverages the python parse scripts under [programming_examples/utils](../../../programming_examples/utils/). Follow [this link](../../../programming_examples/utils/) to get more details about how to use the python parse scripts and how they are coded. @@ -194,7 +194,14 @@ Open https://ui.perfetto.dev in your browser and then open up the waveform json * Check matching packet IDs for packet-routed flows. The packet flow ID must match the configured ID value in Trace Control 1 register or else the packets don't get routed. ## Exercises -1. Ask questions about routing congestion for circuit switch and packet switch routes? +1. Let's give tracing a try. In this directory, we've been examining a design based on the `Vector Scalar Add` example. Run `make trace` to compile the design and generate a trace file, then run the `parse_trace.py` script on it to generate the `trace_4b.json` waveform file. Open this in https://ui.perfetto.dev. Zoom into the region of interest with W and S to zoom in and out respectively, and A and D to pan left and right. You should see a waveform like the following: + + + + Based on this waveform, you can mouse over each chunk of contiguous data for `PortRunning0` (input DMA port) and `PortRunning1` (output DMA port). What is the chunk size? How many input and output chunks are there? This should match the iteration loop bounds in our example design. + +1. 
**TODO** Additional questions about routing congestion for circuit switch and packet switch routes for trace packets? + ----- [[Prev]](../section-4a) [[Up]](../../section-4) [[Next]](../section-4c) diff --git a/programming_guide/section-4/section-4b/aie2.py b/programming_guide/section-4/section-4b/aie2.py index 15e58e0343..16d9ea203e 100644 --- a/programming_guide/section-4/section-4b/aie2.py +++ b/programming_guide/section-4/section-4b/aie2.py @@ -75,14 +75,9 @@ def sequence(inTensor, notUsed, outTensor): trace_utils.configure_simple_tracing_aie2( ComputeTile, ShimTile, - channel=1, - bd_id=13, ddr_id=2, size=trace_size, offset=C_sz_in_bytes, - start=0x1, - stop=0x0, - events=[0x4B, 0x22, 0x21, 0x25, 0x2D, 0x2C, 0x1A, 0x4F], ) ipu_dma_memcpy_nd( @@ -97,4 +92,8 @@ def sequence(inTensor, notUsed, outTensor): # Declares that subsequent code is in mlir-aie context with mlir_mod_ctx() as ctx: my_first_aie_program() # Call design function within the mlir-aie context - print(ctx.module) # Print the python-to-mlir conversion + res = ctx.module.operation.verify() # Verify mlir context + if res == True: + print(ctx.module) # Print the python-to-mlir conversion + else: + print(res) diff --git a/programming_guide/section-4/section-4b/test.cpp b/programming_guide/section-4/section-4b/test.cpp index 004b243134..6f775e5b54 100644 --- a/programming_guide/section-4/section-4b/test.cpp +++ b/programming_guide/section-4/section-4b/test.cpp @@ -215,10 +215,8 @@ int main(int argc, const char *argv[]) { // Write trace values if trace_size > 0 if (trace_size > 0) { - test_utils::write_out_trace( - ((char *)bufInOut2) + INOUT2_SIZE, - // test_utils::write_out_trace(((char *)bufTrace), - trace_size, vm["trace_file"].as()); + test_utils::write_out_trace(((char *)bufInOut2) + INOUT2_SIZE, trace_size, + vm["trace_file"].as()); } // Accumulate run times diff --git a/programming_guide/section-4/section-4b/test.py b/programming_guide/section-4/section-4b/test.py index b6c0d99c02..a36dc5d5a7 100644 --- a/programming_guide/section-4/section-4b/test.py +++ b/programming_guide/section-4/section-4b/test.py @@ -10,6 +10,7 @@ import time import aie.utils.test as test_utils +import aie.utils.trace as trace_utils # ------------------------------------------------------ # Configure this to match your design's buffer size @@ -35,12 +36,12 @@ def main(opts): instr_text = [l for l in instr_text if l != ""] instr_v = np.array([int(i, 16) for i in instr_text], dtype=np.uint32) - OUT_SIZE = INOUT2_SIZE + opts.trace_size + OUT_SIZE = INOUT2_SIZE + int(opts.trace_size) # ------------------------------------------------------ # Get device, load the xclbin & kernel and register them # ------------------------------------------------------ - (device, kernel) = init_xrt_load_kernel(opts) + (device, kernel) = test_utils.init_xrt_load_kernel(opts) # ------------------------------------------------------ # Initialize input/ output buffer sizes and sync them @@ -95,23 +96,20 @@ def main(opts): continue # Copy output results and verify they are correct - out_size = INOUT2_SIZE + opts.trace_size - print("out_size:", out_size) - output_buffer = bo_inout2.read(out_size, 0).view(INOUT2_DATATYPE) - dout_buffer = output_buffer[0 : INOUT2_VOLUME - 1] - trace_buffer = output_buffer[INOUT2_VOLUME - 1 :] + entire_buffer = bo_inout2.read(OUT_SIZE, 0).view(np.uint32) + output_buffer = entire_buffer[:INOUT2_VOLUME] if opts.verify: if opts.verbosity >= 1: print("Verifying results ...") ref = np.arange(2, INOUT0_VOLUME + 2, dtype=INOUT0_DATATYPE) - # e = 
np.equal(output_buffer, ref) - e = np.equal(dput_buffer, ref) + e = np.equal(output_buffer, ref) + # e = np.equal(dput_buffer, ref) errors = errors + np.size(e) - np.count_nonzero(e) # Write trace values if trace_size > 0 - # if opts.trace_size > 0: - # print("Do something with trace!") - # test_utils.write_out_trace(trace_buffer, opts.trace_size, opts.trace_file) + if opts.trace_size > 0: + trace_buffer = entire_buffer[INOUT2_VOLUME:] + trace_utils.write_out_trace(trace_buffer, str(opts.trace_file)) npu_time = stop - start npu_time_total = npu_time_total + npu_time diff --git a/programming_guide/section-4/section-4c/README.md b/programming_guide/section-4/section-4c/README.md index 79a96a564b..6d863fe7bb 100644 --- a/programming_guide/section-4/section-4c/README.md +++ b/programming_guide/section-4/section-4c/README.md @@ -8,16 +8,18 @@ // //===----------------------------------------------------------------------===//--> -# Section 4c - Kernel Vectorization +# Section 4c - Kernel Vectorization and Optimization * [Section 4 - Vector Programming & Peformance Measurement](../../section-4) * [Section 4a - Timers](../section-4a) * [Section 4b - Trace](../section-4b) - * Section 4c - Kernel Vectorization + * Section 4c - Kernel Vectorization and Optimization ----- -Now that we are able to measure the total application time ([section-4a](../section-4a/)) and have examined the kernel performance via tracing ([section-4b](../section-4b)), we will take a closer look at kernel vectorization. We will be using the [vector-scalar multiply example](../../../programming_examples/basic/vector_scalar_mul/) again to illustrate kernel vectorization concepts. Go ahead and read the design example summary for [vector-scalar multiply](../../../programming_examples/basic/vector_scalar_mul/) first to get an idea of the different components of this example design. Then, let's take a closer look at the kernel source file ([scale.cc](../../../aie_kernels/aie2/scale.cc)). +Now that we are able to measure the total application time ([section-4a](../section-4a/)) and have examined the kernel performance via tracing ([section-4b](../section-4b)), we will take a closer look at kernel vectorization. We will be using the [vector-scalar multiply example](../../../programming_examples/basic/vector_scalar_mul/) to illustrate kernel vectorization concepts. + +Go ahead and read the design example summary for [vector-scalar multiply](../../../programming_examples/basic/vector_scalar_mul/) first to get an idea of the different components of this example design. Then, let's take a closer look at the kernel source file ([scale.cc](../../../aie_kernels/aie2/scale.cc)). In [scale.cc](../../../aie_kernels/aie2/scale.cc), we see that the scalar code is relatively straight forward: ```C++ @@ -33,6 +35,7 @@ void scale_scalar(T *a, T *c, T factor, const int32_t N) { Here, the code iterates over the input vector (`a`) and multiplies each element from the vector with a scalar value (`factor`) before storing the results in output vector (`c`). The simple C/C++ code for this consists of a for-loop, with a simple read and scalar multiply operation inside the loop. +### AIE API To vectorize this, we first need to familiarize ourselves with the AIE API which abstracts the underlying AIE processor and associated low-level intrinsics with an higher level C++ API. Documentation for AIE API (2023.2 Vitis tools) can be found [here](https://www.xilinx.com/htmldocs/xilinx2023_2/aiengine_api/aie_api/doc/modules.html). 
To view details on the vector x scalar mutlipler, on the left pane, navigate to *AI Engine API User Guide -> API Reference -> Arithmetic* and select the first `aie::mul` which shows a `Vec * E` where `E` is an elementary data type like a scalar int. To be able to use this AIE API function in our kernel code, we first need to include the AIE API headers. @@ -40,6 +43,7 @@ #include ``` +#### Vector Registers Then, we declare a vector as follows: ```C++ aie::vector my_vector @@ -47,7 +51,7 @@ aie::vector my_vector * T - data type, such as `int32_t` * vec_factor - vector size, such as 16. -The size of the vector depends on the type. For example, the standard vector register in AIE2 is 512 bits. For `int32_t`, that means we can store 16 of them. Extending this to the other supported data types, we have the following abbreviated table: +The size of the vector depends on the type. For example, the standard vector register in AIE2 is **512 bits**. For `int32_t`, that means we can store 16 of them in one 512b vector register. Extending this to the other supported data types, we have the following abbreviated table: | Data type | Vector size | |-----------|-------------| @@ -58,77 +62,193 @@ The size of the vector depends on the type. For example, the standard vector reg A more complete table of supported vectors can be found in the AIE API User Guide [here](https://www.xilinx.com/htmldocs/xilinx2023_2/aiengine_api/aie_api/doc/group__group__basic__types.html). Note that if the listed data types * vector size ends up being larger than 512-bits, that just means it's stored in 2+ vector registers instead of just one. +#### Vector Load + We can load the vector register from local L1 memory with the `aie::load_v` function, defined as follows: ```C++ T *__restrict pA1 = a; aie::vector A0 = aie::load_v(pA1); ``` -Here, `__restict` is used to qualify the pointer to indicate that it's a restrict pointer and therefore memory access to that pointer can be more optimally arranged by the scheduler. This is because restrict says that sequential access to the pointer will not access the same memory location and can therefore be treated as independent. +Here, we use `__restrict` to qualify the pointer, indicating that it is the only pointer that accesses the underlying object. This eliminates the potential for pointer aliasing and enables better optimization by the compiler. +#### Vector Multiply The vector load has a template argument `vec_factor` to match the one used in the `aie::vector` declaration. +At this point, it would be good to take a closer look at the AIE architecture, since squeezing out further performance requires coding with the hardware in mind (we return to this in the Optimization section below). + Finally, we get to the `aie::mul` call which takes a vector and a scalar as arguments and stores the result in an accumulator register desginated by: ```C++ - aie::accum cout + aie::accum cout ``` -The accumulator data type in this case is 16x 64-bit accumulator. We store the computed results back to local memory using the vector store function `aie::store_v` as shown: +The accumulator data type in this case is a 32x 32-bit accumulator.
We store the computed results back to local memory using the vector store function `aie::store_v` as shown: ```C++ T *__restrict pC1 = c; - aie::store_v(pC1, cout.to_vector(0)): + aie::store_v(pC1, cout.template to_vector(0)); ``` -Here, the accumulator type can be shift-round-saturated back to a vector register with the `.to_vector(0)` call where `T` is the vector register type and the single integer argument `(0)` is the shift amount. +Here, the accumulator type can be shift-round-saturated back to a vector register with the `.template to_vector(0)` call, where `T` is the vector register type and the single integer argument `(0)` is the shift amount. The entire vector block is then: ```C++ template -void scale_vectorized(T *a, T *c, T factor, const int32_t N) { - constexpr int vec_factor = 16; +void scale_vectorized(T *a, T *c, int32_t factor, const int32_t N) { event0(); + constexpr int vec_factor = 32; T *__restrict pA1 = a; T *__restrict pC1 = c; const int F = N / vec_factor; + T fac = factor; for (int i = 0; i < F; i++) - chess_prepare_for_pipelining chess_loop_range(16, ) { + chess_prepare_for_pipelining chess_loop_range(16, ) + { aie::vector A0 = aie::load_v(pA1); pA1 += vec_factor; - aie::accum cout = aie::mul(A0, factor); - aie::store_v(pC1, cout.to_vector(0)); + aie::accum cout = aie::mul(A0, fac); + aie::store_v(pC1, cout.template to_vector(0)); pC1 += vec_factor; - } + } event1(); } ``` In this first example, the vectorization strategy was relatively straight forward. Instead of iterating over a vector of values and doing a single scalar multiply, we load a vector of input values, iterate over a smaller loop to perfrom a vector*scalar operation using the AIE API functions, and then store the vector of results back to local memory. -## Exercises -1. Let's take a look at the trace for our vector scalar design. First, let's edit our [vector_scalar_mul design](../../../programming_examples/basic/vector_scalar_mul/) so that the [aie2.py](../../../programming_examples/basic/vector_scalar_mul/aie2.py) source file has `vectorized=False`. In the soruce code, we simply select the scalar version of the kernel function. Then run `make trace`. After the trace compilation is complete, open `parse_eventIR_vs.json` in https://ui.perfetto.dev and measure the delta between `event 0` and `event 1`. Note that in the Perfetto waveform, 1 ms is equal to 1 clock cycle. How many cycles did you measure? +## Vectorization Exercises +1. Let's take a look at the trace for our vector scalar design. First, let's edit our [vector_scalar_mul design](../../../programming_examples/basic/vector_scalar_mul/) so that the [aie2.py](../../../programming_examples/basic/vector_scalar_mul/aie2.py) source file has `vectorized=False`. In the [aie2.py](../../../programming_examples/basic/vector_scalar_mul/aie2.py) source code, we simply select the scalar version of the kernel function. Then run `make trace`. After the trace compilation is complete, open `trace_vs.json` in https://ui.perfetto.dev and measure the delta between `event 0` and `event 1`. Note that in the Perfetto waveform, 1 us is equal to 1 clock cycle. How many cycles did you measure? + +1. Now let's turn vectorization back on by changing `vectorized=True`. But we're also going to disable a pragma-guided optimization first to see its effect. In [scale.cc](../../../aie_kernels/aie2/scale.cc), comment out the line after the `for` loop that says `chess_prepare_for_pipelining chess_loop_range(16, )`.
Be sure you're editing the general template and not the `int32_t` template specialization. We'll examine that shortly. Then rerun the compilation (`make clean; make trace`). Measure the delta between `event 0` and `event 1` again. What value do you see now? + + That's quite an improvement, a ~20X reduction in compute latency. However, there's more optimization to be had with vector code, and that involves compilation pragmas. + +1. Go back to [scale.cc](../../../aie_kernels/aie2/scale.cc) and uncomment the line with `chess_prepare_for_pipelining chess_loop_range(16, )`. Then rerun the compilation (`make clean; make trace`). Measure the delta between `event 0` and `event 1` again. What value do you see now? + + Now, we're really seeing some savings (another ~6X savings, or ~140X compared to the scalar version). The line we added helps guide the compiler to find optimal schedules. In particular for kernel loops, `chess_prepare_for_pipelining` and `chess_loop_range(16, )` are particularly useful. + * `chess_prepare_for_pipelining` - Used in the innermost loop to tell the compiler to enable software pipelining. This is necessary for subsequent loop optimization pragmas to be useful. + * `chess_loop_range(MIN, MAX)` - An extremely helpful pragma. This tells the compiler the minimum and maximum number of iterations we expect this loop to have. We often parameterize loop bounds based on size, and even if the upper bound is declared as a const, it is still a runtime-computed value. Giving the MIN value is particularly helpful because it tells the scheduler how many iterations we have at minimum, and it can therefore properly schedule the loop instructions. + +## Optimization - Coding for the Architecture + +At this point, we've vectorized our code to better leverage the AIE hardware and seen significant performance gains, but is our design fully optimized? How do we know if we've used the powerful AIE hardware to its full potential? This requires a deeper understanding of the underlying AIE architecture and coding for performance with the hardware in mind. For this next section, we will focus on **AIE2** (aka AIE-ML) that's at the heart of the Ryzen AI NPU. AIE2 is optimized for ML workloads, which means matrix multiplication style compute leverages the hardware best. We will start our exploration by continuing with the vector-scalar multiply example. While it is true that vector-scalar multiply isn't matrix multiply, it does provide a good starting point for understanding what design considerations are needed to code optimal designs. + +### The Vector Unit - Loads + +The first step in optimizing our code even further is to have a picture of the AIE Vector Unit, which can be found in the [AIE-ML architecture manual (am020)](https://docs.amd.com/r/en-US/am020-versal-aie-ml/Fixed-Point-Vector-Unit). Below is a diagram of the vector unit from the manual. + + + +As we can see, vector registers are loaded from 2 parallel Load Units, each capable of loading 256 bits per clock cycle from local L1 memory. We have 12 512-bit vector registers which feed into each Permute block and eventually the Multiplier block. It is important then to always think in terms of 2 256-bit parallel loads per clock cycle. If, for example, you try to load 2048 bits of data per clock in order to do your compute, you will be less efficient as that would require more than 1 cycle. Another important note is that the loads must come from different L1 memory banks or else a bank conflict will occur.
The bank conflict penalty is small but would reduce optimal performance. + +### The Vector Unit - Multiply and Add (MAC) + +Once data is loaded and permuted, it passes to the Multiplier block which supports a wide list of AIE data types. The multiply results then pass through an optional post-add step (very common for matrix multiply) before eventually being stored in the accumulator registers. There are 9x 512-bit accumulator registers. Accumulator registers are larger so data precision can be maintained. A well-optimized piece of code would schedule 1 vector MAC (VMAC) every cycle. + +### The Vector Unit - SRS and Stores + +Once data has been computed (either in 1 cycle or accumulated over a number of cycles), the results can then be written back out to local L1 memory via the Store Unit. This mirrors the 2 Load Units except there is just 1 Store Unit. Bridging between the accumulator registers and vector registers or local L1 memory utilizes the SRS Unit (shift-round-saturate) which shifts, rounds and saturates with a number of configurable rounding and saturation modes. + + + +The SRS path is on the right of the diagram above, with the corollary path, the Upshift (UPS) path, on the left. + +### The Vector Unit - Shift/ Shuffle/ Adder Path + +Finally, we have an additional parallel processing path which performs shift, shuffle, simple addition, comparison and a host of other functions. This path runs in parallel with the main integer vector datapath and may be tasked to do the aforementioned functions without the need of the VMAC datapath if a VMAC is not needed in our code. + + + +It is very helpful to keep this processing datapath in mind, along with the way in which data is loaded and stored to/from local memory. The next step then is to see how close we are to the ideal performance in our application and then examine the results in more detail to better understand where we might be able to improve. + + +### Multiplier Utilization Efficiency + +Now that we have a better understanding of the architecture, let's take a closer look at hardware efficiency. The following diagram shows the various AIE architecture blocks we talked about, along with a table of generalized compute. + + + +**NOTE** - The matrix multiplication mode table is in the AIE API User Guide [here](https://www.xilinx.com/htmldocs/xilinx2023_2/aiengine_api/aie_api/doc/group__group__mmul.html). Another way to see the total number of MACs for different bit precisions is the `Table: Supported Precision Width of the Vector Data Path` in the [AM020 spec](https://docs.amd.com/r/en-US/am020-versal-aie-ml/Functional-Overview). + +This table tells us that for 16-bit x 16-bit compute, we have 64 MACs available per cycle. However, these MACs are targeting Matrix Multiplication (with its accompanying post-addition steps). In practice, we have 32 accumulator lanes available. That means for eltwise operations, we can only use 32 MACs per cycle. + +#### MAC efficiency +Using this information and our Vector Scalar Multiply example, we know that each call to the kernel passes in an array of 1024 16-bit data values. With 32 MACs available, our `vector_factor` is 32 and therefore, we would ideally need 1024 / 32 = 32 cycles to process this amount of data given our 32 MACs-per-clock eltwise vector MAC configuration. Our final optimized cycle count for the kernel was 72 cycles or roughly 2x the ideal number of cycles. + +Total MAC efficiency is a product of the (MAC schedule efficiency) x (per clock MAC utilization efficiency).
+* MAC schedule efficiency - Ideal MAC cycles / Actual MAC cycles (e.g. 32/72 = 44%) +* per clock MAC utilization efficiency - # of MACs used / total # of MACs available (e.g. 32/64 = 50%) +Therefore, the total MAC efficiency is 44% x 50% = 22%. + +Let's file that result away but look at our algorithm from a load/store bandwidth perspective. + +#### Load/ Store Bandwidth efficiency + +To process a vector of 32 int16 values times a scalar, let's ignore the scalar load and focus only on the vector one. 32 int16 values = 512 bits, which would take 2x 256-bit loads, or 2 cycles per MAC. It might be possible to do it in a single cycle if the data is striped across banks perfectly. We also need to store 2x 256-bits which must take 2 cycles since we only have 1 Store Unit. This means that even if we could do a VMAC every cycle, we need 2 cycles to load the inputs and store the outputs. This explains why our optimized vector result was 72 cycles: based on this 2-cycle requirement, the minimum cycle count for our data size is 64 cycles. The remaining 8 cycles are loop preamble, loop postamble and function initialization and cleanup overhead. + +#### Data routing efficiency +So we saw why load/store bandwidth is the bottleneck in our 16-bit Vector Scalar Multiply example for the compute. But what about data movement via streams and DMAs? We need to process 1024 chunks of 16-bit data or 512 32-bit quantities. Because our stream switch moves data at 32-bit granularity, we need 512 cycles to load the data into L1 and to move the data out of L1 to L2/L3. + +#### Hardware efficiency summary + +| Component | # of Cycles | Efficiency | +|-----------|-------------|------------| +| MAC | 72 | 22% | +| Load/Store| 64 | 50% / 100% | +| DMA | 512 | 100% | + +Looking at this table, we quickly see that the data movement is the bottleneck. + +## Optimization Exercises - Part 1 +1. Rerun the final optimized code and take a look at the resulting waveform. + + + + Mouse over the blocks of PortRunning0 and PortRunning1. What is the measured number of cycles per chunk? This matches what we expected to see. But note how obvious it is from the waveform how dominant data movement is compared to compute. + + +**TODO** - Looking at int32 version? matmul? + + +## Diving Deep - Examining the Microcode +Let's take a look again at the results of our [vector_scalar_mul design](../../../programming_examples/basic/vector_scalar_mul/). Let's also go back one step, comment out `chess_prepare_for_pipelining chess_loop_range(16, )`, and rerun the compilation (`make clean; make trace`). + +At this point, we can actually take a look at the `microcode`. The `microcode` is the precise schedule of instructions that our AIE executes in order to run the kernel program. This microcode can usually be found under `build/core_0_2.elf.lst` where the two numbers for the core indicate its column and row position, respectively. So if your design has multiple cores, then each core will have its own .lst file. If you were to open the file, you will see a lot of information. Comment lines will have a . in front of them. The other lines are the instructions and are structured as follows: -1. Now let's turn vectorization back on by changing `vectorized=True` and rerun our build (`make clean; make trace`). Measure the delta between `event 0` and `event 1` again. What value do you see now? 
+Instruction Line Number ---- Encoded Instruction ---- 1 or more slots of ISA commands -## Multiplier Efficiency +| Example ISA commands | Description | +|----------------------|-------------| +| NOP .. | No op | +| JL #XXX | Jump and link to instruction line # | +| MOV r1, r2 | Move register values from r2 to r1 | +| LD .. | Scalar load | +| ST .. | Scalar store | +| VLDA | Vector load unit A | +| VLDB | Vector load unit B | +| VMUL .. | Vector multiply | +| VMAC .. | Vector multiply and accumulate | +| VST .. | Vector store | +| VSRS .. | Vector SRS | +| VSHUFFLE .. | Vector shuffle | -Let's take a closer look at hardware efficiency. In particular, we examine how often we are maximally utilizing all the multipliers in our fixed-point vector datapath. The AI Engine fixed-point vector datapath operates at 100% efficiency when we can schedule a vector MAC every clock cycle. The MAC itself operates at 100% efficiency if all the MAC units are being used. The overall MAC utilization efficiency then is a product of these two percentages. For example, if we have 1x vector MAC every 2 cycles, and we use 50% of our vector MACs each cycle, then 50% * 50% is a total mac efficiency of 25%. +Fully analyzing and understanding this microcode is beyond the scope of this programming guide, but we will focus on key parts of it, labeled by 3 types of comments in particular: -The AIE fixed-point vector datapath is optimized for matrix multiplication, and as a result, has a post-addition block with a smaller number of output lanes. What this means is that element-wise MACs generally run less efficiently since the hardware does not have enough output lanes for all MACs. So for 16-bit x 16-bit, we can do 64x MACs but with 32 output lanes, element-wise multiply for 16-bit x 16-bit has a MAC utilization efficiency of 50%. +`.label vector_scalar_mul_aie` followed by `.function_start` - The start of the function we're interested in. The name after the label is the function name, but this might have additional characters if the function is generated from a template. -If we examine the matrix multiplication mode table in the AIE API User Guide [here](https://www.xilinx.com/htmldocs/xilinx2023_2/aiengine_api/aie_api/doc/group__group__mmul.html), we see that for 16-bit x 16-bit matmul in AIE2 (aka AIE-ML), we support 4x4x4 mode which does 64 MACs each cycle. Another way to see the total number of MACs for different bit precisions is the `Table: Supported Precision Width of the Vector Data Path` in the [AM020 spec](https://docs.amd.com/r/en-US/am020-versal-aie-ml/Functional-Overview). There, we again see 64 MACs for 16-bit x 16-bit. For 32-bit x 32-bit, we have 16 MACs per cycle. So element-wise multiply for 32-bit x 32-bit does in fact have a per cycle MAC utilization efficiency of 100% which is not the case for element-wie multiplies of smaller bit precisions. +`.label ZLS_...` - The start of a zero-overhead loop -Going back to our vector-scalar design, we are processing 4096 samples totak, but only 1024 samples every iteration of our kernel. From the previous exercises, you saw that the scalar implementation takes about ~10,000 cycles to process while the vector implementaion only takes ~600 cycles. This is an speedup factor of over **16X**! That's a signficant gain but we'd like to look more closely to see if we can squeeze out even more performance. +`.label ZLE_...` - The end of a zero-overhead loop.
**NOTE** The line after this label is the last line within the loop, i.e., the loop body is not just the lines strictly between `ZLS` and `ZLE`. In general, labels pertain to the line after the label. -### Crunching the numbers -Looking at the optimal MAC utilziation for 32-bit x 32-bit, we expect 1024 cycles to actually only take 64 cycles (1024/ 16) if it were possible to do a vector MAC every cycle. This seemingly gives our vector implementation a total MAC utilization efficiency of ~11%. In a non-ideal kernel, we have cycle overheads in the way of loop preamble and postamble as well as general function overhead. It is also true that these overheads become a smaller percentage of the total compute time if we are process a larger set of data. But this static overhead does not fully explain the unexpectedly small MAC utilization efficiency we see at first glance. +Let's examine this more closely in our example. -### Data movement +## Optimization Exercises - Part 2 +1. Open `build/core_0_2.elf.lst` and take a look through the file. You'll see a lot of helpful comments, but it may be too many comments to be able to see patterns in the microcode clearly. Run a simple cleanup script from the vector_scalar_mul example directory: -The other consideration when looking at vectorization and performance is data movement. The time to compute must be balanced with the time to move data so that neither becomes the bottleneck. Looking at our example once again, we are moving data via objectFifos through the stream switch. Each stream switch channel moves 32-bits of data every clock cycle. This means to move 1024 32-bit data, it would require 1024 cycles. This makes it seem like our kernel throughput should be 1024 cycles. + `../../utils/clean_microcode.sh build/core_0_2.elf.lst` -But in our example, we are actually ping-ponging our data movement so that we are moving the next set of 1024 words while the first set is being computed on. So we would expect the kernel to be able to compute the data in a fraction of the data movement time (though we are still limited to 1024 cycles throughput). The real reason our compute is larger than 64 cycles is because both sets of data in our objectFifo are in the same local memory bank and thus access conflicts are occuring, which increases our total compute time. Also, the actual inner loop microcode schedule from the compiler is closer to 80% efficient instead of 100%. All these factors then add up to a total compute cycle count that's larger than our back-of-the-envelope ideal number. + This will remove some of the extra comments. Open up the `core_0_2.elf.lst` file again and search for `.label vector_scalar_mul_aie`. Then scroll down until you see the first `.label ZLS ..` line. Count the number of lines until you reach the first `.label ZLE ..` line and add 1 to that total (since the line after ZLE is within the loop). How many lines are in this inner loop? -## Conclusions +1. Now look at each line (including the one after ZLE) and count how many lines contain a `VMUL` or `VMAC`. What number do you get? -Having walked through this example, we can see the importance of matching the algorithm to the hardware in order to achieve maximum utilization efficiency. This generally means matmul style ops gives us the best MAC efficiency, not only because it matches the built-in vector matmul in the fixed-point vector datapath, but also because matmul has higher data re-use which lowers the data movement component so that data movement time and compute time are more closely aligned. +1. 
The number you got gives us a rough idea of how optimized the innermost loop of our algorithm is. In this case, we have 1 VMAC out of 15 cycles or ~6% MAC utilization. If the inner loop takes 15 cycles and we iterate 32 times, how many cycles should this version take and how close are we to the measured cycle count? ----- diff --git a/python/utils/README.md b/python/utils/README.md index 7ca2e20088..7771cc2c84 100644 --- a/python/utils/README.md +++ b/python/utils/README.md @@ -16,10 +16,10 @@ import aie.utils.trace as trace_utils ``` Thereafter, functions defined in the file can be called via `trace_utils.configure_simple_tracing_aie2(...)`. -- [Test utilities](#Test-utilities) ([test.py](./test.py)) -- [Trace utilities](#Trace-utilities-(trace.py)) ([trace.py](./trace.py)) -- [XRT utilities](#XRT-utilities) ([xrt.py](./xrt.py)) -- [Machine Learning (ML) utilities](#Machine-Langauge-(ML)-utilities-(ml.py)) ([ml.py](./ml.py)) +- [Test utilities](#test-utilites-testpy) ([test.py](./test.py)) +- [Trace utilities](#trace-utilites-tracepy) ([trace.py](./trace.py)) +- [XRT utilities](#xrt-utilites-xrtpy) ([xrt.py](./xrt.py)) +- [Machine Learning (ML) utilities](#machine-language-ml-utilites-mlpyss) ([ml.py](./ml.py)) ## Test utilites ([test.py](./test.py)) Test/ Host code utilities. @@ -147,6 +147,7 @@ There is an extensive lists of trace events but here, we will only describe a fe | Lock stall |0x1A| 26 | | Core Port Running 1 |0x4F| 79 | | Core Port Running 0 |0x4B| 75 | +* A more exhaustive list of events for core tile, core memory, memtile and shim tile can be found in [this header file](https://github.com/Xilinx/aie-rt/blob/main-aie/driver/src/events/xaie_events_aie.h). However, not all events are yet supported in `parse_eventIR.py` at this time. **NOTE**: The "Core Instruction - Event 0/1" are special intrinsics you can add to your kernel code to trigger an event during the running of your core program. Within the kernel code, they look like: ```c++