diff --git a/programming_examples/basic/matrix_multiplication/makefile-common b/programming_examples/basic/matrix_multiplication/makefile-common index a57dff389c..c8486817a0 100644 --- a/programming_examples/basic/matrix_multiplication/makefile-common +++ b/programming_examples/basic/matrix_multiplication/makefile-common @@ -100,4 +100,4 @@ clean: clean_trace .PHONY: clean_trace clean_trace: - rm -rf tmpTrace parse*.json + rm -rf tmpTrace parse*.json trace.txt diff --git a/programming_guide/assets/trace_vector_scalar_mul1.png b/programming_guide/assets/trace_vector_scalar_mul1.png new file mode 100755 index 0000000000..0e63467715 Binary files /dev/null and b/programming_guide/assets/trace_vector_scalar_mul1.png differ diff --git a/programming_guide/quick_reference.md b/programming_guide/quick_reference.md index 4e0c5d11b7..fff7e14cbd 100644 --- a/programming_guide/quick_reference.md +++ b/programming_guide/quick_reference.md @@ -49,6 +49,12 @@ | `print(ctx.module)` | Converts our ctx wrapped structural code to mlir and prints to stdout| | `ctx.module.operation.verify()` | Runs additional structural verficiation on the python binded source code and return result to stdout | +## Common AIE API functions for Kernel Programming +| Function Signature | Definition | Parameters | Return Type | Example | +|---------------------|------------|------------|-------------|---------| +| `aie::vector my_vector` | Declare vector type | `T`: data type
`vec_factor`: vector width | n/a | aie::vector my_vector; | +| `aie::load_v(pA1);` | Vector load | `vec_factor`: vector width | `aie::vector` | aie::vector my_vector; | + ## Helpful AI Engine Architecture References and Tables * [AIE2 - Table of supported data types and vector sizes (AIE API)](https://www.xilinx.com/htmldocs/xilinx2023_2/aiengine_api/aie_api/doc/group__group__basic__types.html) diff --git a/programming_guide/section-1/Makefile b/programming_guide/section-1/Makefile index 9a89112879..1a3d65de9a 100644 --- a/programming_guide/section-1/Makefile +++ b/programming_guide/section-1/Makefile @@ -6,7 +6,7 @@ # ##===----------------------------------------------------------------------===## -include ../../tutorials/makefile-common +include ../../programming_examples/makefile-common build/aie.mlir: aie2.py mkdir -p ${@D} diff --git a/programming_guide/section-3/Makefile b/programming_guide/section-3/Makefile index eb57eeb40b..77688005e3 100644 --- a/programming_guide/section-3/Makefile +++ b/programming_guide/section-3/Makefile @@ -12,11 +12,15 @@ all: build/final.xclbin build/insts.txt targetname = vectorScalar +build/aie.mlir: aie2.py + mkdir -p ${@D} + python3 $< > $@ + build/scale.o: vector_scalar_mul.cc mkdir -p ${@D} cd ${@D} && xchesscc_wrapper ${CHESSCCWRAP2_FLAGS} -c $(<:%=../%) -o ${@F} -build/final.xclbin: aie.mlir build/kernel1.o build/kernel2.o build/kernel3.o +build/final.xclbin: build/aie.mlir build/scale.o mkdir -p ${@D} cd ${@D} && aiecc.py --aie-generate-cdo --no-compile-host --xclbin-name=${@F} \ --aie-generate-npu --npu-insts-name=insts.txt $(<:%=../%) diff --git a/programming_guide/section-3/README.md b/programming_guide/section-3/README.md index c91095d68f..166cab665e 100644 --- a/programming_guide/section-3/README.md +++ b/programming_guide/section-3/README.md @@ -149,7 +149,6 @@ To compile the design and C++ testbench: ```sh make -make build/vectorScalar.exe ``` To run the design: diff --git a/programming_guide/section-3/test.cpp 
b/programming_guide/section-3/test.cpp index c5690e127d..0698905f19 100644 --- a/programming_guide/section-3/test.cpp +++ b/programming_guide/section-3/test.cpp @@ -34,13 +34,10 @@ int main(int argc, const char *argv[]) { test_utils::parse_options(argc, argv, desc, vm); int verbosity = vm["verbosity"].as(); - int trace_size = vm["trace_sz"].as(); constexpr bool VERIFY = true; - constexpr bool ENABLE_TRACING = false; - // constexpr int TRACE_SIZE = 8192; constexpr int IN_SIZE = 4096; - constexpr int OUT_SIZE = ENABLE_TRACING ? IN_SIZE + trace_size / 4 : IN_SIZE; + constexpr int OUT_SIZE = IN_SIZE; // Load instruction sequence std::vector instr_v = @@ -64,7 +61,7 @@ int main(int argc, const char *argv[]) { XRT_BO_FLAGS_HOST_ONLY, kernel.group_id(2)); auto bo_inFactor = xrt::bo(device, 1 * sizeof(DATATYPE), XRT_BO_FLAGS_HOST_ONLY, kernel.group_id(3)); - auto bo_outC = xrt::bo(device, OUT_SIZE * sizeof(DATATYPE) + trace_size, + auto bo_outC = xrt::bo(device, OUT_SIZE * sizeof(DATATYPE), XRT_BO_FLAGS_HOST_ONLY, kernel.group_id(4)); if (verbosity >= 1) @@ -85,7 +82,7 @@ int main(int argc, const char *argv[]) { // Zero out buffer bo_outC DATATYPE *bufOut = bo_outC.map(); - memset(bufOut, 0, OUT_SIZE * sizeof(DATATYPE) + trace_size); + memset(bufOut, 0, OUT_SIZE * sizeof(DATATYPE)); // sync host to device memories bo_instr.sync(XCL_BO_SYNC_BO_TO_DEVICE); @@ -120,11 +117,6 @@ int main(int argc, const char *argv[]) { } } - if (trace_size > 0) { - test_utils::write_out_trace(((char *)bufOut) + (IN_SIZE * sizeof(DATATYPE)), - trace_size, vm["trace_file"].as()); - } - // Print Pass/Fail result of our test if (!errors) { std::cout << std::endl << "PASS!" 
<< std::endl << std::endl; diff --git a/programming_guide/section-3/test.py b/programming_guide/section-3/test.py index bfdc33cbea..4028e889b6 100644 --- a/programming_guide/section-3/test.py +++ b/programming_guide/section-3/test.py @@ -15,7 +15,6 @@ from aie.extras.dialects.ext import memref, arith import aie.utils.test as test_utils -import aie.utils.trace as trace_utils def main(opts): @@ -41,7 +40,7 @@ def main(opts): INOUT1_SIZE = INOUT1_VOLUME * INOUT1_DATATYPE().itemsize INOUT2_SIZE = INOUT2_VOLUME * INOUT2_DATATYPE().itemsize - OUT_SIZE = INOUT2_SIZE + int(opts.trace_size) + OUT_SIZE = INOUT2_SIZE # ------------------------------------------------------ # Get device, load the xclbin & kernel and register them @@ -99,11 +98,6 @@ def main(opts): e = np.equal(output_buffer, ref) errors = errors + np.size(e) - np.count_nonzero(e) - # Write trace values if trace_size > 0 - if opts.trace_size > 0: - trace_buffer = entire_buffer[INOUT2_VOLUME:] - trace_utils.write_out_trace(trace_buffer, str(opts.trace_file)) - # ------------------------------------------------------ # Print verification and timing results # ------------------------------------------------------ diff --git a/programming_guide/section-4/CMakeLists.txt b/programming_guide/section-4/CMakeLists.txt deleted file mode 100644 index 6b330f21c1..0000000000 --- a/programming_guide/section-4/CMakeLists.txt +++ /dev/null @@ -1,70 +0,0 @@ -# This file is licensed under the Apache License v2.0 with LLVM Exceptions. -# See https://llvm.org/LICENSE.txt for license information. -# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -# -# (c) Copyright 2024 Advanced Micro Devices, Inc. 
- -# parameters -# -DBOOST_ROOT: Path to Boost install -# -DXRT_INC_DIR: Full path to src/runtime_src/core/include in XRT cloned repo -# -DXRT_LIB_DIR: Path to xrt_coreutil.lib -# -DTARGET_NAME: Target name to be built - -# cmake needs this line -cmake_minimum_required(VERSION 3.1) - -find_program(WSL NAMES powershell.exe) - -if (NOT WSL) - set(BOOST_ROOT /usr/include/boost CACHE STRING "Path to Boost install") - set(XRT_INC_DIR /opt/xilinx/xrt/include CACHE STRING "Path to XRT cloned repo") - set(XRT_LIB_DIR /opt/xilinx/xrt/lib CACHE STRING "Path to xrt_coreutil.lib") -else() - set(BOOST_ROOT C:/Technical/thirdParty/boost_1_83_0 CACHE STRING "Path to Boost install") - set(XRT_INC_DIR C:/Technical/XRT/src/runtime_src/core/include CACHE STRING "Path to XRT cloned repo") - set(XRT_LIB_DIR C:/Technical/xrtNPUfromDLL CACHE STRING "Path to xrt_coreutil.lib") -endif() - -set(TARGET_NAME test CACHE STRING "Target to be built") - -SET (ProjectName ${TARGET_NAME}) -SET (currentTarget ${TARGET_NAME}) - -if ( WSL ) - set(CMAKE_RUNTIME_OUTPUT_DIRECTORY_RELEASE ${CMAKE_BINARY_DIR}) -endif () - -project(${ProjectName}) - -# Find packages -find_package(Boost REQUIRED) - -add_executable(${currentTarget} - ${CMAKE_CURRENT_SOURCE_DIR}/../../runtime_lib/test_lib/test_utils.cpp - test.cpp -) - -target_compile_definitions(${currentTarget} PUBLIC DISABLE_ABI_CHECK=1) - -target_include_directories (${currentTarget} PUBLIC - ${XRT_INC_DIR} - ${Boost_INCLUDE_DIRS} - ${CMAKE_CURRENT_SOURCE_DIR}/../../runtime_lib/test_lib -) - -target_link_directories(${currentTarget} PUBLIC - ${XRT_LIB_DIR} - ${Boost_LIBRARY_DIRS} -) - -if (NOT WSL) - target_link_libraries(${currentTarget} PUBLIC - xrt_coreutil - boost_program_options - boost_filesystem - ) -else() - target_link_libraries(${currentTarget} PUBLIC - xrt_coreutil - ) -endif() diff --git a/programming_guide/section-4/aie2.py b/programming_guide/section-4/aie2.py deleted file mode 100644 index 4231179c36..0000000000 --- 
a/programming_guide/section-4/aie2.py +++ /dev/null @@ -1,74 +0,0 @@ -# -# This file is licensed under the Apache License v2.0 with LLVM Exceptions. -# See https://llvm.org/LICENSE.txt for license information. -# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -# -# (c) Copyright 2023 AMD Inc. - -from aie.dialects.aie import * # primary mlir-aie dialect definitions -from aie.extras.context import mlir_mod_ctx # mlir-aie context - -from aie.dialects.aiex import * # extended mlir-aie dialect definitions -from aie.dialects.scf import * # scf (strcutred control flow) dialect -from aie.extras.dialects.ext import memref, arith # memref and arithmatic dialects - - -# AI Engine structural design function -def my_first_aie_program(): - - # Dvice declaration - aie2 device NPU - @device(AIEDevice.npu) - def device_body(): - # Memref types - memRef_8_ty = T.memref(8, T.i32()) - memRef_16_ty = T.memref(16, T.i32()) - memRef_32_ty = T.memref(32, T.i32()) - memRef_64_ty = T.memref(64, T.i32()) - - # Tile declarations - ComputeTile = tile(0, 2) - ShimTile = tile(0, 0) - - # Data movement with object FIFOs - # Input (from shim tile to compute tile) - of_in0 = object_fifo("in0", ShimTile, ComputeTile, 2, memRef_8_ty) - - # Output (from compute tile to shim tile) - of_out0 = object_fifo("out0", ComputeTile, ShimTile, 2, memRef_8_ty) - - # Compute tile body - @core(ComputeTile) - def core_body(): - for _ in for_(8): - # Acquire input and output object FIFO objects - elem_in = of_in0.acquire(ObjectFifoPort.Consume, 1) - elem_out = of_out0.acquire(ObjectFifoPort.Produce, 1) - - # Core functionality - load, add 1, store - for i in for_(8): - v0 = memref.load(elem_in, [i]) - v1 = arith.addi(v0, arith.constant(1, T.i32())) - memref.store(v1, elem_out, [i]) - yield_([]) - - # Release input and output object FIFO objects - of_in0.release(ObjectFifoPort.Consume, 1) - of_out0.release(ObjectFifoPort.Produce, 1) - yield_([]) - - # To/from AIE-array data movement - 
@FuncOp.from_py_func(memRef_64_ty, memRef_64_ty, memRef_64_ty) - def sequence(inTensor, unused, outTensor): - npu_dma_memcpy_nd( - metadata="out0", bd_id=0, mem=outTensor, sizes=[1, 1, 1, 64] - ) - npu_dma_memcpy_nd( - metadata="in0", bd_id=1, mem=inTensor, sizes=[1, 1, 1, 64] - ) - npu_sync(column=0, row=0, direction=0, channel=0) - - -# Declares that subsequent code is in mlir-aie context -with mlir_mod_ctx() as ctx: - my_first_aie_program() # Call design function within the mlir-aie context - print(ctx.module) # Print the python-to-mlir conversion diff --git a/programming_guide/section-4/section-4a/Makefile b/programming_guide/section-4/section-4a/Makefile index ee28c567c4..3b0140656f 100644 --- a/programming_guide/section-4/section-4a/Makefile +++ b/programming_guide/section-4/section-4a/Makefile @@ -16,10 +16,14 @@ build/aie.mlir: aie2.py mkdir -p ${@D} python3 $< > $@ -build/final.xclbin: build/aie.mlir +build/scale.o: vector_scalar_mul.cc mkdir -p ${@D} - cd ${@D} && aiecc.py --aie-generate-cdo --aie-generate-npu --no-compile-host \ - --xclbin-name=${@F} --npu-insts-name=insts.txt ${ -1. Let's change our design and increase the loop size of our kernel by a factor of 10. This involves changing the outer loop from 8 to 80. What reported times do you see now? - - ----- [[Up]](../../section-4) [[Next]](../section-4b) diff --git a/programming_guide/section-4/section-4a/aie2.py b/programming_guide/section-4/section-4a/aie2.py index 3e1f7e59ab..b09f9d0637 100644 --- a/programming_guide/section-4/section-4a/aie2.py +++ b/programming_guide/section-4/section-4a/aie2.py @@ -5,75 +5,73 @@ # # (c) Copyright 2023 AMD Inc. 
-from aie.dialects.aie import * # primary mlir-aie dialect definitions -from aie.extras.context import mlir_mod_ctx # mlir-aie context +import sys -from aie.dialects.aiex import * # extended mlir-aie dialect definitions -from aie.dialects.scf import * # scf (strcutred control flow) dialect -from aie.extras.dialects.ext import memref, arith # memref and arithmatic dialects +from aie.dialects.aie import * +from aie.dialects.aiex import * +from aie.dialects.scf import * +from aie.extras.context import mlir_mod_ctx +import aie.utils.trace as trace_utils -# AI Engine structural design function -def my_first_aie_program(): - # Dvice declaration - aie2 device NPU +def my_vector_scalar(): + @device(AIEDevice.npu) def device_body(): - # Memref types - memRef_8_ty = T.memref(8, T.i32()) - memRef_16_ty = T.memref(16, T.i32()) - memRef_32_ty = T.memref(32, T.i32()) - memRef_64_ty = T.memref(64, T.i32()) - memRef_640_ty = T.memref(640, T.i32()) + memRef_ty = T.memref(1024, T.i32()) + + # AIE Core Function declarations + scale_scalar = external_func( + "vector_scalar_mul_aie_scalar", + inputs=[memRef_ty, memRef_ty, T.memref(1, T.i32()), T.i32()], + ) # Tile declarations - ComputeTile = tile(0, 2) ShimTile = tile(0, 0) + ComputeTile2 = tile(0, 2) - # Data movement with object FIFOs - # Input (from shim tile to compute tile) - of_in0 = object_fifo("in0", ShimTile, ComputeTile, 2, memRef_8_ty) - - # Output (from compute tile to shim tile) - of_out0 = object_fifo("out0", ComputeTile, ShimTile, 2, memRef_8_ty) + # AIE-array data movement with object fifos + of_in = object_fifo("in", ShimTile, ComputeTile2, 2, memRef_ty) + of_factor = object_fifo( + "infactor", ShimTile, ComputeTile2, 2, T.memref(1, T.i32()) + ) + of_out = object_fifo("out", ComputeTile2, ShimTile, 2, memRef_ty) - # Compute tile body - @core(ComputeTile) + # Set up compute tiles + # Compute tile 2 + @core(ComputeTile2, "scale.o") def core_body(): - for _ in for_(8): - # Acquire input and output object FIFO objects - 
elem_in = of_in0.acquire(ObjectFifoPort.Consume, 1) - elem_out = of_out0.acquire(ObjectFifoPort.Produce, 1) - - # Core functionality - load, add 1, store - for i in for_(8): - v0 = memref.load(elem_in, [i]) - v1 = arith.addi(v0, arith.constant(1, T.i32())) - memref.store(v1, elem_out, [i]) + # Effective while(1) + for _ in for_(sys.maxsize): + elem_factor = of_factor.acquire(ObjectFifoPort.Consume, 1) + # Number of sub-vector "tile" iterations + for _ in for_(4): + elem_out = of_out.acquire(ObjectFifoPort.Produce, 1) + elem_in = of_in.acquire(ObjectFifoPort.Consume, 1) + call(scale_scalar, [elem_in, elem_out, elem_factor, 1024]) + of_in.release(ObjectFifoPort.Consume, 1) + of_out.release(ObjectFifoPort.Produce, 1) yield_([]) - - # Release input and output object FIFO objects - of_in0.release(ObjectFifoPort.Consume, 1) - of_out0.release(ObjectFifoPort.Produce, 1) + of_factor.release(ObjectFifoPort.Consume, 1) yield_([]) # To/from AIE-array data movement - @FuncOp.from_py_func(memRef_64_ty, memRef_64_ty, memRef_64_ty) - def sequence(inTensor, unused, outTensor): - npu_dma_memcpy_nd( - metadata="out0", bd_id=0, mem=outTensor, sizes=[1, 1, 1, 64] - ) - npu_dma_memcpy_nd( - metadata="in0", bd_id=1, mem=inTensor, sizes=[1, 1, 1, 64] - ) + tensor_ty = T.memref(4096, T.i32()) + scalar_ty = T.memref(1, T.i32()) + + @FuncOp.from_py_func(tensor_ty, scalar_ty, tensor_ty) + def sequence(A, F, C): + npu_dma_memcpy_nd(metadata="out", bd_id=0, mem=C, sizes=[1, 1, 1, 4096]) + npu_dma_memcpy_nd(metadata="in", bd_id=1, mem=A, sizes=[1, 1, 1, 4096]) + npu_dma_memcpy_nd(metadata="infactor", bd_id=2, mem=F, sizes=[1, 1, 1, 1]) npu_sync(column=0, row=0, direction=0, channel=0) -# Declares that subsequent code is in mlir-aie context with mlir_mod_ctx() as ctx: - my_first_aie_program() # Call design function within the mlir-aie context - res = ctx.module.operation.verify() # Verify mlir context + my_vector_scalar() + res = ctx.module.operation.verify() if res == True: - print(ctx.module) # 
Print the python-to-mlir conversion + print(ctx.module) else: print(res) diff --git a/programming_guide/section-4/section-4a/answers/aie2.py b/programming_guide/section-4/section-4a/answers/aie2.py deleted file mode 100644 index 595e0c11d2..0000000000 --- a/programming_guide/section-4/section-4a/answers/aie2.py +++ /dev/null @@ -1,79 +0,0 @@ -# -# This file is licensed under the Apache License v2.0 with LLVM Exceptions. -# See https://llvm.org/LICENSE.txt for license information. -# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -# -# (c) Copyright 2023 AMD Inc. - -from aie.dialects.aie import * # primary mlir-aie dialect definitions -from aie.extras.context import mlir_mod_ctx # mlir-aie context - -from aie.dialects.aiex import * # extended mlir-aie dialect definitions -from aie.dialects.scf import * # scf (strcutred control flow) dialect -from aie.extras.dialects.ext import memref, arith # memref and arithmatic dialects - - -# AI Engine structural design function -def my_first_aie_program(): - - # Dvice declaration - aie2 device NPU - @device(AIEDevice.npu) - def device_body(): - # Memref types - memRef_8_ty = T.memref(8, T.i32()) - memRef_16_ty = T.memref(16, T.i32()) - memRef_32_ty = T.memref(32, T.i32()) - memRef_64_ty = T.memref(64, T.i32()) - memRef_640_ty = T.memref(640, T.i32()) - - # Tile declarations - ComputeTile = tile(0, 2) - ShimTile = tile(0, 0) - - # Data movement with object FIFOs - # Input (from shim tile to compute tile) - of_in0 = object_fifo("in0", ShimTile, ComputeTile, 2, memRef_8_ty) - - # Output (from compute tile to shim tile) - of_out0 = object_fifo("out0", ComputeTile, ShimTile, 2, memRef_8_ty) - - # Compute tile body - @core(ComputeTile) - def core_body(): - for _ in for_(80): - # Acquire input and output object FIFO objects - elem_in = of_in0.acquire(ObjectFifoPort.Consume, 1) - elem_out = of_out0.acquire(ObjectFifoPort.Produce, 1) - - # Core functionality - load, add 1, store - for i in for_(8): - v0 = memref.load(elem_in, 
[i]) - v1 = arith.addi(v0, arith.constant(1, T.i32())) - memref.store(v1, elem_out, [i]) - yield_([]) - - # Release input and output object FIFO objects - of_in0.release(ObjectFifoPort.Consume, 1) - of_out0.release(ObjectFifoPort.Produce, 1) - yield_([]) - - # To/from AIE-array data movement - @FuncOp.from_py_func(memRef_640_ty, memRef_64_ty, memRef_640_ty) - def sequence(inTensor, unused, outTensor): - npu_dma_memcpy_nd( - metadata="out0", bd_id=0, mem=outTensor, sizes=[1, 1, 1, 640] - ) - npu_dma_memcpy_nd( - metadata="in0", bd_id=1, mem=inTensor, sizes=[1, 1, 1, 640] - ) - npu_sync(column=0, row=0, direction=0, channel=0) - - -# Declares that subsequent code is in mlir-aie context -with mlir_mod_ctx() as ctx: - my_first_aie_program() # Call design function within the mlir-aie context - res = ctx.module.operation.verify() # Verify mlir context - if res == True: - print(ctx.module) # Print the python-to-mlir conversion - else: - print(res) diff --git a/programming_guide/section-4/section-4a/answers/test.cpp b/programming_guide/section-4/section-4a/answers/test.cpp deleted file mode 100644 index d154a97425..0000000000 --- a/programming_guide/section-4/section-4a/answers/test.cpp +++ /dev/null @@ -1,256 +0,0 @@ -//===- test.cpp -------------------------------------------000---*- C++ -*-===// -// -// This file is licensed under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -// Copyright (C) 2023, Advanced Micro Devices, Inc. 
-// -//===----------------------------------------------------------------------===// - -#include -#include -#include -#include -#include -#include -#include - -#include "xrt/xrt_bo.h" -#include "xrt/xrt_device.h" -#include "xrt/xrt_kernel.h" - -#include "test_utils.h" - -#ifndef DATATYPES_USING_DEFINED -#define DATATYPES_USING_DEFINED -// ------------------------------------------------------ -// Configure this to match your buffer data type -// ------------------------------------------------------ -using INOUT0_DATATYPE = std::uint32_t; -using INOUT1_DATATYPE = std::uint32_t; -using INOUT2_DATATYPE = std::uint32_t; -#endif - -namespace po = boost::program_options; - -// ---------------------------------------------------------------------------- -// Verify results (specific to our design example) -// ---------------------------------------------------------------------------- -template -int verify(int CSize, std::vector C, int verbosity) { - int errors = 0; - for (uint32_t i = 0; i < CSize; i++) { - uint32_t ref = i + 2; - if (C[i] != ref) { - std::cout << "Error in output " << C[i] << " != " << ref << std::endl; - errors++; - } else { - if (verbosity > 1) - std::cout << "Correct output " << C[i] << " == " << ref << std::endl; - } - } - return errors; -} - -// ---------------------------------------------------------------------------- -// Main -// ---------------------------------------------------------------------------- -int main(int argc, const char *argv[]) { - - // ------------------------------------------------------ - // Parse program arguments - // ------------------------------------------------------ - po::options_description desc("Allowed options"); - po::variables_map vm; - test_utils::add_default_options(desc); - - test_utils::parse_options(argc, argv, desc, vm); - int verbosity = vm["verbosity"].as(); - int do_verify = vm["verify"].as(); - int n_iterations = vm["iters"].as(); - int n_warmup_iterations = vm["warmup"].as(); - int trace_size = 
vm["trace_sz"].as(); - - // ------------------------------------------------------ - // Configure this to match your design's buffer size - // ------------------------------------------------------ - int INOUT0_VOLUME = 640; // Input only, 64x uint32_t in this example - int INOUT1_VOLUME = 640; // Not used in this example - int INOUT2_VOLUME = 640; // Output only, 64x uint32_t in this example - - size_t INOUT0_SIZE = INOUT0_VOLUME * sizeof(INOUT0_DATATYPE); - size_t INOUT1_SIZE = INOUT1_VOLUME * sizeof(INOUT1_DATATYPE); - size_t INOUT2_SIZE = INOUT2_VOLUME * sizeof(INOUT2_DATATYPE); - - // TODO Remove trace for now? - size_t OUT_SIZE = INOUT2_SIZE + trace_size; - - srand(time(NULL)); - - // Load instruction sequence - std::vector instr_v = - test_utils::load_instr_sequence(vm["instr"].as()); - if (verbosity >= 1) - std::cout << "Sequence instr count: " << instr_v.size() << "\n"; - - // ------------------------------------------------------ - // Get device, load the xclbin & kernel and register them - // ------------------------------------------------------ - xrt::device device; - xrt::kernel kernel; - - test_utils::init_xrt_load_kernel(device, kernel, verbosity, - vm["xclbin"].as(), - vm["kernel"].as()); - - // ------------------------------------------------------ - // Initialize input/ output buffer sizes and sync them - // ------------------------------------------------------ - auto bo_instr = xrt::bo(device, instr_v.size() * sizeof(int), - XCL_BO_FLAGS_CACHEABLE, kernel.group_id(0)); - auto bo_inout0 = - xrt::bo(device, INOUT0_SIZE, XRT_BO_FLAGS_HOST_ONLY, kernel.group_id(2)); - auto bo_inout1 = - xrt::bo(device, INOUT1_SIZE, XRT_BO_FLAGS_HOST_ONLY, kernel.group_id(3)); - // Assumes trace will only be added to inout2 - auto bo_inout2 = - xrt::bo(device, OUT_SIZE, XRT_BO_FLAGS_HOST_ONLY, kernel.group_id(4)); - - if (verbosity >= 1) - std::cout << "Writing data into buffer objects.\n"; - - // Initialize instruction buffer - void *bufInstr = bo_instr.map(); - 
memcpy(bufInstr, instr_v.data(), instr_v.size() * sizeof(int)); - - // Initialize Inout buffer 0 - INOUT0_DATATYPE *bufInOut0 = bo_inout0.map(); - std::vector AVec(INOUT0_VOLUME); - for (int i = 0; i < INOUT0_VOLUME; i++) - AVec[i] = i + 1; - // AVec.push_back(i + 1); - memcpy(bufInOut0, AVec.data(), (AVec.size() * sizeof(INOUT0_DATATYPE))); - - // Initialize Inout buffer 1 - // INOUT1_DATATYPE *bufInOut1 = bo_inout1.map(); - // std::vector BVec(INOUT1_VOLUME); - // for (int i = 0; i < INOUT1_VOLUME; i++) - // BVec[i] = i + 1 - // //BVec.push_back(i + 1); - // memcpy(bufInOut1, BVec.data(), (BVec.size() * sizeof(INOUT1_DATATYPE))); - - // Initialize Inout buffer 2 - char *bufInOut2 = bo_inout2.map(); - std::vector CVec(INOUT2_VOLUME); - memset(bufInOut2, 0, OUT_SIZE); // Zeroes out INOUT2_VOLUME + trace_size - - // Sync buffers to update input buffer values - bo_instr.sync(XCL_BO_SYNC_BO_TO_DEVICE); - bo_inout0.sync(XCL_BO_SYNC_BO_TO_DEVICE); - // bo_inout1.sync(XCL_BO_SYNC_BO_TO_DEVICE); - bo_inout2.sync(XCL_BO_SYNC_BO_TO_DEVICE); - - // ------------------------------------------------------ - // Initialize run configs - // ------------------------------------------------------ - unsigned num_iter = n_iterations + n_warmup_iterations; - float npu_time_total = 0; - float npu_time_min = 9999999; - float npu_time_max = 0; - - int errors = 0; - - // ------------------------------------------------------ - // Main run loop - // ------------------------------------------------------ - for (unsigned iter = 0; iter < num_iter; iter++) { - - // Run kernel - if (verbosity >= 1) - std::cout << "Running Kernel.\n"; - auto start = std::chrono::high_resolution_clock::now(); - auto run = - kernel(bo_instr, instr_v.size(), bo_inout0, bo_inout1, bo_inout2); - run.wait(); - auto stop = std::chrono::high_resolution_clock::now(); - bo_inout2.sync(XCL_BO_SYNC_BO_FROM_DEVICE); - - if (iter < n_warmup_iterations) { - /* Warmup iterations do not count towards average runtime. 
*/ - continue; - } - - // Copy output results and verify they are correct - memcpy(CVec.data(), bufInOut2, (CVec.size() * sizeof(INOUT2_DATATYPE))); - if (do_verify) { - if (verbosity >= 1) { - std::cout << "Verifying results ..." << std::endl; - } - auto vstart = std::chrono::system_clock::now(); - errors = verify(INOUT2_VOLUME, CVec, verbosity); - auto vstop = std::chrono::system_clock::now(); - float vtime = - std::chrono::duration_cast(vstop - vstart) - .count(); - if (verbosity >= 1) { - std::cout << "Verify time: " << vtime << "secs." << std::endl; - } - } else { - if (verbosity >= 1) - std::cout << "WARNING: results not verified." << std::endl; - } - - // Write trace values if trace_size > 0 - if (trace_size > 0) { - // test_utils::write_out_trace(((char *)bufInOut2) + INOUT2_SIZE, - // trace_size, - test_utils::write_out_trace(((char *)bufInOut2), trace_size, - vm["trace_file"].as()); - } - - // Accumulate run times - float npu_time = - std::chrono::duration_cast(stop - start) - .count(); - - npu_time_total += npu_time; - npu_time_min = (npu_time < npu_time_min) ? npu_time : npu_time_min; - npu_time_max = (npu_time > npu_time_max) ? npu_time : npu_time_max; - } - - // ------------------------------------------------------ - // Print verification and timing results - // ------------------------------------------------------ - - // TODO - Mac count to guide gflops - float macs = 0; - - std::cout << std::endl - << "Avg NPU time: " << npu_time_total / n_iterations << "us." - << std::endl; - if (macs > 0) - std::cout << "Avg NPU gflops: " - << macs / (1000 * npu_time_total / n_iterations) << std::endl; - - std::cout << std::endl - << "Min NPU time: " << npu_time_min << "us." << std::endl; - if (macs > 0) - std::cout << "Max NPU gflops: " << macs / (1000 * npu_time_min) - << std::endl; - - std::cout << std::endl - << "Max NPU time: " << npu_time_max << "us." 
<< std::endl; - if (macs > 0) - std::cout << "Min NPU gflops: " << macs / (1000 * npu_time_max) - << std::endl; - - if (!errors) { - std::cout << "\nPASS!\n\n"; - return 0; - } else { - std::cout << "\nError count: " << errors << "\n\n"; - std::cout << "\nFailed.\n\n"; - return 1; - } -} diff --git a/programming_guide/section-4/section-4a/test.cpp b/programming_guide/section-4/section-4a/test.cpp index 2ec8a0d1c3..a5af1576bf 100644 --- a/programming_guide/section-4/section-4a/test.cpp +++ b/programming_guide/section-4/section-4a/test.cpp @@ -1,4 +1,4 @@ -//===- test.cpp -------------------------------------------000---*- C++ -*-===// +//===- test.cpp -------------------------------------------------*- C++ -*-===// // // This file is licensed under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. @@ -8,59 +8,26 @@ // //===----------------------------------------------------------------------===// -#include #include #include #include #include -#include -#include - -#include "xrt/xrt_bo.h" -#include "xrt/xrt_device.h" -#include "xrt/xrt_kernel.h" #include "test_utils.h" +#include "xrt/xrt_bo.h" #ifndef DATATYPES_USING_DEFINED #define DATATYPES_USING_DEFINED -// ------------------------------------------------------ -// Configure this to match your buffer data type -// ------------------------------------------------------ -using INOUT0_DATATYPE = std::uint32_t; -using INOUT1_DATATYPE = std::uint32_t; -using INOUT2_DATATYPE = std::uint32_t; +using DATATYPE = std::uint32_t; // Configure this to match your buffer data type #endif -namespace po = boost::program_options; +const int scaleFactor = 3; -// ---------------------------------------------------------------------------- -// Verify results (specific to our design example) -// ---------------------------------------------------------------------------- -template -int verify(int CSize, std::vector C, int verbosity) { - int errors = 0; - for (uint32_t i = 0; i < 
CSize; i++) { - uint32_t ref = i + 2; - if (C[i] != ref) { - std::cout << "Error in output " << C[i] << " != " << ref << std::endl; - errors++; - } else { - if (verbosity > 1) - std::cout << "Correct output " << C[i] << " == " << ref << std::endl; - } - } - return errors; -} +namespace po = boost::program_options; -// ---------------------------------------------------------------------------- -// Main -// ---------------------------------------------------------------------------- int main(int argc, const char *argv[]) { - // ------------------------------------------------------ - // Parse program arguments - // ------------------------------------------------------ + // Program arguments parsing po::options_description desc("Allowed options"); po::variables_map vm; test_utils::add_default_options(desc); @@ -70,33 +37,19 @@ int main(int argc, const char *argv[]) { int do_verify = vm["verify"].as(); int n_iterations = vm["iters"].as(); int n_warmup_iterations = vm["warmup"].as(); - int trace_size = vm["trace_sz"].as(); - - // ------------------------------------------------------ - // Configure this to match your design's buffer size - // ------------------------------------------------------ - int INOUT0_VOLUME = 64; // Input only, 64x uint32_t in this example - int INOUT1_VOLUME = 64; // Not used in this example - int INOUT2_VOLUME = 64; // Output only, 64x uint32_t in this example - - size_t INOUT0_SIZE = INOUT0_VOLUME * sizeof(INOUT0_DATATYPE); - size_t INOUT1_SIZE = INOUT1_VOLUME * sizeof(INOUT1_DATATYPE); - size_t INOUT2_SIZE = INOUT2_VOLUME * sizeof(INOUT2_DATATYPE); - // TODO Remove trace for now? 
- size_t OUT_SIZE = INOUT2_SIZE + trace_size; - - srand(time(NULL)); + constexpr bool VERIFY = true; + constexpr int IN_SIZE = 4096; + constexpr int OUT_SIZE = IN_SIZE; // Load instruction sequence std::vector instr_v = test_utils::load_instr_sequence(vm["instr"].as()); + if (verbosity >= 1) std::cout << "Sequence instr count: " << instr_v.size() << "\n"; - // ------------------------------------------------------ - // Get device, load the xclbin & kernel and register them - // ------------------------------------------------------ + // Start the XRT context and load the kernel xrt::device device; xrt::kernel kernel; @@ -104,52 +57,41 @@ int main(int argc, const char *argv[]) { vm["xclbin"].as(), vm["kernel"].as()); - // ------------------------------------------------------ - // Initialize input/ output buffer sizes and sync them - // ------------------------------------------------------ + // set up the buffer objects auto bo_instr = xrt::bo(device, instr_v.size() * sizeof(int), XCL_BO_FLAGS_CACHEABLE, kernel.group_id(0)); - auto bo_inout0 = - xrt::bo(device, INOUT0_SIZE, XRT_BO_FLAGS_HOST_ONLY, kernel.group_id(2)); - auto bo_inout1 = - xrt::bo(device, INOUT1_SIZE, XRT_BO_FLAGS_HOST_ONLY, kernel.group_id(3)); - // Assumes trace will only be added to inout2 - auto bo_inout2 = - xrt::bo(device, OUT_SIZE, XRT_BO_FLAGS_HOST_ONLY, kernel.group_id(4)); + auto bo_inA = xrt::bo(device, IN_SIZE * sizeof(DATATYPE), + XRT_BO_FLAGS_HOST_ONLY, kernel.group_id(2)); + auto bo_inFactor = xrt::bo(device, 1 * sizeof(DATATYPE), + XRT_BO_FLAGS_HOST_ONLY, kernel.group_id(3)); + auto bo_outC = xrt::bo(device, OUT_SIZE * sizeof(DATATYPE), + XRT_BO_FLAGS_HOST_ONLY, kernel.group_id(4)); if (verbosity >= 1) std::cout << "Writing data into buffer objects.\n"; - // Initialize instruction buffer + // Copy instruction stream to xrt buffer object void *bufInstr = bo_instr.map(); memcpy(bufInstr, instr_v.data(), instr_v.size() * sizeof(int)); - // Initialize Inout buffer 0 - INOUT0_DATATYPE 
*bufInOut0 = bo_inout0.map(); - std::vector AVec(INOUT0_VOLUME); - for (int i = 0; i < INOUT0_VOLUME; i++) - AVec[i] = i + 1; - // AVec.push_back(i + 1); - memcpy(bufInOut0, AVec.data(), (AVec.size() * sizeof(INOUT0_DATATYPE))); - - // Initialize Inout buffer 1 - // INOUT1_DATATYPE *bufInOut1 = bo_inout1.map(); - // std::vector BVec(INOUT1_VOLUME); - // for (int i = 0; i < INOUT1_VOLUME; i++) - // BVec[i] = i + 1 - // //BVec.push_back(i + 1); - // memcpy(bufInOut1, BVec.data(), (BVec.size() * sizeof(INOUT1_DATATYPE))); - - // Initialize Inout buffer 2 - char *bufInOut2 = bo_inout2.map(); - std::vector CVec(INOUT2_VOLUME); - memset(bufInOut2, 0, OUT_SIZE); // Zeroes out INOUT2_VOLUME + trace_size - - // Sync buffers to update input buffer values + // Initialize buffer bo_inA + DATATYPE *bufInA = bo_inA.map(); + for (int i = 0; i < IN_SIZE; i++) + bufInA[i] = i + 1; + + // Initialize buffer bo_inFactor + DATATYPE *bufInFactor = bo_inFactor.map(); + *bufInFactor = scaleFactor; + + // Zero out buffer bo_outC + DATATYPE *bufOut = bo_outC.map(); + memset(bufOut, 0, OUT_SIZE * sizeof(DATATYPE)); + + // sync host to device memories bo_instr.sync(XCL_BO_SYNC_BO_TO_DEVICE); - bo_inout0.sync(XCL_BO_SYNC_BO_TO_DEVICE); - // bo_inout1.sync(XCL_BO_SYNC_BO_TO_DEVICE); - bo_inout2.sync(XCL_BO_SYNC_BO_TO_DEVICE); + bo_inA.sync(XCL_BO_SYNC_BO_TO_DEVICE); + bo_inFactor.sync(XCL_BO_SYNC_BO_TO_DEVICE); + bo_outC.sync(XCL_BO_SYNC_BO_TO_DEVICE); // ------------------------------------------------------ // Initialize run configs @@ -170,11 +112,12 @@ int main(int argc, const char *argv[]) { if (verbosity >= 1) std::cout << "Running Kernel.\n"; auto start = std::chrono::high_resolution_clock::now(); - auto run = - kernel(bo_instr, instr_v.size(), bo_inout0, bo_inout1, bo_inout2); + auto run = kernel(bo_instr, instr_v.size(), bo_inA, bo_inFactor, bo_outC); run.wait(); auto stop = std::chrono::high_resolution_clock::now(); - bo_inout2.sync(XCL_BO_SYNC_BO_FROM_DEVICE); + + // Sync device to 
host memories + bo_outC.sync(XCL_BO_SYNC_BO_FROM_DEVICE); if (iter < n_warmup_iterations) { /* Warmup iterations do not count towards average runtime. */ @@ -182,13 +125,26 @@ int main(int argc, const char *argv[]) { } // Copy output results and verify they are correct - memcpy(CVec.data(), bufInOut2, (CVec.size() * sizeof(INOUT2_DATATYPE))); + // Copy output results and verify they are correct if (do_verify) { if (verbosity >= 1) { std::cout << "Verifying results ..." << std::endl; } auto vstart = std::chrono::system_clock::now(); - errors = verify(INOUT2_VOLUME, CVec, verbosity); + for (uint32_t i = 0; i < IN_SIZE; i++) { + int32_t ref = bufInA[i] * scaleFactor; + int32_t test = bufOut[i]; + if (test != ref) { + if (verbosity >= 1) + std::cout << "Error in output " << test << " != " << ref + << std::endl; + errors++; + } else { + if (verbosity >= 1) + std::cout << "Correct output " << test << " == " << ref + << std::endl; + } + } auto vstop = std::chrono::system_clock::now(); float vtime = std::chrono::duration_cast(vstop - vstart) @@ -201,14 +157,6 @@ int main(int argc, const char *argv[]) { std::cout << "WARNING: results not verified." << std::endl; } - // Write trace values if trace_size > 0 - if (trace_size > 0) { - // test_utils::write_out_trace(((char *)bufInOut2) + INOUT2_SIZE, - // trace_size, - test_utils::write_out_trace(((char *)bufInOut2), trace_size, - vm["trace_file"].as()); - } - // Accumulate run times float npu_time = std::chrono::duration_cast(stop - start) @@ -245,12 +193,15 @@ int main(int argc, const char *argv[]) { std::cout << "Min NPU gflops: " << macs / (1000 * npu_time_max) << std::endl; + // Print Pass/Fail result of our test if (!errors) { - std::cout << "\nPASS!\n\n"; + std::cout << std::endl << "PASS!" << std::endl << std::endl; return 0; } else { - std::cout << "\nError count: " << errors << "\n\n"; - std::cout << "\nFailed.\n\n"; + std::cout << std::endl + << errors << " mismatches." 
<< std::endl + << std::endl; + std::cout << std::endl << "fail." << std::endl << std::endl; return 1; } } diff --git a/programming_guide/section-4/section-4a/test.py b/programming_guide/section-4/section-4a/test.py index 0e82d741cb..887586472d 100644 --- a/programming_guide/section-4/section-4a/test.py +++ b/programming_guide/section-4/section-4a/test.py @@ -8,22 +8,13 @@ import sys import time -import aie.utils.test as test_utils - -# ------------------------------------------------------ -# Configure this to match your design's buffer size -# ------------------------------------------------------ -INOUT0_VOLUME = 64 # Input only, 64x uint32_t in this example -INOUT1_VOLUME = 64 # Not used in this example -INOUT2_VOLUME = 64 # Output only, 64x uint32_t in this example +from aie.dialects.aie import * +from aie.dialects.aiex import * +from aie.dialects.scf import * +from aie.extras.context import mlir_mod_ctx +from aie.extras.dialects.ext import memref, arith -INOUT0_DATATYPE = np.uint32 -INOUT1_DATATYPE = np.uint32 -INOUT2_DATATYPE = np.uint32 - -INOUT0_SIZE = INOUT0_VOLUME * INOUT0_DATATYPE().itemsize -INOUT1_SIZE = INOUT1_VOLUME * INOUT1_DATATYPE().itemsize -INOUT2_SIZE = INOUT2_VOLUME * INOUT2_DATATYPE().itemsize +import aie.utils.test as test_utils def main(opts): @@ -34,6 +25,21 @@ def main(opts): instr_text = [l for l in instr_text if l != ""] instr_v = np.array([int(i, 16) for i in instr_text], dtype=np.uint32) + # ------------------------------------------------------------ + # Configure this to match your design's buffer size and type + # ------------------------------------------------------------ + INOUT0_VOLUME = int(4096) # Input only, 4096x int32_t in this example + INOUT1_VOLUME = int(1) # Input only, 1 int32_t scale factor + INOUT2_VOLUME = int(4096) # Output only, 4096x int32_t in this example + + INOUT0_DATATYPE = np.int32 + INOUT1_DATATYPE = np.int32 + INOUT2_DATATYPE = np.int32 + + INOUT0_SIZE = INOUT0_VOLUME * INOUT0_DATATYPE().itemsize +
INOUT1_SIZE = INOUT1_VOLUME * INOUT1_DATATYPE().itemsize + INOUT2_SIZE = INOUT2_VOLUME * INOUT2_DATATYPE().itemsize + OUT_SIZE = INOUT2_SIZE # ------------------------------------------------------ @@ -47,7 +53,6 @@ def main(opts): bo_instr = xrt.bo(device, len(instr_v) * 4, xrt.bo.cacheable, kernel.group_id(0)) bo_inout0 = xrt.bo(device, INOUT0_SIZE, xrt.bo.host_only, kernel.group_id(2)) bo_inout1 = xrt.bo(device, INOUT1_SIZE, xrt.bo.host_only, kernel.group_id(3)) - # bo_inout2 = xrt.bo(device, INOUT2_SIZE, xrt.bo.host_only, kernel.group_id(4)) bo_inout2 = xrt.bo(device, OUT_SIZE, xrt.bo.host_only, kernel.group_id(4)) # Initialize instruction buffer @@ -55,10 +60,10 @@ def main(opts): # Initialize data buffers inout0 = np.arange(1, INOUT0_VOLUME + 1, dtype=INOUT0_DATATYPE) - inout1 = np.zeros(INOUT1_VOLUME, dtype=INOUT1_DATATYPE) - inout2 = np.zeros(INOUT2_VOLUME, dtype=INOUT2_DATATYPE) + scale_factor = np.array([3], dtype=INOUT1_DATATYPE) + inout2 = np.zeros(OUT_SIZE, dtype=np.uint8) bo_inout0.write(inout0, 0) - bo_inout1.write(inout1, 0) + bo_inout1.write(scale_factor, 0) bo_inout2.write(inout2, 0) # Sync buffers to update input buffer values @@ -94,12 +99,12 @@ def main(opts): continue # Copy output results and verify they are correct - out_size = INOUT2_SIZE - output_buffer = bo_inout2.read(out_size, 0).view(INOUT2_DATATYPE) + entire_buffer = bo_inout2.read(OUT_SIZE, 0).view(np.uint32) + output_buffer = entire_buffer[:INOUT2_VOLUME] if opts.verify: if opts.verbosity >= 1: print("Verifying results ...") - ref = np.arange(2, INOUT0_VOLUME + 2, dtype=INOUT0_DATATYPE) + ref = np.arange(1, INOUT0_VOLUME + 1, dtype=INOUT0_DATATYPE) * scale_factor e = np.equal(output_buffer, ref) errors = errors + np.size(e) - np.count_nonzero(e) @@ -128,5 +133,6 @@ def main(opts): if __name__ == "__main__": - opts = test_utils.parse_args(sys.argv[1:]) + p = test_utils.create_default_argparser() + opts = p.parse_args(sys.argv[1:]) main(opts) diff --git 
a/programming_guide/section-4/section-4a/vector_scalar_mul.cc b/programming_guide/section-4/section-4a/vector_scalar_mul.cc new file mode 100755 index 0000000000..10c0aecbbc --- /dev/null +++ b/programming_guide/section-4/section-4a/vector_scalar_mul.cc @@ -0,0 +1,25 @@ +//===- vector_scaler_mul.cc -------------------------------------*- C++ -*-===// +// +// This file is licensed under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +// Copyright (C) 2024, Advanced Micro Devices, Inc. +// +//===----------------------------------------------------------------------===// + +#include +#include +#include +#include + +extern "C" { + +void vector_scalar_mul_aie_scalar(int32_t *a, int32_t *c, int32_t *factor, + int32_t N) { + for (int i = 0; i < N; i++) { + c[i] = *factor * a[i]; + } +} + +} // extern "C" diff --git a/programming_guide/section-4/section-4b/Makefile b/programming_guide/section-4/section-4b/Makefile index 09126e5289..8b7b1cc434 100644 --- a/programming_guide/section-4/section-4b/Makefile +++ b/programming_guide/section-4/section-4b/Makefile @@ -18,10 +18,14 @@ build/aie.mlir: aie2.py mkdir -p ${@D} python3 $< > $@ -build/final.xclbin: build/aie.mlir +build/scale.o: vector_scalar_mul.cc mkdir -p ${@D} - cd ${@D} && aiecc.py --aie-generate-cdo --aie-generate-npu --no-compile-host \ - --xclbin-name=${@F} --npu-insts-name=insts.txt ${ **NOTE** In our example design, the [aie2.py](./aie2.py) and associated [Makefile](./Makefile), we provide a Makefile target `run` for standard build and `trace` for trace-enabld build. The trace-enabled build passes the trace buffer size as an argument to [aie2.py](./aie2.py) which conditionally enables the trace `flow` and calls `configure_simple_tracing_aie2` as long as `trace_size` is > 0. This is also true for the [Vector Scalar Multiply example](../../../programming_examples/basic/vector_scalar_mul). 
+ +> **NOTE** In our example design, the [aie2.py](./aie2.py) and associated [Makefile](./Makefile), we provide a Makefile target `run` for standard build and `trace` for trace-enabled build. The trace-enabled build passes the trace buffer size as an argument to [aie2.py](./aie2.py) which conditionally enables the trace `flow` and calls `configure_simple_tracing_aie2` as long as `trace_size` is > 0. This is also true for the [Vector Scalar Multiply example](../../../programming_examples/basic/vector_scalar_mul). ### (2a) C/C++ Host code ([test.cpp](./test.cpp)) The main changes needed for [test.cpp](./test.cpp) is the increase in the output buffer size to account for the trace buffer size, being careful to read only the output buffer portion when verifying correctness of the results. We also need to be sure to pass the correct buffer offset which points to the trace buffer data when calling `write_out_trace`. -You can see in [test.cpp](.test.cpp) that trace_size is set based on an input argument of `-t $(trace_size)` which is defined and passed in the [Makefile](.Makefile). The `trace` target from the [Makefile](./Makefile) is shown below. +You can see in [test.cpp](./test.cpp) that trace_size is set based on an input argument of `-t $(trace_size)` which is defined and passed in the [Makefile](./Makefile). The `trace` target from the [Makefile](./Makefile) is shown below. ```Makefile trace: ${targetname}.exe build/final.xclbin build/insts.txt @@ -143,15 +144,14 @@ trace: ${targetname}.exe build/final.xclbin build/insts.txt Following the invocation of the executable, we call the `parse_trace.py` python script which we will cover in more detail in step 3. Within the [test.cpp](./test.cpp), we redefine OUT_SIZE to be the sum of output buffer size (in bytes) and the trace buffer size. ```c++ - int OUT_SIZE = INOUT2_SIZE + trace_size; + int OUT_SIZE = IN_SIZE + trace_size; ``` -All subsuquent references to the output buffer size should use `OUT_SIZE`. 
The exception is when we want to verify the output results which should be bounded by the original output buffer size, in this case `INOUT2_VOLUME`. +All subsequent references to the output buffer size should use `OUT_SIZE`. The exception is when we want to verify the output results which should be bounded by the original output buffer size, in this case `IN_SIZE`. Finally, the function to write the trace output to a file as defined in `aie.utils.trace` is `write_out_trace` and we need to pass it the pointer in the output buffer where the trace data begins, the trace buffer size and the trace file name (default is `trace.txt`). ```c++ - test_utils::write_out_trace( - ((char *)bufInOut2) + INOUT2_SIZE, - trace_size, vm["trace_file"].as()); + test_utils::write_out_trace(((char *)bufOut) + IN_SIZE, trace_size, + vm["trace_file"].as()); ``` ### (2b) Python Host code ([test.py](./test.py)) @@ -163,7 +163,7 @@ trace_py: build/final_trace_${data_size}.xclbin build/insts_${data_size}.txt ``` The python equivalent host code performs the same steps as the C/C++ host code as we redefine `OUT_SIZE` to include the `trace_size`. ```python - OUT_SIZE = INOUT1_SIZE + int(opts.trace_size) + OUT_SIZE = INOUT2_SIZE + int(opts.trace_size) ``` During verification, the `output_buffer` excludes the trace data and uses the `read` function as follows: ```python @@ -195,14 +195,21 @@ Open https://ui.perfetto.dev in your browser and then open up the waveform json * Check matching packet IDs for packet-routed flows. The packet flow ID must match the configured ID value in Trace Control 1 register or else the packets don't get routed. ## Exercises -1. Let's give tracing a try. In this directory, we're been examining a design based off the `Vector Scalar Add` example. Run `make trace` to compile the design and generate a trace file and run the `prase_trace.py` script on it to generate the `trace_4b.json` waveform file. Open this in http://ui.perfetto.dev. 
if you zoom into the region of interest with the W and S to zoom in and out respectively and A adn D to pan left and right. You should seem a wave like the following: - - +1. Let's give tracing a try. In this directory, we've been examining a local design based off the `Vector Scalar Mul` example. Run `make trace` to compile the design and generate a trace file and run the `parse_trace.py` script on it to generate the `trace_4b.json` waveform file. Open this in http://ui.perfetto.dev. Zoom into the region of interest using the keyboard shortcut keys W and S to zoom in and out respectively, and A and D to pan left and right. You should see a wave like the following: - Based on this wave, You can mouse over each chunk of continguous data for `PortRunning0` (input dma port) and `PortRunning1` (output dma port). What is the chunk size? How many input and output chunks are there? This shoudl match iteration loop bounds in our exmple design. + -1. **TODO** Additional questions about routing congestion for circuit switch and packet switch routes for trace packets? + Based on this wave, you can mouse over each chunk of contiguous data for `PortRunning0` (input dma port) and `PortRunning1` (output dma port). What is the chunk size? How many input and output chunks are there? This should match the iteration loop bounds in our example design. + Here, we notice a few signals worth mentioning. + * `Event0` - The event marking the beginning of our kernel. See [vector_scalar_mul.cc](./vector_scalar_mul.cc) where we added the function `event0()` before the loop. This is generally a handy thing to do to attach an event to the beginning of our kernel. + * `Event1` - The event marking the end of our kernel. See [vector_scalar_mul.cc](./vector_scalar_mul.cc) where we added the function `event1()` after the loop. Much like event0, attaching event1 to the end of our kernel is also helpful. + * `VectorInstr` - Vector instructions like vector MAC or vector load/store.
Here, we are running a scalar implementation so there are no vector events. + * `PortRunning0` - Mapped to Port 0 which is by default configured to the S2MM0 input (DMA from stream to local memory) + * `PortRunning1` - Mapped to Port 1 which is by default configured to the MM2S0 output (DMA from local memory to stream) + * `LockStall` - Any locks that are stalled in the core + * `LockAcquiresInstr` - Any lock acquire requests + * `LockReleaseInstr` - Any lock release requests ----- [[Prev]](../section-4a) [[Up]](../../section-4) [[Next]](../section-4c) diff --git a/programming_guide/section-4/section-4b/aie2.py b/programming_guide/section-4/section-4b/aie2.py index a629daa0ce..87d4e85d13 100644 --- a/programming_guide/section-4/section-4b/aie2.py +++ b/programming_guide/section-4/section-4b/aie2.py @@ -5,95 +5,89 @@ # # (c) Copyright 2023 AMD Inc. -from aie.dialects.aie import * # primary mlir-aie dialect definitions -from aie.extras.context import mlir_mod_ctx # mlir-aie context +import sys -from aie.dialects.aiex import * # extended mlir-aie dialect definitions -from aie.dialects.scf import * # scf (strcutred control flow) dialect -from aie.extras.dialects.ext import memref, arith # memref and arithmatic dialects +from aie.dialects.aie import * +from aie.dialects.aiex import * +from aie.dialects.scf import * +from aie.extras.context import mlir_mod_ctx import aie.utils.trace as trace_utils -# AI Engine structural design function -def my_first_aie_program(): +def my_vector_scalar(): enableTrace = True trace_size = 8192 - C_sz_in_bytes = 64 * 4 - # Dvice declaration - aie2 device NPU @device(AIEDevice.npu) def device_body(): - # Memref types - memRef_8_ty = T.memref(8, T.i32()) - memRef_16_ty = T.memref(16, T.i32()) - memRef_32_ty = T.memref(32, T.i32()) - memRef_64_ty = T.memref(64, T.i32()) + memRef_ty = T.memref(1024, T.i32()) + + # AIE Core Function declarations + scale_scalar = external_func( + "vector_scalar_mul_aie_scalar", + inputs=[memRef_ty, memRef_ty, 
T.memref(1, T.i32()), T.i32()], + ) # Tile declarations - ComputeTile = tile(0, 2) ShimTile = tile(0, 0) - - # Data movement with object FIFOs - # Input (from shim tile to compute tile) - of_in0 = object_fifo("in0", ShimTile, ComputeTile, 2, memRef_8_ty) - - # Output (from compute tile to shim tile) - of_out0 = object_fifo("out0", ComputeTile, ShimTile, 2, memRef_8_ty) - - # Compute tile body - @core(ComputeTile) + ComputeTile2 = tile(0, 2) + + # AIE-array data movement with object fifos + of_in = object_fifo("in", ShimTile, ComputeTile2, 2, memRef_ty) + of_factor = object_fifo( + "infactor", ShimTile, ComputeTile2, 2, T.memref(1, T.i32()) + ) + of_out = object_fifo("out", ComputeTile2, ShimTile, 2, memRef_ty) + + # Set up compute tiles + # Compute tile 2 + @core(ComputeTile2, "scale.o") def core_body(): - for _ in for_(0xFFFFFFFF): - # for _ in for_(8): - # Acquire input and output object FIFO objects - elem_in = of_in0.acquire(ObjectFifoPort.Consume, 1) - elem_out = of_out0.acquire(ObjectFifoPort.Produce, 1) - - # Core functionality - load, add 1, store - for i in for_(8): - v0 = memref.load(elem_in, [i]) - v1 = arith.addi(v0, arith.constant(1, T.i32())) - memref.store(v1, elem_out, [i]) + # Effective while(1) + for _ in for_(sys.maxsize): + elem_factor = of_factor.acquire(ObjectFifoPort.Consume, 1) + # Number of sub-vector "tile" iterations + for _ in for_(4): + elem_out = of_out.acquire(ObjectFifoPort.Produce, 1) + elem_in = of_in.acquire(ObjectFifoPort.Consume, 1) + call(scale_scalar, [elem_in, elem_out, elem_factor, 1024]) + of_in.release(ObjectFifoPort.Consume, 1) + of_out.release(ObjectFifoPort.Produce, 1) yield_([]) - - # Release input and output object FIFO objects - of_in0.release(ObjectFifoPort.Consume, 1) - of_out0.release(ObjectFifoPort.Produce, 1) + of_factor.release(ObjectFifoPort.Consume, 1) yield_([]) # Set up a circuit-switched flow from core to shim for tracing information if enableTrace: - flow(ComputeTile, WireBundle.Trace, 0, ShimTile, 
WireBundle.DMA, 1) + flow(ComputeTile2, WireBundle.Trace, 0, ShimTile, WireBundle.DMA, 1) # To/from AIE-array data movement - @FuncOp.from_py_func(memRef_64_ty, memRef_64_ty, memRef_64_ty) - def sequence(inTensor, notUsed, outTensor): + tensor_ty = T.memref(4096, T.i32()) + scalar_ty = T.memref(1, T.i32()) + @FuncOp.from_py_func(tensor_ty, scalar_ty, tensor_ty) + def sequence(A, F, C): if enableTrace: trace_utils.configure_simple_tracing_aie2( - ComputeTile, + ComputeTile2, ShimTile, ddr_id=2, size=trace_size, - offset=C_sz_in_bytes, + offset=4096 * 4, # offset in bytes ) - npu_dma_memcpy_nd( - metadata="out0", bd_id=0, mem=outTensor, sizes=[1, 1, 1, 64] - ) - npu_dma_memcpy_nd( - metadata="in0", bd_id=1, mem=inTensor, sizes=[1, 1, 1, 64] - ) + npu_dma_memcpy_nd(metadata="out", bd_id=0, mem=C, sizes=[1, 1, 1, 4096]) + npu_dma_memcpy_nd(metadata="in", bd_id=1, mem=A, sizes=[1, 1, 1, 4096]) + npu_dma_memcpy_nd(metadata="infactor", bd_id=2, mem=F, sizes=[1, 1, 1, 1]) npu_sync(column=0, row=0, direction=0, channel=0) -# Declares that subsequent code is in mlir-aie context with mlir_mod_ctx() as ctx: - my_first_aie_program() # Call design function within the mlir-aie context - res = ctx.module.operation.verify() # Verify mlir context + my_vector_scalar() + res = ctx.module.operation.verify() if res == True: - print(ctx.module) # Print the python-to-mlir conversion + print(ctx.module) else: print(res) diff --git a/programming_guide/section-4/section-4b/test.cpp b/programming_guide/section-4/section-4b/test.cpp index 6f775e5b54..4e27fd8780 100644 --- a/programming_guide/section-4/section-4b/test.cpp +++ b/programming_guide/section-4/section-4b/test.cpp @@ -1,4 +1,4 @@ -//===- test.cpp -------------------------------------------000---*- C++ -*-===// +//===- test.cpp -------------------------------------------------*- C++ -*-===// // // This file is licensed under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. 
@@ -8,59 +8,26 @@ // //===----------------------------------------------------------------------===// -#include #include #include #include #include -#include -#include - -#include "xrt/xrt_bo.h" -#include "xrt/xrt_device.h" -#include "xrt/xrt_kernel.h" #include "test_utils.h" +#include "xrt/xrt_bo.h" #ifndef DATATYPES_USING_DEFINED #define DATATYPES_USING_DEFINED -// ------------------------------------------------------ -// Configure this to match your buffer data type -// ------------------------------------------------------ -using INOUT0_DATATYPE = std::uint32_t; -using INOUT1_DATATYPE = std::uint32_t; -using INOUT2_DATATYPE = std::uint32_t; +using DATATYPE = std::uint32_t; // Configure this to match your buffer data type #endif -namespace po = boost::program_options; +const int scaleFactor = 3; -// ---------------------------------------------------------------------------- -// Verify results (specific to our design example) -// ---------------------------------------------------------------------------- -template -int verify(int CSize, std::vector C, int verbosity) { - int errors = 0; - for (uint32_t i = 0; i < CSize; i++) { - uint32_t ref = i + 2; - if (C[i] != ref) { - std::cout << "Error in output " << C[i] << " != " << ref << std::endl; - errors++; - } else { - if (verbosity > 1) - std::cout << "Correct output " << C[i] << " == " << ref << std::endl; - } - } - return errors; -} +namespace po = boost::program_options; -// ---------------------------------------------------------------------------- -// Main -// ---------------------------------------------------------------------------- int main(int argc, const char *argv[]) { - // ------------------------------------------------------ - // Parse program arguments - // ------------------------------------------------------ + // Program arguments parsing po::options_description desc("Allowed options"); po::variables_map vm; test_utils::add_default_options(desc); @@ -72,31 +39,20 @@ int main(int argc, const 
char *argv[]) { int n_warmup_iterations = vm["warmup"].as(); int trace_size = vm["trace_sz"].as(); - // ------------------------------------------------------ - // Configure this to match your design's buffer size - // ------------------------------------------------------ - int INOUT0_VOLUME = 64; // Input only, 64x uint32_t in this example - int INOUT1_VOLUME = 64; // Not used in this example - int INOUT2_VOLUME = 64; // Output only, 64x uint32_t in this example - - size_t INOUT0_SIZE = INOUT0_VOLUME * sizeof(INOUT0_DATATYPE); - size_t INOUT1_SIZE = INOUT1_VOLUME * sizeof(INOUT1_DATATYPE); - size_t INOUT2_SIZE = INOUT2_VOLUME * sizeof(INOUT2_DATATYPE); - - // TODO Remove trace for now? - size_t OUT_SIZE = INOUT2_SIZE + trace_size; + constexpr bool VERIFY = true; + constexpr int IN_VOLUME = 4096; - srand(time(NULL)); + constexpr int IN_SIZE = IN_VOLUME * sizeof(DATATYPE); + int OUT_SIZE = IN_SIZE + trace_size; // Load instruction sequence std::vector instr_v = test_utils::load_instr_sequence(vm["instr"].as()); + if (verbosity >= 1) std::cout << "Sequence instr count: " << instr_v.size() << "\n"; - // ------------------------------------------------------ - // Get device, load the xclbin & kernel and register them - // ------------------------------------------------------ + // Start the XRT context and load the kernel xrt::device device; xrt::kernel kernel; @@ -104,61 +60,41 @@ int main(int argc, const char *argv[]) { vm["xclbin"].as(), vm["kernel"].as()); - // ------------------------------------------------------ - // Initialize input/ output buffer sizes and sync them - // ------------------------------------------------------ + // set up the buffer objects auto bo_instr = xrt::bo(device, instr_v.size() * sizeof(int), XCL_BO_FLAGS_CACHEABLE, kernel.group_id(0)); - auto bo_inout0 = - xrt::bo(device, INOUT0_SIZE, XRT_BO_FLAGS_HOST_ONLY, kernel.group_id(2)); - auto bo_inout1 = - xrt::bo(device, INOUT1_SIZE, XRT_BO_FLAGS_HOST_ONLY, kernel.group_id(3)); - // Assumes 
trace will only be added to inout2 - auto bo_inout2 = + auto bo_inA = + xrt::bo(device, IN_SIZE, XRT_BO_FLAGS_HOST_ONLY, kernel.group_id(2)); + auto bo_inFactor = xrt::bo(device, 1 * sizeof(DATATYPE), + XRT_BO_FLAGS_HOST_ONLY, kernel.group_id(3)); + auto bo_outC = xrt::bo(device, OUT_SIZE, XRT_BO_FLAGS_HOST_ONLY, kernel.group_id(4)); - // auto bo_trace = - // // xrt::bo(device, trace_size, XRT_BO_FLAGS_HOST_ONLY, - // kernel.group_id(4)); xrt::bo(device, trace_size, - // XRT_BO_FLAGS_HOST_ONLY, kernel.group_id(3)); if (verbosity >= 1) std::cout << "Writing data into buffer objects.\n"; - // Initialize instruction buffer + // Copy instruction stream to xrt buffer object void *bufInstr = bo_instr.map(); memcpy(bufInstr, instr_v.data(), instr_v.size() * sizeof(int)); - // Initialize Inout buffer 0 - INOUT0_DATATYPE *bufInOut0 = bo_inout0.map(); - std::vector AVec(INOUT0_VOLUME); - for (int i = 0; i < INOUT0_VOLUME; i++) - AVec[i] = i + 1; - // AVec.push_back(i + 1); - memcpy(bufInOut0, AVec.data(), (AVec.size() * sizeof(INOUT0_DATATYPE))); - - // Initialize Inout buffer 1 - // INOUT1_DATATYPE *bufInOut1 = bo_inout1.map(); - // std::vector BVec(INOUT1_VOLUME); - // for (int i = 0; i < INOUT1_VOLUME; i++) - // BVec[i] = i + 1 - // //BVec.push_back(i + 1); - // memcpy(bufInOut1, BVec.data(), (BVec.size() * sizeof(INOUT1_DATATYPE))); - - // Initialize Inout buffer 2 - char *bufInOut2 = bo_inout2.map(); - std::vector CVec(INOUT2_VOLUME); - memset(bufInOut2, 0, OUT_SIZE); // Zeroes out INOUT2_VOLUME + trace_size - // memset(bufInOut2, 0, INOUT2_SIZE); // Zeroes out INOUT2_VOLUME + trace_size - - // char *bufTrace = bo_trace.map(); - // memset(bufTrace, 0, trace_size); - - // Sync buffers to update input buffer values + // Initialize buffer bo_inA + DATATYPE *bufInA = bo_inA.map(); + for (int i = 0; i < IN_VOLUME; i++) + bufInA[i] = i + 1; + + // Initialize buffer bo_inFactor + DATATYPE *bufInFactor = bo_inFactor.map(); + *bufInFactor = scaleFactor; + + // Zero out buffer 
bo_outC + DATATYPE *bufOut = bo_outC.map(); + memset(bufOut, 0, OUT_SIZE); + + // sync host to device memories bo_instr.sync(XCL_BO_SYNC_BO_TO_DEVICE); - bo_inout0.sync(XCL_BO_SYNC_BO_TO_DEVICE); - // bo_inout1.sync(XCL_BO_SYNC_BO_TO_DEVICE); - bo_inout2.sync(XCL_BO_SYNC_BO_TO_DEVICE); - // bo_trace.sync(XCL_BO_SYNC_BO_TO_DEVICE); + bo_inA.sync(XCL_BO_SYNC_BO_TO_DEVICE); + bo_inFactor.sync(XCL_BO_SYNC_BO_TO_DEVICE); + bo_outC.sync(XCL_BO_SYNC_BO_TO_DEVICE); // ------------------------------------------------------ // Initialize run configs @@ -179,14 +115,12 @@ int main(int argc, const char *argv[]) { if (verbosity >= 1) std::cout << "Running Kernel.\n"; auto start = std::chrono::high_resolution_clock::now(); - auto run = - kernel(bo_instr, instr_v.size(), bo_inout0, bo_inout1, bo_inout2); - // kernel(bo_instr, instr_v.size(), bo_inout0, bo_trace, bo_inout2); + auto run = kernel(bo_instr, instr_v.size(), bo_inA, bo_inFactor, bo_outC); run.wait(); - // sleep(3); auto stop = std::chrono::high_resolution_clock::now(); - bo_inout2.sync(XCL_BO_SYNC_BO_FROM_DEVICE); - // bo_trace.sync(XCL_BO_SYNC_BO_FROM_DEVICE); + + // Sync device to host memories + bo_outC.sync(XCL_BO_SYNC_BO_FROM_DEVICE); if (iter < n_warmup_iterations) { /* Warmup iterations do not count towards average runtime. */ @@ -194,13 +128,26 @@ int main(int argc, const char *argv[]) { } // Copy output results and verify they are correct - memcpy(CVec.data(), bufInOut2, (CVec.size() * sizeof(INOUT2_DATATYPE))); + // Copy output results and verify they are correct if (do_verify) { if (verbosity >= 1) { std::cout << "Verifying results ..." 
<< std::endl; } auto vstart = std::chrono::system_clock::now(); - errors = verify(INOUT2_VOLUME, CVec, verbosity); + for (uint32_t i = 0; i < IN_VOLUME; i++) { + int32_t ref = bufInA[i] * scaleFactor; + int32_t test = bufOut[i]; + if (test != ref) { + if (verbosity >= 1) + std::cout << "Error in output " << test << " != " << ref + << std::endl; + errors++; + } else { + if (verbosity >= 1) + std::cout << "Correct output " << test << " == " << ref + << std::endl; + } + } auto vstop = std::chrono::system_clock::now(); float vtime = std::chrono::duration_cast(vstop - vstart) @@ -215,7 +162,7 @@ int main(int argc, const char *argv[]) { // Write trace values if trace_size > 0 if (trace_size > 0) { - test_utils::write_out_trace(((char *)bufInOut2) + INOUT2_SIZE, trace_size, + test_utils::write_out_trace(((char *)bufOut) + IN_SIZE, trace_size, vm["trace_file"].as()); } @@ -255,12 +202,15 @@ int main(int argc, const char *argv[]) { std::cout << "Min NPU gflops: " << macs / (1000 * npu_time_max) << std::endl; + // Print Pass/Fail result of our test if (!errors) { - std::cout << "\nPASS!\n\n"; + std::cout << std::endl << "PASS!" << std::endl << std::endl; return 0; } else { - std::cout << "\nError count: " << errors << "\n\n"; - std::cout << "\nFailed.\n\n"; + std::cout << std::endl + << errors << " mismatches." << std::endl + << std::endl; + std::cout << std::endl << "fail." << std::endl << std::endl; return 1; } } diff --git a/programming_guide/section-4/section-4b/test.py b/programming_guide/section-4/section-4b/test.py index a36dc5d5a7..e7f6628ba6 100644 --- a/programming_guide/section-4/section-4b/test.py +++ b/programming_guide/section-4/section-4b/test.py @@ -3,30 +3,20 @@ # Copyright (C) 2024, Advanced Micro Devices, Inc. All rights reserved. 
# SPDX-License-Identifier: MIT -# import argparse import numpy as np import pyxrt as xrt import sys import time +from aie.dialects.aie import * +from aie.dialects.aiex import * +from aie.dialects.scf import * +from aie.extras.context import mlir_mod_ctx +from aie.extras.dialects.ext import memref, arith + import aie.utils.test as test_utils import aie.utils.trace as trace_utils -# ------------------------------------------------------ -# Configure this to match your design's buffer size -# ------------------------------------------------------ -INOUT0_VOLUME = 64 # Input only, 64x uint32_t in this example -INOUT1_VOLUME = 64 # Not used in this example -INOUT2_VOLUME = 64 # Output only, 64x uint32_t in this example - -INOUT0_DATATYPE = np.uint32 -INOUT1_DATATYPE = np.uint32 -INOUT2_DATATYPE = np.uint32 - -INOUT0_SIZE = INOUT0_VOLUME * INOUT0_DATATYPE().itemsize -INOUT1_SIZE = INOUT1_VOLUME * INOUT1_DATATYPE().itemsize -INOUT2_SIZE = INOUT2_VOLUME * INOUT2_DATATYPE().itemsize - def main(opts): @@ -36,6 +26,21 @@ def main(opts): instr_text = [l for l in instr_text if l != ""] instr_v = np.array([int(i, 16) for i in instr_text], dtype=np.uint32) + # ------------------------------------------------------------ + # Configure this to match your design's buffer size and type + # ------------------------------------------------------------ + INOUT0_VOLUME = int(4096) # Input only, 4096x int32_t in this example + INOUT1_VOLUME = int(1) # Input only, 1 int32_t scale factor + INOUT2_VOLUME = int(4096) # Output only, 4096x int32_t in this example + + INOUT0_DATATYPE = np.int32 + INOUT1_DATATYPE = np.int32 + INOUT2_DATATYPE = np.int32 + + INOUT0_SIZE = INOUT0_VOLUME * INOUT0_DATATYPE().itemsize + INOUT1_SIZE = INOUT1_VOLUME * INOUT1_DATATYPE().itemsize + INOUT2_SIZE = INOUT2_VOLUME * INOUT2_DATATYPE().itemsize + OUT_SIZE = INOUT2_SIZE + int(opts.trace_size) # ------------------------------------------------------ @@ -49,7 +54,6 @@ def main(opts): bo_instr = xrt.bo(device,
len(instr_v) * 4, xrt.bo.cacheable, kernel.group_id(0)) bo_inout0 = xrt.bo(device, INOUT0_SIZE, xrt.bo.host_only, kernel.group_id(2)) bo_inout1 = xrt.bo(device, INOUT1_SIZE, xrt.bo.host_only, kernel.group_id(3)) - # bo_inout2 = xrt.bo(device, INOUT2_SIZE, xrt.bo.host_only, kernel.group_id(4)) bo_inout2 = xrt.bo(device, OUT_SIZE, xrt.bo.host_only, kernel.group_id(4)) # Initialize instruction buffer @@ -57,10 +61,10 @@ def main(opts): # Initialize data buffers inout0 = np.arange(1, INOUT0_VOLUME + 1, dtype=INOUT0_DATATYPE) - inout1 = np.zeros(INOUT1_VOLUME, dtype=INOUT1_DATATYPE) - inout2 = np.zeros(INOUT2_VOLUME, dtype=INOUT2_DATATYPE) + scale_factor = np.array([3], dtype=INOUT1_DATATYPE) + inout2 = np.zeros(OUT_SIZE, dtype=np.uint8) bo_inout0.write(inout0, 0) - bo_inout1.write(inout1, 0) + bo_inout1.write(scale_factor, 0) bo_inout2.write(inout2, 0) # Sync buffers to update input buffer values @@ -101,9 +105,8 @@ def main(opts): if opts.verify: if opts.verbosity >= 1: print("Verifying results ...") - ref = np.arange(2, INOUT0_VOLUME + 2, dtype=INOUT0_DATATYPE) + ref = np.arange(1, INOUT0_VOLUME + 1, dtype=INOUT0_DATATYPE) * scale_factor e = np.equal(output_buffer, ref) - # e = np.equal(dput_buffer, ref) errors = errors + np.size(e) - np.count_nonzero(e) # Write trace values if trace_size > 0 @@ -136,5 +139,6 @@ def main(opts): if __name__ == "__main__": - opts = test_utils.parse_args(sys.argv[1:]) + p = test_utils.create_default_argparser() + opts = p.parse_args(sys.argv[1:]) main(opts) diff --git a/programming_guide/section-4/section-4b/vector_scalar_mul.cc b/programming_guide/section-4/section-4b/vector_scalar_mul.cc new file mode 100755 index 0000000000..b47fc34622 --- /dev/null +++ b/programming_guide/section-4/section-4b/vector_scalar_mul.cc @@ -0,0 +1,26 @@ +//===- vector_scaler_mul.cc -------------------------------------*- C++ -*-===// +// +// This file is licensed under the Apache License v2.0 with LLVM Exceptions. 
+// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +// Copyright (C) 2024, Advanced Micro Devices, Inc. +// +//===----------------------------------------------------------------------===// + +#include +#include +#include +#include + +extern "C" { + +void vector_scalar_mul_aie_scalar(int32_t *a, int32_t *c, int32_t *factor, + int32_t N) { + event0(); + for (int i = 0; i < N; i++) { + c[i] = *factor * a[i]; + } + event1(); +} +} // extern "C" diff --git a/programming_guide/section-4/section-4c/README.md b/programming_guide/section-4/section-4c/README.md index d9109bc4e5..81146f42e7 100644 --- a/programming_guide/section-4/section-4c/README.md +++ b/programming_guide/section-4/section-4c/README.md @@ -17,7 +17,7 @@ ----- -Now that we are able to measure the total application time ([section-4a](../section-4a/)) and have examined the kernel performance via tracing ([section-4b](../section-4b)), we will take a closer look at kernel vectorization. We will be using the [vector-scalar multiply example](../../../programming_examples/basic/vector_scalar_mul/) to illustrate kernel vectorization concepts. +Now that we are able to measure the total application time ([section-4a](../section-4a/)) and have examined the kernel performance via tracing ([section-4b](../section-4b)), we will take a closer look at kernel vectorization. We will be using the [vector-scalar multiply example](../../../programming_examples/basic/vector_scalar_mul/) rather than a local copy of that same design to illustrate kernel vectorization concepts. Note that by default, that example design is working with 16-bit data (vs 32-bit of our local examples) and has `vectorized=True`. Go ahead and read the design example summary for [vector-scalar multiply](../../../programming_examples/basic/vector_scalar_mul/) first to get an idea of the different components of this example design. 
Then, let's take a closer look at the kernel source file ([scale.cc](../../../aie_kernels/aie2/scale.cc)). diff --git a/programming_guide/section-4/test.cpp b/programming_guide/section-4/test.cpp deleted file mode 100644 index 2ec8a0d1c3..0000000000 --- a/programming_guide/section-4/test.cpp +++ /dev/null @@ -1,256 +0,0 @@ -//===- test.cpp -------------------------------------------000---*- C++ -*-===// -// -// This file is licensed under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -// Copyright (C) 2023, Advanced Micro Devices, Inc. -// -//===----------------------------------------------------------------------===// - -#include -#include -#include -#include -#include -#include -#include - -#include "xrt/xrt_bo.h" -#include "xrt/xrt_device.h" -#include "xrt/xrt_kernel.h" - -#include "test_utils.h" - -#ifndef DATATYPES_USING_DEFINED -#define DATATYPES_USING_DEFINED -// ------------------------------------------------------ -// Configure this to match your buffer data type -// ------------------------------------------------------ -using INOUT0_DATATYPE = std::uint32_t; -using INOUT1_DATATYPE = std::uint32_t; -using INOUT2_DATATYPE = std::uint32_t; -#endif - -namespace po = boost::program_options; - -// ---------------------------------------------------------------------------- -// Verify results (specific to our design example) -// ---------------------------------------------------------------------------- -template -int verify(int CSize, std::vector C, int verbosity) { - int errors = 0; - for (uint32_t i = 0; i < CSize; i++) { - uint32_t ref = i + 2; - if (C[i] != ref) { - std::cout << "Error in output " << C[i] << " != " << ref << std::endl; - errors++; - } else { - if (verbosity > 1) - std::cout << "Correct output " << C[i] << " == " << ref << std::endl; - } - } - return errors; -} - -// 
---------------------------------------------------------------------------- -// Main -// ---------------------------------------------------------------------------- -int main(int argc, const char *argv[]) { - - // ------------------------------------------------------ - // Parse program arguments - // ------------------------------------------------------ - po::options_description desc("Allowed options"); - po::variables_map vm; - test_utils::add_default_options(desc); - - test_utils::parse_options(argc, argv, desc, vm); - int verbosity = vm["verbosity"].as(); - int do_verify = vm["verify"].as(); - int n_iterations = vm["iters"].as(); - int n_warmup_iterations = vm["warmup"].as(); - int trace_size = vm["trace_sz"].as(); - - // ------------------------------------------------------ - // Configure this to match your design's buffer size - // ------------------------------------------------------ - int INOUT0_VOLUME = 64; // Input only, 64x uint32_t in this example - int INOUT1_VOLUME = 64; // Not used in this example - int INOUT2_VOLUME = 64; // Output only, 64x uint32_t in this example - - size_t INOUT0_SIZE = INOUT0_VOLUME * sizeof(INOUT0_DATATYPE); - size_t INOUT1_SIZE = INOUT1_VOLUME * sizeof(INOUT1_DATATYPE); - size_t INOUT2_SIZE = INOUT2_VOLUME * sizeof(INOUT2_DATATYPE); - - // TODO Remove trace for now? 
- size_t OUT_SIZE = INOUT2_SIZE + trace_size; - - srand(time(NULL)); - - // Load instruction sequence - std::vector instr_v = - test_utils::load_instr_sequence(vm["instr"].as()); - if (verbosity >= 1) - std::cout << "Sequence instr count: " << instr_v.size() << "\n"; - - // ------------------------------------------------------ - // Get device, load the xclbin & kernel and register them - // ------------------------------------------------------ - xrt::device device; - xrt::kernel kernel; - - test_utils::init_xrt_load_kernel(device, kernel, verbosity, - vm["xclbin"].as(), - vm["kernel"].as()); - - // ------------------------------------------------------ - // Initialize input/ output buffer sizes and sync them - // ------------------------------------------------------ - auto bo_instr = xrt::bo(device, instr_v.size() * sizeof(int), - XCL_BO_FLAGS_CACHEABLE, kernel.group_id(0)); - auto bo_inout0 = - xrt::bo(device, INOUT0_SIZE, XRT_BO_FLAGS_HOST_ONLY, kernel.group_id(2)); - auto bo_inout1 = - xrt::bo(device, INOUT1_SIZE, XRT_BO_FLAGS_HOST_ONLY, kernel.group_id(3)); - // Assumes trace will only be added to inout2 - auto bo_inout2 = - xrt::bo(device, OUT_SIZE, XRT_BO_FLAGS_HOST_ONLY, kernel.group_id(4)); - - if (verbosity >= 1) - std::cout << "Writing data into buffer objects.\n"; - - // Initialize instruction buffer - void *bufInstr = bo_instr.map(); - memcpy(bufInstr, instr_v.data(), instr_v.size() * sizeof(int)); - - // Initialize Inout buffer 0 - INOUT0_DATATYPE *bufInOut0 = bo_inout0.map(); - std::vector AVec(INOUT0_VOLUME); - for (int i = 0; i < INOUT0_VOLUME; i++) - AVec[i] = i + 1; - // AVec.push_back(i + 1); - memcpy(bufInOut0, AVec.data(), (AVec.size() * sizeof(INOUT0_DATATYPE))); - - // Initialize Inout buffer 1 - // INOUT1_DATATYPE *bufInOut1 = bo_inout1.map(); - // std::vector BVec(INOUT1_VOLUME); - // for (int i = 0; i < INOUT1_VOLUME; i++) - // BVec[i] = i + 1 - // //BVec.push_back(i + 1); - // memcpy(bufInOut1, BVec.data(), (BVec.size() * 
sizeof(INOUT1_DATATYPE))); - - // Initialize Inout buffer 2 - char *bufInOut2 = bo_inout2.map(); - std::vector CVec(INOUT2_VOLUME); - memset(bufInOut2, 0, OUT_SIZE); // Zeroes out INOUT2_VOLUME + trace_size - - // Sync buffers to update input buffer values - bo_instr.sync(XCL_BO_SYNC_BO_TO_DEVICE); - bo_inout0.sync(XCL_BO_SYNC_BO_TO_DEVICE); - // bo_inout1.sync(XCL_BO_SYNC_BO_TO_DEVICE); - bo_inout2.sync(XCL_BO_SYNC_BO_TO_DEVICE); - - // ------------------------------------------------------ - // Initialize run configs - // ------------------------------------------------------ - unsigned num_iter = n_iterations + n_warmup_iterations; - float npu_time_total = 0; - float npu_time_min = 9999999; - float npu_time_max = 0; - - int errors = 0; - - // ------------------------------------------------------ - // Main run loop - // ------------------------------------------------------ - for (unsigned iter = 0; iter < num_iter; iter++) { - - // Run kernel - if (verbosity >= 1) - std::cout << "Running Kernel.\n"; - auto start = std::chrono::high_resolution_clock::now(); - auto run = - kernel(bo_instr, instr_v.size(), bo_inout0, bo_inout1, bo_inout2); - run.wait(); - auto stop = std::chrono::high_resolution_clock::now(); - bo_inout2.sync(XCL_BO_SYNC_BO_FROM_DEVICE); - - if (iter < n_warmup_iterations) { - /* Warmup iterations do not count towards average runtime. */ - continue; - } - - // Copy output results and verify they are correct - memcpy(CVec.data(), bufInOut2, (CVec.size() * sizeof(INOUT2_DATATYPE))); - if (do_verify) { - if (verbosity >= 1) { - std::cout << "Verifying results ..." << std::endl; - } - auto vstart = std::chrono::system_clock::now(); - errors = verify(INOUT2_VOLUME, CVec, verbosity); - auto vstop = std::chrono::system_clock::now(); - float vtime = - std::chrono::duration_cast(vstop - vstart) - .count(); - if (verbosity >= 1) { - std::cout << "Verify time: " << vtime << "secs." 
<< std::endl; - } - } else { - if (verbosity >= 1) - std::cout << "WARNING: results not verified." << std::endl; - } - - // Write trace values if trace_size > 0 - if (trace_size > 0) { - // test_utils::write_out_trace(((char *)bufInOut2) + INOUT2_SIZE, - // trace_size, - test_utils::write_out_trace(((char *)bufInOut2), trace_size, - vm["trace_file"].as()); - } - - // Accumulate run times - float npu_time = - std::chrono::duration_cast(stop - start) - .count(); - - npu_time_total += npu_time; - npu_time_min = (npu_time < npu_time_min) ? npu_time : npu_time_min; - npu_time_max = (npu_time > npu_time_max) ? npu_time : npu_time_max; - } - - // ------------------------------------------------------ - // Print verification and timing results - // ------------------------------------------------------ - - // TODO - Mac count to guide gflops - float macs = 0; - - std::cout << std::endl - << "Avg NPU time: " << npu_time_total / n_iterations << "us." - << std::endl; - if (macs > 0) - std::cout << "Avg NPU gflops: " - << macs / (1000 * npu_time_total / n_iterations) << std::endl; - - std::cout << std::endl - << "Min NPU time: " << npu_time_min << "us." << std::endl; - if (macs > 0) - std::cout << "Max NPU gflops: " << macs / (1000 * npu_time_min) - << std::endl; - - std::cout << std::endl - << "Max NPU time: " << npu_time_max << "us." << std::endl; - if (macs > 0) - std::cout << "Min NPU gflops: " << macs / (1000 * npu_time_max) - << std::endl; - - if (!errors) { - std::cout << "\nPASS!\n\n"; - return 0; - } else { - std::cout << "\nError count: " << errors << "\n\n"; - std::cout << "\nFailed.\n\n"; - return 1; - } -} diff --git a/programming_guide/section-4/test.py b/programming_guide/section-4/test.py deleted file mode 100644 index 0e82d741cb..0000000000 --- a/programming_guide/section-4/test.py +++ /dev/null @@ -1,132 +0,0 @@ -# test.py -*- Python -*- -# -# Copyright (C) 2024, Advanced Micro Devices, Inc. All rights reserved. 
-# SPDX-License-Identifier: MIT - -import numpy as np -import pyxrt as xrt -import sys -import time - -import aie.utils.test as test_utils - -# ------------------------------------------------------ -# Configure this to match your design's buffer size -# ------------------------------------------------------ -INOUT0_VOLUME = 64 # Input only, 64x uint32_t in this example -INOUT1_VOLUME = 64 # Not used in this example -INOUT2_VOLUME = 64 # Output only, 64x uint32_t in this example - -INOUT0_DATATYPE = np.uint32 -INOUT1_DATATYPE = np.uint32 -INOUT2_DATATYPE = np.uint32 - -INOUT0_SIZE = INOUT0_VOLUME * INOUT0_DATATYPE().itemsize -INOUT1_SIZE = INOUT1_VOLUME * INOUT1_DATATYPE().itemsize -INOUT2_SIZE = INOUT2_VOLUME * INOUT2_DATATYPE().itemsize - - -def main(opts): - - # Load instruction sequence - with open(opts.instr, "r") as f: - instr_text = f.read().split("\n") - instr_text = [l for l in instr_text if l != ""] - instr_v = np.array([int(i, 16) for i in instr_text], dtype=np.uint32) - - OUT_SIZE = INOUT2_SIZE - - # ------------------------------------------------------ - # Get device, load the xclbin & kernel and register them - # ------------------------------------------------------ - (device, kernel) = test_utils.init_xrt_load_kernel(opts) - - # ------------------------------------------------------ - # Initialize input/ output buffer sizes and sync them - # ------------------------------------------------------ - bo_instr = xrt.bo(device, len(instr_v) * 4, xrt.bo.cacheable, kernel.group_id(0)) - bo_inout0 = xrt.bo(device, INOUT0_SIZE, xrt.bo.host_only, kernel.group_id(2)) - bo_inout1 = xrt.bo(device, INOUT1_SIZE, xrt.bo.host_only, kernel.group_id(3)) - # bo_inout2 = xrt.bo(device, INOUT2_SIZE, xrt.bo.host_only, kernel.group_id(4)) - bo_inout2 = xrt.bo(device, OUT_SIZE, xrt.bo.host_only, kernel.group_id(4)) - - # Initialize instruction buffer - bo_instr.write(instr_v, 0) - - # Initialize data buffers - inout0 = np.arange(1, INOUT0_VOLUME + 1, dtype=INOUT0_DATATYPE) 
- inout1 = np.zeros(INOUT1_VOLUME, dtype=INOUT1_DATATYPE) - inout2 = np.zeros(INOUT2_VOLUME, dtype=INOUT2_DATATYPE) - bo_inout0.write(inout0, 0) - bo_inout1.write(inout1, 0) - bo_inout2.write(inout2, 0) - - # Sync buffers to update input buffer values - bo_instr.sync(xrt.xclBOSyncDirection.XCL_BO_SYNC_BO_TO_DEVICE) - bo_inout0.sync(xrt.xclBOSyncDirection.XCL_BO_SYNC_BO_TO_DEVICE) - bo_inout1.sync(xrt.xclBOSyncDirection.XCL_BO_SYNC_BO_TO_DEVICE) - bo_inout2.sync(xrt.xclBOSyncDirection.XCL_BO_SYNC_BO_TO_DEVICE) - - # ------------------------------------------------------ - # Initialize run configs - # ------------------------------------------------------ - num_iter = opts.iters + opts.warmup_iters - npu_time_total = 0 - npu_time_min = 9999999 - npu_time_max = 0 - errors = 0 - - # ------------------------------------------------------ - # Main run loop - # ------------------------------------------------------ - for i in range(num_iter): - # Run kernel - if opts.verbosity >= 1: - print("Running Kernel.") - start = time.time_ns() - h = kernel(bo_instr, len(instr_v), bo_inout0, bo_inout1, bo_inout2) - h.wait() - stop = time.time_ns() - bo_inout2.sync(xrt.xclBOSyncDirection.XCL_BO_SYNC_BO_FROM_DEVICE) - - # Warmup iterations do not count towards average runtime. 
- if i < opts.warmup_iters: - continue - - # Copy output results and verify they are correct - out_size = INOUT2_SIZE - output_buffer = bo_inout2.read(out_size, 0).view(INOUT2_DATATYPE) - if opts.verify: - if opts.verbosity >= 1: - print("Verifying results ...") - ref = np.arange(2, INOUT0_VOLUME + 2, dtype=INOUT0_DATATYPE) - e = np.equal(output_buffer, ref) - errors = errors + np.size(e) - np.count_nonzero(e) - - npu_time = stop - start - npu_time_total = npu_time_total + npu_time - npu_time_min = min(npu_time_min, npu_time) - npu_time_max = max(npu_time_max, npu_time) - - # ------------------------------------------------------ - # Print verification and timing results - # ------------------------------------------------------ - - # TODO - Mac count to guide gflops - - print("\nAvg NPU time: {}us.".format(int((npu_time_total / opts.iters) / 1000))) - print("\nMin NPU time: {}us.".format(int((npu_time_min / opts.iters) / 1000))) - print("\nMax NPU time: {}us.".format(int((npu_time_max / opts.iters) / 1000))) - - if not errors: - print("\nPASS!\n") - exit(0) - else: - print("\nError count: ", errors) - print("\nFailed.\n") - exit(-1) - - -if __name__ == "__main__": - opts = test_utils.parse_args(sys.argv[1:]) - main(opts) diff --git a/programming_guide/section-4/test_trace.cpp b/programming_guide/section-4/test_trace.cpp deleted file mode 100644 index 2ec8a0d1c3..0000000000 --- a/programming_guide/section-4/test_trace.cpp +++ /dev/null @@ -1,256 +0,0 @@ -//===- test.cpp -------------------------------------------000---*- C++ -*-===// -// -// This file is licensed under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -// Copyright (C) 2023, Advanced Micro Devices, Inc. 
-// -//===----------------------------------------------------------------------===// - -#include -#include -#include -#include -#include -#include -#include - -#include "xrt/xrt_bo.h" -#include "xrt/xrt_device.h" -#include "xrt/xrt_kernel.h" - -#include "test_utils.h" - -#ifndef DATATYPES_USING_DEFINED -#define DATATYPES_USING_DEFINED -// ------------------------------------------------------ -// Configure this to match your buffer data type -// ------------------------------------------------------ -using INOUT0_DATATYPE = std::uint32_t; -using INOUT1_DATATYPE = std::uint32_t; -using INOUT2_DATATYPE = std::uint32_t; -#endif - -namespace po = boost::program_options; - -// ---------------------------------------------------------------------------- -// Verify results (specific to our design example) -// ---------------------------------------------------------------------------- -template -int verify(int CSize, std::vector C, int verbosity) { - int errors = 0; - for (uint32_t i = 0; i < CSize; i++) { - uint32_t ref = i + 2; - if (C[i] != ref) { - std::cout << "Error in output " << C[i] << " != " << ref << std::endl; - errors++; - } else { - if (verbosity > 1) - std::cout << "Correct output " << C[i] << " == " << ref << std::endl; - } - } - return errors; -} - -// ---------------------------------------------------------------------------- -// Main -// ---------------------------------------------------------------------------- -int main(int argc, const char *argv[]) { - - // ------------------------------------------------------ - // Parse program arguments - // ------------------------------------------------------ - po::options_description desc("Allowed options"); - po::variables_map vm; - test_utils::add_default_options(desc); - - test_utils::parse_options(argc, argv, desc, vm); - int verbosity = vm["verbosity"].as(); - int do_verify = vm["verify"].as(); - int n_iterations = vm["iters"].as(); - int n_warmup_iterations = vm["warmup"].as(); - int trace_size = 
vm["trace_sz"].as(); - - // ------------------------------------------------------ - // Configure this to match your design's buffer size - // ------------------------------------------------------ - int INOUT0_VOLUME = 64; // Input only, 64x uint32_t in this example - int INOUT1_VOLUME = 64; // Not used in this example - int INOUT2_VOLUME = 64; // Output only, 64x uint32_t in this example - - size_t INOUT0_SIZE = INOUT0_VOLUME * sizeof(INOUT0_DATATYPE); - size_t INOUT1_SIZE = INOUT1_VOLUME * sizeof(INOUT1_DATATYPE); - size_t INOUT2_SIZE = INOUT2_VOLUME * sizeof(INOUT2_DATATYPE); - - // TODO Remove trace for now? - size_t OUT_SIZE = INOUT2_SIZE + trace_size; - - srand(time(NULL)); - - // Load instruction sequence - std::vector instr_v = - test_utils::load_instr_sequence(vm["instr"].as()); - if (verbosity >= 1) - std::cout << "Sequence instr count: " << instr_v.size() << "\n"; - - // ------------------------------------------------------ - // Get device, load the xclbin & kernel and register them - // ------------------------------------------------------ - xrt::device device; - xrt::kernel kernel; - - test_utils::init_xrt_load_kernel(device, kernel, verbosity, - vm["xclbin"].as(), - vm["kernel"].as()); - - // ------------------------------------------------------ - // Initialize input/ output buffer sizes and sync them - // ------------------------------------------------------ - auto bo_instr = xrt::bo(device, instr_v.size() * sizeof(int), - XCL_BO_FLAGS_CACHEABLE, kernel.group_id(0)); - auto bo_inout0 = - xrt::bo(device, INOUT0_SIZE, XRT_BO_FLAGS_HOST_ONLY, kernel.group_id(2)); - auto bo_inout1 = - xrt::bo(device, INOUT1_SIZE, XRT_BO_FLAGS_HOST_ONLY, kernel.group_id(3)); - // Assumes trace will only be added to inout2 - auto bo_inout2 = - xrt::bo(device, OUT_SIZE, XRT_BO_FLAGS_HOST_ONLY, kernel.group_id(4)); - - if (verbosity >= 1) - std::cout << "Writing data into buffer objects.\n"; - - // Initialize instruction buffer - void *bufInstr = bo_instr.map(); - 
memcpy(bufInstr, instr_v.data(), instr_v.size() * sizeof(int)); - - // Initialize Inout buffer 0 - INOUT0_DATATYPE *bufInOut0 = bo_inout0.map(); - std::vector AVec(INOUT0_VOLUME); - for (int i = 0; i < INOUT0_VOLUME; i++) - AVec[i] = i + 1; - // AVec.push_back(i + 1); - memcpy(bufInOut0, AVec.data(), (AVec.size() * sizeof(INOUT0_DATATYPE))); - - // Initialize Inout buffer 1 - // INOUT1_DATATYPE *bufInOut1 = bo_inout1.map(); - // std::vector BVec(INOUT1_VOLUME); - // for (int i = 0; i < INOUT1_VOLUME; i++) - // BVec[i] = i + 1 - // //BVec.push_back(i + 1); - // memcpy(bufInOut1, BVec.data(), (BVec.size() * sizeof(INOUT1_DATATYPE))); - - // Initialize Inout buffer 2 - char *bufInOut2 = bo_inout2.map(); - std::vector CVec(INOUT2_VOLUME); - memset(bufInOut2, 0, OUT_SIZE); // Zeroes out INOUT2_VOLUME + trace_size - - // Sync buffers to update input buffer values - bo_instr.sync(XCL_BO_SYNC_BO_TO_DEVICE); - bo_inout0.sync(XCL_BO_SYNC_BO_TO_DEVICE); - // bo_inout1.sync(XCL_BO_SYNC_BO_TO_DEVICE); - bo_inout2.sync(XCL_BO_SYNC_BO_TO_DEVICE); - - // ------------------------------------------------------ - // Initialize run configs - // ------------------------------------------------------ - unsigned num_iter = n_iterations + n_warmup_iterations; - float npu_time_total = 0; - float npu_time_min = 9999999; - float npu_time_max = 0; - - int errors = 0; - - // ------------------------------------------------------ - // Main run loop - // ------------------------------------------------------ - for (unsigned iter = 0; iter < num_iter; iter++) { - - // Run kernel - if (verbosity >= 1) - std::cout << "Running Kernel.\n"; - auto start = std::chrono::high_resolution_clock::now(); - auto run = - kernel(bo_instr, instr_v.size(), bo_inout0, bo_inout1, bo_inout2); - run.wait(); - auto stop = std::chrono::high_resolution_clock::now(); - bo_inout2.sync(XCL_BO_SYNC_BO_FROM_DEVICE); - - if (iter < n_warmup_iterations) { - /* Warmup iterations do not count towards average runtime. 
*/ - continue; - } - - // Copy output results and verify they are correct - memcpy(CVec.data(), bufInOut2, (CVec.size() * sizeof(INOUT2_DATATYPE))); - if (do_verify) { - if (verbosity >= 1) { - std::cout << "Verifying results ..." << std::endl; - } - auto vstart = std::chrono::system_clock::now(); - errors = verify(INOUT2_VOLUME, CVec, verbosity); - auto vstop = std::chrono::system_clock::now(); - float vtime = - std::chrono::duration_cast(vstop - vstart) - .count(); - if (verbosity >= 1) { - std::cout << "Verify time: " << vtime << "secs." << std::endl; - } - } else { - if (verbosity >= 1) - std::cout << "WARNING: results not verified." << std::endl; - } - - // Write trace values if trace_size > 0 - if (trace_size > 0) { - // test_utils::write_out_trace(((char *)bufInOut2) + INOUT2_SIZE, - // trace_size, - test_utils::write_out_trace(((char *)bufInOut2), trace_size, - vm["trace_file"].as()); - } - - // Accumulate run times - float npu_time = - std::chrono::duration_cast(stop - start) - .count(); - - npu_time_total += npu_time; - npu_time_min = (npu_time < npu_time_min) ? npu_time : npu_time_min; - npu_time_max = (npu_time > npu_time_max) ? npu_time : npu_time_max; - } - - // ------------------------------------------------------ - // Print verification and timing results - // ------------------------------------------------------ - - // TODO - Mac count to guide gflops - float macs = 0; - - std::cout << std::endl - << "Avg NPU time: " << npu_time_total / n_iterations << "us." - << std::endl; - if (macs > 0) - std::cout << "Avg NPU gflops: " - << macs / (1000 * npu_time_total / n_iterations) << std::endl; - - std::cout << std::endl - << "Min NPU time: " << npu_time_min << "us." << std::endl; - if (macs > 0) - std::cout << "Max NPU gflops: " << macs / (1000 * npu_time_min) - << std::endl; - - std::cout << std::endl - << "Max NPU time: " << npu_time_max << "us." 
<< std::endl; - if (macs > 0) - std::cout << "Min NPU gflops: " << macs / (1000 * npu_time_max) - << std::endl; - - if (!errors) { - std::cout << "\nPASS!\n\n"; - return 0; - } else { - std::cout << "\nError count: " << errors << "\n\n"; - std::cout << "\nFailed.\n\n"; - return 1; - } -} diff --git a/programming_guide/section-4/test_trace.py b/programming_guide/section-4/test_trace.py deleted file mode 100644 index b6c0d99c02..0000000000 --- a/programming_guide/section-4/test_trace.py +++ /dev/null @@ -1,142 +0,0 @@ -# test.py -*- Python -*- -# -# Copyright (C) 2024, Advanced Micro Devices, Inc. All rights reserved. -# SPDX-License-Identifier: MIT - -# import argparse -import numpy as np -import pyxrt as xrt -import sys -import time - -import aie.utils.test as test_utils - -# ------------------------------------------------------ -# Configure this to match your design's buffer size -# ------------------------------------------------------ -INOUT0_VOLUME = 64 # Input only, 64x uint32_t in this example -INOUT1_VOLUME = 64 # Not used in this example -INOUT2_VOLUME = 64 # Output only, 64x uint32_t in this example - -INOUT0_DATATYPE = np.uint32 -INOUT1_DATATYPE = np.uint32 -INOUT2_DATATYPE = np.uint32 - -INOUT0_SIZE = INOUT0_VOLUME * INOUT0_DATATYPE().itemsize -INOUT1_SIZE = INOUT1_VOLUME * INOUT1_DATATYPE().itemsize -INOUT2_SIZE = INOUT2_VOLUME * INOUT2_DATATYPE().itemsize - - -def main(opts): - - # Load instruction sequence - with open(opts.instr, "r") as f: - instr_text = f.read().split("\n") - instr_text = [l for l in instr_text if l != ""] - instr_v = np.array([int(i, 16) for i in instr_text], dtype=np.uint32) - - OUT_SIZE = INOUT2_SIZE + opts.trace_size - - # ------------------------------------------------------ - # Get device, load the xclbin & kernel and register them - # ------------------------------------------------------ - (device, kernel) = init_xrt_load_kernel(opts) - - # ------------------------------------------------------ - # Initialize input/ output 
buffer sizes and sync them - # ------------------------------------------------------ - bo_instr = xrt.bo(device, len(instr_v) * 4, xrt.bo.cacheable, kernel.group_id(0)) - bo_inout0 = xrt.bo(device, INOUT0_SIZE, xrt.bo.host_only, kernel.group_id(2)) - bo_inout1 = xrt.bo(device, INOUT1_SIZE, xrt.bo.host_only, kernel.group_id(3)) - # bo_inout2 = xrt.bo(device, INOUT2_SIZE, xrt.bo.host_only, kernel.group_id(4)) - bo_inout2 = xrt.bo(device, OUT_SIZE, xrt.bo.host_only, kernel.group_id(4)) - - # Initialize instruction buffer - bo_instr.write(instr_v, 0) - - # Initialize data buffers - inout0 = np.arange(1, INOUT0_VOLUME + 1, dtype=INOUT0_DATATYPE) - inout1 = np.zeros(INOUT1_VOLUME, dtype=INOUT1_DATATYPE) - inout2 = np.zeros(INOUT2_VOLUME, dtype=INOUT2_DATATYPE) - bo_inout0.write(inout0, 0) - bo_inout1.write(inout1, 0) - bo_inout2.write(inout2, 0) - - # Sync buffers to update input buffer values - bo_instr.sync(xrt.xclBOSyncDirection.XCL_BO_SYNC_BO_TO_DEVICE) - bo_inout0.sync(xrt.xclBOSyncDirection.XCL_BO_SYNC_BO_TO_DEVICE) - bo_inout1.sync(xrt.xclBOSyncDirection.XCL_BO_SYNC_BO_TO_DEVICE) - bo_inout2.sync(xrt.xclBOSyncDirection.XCL_BO_SYNC_BO_TO_DEVICE) - - # ------------------------------------------------------ - # Initialize run configs - # ------------------------------------------------------ - num_iter = opts.iters + opts.warmup_iters - npu_time_total = 0 - npu_time_min = 9999999 - npu_time_max = 0 - errors = 0 - - # ------------------------------------------------------ - # Main run loop - # ------------------------------------------------------ - for i in range(num_iter): - # Run kernel - if opts.verbosity >= 1: - print("Running Kernel.") - start = time.time_ns() - h = kernel(bo_instr, len(instr_v), bo_inout0, bo_inout1, bo_inout2) - h.wait() - stop = time.time_ns() - bo_inout2.sync(xrt.xclBOSyncDirection.XCL_BO_SYNC_BO_FROM_DEVICE) - - # Warmup iterations do not count towards average runtime. 
- if i < opts.warmup_iters: - continue - - # Copy output results and verify they are correct - out_size = INOUT2_SIZE + opts.trace_size - print("out_size:", out_size) - output_buffer = bo_inout2.read(out_size, 0).view(INOUT2_DATATYPE) - dout_buffer = output_buffer[0 : INOUT2_VOLUME - 1] - trace_buffer = output_buffer[INOUT2_VOLUME - 1 :] - if opts.verify: - if opts.verbosity >= 1: - print("Verifying results ...") - ref = np.arange(2, INOUT0_VOLUME + 2, dtype=INOUT0_DATATYPE) - # e = np.equal(output_buffer, ref) - e = np.equal(dput_buffer, ref) - errors = errors + np.size(e) - np.count_nonzero(e) - - # Write trace values if trace_size > 0 - # if opts.trace_size > 0: - # print("Do something with trace!") - # test_utils.write_out_trace(trace_buffer, opts.trace_size, opts.trace_file) - - npu_time = stop - start - npu_time_total = npu_time_total + npu_time - npu_time_min = min(npu_time_min, npu_time) - npu_time_max = max(npu_time_max, npu_time) - - # ------------------------------------------------------ - # Print verification and timing results - # ------------------------------------------------------ - - # TODO - Mac count to guide gflops - - print("\nAvg NPU time: {}us.".format(int((npu_time_total / opts.iters) / 1000))) - print("\nMin NPU time: {}us.".format(int((npu_time_min / opts.iters) / 1000))) - print("\nMax NPU time: {}us.".format(int((npu_time_max / opts.iters) / 1000))) - - if not errors: - print("\nPASS!\n") - exit(0) - else: - print("\nError count: ", errors) - print("\nFailed.\n") - exit(-1) - - -if __name__ == "__main__": - opts = test_utils.parse_args(sys.argv[1:]) - main(opts)