Commit d8083a2

Revert "Change vec_scal_add examples to vec_scal_mul and cleaned up README references (#1400)"

This reverts commit b9c556b.

jgmelber committed Apr 24, 2024
1 parent f03d0c1 commit d8083a2

Showing 29 changed files with 1,685 additions and 379 deletions.
@@ -100,4 +100,4 @@ clean: clean_trace

.PHONY: clean_trace
clean_trace:
rm -rf tmpTrace parse*.json trace.txt
rm -rf tmpTrace parse*.json
Binary file not shown.
6 changes: 0 additions & 6 deletions programming_guide/quick_reference.md
@@ -49,12 +49,6 @@
| `print(ctx.module)` | Converts our ctx-wrapped structural code to MLIR and prints it to stdout |
| `ctx.module.operation.verify()` | Runs additional structural verification on the Python-bound source code and returns the result to stdout |

## Common AIE API functions for Kernel Programming
| Function Signature | Definition | Parameters | Return Type | Example |
|---------------------|------------|------------|-------------|---------|
| `aie::vector<T, vec_factor> my_vector` | Declare vector type | `T`: data type <br> `vec_factor`: vector width | n/a | aie::vector<int16_t, 32> my_vector; |
| `aie::load_v<vec_factor>(pA1);` | Vector load | `vec_factor`: vector width | `aie::vector` | aie::vector<int16_t, 32> my_vector = aie::load_v<32>(pA1); |
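
These two calls are typically used together in a kernel's inner loop. A minimal sketch of a scalar-multiply kernel built from them (illustrative only; the function and pointer names, the loop structure, and the `aie::mul`/`aie::accum`/`aie::store_v` calls are assumptions drawn from the AIE API documentation linked below, not code from this commit):

```c++
#include <aie_api/aie.hpp>

// Hypothetical kernel: c[i] = a[i] * factor, vectorized 32 lanes at a time.
void vector_scalar_mul_sketch(int16_t *a, int16_t *c, int16_t factor, int n) {
  constexpr int vec_factor = 32;                    // vector width, as in the table
  for (int i = 0; i < n; i += vec_factor) {
    aie::vector<int16_t, vec_factor> my_vector =    // declare vector type
        aie::load_v<vec_factor>(a);                 // vector load
    aie::accum<acc32, vec_factor> acc = aie::mul(my_vector, factor);
    aie::store_v(c, acc.to_vector<int16_t>(0));     // store result (no shift)
    a += vec_factor;
    c += vec_factor;
  }
}
```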

## Helpful AI Engine Architecture References and Tables
* [AIE2 - Table of supported data types and vector sizes (AIE API)](https://www.xilinx.com/htmldocs/xilinx2023_2/aiengine_api/aie_api/doc/group__group__basic__types.html)

2 changes: 1 addition & 1 deletion programming_guide/section-1/Makefile
@@ -6,7 +6,7 @@
#
##===----------------------------------------------------------------------===##

include ../../programming_examples/makefile-common
include ../../tutorials/makefile-common

build/aie.mlir: aie2.py
mkdir -p ${@D}
6 changes: 1 addition & 5 deletions programming_guide/section-3/Makefile
@@ -12,15 +12,11 @@ all: build/final.xclbin build/insts.txt

targetname = vectorScalar

build/aie.mlir: aie2.py
mkdir -p ${@D}
python3 $< > $@

build/scale.o: vector_scalar_mul.cc
mkdir -p ${@D}
cd ${@D} && xchesscc_wrapper ${CHESSCCWRAP2_FLAGS} -c $(<:%=../%) -o ${@F}

build/final.xclbin: build/aie.mlir build/scale.o
build/final.xclbin: aie.mlir build/kernel1.o build/kernel2.o build/kernel3.o
mkdir -p ${@D}
cd ${@D} && aiecc.py --aie-generate-cdo --no-compile-host --xclbin-name=${@F} \
--aie-generate-npu --npu-insts-name=insts.txt $(<:%=../%)
1 change: 1 addition & 0 deletions programming_guide/section-3/README.md
@@ -149,6 +149,7 @@ To compile the design and C++ testbench:
```sh
make
make build/vectorScalar.exe
```

To run the design:
14 changes: 11 additions & 3 deletions programming_guide/section-3/test.cpp
@@ -34,10 +34,13 @@ int main(int argc, const char *argv[]) {

test_utils::parse_options(argc, argv, desc, vm);
int verbosity = vm["verbosity"].as<int>();
int trace_size = vm["trace_sz"].as<int>();

constexpr bool VERIFY = true;
constexpr bool ENABLE_TRACING = false;
// constexpr int TRACE_SIZE = 8192;
constexpr int IN_SIZE = 4096;
constexpr int OUT_SIZE = IN_SIZE;
constexpr int OUT_SIZE = ENABLE_TRACING ? IN_SIZE + trace_size / 4 : IN_SIZE;

// Load instruction sequence
std::vector<uint32_t> instr_v =
@@ -61,7 +64,7 @@ int main(int argc, const char *argv[]) {
XRT_BO_FLAGS_HOST_ONLY, kernel.group_id(2));
auto bo_inFactor = xrt::bo(device, 1 * sizeof(DATATYPE),
XRT_BO_FLAGS_HOST_ONLY, kernel.group_id(3));
auto bo_outC = xrt::bo(device, OUT_SIZE * sizeof(DATATYPE),
auto bo_outC = xrt::bo(device, OUT_SIZE * sizeof(DATATYPE) + trace_size,
XRT_BO_FLAGS_HOST_ONLY, kernel.group_id(4));

if (verbosity >= 1)
@@ -82,7 +85,7 @@

// Zero out buffer bo_outC
DATATYPE *bufOut = bo_outC.map<DATATYPE *>();
memset(bufOut, 0, OUT_SIZE * sizeof(DATATYPE));
memset(bufOut, 0, OUT_SIZE * sizeof(DATATYPE) + trace_size);

// sync host to device memories
bo_instr.sync(XCL_BO_SYNC_BO_TO_DEVICE);
@@ -117,6 +120,11 @@
}
}

if (trace_size > 0) {
test_utils::write_out_trace(((char *)bufOut) + (IN_SIZE * sizeof(DATATYPE)),
trace_size, vm["trace_file"].as<std::string>());
}

// Print Pass/Fail result of our test
if (!errors) {
std::cout << std::endl << "PASS!" << std::endl << std::endl;
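The test.cpp changes above pack the trace stream into the tail of the output buffer: `bo_outC` is allocated with `trace_size` extra bytes, and `write_out_trace` reads from offset `IN_SIZE * sizeof(DATATYPE)`. A small standalone sketch of that layout (the `trace_size` value is an illustrative assumption, not the commit's default):

```c++
#include <cstddef>
#include <cstdint>
#include <cstdio>

int main() {
  using DATATYPE = std::int32_t;
  constexpr int IN_SIZE = 4096;  // result elements produced by the kernel
  const int trace_size = 8192;   // trace bytes requested via --trace_sz (assumed value)

  // The host allocates one buffer holding the results followed by the trace.
  const std::size_t data_bytes = IN_SIZE * sizeof(DATATYPE);  // verified output region
  const std::size_t total_bytes = data_bytes + trace_size;    // bo_outC allocation size

  std::printf("results: bytes [0, %zu)\n", data_bytes);
  std::printf("trace  : bytes [%zu, %zu) -> written out when trace_size > 0\n",
              data_bytes, total_bytes);
  return 0;
}
```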
8 changes: 7 additions & 1 deletion programming_guide/section-3/test.py
@@ -15,6 +15,7 @@
from aie.extras.dialects.ext import memref, arith

import aie.utils.test as test_utils
import aie.utils.trace as trace_utils


def main(opts):
@@ -40,7 +41,7 @@ def main(opts):
INOUT1_SIZE = INOUT1_VOLUME * INOUT1_DATATYPE().itemsize
INOUT2_SIZE = INOUT2_VOLUME * INOUT2_DATATYPE().itemsize

OUT_SIZE = INOUT2_SIZE
OUT_SIZE = INOUT2_SIZE + int(opts.trace_size)

# ------------------------------------------------------
# Get device, load the xclbin & kernel and register them
@@ -98,6 +99,11 @@
e = np.equal(output_buffer, ref)
errors = errors + np.size(e) - np.count_nonzero(e)

# Write trace values if trace_size > 0
if opts.trace_size > 0:
trace_buffer = entire_buffer[INOUT2_VOLUME:]
trace_utils.write_out_trace(trace_buffer, str(opts.trace_file))

# ------------------------------------------------------
# Print verification and timing results
# ------------------------------------------------------
70 changes: 70 additions & 0 deletions programming_guide/section-4/CMakeLists.txt
@@ -0,0 +1,70 @@
# This file is licensed under the Apache License v2.0 with LLVM Exceptions.
# See https://llvm.org/LICENSE.txt for license information.
# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
#
# (c) Copyright 2024 Advanced Micro Devices, Inc.

# parameters
# -DBOOST_ROOT: Path to Boost install
# -DXRT_INC_DIR: Full path to src/runtime_src/core/include in XRT cloned repo
# -DXRT_LIB_DIR: Path to xrt_coreutil.lib
# -DTARGET_NAME: Target name to be built

# cmake needs this line
cmake_minimum_required(VERSION 3.1)

find_program(WSL NAMES powershell.exe)

if (NOT WSL)
set(BOOST_ROOT /usr/include/boost CACHE STRING "Path to Boost install")
set(XRT_INC_DIR /opt/xilinx/xrt/include CACHE STRING "Path to XRT cloned repo")
set(XRT_LIB_DIR /opt/xilinx/xrt/lib CACHE STRING "Path to xrt_coreutil.lib")
else()
set(BOOST_ROOT C:/Technical/thirdParty/boost_1_83_0 CACHE STRING "Path to Boost install")
set(XRT_INC_DIR C:/Technical/XRT/src/runtime_src/core/include CACHE STRING "Path to XRT cloned repo")
set(XRT_LIB_DIR C:/Technical/xrtNPUfromDLL CACHE STRING "Path to xrt_coreutil.lib")
endif()

set(TARGET_NAME test CACHE STRING "Target to be built")

SET (ProjectName ${TARGET_NAME})
SET (currentTarget ${TARGET_NAME})

if ( WSL )
set(CMAKE_RUNTIME_OUTPUT_DIRECTORY_RELEASE ${CMAKE_BINARY_DIR})
endif ()

project(${ProjectName})

# Find packages
find_package(Boost REQUIRED)

add_executable(${currentTarget}
${CMAKE_CURRENT_SOURCE_DIR}/../../runtime_lib/test_lib/test_utils.cpp
test.cpp
)

target_compile_definitions(${currentTarget} PUBLIC DISABLE_ABI_CHECK=1)

target_include_directories (${currentTarget} PUBLIC
${XRT_INC_DIR}
${Boost_INCLUDE_DIRS}
${CMAKE_CURRENT_SOURCE_DIR}/../../runtime_lib/test_lib
)

target_link_directories(${currentTarget} PUBLIC
${XRT_LIB_DIR}
${Boost_LIBRARY_DIRS}
)

if (NOT WSL)
target_link_libraries(${currentTarget} PUBLIC
xrt_coreutil
boost_program_options
boost_filesystem
)
else()
target_link_libraries(${currentTarget} PUBLIC
xrt_coreutil
)
endif()
74 changes: 74 additions & 0 deletions programming_guide/section-4/aie2.py
@@ -0,0 +1,74 @@
#
# This file is licensed under the Apache License v2.0 with LLVM Exceptions.
# See https://llvm.org/LICENSE.txt for license information.
# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
#
# (c) Copyright 2023 AMD Inc.

from aie.dialects.aie import * # primary mlir-aie dialect definitions
from aie.extras.context import mlir_mod_ctx # mlir-aie context

from aie.dialects.aiex import * # extended mlir-aie dialect definitions
from aie.dialects.scf import * # scf (structured control flow) dialect
from aie.extras.dialects.ext import memref, arith # memref and arithmetic dialects


# AI Engine structural design function
def my_first_aie_program():

# Device declaration - aie2 device NPU
@device(AIEDevice.npu)
def device_body():
# Memref types
memRef_8_ty = T.memref(8, T.i32())
memRef_16_ty = T.memref(16, T.i32())
memRef_32_ty = T.memref(32, T.i32())
memRef_64_ty = T.memref(64, T.i32())

# Tile declarations
ComputeTile = tile(0, 2)
ShimTile = tile(0, 0)

# Data movement with object FIFOs
# Input (from shim tile to compute tile)
of_in0 = object_fifo("in0", ShimTile, ComputeTile, 2, memRef_8_ty)

# Output (from compute tile to shim tile)
of_out0 = object_fifo("out0", ComputeTile, ShimTile, 2, memRef_8_ty)

# Compute tile body
@core(ComputeTile)
def core_body():
for _ in for_(8):
# Acquire input and output object FIFO objects
elem_in = of_in0.acquire(ObjectFifoPort.Consume, 1)
elem_out = of_out0.acquire(ObjectFifoPort.Produce, 1)

# Core functionality - load, add 1, store
for i in for_(8):
v0 = memref.load(elem_in, [i])
v1 = arith.addi(v0, arith.constant(1, T.i32()))
memref.store(v1, elem_out, [i])
yield_([])

# Release input and output object FIFO objects
of_in0.release(ObjectFifoPort.Consume, 1)
of_out0.release(ObjectFifoPort.Produce, 1)
yield_([])

# To/from AIE-array data movement
@FuncOp.from_py_func(memRef_64_ty, memRef_64_ty, memRef_64_ty)
def sequence(inTensor, unused, outTensor):
npu_dma_memcpy_nd(
metadata="out0", bd_id=0, mem=outTensor, sizes=[1, 1, 1, 64]
)
npu_dma_memcpy_nd(
metadata="in0", bd_id=1, mem=inTensor, sizes=[1, 1, 1, 64]
)
npu_sync(column=0, row=0, direction=0, channel=0)


# Declares that subsequent code is in mlir-aie context
with mlir_mod_ctx() as ctx:
my_first_aie_program() # Call design function within the mlir-aie context
print(ctx.module) # Print the python-to-mlir conversion
10 changes: 3 additions & 7 deletions programming_guide/section-4/section-4a/Makefile
@@ -16,14 +16,10 @@ build/aie.mlir: aie2.py
mkdir -p ${@D}
python3 $< > $@

build/scale.o: vector_scalar_mul.cc
build/final.xclbin: build/aie.mlir
mkdir -p ${@D}
cd ${@D} && xchesscc_wrapper ${CHESSCCWRAP2_FLAGS} -c $(<:%=../%) -o ${@F}

build/final.xclbin: build/aie.mlir build/scale.o
mkdir -p ${@D}
cd ${@D} && aiecc.py --aie-generate-cdo --no-compile-host --xclbin-name=${@F} \
--aie-generate-npu --npu-insts-name=insts.txt $(<:%=../%)
cd ${@D} && aiecc.py --aie-generate-cdo --aie-generate-npu --no-compile-host \
--xclbin-name=${@F} --npu-insts-name=insts.txt ${<F}

${targetname}.exe: test.cpp
rm -rf _build
5 changes: 4 additions & 1 deletion programming_guide/section-4/section-4a/README.md
@@ -24,7 +24,7 @@ Adding the application timer is as simple as noting a start and stop time surrou

```c++
auto start = std::chrono::high_resolution_clock::now();
auto run = kernel(bo_instr, instr_v.size(), bo_inA, bo_inFactor, bo_outC);
auto run = kernel(bo_instr, instr_v.size(), bo_inout0, bo_inout1, bo_inout2);
run.wait();
auto stop = std::chrono::high_resolution_clock::now();

@@ -78,6 +78,9 @@ We can then compute and print the actual average, minimum and maximum run times.
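
A minimal sketch of that accumulation, computing average, minimum and maximum over a fixed number of iterations (the `num_iter` value and the commented-out kernel call are placeholders; only the `std::chrono` pattern mirrors the snippet above):

```c++
#include <algorithm>
#include <chrono>
#include <cstdio>

int main() {
  constexpr int num_iter = 10;  // placeholder iteration count
  float npu_time_total = 0, npu_time_min = 9999999, npu_time_max = 0;

  for (int iter = 0; iter < num_iter; iter++) {
    auto start = std::chrono::high_resolution_clock::now();
    // auto run = kernel(bo_instr, instr_v.size(), bo_inout0, bo_inout1, bo_inout2);
    // run.wait();  // placeholder for the NPU call being timed
    auto stop = std::chrono::high_resolution_clock::now();

    float npu_time =
        std::chrono::duration_cast<std::chrono::microseconds>(stop - start).count();
    npu_time_total += npu_time;
    npu_time_min = std::min(npu_time_min, npu_time);
    npu_time_max = std::max(npu_time_max, npu_time);
  }

  std::printf("Avg NPU time: %.1f us\n", npu_time_total / num_iter);
  std::printf("Min NPU time: %.1f us\n", npu_time_min);
  std::printf("Max NPU time: %.1f us\n", npu_time_max);
  return 0;
}
```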

1. Let's set our iterations to 10 and run again with `make run` which recompiles our host code for `test.cpp`. What reported Avg NPU time do you see this time? <img src="../../../mlir_tutorials/images/answer1.jpg" title="Answer can be anywhere from 430-480us but is likely different than before" height=25>

1. Let's change our design and increase the loop size of our kernel by a factor of 10. This involves changing the outer loop from 8 to 80. What reported times do you see now? <img src="../../../mlir_tutorials/images/answer1.jpg" title="? us" height=25>


-----
[[Up]](../../section-4) [[Next]](../section-4b)
