Merge branch 'asplos' into runLitMakefile
jgmelber authored Apr 24, 2024
2 parents 28dee67 + 2cd4290 commit 8b43d91
Showing 65 changed files with 1,004 additions and 279 deletions.
52 changes: 44 additions & 8 deletions aie_kernels/aie2/scale.cc
@@ -19,6 +19,7 @@

#include <aie_api/aie.hpp>

// Scalar scale template
template <typename T>
void scale_scalar(T *a, T *c, T factor, const int32_t N) {
event0();
@@ -28,35 +29,70 @@ void scale_scalar(T *a, T *c, T factor, const int32_t N) {
event1();
}

// Vectorized scale template
// Assume N is a multiple of 32
template <typename T>
void scale_vectorized(T *a, T *c, int32_t factor, const int32_t N) {
event0();
constexpr int vec_factor = 32;
T *__restrict pA1 = a;
T *__restrict pC1 = c;
const int F = N / vec_factor;
T fac = factor;
for (int i = 0; i < F; i++)
chess_prepare_for_pipelining chess_loop_range(16, ) {
aie::vector<T, vec_factor> A0 = aie::load_v<vec_factor>(pA1);
pA1 += vec_factor;
aie::accum<acc32, vec_factor> cout = aie::mul(A0, fac);
aie::store_v(pC1, cout.template to_vector<T>(0));
pC1 += vec_factor;
}
event1();
}

// Vectorized scale template for int32_t (acc64 used)
// Assume N is a multiple of 32
template <>
void scale_vectorized<int32_t>(int32_t *a, int32_t *c, int32_t factor,
const int32_t N) {
event0();
constexpr int vec_factor = 32;
int32_t *__restrict pA1 = a;
int32_t *__restrict pC1 = c;
const int F = N / vec_factor;
for (int i = 0; i < F; i++)
chess_prepare_for_pipelining chess_loop_range(16, ) {
aie::vector<int32_t, vec_factor> A0 = aie::load_v<vec_factor>(pA1);
pA1 += vec_factor;
aie::accum<acc64, vec_factor> cout = aie::mul(A0, factor);
aie::store_v(pC1, cout.template to_vector<int32_t>(0));
pC1 += vec_factor;
}
event1();
}

extern "C" {

// 32-bit datatype
void vector_scalar_mul_int32_scalar(int32_t *a_in, int32_t *c_out,
int32_t *factor, int32_t N) {
scale_scalar<int32_t>(a_in, c_out, *factor, N);
}

void vector_scalar_mul_int32_vector(int32_t *a_in, int32_t *c_out,
int32_t *factor, int32_t N) {
scale_vectorized<int32_t>(a_in, c_out, *factor, N);
}

// 16-bit datatype
void vector_scalar_mul_int16_scalar(int16_t *a_in, int16_t *c_out,
int32_t *factor, int32_t N) {
scale_scalar<int16_t>(a_in, c_out, *factor, N);
}

void vector_scalar_mul_int16_vector(int16_t *a_in, int16_t *c_out,
int32_t *factor, int32_t N) {
scale_vectorized<int16_t>(a_in, c_out, *factor, N);
}

} // extern "C"
2 changes: 1 addition & 1 deletion docs/conferenceDescriptions/asplos24TutorialDescription.md
@@ -16,7 +16,7 @@ This tutorial will cover the following key topics:

Date: Saturday April 27th 2024 (morning)
Location: Hilton La Jolla Torrey Pines, San Diego, California (with ASPLOS’24)
Prerequisite: please bring your laptop, so that you can ssh into our Ryzen AI enabled miniPCs for the hands-on exercises.

### Contents and Timeline (tentative)

75 changes: 75 additions & 0 deletions programming_examples/basic/dma_transpose/CMakeLists.txt
@@ -0,0 +1,75 @@
# This file is licensed under the Apache License v2.0 with LLVM Exceptions.
# See https://llvm.org/LICENSE.txt for license information.
# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
#
# (c) Copyright 2023 Advanced Micro Devices, Inc.

# parameters
# -DBOOST_ROOT: Path to Boost install
# -DXRT_INC_DIR: Full path to src/runtime_src/core/include in XRT cloned repo
# -DXRT_LIB_DIR: Path to xrt_coreutil.lib
# -DTARGET_NAME: Target name to be built

# cmake needs this line
cmake_minimum_required(VERSION 3.1)

set(CMAKE_CXX_STANDARD 23)
set(CMAKE_CXX_STANDARD_REQUIRED YES)

find_program(WSL NAMES powershell.exe)

if (NOT WSL)
set(CMAKE_C_COMPILER gcc-13)
set(CMAKE_CXX_COMPILER g++-13)
set(BOOST_ROOT /usr/include/boost CACHE STRING "Path to Boost install")
set(XRT_INC_DIR /opt/xilinx/xrt/include CACHE STRING "Path to XRT cloned repo")
set(XRT_LIB_DIR /opt/xilinx/xrt/lib CACHE STRING "Path to xrt_coreutil.lib")
else()
set(BOOST_ROOT C:/Technical/thirdParty/boost_1_83_0 CACHE STRING "Path to Boost install")
set(XRT_INC_DIR C:/Technical/XRT/src/runtime_src/core/include CACHE STRING "Path to XRT cloned repo")
set(XRT_LIB_DIR C:/Technical/xrtIPUfromDLL CACHE STRING "Path to xrt_coreutil.lib")
endif()

set(TARGET_NAME test CACHE STRING "Target to be built")

SET (ProjectName proj_${TARGET_NAME})
SET (currentTarget ${TARGET_NAME})

if ( WSL )
set(CMAKE_RUNTIME_OUTPUT_DIRECTORY_RELEASE ${CMAKE_BINARY_DIR})
endif ()

project(${ProjectName})

# Find packages
find_package(Boost REQUIRED)

add_executable(${currentTarget}
${CMAKE_CURRENT_SOURCE_DIR}/../../../runtime_lib/test_lib/test_utils.cpp
test.cpp
)

target_compile_definitions(${currentTarget} PUBLIC DISABLE_ABI_CHECK=1)

target_include_directories (${currentTarget} PUBLIC
${XRT_INC_DIR}
${Boost_INCLUDE_DIRS}
${CMAKE_CURRENT_SOURCE_DIR}/../../../runtime_lib/test_lib
)

target_link_directories(${currentTarget} PUBLIC
${XRT_LIB_DIR}
${Boost_LIBRARY_DIRS}
)

if (NOT WSL)
target_link_libraries(${currentTarget} PUBLIC
xrt_coreutil
boost_program_options
boost_filesystem
)
else()
target_link_libraries(${currentTarget} PUBLIC
xrt_coreutil
)
endif()
51 changes: 51 additions & 0 deletions programming_examples/basic/dma_transpose/Makefile
@@ -0,0 +1,51 @@
##===- Makefile -----------------------------------------------------------===##
#
# This file licensed under the Apache License v2.0 with LLVM Exceptions.
# See https://llvm.org/LICENSE.txt for license information.
# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
#
##===----------------------------------------------------------------------===##

include ../../makefile-common

SHELL := /bin/bash

all: build/final.xclbin build/insts.txt

targetname = dmaTranspose
M ?= 64
K ?= 32

build/aie.mlir: aie2.py
mkdir -p ${@D}
python3 $< ${M} ${K} > $@

.PHONY: inst/insts.txt
inst/insts.txt: aie2.py
rm -rf inst
mkdir -p inst
python3 $< ${LENGTH} > inst/aie.mlir
pushd inst && aiecc.py --aie-only-generate-ipu --ipu-insts-name=insts.txt aie.mlir && popd
${powershell} ./build/${targetname}.exe -x build/final.xclbin -i inst/insts.txt -k MLIR_AIE -l ${LENGTH}

build/final.xclbin: build/aie.mlir
mkdir -p ${@D}
cd ${@D} && aiecc.py --aie-generate-cdo --no-compile-host --xclbin-name=${@F} \
--aie-generate-ipu --ipu-insts-name=insts.txt $(<:%=../%)

${targetname}.exe: test.cpp
rm -rf _build
mkdir -p _build
cd _build && ${powershell} cmake .. -DTARGET_NAME=${targetname}
cd _build && ${powershell} cmake --build . --config Release
ifeq "${powershell}" "powershell.exe"
cp _build/${targetname}.exe $@
else
cp _build/${targetname} $@
endif

run: ${targetname}.exe build/final.xclbin build/insts.txt
${powershell} ./$< -x build/final.xclbin -i build/insts.txt -k MLIR_AIE --M ${M} --K ${K}

clean:
rm -rf build _build inst ${targetname}.exe
25 changes: 25 additions & 0 deletions programming_examples/basic/dma_transpose/README.md
@@ -0,0 +1,25 @@
<!---//===- README.md --------------------------*- Markdown -*-===//
//
// This file is licensed under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
// Copyright (C) 2024, Advanced Micro Devices, Inc.
//
//===----------------------------------------------------------------------===//-->

# <ins> 2-D Array Transpose using AIE DMAs </ins>

This reference design can be run on a Ryzen™ AI NPU.

In the [design](./aie2.py), a 2-D array in row-major layout is read from external memory into `ComputeTile2` with a transposed layout, using an implicit copy through the compute tile's Data Movement Accelerator (DMA). The data is read from and written to external memory through the Shim tile (`col`, 0).

The implicit copy is performed using the `object_fifo_link` operation that specifies how input data arriving via `of_in` should be sent further via `of_out` by specifically leveraging the compute tile's DMA. This operation and its functionality are described in more depth in [Section-2b](../../../programming_guide/section-2/section-2b/README.md/#object-fifo-link) of the programming guide.
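
The transpose comes entirely from the DMA's n-dimensional addressing: the `ipu_dma_memcpy_nd` call for `of_in` walks the row-major `M`x`K` input with `sizes=[1, K, M, 1]` and `strides=[1, 1, K]`, stepping down a column (stride `K`) `M` times before moving on to the next of the `K` columns. As a rough host-side sanity check (not part of the design; plain NumPy and the Makefile's default `M=64`, `K=32` are assumed), the same access pattern can be emulated like this:

```python
import numpy as np

M, K = 64, 32                            # default sizes passed by the Makefile
A = np.arange(M * K, dtype=np.int32)     # flat, row-major M x K input

# Emulate the descriptor sizes=[1, K, M, 1] / strides=[1, 1, K]:
# read one element M times with stride K (a full column), then repeat
# K times with stride 1 (once per column).
out = np.empty(M * K, dtype=np.int32)
i = 0
for k in range(K):          # dim 2: size K, stride 1 -> next column
    for m in range(M):      # dim 1: size M, stride K -> next row, same column
        out[i] = A[m * K + k]
        i += 1

# The stream delivered to ComputeTile2 is the transpose of the input.
assert np.array_equal(out, A.reshape(M, K).T.ravel())
```

Only the addressing changes; the compute tile itself does no work on the data (its core body is an empty loop).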


To compile and run the design for NPU:
```
make
make run
```
66 changes: 66 additions & 0 deletions programming_examples/basic/dma_transpose/aie2.py
@@ -0,0 +1,66 @@
#
# This file is licensed under the Apache License v2.0 with LLVM Exceptions.
# See https://llvm.org/LICENSE.txt for license information.
# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
#
# (c) Copyright 2023 AMD Inc.

import sys

from aie.dialects.aie import *
from aie.dialects.aiex import *
from aie.dialects.scf import *
from aie.extras.dialects.ext import memref, arith
from aie.extras.context import mlir_mod_ctx

N = 4096
M = 64
K = 64

if len(sys.argv) == 3:
M = int(sys.argv[1])
K = int(sys.argv[2])
N = M * K


def my_passthrough():
with mlir_mod_ctx() as ctx:

@device(AIEDevice.ipu)
def device_body():
memRef_ty = T.memref(M, K, T.i32())

# Tile declarations
ShimTile = tile(0, 0)
ComputeTile2 = tile(0, 2)

# AIE-array data movement with object fifos
of_in = object_fifo("in", ShimTile, ComputeTile2, 2, memRef_ty)
of_out = object_fifo("out", ComputeTile2, ShimTile, 2, memRef_ty)
object_fifo_link(of_in, of_out)

# Set up compute tiles

# Compute tile 2
@core(ComputeTile2)
def core_body():
for _ in for_(sys.maxsize):
yield_([])

# To/from AIE-array data movement
tensor_ty = T.memref(N, T.i32())

@FuncOp.from_py_func(tensor_ty, tensor_ty, tensor_ty)
def sequence(A, B, C):
ipu_dma_memcpy_nd(metadata="out", bd_id=0, mem=C, sizes=[1, 1, 1, N])
# The strides below are configured to read across all rows in the same column
# Stride of K in dim/wrap 2 skips an entire row to read a full column
ipu_dma_memcpy_nd(
metadata="in", bd_id=1, mem=A, sizes=[1, K, M, 1], strides=[1, 1, K]
)
ipu_sync(column=0, row=0, direction=0, channel=0)

print(ctx.module)


my_passthrough()
10 changes: 10 additions & 0 deletions programming_examples/basic/dma_transpose/run.lit
@@ -0,0 +1,10 @@
// (c) Copyright 2023 Advanced Micro Devices, Inc.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
// REQUIRES: ryzen_ai, chess
//
// RUN: %python %S/aie2.py 64 32 > ./aie.mlir
// RUN: %python aiecc.py --aie-generate-cdo --aie-generate-ipu --no-compile-host --xclbin-name=aie.xclbin --ipu-insts-name=insts.txt ./aie.mlir
// RUN: clang %S/test.cpp -o test.exe -std=c++11 -Wall %xrt_flags -lrt -lstdc++ -lboost_program_options -lboost_filesystem
// RUN: %run_on_ipu ./test.exe -x aie.xclbin -k MLIR_AIE -i insts.txt --M 64 --K 32 | FileCheck %s
// CHECK: PASS!