Merge branch 'asplos' into runLitMakefile
jgmelber authored Apr 24, 2024
2 parents 28dee67 + 2cd4290 commit 8b43d91
Showing 65 changed files with 1,004 additions and 279 deletions.
52 changes: 44 additions & 8 deletions aie_kernels/aie2/scale.cc
@@ -19,6 +19,7 @@

#include <aie_api/aie.hpp>

// Scalar scale template
template <typename T>
void scale_scalar(T *a, T *c, T factor, const int32_t N) {
event0();
@@ -28,35 +29,70 @@ void scale_scalar(T *a, T *c, T factor, const int32_t N) {
event1();
}

// Vectorized scale template
// Assume N is a multiple of 32
template <typename T>
void scale_vectorized(T *a, T *c, int32_t factor, const int32_t N) {
event0();
constexpr int vec_factor = 32;
T *__restrict pA1 = a;
T *__restrict pC1 = c;
const int F = N / vec_factor;
T fac = factor;
for (int i = 0; i < F; i++)
chess_prepare_for_pipelining chess_loop_range(16, ) {
aie::vector<T, vec_factor> A0 = aie::load_v<vec_factor>(pA1);
pA1 += vec_factor;
aie::accum<acc32, vec_factor> cout = aie::mul(A0, fac);
aie::store_v(pC1, cout.template to_vector<T>(0));
pC1 += vec_factor;
}
event1();
}

// Vectorized scale template for int32_t (acc64 used)
// Assume N is a multiple of 32
template <>
void scale_vectorized<int32_t>(int32_t *a, int32_t *c, int32_t factor,
const int32_t N) {
event0();
constexpr int vec_factor = 32;
int32_t *__restrict pA1 = a;
int32_t *__restrict pC1 = c;
const int F = N / vec_factor;
for (int i = 0; i < F; i++)
chess_prepare_for_pipelining chess_loop_range(16, ) {
aie::vector<int32_t, vec_factor> A0 = aie::load_v<vec_factor>(pA1);
pA1 += vec_factor;
aie::accum<acc64, vec_factor> cout = aie::mul(A0, factor);
aie::store_v(pC1, cout.template to_vector<int32_t>(0));
pC1 += vec_factor;
}
event1();
}

extern "C" {

// 32-bit datatype
void vector_scalar_mul_int32_scalar(int32_t *a_in, int32_t *c_out,
int32_t *factor, int32_t N) {
scale_scalar<int32_t>(a_in, c_out, *factor, N);
}

void vector_scalar_mul_int32_vector(int32_t *a_in, int32_t *c_out,
int32_t *factor, int32_t N) {
scale_vectorized<int32_t>(a_in, c_out, *factor, N);
}

// 16-bit datatype
void vector_scalar_mul_int16_scalar(int16_t *a_in, int16_t *c_out,
int32_t *factor, int32_t N) {
scale_scalar<int16_t>(a_in, c_out, *factor, N);
}

void vector_scalar_mul_int16_vector(int16_t *a_in, int16_t *c_out,
int32_t *factor, int32_t N) {
scale_vectorized<int16_t>(a_in, c_out, *factor, N);
}

} // extern "C"
2 changes: 1 addition & 1 deletion docs/conferenceDescriptions/asplos24TutorialDescription.md
@@ -16,7 +16,7 @@ This tutorial will cover the following key topics:

Date: Saturday April 27th 2024 (morning)
Location: Hilton La Jolla Torrey Pines, San Diego, California (with ASPLOS’24)
Prerequisite: please bring your laptop, so that you can ssh into our Ryzen AI enabled miniPCs for the hands-on exercises.

### Contents and Timeline (tentative)

75 changes: 75 additions & 0 deletions programming_examples/basic/dma_transpose/CMakeLists.txt
@@ -0,0 +1,75 @@
# This file is licensed under the Apache License v2.0 with LLVM Exceptions.
# See https://llvm.org/LICENSE.txt for license information.
# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
#
# (c) Copyright 2023 Advanced Micro Devices, Inc.

# parameters
# -DBOOST_ROOT: Path to Boost install
# -DXRT_INC_DIR: Full path to src/runtime_src/core/include in XRT cloned repo
# -DXRT_LIB_DIR: Path to xrt_coreutil.lib
# -DTARGET_NAME: Target name to be built

# cmake needs this line
cmake_minimum_required(VERSION 3.1)

set(CMAKE_CXX_STANDARD 23)
set(CMAKE_CXX_STANDARD_REQUIRED YES)

find_program(WSL NAMES powershell.exe)

if (NOT WSL)
set(CMAKE_C_COMPILER gcc-13)
set(CMAKE_CXX_COMPILER g++-13)
set(BOOST_ROOT /usr/include/boost CACHE STRING "Path to Boost install")
set(XRT_INC_DIR /opt/xilinx/xrt/include CACHE STRING "Path to XRT cloned repo")
set(XRT_LIB_DIR /opt/xilinx/xrt/lib CACHE STRING "Path to xrt_coreutil.lib")
else()
set(BOOST_ROOT C:/Technical/thirdParty/boost_1_83_0 CACHE STRING "Path to Boost install")
set(XRT_INC_DIR C:/Technical/XRT/src/runtime_src/core/include CACHE STRING "Path to XRT cloned repo")
set(XRT_LIB_DIR C:/Technical/xrtIPUfromDLL CACHE STRING "Path to xrt_coreutil.lib")
endif()

set(TARGET_NAME test CACHE STRING "Target to be built")

SET (ProjectName proj_${TARGET_NAME})
SET (currentTarget ${TARGET_NAME})

if ( WSL )
set(CMAKE_RUNTIME_OUTPUT_DIRECTORY_RELEASE ${CMAKE_BINARY_DIR})
endif ()

project(${ProjectName})

# Find packages
find_package(Boost REQUIRED)

add_executable(${currentTarget}
${CMAKE_CURRENT_SOURCE_DIR}/../../../runtime_lib/test_lib/test_utils.cpp
test.cpp
)

target_compile_definitions(${currentTarget} PUBLIC DISABLE_ABI_CHECK=1)

target_include_directories (${currentTarget} PUBLIC
${XRT_INC_DIR}
${Boost_INCLUDE_DIRS}
${CMAKE_CURRENT_SOURCE_DIR}/../../../runtime_lib/test_lib
)

target_link_directories(${currentTarget} PUBLIC
${XRT_LIB_DIR}
${Boost_LIBRARY_DIRS}
)

if (NOT WSL)
target_link_libraries(${currentTarget} PUBLIC
xrt_coreutil
boost_program_options
boost_filesystem
)
else()
target_link_libraries(${currentTarget} PUBLIC
xrt_coreutil
)
endif()
51 changes: 51 additions & 0 deletions programming_examples/basic/dma_transpose/Makefile
@@ -0,0 +1,51 @@
##===- Makefile -----------------------------------------------------------===##
#
# This file licensed under the Apache License v2.0 with LLVM Exceptions.
# See https://llvm.org/LICENSE.txt for license information.
# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
#
##===----------------------------------------------------------------------===##

include ../../makefile-common

SHELL := /bin/bash

all: build/final.xclbin build/insts.txt

targetname = dmaTranspose
M ?= 64
K ?= 32

build/aie.mlir: aie2.py
mkdir -p ${@D}
python3 $< ${M} ${K} > $@

.PHONY: inst/insts.txt
inst/insts.txt: aie2.py
rm -rf inst
mkdir -p inst
python3 $< ${LENGTH} > inst/aie.mlir
pushd inst && aiecc.py --aie-only-generate-ipu --ipu-insts-name=insts.txt aie.mlir && popd
${powershell} ./build/${targetname}.exe -x build/final.xclbin -i inst/insts.txt -k MLIR_AIE -l ${LENGTH}

build/final.xclbin: build/aie.mlir
mkdir -p ${@D}
cd ${@D} && aiecc.py --aie-generate-cdo --no-compile-host --xclbin-name=${@F} \
--aie-generate-ipu --ipu-insts-name=insts.txt $(<:%=../%)

${targetname}.exe: test.cpp
rm -rf _build
mkdir -p _build
cd _build && ${powershell} cmake .. -DTARGET_NAME=${targetname}
cd _build && ${powershell} cmake --build . --config Release
ifeq "${powershell}" "powershell.exe"
cp _build/${targetname}.exe $@
else
cp _build/${targetname} $@
endif

run: ${targetname}.exe build/final.xclbin build/insts.txt
${powershell} ./$< -x build/final.xclbin -i build/insts.txt -k MLIR_AIE --M ${M} --K ${K}

clean:
rm -rf build _build inst ${targetname}.exe
25 changes: 25 additions & 0 deletions programming_examples/basic/dma_transpose/README.md
@@ -0,0 +1,25 @@
<!---//===- README.md --------------------------*- Markdown -*-===//
//
// This file is licensed under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
// Copyright (C) 2024, Advanced Micro Devices, Inc.
//
//===----------------------------------------------------------------------===//-->

# <ins> 2-D Array Transpose using AIE DMAs </ins>

This reference design can be run on a Ryzen™ AI NPU.

In the [design](./aie2.py), a 2-D array in row-major layout is read from external memory into `ComputeTile2` with a transposed layout, using an implicit copy through the compute tile's Data Movement Accelerator (DMA). The data is read from and written to external memory through the Shim tile (`col`, 0).

The implicit copy is performed using the `object_fifo_link` operation that specifies how input data arriving via `of_in` should be sent further via `of_out` by specifically leveraging the compute tile's DMA. This operation and its functionality are described in more depth in [Section-2b](../../../programming_guide/section-2/section-2b/README.md/#object-fifo-link) of the programming guide.
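
The transpose comes entirely from the DMA's n-dimensional addressing: the `ipu_dma_memcpy_nd` call for `of_in` walks the row-major `M`x`K` input with `sizes=[1, K, M, 1]` and `strides=[1, 1, K]`, stepping down a column (stride `K`) `M` times before moving on to the next of the `K` columns. As a rough host-side sanity check (not part of the design; plain NumPy and the Makefile's default `M=64`, `K=32` are assumed), the same access pattern can be emulated like this:

```python
import numpy as np

M, K = 64, 32                            # default sizes passed by the Makefile
A = np.arange(M * K, dtype=np.int32)     # flat, row-major M x K input

# Emulate the descriptor sizes=[1, K, M, 1] / strides=[1, 1, K]:
# read one element M times with stride K (a full column), then repeat
# K times with stride 1 (once per column).
out = np.empty(M * K, dtype=np.int32)
i = 0
for k in range(K):          # dim 2: size K, stride 1 -> next column
    for m in range(M):      # dim 1: size M, stride K -> next row, same column
        out[i] = A[m * K + k]
        i += 1

# The stream delivered to ComputeTile2 is the transpose of the input.
assert np.array_equal(out, A.reshape(M, K).T.ravel())
```

Only the addressing changes; the compute tile itself does no work on the data (its core body is an empty loop).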


To compile and run the design for NPU:
```
make
make run
```
66 changes: 66 additions & 0 deletions programming_examples/basic/dma_transpose/aie2.py
@@ -0,0 +1,66 @@
#
# This file is licensed under the Apache License v2.0 with LLVM Exceptions.
# See https://llvm.org/LICENSE.txt for license information.
# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
#
# (c) Copyright 2023 AMD Inc.

import sys

from aie.dialects.aie import *
from aie.dialects.aiex import *
from aie.dialects.scf import *
from aie.extras.dialects.ext import memref, arith
from aie.extras.context import mlir_mod_ctx

N = 4096
M = 64
K = 64

if len(sys.argv) == 3:
M = int(sys.argv[1])
K = int(sys.argv[2])
N = M * K


def my_passthrough():
with mlir_mod_ctx() as ctx:

@device(AIEDevice.ipu)
def device_body():
memRef_ty = T.memref(M, K, T.i32())

# Tile declarations
ShimTile = tile(0, 0)
ComputeTile2 = tile(0, 2)

# AIE-array data movement with object fifos
of_in = object_fifo("in", ShimTile, ComputeTile2, 2, memRef_ty)
of_out = object_fifo("out", ComputeTile2, ShimTile, 2, memRef_ty)
object_fifo_link(of_in, of_out)

# Set up compute tiles

# Compute tile 2
@core(ComputeTile2)
def core_body():
for _ in for_(sys.maxsize):
yield_([])

# To/from AIE-array data movement
tensor_ty = T.memref(N, T.i32())

@FuncOp.from_py_func(tensor_ty, tensor_ty, tensor_ty)
def sequence(A, B, C):
ipu_dma_memcpy_nd(metadata="out", bd_id=0, mem=C, sizes=[1, 1, 1, N])
# The strides below are configured to read across all rows in the same column
# Stride of K in dim/wrap 2 skips an entire row to read a full column
ipu_dma_memcpy_nd(
metadata="in", bd_id=1, mem=A, sizes=[1, K, M, 1], strides=[1, 1, K]
)
ipu_sync(column=0, row=0, direction=0, channel=0)

print(ctx.module)


my_passthrough()
10 changes: 10 additions & 0 deletions programming_examples/basic/dma_transpose/run.lit
@@ -0,0 +1,10 @@
// (c) Copyright 2023 Advanced Micro Devices, Inc.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
// REQUIRES: ryzen_ai, chess
//
// RUN: %python %S/aie2.py 64 32 > ./aie.mlir
// RUN: %python aiecc.py --aie-generate-cdo --aie-generate-ipu --no-compile-host --xclbin-name=aie.xclbin --ipu-insts-name=insts.txt ./aie.mlir
// RUN: clang %S/test.cpp -o test.exe -std=c++11 -Wall %xrt_flags -lrt -lstdc++ -lboost_program_options -lboost_filesystem
// RUN: %run_on_ipu ./test.exe -x aie.xclbin -k MLIR_AIE -i insts.txt --M 64 --K 32 | FileCheck %s
// CHECK: PASS!