Skip to content

Commit

Permalink
Major update to section-4, especially 4c
Browse files Browse the repository at this point in the history
  • Loading branch information
jackl-xilinx committed Apr 24, 2024
1 parent d326765 commit 6c69e17
Show file tree
Hide file tree
Showing 27 changed files with 382 additions and 173 deletions.
58 changes: 49 additions & 9 deletions aie_kernels/aie2/scale.cc
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,7 @@

#include <aie_api/aie.hpp>

// Scalar scale template
template <typename T>
void scale_scalar(T *a, T *c, T factor, const int32_t N) {
event0();
Expand All @@ -28,35 +29,74 @@ void scale_scalar(T *a, T *c, T factor, const int32_t N) {
event1();
}


// Vectorized scale template
// Assume N is a multiple of vec_factor (32)
template <typename T>
void scale_vectorized(T *a, T *c, T factor, const int32_t N) {
constexpr int vec_factor = 16;
void scale_vectorized(T *a, T *c, int32_t factor, const int32_t N) {
event0();
constexpr int vec_factor = 32;
T *__restrict pA1 = a;
T *__restrict pC1 = c;
const int F = N / vec_factor;
T fac = factor;
for (int i = 0; i < F; i++)
chess_prepare_for_pipelining chess_loop_range(16, ) {
chess_prepare_for_pipelining chess_loop_range(16, )
{
aie::vector<T, vec_factor> A0 = aie::load_v<vec_factor>(pA1);
pA1 += vec_factor;
aie::accum<acc32, vec_factor> cout = aie::mul(A0, fac);
aie::store_v(pC1, cout.template to_vector<T>(0));
pC1 += vec_factor;
}
event1();
}

// Vectorized scale template for int32_t (acc64 used)
// Assume N is a multiple of vec_factor (32)
template <>
void scale_vectorized<int32_t>(int32_t *a, int32_t *c, int32_t factor, const int32_t N) {
event0();
constexpr int vec_factor = 32;
int32_t *__restrict pA1 = a;
int32_t *__restrict pC1 = c;
const int F = N / vec_factor;
for (int i = 0; i < F; i++)
chess_prepare_for_pipelining chess_loop_range(16, )
{
aie::vector<int32_t, vec_factor> A0 = aie::load_v<vec_factor>(pA1);
pA1 += vec_factor;
aie::accum<acc64, vec_factor> cout = aie::mul(A0, factor);
aie::store_v(pC1, cout.to_vector<T>(0));
aie::store_v(pC1, cout.template to_vector<int32_t>(0));
pC1 += vec_factor;
}
}
event1();
}


extern "C" {

void vector_scalar_mul_aie(int32_t *a_in, int32_t *c_out, int32_t *factor,
int32_t N) {
// 32-bit datatype
void vector_scalar_mul_int32_scalar(int32_t *a_in, int32_t *c_out,
int32_t *factor, int32_t N) {
scale_scalar<int32_t>(a_in, c_out, *factor, N);
}

void vector_scalar_mul_int32_vector(int32_t *a_in, int32_t *c_out,
int32_t *factor, int32_t N) {
scale_vectorized<int32_t>(a_in, c_out, *factor, N);
}

void vector_scalar_mul_aie_scalar(int32_t *a_in, int32_t *c_out,
// 16-bit datatype
void vector_scalar_mul_int16_scalar(int16_t *a_in, int16_t *c_out,
int32_t *factor, int32_t N) {
scale_scalar<int32_t>(a_in, c_out, *factor, N);
scale_scalar<int16_t>(a_in, c_out, *factor, N);
}

void vector_scalar_mul_int16_vector(int16_t *a_in, int16_t *c_out,
int32_t *factor, int32_t N) {
scale_vectorized<int16_t>(a_in, c_out, *factor, N);
}


} // extern "C"
17 changes: 12 additions & 5 deletions programming_examples/basic/matrix_multiplication/makefile-common
Original file line number Diff line number Diff line change
Expand Up @@ -38,6 +38,8 @@ M?=512
K?=512
N?=512

trace_size=16384

mlir_target?=build/aie_${M}x${K}x${N}.mlir
xclbin_target?=build/final_${M}x${K}x${N}.xclbin
insts_target?=build/insts_${M}x${K}x${N}.txt
Expand Down Expand Up @@ -83,14 +85,19 @@ run: ${targetname}.exe ${xclbin_target} ${insts_target} #sign
export XRT_HACK_UNSECURE_LOADING_XCLBIN=1 && \
${powershell} ./$< -x ${xclbin_target} -i ${insts_target} -k MLIR_AIE -M $M -K $K -N $N ${runargs}

.PHONY: clean
clean:
rm -rf build _build ${targetname}.exe
trace: ${targetname}.exe ${xclbin_target} ${insts_target} # sign
export XRT_HACK_UNSECURE_LOADING_XCLBIN=1 && \
${powershell} ./$< -x ${xclbin_target} -i ${insts_target} -k MLIR_AIE -M $M -K $K -N $N -v 1 --warmup 0 --iters 1 -t ${trace_size}
../../../utils/parse_trace.py --filename trace.txt --mlir ${mlir_target} --colshift 1 > parse_trace_mm.json

.PHONY: parse_trace
parse_trace:
../../../utils/parse_eventIR.py --filename trace.txt --mlir ./build/aie.mlir --colshift 1 > trace_eventIR.json
../../../utils/parse_trace.py --filename trace.txt --mlir ${mlir_target} --colshift 1 > parse_trace_mm.json

.PHONY: clean
clean: clean_trace
rm -rf build _build ${targetname}.exe

.PHONY: clean_trace
clean_trace:
rm -rf tmpTrace trace_eventIR.json
rm -rf tmpTrace parse*.json
Original file line number Diff line number Diff line change
Expand Up @@ -25,7 +25,7 @@ def my_matmul():
word_size_out = 2

vectorized = True
enable_tracing = False
enable_tracing = True
trace_size = 16384

A_sz_in_i32s = M * K * word_size_in // 4
Expand Down
100 changes: 52 additions & 48 deletions programming_examples/basic/vector_scalar_add/aie2.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,63 +13,67 @@


def my_vector_bias_add():
with mlir_mod_ctx() as ctx:

@device(AIEDevice.ipu)
def device_body():
memRef_16_ty = T.memref(16, T.i32())
memRef_8_ty = T.memref(8, T.i32())
@device(AIEDevice.ipu)
def device_body():
memRef_16_ty = T.memref(16, T.i32())
memRef_8_ty = T.memref(8, T.i32())

# Tile declarations
ShimTile = tile(0, 0)
MemTile = tile(0, 1)
ComputeTile2 = tile(0, 2)
# Tile declarations
ShimTile = tile(0, 0)
MemTile = tile(0, 1)
ComputeTile2 = tile(0, 2)

# AIE-array data movement with object fifos
# Input
of_in0 = object_fifo("in0", ShimTile, MemTile, 2, memRef_16_ty)
of_in1 = object_fifo("in1", MemTile, ComputeTile2, 2, memRef_8_ty)
object_fifo_link(of_in0, of_in1)
# AIE-array data movement with object fifos
# Input
of_in0 = object_fifo("in0", ShimTile, MemTile, 2, memRef_16_ty)
of_in1 = object_fifo("in1", MemTile, ComputeTile2, 2, memRef_8_ty)
object_fifo_link(of_in0, of_in1)

# Output
of_out0 = object_fifo("out0", MemTile, ShimTile, 2, memRef_16_ty)
of_out1 = object_fifo("out1", ComputeTile2, MemTile, 2, memRef_8_ty)
object_fifo_link(of_out1, of_out0)
# Output
of_out0 = object_fifo("out0", MemTile, ShimTile, 2, memRef_16_ty)
of_out1 = object_fifo("out1", ComputeTile2, MemTile, 2, memRef_8_ty)
object_fifo_link(of_out1, of_out0)

# Set up compute tiles
# Set up compute tiles

# Compute tile 2
@core(ComputeTile2)
def core_body():
# Effective while(1)
for _ in for_(8):
elem_in = of_in1.acquire(ObjectFifoPort.Consume, 1)
elem_out = of_out1.acquire(ObjectFifoPort.Produce, 1)
for i in for_(8):
v0 = memref.load(elem_in, [i])
v1 = arith.addi(v0, arith.constant(1, T.i32()))
memref.store(v1, elem_out, [i])
yield_([])
of_in1.release(ObjectFifoPort.Consume, 1)
of_out1.release(ObjectFifoPort.Produce, 1)
# Compute tile 2
@core(ComputeTile2)
def core_body():
# Effective while(1)
for _ in for_(8):
elem_in = of_in1.acquire(ObjectFifoPort.Consume, 1)
elem_out = of_out1.acquire(ObjectFifoPort.Produce, 1)
for i in for_(8):
v0 = memref.load(elem_in, [i])
v1 = arith.addi(v0, arith.constant(1, T.i32()))
memref.store(v1, elem_out, [i])
yield_([])
of_in1.release(ObjectFifoPort.Consume, 1)
of_out1.release(ObjectFifoPort.Produce, 1)
yield_([])

# To/from AIE-array data movement
# To/from AIE-array data movement

memRef_64_ty = T.memref(64, T.i32())
memRef_32_ty = T.memref(32, T.i32())
memRef_64_ty = T.memref(64, T.i32())
memRef_32_ty = T.memref(32, T.i32())

@FuncOp.from_py_func(memRef_64_ty, memRef_32_ty, memRef_64_ty)
def sequence(inTensor, notUsed, outTensor):
ipu_dma_memcpy_nd(
metadata="out0", bd_id=0, mem=outTensor, sizes=[1, 1, 1, 64]
)
ipu_dma_memcpy_nd(
metadata="in0", bd_id=1, mem=inTensor, sizes=[1, 1, 1, 64]
)
ipu_sync(column=0, row=0, direction=0, channel=0)
@FuncOp.from_py_func(memRef_64_ty, memRef_32_ty, memRef_64_ty)
def sequence(inTensor, notUsed, outTensor):
ipu_dma_memcpy_nd(
metadata="out0", bd_id=0, mem=outTensor, sizes=[1, 1, 1, 64]
)
ipu_dma_memcpy_nd(
metadata="in0", bd_id=1, mem=inTensor, sizes=[1, 1, 1, 64]
)
ipu_sync(column=0, row=0, direction=0, channel=0)

print(ctx.module)


my_vector_bias_add()
# Declares that subsequent code is in mlir-aie context
with mlir_mod_ctx() as ctx:
my_vector_bias_add()
res = ctx.module.operation.verify()
if(res == True):
print(ctx.module)
else:
print(res)
14 changes: 7 additions & 7 deletions programming_examples/basic/vector_scalar_mul/Makefile
100755 → 100644
Original file line number Diff line number Diff line change
Expand Up @@ -11,8 +11,8 @@ include ../../makefile-common
VPATH := ../../../aie_kernels/aie2

targetname = vectorScalar
#data_size = 4096
data_size = 512
data_size = 4096
#data_size = 512
#data_size = 1024
trace_size = 8192

Expand All @@ -38,7 +38,7 @@ build/final_${data_size}.xclbin: build/aie_${data_size}.mlir build/scale.o
cd ${@D} && aiecc.py --aie-generate-cdo --no-compile-host --xclbin-name=${@F} \
--aie-generate-ipu --ipu-insts-name=insts_${data_size}.txt $(<:%=../%)

build/final_trace_${data_size}.xclbin: build/aie_trace.mlir build/scale.o
build/final_trace_${data_size}.xclbin: build/aie_trace_${data_size}.mlir build/scale.o
mkdir -p ${@D}
cd ${@D} && aiecc.py --aie-generate-cdo --no-compile-host --xclbin-name=${@F} \
--aie-generate-ipu --ipu-insts-name=insts_${data_size}.txt $(<:%=../%)
Expand All @@ -62,16 +62,16 @@ run_py: build/final_${data_size}.xclbin build/insts_${data_size}.txt

trace: ${targetname}_${data_size}.exe build/final_trace_${data_size}.xclbin build/insts_${data_size}.txt
${powershell} ./$< -x build/final_trace_${data_size}.xclbin -i build/insts_${data_size}.txt -k MLIR_AIE -t ${trace_size}
../../utils/parse_eventIR.py --filename trace.txt --mlir build/aie_trace__${data_size}.mlir --colshift 1 > parse_eventIR_vs.json
../../utils/parse_trace.py --filename trace.txt --mlir build/aie_trace_${data_size}.mlir --colshift 1 > trace_vs.json

trace_py: build/final_trace_${data_size}.xclbin build/insts_${data_size}.txt
${powershell} python3 test.py -x build/final_trace_${data_size}.xclbin -i build/insts_${data_size}.txt -k MLIR_AIE -t ${trace_size} -s ${data_size}
../../utils/parse_eventIR.py --filename trace.txt --mlir build/aie_trace_${data_size}.mlir --colshift 1 > parse_eventIR_vs.json
../../utils/parse_trace.py --filename trace.txt --mlir build/aie_trace_${data_size}.mlir --colshift 1 > trace_vs.json


clean_trace:
rm -rf tmpTrace trace.txt
rm -rf tmpTrace trace.txt parse*json trace*json

clean: clean_trace
rm -rf build _build ${targetname}_*.exe
rm -rf build _build ${targetname}*.exe

18 changes: 10 additions & 8 deletions programming_examples/basic/vector_scalar_mul/aie2.py
100755 → 100644
Original file line number Diff line number Diff line change
Expand Up @@ -16,8 +16,10 @@


def my_vector_scalar(vector_size, trace_size):
word_size_in = 2
N = vector_size
N_in_bytes = N * 4
N_in_i32s = N * word_size_in // 4
N_in_bytes = N_in_i32s * 4
N_div_n = 4 # chop input vector into 4 sub-vectors
n = N // N_div_n

Expand All @@ -27,17 +29,17 @@ def my_vector_scalar(vector_size, trace_size):

@device(AIEDevice.ipu)
def device_body():
memRef_ty = T.memref(n, T.i32())
memRef_ty = T.memref(n, T.i16())
memRef_ty2 = T.memref(1, T.i32())

# AIE Core Function declarations

scale_scalar = external_func(
"vector_scalar_mul_aie_scalar",
"vector_scalar_mul_int16_scalar",
inputs=[memRef_ty, memRef_ty, memRef_ty2, T.i32()],
)
scale = external_func(
"vector_scalar_mul_aie", inputs=[memRef_ty, memRef_ty, memRef_ty2, T.i32()]
"vector_scalar_mul_int16_vector", inputs=[memRef_ty, memRef_ty, memRef_ty2, T.i32()]
)

# Tile declarations
Expand Down Expand Up @@ -78,7 +80,7 @@ def core_body():
yield_([])

# To/from AIE-array data movement
tensor_ty = T.memref(N, T.i32())
tensor_ty = T.memref(N_in_i32s, T.i32())
scalar_ty = T.memref(1, T.i32())

@FuncOp.from_py_func(tensor_ty, scalar_ty, tensor_ty)
Expand All @@ -92,15 +94,15 @@ def sequence(A, F, C):
size=trace_size,
offset=N_in_bytes,
)
ipu_dma_memcpy_nd(metadata="out", bd_id=0, mem=C, sizes=[1, 1, 1, N])
ipu_dma_memcpy_nd(metadata="in", bd_id=1, mem=A, sizes=[1, 1, 1, N])
ipu_dma_memcpy_nd(metadata="out", bd_id=0, mem=C, sizes=[1, 1, 1, N_in_i32s])
ipu_dma_memcpy_nd(metadata="in", bd_id=1, mem=A, sizes=[1, 1, 1, N_in_i32s])
ipu_dma_memcpy_nd(metadata="infactor", bd_id=2, mem=F, sizes=[1, 1, 1, 1])
ipu_sync(column=0, row=0, direction=0, channel=0)


try:
vector_size = int(sys.argv[1])
if vector_size % 64 != 0 or vector_size <= 512:
if vector_size % 64 != 0 or vector_size < 512:
print("Vector size must be a multiple of 64 and greater than or equal to 512")
raise ValueError
trace_size = 0 if (len(sys.argv) != 3) else int(sys.argv[2])
Expand Down
9 changes: 5 additions & 4 deletions programming_examples/basic/vector_scalar_mul/test.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -22,7 +22,8 @@
// Configure this to match your buffer data type
// ------------------------------------------------------
// using DATATYPE = std::uint8_t;
using DATATYPE = std::uint32_t;
// using DATATYPE = std::uint32_t;
using DATATYPE = std::uint16_t;
#endif

const int scaleFactor = 3;
Expand Down Expand Up @@ -67,7 +68,7 @@ int main(int argc, const char *argv[]) {
XCL_BO_FLAGS_CACHEABLE, kernel.group_id(0));
auto bo_inA =
xrt::bo(device, IN_SIZE, XRT_BO_FLAGS_HOST_ONLY, kernel.group_id(2));
auto bo_inFactor = xrt::bo(device, 1 * sizeof(DATATYPE),
auto bo_inFactor = xrt::bo(device, 1 * sizeof(int32_t),
XRT_BO_FLAGS_HOST_ONLY, kernel.group_id(3));
auto bo_outC =
xrt::bo(device, OUT_SIZE, XRT_BO_FLAGS_HOST_ONLY, kernel.group_id(4));
Expand All @@ -85,8 +86,8 @@ int main(int argc, const char *argv[]) {
bufInA[i] = i + 1;

// Initialize buffer bo_inFactor
DATATYPE *bufInFactor = bo_inFactor.map<DATATYPE *>();
*bufInFactor = scaleFactor;
int32_t *bufInFactor = bo_inFactor.map<int32_t *>();
*bufInFactor = (DATATYPE)scaleFactor;

// Zero out buffer bo_outC
DATATYPE *bufOut = bo_outC.map<DATATYPE *>();
Expand Down
Loading

0 comments on commit 6c69e17

Please sign in to comment.