[ASPLOS] Major update to section-4, especially 4c #1392

Merged
merged 14 commits
Apr 24, 2024
Changes from 11 commits
52 changes: 44 additions & 8 deletions aie_kernels/aie2/scale.cc
@@ -19,6 +19,7 @@

#include <aie_api/aie.hpp>

// Scalar scale template
template <typename T>
void scale_scalar(T *a, T *c, T factor, const int32_t N) {
event0();
@@ -28,35 +29,70 @@ void scale_scalar(T *a, T *c, T factor, const int32_t N) {
event1();
}

// Vectorized scale template
// Assume N is a multiple of 32 (vec_factor)
template <typename T>
void scale_vectorized(T *a, T *c, T factor, const int32_t N) {
constexpr int vec_factor = 16;
void scale_vectorized(T *a, T *c, int32_t factor, const int32_t N) {
event0();
constexpr int vec_factor = 32;
T *__restrict pA1 = a;
T *__restrict pC1 = c;
const int F = N / vec_factor;
T fac = factor;
for (int i = 0; i < F; i++)
chess_prepare_for_pipelining chess_loop_range(16, ) {
aie::vector<T, vec_factor> A0 = aie::load_v<vec_factor>(pA1);
pA1 += vec_factor;
aie::accum<acc32, vec_factor> cout = aie::mul(A0, fac);
aie::store_v(pC1, cout.template to_vector<T>(0));
pC1 += vec_factor;
}
event1();
}

// Vectorized scale template for int32_t (acc64 used)
// Assume N is a multiple of 32 (vec_factor)
template <>
void scale_vectorized<int32_t>(int32_t *a, int32_t *c, int32_t factor,
const int32_t N) {
event0();
constexpr int vec_factor = 32;
int32_t *__restrict pA1 = a;
int32_t *__restrict pC1 = c;
const int F = N / vec_factor;
for (int i = 0; i < F; i++)
chess_prepare_for_pipelining chess_loop_range(16, ) {
aie::vector<int32_t, vec_factor> A0 = aie::load_v<vec_factor>(pA1);
pA1 += vec_factor;
aie::accum<acc64, vec_factor> cout = aie::mul(A0, factor);
aie::store_v(pC1, cout.to_vector<T>(0));
aie::store_v(pC1, cout.template to_vector<int32_t>(0));
pC1 += vec_factor;
}
event1();
}

extern "C" {

void vector_scalar_mul_aie(int32_t *a_in, int32_t *c_out, int32_t *factor,
int32_t N) {
// 32-bit datatype
void vector_scalar_mul_int32_scalar(int32_t *a_in, int32_t *c_out,
int32_t *factor, int32_t N) {
scale_scalar<int32_t>(a_in, c_out, *factor, N);
}

void vector_scalar_mul_int32_vector(int32_t *a_in, int32_t *c_out,
int32_t *factor, int32_t N) {
scale_vectorized<int32_t>(a_in, c_out, *factor, N);
}

void vector_scalar_mul_aie_scalar(int32_t *a_in, int32_t *c_out,
int32_t *factor, int32_t N) {
scale_scalar<int32_t>(a_in, c_out, *factor, N);
// 16-bit datatype
void vector_scalar_mul_int16_scalar(int16_t *a_in, int16_t *c_out,
int32_t *factor, int32_t N) {
scale_scalar<int16_t>(a_in, c_out, *factor, N);
}

void vector_scalar_mul_int16_vector(int16_t *a_in, int16_t *c_out,
int32_t *factor, int32_t N) {
scale_vectorized<int16_t>(a_in, c_out, *factor, N);
}

} // extern "C"
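For reference, a minimal numpy golden model of what the new `vector_scalar_mul_int16_scalar`/`_vector` entry points are expected to compute. This is a sketch, not part of the PR: it assumes the factor and every product fit in int16 range, so the acc32 accumulate followed by `to_vector<int16>(0)` reduces to a plain elementwise multiply (no rounding or saturation comes into play), and the helper name is illustrative.

```python
import numpy as np

# Golden model for the new int16 scale kernels, under the assumption that
# the factor and all products fit in int16, so the acc32 accumulate plus
# to_vector<int16>(0) behaves like a plain elementwise multiply.
def golden_scale_int16(a: np.ndarray, factor: int) -> np.ndarray:
    acc = a.astype(np.int32) * np.int32(factor)  # models the acc32 accumulator
    return acc.astype(np.int16)                  # models to_vector<int16>(0)

# Example with the defaults used elsewhere in this PR: 4096 elements, factor 3.
a = np.arange(1, 4097, dtype=np.int16)
c = golden_scale_int16(a, 3)
assert np.array_equal(c, (a * 3).astype(np.int16))
```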
17 changes: 12 additions & 5 deletions programming_examples/basic/matrix_multiplication/makefile-common
@@ -38,6 +38,8 @@ M?=512
K?=512
N?=512

trace_size=16384

mlir_target?=build/aie_${M}x${K}x${N}.mlir
xclbin_target?=build/final_${M}x${K}x${N}.xclbin
insts_target?=build/insts_${M}x${K}x${N}.txt
@@ -83,14 +85,19 @@ run: ${targetname}.exe ${xclbin_target} ${insts_target} #sign
export XRT_HACK_UNSECURE_LOADING_XCLBIN=1 && \
${powershell} ./$< -x ${xclbin_target} -i ${insts_target} -k MLIR_AIE -M $M -K $K -N $N ${runargs}

.PHONY: clean
clean:
rm -rf build _build ${targetname}.exe
trace: ${targetname}.exe ${xclbin_target} ${insts_target} # sign
export XRT_HACK_UNSECURE_LOADING_XCLBIN=1 && \
${powershell} ./$< -x ${xclbin_target} -i ${insts_target} -k MLIR_AIE -M $M -K $K -N $N -v 1 --warmup 0 --iters 1 -t ${trace_size}
../../../utils/parse_trace.py --filename trace.txt --mlir ${mlir_target} --colshift 1 > parse_trace_mm.json

.PHONY: parse_trace
parse_trace:
../../../utils/parse_eventIR.py --filename trace.txt --mlir ./build/aie.mlir --colshift 1 > trace_eventIR.json
../../../utils/parse_trace.py --filename trace.txt --mlir ${mlir_target} --colshift 1 > parse_trace_mm.json

.PHONY: clean
clean: clean_trace
rm -rf build _build ${targetname}.exe

.PHONY: clean_trace
clean_trace:
rm -rf tmpTrace trace_eventIR.json
rm -rf tmpTrace parse*.json
100 changes: 52 additions & 48 deletions programming_examples/basic/vector_scalar_add/aie2.py
@@ -13,63 +13,67 @@


def my_vector_bias_add():
with mlir_mod_ctx() as ctx:

@device(AIEDevice.ipu)
def device_body():
memRef_16_ty = T.memref(16, T.i32())
memRef_8_ty = T.memref(8, T.i32())
@device(AIEDevice.ipu)
def device_body():
memRef_16_ty = T.memref(16, T.i32())
memRef_8_ty = T.memref(8, T.i32())

# Tile declarations
ShimTile = tile(0, 0)
MemTile = tile(0, 1)
ComputeTile2 = tile(0, 2)
# Tile declarations
ShimTile = tile(0, 0)
MemTile = tile(0, 1)
ComputeTile2 = tile(0, 2)

# AIE-array data movement with object fifos
# Input
of_in0 = object_fifo("in0", ShimTile, MemTile, 2, memRef_16_ty)
of_in1 = object_fifo("in1", MemTile, ComputeTile2, 2, memRef_8_ty)
object_fifo_link(of_in0, of_in1)
# AIE-array data movement with object fifos
# Input
of_in0 = object_fifo("in0", ShimTile, MemTile, 2, memRef_16_ty)
of_in1 = object_fifo("in1", MemTile, ComputeTile2, 2, memRef_8_ty)
object_fifo_link(of_in0, of_in1)

# Output
of_out0 = object_fifo("out0", MemTile, ShimTile, 2, memRef_16_ty)
of_out1 = object_fifo("out1", ComputeTile2, MemTile, 2, memRef_8_ty)
object_fifo_link(of_out1, of_out0)
# Output
of_out0 = object_fifo("out0", MemTile, ShimTile, 2, memRef_16_ty)
of_out1 = object_fifo("out1", ComputeTile2, MemTile, 2, memRef_8_ty)
object_fifo_link(of_out1, of_out0)

# Set up compute tiles
# Set up compute tiles

# Compute tile 2
@core(ComputeTile2)
def core_body():
# Effective while(1)
for _ in for_(8):
elem_in = of_in1.acquire(ObjectFifoPort.Consume, 1)
elem_out = of_out1.acquire(ObjectFifoPort.Produce, 1)
for i in for_(8):
v0 = memref.load(elem_in, [i])
v1 = arith.addi(v0, arith.constant(1, T.i32()))
memref.store(v1, elem_out, [i])
yield_([])
of_in1.release(ObjectFifoPort.Consume, 1)
of_out1.release(ObjectFifoPort.Produce, 1)
# Compute tile 2
@core(ComputeTile2)
def core_body():
# Effective while(1)
for _ in for_(8):
elem_in = of_in1.acquire(ObjectFifoPort.Consume, 1)
elem_out = of_out1.acquire(ObjectFifoPort.Produce, 1)
for i in for_(8):
v0 = memref.load(elem_in, [i])
v1 = arith.addi(v0, arith.constant(1, T.i32()))
memref.store(v1, elem_out, [i])
yield_([])
of_in1.release(ObjectFifoPort.Consume, 1)
of_out1.release(ObjectFifoPort.Produce, 1)
yield_([])

# To/from AIE-array data movement
# To/from AIE-array data movement

memRef_64_ty = T.memref(64, T.i32())
memRef_32_ty = T.memref(32, T.i32())
memRef_64_ty = T.memref(64, T.i32())
memRef_32_ty = T.memref(32, T.i32())

@FuncOp.from_py_func(memRef_64_ty, memRef_32_ty, memRef_64_ty)
def sequence(inTensor, notUsed, outTensor):
ipu_dma_memcpy_nd(
metadata="out0", bd_id=0, mem=outTensor, sizes=[1, 1, 1, 64]
)
ipu_dma_memcpy_nd(
metadata="in0", bd_id=1, mem=inTensor, sizes=[1, 1, 1, 64]
)
ipu_sync(column=0, row=0, direction=0, channel=0)
@FuncOp.from_py_func(memRef_64_ty, memRef_32_ty, memRef_64_ty)
def sequence(inTensor, notUsed, outTensor):
ipu_dma_memcpy_nd(
metadata="out0", bd_id=0, mem=outTensor, sizes=[1, 1, 1, 64]
)
ipu_dma_memcpy_nd(
metadata="in0", bd_id=1, mem=inTensor, sizes=[1, 1, 1, 64]
)
ipu_sync(column=0, row=0, direction=0, channel=0)

print(ctx.module)


my_vector_bias_add()
# Declares that subsequent code is in mlir-aie context
with mlir_mod_ctx() as ctx:
my_vector_bias_add()
res = ctx.module.operation.verify()
if res == True:
print(ctx.module)
else:
print(res)
14 changes: 7 additions & 7 deletions programming_examples/basic/vector_scalar_mul/Makefile
100755 → 100644
@@ -11,8 +11,8 @@ include ../../makefile-common
VPATH := ../../../aie_kernels/aie2

targetname = vectorScalar
#data_size = 4096
data_size = 512
data_size = 4096
#data_size = 512
#data_size = 1024
trace_size = 8192

@@ -38,7 +38,7 @@ build/final_${data_size}.xclbin: build/aie_${data_size}.mlir build/scale.o
cd ${@D} && aiecc.py --aie-generate-cdo --no-compile-host --xclbin-name=${@F} \
--aie-generate-ipu --ipu-insts-name=insts_${data_size}.txt $(<:%=../%)

build/final_trace_${data_size}.xclbin: build/aie_trace.mlir build/scale.o
build/final_trace_${data_size}.xclbin: build/aie_trace_${data_size}.mlir build/scale.o
mkdir -p ${@D}
cd ${@D} && aiecc.py --aie-generate-cdo --no-compile-host --xclbin-name=${@F} \
--aie-generate-ipu --ipu-insts-name=insts_${data_size}.txt $(<:%=../%)
@@ -62,16 +62,16 @@ run_py: build/final_${data_size}.xclbin build/insts_${data_size}.txt

trace: ${targetname}_${data_size}.exe build/final_trace_${data_size}.xclbin build/insts_${data_size}.txt
${powershell} ./$< -x build/final_trace_${data_size}.xclbin -i build/insts_${data_size}.txt -k MLIR_AIE -t ${trace_size}
../../utils/parse_eventIR.py --filename trace.txt --mlir build/aie_trace__${data_size}.mlir --colshift 1 > parse_eventIR_vs.json
../../utils/parse_trace.py --filename trace.txt --mlir build/aie_trace_${data_size}.mlir --colshift 1 > trace_vs.json

trace_py: build/final_trace_${data_size}.xclbin build/insts_${data_size}.txt
${powershell} python3 test.py -x build/final_trace_${data_size}.xclbin -i build/insts_${data_size}.txt -k MLIR_AIE -t ${trace_size} -s ${data_size}
../../utils/parse_eventIR.py --filename trace.txt --mlir build/aie_trace_${data_size}.mlir --colshift 1 > parse_eventIR_vs.json
../../utils/parse_trace.py --filename trace.txt --mlir build/aie_trace_${data_size}.mlir --colshift 1 > trace_vs.json


clean_trace:
rm -rf tmpTrace trace.txt
rm -rf tmpTrace trace.txt parse*json trace*json

clean: clean_trace
rm -rf build _build ${targetname}_*.exe
rm -rf build _build ${targetname}*.exe

21 changes: 13 additions & 8 deletions programming_examples/basic/vector_scalar_mul/aie2.py
100755 → 100644
@@ -16,8 +16,10 @@


def my_vector_scalar(vector_size, trace_size):
word_size_in = 2
N = vector_size
N_in_bytes = N * 4
N_in_i32s = N * word_size_in // 4
N_in_bytes = N_in_i32s * 4
N_div_n = 4 # chop input vector into 4 sub-vectors
n = N // N_div_n

@@ -27,17 +29,18 @@ def my_vector_scalar(vector_size, trace_size):

@device(AIEDevice.ipu)
def device_body():
memRef_ty = T.memref(n, T.i32())
memRef_ty = T.memref(n, T.i16())
memRef_ty2 = T.memref(1, T.i32())

# AIE Core Function declarations

scale_scalar = external_func(
"vector_scalar_mul_aie_scalar",
"vector_scalar_mul_int16_scalar",
inputs=[memRef_ty, memRef_ty, memRef_ty2, T.i32()],
)
scale = external_func(
"vector_scalar_mul_aie", inputs=[memRef_ty, memRef_ty, memRef_ty2, T.i32()]
"vector_scalar_mul_int16_vector",
inputs=[memRef_ty, memRef_ty, memRef_ty2, T.i32()],
)

# Tile declarations
@@ -78,7 +81,7 @@ def core_body():
yield_([])

# To/from AIE-array data movement
tensor_ty = T.memref(N, T.i32())
tensor_ty = T.memref(N_in_i32s, T.i32())
scalar_ty = T.memref(1, T.i32())

@FuncOp.from_py_func(tensor_ty, scalar_ty, tensor_ty)
@@ -92,15 +95,17 @@ def sequence(A, F, C):
size=trace_size,
offset=N_in_bytes,
)
ipu_dma_memcpy_nd(metadata="out", bd_id=0, mem=C, sizes=[1, 1, 1, N])
ipu_dma_memcpy_nd(metadata="in", bd_id=1, mem=A, sizes=[1, 1, 1, N])
ipu_dma_memcpy_nd(
metadata="out", bd_id=0, mem=C, sizes=[1, 1, 1, N_in_i32s]
)
ipu_dma_memcpy_nd(metadata="in", bd_id=1, mem=A, sizes=[1, 1, 1, N_in_i32s])
ipu_dma_memcpy_nd(metadata="infactor", bd_id=2, mem=F, sizes=[1, 1, 1, 1])
ipu_sync(column=0, row=0, direction=0, channel=0)


try:
vector_size = int(sys.argv[1])
if vector_size % 64 != 0 or vector_size <= 512:
if vector_size % 64 != 0 or vector_size < 512:
print("Vector size must be a multiple of 64 and greater than or equal to 512")
raise ValueError
trace_size = 0 if (len(sys.argv) != 3) else int(sys.argv[2])
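The size bookkeeping introduced above (an int16 payload moved through the shim DMA as i32 words) is the part most likely to trip readers up. A short sketch of the arithmetic; the concrete numbers assume the default data_size of 4096 set in the Makefile, and the variable names mirror those in aie2.py.

```python
# Size bookkeeping for int16 data moved as i32 DMA words, assuming the
# default data_size = 4096 from the Makefile in this PR.
word_size_in = 2                    # bytes per int16 element
N = 4096                            # number of int16 elements
N_in_i32s = N * word_size_in // 4   # 2048 i32 words seen by the shim DMA
N_in_bytes = N_in_i32s * 4          # 8192 bytes; also used as the trace offset
N_div_n = 4
n = N // N_div_n                    # 1024 int16 elements per sub-vector

assert N_in_i32s == 2048 and N_in_bytes == 8192 and n == 1024
```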
9 changes: 5 additions & 4 deletions programming_examples/basic/vector_scalar_mul/test.cpp
@@ -22,7 +22,8 @@
// Configure this to match your buffer data type
// ------------------------------------------------------
// using DATATYPE = std::uint8_t;
using DATATYPE = std::uint32_t;
// using DATATYPE = std::uint32_t;
using DATATYPE = std::uint16_t;
#endif

const int scaleFactor = 3;
@@ -67,7 +68,7 @@ int main(int argc, const char *argv[]) {
XCL_BO_FLAGS_CACHEABLE, kernel.group_id(0));
auto bo_inA =
xrt::bo(device, IN_SIZE, XRT_BO_FLAGS_HOST_ONLY, kernel.group_id(2));
auto bo_inFactor = xrt::bo(device, 1 * sizeof(DATATYPE),
auto bo_inFactor = xrt::bo(device, 1 * sizeof(int32_t),
XRT_BO_FLAGS_HOST_ONLY, kernel.group_id(3));
auto bo_outC =
xrt::bo(device, OUT_SIZE, XRT_BO_FLAGS_HOST_ONLY, kernel.group_id(4));
@@ -85,8 +86,8 @@
bufInA[i] = i + 1;

// Initialize buffer bo_inFactor
DATATYPE *bufInFactor = bo_inFactor.map<DATATYPE *>();
*bufInFactor = scaleFactor;
int32_t *bufInFactor = bo_inFactor.map<int32_t *>();
*bufInFactor = (DATATYPE)scaleFactor;

// Zero out buffer bo_outC
DATATYPE *bufOut = bo_outC.map<DATATYPE *>();
6 changes: 3 additions & 3 deletions programming_examples/basic/vector_scalar_mul/test.py
@@ -33,9 +33,9 @@ def main(opts):
INOUT1_VOLUME = int(1) # Input only, 1 uint32_t scale factor
INOUT2_VOLUME = int(opts.size) # Output only, 64x uint32_t in this example

INOUT0_DATATYPE = np.int32
INOUT0_DATATYPE = np.int16
INOUT1_DATATYPE = np.int32
INOUT2_DATATYPE = np.int32
INOUT2_DATATYPE = np.int16

INOUT0_SIZE = INOUT0_VOLUME * INOUT0_DATATYPE().itemsize
INOUT1_SIZE = INOUT1_VOLUME * INOUT1_DATATYPE().itemsize
@@ -90,7 +90,7 @@ def main(opts):
bo_inout2.sync(xrt.xclBOSyncDirection.XCL_BO_SYNC_BO_FROM_DEVICE)

# Copy output results and verify they are correct
entire_buffer = bo_inout2.read(OUT_SIZE, 0).view(np.uint32)
entire_buffer = bo_inout2.read(OUT_SIZE, 0).view(np.uint16)
output_buffer = entire_buffer[:INOUT2_VOLUME]
if opts.verify:
if opts.verbosity >= 1:
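With the output now carried as 16-bit data, the host reads the raw buffer bytes as uint16 and keeps only the first INOUT2_VOLUME entries, dropping anything that follows the output in the same buffer. A self-contained sketch of that slicing with mocked buffer contents; the trailing word count is hypothetical and only there to show why the trim matters.

```python
import numpy as np

# Standalone mock of the read-back slicing in test.py: raw bytes come back
# from the buffer read, the first INOUT2_VOLUME uint16 words are the kernel
# output, and any trailing words are discarded. Contents are illustrative.
INOUT2_VOLUME = 4096                     # matches "-s 4096" with the new default size
trailing_words = 8                       # hypothetical extra payload after the output
raw = np.concatenate(
    [np.arange(INOUT2_VOLUME, dtype=np.uint16),
     np.zeros(trailing_words, dtype=np.uint16)]
).tobytes()

entire_buffer = np.frombuffer(raw, dtype=np.uint16)  # stands in for bo_inout2.read(...).view(np.uint16)
output_buffer = entire_buffer[:INOUT2_VOLUME]
assert output_buffer.size == INOUT2_VOLUME
```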