From f33b3caf27c86447ceb2fc3ccf91e385cf2398c4 Mon Sep 17 00:00:00 2001 From: Joseph Melber Date: Mon, 22 Apr 2024 21:48:23 -0600 Subject: [PATCH] Add torch to lit cfg for programming_examples (#1370) Co-authored-by: singagan <53442471+singagan@users.noreply.github.com> Co-authored-by: Gagandeep Singh --- .github/workflows/buildAndTestRyzenAI.yml | 1 + programming_examples/lit.cfg.py | 8 + programming_examples/ml/bottleneck/Makefile | 2 +- programming_examples/ml/bottleneck/README.md | 7 - programming_examples/ml/bottleneck/run.lit | 2 +- programming_examples/ml/bottleneck/test.py | 355 +++---- programming_examples/ml/conv2d/Makefile | 2 +- programming_examples/ml/conv2d/README.md | 9 +- programming_examples/ml/conv2d/run.lit | 4 +- programming_examples/ml/conv2d/test.py | 271 +++--- .../ml/conv2d_fused_relu/Makefile | 2 +- .../ml/conv2d_fused_relu/README.md | 9 +- .../ml/conv2d_fused_relu/run.lit | 2 +- .../ml/conv2d_fused_relu/test.py | 275 +++--- programming_examples/ml/resnet/README.md | 8 - .../ml/resnet/layers_conv2_x/Makefile | 2 +- .../ml/resnet/layers_conv2_x/aie2.py | 2 +- .../ml/resnet/layers_conv2_x/run.lit | 2 +- .../ml/resnet/layers_conv2_x/test.py | 876 +++++++++--------- programming_guide/section-6/README.md | 8 +- 20 files changed, 962 insertions(+), 885 deletions(-) diff --git a/.github/workflows/buildAndTestRyzenAI.yml b/.github/workflows/buildAndTestRyzenAI.yml index acf2262fa2..bc3988e002 100644 --- a/.github/workflows/buildAndTestRyzenAI.yml +++ b/.github/workflows/buildAndTestRyzenAI.yml @@ -127,6 +127,7 @@ jobs: python -m venv aie-venv source aie-venv/bin/activate pip install -r python/requirements.txt + pip install -r python/requirements_ml.txt pip install jupyter sed -i.bak 's/OUTPUT_TIMEOUT = 10/OUTPUT_TIMEOUT = 100/g' \ $(python -c 'import site; print(site.getsitepackages()[0])')/jupyter_client/runapp.py diff --git a/programming_examples/lit.cfg.py b/programming_examples/lit.cfg.py index d5ff22c85e..b28803cb43 100755 --- 
a/programming_examples/lit.cfg.py +++ b/programming_examples/lit.cfg.py @@ -104,6 +104,14 @@ opencv_flags = "" config.substitutions.append(("%opencv_flags", opencv_flags)) +try: + import torch + + config.available_features.add("torch") +except ImportError: + print("torch not found", file=sys.stderr) + pass + VitisSysrootFlag = "" if config.aieHostTarget == "x86_64": config.substitutions.append(("%aieHostTargetTriplet%", "x86_64-unknown-linux-gnu")) diff --git a/programming_examples/ml/bottleneck/Makefile b/programming_examples/ml/bottleneck/Makefile index f5c6e4561f..47ca6a78f7 100755 --- a/programming_examples/ml/bottleneck/Makefile +++ b/programming_examples/ml/bottleneck/Makefile @@ -37,4 +37,4 @@ clean: *.log aie_partition.json *.bin BOOT.BIN _x test.exe run_py: - ${powershell} python3 test.py + ${powershell} python3 test.py -x build/final.xclbin -i build/insts.txt -k MLIR_AIE \ No newline at end of file diff --git a/programming_examples/ml/bottleneck/README.md b/programming_examples/ml/bottleneck/README.md index 144b8e36f2..40a69e8576 100644 --- a/programming_examples/ml/bottleneck/README.md +++ b/programming_examples/ml/bottleneck/README.md @@ -115,11 +115,4 @@ make To run the design: ``` make run_py -``` - -### Prerequisites -To install the dependencies, run the following command: -``` -pip install -r requirements.txt - ``` \ No newline at end of file diff --git a/programming_examples/ml/bottleneck/run.lit b/programming_examples/ml/bottleneck/run.lit index ec30002c97..8a6024d66e 100644 --- a/programming_examples/ml/bottleneck/run.lit +++ b/programming_examples/ml/bottleneck/run.lit @@ -8,5 +8,5 @@ // RUN: xchesscc_wrapper aie2 -I %aietools/include -DBIT_WIDTH=8 -DINT8_ACT -c %S/../../../aie_kernels/aie2/conv2dk1_skip.cc -o conv2dk1_skip.o // RUN: %python %S/aie2.py | aie-opt -cse -canonicalize -o ./aie.mlir // RUN: %python aiecc.py --xbridge --aie-generate-cdo --aie-generate-ipu --no-compile-host --xclbin-name=aie.xclbin --ipu-insts-name=insts.txt ./aie.mlir 
-// RUN: %run_on_ipu %python %S/test.py | FileCheck %s +// RUN: %run_on_ipu %python %S/test.py -x aie.xclbin -i insts.txt -k MLIR_AIE | FileCheck %s // CHECK: PASS! \ No newline at end of file diff --git a/programming_examples/ml/bottleneck/test.py b/programming_examples/ml/bottleneck/test.py index 34f6347175..48a9a8929c 100644 --- a/programming_examples/ml/bottleneck/test.py +++ b/programming_examples/ml/bottleneck/test.py @@ -14,177 +14,192 @@ import os import numpy as np from aie.utils.xrt import setup_aie, extract_trace, write_out_trace, execute +import aie.utils.test as test_utils torch.use_deterministic_algorithms(True) torch.manual_seed(0) -design = "bottleneck_int8" -xclbin_path = os.path.abspath("build/final.xclbin") -insts_path = os.path.abspath("build/insts.txt") - -log_folder = "log/" -if not os.path.exists(log_folder): - os.makedirs(log_folder) - -num_iter = 1 -npu_time_total = 0 -npu_time_min = 9999999 -npu_time_max = 0 -trace_size = 16384 -enable_trace = False -trace_file = "log/trace_" + design + ".txt" -# ------------------------------------------------------ -# Configure this to match your design's buffer size -# ------------------------------------------------------ -dtype_in = np.dtype("int8") -dtype_wts = np.dtype("int8") -dtype_out = np.dtype("uint8") - -shape_in_act = (32, 32, 32, 8) -shape_in_wts1 = (8, 32, 1, 1, 8, 8) # out,in,ky,kx,in8,out8 -shape_in_wts2 = (8, 8, 3, 3, 8, 8) # out,in,ky,kx,in8,out8 -shape_in_wts3 = (32, 8, 1, 1, 8, 8) # out,in,ky,kx,in8,out8 -shape_total_wts = (69632, 1) -shape_out = (32, 32, 32, 8) - -# ------------------------------------------------------ -# Initialize activation, weights, scaling factor for int8 model -# ------------------------------------------------------ -int_inp = torch.randint(1, 100, (1, 256, 32, 32)).type(torch.FloatTensor) -int_weight1 = torch.randint(50, 100, (64, 256, 1, 1)).type(torch.FloatTensor) -int_weight2 = torch.randint(50, 100, (64, 64, 3, 3)).type(torch.FloatTensor) -int_weight3 = 
torch.randint(50, 100, (256, 64, 1, 1)).type(torch.FloatTensor) - -inp_scale1 = 0.5 -inp_scale2 = 0.5 -inp_scale3 = 0.5 -inp_scale4 = 0.5 - -weight_scale1 = 0.5 -weight_scale2 = 0.5 -weight_scale3 = 0.5 - -combined_scale1 = -math.log2(inp_scale1 * weight_scale1 / inp_scale2) -combined_scale2 = -math.log2(inp_scale2 * weight_scale2 / inp_scale3) -combined_scale3 = -math.log2(inp_scale3 * weight_scale3 / inp_scale1) -combined_scale4 = -math.log2(inp_scale1 / inp_scale4) -conv_scale = 0.0039 # scale to convert int8 output to floating point -relu_scale = 0.0078 # scale to convert int8 output to floating point -min = 0 -max = 255 - -# ------------------------------------------------------ -# Get device, load the xclbin & kernel and register them -# ------------------------------------------------------ -app = setup_aie( - xclbin_path, - insts_path, - shape_in_act, - dtype_in, - shape_total_wts, - dtype_wts, - shape_out, - dtype_out, - enable_trace=enable_trace, - trace_size=trace_size, -) - - -# ------------------------------------------------------ -# Define your golden reference -# ------------------------------------------------------ -class bottleneck_int8(nn.Module): - def __init__(self, in_planes=256, planes=64): - super(bottleneck_int8, self).__init__() - self.conv1 = nn.Conv2d(256, 64, kernel_size=1, bias=False) - self.conv2 = nn.Conv2d( - 64, 64, kernel_size=3, padding=1, padding_mode="zeros", bias=False - ) - self.conv3 = nn.Conv2d(64, 256, kernel_size=1, bias=False) - - self.relu1 = nn.ReLU() - self.relu2 = nn.ReLU() - self.relu3 = nn.ReLU() - - def forward(self, x): - conv1_out = self.conv1(x) * inp_scale1 * weight_scale1 - relu1_out = torch.clamp( - torch.round(self.relu1(conv1_out) / inp_scale2), min, max - ) # convert to int and apply relu - conv2_out = self.conv2(relu1_out) * inp_scale2 * weight_scale2 - relu2_out = torch.clamp( - torch.round(self.relu2(conv2_out) / inp_scale3), min, max - ) - conv3_out = self.conv3(relu2_out) * inp_scale3 * 
weight_scale3 - same_scale_init = torch.clamp(torch.round(conv3_out / inp_scale1), -128, 127) - skip_add = inp_scale1 * (same_scale_init + int_inp) - final_out = inp_scale4 * ( - torch.clamp(torch.round(skip_add / inp_scale4), min, max) - ) - return final_out - - -# ------------------------------------------------------ -# Pytorch baseline -# ------------------------------------------------------ -model = bottleneck_int8() -model.eval() -model.conv1.weight.data.copy_(int_weight1) -model.conv2.weight.data.copy_(int_weight2) -model.conv3.weight.data.copy_(int_weight3) - -golden_output = model(int_inp) - -# ------------------------------------------------------ -# Reorder input data-layout -# ------------------------------------------------------ -ds = DataShaper() -before_input = int_inp.squeeze().data.numpy().astype(dtype_in) -before_input.tofile(log_folder + "/before_ifm_mem_fmt_1x1.txt", sep=",", format="%d") -ifm_mem_fmt = ds.reorder_mat(before_input, "YCXC8", "CYX") -ifm_mem_fmt.tofile(log_folder + "/after_ifm_mem_fmt_1x1.txt", sep=",", format="%d") - -wts1 = ds.reorder_mat(int_weight1.data.numpy().astype(dtype_in), "OIYXI8O8", "OIYX") -wts2 = ds.reorder_mat(int_weight2.data.numpy().astype(dtype_in), "OIYXI8O8", "OIYX") -wts3 = ds.reorder_mat(int_weight3.data.numpy().astype(dtype_in), "OIYXI8O8", "OIYX") - -total_wts = np.concatenate((wts1, wts2, wts3), axis=None) -total_wts.tofile(log_folder + "/weights_mem_fmt_final.txt", sep=",", format="%d") - -# ------------------------------------------------------ -# Main run loop -# ------------------------------------------------------ -for i in range(num_iter): - start = time.time_ns() - aie_output = execute(app, ifm_mem_fmt, total_wts) * inp_scale4 - stop = time.time_ns() - - if enable_trace: - aie_output, trace = extract_trace(aie_output, shape_out, dtype_out, trace_size) - write_out_trace(trace, trace_file) - - npu_time = stop - start - npu_time_total = npu_time_total + npu_time - -# 
------------------------------------------------------ -# Reorder output data-layout -# ------------------------------------------------------ -temp_out = aie_output.reshape(32, 32, 32, 8) -temp_out = ds.reorder_mat(temp_out, "CDYX", "YCXD") -ofm_mem_fmt = temp_out.reshape(256, 32, 32) -ofm_mem_fmt.tofile(log_folder + "/after_ofm_mem_fmt_final.txt", sep=",", format="%d") -ofm_mem_fmt_out = torch.from_numpy(ofm_mem_fmt).unsqueeze(0) - -# ------------------------------------------------------ -# Compare the AIE output and the golden reference -# ------------------------------------------------------ -print("\nAvg NPU time: {}us.".format(int((npu_time_total / num_iter) / 1000))) - -assert np.allclose( - ofm_mem_fmt_out.detach().numpy(), - golden_output.detach().numpy(), - rtol=0, - atol=inp_scale4, -) - -print("\nPASS!\n") + +def main(opts): + design = "bottleneck_int8" + xclbin_path = opts.xclbin + insts_path = opts.instr + + log_folder = "log/" + if not os.path.exists(log_folder): + os.makedirs(log_folder) + + num_iter = 1 + npu_time_total = 0 + npu_time_min = 9999999 + npu_time_max = 0 + trace_size = 16384 + enable_trace = False + trace_file = "log/trace_" + design + ".txt" + # ------------------------------------------------------ + # Configure this to match your design's buffer size + # ------------------------------------------------------ + dtype_in = np.dtype("int8") + dtype_wts = np.dtype("int8") + dtype_out = np.dtype("uint8") + + shape_in_act = (32, 32, 32, 8) + shape_in_wts1 = (8, 32, 1, 1, 8, 8) # out,in,ky,kx,in8,out8 + shape_in_wts2 = (8, 8, 3, 3, 8, 8) # out,in,ky,kx,in8,out8 + shape_in_wts3 = (32, 8, 1, 1, 8, 8) # out,in,ky,kx,in8,out8 + shape_total_wts = (69632, 1) + shape_out = (32, 32, 32, 8) + + # ------------------------------------------------------ + # Initialize activation, weights, scaling factor for int8 model + # ------------------------------------------------------ + int_inp = torch.randint(1, 100, (1, 256, 32, 
32)).type(torch.FloatTensor) + int_weight1 = torch.randint(50, 100, (64, 256, 1, 1)).type(torch.FloatTensor) + int_weight2 = torch.randint(50, 100, (64, 64, 3, 3)).type(torch.FloatTensor) + int_weight3 = torch.randint(50, 100, (256, 64, 1, 1)).type(torch.FloatTensor) + + inp_scale1 = 0.5 + inp_scale2 = 0.5 + inp_scale3 = 0.5 + inp_scale4 = 0.5 + + weight_scale1 = 0.5 + weight_scale2 = 0.5 + weight_scale3 = 0.5 + + combined_scale1 = -math.log2(inp_scale1 * weight_scale1 / inp_scale2) + combined_scale2 = -math.log2(inp_scale2 * weight_scale2 / inp_scale3) + combined_scale3 = -math.log2(inp_scale3 * weight_scale3 / inp_scale1) + combined_scale4 = -math.log2(inp_scale1 / inp_scale4) + conv_scale = 0.0039 # scale to convert int8 output to floating point + relu_scale = 0.0078 # scale to convert int8 output to floating point + min = 0 + max = 255 + + # ------------------------------------------------------ + # Get device, load the xclbin & kernel and register them + # ------------------------------------------------------ + app = setup_aie( + xclbin_path, + insts_path, + shape_in_act, + dtype_in, + shape_total_wts, + dtype_wts, + shape_out, + dtype_out, + enable_trace=enable_trace, + trace_size=trace_size, + ) + + # ------------------------------------------------------ + # Define your golden reference + # ------------------------------------------------------ + class bottleneck_int8(nn.Module): + def __init__(self, in_planes=256, planes=64): + super(bottleneck_int8, self).__init__() + self.conv1 = nn.Conv2d(256, 64, kernel_size=1, bias=False) + self.conv2 = nn.Conv2d( + 64, 64, kernel_size=3, padding=1, padding_mode="zeros", bias=False + ) + self.conv3 = nn.Conv2d(64, 256, kernel_size=1, bias=False) + + self.relu1 = nn.ReLU() + self.relu2 = nn.ReLU() + self.relu3 = nn.ReLU() + + def forward(self, x): + conv1_out = self.conv1(x) * inp_scale1 * weight_scale1 + relu1_out = torch.clamp( + torch.round(self.relu1(conv1_out) / inp_scale2), min, max + ) # convert to int and 
apply relu + conv2_out = self.conv2(relu1_out) * inp_scale2 * weight_scale2 + relu2_out = torch.clamp( + torch.round(self.relu2(conv2_out) / inp_scale3), min, max + ) + conv3_out = self.conv3(relu2_out) * inp_scale3 * weight_scale3 + same_scale_init = torch.clamp( + torch.round(conv3_out / inp_scale1), -128, 127 + ) + skip_add = inp_scale1 * (same_scale_init + int_inp) + final_out = inp_scale4 * ( + torch.clamp(torch.round(skip_add / inp_scale4), min, max) + ) + return final_out + + # ------------------------------------------------------ + # Pytorch baseline + # ------------------------------------------------------ + model = bottleneck_int8() + model.eval() + model.conv1.weight.data.copy_(int_weight1) + model.conv2.weight.data.copy_(int_weight2) + model.conv3.weight.data.copy_(int_weight3) + + golden_output = model(int_inp) + + # ------------------------------------------------------ + # Reorder input data-layout + # ------------------------------------------------------ + ds = DataShaper() + before_input = int_inp.squeeze().data.numpy().astype(dtype_in) + before_input.tofile( + log_folder + "/before_ifm_mem_fmt_1x1.txt", sep=",", format="%d" + ) + ifm_mem_fmt = ds.reorder_mat(before_input, "YCXC8", "CYX") + ifm_mem_fmt.tofile(log_folder + "/after_ifm_mem_fmt_1x1.txt", sep=",", format="%d") + + wts1 = ds.reorder_mat(int_weight1.data.numpy().astype(dtype_in), "OIYXI8O8", "OIYX") + wts2 = ds.reorder_mat(int_weight2.data.numpy().astype(dtype_in), "OIYXI8O8", "OIYX") + wts3 = ds.reorder_mat(int_weight3.data.numpy().astype(dtype_in), "OIYXI8O8", "OIYX") + + total_wts = np.concatenate((wts1, wts2, wts3), axis=None) + total_wts.tofile(log_folder + "/weights_mem_fmt_final.txt", sep=",", format="%d") + + # ------------------------------------------------------ + # Main run loop + # ------------------------------------------------------ + for i in range(num_iter): + start = time.time_ns() + aie_output = execute(app, ifm_mem_fmt, total_wts) * inp_scale4 + stop = 
time.time_ns() + + if enable_trace: + aie_output, trace = extract_trace( + aie_output, shape_out, dtype_out, trace_size + ) + write_out_trace(trace, trace_file) + + npu_time = stop - start + npu_time_total = npu_time_total + npu_time + + # ------------------------------------------------------ + # Reorder output data-layout + # ------------------------------------------------------ + temp_out = aie_output.reshape(32, 32, 32, 8) + temp_out = ds.reorder_mat(temp_out, "CDYX", "YCXD") + ofm_mem_fmt = temp_out.reshape(256, 32, 32) + ofm_mem_fmt.tofile( + log_folder + "/after_ofm_mem_fmt_final.txt", sep=",", format="%d" + ) + ofm_mem_fmt_out = torch.from_numpy(ofm_mem_fmt).unsqueeze(0) + + # ------------------------------------------------------ + # Compare the AIE output and the golden reference + # ------------------------------------------------------ + print("\nAvg NPU time: {}us.".format(int((npu_time_total / num_iter) / 1000))) + + assert np.allclose( + ofm_mem_fmt_out.detach().numpy(), + golden_output.detach().numpy(), + rtol=0, + atol=inp_scale4, + ) + + print("\nPASS!\n") + + +if __name__ == "__main__": + p = test_utils.create_default_argparser() + opts = p.parse_args(sys.argv[1:]) + main(opts) diff --git a/programming_examples/ml/conv2d/Makefile b/programming_examples/ml/conv2d/Makefile index 2bba6ea11c..0a89ce4bf0 100755 --- a/programming_examples/ml/conv2d/Makefile +++ b/programming_examples/ml/conv2d/Makefile @@ -34,4 +34,4 @@ clean: chess* *.o insts.txt \ *.log aie_partition.json *.bin BOOT.BIN _x test.exe run_py: - ${powershell} python3 test.py \ No newline at end of file + ${powershell} python3 test.py -x build/final.xclbin -i build/insts.txt -k MLIR_AIE \ No newline at end of file diff --git a/programming_examples/ml/conv2d/README.md b/programming_examples/ml/conv2d/README.md index 81b25f3e52..b2d93f066d 100644 --- a/programming_examples/ml/conv2d/README.md +++ b/programming_examples/ml/conv2d/README.md @@ -56,12 +56,5 @@ make To run the design: ``` 
-make run -``` - -### Prerequisites -To install the dependencies, run the following command: -``` -pip install -r requirements.txt - +make run_py ``` \ No newline at end of file diff --git a/programming_examples/ml/conv2d/run.lit b/programming_examples/ml/conv2d/run.lit index 1eeef90b94..349e45f9bc 100644 --- a/programming_examples/ml/conv2d/run.lit +++ b/programming_examples/ml/conv2d/run.lit @@ -1,4 +1,4 @@ -// (c) Copyright 2023 Advanced Micro Devices, Inc. +// (c) Copyright 2024 Advanced Micro Devices, Inc. // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // // REQUIRES: ryzen_ai, chess, torch @@ -6,5 +6,5 @@ // RUN: xchesscc_wrapper aie2 -I %aietools/include -DBIT_WIDTH=8 -DINT8_ACT -c %S/../../../aie_kernels/aie2/conv2dk1_i8.cc -o conv2dk1_i8.o // RUN: %python %S/aie2.py | aie-opt -cse -canonicalize -o ./aie.mlir // RUN: %python aiecc.py --xbridge --aie-generate-cdo --aie-generate-ipu --no-compile-host --xclbin-name=aie.xclbin --ipu-insts-name=insts.txt ./aie.mlir -// RUN: %run_on_ipu %python %S/test.py | FileCheck %s +// RUN: %run_on_ipu %python %S/test.py -x aie.xclbin -i insts.txt -k MLIR_AIE | FileCheck %s // CHECK: PASS! 
\ No newline at end of file diff --git a/programming_examples/ml/conv2d/test.py b/programming_examples/ml/conv2d/test.py index 1dc847d8fe..1a8d2e7712 100644 --- a/programming_examples/ml/conv2d/test.py +++ b/programming_examples/ml/conv2d/test.py @@ -14,136 +14,149 @@ import os import numpy as np from aie.utils.xrt import setup_aie, extract_trace, write_out_trace, execute +import aie.utils.test as test_utils torch.use_deterministic_algorithms(True) torch.manual_seed(0) -design = "conv2d" -xclbin_path = os.path.abspath("build/final.xclbin") -insts_path = os.path.abspath("build/insts.txt") - -log_folder = "log/" -if not os.path.exists(log_folder): - os.makedirs(log_folder) - -num_iter = 1 -npu_time_total = 0 -npu_time_min = 9999999 -npu_time_max = 0 -trace_size = 16384 -enable_trace = False -trace_file = "log/trace_" + design + ".txt" -# ------------------------------------------------------ -# Configure this to match your design's buffer size -# ------------------------------------------------------ -dtype_in = np.dtype("int8") -dtype_wts = np.dtype("int8") -dtype_out = np.dtype("int8") - -shape_total_wts = (4096, 1) -shape_in_act = (32, 8, 32, 8) #'YCXC8' , 'CYX' -shape_in_wts1 = (8, 8, 1, 1, 8, 8) # out,in,ky,kx,in8,out8 -shape_out = (32, 8, 32, 8) - -# ------------------------------------------------------ -# Initialize activation, weights, scaling factor for int8 model -# ------------------------------------------------------ -int_inp = torch.randint(1, 20, (1, 64, 32, 32)).type(torch.FloatTensor) -int_weight = torch.randint(50, 80, (64, 64, 1, 1)).type(torch.FloatTensor) -conv_scale = 7.6294e-06 # scale to convert int8 output to floating point -int8_scale = 0.0078 # scale to convert int8 output to floating point -min = -128 -max = 127 -# ------------------------------------------------------ -# Get device, load the xclbin & kernel and register them -# ------------------------------------------------------ -app = setup_aie( - xclbin_path, - insts_path, - 
shape_in_act, - dtype_in, - shape_total_wts, - dtype_wts, - shape_out, - dtype_out, - enable_trace=enable_trace, - trace_size=trace_size, -) - - -# ------------------------------------------------------ -# Define your golden reference -# ------------------------------------------------------ -class conv2d_int_model(nn.Module): - def __init__(self, in_planes=64, planes=64): - super(conv2d_int_model, self).__init__() - self.conv = nn.Conv2d(64, 64, kernel_size=1, bias=False) - - def forward(self, x): - out_int = self.conv(x) - out_quant = out_int * conv_scale # int8 x int8 leads to int32 output - out_float = int8_scale * torch.clamp( - torch.round(out_quant / int8_scale), min, max - ) # converting to int8 range - return out_float - - -# ------------------------------------------------------ -# Pytorch baseline -# ------------------------------------------------------ -model = conv2d_int_model() -model.eval() -model.conv.weight.data.copy_(int_weight) - -golden_output = model(int_inp) - -# ------------------------------------------------------ -# Reorder input data-layout -# ------------------------------------------------------ -ds = DataShaper() -before_input = int_inp.squeeze().data.numpy().astype(dtype_in) -before_input.tofile(log_folder + "/before_ifm_mem_fmt_1x1.txt", sep=",", format="%d") -ifm_mem_fmt = ds.reorder_mat(before_input, "YCXC8", "CYX") -ifm_mem_fmt.tofile(log_folder + "/after_ifm_mem_fmt_1x1.txt", sep=",", format="%d") - -wts1 = ds.reorder_mat(int_weight.data.numpy().astype(dtype_wts), "OIYXI8O8", "OIYX") -total_wts = np.concatenate((wts1), axis=None) -total_wts.tofile(log_folder + "/weights_mem_fmt_final.txt", sep=",", format="%d") - -# ------------------------------------------------------ -# Main run loop -# ------------------------------------------------------ -for i in range(num_iter): - start = time.time_ns() - aie_output = execute(app, ifm_mem_fmt, total_wts) * int8_scale - stop = time.time_ns() - - if enable_trace: - aie_output, trace = 
extract_trace(aie_output, shape_out, dtype_out, trace_size) - write_out_trace(trace, trace_file) - - npu_time = stop - start - npu_time_total = npu_time_total + npu_time - -# ------------------------------------------------------ -# Reorder output data-layout -# ------------------------------------------------------ -temp_out = aie_output.reshape(32, 8, 32, 8) -temp_out = ds.reorder_mat(temp_out, "CDYX", "YCXD") -ofm_mem_fmt = temp_out.reshape(64, 32, 32) -ofm_mem_fmt.tofile(log_folder + "/after_ofm_mem_fmt_final.txt", sep=",", format="%d") -ofm_mem_fmt_out = torch.from_numpy(ofm_mem_fmt).unsqueeze(0) - -# ------------------------------------------------------ -# Compare the AIE output and the golden reference -# ------------------------------------------------------ - -print("\nAvg NPU time: {}us.".format(int((npu_time_total / num_iter) / 1000))) - -assert np.allclose( - ofm_mem_fmt_out.detach().numpy(), - golden_output.detach().numpy(), - rtol=0, - atol=2 * int8_scale, -) -print("\nPASS!\n") + +def main(opts): + design = "conv2d" + xclbin_path = opts.xclbin + insts_path = opts.instr + + log_folder = "log/" + if not os.path.exists(log_folder): + os.makedirs(log_folder) + + num_iter = 1 + npu_time_total = 0 + npu_time_min = 9999999 + npu_time_max = 0 + trace_size = 16384 + enable_trace = False + trace_file = "log/trace_" + design + ".txt" + # ------------------------------------------------------ + # Configure this to match your design's buffer size + # ------------------------------------------------------ + dtype_in = np.dtype("int8") + dtype_wts = np.dtype("int8") + dtype_out = np.dtype("int8") + + shape_total_wts = (4096, 1) + shape_in_act = (32, 8, 32, 8) #'YCXC8' , 'CYX' + shape_in_wts1 = (8, 8, 1, 1, 8, 8) # out,in,ky,kx,in8,out8 + shape_out = (32, 8, 32, 8) + + # ------------------------------------------------------ + # Initialize activation, weights, scaling factor for int8 model + # ------------------------------------------------------ + int_inp = 
torch.randint(1, 20, (1, 64, 32, 32)).type(torch.FloatTensor) + int_weight = torch.randint(50, 80, (64, 64, 1, 1)).type(torch.FloatTensor) + conv_scale = 7.6294e-06 # scale to convert int8 output to floating point + int8_scale = 0.0078 # scale to convert int8 output to floating point + min = -128 + max = 127 + # ------------------------------------------------------ + # Get device, load the xclbin & kernel and register them + # ------------------------------------------------------ + app = setup_aie( + xclbin_path, + insts_path, + shape_in_act, + dtype_in, + shape_total_wts, + dtype_wts, + shape_out, + dtype_out, + enable_trace=enable_trace, + trace_size=trace_size, + ) + + # ------------------------------------------------------ + # Define your golden reference + # ------------------------------------------------------ + class conv2d_int_model(nn.Module): + def __init__(self, in_planes=64, planes=64): + super(conv2d_int_model, self).__init__() + self.conv = nn.Conv2d(64, 64, kernel_size=1, bias=False) + + def forward(self, x): + out_int = self.conv(x) + out_quant = out_int * conv_scale # int8 x int8 leads to int32 output + out_float = int8_scale * torch.clamp( + torch.round(out_quant / int8_scale), min, max + ) # converting to int8 range + return out_float + + # ------------------------------------------------------ + # Pytorch baseline + # ------------------------------------------------------ + model = conv2d_int_model() + model.eval() + model.conv.weight.data.copy_(int_weight) + + golden_output = model(int_inp) + + # ------------------------------------------------------ + # Reorder input data-layout + # ------------------------------------------------------ + ds = DataShaper() + before_input = int_inp.squeeze().data.numpy().astype(dtype_in) + before_input.tofile( + log_folder + "/before_ifm_mem_fmt_1x1.txt", sep=",", format="%d" + ) + ifm_mem_fmt = ds.reorder_mat(before_input, "YCXC8", "CYX") + ifm_mem_fmt.tofile(log_folder + "/after_ifm_mem_fmt_1x1.txt", 
sep=",", format="%d") + + wts1 = ds.reorder_mat(int_weight.data.numpy().astype(dtype_wts), "OIYXI8O8", "OIYX") + total_wts = np.concatenate((wts1), axis=None) + total_wts.tofile(log_folder + "/weights_mem_fmt_final.txt", sep=",", format="%d") + + # ------------------------------------------------------ + # Main run loop + # ------------------------------------------------------ + for i in range(num_iter): + start = time.time_ns() + aie_output = execute(app, ifm_mem_fmt, total_wts) * int8_scale + stop = time.time_ns() + + if enable_trace: + aie_output, trace = extract_trace( + aie_output, shape_out, dtype_out, trace_size + ) + write_out_trace(trace, trace_file) + + npu_time = stop - start + npu_time_total = npu_time_total + npu_time + + # ------------------------------------------------------ + # Reorder output data-layout + # ------------------------------------------------------ + temp_out = aie_output.reshape(32, 8, 32, 8) + temp_out = ds.reorder_mat(temp_out, "CDYX", "YCXD") + ofm_mem_fmt = temp_out.reshape(64, 32, 32) + ofm_mem_fmt.tofile( + log_folder + "/after_ofm_mem_fmt_final.txt", sep=",", format="%d" + ) + ofm_mem_fmt_out = torch.from_numpy(ofm_mem_fmt).unsqueeze(0) + + # ------------------------------------------------------ + # Compare the AIE output and the golden reference + # ------------------------------------------------------ + + print("\nAvg NPU time: {}us.".format(int((npu_time_total / num_iter) / 1000))) + + assert np.allclose( + ofm_mem_fmt_out.detach().numpy(), + golden_output.detach().numpy(), + rtol=0, + atol=2 * int8_scale, + ) + print("\nPASS!\n") + + +if __name__ == "__main__": + p = test_utils.create_default_argparser() + opts = p.parse_args(sys.argv[1:]) + main(opts) diff --git a/programming_examples/ml/conv2d_fused_relu/Makefile b/programming_examples/ml/conv2d_fused_relu/Makefile index f804bdd842..7c59ae4877 100755 --- a/programming_examples/ml/conv2d_fused_relu/Makefile +++ b/programming_examples/ml/conv2d_fused_relu/Makefile @@ 
-34,4 +34,4 @@ clean: *.log aie_partition.json *.bin BOOT.BIN _x test.exe run_py: - ${powershell} python3 test.py + ${powershell} python3 test.py -x build/final.xclbin -i build/insts.txt -k MLIR_AIE diff --git a/programming_examples/ml/conv2d_fused_relu/README.md b/programming_examples/ml/conv2d_fused_relu/README.md index 68e7e9b8cf..3f4a2264cd 100644 --- a/programming_examples/ml/conv2d_fused_relu/README.md +++ b/programming_examples/ml/conv2d_fused_relu/README.md @@ -88,12 +88,5 @@ make To run the design: ``` -make run -``` - -### Prerequisites -To install the dependencies, run the following command: -``` -pip install -r requirements.txt - +make run_py ``` \ No newline at end of file diff --git a/programming_examples/ml/conv2d_fused_relu/run.lit b/programming_examples/ml/conv2d_fused_relu/run.lit index 0c122f451e..cfddde9013 100644 --- a/programming_examples/ml/conv2d_fused_relu/run.lit +++ b/programming_examples/ml/conv2d_fused_relu/run.lit @@ -6,5 +6,5 @@ // RUN: xchesscc_wrapper aie2 -I %aietools/include -DINT8_ACT -DBIT_WIDTH=8 -c %S/../../../aie_kernels/aie2/conv2dk1.cc -o conv2dk1.o // RUN: %python %S/aie2.py | aie-opt -cse -canonicalize -o ./aie.mlir // RUN: %python aiecc.py --xbridge --aie-generate-cdo --aie-generate-ipu --no-compile-host --xclbin-name=aie.xclbin --ipu-insts-name=insts.txt ./aie.mlir -// RUN: %run_on_ipu %python %S/test.py | FileCheck %s +// RUN: %run_on_ipu %python %S/test.py -x aie.xclbin -i insts.txt -k MLIR_AIE | FileCheck %s // CHECK: PASS! 
\ No newline at end of file diff --git a/programming_examples/ml/conv2d_fused_relu/test.py b/programming_examples/ml/conv2d_fused_relu/test.py index 5bfe139112..6fe407faaa 100644 --- a/programming_examples/ml/conv2d_fused_relu/test.py +++ b/programming_examples/ml/conv2d_fused_relu/test.py @@ -14,138 +14,151 @@ import os import numpy as np from aie.utils.xrt import setup_aie, extract_trace, write_out_trace, execute +import aie.utils.test as test_utils torch.use_deterministic_algorithms(True) torch.manual_seed(0) -design = "conv2d_with_relu" -xclbin_path = os.path.abspath("build/final.xclbin") -insts_path = os.path.abspath("build/insts.txt") - -log_folder = "log/" -if not os.path.exists(log_folder): - os.makedirs(log_folder) - -num_iter = 1 -npu_time_total = 0 -npu_time_min = 9999999 -npu_time_max = 0 -trace_size = 16384 -enable_trace = False -trace_file = "log/trace_" + design + ".txt" -# ------------------------------------------------------ -# Configure this to match your design's buffer size -# ------------------------------------------------------ -dtype_in = np.dtype("int8") -dtype_wts = np.dtype("int8") -dtype_out = np.dtype("uint8") - -shape_total_wts = (4096, 1) -shape_in_act = (32, 8, 32, 8) #'YCXC8' , 'CYX' -shape_in_wts1 = (8, 8, 1, 1, 8, 8) # out,in,ky,kx,in8,out8 -shape_out = (32, 8, 32, 8) - -# ------------------------------------------------------ -# Initialize activation, weights, scaling factor for int8 model -# ------------------------------------------------------ -int_inp = torch.randint(1, 100, (1, 64, 32, 32)).type(torch.FloatTensor) -int_weight = torch.randint(50, 100, (64, 64, 1, 1)).type(torch.FloatTensor) -conv_scale = 0.0039 # scale to convert int8 output to floating point -relu_scale = 0.0078 # scale to convert int8 output to floating point -min = 0 -max = 255 - -# ------------------------------------------------------ -# Get device, load the xclbin & kernel and register them -# ------------------------------------------------------ -app 
= setup_aie( - xclbin_path, - insts_path, - shape_in_act, - dtype_in, - shape_total_wts, - dtype_wts, - shape_out, - dtype_out, - enable_trace=enable_trace, - trace_size=trace_size, -) - - -# ------------------------------------------------------ -# Define your golden reference -# ------------------------------------------------------ -class conv2d_relu_int_model(nn.Module): - def __init__(self, in_planes=64, planes=64): - super(conv2d_relu_int_model, self).__init__() - self.conv = nn.Conv2d(64, 64, kernel_size=1, bias=False) - self.relu = nn.ReLU() - - def forward(self, x): - out_int = self.conv(x) - out_float = out_int * conv_scale - out_int = self.relu(out_float) - out_float = relu_scale * torch.clamp( - torch.round(out_int / relu_scale), min, max - ) # converting to int to do proper clipping - return out_float - - -# ------------------------------------------------------ -# Pytorch baseline -# ------------------------------------------------------ -model = conv2d_relu_int_model() -model.eval() -model.conv.weight.data.copy_(int_weight) -golden_output = model(int_inp) - -# ------------------------------------------------------ -# Reorder input data-layout -# ------------------------------------------------------ -ds = DataShaper() -before_input = int_inp.squeeze().data.numpy().astype(dtype_in) -before_input.tofile(log_folder + "/before_ifm_mem_fmt_1x1.txt", sep=",", format="%d") -ifm_mem_fmt = ds.reorder_mat(before_input, "YCXC8", "CYX") -ifm_mem_fmt.tofile(log_folder + "/after_ifm_mem_fmt_1x1.txt", sep=",", format="%d") - -wts1 = ds.reorder_mat(int_weight.data.numpy().astype(dtype_wts), "OIYXI8O8", "OIYX") -total_wts = np.concatenate((wts1), axis=None) -total_wts.tofile(log_folder + "/weights_mem_fmt_final.txt", sep=",", format="%d") - -# ------------------------------------------------------ -# Main run loop -# ------------------------------------------------------ -for i in range(num_iter): - start = time.time_ns() - aie_output = execute(app, ifm_mem_fmt, 
total_wts) * relu_scale - stop = time.time_ns() - - if enable_trace: - aie_output, trace = extract_trace(aie_output, shape_out, dtype_out, trace_size) - write_out_trace(trace, trace_file) - - npu_time = stop - start - npu_time_total = npu_time_total + npu_time - -# ------------------------------------------------------ -# Reorder output data-layout -# ------------------------------------------------------ -temp_out = aie_output.reshape(32, 8, 32, 8) -temp_out = ds.reorder_mat(temp_out, "CDYX", "YCXD") -ofm_mem_fmt = temp_out.reshape(64, 32, 32) -ofm_mem_fmt.tofile(log_folder + "/after_ofm_mem_fmt_final.txt", sep=",", format="%d") -ofm_mem_fmt_out = torch.from_numpy(ofm_mem_fmt).unsqueeze(0) - -# ------------------------------------------------------ -# Compare the AIE output and the golden reference -# ------------------------------------------------------ -print("\nAvg NPU time: {}us.".format(int((npu_time_total / num_iter) / 1000))) - -assert np.allclose( - ofm_mem_fmt_out.detach().numpy(), - golden_output.detach().numpy(), - rtol=0, - atol=2 * relu_scale, -) - -print("\nPASS!\n") + +def main(opts): + design = "conv2d_with_relu" + xclbin_path = opts.xclbin + insts_path = opts.instr + + log_folder = "log/" + if not os.path.exists(log_folder): + os.makedirs(log_folder) + + num_iter = 1 + npu_time_total = 0 + npu_time_min = 9999999 + npu_time_max = 0 + trace_size = 16384 + enable_trace = False + trace_file = "log/trace_" + design + ".txt" + # ------------------------------------------------------ + # Configure this to match your design's buffer size + # ------------------------------------------------------ + dtype_in = np.dtype("int8") + dtype_wts = np.dtype("int8") + dtype_out = np.dtype("uint8") + + shape_total_wts = (4096, 1) + shape_in_act = (32, 8, 32, 8) #'YCXC8' , 'CYX' + shape_in_wts1 = (8, 8, 1, 1, 8, 8) # out,in,ky,kx,in8,out8 + shape_out = (32, 8, 32, 8) + + # ------------------------------------------------------ + # Initialize activation, weights, 
scaling factor for int8 model + # ------------------------------------------------------ + int_inp = torch.randint(1, 100, (1, 64, 32, 32)).type(torch.FloatTensor) + int_weight = torch.randint(50, 100, (64, 64, 1, 1)).type(torch.FloatTensor) + conv_scale = 0.0039 # scale to convert int8 output to floating point + relu_scale = 0.0078 # scale to convert int8 output to floating point + min = 0 + max = 255 + + # ------------------------------------------------------ + # Get device, load the xclbin & kernel and register them + # ------------------------------------------------------ + app = setup_aie( + xclbin_path, + insts_path, + shape_in_act, + dtype_in, + shape_total_wts, + dtype_wts, + shape_out, + dtype_out, + enable_trace=enable_trace, + trace_size=trace_size, + ) + + # ------------------------------------------------------ + # Define your golden reference + # ------------------------------------------------------ + class conv2d_relu_int_model(nn.Module): + def __init__(self, in_planes=64, planes=64): + super(conv2d_relu_int_model, self).__init__() + self.conv = nn.Conv2d(64, 64, kernel_size=1, bias=False) + self.relu = nn.ReLU() + + def forward(self, x): + out_int = self.conv(x) + out_float = out_int * conv_scale + out_int = self.relu(out_float) + out_float = relu_scale * torch.clamp( + torch.round(out_int / relu_scale), min, max + ) # converting to int to do proper clipping + return out_float + + # ------------------------------------------------------ + # Pytorch baseline + # ------------------------------------------------------ + model = conv2d_relu_int_model() + model.eval() + model.conv.weight.data.copy_(int_weight) + golden_output = model(int_inp) + + # ------------------------------------------------------ + # Reorder input data-layout + # ------------------------------------------------------ + ds = DataShaper() + before_input = int_inp.squeeze().data.numpy().astype(dtype_in) + before_input.tofile( + log_folder + "/before_ifm_mem_fmt_1x1.txt", sep=",", 
format="%d" + ) + ifm_mem_fmt = ds.reorder_mat(before_input, "YCXC8", "CYX") + ifm_mem_fmt.tofile(log_folder + "/after_ifm_mem_fmt_1x1.txt", sep=",", format="%d") + + wts1 = ds.reorder_mat(int_weight.data.numpy().astype(dtype_wts), "OIYXI8O8", "OIYX") + total_wts = np.concatenate((wts1), axis=None) + total_wts.tofile(log_folder + "/weights_mem_fmt_final.txt", sep=",", format="%d") + + # ------------------------------------------------------ + # Main run loop + # ------------------------------------------------------ + for i in range(num_iter): + start = time.time_ns() + aie_output = execute(app, ifm_mem_fmt, total_wts) * relu_scale + stop = time.time_ns() + + if enable_trace: + aie_output, trace = extract_trace( + aie_output, shape_out, dtype_out, trace_size + ) + write_out_trace(trace, trace_file) + + npu_time = stop - start + npu_time_total = npu_time_total + npu_time + + # ------------------------------------------------------ + # Reorder output data-layout + # ------------------------------------------------------ + temp_out = aie_output.reshape(32, 8, 32, 8) + temp_out = ds.reorder_mat(temp_out, "CDYX", "YCXD") + ofm_mem_fmt = temp_out.reshape(64, 32, 32) + ofm_mem_fmt.tofile( + log_folder + "/after_ofm_mem_fmt_final.txt", sep=",", format="%d" + ) + ofm_mem_fmt_out = torch.from_numpy(ofm_mem_fmt).unsqueeze(0) + + # ------------------------------------------------------ + # Compare the AIE output and the golden reference + # ------------------------------------------------------ + print("\nAvg NPU time: {}us.".format(int((npu_time_total / num_iter) / 1000))) + + assert np.allclose( + ofm_mem_fmt_out.detach().numpy(), + golden_output.detach().numpy(), + rtol=0, + atol=2 * relu_scale, + ) + + print("\nPASS!\n") + + +if __name__ == "__main__": + p = test_utils.create_default_argparser() + opts = p.parse_args(sys.argv[1:]) + main(opts) diff --git a/programming_examples/ml/resnet/README.md b/programming_examples/ml/resnet/README.md index 6382079c62..de4cc92535 
100755 --- a/programming_examples/ml/resnet/README.md +++ b/programming_examples/ml/resnet/README.md @@ -107,14 +107,6 @@ To run the design: make run_py ``` -### Prerequisites - -To install the dependencies, run the following command: -``` -pip install -r requirements.txt - -``` - ## References [1] He, K., Zhang, X., Ren, S., & Sun, J. (2016). Deep residual learning for image recognition. In Proceedings of the IEEE conference on computer vision and pattern recognition (pp. 770-778). diff --git a/programming_examples/ml/resnet/layers_conv2_x/Makefile b/programming_examples/ml/resnet/layers_conv2_x/Makefile index d8f1b7261a..6218e61fb5 100755 --- a/programming_examples/ml/resnet/layers_conv2_x/Makefile +++ b/programming_examples/ml/resnet/layers_conv2_x/Makefile @@ -44,4 +44,4 @@ clean: *.log aie_partition.json *.bin BOOT.BIN _x test.exe run_py: - ${powershell} python3 test.py + ${powershell} python3 test.py -x build/final.xclbin -i build/insts.txt -k MLIR_AIE \ No newline at end of file diff --git a/programming_examples/ml/resnet/layers_conv2_x/aie2.py b/programming_examples/ml/resnet/layers_conv2_x/aie2.py index 235b5c5308..f5243070d9 100755 --- a/programming_examples/ml/resnet/layers_conv2_x/aie2.py +++ b/programming_examples/ml/resnet/layers_conv2_x/aie2.py @@ -7,8 +7,8 @@ from aie.dialects.aie import * from aie.dialects.aiex import * +from aie.dialects.scf import * from aie.extras.dialects.ext import memref, arith -from aie.dialects.scf import for_, yield_ from aie.extras.context import mlir_mod_ctx from aie.ir import MemRefType, TypeAttr diff --git a/programming_examples/ml/resnet/layers_conv2_x/run.lit b/programming_examples/ml/resnet/layers_conv2_x/run.lit index 61f43e45e6..c35a868772 100755 --- a/programming_examples/ml/resnet/layers_conv2_x/run.lit +++ b/programming_examples/ml/resnet/layers_conv2_x/run.lit @@ -10,5 +10,5 @@ // RUN: xchesscc_wrapper aie2 -I %aietools/include -DBIT_WIDTH=8 -DSCALAR -DUINT8_ACT -c 
%S/../../../../aie_kernels/aie2/conv2dk1_skip.cc -o conv2dk1_skip.o // RUN: %python %S/aie2.py | aie-opt -cse -canonicalize -o ./aie.mlir // RUN: %python aiecc.py --xbridge --aie-generate-cdo --aie-generate-ipu --no-compile-host --xclbin-name=aie.xclbin --ipu-insts-name=insts.txt ./aie.mlir -// RUN: %run_on_ipu %python %S/test.py | FileCheck %s +// RUN: %run_on_ipu %python %S/test.py -x aie.xclbin -i insts.txt -k MLIR_AIE | FileCheck %s // CHECK: PASS! \ No newline at end of file diff --git a/programming_examples/ml/resnet/layers_conv2_x/test.py b/programming_examples/ml/resnet/layers_conv2_x/test.py index 02dc01b127..48b45b99ae 100755 --- a/programming_examples/ml/resnet/layers_conv2_x/test.py +++ b/programming_examples/ml/resnet/layers_conv2_x/test.py @@ -14,423 +14,473 @@ import os import numpy as np from aie.utils.xrt import setup_aie, extract_trace, write_out_trace, execute +import aie.utils.test as test_utils torch.use_deterministic_algorithms(True) torch.manual_seed(0) -design = "resnet_conv2_x_int8" -xclbin_path = os.path.abspath("build/final.xclbin") -insts_path = os.path.abspath("build/insts.txt") - -log_folder = "log/" -if not os.path.exists(log_folder): - os.makedirs(log_folder) - -num_iter = 1 -npu_time_total = 0 -npu_time_min = 9999999 -npu_time_max = 0 -trace_size = 16384 -enable_trace = False -trace_file = "log/trace_" + design + ".txt" -# ------------------------------------------------------ -# Configure this to match your design's buffer size -# ------------------------------------------------------ -dtype_in = np.dtype("int8") -dtype_wts = np.dtype("int8") -dtype_out = np.dtype("uint8") - -shape_in_act = (32, 8, 32, 8) -shape_total_wts = (212992, 1) -shape_out = (32, 32, 32, 8) - -# ------------------------------------------------------ -# Initialize activation, weights, scaling factor for int8 model -# ------------------------------------------------------ -int_inp = torch.randint(1, 10, (1, 64, 32, 32)).type(torch.FloatTensor) 
-block_0_int_weight_1 = torch.randint(10, 20, (64, 64, 1, 1)).type(torch.FloatTensor) -block_0_int_weight_2 = torch.randint(10, 20, (64, 64, 3, 3)).type(torch.FloatTensor) -block_0_int_weight_3 = torch.randint(10, 20, (256, 64, 1, 1)).type(torch.FloatTensor) -block_0_int_weight_skip = torch.randint(10, 20, (256, 64, 1, 1)).type(torch.FloatTensor) - -block_1_int_weight_1 = torch.randint(20, 30, (64, 256, 1, 1)).type(torch.FloatTensor) -block_1_int_weight_2 = torch.randint(20, 30, (64, 64, 3, 3)).type(torch.FloatTensor) -block_1_int_weight_3 = torch.randint(20, 30, (256, 64, 1, 1)).type(torch.FloatTensor) - -block_2_int_weight_1 = torch.randint(30, 40, (64, 256, 1, 1)).type(torch.FloatTensor) -block_2_int_weight_2 = torch.randint(30, 40, (64, 64, 3, 3)).type(torch.FloatTensor) -block_2_int_weight_3 = torch.randint(30, 40, (256, 64, 1, 1)).type(torch.FloatTensor) - -init_scale = 0.5 -block_0_relu_1 = 0.5 -block_0_relu_2 = 0.5 -block_0_relu_3 = 0.5 - -block_0_weight_scale1 = 0.5 -block_0_weight_scale2 = 0.5 -block_0_weight_scale3 = 0.5 -block_0_weight_scale_skip = 0.5 - -block_1_relu_1 = 0.5 -block_1_relu_2 = 0.5 -block_1_relu_3 = 0.5 - -block_1_weight_scale1 = 0.5 -block_1_weight_scale2 = 0.5 -block_1_weight_scale3 = 0.5 -block_1_quant_add_1 = 0.5 - -block_2_relu_1 = 0.5 -block_2_relu_2 = 0.5 -block_2_relu_3 = 0.5 - -block_2_weight_scale1 = 0.5 -block_2_weight_scale2 = 0.5 -block_2_weight_scale3 = 0.5 -block_2_quant_add_1 = 0.5 - -block_0_combined_scale1 = -math.log2( - init_scale * block_0_weight_scale1 / block_0_relu_1 -) # RHS after first conv1x1 | clip 0-->255 -block_0_combined_scale2 = -math.log2( - block_0_relu_1 * block_0_weight_scale2 / block_0_relu_2 -) # RHS after second conv3x3 | clip 0-->255 -block_0_combined_scale3 = -math.log2( - block_0_relu_2 * block_0_weight_scale3 / init_scale -) # RHS after third conv1x1 | clip -128-->+127 -block_0_combined_scale_skip = -math.log2( - init_scale * block_0_weight_scale_skip / init_scale -) # LHS after conv1x1 | clip 
-128-->+127 -block_0_combined_scale4 = -math.log2( - init_scale / block_0_relu_3 -) # After addition | clip 0-->255 - -block_1_combined_scale1 = -math.log2( - block_0_relu_3 * block_1_weight_scale1 / block_1_relu_1 -) # RHS after first conv1x1 | clip 0-->255 -block_1_combined_scale2 = -math.log2( - block_1_relu_1 * block_1_weight_scale2 / block_1_relu_2 -) # RHS after second conv3x3 | clip 0-->255 -block_1_combined_scale3 = -math.log2( - block_1_relu_2 * block_1_weight_scale3 / block_1_quant_add_1 -) # RHS after third conv1x1 | clip -128-->+127 -block_1_combined_scale4 = -math.log2( - block_1_quant_add_1 / block_1_relu_3 -) # After addition | clip 0-->255 - -block_2_combined_scale1 = -math.log2( - block_1_relu_3 * block_2_weight_scale1 / block_2_relu_1 -) # RHS after first conv1x1 | clip 0-->255 -block_2_combined_scale2 = -math.log2( - block_2_relu_1 * block_2_weight_scale2 / block_2_relu_2 -) # RHS after second conv3x3 | clip 0-->255 -block_2_combined_scale3 = -math.log2( - block_2_relu_2 * block_2_weight_scale3 / block_2_quant_add_1 -) # RHS after third conv1x1 | clip -128-->+127 -block_2_combined_scale4 = -math.log2( - block_2_quant_add_1 / block_2_relu_3 -) # After addition | clip 0-->255 - -min = 0 -max = 255 - -# ------------------------------------------------------ -# Get device, load the xclbin & kernel and register them -# ------------------------------------------------------ -app = setup_aie( - xclbin_path, - insts_path, - shape_in_act, - dtype_in, - shape_total_wts, - dtype_wts, - shape_out, - dtype_out, - enable_trace=enable_trace, - trace_size=trace_size, -) - - -# ------------------------------------------------------ -# Define your golden reference -# ------------------------------------------------------ -class resnet_conv2_x_int8(nn.Module): - expansion = 4 - - def __init__(self, in_planes=64, planes=64): - super(resnet_conv2_x_int8, self).__init__() - - self.shortcut = nn.Conv2d( - in_planes, self.expansion * planes, kernel_size=1, bias=False - 
) - # Bottleneck 0 - self.block_0_conv1 = nn.Conv2d(in_planes, planes, kernel_size=1, bias=False) - self.block_0_conv2 = nn.Conv2d( - planes, planes, kernel_size=3, padding=1, padding_mode="zeros", bias=False - ) - self.block_0_conv3 = nn.Conv2d( - planes, self.expansion * planes, kernel_size=1, bias=False - ) - - self.block_0_relu1 = nn.ReLU() - self.block_0_relu2 = nn.ReLU() - self.block_0_relu3 = nn.ReLU() - - # Bottleneck 1 - self.block_1_conv1 = nn.Conv2d( - self.expansion * planes, planes, kernel_size=1, bias=False - ) - self.block_1_conv2 = nn.Conv2d( - planes, planes, kernel_size=3, padding=1, padding_mode="zeros", bias=False - ) - self.block_1_conv3 = nn.Conv2d( - planes, self.expansion * planes, kernel_size=1, bias=False - ) - - self.block_1_relu1 = nn.ReLU() - self.block_1_relu2 = nn.ReLU() - self.block_1_relu3 = nn.ReLU() - - # Bottleneck 2 - self.block_2_conv1 = nn.Conv2d( - self.expansion * planes, planes, kernel_size=1, bias=False - ) - self.block_2_conv2 = nn.Conv2d( - planes, planes, kernel_size=3, padding=1, padding_mode="zeros", bias=False - ) - self.block_2_conv3 = nn.Conv2d( - planes, self.expansion * planes, kernel_size=1, bias=False - ) - - self.block_2_relu1 = nn.ReLU() - self.block_2_relu2 = nn.ReLU() - self.block_2_relu3 = nn.ReLU() - - def forward(self, x): - # **************** Bottleneck 0 **************** - block_0_conv1_out = self.block_0_conv1(x) * init_scale * block_0_weight_scale1 - block_0_relu1_out = torch.clamp( - torch.round(self.block_0_relu1(block_0_conv1_out) / block_0_relu_1), - min, - max, - ) # convert to int and apply relu - block_0_conv2_out = ( - self.block_0_conv2(block_0_relu1_out) - * block_0_relu_1 - * block_0_weight_scale2 - ) - block_0_relu2_out = torch.clamp( - torch.round(self.block_0_relu2(block_0_conv2_out) / block_0_relu_2), - min, - max, - ) - block_0_conv3_out = ( - self.block_0_conv3(block_0_relu2_out) - * block_0_relu_2 - * block_0_weight_scale3 - ) - block_0_rhf_same_scale = torch.clamp( - 
torch.round(block_0_conv3_out / init_scale), -128, 127 - ) - - block_0_lhs_conv = self.shortcut(x) * init_scale * block_0_weight_scale_skip - block_0_lhs_same_scale = torch.clamp( - torch.round(block_0_lhs_conv / init_scale), -128, 127 - ) - # convert to int and apply relu - - block_0_skip_add = init_scale * ( - block_0_rhf_same_scale + block_0_lhs_same_scale - ) - block_0_final_out = torch.clamp( - torch.round(self.block_0_relu3(block_0_skip_add) / block_0_relu_3), min, max - ) - # **************** Bottleneck 1 **************** - block_1_conv1_out = ( - self.block_1_conv1(block_0_final_out) - * block_0_relu_3 - * block_1_weight_scale1 - ) - block_1_relu1_out = torch.clamp( - torch.round(self.block_1_relu1(block_1_conv1_out) / block_1_relu_1), - min, - max, - ) # convert to int and apply relu - block_1_conv2_out = ( - self.block_1_conv2(block_1_relu1_out) - * block_1_relu_1 - * block_1_weight_scale2 - ) - block_1_relu2_out = torch.clamp( - torch.round(self.block_1_relu2(block_1_conv2_out) / block_1_relu_2), - min, - max, - ) - block_1_conv3_out = ( - self.block_1_conv3(block_1_relu2_out) - * block_1_relu_2 - * block_1_weight_scale3 - ) - block_1_rhf_same_scale = torch.clamp( - torch.round(block_1_conv3_out / block_0_relu_3), -128, 127 - ) - - block_1_skip_add = block_0_relu_3 * (block_1_rhf_same_scale + block_0_final_out) - block_1_final_out = torch.clamp( - torch.round(self.block_1_relu3(block_1_skip_add) / block_1_relu_3), min, max - ) - - # **************** Bottleneck 2 **************** - block_2_conv1_out = ( - self.block_2_conv1(block_1_final_out) - * block_1_relu_3 - * block_2_weight_scale1 - ) - block_2_relu1_out = torch.clamp( - torch.round(self.block_2_relu1(block_2_conv1_out) / block_2_relu_1), - min, - max, - ) # convert to int and apply relu - block_2_conv2_out = ( - self.block_2_conv2(block_2_relu1_out) - * block_2_relu_1 - * block_2_weight_scale2 - ) - block_2_relu2_out = torch.clamp( - torch.round(self.block_2_relu2(block_2_conv2_out) / 
block_2_relu_2), - min, - max, - ) - block_2_conv3_out = ( - self.block_2_conv3(block_2_relu2_out) - * block_2_relu_2 - * block_2_weight_scale3 - ) - block_2_rhf_same_scale = torch.clamp( - torch.round(block_2_conv3_out / block_1_relu_3), -128, 127 - ) - - block_2_skip_add = block_1_relu_3 * (block_2_rhf_same_scale + block_1_final_out) - block_2_final_out = block_2_relu_3 * ( - torch.clamp( - torch.round(self.block_2_relu3(block_2_skip_add) / block_2_relu_3), + +def main(opts): + design = "resnet_conv2_x_int8" + xclbin_path = opts.xclbin + insts_path = opts.instr + + log_folder = "log/" + if not os.path.exists(log_folder): + os.makedirs(log_folder) + + num_iter = 1 + npu_time_total = 0 + npu_time_min = 9999999 + npu_time_max = 0 + trace_size = 16384 + enable_trace = False + trace_file = "log/trace_" + design + ".txt" + # ------------------------------------------------------ + # Configure this to match your design's buffer size + # ------------------------------------------------------ + dtype_in = np.dtype("int8") + dtype_wts = np.dtype("int8") + dtype_out = np.dtype("uint8") + + shape_in_act = (32, 8, 32, 8) + shape_total_wts = (212992, 1) + shape_out = (32, 32, 32, 8) + + # ------------------------------------------------------ + # Initialize activation, weights, scaling factor for int8 model + # ------------------------------------------------------ + int_inp = torch.randint(1, 10, (1, 64, 32, 32)).type(torch.FloatTensor) + block_0_int_weight_1 = torch.randint(10, 20, (64, 64, 1, 1)).type(torch.FloatTensor) + block_0_int_weight_2 = torch.randint(10, 20, (64, 64, 3, 3)).type(torch.FloatTensor) + block_0_int_weight_3 = torch.randint(10, 20, (256, 64, 1, 1)).type( + torch.FloatTensor + ) + block_0_int_weight_skip = torch.randint(10, 20, (256, 64, 1, 1)).type( + torch.FloatTensor + ) + + block_1_int_weight_1 = torch.randint(20, 30, (64, 256, 1, 1)).type( + torch.FloatTensor + ) + block_1_int_weight_2 = torch.randint(20, 30, (64, 64, 3, 3)).type(torch.FloatTensor) + 
block_1_int_weight_3 = torch.randint(20, 30, (256, 64, 1, 1)).type( + torch.FloatTensor + ) + + block_2_int_weight_1 = torch.randint(30, 40, (64, 256, 1, 1)).type( + torch.FloatTensor + ) + block_2_int_weight_2 = torch.randint(30, 40, (64, 64, 3, 3)).type(torch.FloatTensor) + block_2_int_weight_3 = torch.randint(30, 40, (256, 64, 1, 1)).type( + torch.FloatTensor + ) + + init_scale = 0.5 + block_0_relu_1 = 0.5 + block_0_relu_2 = 0.5 + block_0_relu_3 = 0.5 + + block_0_weight_scale1 = 0.5 + block_0_weight_scale2 = 0.5 + block_0_weight_scale3 = 0.5 + block_0_weight_scale_skip = 0.5 + + block_1_relu_1 = 0.5 + block_1_relu_2 = 0.5 + block_1_relu_3 = 0.5 + + block_1_weight_scale1 = 0.5 + block_1_weight_scale2 = 0.5 + block_1_weight_scale3 = 0.5 + block_1_quant_add_1 = 0.5 + + block_2_relu_1 = 0.5 + block_2_relu_2 = 0.5 + block_2_relu_3 = 0.5 + + block_2_weight_scale1 = 0.5 + block_2_weight_scale2 = 0.5 + block_2_weight_scale3 = 0.5 + block_2_quant_add_1 = 0.5 + + block_0_combined_scale1 = -math.log2( + init_scale * block_0_weight_scale1 / block_0_relu_1 + ) # RHS after first conv1x1 | clip 0-->255 + block_0_combined_scale2 = -math.log2( + block_0_relu_1 * block_0_weight_scale2 / block_0_relu_2 + ) # RHS after second conv3x3 | clip 0-->255 + block_0_combined_scale3 = -math.log2( + block_0_relu_2 * block_0_weight_scale3 / init_scale + ) # RHS after third conv1x1 | clip -128-->+127 + block_0_combined_scale_skip = -math.log2( + init_scale * block_0_weight_scale_skip / init_scale + ) # LHS after conv1x1 | clip -128-->+127 + block_0_combined_scale4 = -math.log2( + init_scale / block_0_relu_3 + ) # After addition | clip 0-->255 + + block_1_combined_scale1 = -math.log2( + block_0_relu_3 * block_1_weight_scale1 / block_1_relu_1 + ) # RHS after first conv1x1 | clip 0-->255 + block_1_combined_scale2 = -math.log2( + block_1_relu_1 * block_1_weight_scale2 / block_1_relu_2 + ) # RHS after second conv3x3 | clip 0-->255 + block_1_combined_scale3 = -math.log2( + block_1_relu_2 * 
block_1_weight_scale3 / block_1_quant_add_1 + ) # RHS after third conv1x1 | clip -128-->+127 + block_1_combined_scale4 = -math.log2( + block_1_quant_add_1 / block_1_relu_3 + ) # After addition | clip 0-->255 + + block_2_combined_scale1 = -math.log2( + block_1_relu_3 * block_2_weight_scale1 / block_2_relu_1 + ) # RHS after first conv1x1 | clip 0-->255 + block_2_combined_scale2 = -math.log2( + block_2_relu_1 * block_2_weight_scale2 / block_2_relu_2 + ) # RHS after second conv3x3 | clip 0-->255 + block_2_combined_scale3 = -math.log2( + block_2_relu_2 * block_2_weight_scale3 / block_2_quant_add_1 + ) # RHS after third conv1x1 | clip -128-->+127 + block_2_combined_scale4 = -math.log2( + block_2_quant_add_1 / block_2_relu_3 + ) # After addition | clip 0-->255 + + min = 0 + max = 255 + + # ------------------------------------------------------ + # Get device, load the xclbin & kernel and register them + # ------------------------------------------------------ + app = setup_aie( + xclbin_path, + insts_path, + shape_in_act, + dtype_in, + shape_total_wts, + dtype_wts, + shape_out, + dtype_out, + enable_trace=enable_trace, + trace_size=trace_size, + ) + + # ------------------------------------------------------ + # Define your golden reference + # ------------------------------------------------------ + class resnet_conv2_x_int8(nn.Module): + expansion = 4 + + def __init__(self, in_planes=64, planes=64): + super(resnet_conv2_x_int8, self).__init__() + + self.shortcut = nn.Conv2d( + in_planes, self.expansion * planes, kernel_size=1, bias=False + ) + # Bottleneck 0 + self.block_0_conv1 = nn.Conv2d(in_planes, planes, kernel_size=1, bias=False) + self.block_0_conv2 = nn.Conv2d( + planes, + planes, + kernel_size=3, + padding=1, + padding_mode="zeros", + bias=False, + ) + self.block_0_conv3 = nn.Conv2d( + planes, self.expansion * planes, kernel_size=1, bias=False + ) + + self.block_0_relu1 = nn.ReLU() + self.block_0_relu2 = nn.ReLU() + self.block_0_relu3 = nn.ReLU() + + # 
Bottleneck 1 + self.block_1_conv1 = nn.Conv2d( + self.expansion * planes, planes, kernel_size=1, bias=False + ) + self.block_1_conv2 = nn.Conv2d( + planes, + planes, + kernel_size=3, + padding=1, + padding_mode="zeros", + bias=False, + ) + self.block_1_conv3 = nn.Conv2d( + planes, self.expansion * planes, kernel_size=1, bias=False + ) + + self.block_1_relu1 = nn.ReLU() + self.block_1_relu2 = nn.ReLU() + self.block_1_relu3 = nn.ReLU() + + # Bottleneck 2 + self.block_2_conv1 = nn.Conv2d( + self.expansion * planes, planes, kernel_size=1, bias=False + ) + self.block_2_conv2 = nn.Conv2d( + planes, + planes, + kernel_size=3, + padding=1, + padding_mode="zeros", + bias=False, + ) + self.block_2_conv3 = nn.Conv2d( + planes, self.expansion * planes, kernel_size=1, bias=False + ) + + self.block_2_relu1 = nn.ReLU() + self.block_2_relu2 = nn.ReLU() + self.block_2_relu3 = nn.ReLU() + + def forward(self, x): + # **************** Bottleneck 0 **************** + block_0_conv1_out = ( + self.block_0_conv1(x) * init_scale * block_0_weight_scale1 + ) + block_0_relu1_out = torch.clamp( + torch.round(self.block_0_relu1(block_0_conv1_out) / block_0_relu_1), + min, + max, + ) # convert to int and apply relu + block_0_conv2_out = ( + self.block_0_conv2(block_0_relu1_out) + * block_0_relu_1 + * block_0_weight_scale2 + ) + block_0_relu2_out = torch.clamp( + torch.round(self.block_0_relu2(block_0_conv2_out) / block_0_relu_2), + min, + max, + ) + block_0_conv3_out = ( + self.block_0_conv3(block_0_relu2_out) + * block_0_relu_2 + * block_0_weight_scale3 + ) + block_0_rhf_same_scale = torch.clamp( + torch.round(block_0_conv3_out / init_scale), -128, 127 + ) + + block_0_lhs_conv = self.shortcut(x) * init_scale * block_0_weight_scale_skip + block_0_lhs_same_scale = torch.clamp( + torch.round(block_0_lhs_conv / init_scale), -128, 127 + ) + # convert to int and apply relu + + block_0_skip_add = init_scale * ( + block_0_rhf_same_scale + block_0_lhs_same_scale + ) + block_0_final_out = torch.clamp( + 
torch.round(self.block_0_relu3(block_0_skip_add) / block_0_relu_3), min, max, ) - ) - return block_2_final_out - - -# ------------------------------------------------------ -# Pytorch baseline -# ------------------------------------------------------ -model = resnet_conv2_x_int8() -model.eval() -model.block_0_conv1.weight.data.copy_(block_0_int_weight_1) -model.block_0_conv2.weight.data.copy_(block_0_int_weight_2) -model.block_0_conv3.weight.data.copy_(block_0_int_weight_3) -model.shortcut.weight.data.copy_(block_0_int_weight_skip) - -model.block_1_conv1.weight.data.copy_(block_1_int_weight_1) -model.block_1_conv2.weight.data.copy_(block_1_int_weight_2) -model.block_1_conv3.weight.data.copy_(block_1_int_weight_3) - -model.block_2_conv1.weight.data.copy_(block_2_int_weight_1) -model.block_2_conv2.weight.data.copy_(block_2_int_weight_2) -model.block_2_conv3.weight.data.copy_(block_2_int_weight_3) - -golden_output = model(int_inp) - -# ------------------------------------------------------ -# Reorder input data-layout -# ------------------------------------------------------ -ds = DataShaper() -before_input = int_inp.squeeze().data.numpy().astype(dtype_in) -before_input.tofile(log_folder + "/before_ifm_mem_fmt_1x1.txt", sep=",", format="%d") -ifm_mem_fmt = ds.reorder_mat(before_input, "YCXC8", "CYX") -ifm_mem_fmt.tofile(log_folder + "/after_ifm_mem_fmt_1x1.txt", sep=",", format="%d") - -block0_wts1 = ds.reorder_mat( - block_0_int_weight_1.data.numpy().astype(dtype_wts), "OIYXI8O8", "OIYX" -) -block0_wts2 = ds.reorder_mat( - block_0_int_weight_2.data.numpy().astype(dtype_wts), "OIYXI8O8", "OIYX" -) -block0_wts3 = ds.reorder_mat( - block_0_int_weight_3.data.numpy().astype(dtype_wts), "OIYXI8O8", "OIYX" -) -block0_wts_skip = ds.reorder_mat( - block_0_int_weight_skip.data.numpy().astype(dtype_wts), "OIYXI8O8", "OIYX" -) - -total_wts = np.concatenate( - (block0_wts1, block0_wts2, block0_wts3, block0_wts_skip), axis=None -) - -block1_wts1 = ds.reorder_mat( - 
block_1_int_weight_1.data.numpy().astype(dtype_wts), "OIYXI8O8", "OIYX" -) -block1_wts2 = ds.reorder_mat( - block_1_int_weight_2.data.numpy().astype(dtype_wts), "OIYXI8O8", "OIYX" -) -block1_wts3 = ds.reorder_mat( - block_1_int_weight_3.data.numpy().astype(dtype_wts), "OIYXI8O8", "OIYX" -) - -total_wts2 = np.concatenate( - (total_wts, block1_wts1, block1_wts2, block1_wts3), axis=None -) - -block2_wts1 = ds.reorder_mat( - block_2_int_weight_1.data.numpy().astype(dtype_wts), "OIYXI8O8", "OIYX" -) -block2_wts2 = ds.reorder_mat( - block_2_int_weight_2.data.numpy().astype(dtype_wts), "OIYXI8O8", "OIYX" -) -block2_wts3 = ds.reorder_mat( - block_2_int_weight_3.data.numpy().astype(dtype_wts), "OIYXI8O8", "OIYX" -) - -total_wts3 = np.concatenate( - (total_wts2, block2_wts1, block2_wts2, block2_wts3), axis=None -) - -total_wts3.tofile(log_folder + "/weights_mem_fmt_final.txt", sep=",", format="%d") - -# ------------------------------------------------------ -# Main run loop -# ------------------------------------------------------ -for i in range(num_iter): - start = time.time_ns() - aie_output = execute(app, ifm_mem_fmt, total_wts) * block_2_relu_3 - stop = time.time_ns() - - if enable_trace: - aie_output, trace = extract_trace(aie_output, shape_out, dtype_out, trace_size) - write_out_trace(trace, trace_file) - - npu_time = stop - start - npu_time_total = npu_time_total + npu_time - -# ------------------------------------------------------ -# Reorder output data-layout -# ------------------------------------------------------ -temp_out = aie_output.reshape(32, 32, 32, 8) -temp_out = ds.reorder_mat(temp_out, "CDYX", "YCXD") -ofm_mem_fmt = temp_out.reshape(256, 32, 32) -ofm_mem_fmt.tofile(log_folder + "/after_ofm_mem_fmt_final.txt", sep=",", format="%d") -ofm_mem_fmt_out = torch.from_numpy(ofm_mem_fmt).unsqueeze(0) - -# ------------------------------------------------------ -# Compare the AIE output and the golden reference -# 
------------------------------------------------------ -print("\nAvg NPU time: {}us.".format(int((npu_time_total / num_iter) / 1000))) - -assert np.allclose( - ofm_mem_fmt_out.detach().numpy(), - golden_output.detach().numpy(), - rtol=0, - atol=block_2_relu_3, -) - -print("\nPASS!\n") + # **************** Bottleneck 1 **************** + block_1_conv1_out = ( + self.block_1_conv1(block_0_final_out) + * block_0_relu_3 + * block_1_weight_scale1 + ) + block_1_relu1_out = torch.clamp( + torch.round(self.block_1_relu1(block_1_conv1_out) / block_1_relu_1), + min, + max, + ) # convert to int and apply relu + block_1_conv2_out = ( + self.block_1_conv2(block_1_relu1_out) + * block_1_relu_1 + * block_1_weight_scale2 + ) + block_1_relu2_out = torch.clamp( + torch.round(self.block_1_relu2(block_1_conv2_out) / block_1_relu_2), + min, + max, + ) + block_1_conv3_out = ( + self.block_1_conv3(block_1_relu2_out) + * block_1_relu_2 + * block_1_weight_scale3 + ) + block_1_rhf_same_scale = torch.clamp( + torch.round(block_1_conv3_out / block_0_relu_3), -128, 127 + ) + + block_1_skip_add = block_0_relu_3 * ( + block_1_rhf_same_scale + block_0_final_out + ) + block_1_final_out = torch.clamp( + torch.round(self.block_1_relu3(block_1_skip_add) / block_1_relu_3), + min, + max, + ) + + # **************** Bottleneck 2 **************** + block_2_conv1_out = ( + self.block_2_conv1(block_1_final_out) + * block_1_relu_3 + * block_2_weight_scale1 + ) + block_2_relu1_out = torch.clamp( + torch.round(self.block_2_relu1(block_2_conv1_out) / block_2_relu_1), + min, + max, + ) # convert to int and apply relu + block_2_conv2_out = ( + self.block_2_conv2(block_2_relu1_out) + * block_2_relu_1 + * block_2_weight_scale2 + ) + block_2_relu2_out = torch.clamp( + torch.round(self.block_2_relu2(block_2_conv2_out) / block_2_relu_2), + min, + max, + ) + block_2_conv3_out = ( + self.block_2_conv3(block_2_relu2_out) + * block_2_relu_2 + * block_2_weight_scale3 + ) + block_2_rhf_same_scale = torch.clamp( + 
torch.round(block_2_conv3_out / block_1_relu_3), -128, 127 + ) + + block_2_skip_add = block_1_relu_3 * ( + block_2_rhf_same_scale + block_1_final_out + ) + block_2_final_out = block_2_relu_3 * ( + torch.clamp( + torch.round(self.block_2_relu3(block_2_skip_add) / block_2_relu_3), + min, + max, + ) + ) + return block_2_final_out + + # ------------------------------------------------------ + # Pytorch baseline + # ------------------------------------------------------ + model = resnet_conv2_x_int8() + model.eval() + model.block_0_conv1.weight.data.copy_(block_0_int_weight_1) + model.block_0_conv2.weight.data.copy_(block_0_int_weight_2) + model.block_0_conv3.weight.data.copy_(block_0_int_weight_3) + model.shortcut.weight.data.copy_(block_0_int_weight_skip) + + model.block_1_conv1.weight.data.copy_(block_1_int_weight_1) + model.block_1_conv2.weight.data.copy_(block_1_int_weight_2) + model.block_1_conv3.weight.data.copy_(block_1_int_weight_3) + + model.block_2_conv1.weight.data.copy_(block_2_int_weight_1) + model.block_2_conv2.weight.data.copy_(block_2_int_weight_2) + model.block_2_conv3.weight.data.copy_(block_2_int_weight_3) + + golden_output = model(int_inp) + + # ------------------------------------------------------ + # Reorder input data-layout + # ------------------------------------------------------ + ds = DataShaper() + before_input = int_inp.squeeze().data.numpy().astype(dtype_in) + before_input.tofile( + log_folder + "/before_ifm_mem_fmt_1x1.txt", sep=",", format="%d" + ) + ifm_mem_fmt = ds.reorder_mat(before_input, "YCXC8", "CYX") + ifm_mem_fmt.tofile(log_folder + "/after_ifm_mem_fmt_1x1.txt", sep=",", format="%d") + + block0_wts1 = ds.reorder_mat( + block_0_int_weight_1.data.numpy().astype(dtype_wts), "OIYXI8O8", "OIYX" + ) + block0_wts2 = ds.reorder_mat( + block_0_int_weight_2.data.numpy().astype(dtype_wts), "OIYXI8O8", "OIYX" + ) + block0_wts3 = ds.reorder_mat( + block_0_int_weight_3.data.numpy().astype(dtype_wts), "OIYXI8O8", "OIYX" + ) + block0_wts_skip 
= ds.reorder_mat( + block_0_int_weight_skip.data.numpy().astype(dtype_wts), "OIYXI8O8", "OIYX" + ) + + total_wts = np.concatenate( + (block0_wts1, block0_wts2, block0_wts3, block0_wts_skip), axis=None + ) + + block1_wts1 = ds.reorder_mat( + block_1_int_weight_1.data.numpy().astype(dtype_wts), "OIYXI8O8", "OIYX" + ) + block1_wts2 = ds.reorder_mat( + block_1_int_weight_2.data.numpy().astype(dtype_wts), "OIYXI8O8", "OIYX" + ) + block1_wts3 = ds.reorder_mat( + block_1_int_weight_3.data.numpy().astype(dtype_wts), "OIYXI8O8", "OIYX" + ) + + total_wts2 = np.concatenate( + (total_wts, block1_wts1, block1_wts2, block1_wts3), axis=None + ) + + block2_wts1 = ds.reorder_mat( + block_2_int_weight_1.data.numpy().astype(dtype_wts), "OIYXI8O8", "OIYX" + ) + block2_wts2 = ds.reorder_mat( + block_2_int_weight_2.data.numpy().astype(dtype_wts), "OIYXI8O8", "OIYX" + ) + block2_wts3 = ds.reorder_mat( + block_2_int_weight_3.data.numpy().astype(dtype_wts), "OIYXI8O8", "OIYX" + ) + + total_wts3 = np.concatenate( + (total_wts2, block2_wts1, block2_wts2, block2_wts3), axis=None + ) + + total_wts3.tofile(log_folder + "/weights_mem_fmt_final.txt", sep=",", format="%d") + + # ------------------------------------------------------ + # Main run loop + # ------------------------------------------------------ + for i in range(num_iter): + start = time.time_ns() + aie_output = execute(app, ifm_mem_fmt, total_wts) * block_2_relu_3 + stop = time.time_ns() + + if enable_trace: + aie_output, trace = extract_trace( + aie_output, shape_out, dtype_out, trace_size + ) + write_out_trace(trace, trace_file) + + npu_time = stop - start + npu_time_total = npu_time_total + npu_time + + # ------------------------------------------------------ + # Reorder output data-layout + # ------------------------------------------------------ + temp_out = aie_output.reshape(32, 32, 32, 8) + temp_out = ds.reorder_mat(temp_out, "CDYX", "YCXD") + ofm_mem_fmt = temp_out.reshape(256, 32, 32) + ofm_mem_fmt.tofile( + log_folder + 
"/after_ofm_mem_fmt_final.txt", sep=",", format="%d" + ) + ofm_mem_fmt_out = torch.from_numpy(ofm_mem_fmt).unsqueeze(0) + + # ------------------------------------------------------ + # Compare the AIE output and the golden reference + # ------------------------------------------------------ + print("\nAvg NPU time: {}us.".format(int((npu_time_total / num_iter) / 1000))) + + assert np.allclose( + ofm_mem_fmt_out.detach().numpy(), + golden_output.detach().numpy(), + rtol=0, + atol=block_2_relu_3, + ) + + print("\nPASS!\n") + + +if __name__ == "__main__": + p = test_utils.create_default_argparser() + opts = p.parse_args(sys.argv[1:]) + main(opts) diff --git a/programming_guide/section-6/README.md b/programming_guide/section-6/README.md index 83e8899002..f54c812ab3 100644 --- a/programming_guide/section-6/README.md +++ b/programming_guide/section-6/README.md @@ -26,8 +26,14 @@ There are a number of example designs available [here](../../programming_example | Design name | Data type | Description | |-|-|-| -|[bottleneck](../../programming_examples/ml/bottleneck/)|ui8|A Bottleneck Residual Block is a variant of the residual block that utilises 1x1 convolutions to create a bottleneck. The use of a bottleneck reduces the number of parameters and computations.| +|[bottleneck](../../programming_examples/ml/bottleneck/)|ui8|A Bottleneck Residual Block is a variant of the residual block that utilises three convolutions, using 1x1, 3x3 and 1x1 filter sizes, respectively. The use of a bottleneck reduces the number of parameters and computations.| |[resnet](../../programming_examples/ml/resnet/)|ui8|ResNet with offloaded conv2_x bottleneck blocks. The implementation features kernel fusion and dataflow optimizations highlighting the unique architectural capabilties of AI Engines.| +## Exercises + +1. 
In the [bottleneck](../../programming_examples/ml/bottleneck/) design, which follows a dataflow approach, how many elements does the 3x3 convolution operation require before it can proceed with its computation? +2. Suppose you have a bottleneck block with input dimensions of 32x32x256. After passing through the 1x1 convolutional layer, the output dimensions become 32x32x64. What would be the output dimensions after the subsequent 3x3 convolutional layer, assuming a stride of 1, no padding, and 64 output channels? + ----- [[Prev - Section 5](../section-5/)] [[Top](..)] +