From 626182746a720c85748e232366b8649386f24b3a Mon Sep 17 00:00:00 2001 From: Jeff Fifield Date: Tue, 23 Apr 2024 13:32:34 -0600 Subject: [PATCH] replace "ipu" with "npu" (#1305) --- docs/buildHostLin.md | 2 +- docs/buildHostWin.md | 8 +- include/aie-c/Translation.h | 2 +- include/aie/Dialect/AIE/IR/AIEAttrs.td | 2 +- include/aie/Dialect/AIE/IR/AIETargetModel.h | 4 +- include/aie/Dialect/AIEX/IR/AIEX.td | 24 ++-- .../aie/Dialect/AIEX/Transforms/AIEXPasses.h | 2 +- .../aie/Dialect/AIEX/Transforms/AIEXPasses.td | 6 +- include/aie/Targets/AIETargets.h | 4 +- lib/CAPI/Translation.cpp | 14 +- lib/Dialect/AIE/IR/AIEDialect.cpp | 6 +- lib/Dialect/AIEX/IR/AIEXDialect.cpp | 8 +- .../{AIEDmaToIpu.cpp => AIEDmaToNpu.cpp} | 60 ++++---- .../AIEX/Transforms/AIEXToStandard.cpp | 14 +- lib/Dialect/AIEX/Transforms/CMakeLists.txt | 2 +- lib/Targets/AIETargetCDODirect.cpp | 12 +- lib/Targets/AIETargetHSA.cpp | 6 +- .../{AIETargetIPU.cpp => AIETargetNPU.cpp} | 20 +-- lib/Targets/AIETargets.cpp | 6 +- lib/Targets/CMakeLists.txt | 2 +- .../basic/matrix_add_one/Makefile | 6 +- .../basic/matrix_add_one/README.md | 6 +- .../basic/matrix_add_one/aie2.py | 10 +- .../basic/matrix_add_one/run.lit | 6 +- .../matrix_multiplication/CMakeLists.txt | 2 +- .../matrix_multiplication/makefile-common | 2 +- .../matrix_vector/aie2.py | 10 +- .../matrix_vector/run.lit | 4 +- .../matrix_multiplication/single_core/aie2.py | 10 +- .../matrix_multiplication/single_core/run.lit | 4 +- .../matrix_multiplication/whole_array/aie2.py | 10 +- .../matrix_multiplication/whole_array/run.lit | 4 +- .../basic/passthrough_dmas/CMakeLists.txt | 2 +- .../basic/passthrough_dmas/Makefile | 4 +- .../basic/passthrough_dmas/aie2.py | 10 +- .../basic/passthrough_dmas/run.lit | 6 +- .../basic/passthrough_kernel/CMakeLists.txt | 2 +- .../basic/passthrough_kernel/Makefile | 4 +- .../basic/passthrough_kernel/aie2.py | 8 +- .../basic/passthrough_kernel/run.lit | 6 +- .../basic/vector_add/Makefile | 6 +- .../basic/vector_add/README.md 
| 6 +- programming_examples/basic/vector_add/aie2.py | 12 +- programming_examples/basic/vector_add/run.lit | 6 +- .../basic/vector_exp/CMakeLists.txt | 2 +- .../basic/vector_exp/Makefile | 2 +- programming_examples/basic/vector_exp/aie2.py | 8 +- programming_examples/basic/vector_exp/run.lit | 6 +- .../basic/vector_mult/CMakeLists.txt | 2 +- .../basic/vector_mult/Makefile | 6 +- .../basic/vector_mult/README.md | 6 +- .../basic/vector_mult/aie2.py | 12 +- .../basic/vector_mult/run.lit | 6 +- .../basic/vector_reduce_add/CMakeLists.txt | 2 +- .../basic/vector_reduce_add/Makefile | 4 +- .../basic/vector_reduce_add/aie2.py | 10 +- .../basic/vector_reduce_add/run.lit | 6 +- .../basic/vector_reduce_max/CMakeLists.txt | 2 +- .../basic/vector_reduce_max/Makefile | 4 +- .../basic/vector_reduce_max/aie2.py | 10 +- .../basic/vector_reduce_max/run.lit | 6 +- .../basic/vector_reduce_min/CMakeLists.txt | 2 +- .../basic/vector_reduce_min/Makefile | 4 +- .../basic/vector_reduce_min/aie2.py | 10 +- .../basic/vector_reduce_min/run.lit | 6 +- .../basic/vector_scalar_add/CMakeLists.txt | 2 +- .../basic/vector_scalar_add/Makefile | 4 +- .../basic/vector_scalar_add/aie2.py | 8 +- .../basic/vector_scalar_add/run.lit | 4 +- .../basic/vector_scalar_mul/CMakeLists.txt | 2 +- .../basic/vector_scalar_mul/Makefile | 4 +- .../basic/vector_scalar_mul/aie2.py | 10 +- .../basic/vector_scalar_mul/run.lit | 6 +- .../basic/vector_sum/CMakeLists.txt | 2 +- .../basic/vector_sum/Makefile | 6 +- .../basic/vector_sum/README.md | 4 +- programming_examples/basic/vector_sum/aie2.py | 10 +- programming_examples/basic/vector_sum/run.lit | 6 +- programming_examples/lit.cfg.py | 8 +- programming_examples/makefile-common | 2 +- .../ml/bottleneck/CMakeLists.txt | 2 +- programming_examples/ml/bottleneck/Makefile | 6 +- programming_examples/ml/bottleneck/aie2.py | 32 ++--- programming_examples/ml/bottleneck/run.lit | 4 +- programming_examples/ml/conv2d/CMakeLists.txt | 2 +- programming_examples/ml/conv2d/Makefile | 6 
+- programming_examples/ml/conv2d/aie2.py | 26 ++-- programming_examples/ml/conv2d/run.lit | 4 +- .../ml/conv2d_fused_relu/CMakeLists.txt | 2 +- .../ml/conv2d_fused_relu/Makefile | 6 +- .../ml/conv2d_fused_relu/aie2.py | 26 ++-- .../ml/conv2d_fused_relu/run.lit | 4 +- .../ml/eltwise_add/CMakeLists.txt | 2 +- programming_examples/ml/eltwise_add/Makefile | 8 +- programming_examples/ml/eltwise_add/aie2.py | 10 +- programming_examples/ml/eltwise_add/run.lit | 4 +- .../ml/eltwise_mul/CMakeLists.txt | 2 +- programming_examples/ml/eltwise_mul/Makefile | 8 +- programming_examples/ml/eltwise_mul/aie2.py | 10 +- programming_examples/ml/eltwise_mul/run.lit | 4 +- programming_examples/ml/relu/CMakeLists.txt | 2 +- programming_examples/ml/relu/Makefile | 8 +- programming_examples/ml/relu/aie2.py | 8 +- programming_examples/ml/relu/run.lit | 4 +- .../ml/resnet/layers_conv2_x/CMakeLists.txt | 2 +- .../ml/resnet/layers_conv2_x/Makefile | 6 +- .../ml/resnet/layers_conv2_x/aie.mlir | 58 ++++---- .../ml/resnet/layers_conv2_x/aie2.py | 34 ++--- .../ml/resnet/layers_conv2_x/run.lit | 4 +- .../ml/softmax/CMakeLists.txt | 2 +- programming_examples/ml/softmax/Makefile | 4 +- programming_examples/ml/softmax/aie2.py | 8 +- programming_examples/ml/softmax/run.lit | 4 +- .../ml/weight_expand/CMakeLists.txt | 2 +- .../ml/weight_expand/Makefile | 2 +- programming_examples/ml/weight_expand/aie2.py | 8 +- programming_examples/utils/README.md | 2 +- programming_examples/utils/parse_eventIR.py | 6 +- programming_examples/utils/parse_trace.py | 6 +- .../vision/color_detect/CMakeLists.txt | 2 +- .../vision/color_detect/Makefile | 4 +- .../vision/color_detect/README.md | 2 +- .../vision/color_detect/aie2_colorDetect.py | 8 +- .../vision/color_detect/run.lit | 4 +- .../vision/color_threshold/CMakeLists.txt | 2 +- .../vision/color_threshold/Makefile | 4 +- .../vision/color_threshold/README.md | 2 +- .../color_threshold/aie2_colorThreshold.py | 32 ++--- .../vision/color_threshold/run.lit | 4 +- 
.../vision/edge_detect/CMakeLists.txt | 2 +- .../vision/edge_detect/Makefile | 4 +- .../vision/edge_detect/README.md | 2 +- .../vision/edge_detect/aie2_edgeDetect.py | 8 +- .../vision/edge_detect/run.lit | 4 +- .../vision/vision_passthrough/CMakeLists.txt | 2 +- .../vision/vision_passthrough/Makefile | 4 +- .../vision/vision_passthrough/aie2.py | 20 +-- .../aie2_lineBased_8b_1080.mlir | 8 +- .../aie2_lineBased_8b_8k.mlir | 8 +- .../aie2_lineBased_8b_tiny.mlir | 8 +- .../vision/vision_passthrough/run.lit | 4 +- python/AIEMLIRModule.cpp | 6 +- python/XRTModule.cpp | 28 ++-- python/_mlir_libs/_aie.pyi | 4 +- python/_mlir_libs/_xrt.pyi | 4 +- python/compiler/aiecc/cl_arguments.py | 18 +-- python/compiler/aiecc/main.py | 14 +- python/dialects/aie.py | 4 +- python/dialects/aiex.py | 60 ++++---- python/utils/README.md | 30 ++-- python/utils/trace.py | 16 +-- .../{DmaToIpu => DmaToNpu}/aiert_insts.mlir | 16 +-- .../{DmaToIpu => DmaToNpu}/bad_rtp_write.mlir | 10 +- .../dma_to_npu.mlir} | 38 ++--- .../dma_to_npu_invalid.mlir} | 10 +- .../dma_to_npu_issue_token.mlir} | 18 +-- .../{DmaToIpu => DmaToNpu}/push_to_queue.mlir | 12 +- .../{DmaToIpu => DmaToNpu}/rtp_write.mlir | 12 +- .../assign-bd-ids/bad_bd_assignments.mlir | 12 +- test/Passes/assign-bd-ids/basic.mlir | 4 +- test/Passes/assign-bd-ids/user_assigned.mlir | 8 +- .../AIETargetHSA/input_with_addresses.mlir | 6 +- .../ipu_instgen.mlir => NPU/npu_instgen.mlir} | 12 +- test/aie2xclbin/simple_xclbin.mlir | 2 +- test/aiecc/simple_xclbin.mlir | 6 +- .../bad_alignment.mlir | 8 +- test/dialect/AIE/bad_cascade.mlir | 6 +- test/dialect/AIE/bad_dma_op.mlir | 2 +- test/dialect/AIE/badshimtiledma.mlir | 2 +- test/dialect/AIE/badtiledma4.mlir | 2 +- test/dialect/AIE/buffer.mlir | 2 +- .../AIEX/{bad_ipu_nd.mlir => bad_npu_nd.mlir} | 26 ++-- ...ush_queue.mlir => bad_npu_push_queue.mlir} | 10 +- ...pu_write_bd.mlir => bad_npu_write_bd.mlir} | 18 +-- test/dialect/AIEX/invalid.mlir | 8 +- test/dialect/AIEX/roundtrip.mlir | 18 +-- 
test/lit.cfg.py | 8 +- .../aiex_standard_lowering.mlir | 10 +- .../aie.mlir | 8 +- .../run.lit | 4 +- .../test.cpp | 0 .../add_314_using_dma_op/aie.mlir | 8 +- .../add_314_using_dma_op/run.lit | 4 +- .../add_314_using_dma_op/test.cpp | 0 .../add_one_objFifo/CMakeLists.txt | 2 +- .../add_one_objFifo/Makefile | 2 +- .../add_one_objFifo/aie.mlir | 8 +- .../add_one_objFifo/run.lit | 4 +- .../add_one_objFifo/run.sh | 0 .../add_one_objFifo/test.cpp | 0 .../add_one_using_dma/aie.mlir | 8 +- .../add_one_using_dma/run.lit | 4 +- .../add_one_using_dma/test.cpp | 0 .../cascade_flows/CMakeLists.txt | 2 +- .../cascade_flows/Makefile | 2 +- .../cascade_flows/aie.mlir | 8 +- .../cascade_flows/kernel1.cc | 0 .../cascade_flows/kernel2.cc | 0 .../cascade_flows/kernel3.cc | 0 .../cascade_flows/run.lit | 4 +- .../cascade_flows/test.cpp | 0 test/{ipu-xrt => npu-xrt}/e2e/conftest.py | 2 +- test/{ipu-xrt => npu-xrt}/e2e/lit.local.cfg | 0 test/{ipu-xrt => npu-xrt}/e2e/pytest.ini | 0 .../e2e/run_all_tests_one_by_one.sh | 0 ...dd_256_using_dma_op_no_double_buffering.py | 22 +-- test/{ipu-xrt => npu-xrt}/e2e/test_locks.py | 66 ++++----- .../e2e/test_manual_dpu_args.py | 82 +++++------ .../e2e/test_nonsquare_matrix_mult.py | 64 ++++----- .../test_nonsquare_matrix_mult_vectorized.py | 64 ++++----- .../e2e/test_offsets_sizes_strides.py | 38 ++--- .../e2e/test_repeat_count.py | 44 +++--- .../e2e/test_shared_buffers_init_value.py | 22 +-- .../e2e/test_square_matrix_mult.py | 64 ++++----- .../e2e/test_square_matrix_mult_vectorized.py | 64 ++++----- .../e2e/test_tiled_matrix_add.py | 76 +++++----- ...iled_nonsquare_spatial_tile_matrix_mult.py | 106 +++++++------- .../test_tiled_nonsquare_tile_matrix_mult.py | 92 ++++++------- ...d_nonsquare_tile_matrix_mult_vectorized.py | 130 +++++++++--------- .../e2e/test_tiled_vec_add.py | 76 +++++----- .../e2e/test_tiled_vec_add_vectorized.py | 76 +++++----- test/{ipu-xrt => npu-xrt}/e2e/test_vec_dot.py | 76 +++++----- .../e2e/tiled_matrix_add.ipynb | 98 
++++++------- ...onsquare_tile_matrix_mult_vectorized.ipynb | 42 +++--- test/{ipu-xrt => npu-xrt}/e2e/util.py | 0 test/{ipu-xrt => npu-xrt}/lit.local.cfg | 2 +- test/{ipu-xrt => npu-xrt}/makefile-common | 2 +- .../matrix_multiplication_using_dma/aie.mlir | 14 +- .../matrix_multiplication_using_dma/mm.cc | 0 .../run-a2x.lit | 4 +- .../matrix_multiplication_using_dma/run.lit | 4 +- .../matrix_multiplication_using_dma/test.cpp | 0 .../matrix_multiplication_using_dma/zero.cc | 0 test/{ipu-xrt => npu-xrt}/two_col/Makefile | 6 +- test/{ipu-xrt => npu-xrt}/two_col/aie.mlir | 24 ++-- test/{ipu-xrt => npu-xrt}/two_col/run.lit | 4 +- test/{ipu-xrt => npu-xrt}/two_col/run.sh | 0 test/{ipu-xrt => npu-xrt}/two_col/test.cpp | 0 .../{ipu-xrt => npu-xrt}/two_col/threshold.cc | 0 .../vector_scalar_using_dma/aie.mlir | 8 +- .../vector_scalar_using_dma/run.lit | 4 +- .../vector_scalar_using_dma/scale.cc | 0 .../vector_scalar_using_dma/test.cpp | 0 .../nested_loop_test.mlir | 4 +- test/python/{ipu.py => npu.py} | 36 ++--- test/python/tile_array.py | 38 ++--- test/python/trace_utils.py | 24 ++-- tools/aie2xclbin/XCLBinGen.cpp | 14 +- tools/aie2xclbin/XCLBinGen.h | 2 +- tools/aie2xclbin/aie2xclbin.cpp | 8 +- utils/{reset_ipu.sh => reset_npu.sh} | 2 +- utils/{run_on_ipu.sh => run_on_npu.sh} | 0 252 files changed, 1531 insertions(+), 1531 deletions(-) rename lib/Dialect/AIEX/Transforms/{AIEDmaToIpu.cpp => AIEDmaToNpu.cpp} (87%) rename lib/Targets/{AIETargetIPU.cpp => AIETargetNPU.cpp} (88%) rename test/Conversion/{DmaToIpu => DmaToNpu}/aiert_insts.mlir (83%) rename test/Conversion/{DmaToIpu => DmaToNpu}/bad_rtp_write.mlir (62%) rename test/Conversion/{DmaToIpu/dma_to_ipu.mlir => DmaToNpu/dma_to_npu.mlir} (73%) rename test/Conversion/{DmaToIpu/dma_to_ipu_invalid.mlir => DmaToNpu/dma_to_npu_invalid.mlir} (71%) rename test/Conversion/{DmaToIpu/dma_to_ipu_issue_token.mlir => DmaToNpu/dma_to_npu_issue_token.mlir} (72%) rename test/Conversion/{DmaToIpu => DmaToNpu}/push_to_queue.mlir (69%) 
rename test/Conversion/{DmaToIpu => DmaToNpu}/rtp_write.mlir (66%) rename test/Targets/{IPU/ipu_instgen.mlir => NPU/npu_instgen.mlir} (90%) rename test/dialect/AIEX/{bad_ipu_nd.mlir => bad_npu_nd.mlir} (78%) rename test/dialect/AIEX/{bad_ipu_push_queue.mlir => bad_npu_push_queue.mlir} (82%) rename test/dialect/AIEX/{bad_ipu_write_bd.mlir => bad_npu_write_bd.mlir} (90%) rename test/{ipu-xrt => npu-xrt}/add_256_using_dma_op_no_double_buffering/aie.mlir (95%) rename test/{ipu-xrt => npu-xrt}/add_256_using_dma_op_no_double_buffering/run.lit (70%) rename test/{ipu-xrt => npu-xrt}/add_256_using_dma_op_no_double_buffering/test.cpp (100%) rename test/{ipu-xrt => npu-xrt}/add_314_using_dma_op/aie.mlir (97%) rename test/{ipu-xrt => npu-xrt}/add_314_using_dma_op/run.lit (70%) rename test/{ipu-xrt => npu-xrt}/add_314_using_dma_op/test.cpp (100%) rename test/{ipu-xrt => npu-xrt}/add_one_objFifo/CMakeLists.txt (96%) rename test/{ipu-xrt => npu-xrt}/add_one_objFifo/Makefile (91%) rename test/{ipu-xrt => npu-xrt}/add_one_objFifo/aie.mlir (91%) rename test/{ipu-xrt => npu-xrt}/add_one_objFifo/run.lit (75%) rename test/{ipu-xrt => npu-xrt}/add_one_objFifo/run.sh (100%) rename test/{ipu-xrt => npu-xrt}/add_one_objFifo/test.cpp (100%) rename test/{ipu-xrt => npu-xrt}/add_one_using_dma/aie.mlir (97%) rename test/{ipu-xrt => npu-xrt}/add_one_using_dma/run.lit (75%) rename test/{ipu-xrt => npu-xrt}/add_one_using_dma/test.cpp (100%) rename test/{ipu-xrt => npu-xrt}/cascade_flows/CMakeLists.txt (96%) rename test/{ipu-xrt => npu-xrt}/cascade_flows/Makefile (95%) rename test/{ipu-xrt => npu-xrt}/cascade_flows/aie.mlir (92%) rename test/{ipu-xrt => npu-xrt}/cascade_flows/kernel1.cc (100%) rename test/{ipu-xrt => npu-xrt}/cascade_flows/kernel2.cc (100%) rename test/{ipu-xrt => npu-xrt}/cascade_flows/kernel3.cc (100%) rename test/{ipu-xrt => npu-xrt}/cascade_flows/run.lit (83%) rename test/{ipu-xrt => npu-xrt}/cascade_flows/test.cpp (100%) rename test/{ipu-xrt => npu-xrt}/e2e/conftest.py (96%) 
rename test/{ipu-xrt => npu-xrt}/e2e/lit.local.cfg (100%) rename test/{ipu-xrt => npu-xrt}/e2e/pytest.ini (100%) rename test/{ipu-xrt => npu-xrt}/e2e/run_all_tests_one_by_one.sh (100%) rename test/{ipu-xrt => npu-xrt}/e2e/test_add_256_using_dma_op_no_double_buffering.py (93%) rename test/{ipu-xrt => npu-xrt}/e2e/test_locks.py (93%) rename test/{ipu-xrt => npu-xrt}/e2e/test_manual_dpu_args.py (88%) rename test/{ipu-xrt => npu-xrt}/e2e/test_nonsquare_matrix_mult.py (91%) rename test/{ipu-xrt => npu-xrt}/e2e/test_nonsquare_matrix_mult_vectorized.py (94%) rename test/{ipu-xrt => npu-xrt}/e2e/test_offsets_sizes_strides.py (92%) rename test/{ipu-xrt => npu-xrt}/e2e/test_repeat_count.py (90%) rename test/{ipu-xrt => npu-xrt}/e2e/test_shared_buffers_init_value.py (95%) rename test/{ipu-xrt => npu-xrt}/e2e/test_square_matrix_mult.py (91%) rename test/{ipu-xrt => npu-xrt}/e2e/test_square_matrix_mult_vectorized.py (94%) rename test/{ipu-xrt => npu-xrt}/e2e/test_tiled_matrix_add.py (92%) rename test/{ipu-xrt => npu-xrt}/e2e/test_tiled_nonsquare_spatial_tile_matrix_mult.py (93%) rename test/{ipu-xrt => npu-xrt}/e2e/test_tiled_nonsquare_tile_matrix_mult.py (91%) rename test/{ipu-xrt => npu-xrt}/e2e/test_tiled_nonsquare_tile_matrix_mult_vectorized.py (92%) rename test/{ipu-xrt => npu-xrt}/e2e/test_tiled_vec_add.py (90%) rename test/{ipu-xrt => npu-xrt}/e2e/test_tiled_vec_add_vectorized.py (92%) rename test/{ipu-xrt => npu-xrt}/e2e/test_vec_dot.py (90%) rename test/{ipu-xrt => npu-xrt}/e2e/tiled_matrix_add.ipynb (91%) rename test/{ipu-xrt => npu-xrt}/e2e/tiled_nonsquare_tile_matrix_mult_vectorized.ipynb (95%) rename test/{ipu-xrt => npu-xrt}/e2e/util.py (100%) rename test/{ipu-xrt => npu-xrt}/lit.local.cfg (91%) rename test/{ipu-xrt => npu-xrt}/makefile-common (92%) rename test/{ipu-xrt => npu-xrt}/matrix_multiplication_using_dma/aie.mlir (97%) rename test/{ipu-xrt => npu-xrt}/matrix_multiplication_using_dma/mm.cc (100%) rename test/{ipu-xrt => 
npu-xrt}/matrix_multiplication_using_dma/run-a2x.lit (77%) rename test/{ipu-xrt => npu-xrt}/matrix_multiplication_using_dma/run.lit (78%) rename test/{ipu-xrt => npu-xrt}/matrix_multiplication_using_dma/test.cpp (100%) rename test/{ipu-xrt => npu-xrt}/matrix_multiplication_using_dma/zero.cc (100%) rename test/{ipu-xrt => npu-xrt}/two_col/Makefile (72%) rename test/{ipu-xrt => npu-xrt}/two_col/aie.mlir (91%) rename test/{ipu-xrt => npu-xrt}/two_col/run.lit (73%) rename test/{ipu-xrt => npu-xrt}/two_col/run.sh (100%) rename test/{ipu-xrt => npu-xrt}/two_col/test.cpp (100%) rename test/{ipu-xrt => npu-xrt}/two_col/threshold.cc (100%) rename test/{ipu-xrt => npu-xrt}/vector_scalar_using_dma/aie.mlir (95%) rename test/{ipu-xrt => npu-xrt}/vector_scalar_using_dma/run.lit (78%) rename test/{ipu-xrt => npu-xrt}/vector_scalar_using_dma/scale.cc (100%) rename test/{ipu-xrt => npu-xrt}/vector_scalar_using_dma/test.cpp (100%) rename test/python/{ipu.py => npu.py} (95%) rename utils/{reset_ipu.sh => reset_npu.sh} (93%) rename utils/{run_on_ipu.sh => run_on_npu.sh} (100%) diff --git a/docs/buildHostLin.md b/docs/buildHostLin.md index 017bc053cd..422b48b64e 100644 --- a/docs/buildHostLin.md +++ b/docs/buildHostLin.md @@ -293,7 +293,7 @@ source ${MLIR_AIE_BUILD_DIR}/utils/env_setup.sh ${MLIR_AIE_BUILD_DIR}/install ${ ## Build a Design -For your design of interest, for instance [add_one_objFifo](../reference_designs/ipu-xrt/add_one_objFifo/), 2 steps are needed: (i) build the AIE desgin and then (ii) build the host code. +For your design of interest, for instance [vector_add](../programming_examples/basic/vector_add/), 2 steps are needed: (i) build the AIE desgin and then (ii) build the host code. ### Build Device AIE Part diff --git a/docs/buildHostWin.md b/docs/buildHostWin.md index 04373d2892..d6daf03704 100644 --- a/docs/buildHostWin.md +++ b/docs/buildHostWin.md @@ -58,7 +58,7 @@ All steps in WSL Ubuntu terminal. 1. 
After installing the updated RyzenAI driver (see next subsection), use the gendef tool (from the mingw-w64-tools package) to create a .def file with the symbols: ``` - mkdir /mnt/c/Technical/xrtIPUfromDLL; cd /mnt/c/Technical/xrtIPUfromDLL + mkdir /mnt/c/Technical/xrtNPUfromDLL; cd /mnt/c/Technical/xrtNPUfromDLL cp /mnt/c/Windows/System32/AMD/xrt_coreutil.dll . gendef xrt_coreutil.dll ``` @@ -67,7 +67,7 @@ All steps in WSL Ubuntu terminal. All steps in Win11 (powershell where needed). -1. Upgrade the IPU driver IPU driver to version 10.106.8.62 [download here](https://account.amd.com/en/forms/downloads/ryzen-ai-software-platform-xef.html?filename=ipu_stack_rel_silicon_2308.zip), following the [instructions](href="https://ryzenai.docs.amd.com/en/latest/inst.html) on setting up the driver. +1. Upgrade the NPU driver to version 10.106.8.62 [download here](https://account.amd.com/en/forms/downloads/ryzen-ai-software-platform-xef.html?filename=ipu_stack_rel_silicon_2308.zip), following the [instructions](href="https://ryzenai.docs.amd.com/en/latest/inst.html) on setting up the driver. 1. Install [Microsoft Visual Studio 17 2022 Community Edition](https://visualstudio.microsoft.com/vs/community/) with package for C++ development. 1. Install CMake on windows ([https://cmake.org/download/](https://cmake.org/download/)) @@ -79,7 +79,7 @@ All steps in Win11 (powershell where needed). 1. Clone [https://github.com/Xilinx/XRT](https://github.com/Xilinx/XRT) for instance under `C:\Technical` and `git checkout 2023.2` 1. 
Create a .lib file from the .dll shipping with the driver - In wsl, generate a .def file (see above) - - Start a x86 Native Tools Command Prompt (installed as part of VS17), go to the folder `C:\Technical\xrtIPUfromDLL` and run command: + - Start a x86 Native Tools Command Prompt (installed as part of VS17), go to the folder `C:\Technical\xrtNPUfromDLL` and run command: ``` lib /def:xrt_coreutil.def /machine:x64 /out:xrt_coreutil.lib ``` @@ -113,7 +113,7 @@ source /utils/env_setup.sh /in ## Build a Design -For your design of interest, for instance [add_one_objFifo](../reference_designs/ipu-xrt/add_one_objFifo/), 2 steps are needed: (i) build the AIE desgin in WSL and then (ii) build the host code in powershell. +For your design of interest, for instance [vector_add](../programming_examples/basic/vector_add/), 2 steps are needed: (i) build the AIE desgin in WSL and then (ii) build the host code in powershell. ### Build device AIE part: WSL Ubuntu terminal 1. Prepare your enviroment with the mlir-aie tools (built during Prerequisites part of this guide). See [Set up your environment](#set-up-your-environment) above. 
diff --git a/include/aie-c/Translation.h b/include/aie-c/Translation.h index 93fa89b1f3..762c5ce041 100644 --- a/include/aie-c/Translation.h +++ b/include/aie-c/Translation.h @@ -18,7 +18,7 @@ extern "C" { MLIR_CAPI_EXPORTED MlirStringRef aieTranslateAIEVecToCpp(MlirOperation op, bool aieml); MLIR_CAPI_EXPORTED MlirStringRef aieTranslateModuleToLLVMIR(MlirOperation op); -MLIR_CAPI_EXPORTED MlirStringRef aieTranslateToIPU(MlirOperation op); +MLIR_CAPI_EXPORTED MlirStringRef aieTranslateToNPU(MlirOperation op); MLIR_CAPI_EXPORTED MlirStringRef aieTranslateToXAIEV2(MlirOperation op); MLIR_CAPI_EXPORTED MlirStringRef aieTranslateToHSA(MlirOperation op); MLIR_CAPI_EXPORTED MlirStringRef aieTranslateToBCF(MlirOperation op, int col, diff --git a/include/aie/Dialect/AIE/IR/AIEAttrs.td b/include/aie/Dialect/AIE/IR/AIEAttrs.td index 4d2f17371f..167a1bf4d9 100644 --- a/include/aie/Dialect/AIE/IR/AIEAttrs.td +++ b/include/aie/Dialect/AIE/IR/AIEAttrs.td @@ -100,7 +100,7 @@ def AIEDevice: I32EnumAttr<"AIEDevice", "AIE Device", I32EnumAttrCase<"xcvc1902", 1>, I32EnumAttrCase<"xcve2302", 2>, I32EnumAttrCase<"xcve2802", 3>, - I32EnumAttrCase<"ipu", 4> + I32EnumAttrCase<"npu", 4> ]> { let cppNamespace = "xilinx::AIE"; diff --git a/include/aie/Dialect/AIE/IR/AIETargetModel.h b/include/aie/Dialect/AIE/IR/AIETargetModel.h index b524e97578..42ac68dc74 100644 --- a/include/aie/Dialect/AIE/IR/AIETargetModel.h +++ b/include/aie/Dialect/AIE/IR/AIETargetModel.h @@ -450,11 +450,11 @@ class VE2802TargetModel : public AIE2TargetModel { } }; -class IPUTargetModel : public AIE2TargetModel { +class NPUTargetModel : public AIE2TargetModel { llvm::SmallDenseSet nocColumns = {0, 1, 2, 3}; public: - IPUTargetModel() = default; + NPUTargetModel() = default; int columns() const override { return 5; } diff --git a/include/aie/Dialect/AIEX/IR/AIEX.td b/include/aie/Dialect/AIEX/IR/AIEX.td index e38cb59d65..39ce49ada8 100644 --- a/include/aie/Dialect/AIEX/IR/AIEX.td +++ b/include/aie/Dialect/AIEX/IR/AIEX.td 
@@ -463,7 +463,7 @@ def AIE_SelectOp: AIEX_Op<"select", []>, Results<(outs Index)> { ]; } -def AIE_IpuDmaMemcpyNdOp: AIEX_Op<"ipu.dma_memcpy_nd", [ +def AIE_NpuDmaMemcpyNdOp: AIEX_Op<"npu.dma_memcpy_nd", [ AttrSizedOperandSegments, MyOffsetSizeAndStrideOpInterface ]> { @@ -519,10 +519,10 @@ def AIE_IpuDmaMemcpyNdOp: AIEX_Op<"ipu.dma_memcpy_nd", [ let hasVerifier = 1; } -def AIE_IpuDmaWaitOp: AIEX_Op<"ipu.dma_wait", []> { +def AIE_NpuDmaWaitOp: AIEX_Op<"npu.dma_wait", []> { let summary = "Blocking operation to wait for a DMA to complete execution."; let description = [{ - The IpuDmaWaitOp blocks until the DMA referenced through `symbol` completes execution + The NpuDmaWaitOp blocks until the DMA referenced through `symbol` completes execution and issues a task-complete-token. Example: @@ -530,13 +530,13 @@ def AIE_IpuDmaWaitOp: AIEX_Op<"ipu.dma_wait", []> { ... aie.objectfifo @out0(%tile_0_1, {%tile_0_0}, 4 : i32) : !aie.objectfifo> ... - aiex.ipu.dma_memcpy_nd(0, 0, %arg2[1, 1, 0, 0][1, 1, 32, 32][1, 1, 64]) {id = 0 : i64, issue_token = true, metadata = @out0} : memref<32x64xi32> + aiex.npu.dma_memcpy_nd(0, 0, %arg2[1, 1, 0, 0][1, 1, 32, 32][1, 1, 64]) {id = 0 : i64, issue_token = true, metadata = @out0} : memref<32x64xi32> ... - aiex.ipu.dma_wait { symbol = @out0 } + aiex.npu.dma_wait { symbol = @out0 } ``` Here, we have an objectfifo with symbol name `out0`, which is then referenced in the - `ipu.dma_memcpy_nd` operation as the target for the respective DMA operation. Afterwards, - an `ipu.dma_wait` operation references the same symbol to block until the respective DMA + `npu.dma_memcpy_nd` operation as the target for the respective DMA operation. Afterwards, + an `npu.dma_wait` operation references the same symbol to block until the respective DMA has executed all of its tasks. 
}]; let arguments = ( @@ -549,7 +549,7 @@ def AIE_IpuDmaWaitOp: AIEX_Op<"ipu.dma_wait", []> { } // Write RTP -def AIE_IpuWriteRTPOp: AIEX_Op<"ipu.rtp_write", []> { +def AIE_NpuWriteRTPOp: AIEX_Op<"npu.rtp_write", []> { let summary = "rtp write operator"; let arguments = ( ins StrAttr:$buffer_sym_name, @@ -567,7 +567,7 @@ def AIE_IpuWriteRTPOp: AIEX_Op<"ipu.rtp_write", []> { } // Push BD to Queue -def AIE_IpuShimTilePushQueueOp: AIEX_Op<"ipu.shimtile_push_queue", []> { +def AIE_NpuShimTilePushQueueOp: AIEX_Op<"npu.shimtile_push_queue", []> { let summary = "bd queue push operator"; let arguments = ( ins FlatSymbolRefAttr:$metadata, @@ -586,7 +586,7 @@ def AIE_IpuShimTilePushQueueOp: AIEX_Op<"ipu.shimtile_push_queue", []> { } // WRITE32 -def AIE_IpuWrite32Op: AIEX_Op<"ipu.write32", []> { +def AIE_NpuWrite32Op: AIEX_Op<"npu.write32", []> { let summary = "write32 operator"; let arguments = ( ins I32Attr:$column, @@ -604,7 +604,7 @@ def AIE_IpuWrite32Op: AIEX_Op<"ipu.write32", []> { } // OP_SYNC -def AIE_IpuSyncOp: AIEX_Op<"ipu.sync", []> { +def AIE_NpuSyncOp: AIEX_Op<"npu.sync", []> { let summary = "sync operator"; let arguments = ( ins I32Attr:$column, @@ -624,7 +624,7 @@ def AIE_IpuSyncOp: AIEX_Op<"ipu.sync", []> { } // WRITEBD_EXTEND_SHIMTILE -def AIE_IpuWriteBdExShimTileOp: AIEX_Op<"ipu.writebd_shimtile", []> { +def AIE_NpuWriteBdExShimTileOp: AIEX_Op<"npu.writebd_shimtile", []> { let summary = "dma operator"; let arguments = ( ins I32Attr:$column, diff --git a/include/aie/Dialect/AIEX/Transforms/AIEXPasses.h b/include/aie/Dialect/AIEX/Transforms/AIEXPasses.h index f3e4f48ef2..b22b707712 100644 --- a/include/aie/Dialect/AIEX/Transforms/AIEXPasses.h +++ b/include/aie/Dialect/AIEX/Transforms/AIEXPasses.h @@ -28,7 +28,7 @@ std::unique_ptr> createAIELowerMulticastPass(); std::unique_ptr> createAIEBroadcastPacketPass(); -std::unique_ptr> createAIEDmaToIpuPass(); +std::unique_ptr> createAIEDmaToNpuPass(); std::unique_ptr> createAIEXToStandardPass(); /// Generate the code 
for registering passes. diff --git a/include/aie/Dialect/AIEX/Transforms/AIEXPasses.td b/include/aie/Dialect/AIEX/Transforms/AIEXPasses.td index 911ca71df1..3c4b34a877 100644 --- a/include/aie/Dialect/AIEX/Transforms/AIEXPasses.td +++ b/include/aie/Dialect/AIEX/Transforms/AIEXPasses.td @@ -17,7 +17,7 @@ def AIEXToStandard : Pass<"aiex-standard-lowering", "mlir::ModuleOp"> { let summary = "Lower AIEX operations"; let description = [{ - AIEX Ipu Ops are removed. + AIEX Npu Ops are removed. }]; @@ -133,13 +133,13 @@ def AIELowerMemcpy : Pass<"aie-lower-memcpy", "AIE::DeviceOp"> { ]; } -def AIEDmaToIpu : Pass<"aie-dma-to-ipu", "AIE::DeviceOp"> { +def AIEDmaToNpu : Pass<"aie-dma-to-npu", "AIE::DeviceOp"> { let summary = ""; let description = [{ }]; - let constructor = "xilinx::AIEX::createAIEDmaToIpuPass()"; + let constructor = "xilinx::AIEX::createAIEDmaToNpuPass()"; let dependentDialects = [ "mlir::func::FuncDialect", "xilinx::AIE::AIEDialect", diff --git a/include/aie/Targets/AIETargets.h b/include/aie/Targets/AIETargets.h index b9b960d798..114f9c5335 100644 --- a/include/aie/Targets/AIETargets.h +++ b/include/aie/Targets/AIETargets.h @@ -31,9 +31,9 @@ mlir::LogicalResult AIETranslateShimSolution(mlir::ModuleOp module, llvm::raw_ostream &); mlir::LogicalResult AIETranslateGraphXPE(mlir::ModuleOp module, llvm::raw_ostream &); -mlir::LogicalResult AIETranslateToIPU(mlir::ModuleOp module, +mlir::LogicalResult AIETranslateToNPU(mlir::ModuleOp module, llvm::raw_ostream &output); -std::vector AIETranslateToIPU(mlir::ModuleOp); +std::vector AIETranslateToNPU(mlir::ModuleOp); mlir::LogicalResult AIETranslateToLdScript(mlir::ModuleOp module, llvm::raw_ostream &output, int tileCol, int tileRow); diff --git a/lib/CAPI/Translation.cpp b/lib/CAPI/Translation.cpp index 67f9cb8947..6488f44357 100644 --- a/lib/CAPI/Translation.cpp +++ b/lib/CAPI/Translation.cpp @@ -76,15 +76,15 @@ aieTranslateToCDODirect(MlirOperation moduleOp, MlirStringRef workDirPath, return wrap(status); } 
-MlirStringRef aieTranslateToIPU(MlirOperation moduleOp) { - std::string ipu; - llvm::raw_string_ostream os(ipu); +MlirStringRef aieTranslateToNPU(MlirOperation moduleOp) { + std::string npu; + llvm::raw_string_ostream os(npu); ModuleOp mod = llvm::cast(unwrap(moduleOp)); - if (failed(AIETranslateToIPU(mod, os))) + if (failed(AIETranslateToNPU(mod, os))) return mlirStringRefCreate(nullptr, 0); - char *cStr = static_cast(malloc(ipu.size())); - ipu.copy(cStr, ipu.size()); - return mlirStringRefCreate(cStr, ipu.size()); + char *cStr = static_cast(malloc(npu.size())); + npu.copy(cStr, npu.size()); + return mlirStringRefCreate(cStr, npu.size()); } MlirStringRef aieTranslateToXAIEV2(MlirOperation moduleOp) { diff --git a/lib/Dialect/AIE/IR/AIEDialect.cpp b/lib/Dialect/AIE/IR/AIEDialect.cpp index 2f9ab57f92..a9e80b44b1 100644 --- a/lib/Dialect/AIE/IR/AIEDialect.cpp +++ b/lib/Dialect/AIE/IR/AIEDialect.cpp @@ -104,7 +104,7 @@ LogicalResult myVerifyOffsetSizeAndStrideOp(OffsetSizeAndStrideOpInterface op) { static VC1902TargetModel VC1902model; static VE2302TargetModel VE2302model; static VE2802TargetModel VE2802model; -static IPUTargetModel IPUmodel; +static NPUTargetModel NPUmodel; const AIETargetModel &getTargetModel(Operation *op) { if (auto t = dyn_cast(op)) @@ -983,8 +983,8 @@ const AIETargetModel &DeviceOp::getTargetModel() { return VE2302model; case AIEDevice::xcve2802: return VE2802model; - case AIEDevice::ipu: - return IPUmodel; + case AIEDevice::npu: + return NPUmodel; } return VC1902model; } diff --git a/lib/Dialect/AIEX/IR/AIEXDialect.cpp b/lib/Dialect/AIEX/IR/AIEXDialect.cpp index 6363626aa2..f2c9ebc433 100644 --- a/lib/Dialect/AIEX/IR/AIEXDialect.cpp +++ b/lib/Dialect/AIEX/IR/AIEXDialect.cpp @@ -64,7 +64,7 @@ LogicalResult AIEX::BroadcastPacketOp::verify() { return success(); } -LogicalResult AIEX::IpuDmaMemcpyNdOp::verify() { +LogicalResult AIEX::NpuDmaMemcpyNdOp::verify() { MemRefType buffer = getMemref().getType(); if (buffer.getElementTypeBitWidth() != 32) 
return emitOpError("must be used with memref type with element width 32."); @@ -105,7 +105,7 @@ LogicalResult AIEX::IpuDmaMemcpyNdOp::verify() { return success(); } -LogicalResult AIEX::IpuDmaWaitOp::verify() { +LogicalResult AIEX::NpuDmaWaitOp::verify() { AIE::DeviceOp dev = (*this)->getParentOfType(); // Some passes (e.g. aie-standard-lowering) use aiex ops outside a DeviceOp, // so we can't expect the device to always exist. @@ -114,7 +114,7 @@ LogicalResult AIEX::IpuDmaWaitOp::verify() { return success(); } -LogicalResult AIEX::IpuShimTilePushQueueOp::verify() { +LogicalResult AIEX::NpuShimTilePushQueueOp::verify() { const auto &targetModel = AIE::getTargetModel(*this); auto numBds = targetModel.getNumBDs(0, 0); // assume shim if (getBdId() > numBds) @@ -124,7 +124,7 @@ LogicalResult AIEX::IpuShimTilePushQueueOp::verify() { return success(); } -LogicalResult AIEX::IpuWriteBdExShimTileOp::verify() { +LogicalResult AIEX::NpuWriteBdExShimTileOp::verify() { const auto &targetModel = AIE::getTargetModel(*this); auto numBds = targetModel.getNumBDs(0, 0); // assume shim if (getBdId() > numBds) diff --git a/lib/Dialect/AIEX/Transforms/AIEDmaToIpu.cpp b/lib/Dialect/AIEX/Transforms/AIEDmaToNpu.cpp similarity index 87% rename from lib/Dialect/AIEX/Transforms/AIEDmaToIpu.cpp rename to lib/Dialect/AIEX/Transforms/AIEDmaToNpu.cpp index 3841f73bf5..7239fbf5a0 100644 --- a/lib/Dialect/AIEX/Transforms/AIEDmaToIpu.cpp +++ b/lib/Dialect/AIEX/Transforms/AIEDmaToNpu.cpp @@ -1,4 +1,4 @@ -//===- AIEDmaToIpu.cpp ------------------------------------------*- C++ -*-===// +//===- AIEDmaToNpu.cpp ------------------------------------------*- C++ -*-===// // // This file is licensed under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. 
@@ -20,14 +20,14 @@ using namespace mlir; using namespace xilinx; using namespace xilinx::AIEX; -struct RtpToIpuPattern : OpConversionPattern { +struct RtpToNpuPattern : OpConversionPattern { using OpConversionPattern::OpConversionPattern; - RtpToIpuPattern(MLIRContext *context, PatternBenefit benefit = 1) + RtpToNpuPattern(MLIRContext *context, PatternBenefit benefit = 1) : OpConversionPattern(context, benefit) {} LogicalResult - matchAndRewrite(IpuWriteRTPOp op, OpAdaptor adaptor, + matchAndRewrite(NpuWriteRTPOp op, OpAdaptor adaptor, ConversionPatternRewriter &rewriter) const override { auto ctx = op->getContext(); auto i32ty = IntegerType::get(ctx, 32); @@ -59,7 +59,7 @@ struct RtpToIpuPattern : OpConversionPattern { IntegerAttr row = IntegerAttr::get(i32ty, r); IntegerAttr address = IntegerAttr::get(ui32ty, rtp_buffer_addr); IntegerAttr value = IntegerAttr::get(i32ty, v); - rewriter.create(op->getLoc(), column.getInt(), row.getInt(), + rewriter.create(op->getLoc(), column.getInt(), row.getInt(), address.getUInt(), value.getInt()); rewriter.eraseOp(op); @@ -81,14 +81,14 @@ getAllocOpForSymbol(AIE::DeviceOp dev, StringRef sym_name) { return std::nullopt; } -struct PushToIpuPattern : OpConversionPattern { +struct PushToNpuPattern : OpConversionPattern { using OpConversionPattern::OpConversionPattern; - PushToIpuPattern(MLIRContext *context, PatternBenefit benefit = 1) + PushToNpuPattern(MLIRContext *context, PatternBenefit benefit = 1) : OpConversionPattern(context, benefit) {} LogicalResult - matchAndRewrite(IpuShimTilePushQueueOp op, OpAdaptor adaptor, + matchAndRewrite(NpuShimTilePushQueueOp op, OpAdaptor adaptor, ConversionPatternRewriter &rewriter) const override { auto ctx = op->getContext(); auto i32ty = IntegerType::get(ctx, 32); @@ -134,7 +134,7 @@ struct PushToIpuPattern : OpConversionPattern { cmd |= 0x80000000; IntegerAttr value = IntegerAttr::get(ui32ty, cmd); - rewriter.create(op->getLoc(), column.getInt(), zero.getInt(), + 
rewriter.create(op->getLoc(), column.getInt(), zero.getInt(), address.getUInt(), value.getUInt()); rewriter.eraseOp(op); @@ -142,14 +142,14 @@ struct PushToIpuPattern : OpConversionPattern { } }; -struct DmaToIpuPattern : OpConversionPattern { +struct DmaToNpuPattern : OpConversionPattern { using OpConversionPattern::OpConversionPattern; - DmaToIpuPattern(MLIRContext *context, PatternBenefit benefit = 1) + DmaToNpuPattern(MLIRContext *context, PatternBenefit benefit = 1) : OpConversionPattern(context, benefit) {} LogicalResult - matchAndRewrite(IpuDmaMemcpyNdOp op, OpAdaptor adaptor, + matchAndRewrite(NpuDmaMemcpyNdOp op, OpAdaptor adaptor, ConversionPatternRewriter &rewriter) const override { auto ctx = op->getContext(); auto i32ty = IntegerType::get(ctx, 32); @@ -320,14 +320,14 @@ struct DmaToIpuPattern : OpConversionPattern { if (!isMM2S) issue_token = BoolAttr::get(ctx, true); - (void)rewriter.create( + (void)rewriter.create( op->getLoc(), column, column_num, ddr_id, bd_id, buffer_length, buffer_offset, enable_packet, out_of_order_id, packet_id, packet_type, d0_size, d0_stride, d1_size, d1_stride, d2_stride, iteration_current, iteration_size, iteration_stride, next_bd, use_next_bd, valid_bd, lock_rel_val, lock_rel_id, lock_acq_enable, lock_acq_val, lock_acq_id); - rewriter.create(op->getLoc(), op.getMetadataAttr(), + rewriter.create(op->getLoc(), op.getMetadataAttr(), issue_token, repeat_count, bd_id); rewriter.eraseOp(op); @@ -335,17 +335,17 @@ struct DmaToIpuPattern : OpConversionPattern { } }; -/// Convert IpuDmaWaitOp into IpuSyncOp by retrieving the necessary +/// Convert NpuDmaWaitOp into NpuSyncOp by retrieving the necessary /// information from the ShimDMAAllocationOp referenced through the /// symbol argument of this op. 
-struct DmaWaitToIpuPattern : OpConversionPattern { +struct DmaWaitToNpuPattern : OpConversionPattern { using OpConversionPattern::OpConversionPattern; - DmaWaitToIpuPattern(MLIRContext *context, PatternBenefit benefit = 1) + DmaWaitToNpuPattern(MLIRContext *context, PatternBenefit benefit = 1) : OpConversionPattern(context, benefit) {} LogicalResult - matchAndRewrite(IpuDmaWaitOp op, OpAdaptor adaptor, + matchAndRewrite(NpuDmaWaitOp op, OpAdaptor adaptor, ConversionPatternRewriter &rewriter) const override { AIE::DeviceOp dev = op->getParentOfType(); if (!dev) @@ -364,13 +364,13 @@ struct DmaWaitToIpuPattern : OpConversionPattern { // Create with `column_num == 1` and `row_num == 1` to check for a single // column and row. Row is always 0 for shim tiles. - (void)rewriter.replaceOpWithNewOp(op, column, 0, direction, + (void)rewriter.replaceOpWithNewOp(op, column, 0, direction, channel, 1, 1); return success(); } }; -struct AIEDmaToIpuPass : AIEDmaToIpuBase { +struct AIEDmaToNpuPass : AIEDmaToNpuBase { void runOnOperation() override { AIE::DeviceOp device = getOperation(); @@ -379,22 +379,22 @@ struct AIEDmaToIpuPass : AIEDmaToIpuBase { target.addLegalDialect(); target.addLegalOp(); target.addLegalOp(); - target.addIllegalOp(); - target.addIllegalOp(); - target.addIllegalOp(); - target.addIllegalOp(); + target.addIllegalOp(); + target.addIllegalOp(); + target.addIllegalOp(); + target.addIllegalOp(); RewritePatternSet patterns(&getContext()); - patterns.insert(&getContext()); - patterns.insert(&getContext()); - patterns.insert(&getContext()); - patterns.insert(&getContext()); + patterns.insert(&getContext()); + patterns.insert(&getContext()); + patterns.insert(&getContext()); + patterns.insert(&getContext()); if (failed(applyPartialConversion(device, target, std::move(patterns)))) signalPassFailure(); } }; -std::unique_ptr> AIEX::createAIEDmaToIpuPass() { - return std::make_unique(); +std::unique_ptr> AIEX::createAIEDmaToNpuPass() { + return std::make_unique(); } 
diff --git a/lib/Dialect/AIEX/Transforms/AIEXToStandard.cpp b/lib/Dialect/AIEX/Transforms/AIEXToStandard.cpp index 40609c18e5..b7aa242134 100644 --- a/lib/Dialect/AIEX/Transforms/AIEXToStandard.cpp +++ b/lib/Dialect/AIEX/Transforms/AIEXToStandard.cpp @@ -47,14 +47,14 @@ struct AIEXToStandardPass : AIEXToStandardBase { ModuleOp m = getOperation(); ConversionTarget target(getContext()); RewritePatternSet removepatterns(&getContext()); - removepatterns.add>(m.getContext(), m); - removepatterns.add>(m.getContext(), m); - removepatterns.add>(m.getContext(), + removepatterns.add>(m.getContext(), m); + removepatterns.add>(m.getContext(), m); + removepatterns.add>(m.getContext(), m); - removepatterns.add>(m.getContext(), m); - removepatterns.add>(m.getContext(), m); - removepatterns.add>(m.getContext(), m); - removepatterns.add>(m.getContext(), + removepatterns.add>(m.getContext(), m); + removepatterns.add>(m.getContext(), m); + removepatterns.add>(m.getContext(), m); + removepatterns.add>(m.getContext(), m); if (failed(applyPartialConversion(m, target, std::move(removepatterns)))) diff --git a/lib/Dialect/AIEX/Transforms/CMakeLists.txt b/lib/Dialect/AIEX/Transforms/CMakeLists.txt index 72cec940f7..3cfcac793d 100644 --- a/lib/Dialect/AIEX/Transforms/CMakeLists.txt +++ b/lib/Dialect/AIEX/Transforms/CMakeLists.txt @@ -13,7 +13,7 @@ add_mlir_dialect_library(AIEXTransforms AIECreateBroadcastPacket.cpp AIELowerMulticast.cpp AIELowerMemcpy.cpp - AIEDmaToIpu.cpp + AIEDmaToNpu.cpp ADDITIONAL_HEADER_DIRS ${AIE_BINARY_DIR}/include diff --git a/lib/Targets/AIETargetCDODirect.cpp b/lib/Targets/AIETargetCDODirect.cpp index b4778e22c4..5f3b94af5f 100644 --- a/lib/Targets/AIETargetCDODirect.cpp +++ b/lib/Targets/AIETargetCDODirect.cpp @@ -265,7 +265,7 @@ LogicalResult configureBdInBlock(XAie_DevInst &devInst, XAie_DmaDesc &dmaTileBd, // write them out like this so they show up with names in debug prints size_t smid = 0; size_t burstLen = 16; // (10):BLEN=16 (256Byte) (corresponds to - // 
0x800000000 from targetipu) + // 0x800000000 from target) size_t qOs = 0; size_t cache = 0; size_t secure = 0; @@ -559,8 +559,8 @@ struct AIEControl { int32_t col = switchboxOp.colIndex(); int32_t row = switchboxOp.rowIndex(); XAie_LocType tileLoc = XAie_TileLoc(col, row); - assert(targetOp.getDevice() == AIEDevice::ipu && - "Only IPU currently supported"); + assert(targetOp.getDevice() == AIEDevice::npu && + "Only NPU currently supported"); if (row == 0) { // FIXME hack for TCT routing // TODO Support both channels @@ -780,9 +780,9 @@ LogicalResult AIETranslateToCDODirect(ModuleOp m, llvm::StringRef workDirPath, "only exactly 1 device op supported."); DeviceOp targetOp = *devOps.begin(); // things like XAIE_MEM_TILE_ROW_START and the missing - // shim dma on tile (0,0) are hard-coded assumptions about IPU... - assert(targetOp.getDevice() == AIEDevice::ipu && - "Only IPU currently supported"); + // shim dma on tile (0,0) are hard-coded assumptions about NPU... + assert(targetOp.getDevice() == AIEDevice::npu && + "Only NPU currently supported"); int maxCol = 0, minCol = 0; for (auto tileOp : targetOp.getOps()) { minCol = std::min(tileOp.getCol(), minCol); diff --git a/lib/Targets/AIETargetHSA.cpp b/lib/Targets/AIETargetHSA.cpp index 36701fc96d..098a1b5dac 100644 --- a/lib/Targets/AIETargetHSA.cpp +++ b/lib/Targets/AIETargetHSA.cpp @@ -14,7 +14,7 @@ #include "aie/Dialect/AIEX/IR/AIEXDialect.h" #include "aie/Targets/AIETargets.h" -#include "mlir/Dialect/Func/IR/FuncOps.h" // Eddie added to get the IPU func ops +#include "mlir/Dialect/Func/IR/FuncOps.h" // Eddie added to get the NPU func ops #include "mlir/IR/Attributes.h" #include "mlir/IR/IRMapping.h" #include "mlir/Pass/Pass.h" @@ -95,7 +95,7 @@ mlir::LogicalResult AIETranslateToHSA(ModuleOp module, raw_ostream &output) { // Looping over every Memcpy operation so we take the correct number of // buffers int num_ops = 0; - for (auto op : funcOp.getOps()) { + for (auto op : funcOp.getOps()) { // Getting the IDs of the 
buffers auto memref = op.getMemref(); Block &entryBB = op->getParentOfType().getBody().front(); @@ -117,7 +117,7 @@ mlir::LogicalResult AIETranslateToHSA(ModuleOp module, raw_ostream &output) { output << "\tuint64_t packet_id = 0;\n"; int op_count = 0; - for (auto op : funcOp.getOps()) { + for (auto op : funcOp.getOps()) { auto dev = funcOp->getParentOfType(); if (!dev) { op.emitOpError("couldn't get DeviceOp"); diff --git a/lib/Targets/AIETargetIPU.cpp b/lib/Targets/AIETargetNPU.cpp similarity index 88% rename from lib/Targets/AIETargetIPU.cpp rename to lib/Targets/AIETargetNPU.cpp index 6117bfdb40..7f17c3ad83 100644 --- a/lib/Targets/AIETargetIPU.cpp +++ b/lib/Targets/AIETargetNPU.cpp @@ -1,4 +1,4 @@ -//===- AIETargetIPU.cpp -----------------------------------------*- C++ -*-===// +//===- AIETargetNPU.cpp -----------------------------------------*- C++ -*-===// // // This file is licensed under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. 
@@ -50,7 +50,7 @@ reserveAndGetTail(std::vector &instructions, uint64_t tailSize) { tailSize); } -void appendSync(std::vector &instructions, IpuSyncOp op) { +void appendSync(std::vector &instructions, NpuSyncOp op) { auto words = reserveAndGetTail(instructions, 2); @@ -65,7 +65,7 @@ void appendSync(std::vector &instructions, IpuSyncOp op) { words[1] |= (op.getRowNum() & 0xff) << 8; } -void appendWrite32(std::vector &instructions, IpuWrite32Op op) { +void appendWrite32(std::vector &instructions, NpuWrite32Op op) { auto words = reserveAndGetTail(instructions, 3); @@ -80,7 +80,7 @@ void appendWrite32(std::vector &instructions, IpuWrite32Op op) { } void appendWriteBdShimTile(std::vector &instructions, - IpuWriteBdExShimTileOp op) { + NpuWriteBdExShimTileOp op) { auto words = reserveAndGetTail(instructions, 10); @@ -131,7 +131,7 @@ void appendWriteBdShimTile(std::vector &instructions, } // namespace -std::vector xilinx::AIE::AIETranslateToIPU(ModuleOp module) { +std::vector xilinx::AIE::AIETranslateToNPU(ModuleOp module) { std::vector instructions = getProlog(); @@ -143,9 +143,9 @@ std::vector xilinx::AIE::AIETranslateToIPU(ModuleOp module) { Block &entry = f.getRegion().front(); for (auto &o : entry) { llvm::TypeSwitch(&o) - .Case([&](auto op) { appendSync(instructions, op); }) - .Case([&](auto op) { appendWrite32(instructions, op); }) - .Case( + .Case([&](auto op) { appendSync(instructions, op); }) + .Case([&](auto op) { appendWrite32(instructions, op); }) + .Case( [&](auto op) { appendWriteBdShimTile(instructions, op); }); } } @@ -153,9 +153,9 @@ std::vector xilinx::AIE::AIETranslateToIPU(ModuleOp module) { return instructions; } -LogicalResult xilinx::AIE::AIETranslateToIPU(ModuleOp module, +LogicalResult xilinx::AIE::AIETranslateToNPU(ModuleOp module, raw_ostream &output) { - auto instructions = AIETranslateToIPU(module); + auto instructions = AIETranslateToNPU(module); for (auto w : instructions) output << llvm::format("%08X\n", w); return success(); diff --git 
a/lib/Targets/AIETargets.cpp b/lib/Targets/AIETargets.cpp index 05d79e3eaa..0caa039ef0 100644 --- a/lib/Targets/AIETargets.cpp +++ b/lib/Targets/AIETargets.cpp @@ -320,10 +320,10 @@ void registerAIETranslations() { cdoAieSim, cdoXaieDebug, cdoPartitionStartCol, cdoEnableCores); }, registerDialects); - TranslateFromMLIRRegistration registrationIPU( - "aie-ipu-instgen", "Generate instructions for IPU", + TranslateFromMLIRRegistration registrationNPU( + "aie-npu-instgen", "Generate instructions for NPU", [](ModuleOp module, raw_ostream &output) { - return AIETranslateToIPU(module, output); + return AIETranslateToNPU(module, output); }, registerDialects); } diff --git a/lib/Targets/CMakeLists.txt b/lib/Targets/CMakeLists.txt index 85b3405ede..e6e4307c2c 100644 --- a/lib/Targets/CMakeLists.txt +++ b/lib/Targets/CMakeLists.txt @@ -11,7 +11,7 @@ add_mlir_library(AIETargets AIETargets.cpp AIETargetBCF.cpp AIETargetCDODirect.cpp - AIETargetIPU.cpp + AIETargetNPU.cpp AIETargetLdScript.cpp AIETargetXAIEV2.cpp AIETargetHSA.cpp diff --git a/programming_examples/basic/matrix_add_one/Makefile b/programming_examples/basic/matrix_add_one/Makefile index 435b7b8c9e..83014fbeaf 100644 --- a/programming_examples/basic/matrix_add_one/Makefile +++ b/programming_examples/basic/matrix_add_one/Makefile @@ -13,15 +13,15 @@ ACDC_AIE = $(dir $(shell which aie-opt))/.. SHELL := /bin/bash targetname = matrixAddOne -devicename = ipu +devicename = npu col = 0 all: build/final.xclbin build/final.xclbin: build/aie.mlir mkdir -p ${@D} - cd ${@D} && aiecc.py --aie-generate-cdo --aie-generate-ipu --no-compile-host \ - --xclbin-name=${@F} --ipu-insts-name=insts.txt ${Matrix Addition -Single tile performs a very simple `+` operation where the kernel loads data from local memory, increments the value by `1` and stores it back. The DMA in the Shim tile is programmed to bring the bottom left `8x16` portion of a larger `16x128` matrix into the tile to perform the operation. 
This reference design can be run on either a RyzenAI IPU or a VCK5000. +Single tile performs a very simple `+` operation where the kernel loads data from local memory, increments the value by `1` and stores it back. The DMA in the Shim tile is programmed to bring the bottom left `8x16` portion of a larger `16x128` matrix into the tile to perform the operation. This reference design can be run on either a RyzenAI NPU or a VCK5000. -The kernel executes on AIE tile (`col`, 2). Input data is brought to the local memory of the tile from Shim tile (`col`, 0). The value of `col` is dependent on whether the application is targetting IPU or VCK5000. The Shim tile is programmed with a 2D DMA to only bring a 2D submatrix into the AIE tile for processing. +The kernel executes on AIE tile (`col`, 2). Input data is brought to the local memory of the tile from Shim tile (`col`, 0). The value of `col` is dependent on whether the application is targetting NPU or VCK5000. The Shim tile is programmed with a 2D DMA to only bring a 2D submatrix into the AIE tile for processing. 
-To compile and run the design for IPU: +To compile and run the design for NPU: ``` make make run diff --git a/programming_examples/basic/matrix_add_one/aie2.py b/programming_examples/basic/matrix_add_one/aie2.py index 36eb3a3d38..a80ba794e6 100644 --- a/programming_examples/basic/matrix_add_one/aie2.py +++ b/programming_examples/basic/matrix_add_one/aie2.py @@ -35,8 +35,8 @@ def my_matrix_add_one(): if len(sys.argv) != 3: raise ValueError("[ERROR] Need 2 command line arguments (Device name, Col)") - if sys.argv[1] == "ipu": - dev = AIEDevice.ipu + if sys.argv[1] == "npu": + dev = AIEDevice.npu elif sys.argv[1] == "xcvc1902": dev = AIEDevice.xcvc1902 else: @@ -85,21 +85,21 @@ def core_body(): @FuncOp.from_py_func(tensor_ty, tensor_ty, tensor_ty) def sequence(inTensor, notUsed, outTensor): - ipu_dma_memcpy_nd( + npu_dma_memcpy_nd( metadata="out0", bd_id=0, mem=outTensor, sizes=[1, 1, TILE_HEIGHT, TILE_WIDTH], strides=[1, 1, IMAGE_WIDTH], ) - ipu_dma_memcpy_nd( + npu_dma_memcpy_nd( metadata="in0", bd_id=1, mem=inTensor, sizes=[1, 1, TILE_HEIGHT, TILE_WIDTH], strides=[1, 1, IMAGE_WIDTH], ) - ipu_sync(column=0, row=0, direction=0, channel=0) + npu_sync(column=0, row=0, direction=0, channel=0) print(ctx.module) diff --git a/programming_examples/basic/matrix_add_one/run.lit b/programming_examples/basic/matrix_add_one/run.lit index a429e99221..1922c01828 100644 --- a/programming_examples/basic/matrix_add_one/run.lit +++ b/programming_examples/basic/matrix_add_one/run.lit @@ -3,9 +3,9 @@ // // REQUIRES: ryzen_ai // -// RUN: %python %S/aie2.py ipu 0 > ./aie.mlir -// RUN: %python aiecc.py --aie-generate-cdo --aie-generate-ipu --no-compile-host --xclbin-name=aie.xclbin --ipu-insts-name=insts.txt ./aie.mlir +// RUN: %python %S/aie2.py npu 0 > ./aie.mlir +// RUN: %python aiecc.py --aie-generate-cdo --aie-generate-npu --no-compile-host --xclbin-name=aie.xclbin --npu-insts-name=insts.txt ./aie.mlir // RUN: clang %S/test.cpp -o test.exe -std=c++11 -Wall %xrt_flags -lrt -lstdc++ 
-lboost_program_options -lboost_filesystem -// RUN: %run_on_ipu ./test.exe -x aie.xclbin -k MLIR_AIE -i insts.txt | FileCheck %s +// RUN: %run_on_npu ./test.exe -x aie.xclbin -k MLIR_AIE -i insts.txt | FileCheck %s // CHECK: PASS! diff --git a/programming_examples/basic/matrix_multiplication/CMakeLists.txt b/programming_examples/basic/matrix_multiplication/CMakeLists.txt index dfe345e188..0f062b0322 100644 --- a/programming_examples/basic/matrix_multiplication/CMakeLists.txt +++ b/programming_examples/basic/matrix_multiplication/CMakeLists.txt @@ -27,7 +27,7 @@ if (NOT WSL) else() set(BOOST_ROOT C:/Technical/thirdParty/boost_1_83_0 CACHE STRING "Path to Boost install") set(XRT_INC_DIR C:/Technical/XRT/src/runtime_src/core/include CACHE STRING "Path to XRT cloned repo") - set(XRT_LIB_DIR C:/Technical/xrtIPUfromDLL CACHE STRING "Path to xrt_coreutil.lib") + set(XRT_LIB_DIR C:/Technical/xrtNPUfromDLL CACHE STRING "Path to xrt_coreutil.lib") endif() set(TARGET_NAME test CACHE STRING "Target to be built") diff --git a/programming_examples/basic/matrix_multiplication/makefile-common b/programming_examples/basic/matrix_multiplication/makefile-common index fd6a438ea0..6149657e1b 100644 --- a/programming_examples/basic/matrix_multiplication/makefile-common +++ b/programming_examples/basic/matrix_multiplication/makefile-common @@ -60,7 +60,7 @@ ${mlir_target}: aie2.py ${xclbin_target}: ${mlir_target} ${kernels:%=build/%.o} mkdir -p ${@D} cd ${@D} && aiecc.py --aie-generate-cdo --no-compile-host --xclbin-name=${@F} \ - --aie-generate-ipu --ipu-insts-name=${insts_target:build/%=%} $(<:%=../%) + --aie-generate-npu --npu-insts-name=${insts_target:build/%=%} $(<:%=../%) ${targetname}.exe: test.cpp ../test.cpp ../common.h rm -rf _build diff --git a/programming_examples/basic/matrix_multiplication/matrix_vector/aie2.py b/programming_examples/basic/matrix_multiplication/matrix_vector/aie2.py index 80b5c89613..4ac31574fd 100644 --- 
a/programming_examples/basic/matrix_multiplication/matrix_vector/aie2.py +++ b/programming_examples/basic/matrix_multiplication/matrix_vector/aie2.py @@ -42,7 +42,7 @@ def my_matmul(): with mlir_mod_ctx() as ctx: - @device(AIEDevice.ipu) + @device(AIEDevice.npu) def device_body(): memRef_inA_ty = T.memref(m * k, T.bf16()) memRef_inB_ty = T.memref(k, T.bf16()) @@ -176,7 +176,7 @@ def core_body(): T.memref(C_sz_in_i32s, T.i32()), ) def sequence(A, B, C): - ipu_dma_memcpy_nd( + npu_dma_memcpy_nd( metadata=inB_fifo_names[0], bd_id=2, mem=B, @@ -186,7 +186,7 @@ def sequence(A, B, C): for i in range(n_cores): A_offset = i * M_div_m_div_n_cores * m * K * word_size_in // 4 C_offset = i * M_div_m_div_n_cores * m * word_size_out // 4 - ipu_dma_memcpy_nd( + npu_dma_memcpy_nd( metadata=memA_fifo_names[i], bd_id=1, mem=A, @@ -194,7 +194,7 @@ def sequence(A, B, C): sizes=[M_div_m_div_n_cores, K_div_k, m, k_in_i32s], strides=[m_x_K_in_i32s, k_in_i32s, K_in_i32s], ) - ipu_dma_memcpy_nd( + npu_dma_memcpy_nd( metadata=outC_fifo_names[i], bd_id=0, mem=C, @@ -204,7 +204,7 @@ def sequence(A, B, C): ) for i in range(n_cores): - ipu_sync(column=i, row=0, direction=0, channel=0) + npu_sync(column=i, row=0, direction=0, channel=0) print(ctx.module) diff --git a/programming_examples/basic/matrix_multiplication/matrix_vector/run.lit b/programming_examples/basic/matrix_multiplication/matrix_vector/run.lit index d446e4f966..eeaa69352a 100644 --- a/programming_examples/basic/matrix_multiplication/matrix_vector/run.lit +++ b/programming_examples/basic/matrix_multiplication/matrix_vector/run.lit @@ -5,8 +5,8 @@ // // RUN: xchesscc_wrapper aie2 -I %aietools/include -c %S/../../../../aie_kernels/aie2/mv.cc -o ./mv.o // RUN: %python %S/aie2.py -M 288 -K 288 -N 1 > ./aie.mlir -// RUN: %python aiecc.py --aie-generate-cdo --aie-generate-ipu --no-compile-host --xclbin-name=aie.xclbin --ipu-insts-name=insts.txt ./aie.mlir +// RUN: %python aiecc.py --aie-generate-cdo --aie-generate-npu --no-compile-host 
--xclbin-name=aie.xclbin --npu-insts-name=insts.txt ./aie.mlir // RUN: g++-13 %S/test.cpp -o test.exe -std=c++23 -Wall %xrt_flags -lrt -lstdc++ -lboost_program_options -lboost_filesystem -// RUN: %run_on_ipu ./test.exe -x aie.xclbin -k MLIR_AIE -i insts.txt -M 288 -K 288 -N 1 -v 1 | FileCheck %s +// RUN: %run_on_npu ./test.exe -x aie.xclbin -k MLIR_AIE -i insts.txt -M 288 -K 288 -N 1 -v 1 | FileCheck %s // CHECK: PASS! diff --git a/programming_examples/basic/matrix_multiplication/single_core/aie2.py b/programming_examples/basic/matrix_multiplication/single_core/aie2.py index ef94adc74a..e00534e708 100644 --- a/programming_examples/basic/matrix_multiplication/single_core/aie2.py +++ b/programming_examples/basic/matrix_multiplication/single_core/aie2.py @@ -55,7 +55,7 @@ def my_matmul(): with mlir_mod_ctx() as ctx: - @device(AIEDevice.ipu) + @device(AIEDevice.npu) def device_body(): memref_a_ty = T.memref(m, k, T.bf16()) memref_b_ty = T.memref(k, n, T.bf16()) @@ -195,7 +195,7 @@ def sequence(A, B, C): num_tile_rows = min( [rows_per_block, M_div_m - tile_row_block * rows_per_block] ) - ipu_dma_memcpy_nd( + npu_dma_memcpy_nd( metadata="outC", bd_id=0, mem=C, @@ -211,7 +211,7 @@ def sequence(A, B, C): * word_size_in // 4 ) - ipu_dma_memcpy_nd( + npu_dma_memcpy_nd( metadata="inA", bd_id=2 * tile_row + 1, mem=A, @@ -219,7 +219,7 @@ def sequence(A, B, C): sizes=[N_div_n, K_div_k, m, k_in_i32s], strides=[0, k_in_i32s, K_in_i32s], ) - ipu_dma_memcpy_nd( + npu_dma_memcpy_nd( metadata="inB", bd_id=2 * tile_row + 2, mem=B, @@ -227,7 +227,7 @@ def sequence(A, B, C): strides=[n_in_i32s, k_x_N_in_i32s, N_in_i32s], ) - ipu_sync(column=0, row=0, direction=0, channel=0) + npu_sync(column=0, row=0, direction=0, channel=0) print(ctx.module) diff --git a/programming_examples/basic/matrix_multiplication/single_core/run.lit b/programming_examples/basic/matrix_multiplication/single_core/run.lit index 0209415093..6f6a32320a 100644 --- 
a/programming_examples/basic/matrix_multiplication/single_core/run.lit +++ b/programming_examples/basic/matrix_multiplication/single_core/run.lit @@ -5,7 +5,7 @@ // // RUN: xchesscc_wrapper aie2 -I %aietools/include -c %S/../../../../aie_kernels/aie2/mm.cc -o ./mm.o // RUN: %python %S/aie2.py -M 256 -K 256 -N 256 > ./aie.mlir -// RUN: %python aiecc.py --xbridge --aie-generate-cdo --aie-generate-ipu --no-compile-host --xclbin-name=aie.xclbin --ipu-insts-name=insts.txt ./aie.mlir +// RUN: %python aiecc.py --xbridge --aie-generate-cdo --aie-generate-npu --no-compile-host --xclbin-name=aie.xclbin --npu-insts-name=insts.txt ./aie.mlir // RUN: g++-13 %S/test.cpp -o test.exe -std=c++23 -Wall %xrt_flags -lrt -lstdc++ -lboost_program_options -lboost_filesystem -// RUN: %run_on_ipu ./test.exe -x aie.xclbin -k MLIR_AIE -i insts.txt -M 256 -K 256 -N 256 -v 1 | FileCheck %s +// RUN: %run_on_npu ./test.exe -x aie.xclbin -k MLIR_AIE -i insts.txt -M 256 -K 256 -N 256 -v 1 | FileCheck %s // CHECK: PASS! 
diff --git a/programming_examples/basic/matrix_multiplication/whole_array/aie2.py b/programming_examples/basic/matrix_multiplication/whole_array/aie2.py index 69a3c52394..d94a7e8eba 100644 --- a/programming_examples/basic/matrix_multiplication/whole_array/aie2.py +++ b/programming_examples/basic/matrix_multiplication/whole_array/aie2.py @@ -74,7 +74,7 @@ def my_matmul(M=512, K=512, N=512): with mlir_mod_ctx() as ctx: - @device(AIEDevice.ipu) + @device(AIEDevice.npu) def device_body(): memRef_inA_ty = T.memref(m * k, T.bf16()) memRef_inB_ty = T.memref(k * n, T.bf16()) @@ -317,7 +317,7 @@ def sequence(A, B, C): for i in range(n_cols): C_col_offset = i * n * word_size_out C_offset_in_i32s = (C_col_offset + C_row_offset) // 4 - ipu_dma_memcpy_nd( + npu_dma_memcpy_nd( metadata=outC_fifo_names[i], bd_id=0, mem=C, @@ -345,7 +345,7 @@ def sequence(A, B, C): ) A_col_offset_in_i32s = i * m * K * word_size_in // 4 B_col_offset_in_i32s = i * n * word_size_in // 4 - ipu_dma_memcpy_nd( + npu_dma_memcpy_nd( metadata=inA_fifo_names[i], bd_id=2 * tile_row + 1, mem=A, @@ -358,7 +358,7 @@ def sequence(A, B, C): sizes=[N_div_n_div_n_cols, K_div_k, m, k_in_i32s], strides=[0, k_in_i32s, K_in_i32s], ) - ipu_dma_memcpy_nd( + npu_dma_memcpy_nd( metadata=inB_fifo_names[i], bd_id=2 * tile_row + 2, mem=B, @@ -367,7 +367,7 @@ def sequence(A, B, C): strides=[n_x_n_cols_in_i32s, k_x_N_in_i32s, N_in_i32s], ) for i in range(n_cols): - ipu_sync(column=i, row=0, direction=0, channel=0) + npu_sync(column=i, row=0, direction=0, channel=0) # print(ctx.module.operation.verify()) print(ctx.module) diff --git a/programming_examples/basic/matrix_multiplication/whole_array/run.lit b/programming_examples/basic/matrix_multiplication/whole_array/run.lit index 202e66b71e..fc23355630 100644 --- a/programming_examples/basic/matrix_multiplication/whole_array/run.lit +++ b/programming_examples/basic/matrix_multiplication/whole_array/run.lit @@ -5,8 +5,8 @@ // // RUN: xchesscc_wrapper aie2 -I %aietools/include -c 
%S/../../../../aie_kernels/aie2/mm.cc -o ./mm.o // RUN: %python %S/aie2.py -M 512 -K 512 -N 512 > ./aie.mlir -// RUN: %python aiecc.py --xbridge --aie-generate-cdo --aie-generate-ipu --no-compile-host --xclbin-name=aie.xclbin --ipu-insts-name=insts.txt ./aie.mlir +// RUN: %python aiecc.py --xbridge --aie-generate-cdo --aie-generate-npu --no-compile-host --xclbin-name=aie.xclbin --npu-insts-name=insts.txt ./aie.mlir // RUN: g++-13 %S/test.cpp -o test.exe -std=c++23 -Wall %xrt_flags -lrt -lstdc++ -lboost_program_options -lboost_filesystem -// RUN: %run_on_ipu ./test.exe -x aie.xclbin -k MLIR_AIE -i insts.txt -v 1 -M 512 -K 512 -N 512 | FileCheck %s +// RUN: %run_on_npu ./test.exe -x aie.xclbin -k MLIR_AIE -i insts.txt -v 1 -M 512 -K 512 -N 512 | FileCheck %s // CHECK: PASS! diff --git a/programming_examples/basic/passthrough_dmas/CMakeLists.txt b/programming_examples/basic/passthrough_dmas/CMakeLists.txt index 3986c4a075..c17d3d365b 100644 --- a/programming_examples/basic/passthrough_dmas/CMakeLists.txt +++ b/programming_examples/basic/passthrough_dmas/CMakeLists.txt @@ -27,7 +27,7 @@ if (NOT WSL) else() set(BOOST_ROOT C:/Technical/thirdParty/boost_1_83_0 CACHE STRING "Path to Boost install") set(XRT_INC_DIR C:/Technical/XRT/src/runtime_src/core/include CACHE STRING "Path to XRT cloned repo") - set(XRT_LIB_DIR C:/Technical/xrtIPUfromDLL CACHE STRING "Path to xrt_coreutil.lib") + set(XRT_LIB_DIR C:/Technical/xrtNPUfromDLL CACHE STRING "Path to xrt_coreutil.lib") endif() set(TARGET_NAME test CACHE STRING "Target to be built") diff --git a/programming_examples/basic/passthrough_dmas/Makefile b/programming_examples/basic/passthrough_dmas/Makefile index 13a8d42aae..e09c8a91aa 100644 --- a/programming_examples/basic/passthrough_dmas/Makefile +++ b/programming_examples/basic/passthrough_dmas/Makefile @@ -26,13 +26,13 @@ inst/insts.txt: aie2.py rm -rf inst mkdir -p inst python3 $< ${devicename} ${col} ${LENGTH} > inst/aie.mlir - pushd inst && aiecc.py --aie-only-generate-ipu 
--ipu-insts-name=insts.txt aie.mlir && popd + pushd inst && aiecc.py --aie-only-generate-npu --npu-insts-name=insts.txt aie.mlir && popd ${powershell} ./build/${targetname}.exe -x build/final.xclbin -i inst/insts.txt -k MLIR_AIE -l ${LENGTH} build/final.xclbin: build/aie.mlir mkdir -p ${@D} cd ${@D} && aiecc.py --aie-generate-cdo --no-compile-host --xclbin-name=${@F} \ - --aie-generate-ipu --ipu-insts-name=insts.txt $(<:%=../%) + --aie-generate-npu --npu-insts-name=insts.txt $(<:%=../%) ${targetname}.exe: test.cpp rm -rf _build diff --git a/programming_examples/basic/passthrough_dmas/aie2.py b/programming_examples/basic/passthrough_dmas/aie2.py index b59d9332ac..f8dc35a6d9 100755 --- a/programming_examples/basic/passthrough_dmas/aie2.py +++ b/programming_examples/basic/passthrough_dmas/aie2.py @@ -23,8 +23,8 @@ if len(sys.argv) == 4: N = int(sys.argv[1]) -if sys.argv[1] == "ipu": - dev = AIEDevice.ipu +if sys.argv[1] == "npu": + dev = AIEDevice.npu elif sys.argv[1] == "xcvc1902": dev = AIEDevice.xcvc1902 else: @@ -62,9 +62,9 @@ def core_body(): @FuncOp.from_py_func(tensor_ty, tensor_ty, tensor_ty) def sequence(A, B, C): - ipu_dma_memcpy_nd(metadata="out", bd_id=0, mem=C, sizes=[1, 1, 1, N]) - ipu_dma_memcpy_nd(metadata="in", bd_id=1, mem=A, sizes=[1, 1, 1, N]) - ipu_sync(column=0, row=0, direction=0, channel=0) + npu_dma_memcpy_nd(metadata="out", bd_id=0, mem=C, sizes=[1, 1, 1, N]) + npu_dma_memcpy_nd(metadata="in", bd_id=1, mem=A, sizes=[1, 1, 1, N]) + npu_sync(column=0, row=0, direction=0, channel=0) print(ctx.module) diff --git a/programming_examples/basic/passthrough_dmas/run.lit b/programming_examples/basic/passthrough_dmas/run.lit index a4f5d568b6..a466533551 100644 --- a/programming_examples/basic/passthrough_dmas/run.lit +++ b/programming_examples/basic/passthrough_dmas/run.lit @@ -3,8 +3,8 @@ // // REQUIRES: ryzen_ai // -// RUN: %python %S/aie2.py ipu 0 > ./aie.mlir -// RUN: %python aiecc.py --aie-generate-cdo --aie-generate-ipu --no-compile-host 
--xclbin-name=aie.xclbin --ipu-insts-name=insts.txt ./aie.mlir +// RUN: %python %S/aie2.py npu 0 > ./aie.mlir +// RUN: %python aiecc.py --aie-generate-cdo --aie-generate-npu --no-compile-host --xclbin-name=aie.xclbin --npu-insts-name=insts.txt ./aie.mlir // RUN: clang %S/test.cpp -o test.exe -std=c++11 -Wall %xrt_flags -lrt -lstdc++ -lboost_program_options -lboost_filesystem -// RUN: %run_on_ipu ./test.exe -x aie.xclbin -k MLIR_AIE -i insts.txt -l 4096 | FileCheck %s +// RUN: %run_on_npu ./test.exe -x aie.xclbin -k MLIR_AIE -i insts.txt -l 4096 | FileCheck %s // CHECK: PASS! diff --git a/programming_examples/basic/passthrough_kernel/CMakeLists.txt b/programming_examples/basic/passthrough_kernel/CMakeLists.txt index 47375adc84..fddc513396 100644 --- a/programming_examples/basic/passthrough_kernel/CMakeLists.txt +++ b/programming_examples/basic/passthrough_kernel/CMakeLists.txt @@ -22,7 +22,7 @@ if (NOT WSL) else() set(BOOST_ROOT C:/Technical/thirdParty/boost_1_83_0 CACHE STRING "Path to Boost install") set(XRT_INC_DIR C:/Technical/XRT/src/runtime_src/core/include CACHE STRING "Path to XRT cloned repo") - set(XRT_LIB_DIR C:/Technical/xrtIPUfromDLL CACHE STRING "Path to xrt_coreutil.lib") + set(XRT_LIB_DIR C:/Technical/xrtNPUfromDLL CACHE STRING "Path to xrt_coreutil.lib") endif () set(PASSTHROUGH_SIZE 4096 CACHE STRING "size") diff --git a/programming_examples/basic/passthrough_kernel/Makefile b/programming_examples/basic/passthrough_kernel/Makefile index fbfc7580c4..458b992521 100644 --- a/programming_examples/basic/passthrough_kernel/Makefile +++ b/programming_examples/basic/passthrough_kernel/Makefile @@ -28,8 +28,8 @@ build/passThrough.cc.o: passThrough.cc build/final_${PASSTHROUGH_SIZE}.xclbin: build/aie2_lineBased_8b_${PASSTHROUGH_SIZE}.mlir build/passThrough.cc.o mkdir -p ${@D} - cd ${@D} && aiecc.py --aie-generate-cdo --aie-generate-ipu --no-compile-host \ - --xclbin-name=${@F} --ipu-insts-name=insts.txt $(<:%=../%) + cd ${@D} && aiecc.py --aie-generate-cdo 
--aie-generate-npu --no-compile-host \ + --xclbin-name=${@F} --npu-insts-name=insts.txt $(<:%=../%) ${targetname}.exe: test.cpp rm -rf _build diff --git a/programming_examples/basic/passthrough_kernel/aie2.py b/programming_examples/basic/passthrough_kernel/aie2.py index baec4415fa..5b187a7d94 100644 --- a/programming_examples/basic/passthrough_kernel/aie2.py +++ b/programming_examples/basic/passthrough_kernel/aie2.py @@ -29,7 +29,7 @@ def passthroughKernel(): - @device(AIEDevice.ipu) + @device(AIEDevice.npu) def device_body(): # define types memRef_ty = T.memref(lineWidthInBytes, T.ui8()) @@ -87,19 +87,19 @@ def sequence(inTensor, outTensor, notUsed): events=[0x4B, 0x22, 0x21, 0x25, 0x2D, 0x2C, 0x1A, 0x4F], ) - ipu_dma_memcpy_nd( + npu_dma_memcpy_nd( metadata="in", bd_id=0, mem=inTensor, sizes=[1, 1, 1, tensorSizeInInt32s], ) - ipu_dma_memcpy_nd( + npu_dma_memcpy_nd( metadata="out", bd_id=1, mem=outTensor, sizes=[1, 1, 1, tensorSizeInInt32s], ) - ipu_sync(column=0, row=0, direction=0, channel=0) + npu_sync(column=0, row=0, direction=0, channel=0) with mlir_mod_ctx() as ctx: diff --git a/programming_examples/basic/passthrough_kernel/run.lit b/programming_examples/basic/passthrough_kernel/run.lit index 30abe48152..7f1c2318b2 100644 --- a/programming_examples/basic/passthrough_kernel/run.lit +++ b/programming_examples/basic/passthrough_kernel/run.lit @@ -5,8 +5,8 @@ // // RUN: xchesscc_wrapper aie2 -I %aietools/include -DBIT_WIDTH=8 -c %S/../../../aie_kernels/generic/passThrough.cc -o passThrough.cc.o // RUN: %python %S/aie2.py 4096 | aie-opt -cse -canonicalize -o ./aie.mlir -// RUN: %python aiecc.py --xbridge --aie-generate-cdo --aie-generate-ipu --no-compile-host --xclbin-name=aie.xclbin --ipu-insts-name=insts.txt ./aie.mlir +// RUN: %python aiecc.py --xbridge --aie-generate-cdo --aie-generate-npu --no-compile-host --xclbin-name=aie.xclbin --npu-insts-name=insts.txt ./aie.mlir // RUN: g++ %S/test.cpp -o test.exe -std=c++23 -Wall -DPASSTHROUGH_SIZE=4096 
-I%S/../../../runtime_lib/test_lib %S/../../../runtime_lib/test_lib/test_utils.cpp %xrt_flags -lrt -lstdc++ -lboost_program_options -lboost_filesystem -// RUN: %run_on_ipu ./test.exe -x aie.xclbin -k MLIR_AIE -i insts.txt | FileCheck %s -// RUN: %run_on_ipu %python %S/test.py -x aie.xclbin -i insts.txt -k MLIR_AIE -s 4096 | FileCheck %s +// RUN: %run_on_npu ./test.exe -x aie.xclbin -k MLIR_AIE -i insts.txt | FileCheck %s +// RUN: %run_on_npu %python %S/test.py -x aie.xclbin -i insts.txt -k MLIR_AIE -s 4096 | FileCheck %s // CHECK: PASS! diff --git a/programming_examples/basic/vector_add/Makefile b/programming_examples/basic/vector_add/Makefile index 9a1a7a2a56..61133a555b 100755 --- a/programming_examples/basic/vector_add/Makefile +++ b/programming_examples/basic/vector_add/Makefile @@ -13,15 +13,15 @@ ACDC_AIE = $(dir $(shell which aie-opt))/.. SHELL := /bin/bash targetname = vectorAdd -devicename = ipu +devicename = npu col = 0 all: build/final.xclbin build/final.xclbin: build/aie.mlir mkdir -p ${@D} - cd ${@D} && aiecc.py --aie-generate-cdo --aie-generate-ipu --no-compile-host \ - --xclbin-name=${@F} --ipu-insts-name=insts.txt ${Vector Add -Single tile performs a very simple `+` operations from two vectors loaded into memory. The tile then stores the sum of those two vectors back to external memory. This reference design can be run on either a RyzenAI IPU or a VCK5000. +Single tile performs a very simple `+` operations from two vectors loaded into memory. The tile then stores the sum of those two vectors back to external memory. This reference design can be run on either a RyzenAI NPU or a VCK5000. -The kernel executes on AIE tile (`col`, 2). Both input vectors are brought into the tile from Shim tile (`col`, 0). The value of `col` is dependent on whether the application is targetting IPU or VCK5000. The AIE tile performs the summation operations and the Shim tile brings the data back out to external memory. +The kernel executes on AIE tile (`col`, 2). 
Both input vectors are brought into the tile from Shim tile (`col`, 0). The value of `col` is dependent on whether the application is targetting NPU or VCK5000. The AIE tile performs the summation operations and the Shim tile brings the data back out to external memory. -To compile and run the design for IPU: +To compile and run the design for NPU: ``` make make run diff --git a/programming_examples/basic/vector_add/aie2.py b/programming_examples/basic/vector_add/aie2.py index 6f8ad2d5b6..581729e6ec 100755 --- a/programming_examples/basic/vector_add/aie2.py +++ b/programming_examples/basic/vector_add/aie2.py @@ -28,8 +28,8 @@ def my_vector_add(): if len(sys.argv) != 3: raise ValueError("[ERROR] Need 2 command line arguments (Device name, Col)") - if sys.argv[1] == "ipu": - dev = AIEDevice.ipu + if sys.argv[1] == "npu": + dev = AIEDevice.npu elif sys.argv[1] == "xcvc1902": dev = AIEDevice.xcvc1902 else: @@ -79,10 +79,10 @@ def core_body(): @FuncOp.from_py_func(tensor_ty, tensor_ty, tensor_ty) def sequence(A, B, C): - ipu_dma_memcpy_nd(metadata="out", bd_id=0, mem=C, sizes=[1, 1, 1, N]) - ipu_dma_memcpy_nd(metadata="in1", bd_id=1, mem=A, sizes=[1, 1, 1, N]) - ipu_dma_memcpy_nd(metadata="in2", bd_id=2, mem=B, sizes=[1, 1, 1, N]) - ipu_sync(column=0, row=0, direction=0, channel=0) + npu_dma_memcpy_nd(metadata="out", bd_id=0, mem=C, sizes=[1, 1, 1, N]) + npu_dma_memcpy_nd(metadata="in1", bd_id=1, mem=A, sizes=[1, 1, 1, N]) + npu_dma_memcpy_nd(metadata="in2", bd_id=2, mem=B, sizes=[1, 1, 1, N]) + npu_sync(column=0, row=0, direction=0, channel=0) print(ctx.module) diff --git a/programming_examples/basic/vector_add/run.lit b/programming_examples/basic/vector_add/run.lit index a429e99221..1922c01828 100644 --- a/programming_examples/basic/vector_add/run.lit +++ b/programming_examples/basic/vector_add/run.lit @@ -3,9 +3,9 @@ // // REQUIRES: ryzen_ai // -// RUN: %python %S/aie2.py ipu 0 > ./aie.mlir -// RUN: %python aiecc.py --aie-generate-cdo --aie-generate-ipu 
--no-compile-host --xclbin-name=aie.xclbin --ipu-insts-name=insts.txt ./aie.mlir +// RUN: %python %S/aie2.py npu 0 > ./aie.mlir +// RUN: %python aiecc.py --aie-generate-cdo --aie-generate-npu --no-compile-host --xclbin-name=aie.xclbin --npu-insts-name=insts.txt ./aie.mlir // RUN: clang %S/test.cpp -o test.exe -std=c++11 -Wall %xrt_flags -lrt -lstdc++ -lboost_program_options -lboost_filesystem -// RUN: %run_on_ipu ./test.exe -x aie.xclbin -k MLIR_AIE -i insts.txt | FileCheck %s +// RUN: %run_on_npu ./test.exe -x aie.xclbin -k MLIR_AIE -i insts.txt | FileCheck %s // CHECK: PASS! diff --git a/programming_examples/basic/vector_exp/CMakeLists.txt b/programming_examples/basic/vector_exp/CMakeLists.txt index 20452d080e..ee2050a94e 100644 --- a/programming_examples/basic/vector_exp/CMakeLists.txt +++ b/programming_examples/basic/vector_exp/CMakeLists.txt @@ -27,7 +27,7 @@ if (NOT WSL) else() set(BOOST_ROOT C:/Technical/thirdParty/boost_1_83_0 CACHE STRING "Path to Boost install") set(XRT_INC_DIR C:/Technical/XRT/src/runtime_src/core/include CACHE STRING "Path to XRT cloned repo") - set(XRT_LIB_DIR C:/Technical/xrtIPUfromDLL CACHE STRING "Path to xrt_coreutil.lib") + set(XRT_LIB_DIR C:/Technical/xrtNPUfromDLL CACHE STRING "Path to xrt_coreutil.lib") endif() set(TARGET_NAME test CACHE STRING "Target to be built") diff --git a/programming_examples/basic/vector_exp/Makefile b/programming_examples/basic/vector_exp/Makefile index 68205484e0..5b471771ba 100644 --- a/programming_examples/basic/vector_exp/Makefile +++ b/programming_examples/basic/vector_exp/Makefile @@ -32,7 +32,7 @@ build/aie.mlir: aie2.py build/final.xclbin: build/aie.mlir build/kernels.a mkdir -p ${@D} cd ${@D} && aiecc.py --aie-generate-cdo --no-compile-host --xclbin-name=${@F} \ - --aie-generate-ipu --ipu-insts-name=insts.txt $(<:%=../%) + --aie-generate-npu --npu-insts-name=insts.txt $(<:%=../%) ${targetname}.exe: test.cpp rm -rf _build diff --git a/programming_examples/basic/vector_exp/aie2.py 
b/programming_examples/basic/vector_exp/aie2.py index dd6e5e9773..dbf2a3ce2b 100644 --- a/programming_examples/basic/vector_exp/aie2.py +++ b/programming_examples/basic/vector_exp/aie2.py @@ -32,7 +32,7 @@ def my_eltwise_exp(): buffer_depth = 2 # Device declaration - aie2 device NPU (aka Ryzen AI) - @device(AIEDevice.ipu) + @device(AIEDevice.npu) def device_body(): memRef_ty = T.memref(n, T.bf16()) @@ -106,13 +106,13 @@ def core_body(): @FuncOp.from_py_func(tensor_ty, tensor_ty) def sequence(A, C): - ipu_dma_memcpy_nd( + npu_dma_memcpy_nd( metadata="outC", bd_id=0, mem=C, sizes=[1, 1, 1, C_sz_in_i32s] ) - ipu_dma_memcpy_nd( + npu_dma_memcpy_nd( metadata="inA", bd_id=1, mem=A, sizes=[1, 1, 1, A_sz_in_i32s] ) - ipu_sync(column=0, row=0, direction=0, channel=0) + npu_sync(column=0, row=0, direction=0, channel=0) with mlir_mod_ctx() as ctx: diff --git a/programming_examples/basic/vector_exp/run.lit b/programming_examples/basic/vector_exp/run.lit index f2db79ab6a..247ca37a33 100644 --- a/programming_examples/basic/vector_exp/run.lit +++ b/programming_examples/basic/vector_exp/run.lit @@ -6,8 +6,8 @@ // RUN: xchesscc_wrapper aie2 -I %aietools/include -I %S/../../../aie_runtime_lib/AIE2 -c %S/../../../aie_kernels/aie2/bf16_exp.cc -o exp.o // RUN: xchesscc_wrapper aie2 -I %aietools/include -I. 
-c %S/../../../aie_runtime_lib/AIE2/lut_based_ops.cpp -o lut_based_ops.o // RUN: ar rvs kernels.a exp.o lut_based_ops.o -// RUN: %python %S/aie2.py ipu 0 | aie-opt -cse -canonicalize -o ./aie.mlir -// RUN: %python aiecc.py --xbridge --aie-generate-cdo --aie-generate-ipu --no-compile-host --xclbin-name=aie.xclbin --ipu-insts-name=insts.txt ./aie.mlir +// RUN: %python %S/aie2.py npu 0 | aie-opt -cse -canonicalize -o ./aie.mlir +// RUN: %python aiecc.py --xbridge --aie-generate-cdo --aie-generate-npu --no-compile-host --xclbin-name=aie.xclbin --npu-insts-name=insts.txt ./aie.mlir // RUN: g++-13 %S/test.cpp -o test.exe -std=c++23 -Wall -I%S/../../../runtime_lib/test_lib %S/../../../runtime_lib/test_lib/test_utils.cpp %xrt_flags -lrt -lstdc++ -lboost_program_options -lboost_filesystem -// RUN: %run_on_ipu ./test.exe -x aie.xclbin -k MLIR_AIE -i insts.txt | FileCheck %s +// RUN: %run_on_npu ./test.exe -x aie.xclbin -k MLIR_AIE -i insts.txt | FileCheck %s // CHECK: PASS! diff --git a/programming_examples/basic/vector_mult/CMakeLists.txt b/programming_examples/basic/vector_mult/CMakeLists.txt index 20452d080e..ee2050a94e 100644 --- a/programming_examples/basic/vector_mult/CMakeLists.txt +++ b/programming_examples/basic/vector_mult/CMakeLists.txt @@ -27,7 +27,7 @@ if (NOT WSL) else() set(BOOST_ROOT C:/Technical/thirdParty/boost_1_83_0 CACHE STRING "Path to Boost install") set(XRT_INC_DIR C:/Technical/XRT/src/runtime_src/core/include CACHE STRING "Path to XRT cloned repo") - set(XRT_LIB_DIR C:/Technical/xrtIPUfromDLL CACHE STRING "Path to xrt_coreutil.lib") + set(XRT_LIB_DIR C:/Technical/xrtNPUfromDLL CACHE STRING "Path to xrt_coreutil.lib") endif() set(TARGET_NAME test CACHE STRING "Target to be built") diff --git a/programming_examples/basic/vector_mult/Makefile b/programming_examples/basic/vector_mult/Makefile index 330692f4fb..bc07e3d05b 100755 --- a/programming_examples/basic/vector_mult/Makefile +++ b/programming_examples/basic/vector_mult/Makefile @@ -13,15 +13,15 @@ 
ACDC_AIE = $(dir $(shell which aie-opt))/.. SHELL := /bin/bash targetname = vectorMult -devicename = ipu +devicename = npu col = 0 all: build/final.xclbin build/final.xclbin: build/aie.mlir mkdir -p ${@D} - cd ${@D} && aiecc.py --aie-generate-cdo --aie-generate-ipu --no-compile-host \ - --xclbin-name=${@F} --ipu-insts-name=insts.txt ${Vector Multiplication -Single tile performs a very simple `*` operations from two vectors loaded into memory. The tile then stores the element wise multiplication of those two vectors back to external memory. This reference design can be run on either a RyzenAI IPU or a VCK5000. +Single tile performs a very simple `*` operations from two vectors loaded into memory. The tile then stores the element wise multiplication of those two vectors back to external memory. This reference design can be run on either a RyzenAI NPU or a VCK5000. -The kernel executes on AIE tile (`col`, 2). Both input vectors are brought into the tile from Shim tile (`col`, 0). The value of `col` is dependent on whether the application is targetting IPU or VCK5000. The AIE tile performs the multiplication operations and the Shim tile brings the data back out to external memory. +The kernel executes on AIE tile (`col`, 2). Both input vectors are brought into the tile from Shim tile (`col`, 0). The value of `col` is dependent on whether the application is targetting NPU or VCK5000. The AIE tile performs the multiplication operations and the Shim tile brings the data back out to external memory. 
-To compile and run the design for IPU: +To compile and run the design for NPU: ``` make make run diff --git a/programming_examples/basic/vector_mult/aie2.py b/programming_examples/basic/vector_mult/aie2.py index 5a36f85a33..209f5243bb 100755 --- a/programming_examples/basic/vector_mult/aie2.py +++ b/programming_examples/basic/vector_mult/aie2.py @@ -28,8 +28,8 @@ def my_vector_add(): if len(sys.argv) != 3: raise ValueError("[ERROR] Need 2 command line arguments (Device name, Col)") - if sys.argv[1] == "ipu": - dev = AIEDevice.ipu + if sys.argv[1] == "npu": + dev = AIEDevice.npu elif sys.argv[1] == "xcvc1902": dev = AIEDevice.xcvc1902 else: @@ -79,10 +79,10 @@ def core_body(): @FuncOp.from_py_func(tensor_ty, tensor_ty, tensor_ty) def sequence(A, B, C): - ipu_dma_memcpy_nd(metadata="out", bd_id=0, mem=C, sizes=[1, 1, 1, N]) - ipu_dma_memcpy_nd(metadata="in1", bd_id=1, mem=A, sizes=[1, 1, 1, N]) - ipu_dma_memcpy_nd(metadata="in2", bd_id=2, mem=B, sizes=[1, 1, 1, N]) - ipu_sync(column=0, row=0, direction=0, channel=0) + npu_dma_memcpy_nd(metadata="out", bd_id=0, mem=C, sizes=[1, 1, 1, N]) + npu_dma_memcpy_nd(metadata="in1", bd_id=1, mem=A, sizes=[1, 1, 1, N]) + npu_dma_memcpy_nd(metadata="in2", bd_id=2, mem=B, sizes=[1, 1, 1, N]) + npu_sync(column=0, row=0, direction=0, channel=0) print(ctx.module) diff --git a/programming_examples/basic/vector_mult/run.lit b/programming_examples/basic/vector_mult/run.lit index a429e99221..1922c01828 100644 --- a/programming_examples/basic/vector_mult/run.lit +++ b/programming_examples/basic/vector_mult/run.lit @@ -3,9 +3,9 @@ // // REQUIRES: ryzen_ai // -// RUN: %python %S/aie2.py ipu 0 > ./aie.mlir -// RUN: %python aiecc.py --aie-generate-cdo --aie-generate-ipu --no-compile-host --xclbin-name=aie.xclbin --ipu-insts-name=insts.txt ./aie.mlir +// RUN: %python %S/aie2.py npu 0 > ./aie.mlir +// RUN: %python aiecc.py --aie-generate-cdo --aie-generate-npu --no-compile-host --xclbin-name=aie.xclbin --npu-insts-name=insts.txt ./aie.mlir // 
RUN: clang %S/test.cpp -o test.exe -std=c++11 -Wall %xrt_flags -lrt -lstdc++ -lboost_program_options -lboost_filesystem -// RUN: %run_on_ipu ./test.exe -x aie.xclbin -k MLIR_AIE -i insts.txt | FileCheck %s +// RUN: %run_on_npu ./test.exe -x aie.xclbin -k MLIR_AIE -i insts.txt | FileCheck %s // CHECK: PASS! diff --git a/programming_examples/basic/vector_reduce_add/CMakeLists.txt b/programming_examples/basic/vector_reduce_add/CMakeLists.txt index 9ae325a430..024b4cfd54 100644 --- a/programming_examples/basic/vector_reduce_add/CMakeLists.txt +++ b/programming_examples/basic/vector_reduce_add/CMakeLists.txt @@ -22,7 +22,7 @@ if (NOT WSL) else() set(BOOST_ROOT C:/Technical/thirdParty/boost_1_83_0 CACHE STRING "Path to Boost install") set(XRT_INC_DIR C:/Technical/XRT/src/runtime_src/core/include CACHE STRING "Path to XRT cloned repo") - set(XRT_LIB_DIR C:/Technical/xrtIPUfromDLL CACHE STRING "Path to xrt_coreutil.lib") + set(XRT_LIB_DIR C:/Technical/xrtNPUfromDLL CACHE STRING "Path to xrt_coreutil.lib") endif() set(TARGET_NAME test CACHE STRING "Target to be built") diff --git a/programming_examples/basic/vector_reduce_add/Makefile b/programming_examples/basic/vector_reduce_add/Makefile index 37ca25abec..b0f8eebe0c 100644 --- a/programming_examples/basic/vector_reduce_add/Makefile +++ b/programming_examples/basic/vector_reduce_add/Makefile @@ -11,7 +11,7 @@ include ../../makefile-common ACDC_AIE = $(dir $(shell which aie-opt))/.. 
targetname = vector_max -devicename = ipu +devicename = npu col = 0 CHESS_FLAGS=${CHESSCCWRAP2_FLAGS} KERNEL_LIB=${ACDC_AIE}/../../aie_kernels/aie2/ @@ -29,7 +29,7 @@ build/aie.mlir: aie2.py build/final.xclbin: build/aie.mlir build/i32_add_reduce.o mkdir -p ${@D} cd ${@D} && aiecc.py --aie-generate-cdo --no-compile-host --xclbin-name=${@F} \ - --aie-generate-ipu --ipu-insts-name=insts.txt $(<:%=../%) + --aie-generate-npu --npu-insts-name=insts.txt $(<:%=../%) ${targetname}.exe: test.cpp rm -rf _build diff --git a/programming_examples/basic/vector_reduce_add/aie2.py b/programming_examples/basic/vector_reduce_add/aie2.py index fe035bfc96..cf5d5691e9 100644 --- a/programming_examples/basic/vector_reduce_add/aie2.py +++ b/programming_examples/basic/vector_reduce_add/aie2.py @@ -24,8 +24,8 @@ def my_reduce_add(): if len(sys.argv) != 3: raise ValueError("[ERROR] Need 2 command line arguments (Device name, Col)") - if sys.argv[1] == "ipu": - dev = AIEDevice.ipu + if sys.argv[1] == "npu": + dev = AIEDevice.npu elif sys.argv[1] == "xcvc1902": dev = AIEDevice.xcvc1902 else: @@ -67,9 +67,9 @@ def core_body(): @FuncOp.from_py_func(tensor_ty, tensor_ty) def sequence(A, C): - ipu_dma_memcpy_nd(metadata="out", bd_id=0, mem=C, sizes=[1, 1, 1, 1]) - ipu_dma_memcpy_nd(metadata="in", bd_id=1, mem=A, sizes=[1, 1, 1, N]) - ipu_sync(column=0, row=0, direction=0, channel=0) + npu_dma_memcpy_nd(metadata="out", bd_id=0, mem=C, sizes=[1, 1, 1, 1]) + npu_dma_memcpy_nd(metadata="in", bd_id=1, mem=A, sizes=[1, 1, 1, N]) + npu_sync(column=0, row=0, direction=0, channel=0) with mlir_mod_ctx() as ctx: diff --git a/programming_examples/basic/vector_reduce_add/run.lit b/programming_examples/basic/vector_reduce_add/run.lit index 1ebe2c8741..f35b24884f 100644 --- a/programming_examples/basic/vector_reduce_add/run.lit +++ b/programming_examples/basic/vector_reduce_add/run.lit @@ -4,8 +4,8 @@ // REQUIRES: ryzen_ai, chess // // RUN: xchesscc_wrapper aie2 -I %aietools/include -c 
%S/../../../aie_kernels/aie2/reduce_add.cc -o reduce_add.cc.o -// RUN: %python %S/aie2.py ipu 0 | aie-opt -cse -canonicalize -o ./aie.mlir -// RUN: %python aiecc.py --aie-generate-cdo --aie-generate-ipu --no-compile-host --xclbin-name=aie.xclbin --ipu-insts-name=insts.txt ./aie.mlir +// RUN: %python %S/aie2.py npu 0 | aie-opt -cse -canonicalize -o ./aie.mlir +// RUN: %python aiecc.py --aie-generate-cdo --aie-generate-npu --no-compile-host --xclbin-name=aie.xclbin --npu-insts-name=insts.txt ./aie.mlir // RUN: clang %S/test.cpp -o test.exe -std=c++11 -Wall -I%S/../../../runtime_lib/test_lib %S/../../../runtime_lib/test_lib/test_utils.cpp %xrt_flags -lrt -lstdc++ -lboost_program_options -lboost_filesystem -// RUN: %run_on_ipu ./test.exe -x aie.xclbin -k MLIR_AIE -i insts.txt | FileCheck %s +// RUN: %run_on_npu ./test.exe -x aie.xclbin -k MLIR_AIE -i insts.txt | FileCheck %s // CHECK: PASS! diff --git a/programming_examples/basic/vector_reduce_max/CMakeLists.txt b/programming_examples/basic/vector_reduce_max/CMakeLists.txt index 9ae325a430..024b4cfd54 100644 --- a/programming_examples/basic/vector_reduce_max/CMakeLists.txt +++ b/programming_examples/basic/vector_reduce_max/CMakeLists.txt @@ -22,7 +22,7 @@ if (NOT WSL) else() set(BOOST_ROOT C:/Technical/thirdParty/boost_1_83_0 CACHE STRING "Path to Boost install") set(XRT_INC_DIR C:/Technical/XRT/src/runtime_src/core/include CACHE STRING "Path to XRT cloned repo") - set(XRT_LIB_DIR C:/Technical/xrtIPUfromDLL CACHE STRING "Path to xrt_coreutil.lib") + set(XRT_LIB_DIR C:/Technical/xrtNPUfromDLL CACHE STRING "Path to xrt_coreutil.lib") endif() set(TARGET_NAME test CACHE STRING "Target to be built") diff --git a/programming_examples/basic/vector_reduce_max/Makefile b/programming_examples/basic/vector_reduce_max/Makefile index 55a013704d..5e47d478b2 100755 --- a/programming_examples/basic/vector_reduce_max/Makefile +++ b/programming_examples/basic/vector_reduce_max/Makefile @@ -11,7 +11,7 @@ include ../../makefile-common 
ACDC_AIE = $(dir $(shell which aie-opt))/.. targetname = reduce_max -devicename = ipu +devicename = npu col = 0 CHESS_FLAGS=${CHESSCCWRAP2_FLAGS} KERNEL_LIB=../../../aie_kernels/aie2 @@ -29,7 +29,7 @@ build/aie.mlir: aie2.py build/final.xclbin: build/aie.mlir build/reduce_max.cc.o mkdir -p ${@D} cd ${@D} && aiecc.py --aie-generate-cdo --no-compile-host --xclbin-name=${@F} \ - --aie-generate-ipu --ipu-insts-name=insts.txt $(<:%=../%) + --aie-generate-npu --npu-insts-name=insts.txt $(<:%=../%) ${targetname}.exe: test.cpp rm -rf _build diff --git a/programming_examples/basic/vector_reduce_max/aie2.py b/programming_examples/basic/vector_reduce_max/aie2.py index c081cf7659..bf9013e1b9 100755 --- a/programming_examples/basic/vector_reduce_max/aie2.py +++ b/programming_examples/basic/vector_reduce_max/aie2.py @@ -24,8 +24,8 @@ def my_reduce_max(): if len(sys.argv) != 3: raise ValueError("[ERROR] Need 2 command line arguments (Device name, Col)") - if sys.argv[1] == "ipu": - dev = AIEDevice.ipu + if sys.argv[1] == "npu": + dev = AIEDevice.npu elif sys.argv[1] == "xcvc1902": dev = AIEDevice.xcvc1902 else: @@ -67,9 +67,9 @@ def core_body(): @FuncOp.from_py_func(tensor_ty, tensor_ty) def sequence(A, C): - ipu_dma_memcpy_nd(metadata="out", bd_id=0, mem=C, sizes=[1, 1, 1, 1]) - ipu_dma_memcpy_nd(metadata="in", bd_id=1, mem=A, sizes=[1, 1, 1, N]) - ipu_sync(column=0, row=0, direction=0, channel=0) + npu_dma_memcpy_nd(metadata="out", bd_id=0, mem=C, sizes=[1, 1, 1, 1]) + npu_dma_memcpy_nd(metadata="in", bd_id=1, mem=A, sizes=[1, 1, 1, N]) + npu_sync(column=0, row=0, direction=0, channel=0) with mlir_mod_ctx() as ctx: diff --git a/programming_examples/basic/vector_reduce_max/run.lit b/programming_examples/basic/vector_reduce_max/run.lit index 6c3233183c..584d7c1628 100644 --- a/programming_examples/basic/vector_reduce_max/run.lit +++ b/programming_examples/basic/vector_reduce_max/run.lit @@ -4,8 +4,8 @@ // REQUIRES: ryzen_ai, chess // // RUN: xchesscc_wrapper aie2 -I 
%aietools/include -c %S/../../../aie_kernels/aie2/reduce_max.cc -o reduce_max.cc.o -// RUN: %python %S/aie2.py ipu 0 | aie-opt -cse -canonicalize -o ./aie.mlir -// RUN: %python aiecc.py --aie-generate-cdo --aie-generate-ipu --no-compile-host --xclbin-name=aie.xclbin --ipu-insts-name=insts.txt ./aie.mlir +// RUN: %python %S/aie2.py npu 0 | aie-opt -cse -canonicalize -o ./aie.mlir +// RUN: %python aiecc.py --aie-generate-cdo --aie-generate-npu --no-compile-host --xclbin-name=aie.xclbin --npu-insts-name=insts.txt ./aie.mlir // RUN: g++ %S/test.cpp -o test.exe -std=c++23 -Wall -I%S/../../../runtime_lib/test_lib %S/../../../runtime_lib/test_lib/test_utils.cpp %xrt_flags -lrt -lstdc++ -lboost_program_options -lboost_filesystem -// RUN: %run_on_ipu ./test.exe -x aie.xclbin -k MLIR_AIE -i insts.txt | FileCheck %s +// RUN: %run_on_npu ./test.exe -x aie.xclbin -k MLIR_AIE -i insts.txt | FileCheck %s // CHECK: PASS! diff --git a/programming_examples/basic/vector_reduce_min/CMakeLists.txt b/programming_examples/basic/vector_reduce_min/CMakeLists.txt index 76d48dfe36..820bc8059d 100644 --- a/programming_examples/basic/vector_reduce_min/CMakeLists.txt +++ b/programming_examples/basic/vector_reduce_min/CMakeLists.txt @@ -22,7 +22,7 @@ if (NOT WSL) else() set(BOOST_ROOT C:/Technical/thirdParty/boost_1_83_0 CACHE STRING "Path to Boost install") set(XRT_INC_DIR C:/Technical/XRT/src/runtime_src/core/include CACHE STRING "Path to XRT cloned repo") - set(XRT_LIB_DIR C:/Technical/xrtIPUfromDLL CACHE STRING "Path to xrt_coreutil.lib") + set(XRT_LIB_DIR C:/Technical/xrtNPUfromDLL CACHE STRING "Path to xrt_coreutil.lib") endif() set(TARGET_NAME test CACHE STRING "Target to be built") diff --git a/programming_examples/basic/vector_reduce_min/Makefile b/programming_examples/basic/vector_reduce_min/Makefile index 177213e22a..b4321855e1 100755 --- a/programming_examples/basic/vector_reduce_min/Makefile +++ b/programming_examples/basic/vector_reduce_min/Makefile @@ -11,7 +11,7 @@ include 
../../makefile-common ACDC_AIE = $(dir $(shell which aie-opt))/.. targetname = reduce_min -devicename = ipu +devicename = npu col = 0 CHESS_FLAGS=${CHESSCCWRAP2_FLAGS} KERNEL_LIB=../../../aie_kernels/aie2 @@ -29,7 +29,7 @@ build/aie.mlir: aie2.py build/final.xclbin: build/aie.mlir build/reduce_min.cc.o mkdir -p ${@D} cd ${@D} && aiecc.py --aie-generate-cdo --no-compile-host --xclbin-name=${@F} \ - --aie-generate-ipu --ipu-insts-name=insts.txt $(<:%=../%) + --aie-generate-npu --npu-insts-name=insts.txt $(<:%=../%) ${targetname}.exe: test.cpp rm -rf _build diff --git a/programming_examples/basic/vector_reduce_min/aie2.py b/programming_examples/basic/vector_reduce_min/aie2.py index a8ef279a13..0b391f17a0 100755 --- a/programming_examples/basic/vector_reduce_min/aie2.py +++ b/programming_examples/basic/vector_reduce_min/aie2.py @@ -24,8 +24,8 @@ def my_reduce_min(): if len(sys.argv) != 3: raise ValueError("[ERROR] Need 2 command line arguments (Device name, Col)") - if sys.argv[1] == "ipu": - dev = AIEDevice.ipu + if sys.argv[1] == "npu": + dev = AIEDevice.npu elif sys.argv[1] == "xcvc1902": dev = AIEDevice.xcvc1902 else: @@ -67,9 +67,9 @@ def core_body(): @FuncOp.from_py_func(tensor_ty, tensor_ty) def sequence(A, C): - ipu_dma_memcpy_nd(metadata="out", bd_id=0, mem=C, sizes=[1, 1, 1, 1]) - ipu_dma_memcpy_nd(metadata="in", bd_id=1, mem=A, sizes=[1, 1, 1, N]) - ipu_sync(column=0, row=0, direction=0, channel=0) + npu_dma_memcpy_nd(metadata="out", bd_id=0, mem=C, sizes=[1, 1, 1, 1]) + npu_dma_memcpy_nd(metadata="in", bd_id=1, mem=A, sizes=[1, 1, 1, N]) + npu_sync(column=0, row=0, direction=0, channel=0) with mlir_mod_ctx() as ctx: diff --git a/programming_examples/basic/vector_reduce_min/run.lit b/programming_examples/basic/vector_reduce_min/run.lit index 95ecbd533a..710a9a02cd 100644 --- a/programming_examples/basic/vector_reduce_min/run.lit +++ b/programming_examples/basic/vector_reduce_min/run.lit @@ -4,8 +4,8 @@ // REQUIRES: ryzen_ai, chess // // RUN: xchesscc_wrapper 
aie2 -I %aietools/include -c %S/../../../aie_kernels/aie2/reduce_min.cc -o reduce_min.cc.o -// RUN: %python %S/aie2.py ipu 0 | aie-opt -cse -canonicalize -o ./aie.mlir -// RUN: %python aiecc.py --aie-generate-cdo --aie-generate-ipu --no-compile-host --xclbin-name=aie.xclbin --ipu-insts-name=insts.txt ./aie.mlir +// RUN: %python %S/aie2.py npu 0 | aie-opt -cse -canonicalize -o ./aie.mlir +// RUN: %python aiecc.py --aie-generate-cdo --aie-generate-npu --no-compile-host --xclbin-name=aie.xclbin --npu-insts-name=insts.txt ./aie.mlir // RUN: g++ %S/test.cpp -o test.exe -std=c++23 -Wall -I%S/../../../runtime_lib/test_lib %S/../../../runtime_lib/test_lib/test_utils.cpp %xrt_flags -lrt -lstdc++ -lboost_program_options -lboost_filesystem -// RUN: %run_on_ipu ./test.exe -x aie.xclbin -k MLIR_AIE -i insts.txt | FileCheck %s +// RUN: %run_on_npu ./test.exe -x aie.xclbin -k MLIR_AIE -i insts.txt | FileCheck %s // CHECK: PASS! diff --git a/programming_examples/basic/vector_scalar_add/CMakeLists.txt b/programming_examples/basic/vector_scalar_add/CMakeLists.txt index c4ca0825d4..20f5d8a4a3 100644 --- a/programming_examples/basic/vector_scalar_add/CMakeLists.txt +++ b/programming_examples/basic/vector_scalar_add/CMakeLists.txt @@ -27,7 +27,7 @@ if (NOT WSL) else() set(BOOST_ROOT C:/Technical/thirdParty/boost_1_83_0 CACHE STRING "Path to Boost install") set(XRT_INC_DIR C:/Technical/XRT/src/runtime_src/core/include CACHE STRING "Path to XRT cloned repo") - set(XRT_LIB_DIR C:/Technical/xrtIPUfromDLL CACHE STRING "Path to xrt_coreutil.lib") + set(XRT_LIB_DIR C:/Technical/xrtNPUfromDLL CACHE STRING "Path to xrt_coreutil.lib") endif() set(TARGET_NAME test CACHE STRING "Target to be built") diff --git a/programming_examples/basic/vector_scalar_add/Makefile b/programming_examples/basic/vector_scalar_add/Makefile index 4ad8553675..463b63532b 100644 --- a/programming_examples/basic/vector_scalar_add/Makefile +++ b/programming_examples/basic/vector_scalar_add/Makefile @@ -18,8 +18,8 @@ 
build/aie.mlir: aie2.py build/final.xclbin: build/aie.mlir mkdir -p ${@D} - cd ${@D} && aiecc.py --aie-generate-cdo --aie-generate-ipu --no-compile-host \ - --xclbin-name=${@F} --ipu-insts-name=insts.txt ${ ./aie.mlir -// RUN: %python aiecc.py --aie-generate-cdo --aie-generate-ipu --no-compile-host --xclbin-name=aie.xclbin --ipu-insts-name=insts.txt ./aie.mlir +// RUN: %python aiecc.py --aie-generate-cdo --aie-generate-npu --no-compile-host --xclbin-name=aie.xclbin --npu-insts-name=insts.txt ./aie.mlir // RUN: clang %S/test.cpp -o test.exe -std=c++11 -Wall %xrt_flags -lrt -lstdc++ -lboost_program_options -lboost_filesystem -// RUN: %run_on_ipu ./test.exe -x aie.xclbin -k MLIR_AIE -i insts.txt | FileCheck %s +// RUN: %run_on_npu ./test.exe -x aie.xclbin -k MLIR_AIE -i insts.txt | FileCheck %s // CHECK: PASS! diff --git a/programming_examples/basic/vector_scalar_mul/CMakeLists.txt b/programming_examples/basic/vector_scalar_mul/CMakeLists.txt index 4d1000b813..e7b0f3d539 100644 --- a/programming_examples/basic/vector_scalar_mul/CMakeLists.txt +++ b/programming_examples/basic/vector_scalar_mul/CMakeLists.txt @@ -27,7 +27,7 @@ if (NOT WSL) else() set(BOOST_ROOT C:/Technical/thirdParty/boost_1_83_0 CACHE STRING "Path to Boost install") set(XRT_INC_DIR C:/Technical/XRT/src/runtime_src/core/include CACHE STRING "Path to XRT cloned repo") - set(XRT_LIB_DIR C:/Technical/xrtIPUfromDLL CACHE STRING "Path to xrt_coreutil.lib") + set(XRT_LIB_DIR C:/Technical/xrtNPUfromDLL CACHE STRING "Path to xrt_coreutil.lib") endif() set(VECTORSCALARMUL_SIZE 4096 CACHE STRING "vector size") diff --git a/programming_examples/basic/vector_scalar_mul/Makefile b/programming_examples/basic/vector_scalar_mul/Makefile index 0d686b3068..8af81834f7 100755 --- a/programming_examples/basic/vector_scalar_mul/Makefile +++ b/programming_examples/basic/vector_scalar_mul/Makefile @@ -36,12 +36,12 @@ build/aie_trace_${data_size}.mlir: aie2.py build/final_${data_size}.xclbin: build/aie_${data_size}.mlir 
build/scale.o mkdir -p ${@D} cd ${@D} && aiecc.py --aie-generate-cdo --no-compile-host --xclbin-name=${@F} \ - --aie-generate-ipu --ipu-insts-name=insts_${data_size}.txt $(<:%=../%) + --aie-generate-npu --npu-insts-name=insts_${data_size}.txt $(<:%=../%) build/final_trace_${data_size}.xclbin: build/aie_trace.mlir build/scale.o mkdir -p ${@D} cd ${@D} && aiecc.py --aie-generate-cdo --no-compile-host --xclbin-name=${@F} \ - --aie-generate-ipu --ipu-insts-name=insts_${data_size}.txt $(<:%=../%) + --aie-generate-npu --npu-insts-name=insts_${data_size}.txt $(<:%=../%) ${targetname}_${data_size}.exe: test.cpp rm -rf _build diff --git a/programming_examples/basic/vector_scalar_mul/aie2.py b/programming_examples/basic/vector_scalar_mul/aie2.py index d6ca3d0813..1f49b8d5a2 100755 --- a/programming_examples/basic/vector_scalar_mul/aie2.py +++ b/programming_examples/basic/vector_scalar_mul/aie2.py @@ -25,7 +25,7 @@ def my_vector_scalar(vector_size, trace_size): vectorized = True - @device(AIEDevice.ipu) + @device(AIEDevice.npu) def device_body(): memRef_ty = T.memref(n, T.i32()) memRef_ty2 = T.memref(1, T.i32()) @@ -92,10 +92,10 @@ def sequence(A, F, C): size=trace_size, offset=N_in_bytes, ) - ipu_dma_memcpy_nd(metadata="out", bd_id=0, mem=C, sizes=[1, 1, 1, N]) - ipu_dma_memcpy_nd(metadata="in", bd_id=1, mem=A, sizes=[1, 1, 1, N]) - ipu_dma_memcpy_nd(metadata="infactor", bd_id=2, mem=F, sizes=[1, 1, 1, 1]) - ipu_sync(column=0, row=0, direction=0, channel=0) + npu_dma_memcpy_nd(metadata="out", bd_id=0, mem=C, sizes=[1, 1, 1, N]) + npu_dma_memcpy_nd(metadata="in", bd_id=1, mem=A, sizes=[1, 1, 1, N]) + npu_dma_memcpy_nd(metadata="infactor", bd_id=2, mem=F, sizes=[1, 1, 1, 1]) + npu_sync(column=0, row=0, direction=0, channel=0) try: diff --git a/programming_examples/basic/vector_scalar_mul/run.lit b/programming_examples/basic/vector_scalar_mul/run.lit index e599a22c04..fd55fdb97d 100644 --- a/programming_examples/basic/vector_scalar_mul/run.lit +++ 
b/programming_examples/basic/vector_scalar_mul/run.lit @@ -5,8 +5,8 @@ // // RUN: xchesscc_wrapper aie2 -I %aietools/include -c %S/../../../aie_kernels/aie2/scale.cc -o ./scale.o // RUN: %python %S/aie2.py 4096 0 > ./aie.mlir -// RUN: %python aiecc.py --xbridge --aie-generate-cdo --aie-generate-ipu --no-compile-host --xclbin-name=aie.xclbin --ipu-insts-name=insts.txt ./aie.mlir +// RUN: %python aiecc.py --xbridge --aie-generate-cdo --aie-generate-npu --no-compile-host --xclbin-name=aie.xclbin --npu-insts-name=insts.txt ./aie.mlir // RUN: g++ %S/test.cpp -o test.exe -std=c++23 -Wall -DVECTORSCALARMUL_SIZE=4096 -I%S/../../../runtime_lib/test_lib %S/../../../runtime_lib/test_lib/test_utils.cpp %xrt_flags -lrt -lstdc++ -lboost_program_options -lboost_filesystem -// RUN: %run_on_ipu ./test.exe -x aie.xclbin -k MLIR_AIE -i insts.txt | FileCheck %s -// RUN: %run_on_ipu %python %S/test.py -x aie.xclbin -i insts.txt -k MLIR_AIE -s 4096 | FileCheck %s +// RUN: %run_on_npu ./test.exe -x aie.xclbin -k MLIR_AIE -i insts.txt | FileCheck %s +// RUN: %run_on_npu %python %S/test.py -x aie.xclbin -i insts.txt -k MLIR_AIE -s 4096 | FileCheck %s // CHECK: PASS! 
diff --git a/programming_examples/basic/vector_sum/CMakeLists.txt b/programming_examples/basic/vector_sum/CMakeLists.txt index f253b14fb0..5e637b4d7d 100644 --- a/programming_examples/basic/vector_sum/CMakeLists.txt +++ b/programming_examples/basic/vector_sum/CMakeLists.txt @@ -27,7 +27,7 @@ if (NOT WSL) else() set(BOOST_ROOT C:/Technical/thirdParty/boost_1_83_0 CACHE STRING "Path to Boost install") set(XRT_INC_DIR C:/Technical/XRT/src/runtime_src/core/include CACHE STRING "Path to XRT cloned repo") - set(XRT_LIB_DIR C:/Technical/xrtIPUfromDLL CACHE STRING "Path to xrt_coreutil.lib") + set(XRT_LIB_DIR C:/Technical/xrtNPUfromDLL CACHE STRING "Path to xrt_coreutil.lib") endif() set(TARGET_NAME test CACHE STRING "Target to be built") diff --git a/programming_examples/basic/vector_sum/Makefile b/programming_examples/basic/vector_sum/Makefile index 8c0372f191..e9c2016543 100755 --- a/programming_examples/basic/vector_sum/Makefile +++ b/programming_examples/basic/vector_sum/Makefile @@ -13,15 +13,15 @@ ACDC_AIE = $(dir $(shell which aie-opt))/.. SHELL := /bin/bash targetname = vectorSum -devicename = ipu +devicename = npu col = 0 all: build/final.xclbin build/final.xclbin: build/aie.mlir mkdir -p ${@D} - cd ${@D} && aiecc.py --aie-generate-cdo --aie-generate-ipu --no-compile-host \ - --xclbin-name=${@F} --ipu-insts-name=insts.txt ${Vector sum -Single tile traverses through a vector in memory and returns the sum of each value in the vector. The tile that performs the operation is tile (`col`, 2) and the data is read from and written to external memory through Shim tile (`col`, 0). A buffer in tile (`col`, 2) is used to store the temporary maximum value during processing, which is then pushed through an object FIFO to the Shim tile when processing is complete. This reference design can be run on either a RyzenAI IPU or a VCK5000. The value of `col` is dependent on whether the application is targetting IPU or VCK5000. 
+Single tile traverses through a vector in memory and returns the sum of each value in the vector. The tile that performs the operation is tile (`col`, 2) and the data is read from and written to external memory through Shim tile (`col`, 0). A buffer in tile (`col`, 2) is used to store the temporary maximum value during processing, which is then pushed through an object FIFO to the Shim tile when processing is complete. This reference design can be run on either a RyzenAI NPU or a VCK5000. The value of `col` is dependent on whether the application is targeting NPU or VCK5000. -To compile and run the design for IPU: +To compile and run the design for NPU: ``` make make run diff --git a/programming_examples/basic/vector_sum/aie2.py b/programming_examples/basic/vector_sum/aie2.py index 4e40b8009c..8073833962 100755 --- a/programming_examples/basic/vector_sum/aie2.py +++ b/programming_examples/basic/vector_sum/aie2.py @@ -26,8 +26,8 @@ def my_vector_sum(): if len(sys.argv) != 3: raise ValueError("[ERROR] Need 2 command line arguments (Device name, Col)") - if sys.argv[1] == "ipu": - dev = AIEDevice.ipu + if sys.argv[1] == "npu": + dev = AIEDevice.npu elif sys.argv[1] == "xcvc1902": dev = AIEDevice.xcvc1902 else: @@ -77,9 +77,9 @@ def core_body(): @FuncOp.from_py_func(tensor_ty, tensor_ty, tensor_ty) def sequence(A, B, C): - ipu_dma_memcpy_nd(metadata="out", bd_id=0, mem=C, sizes=[1, 1, 1, 1]) - ipu_dma_memcpy_nd(metadata="in", bd_id=1, mem=A, sizes=[1, 1, 1, N]) - ipu_sync(column=0, row=0, direction=0, channel=0) + npu_dma_memcpy_nd(metadata="out", bd_id=0, mem=C, sizes=[1, 1, 1, 1]) + npu_dma_memcpy_nd(metadata="in", bd_id=1, mem=A, sizes=[1, 1, 1, N]) + npu_sync(column=0, row=0, direction=0, channel=0) print(ctx.module) diff --git a/programming_examples/basic/vector_sum/run.lit b/programming_examples/basic/vector_sum/run.lit index a429e99221..1922c01828 100644 --- a/programming_examples/basic/vector_sum/run.lit +++ b/programming_examples/basic/vector_sum/run.lit @@ 
-3,9 +3,9 @@ // // REQUIRES: ryzen_ai // -// RUN: %python %S/aie2.py ipu 0 > ./aie.mlir -// RUN: %python aiecc.py --aie-generate-cdo --aie-generate-ipu --no-compile-host --xclbin-name=aie.xclbin --ipu-insts-name=insts.txt ./aie.mlir +// RUN: %python %S/aie2.py npu 0 > ./aie.mlir +// RUN: %python aiecc.py --aie-generate-cdo --aie-generate-npu --no-compile-host --xclbin-name=aie.xclbin --npu-insts-name=insts.txt ./aie.mlir // RUN: clang %S/test.cpp -o test.exe -std=c++11 -Wall %xrt_flags -lrt -lstdc++ -lboost_program_options -lboost_filesystem -// RUN: %run_on_ipu ./test.exe -x aie.xclbin -k MLIR_AIE -i insts.txt | FileCheck %s +// RUN: %run_on_npu ./test.exe -x aie.xclbin -k MLIR_AIE -i insts.txt | FileCheck %s // CHECK: PASS! diff --git a/programming_examples/lit.cfg.py b/programming_examples/lit.cfg.py index a03d2c7338..06db63f291 100755 --- a/programming_examples/lit.cfg.py +++ b/programming_examples/lit.cfg.py @@ -45,7 +45,7 @@ # for python llvm_config.with_environment("PYTHONPATH", os.path.join(config.aie_obj_root, "python")) -run_on_ipu = "echo" +run_on_npu = "echo" xrt_flags = "" # Not using run_on_board anymore, need more specific per-platform commands @@ -137,8 +137,8 @@ aie_model = m.group(2) print("\tmodel:", aie_model) config.available_features.add("ryzen_ai") - run_on_ipu = ( - f"flock /tmp/ipu.lock {config.aie_src_root}/utils/run_on_ipu.sh" + run_on_npu = ( + f"flock /tmp/npu.lock {config.aie_src_root}/utils/run_on_npu.sh" ) except: print("Failed to run xbutil") @@ -146,7 +146,7 @@ else: print("xrt not found") -config.substitutions.append(("%run_on_ipu", run_on_ipu)) +config.substitutions.append(("%run_on_npu", run_on_npu)) config.substitutions.append(("%xrt_flags", xrt_flags)) config.substitutions.append(("%XRT_DIR", config.xrt_dir)) config.environment["XRT_HACK_UNSECURE_LOADING_XCLBIN"] = "1" diff --git a/programming_examples/makefile-common b/programming_examples/makefile-common index 5ab55c2e08..b5007535b8 100644 --- 
a/programming_examples/makefile-common +++ b/programming_examples/makefile-common @@ -1,4 +1,4 @@ -# Contains common definitions used across the Makefiles of ipu-xrt tests. +# Contains common definitions used across the Makefiles of npu-xrt tests. REPO_ROOT ?= $(shell realpath $(dir $(shell which aie-opt))/../../..) INSTALL_ROOT ?= $(shell realpath $(dir $(shell which aie-opt))/..) diff --git a/programming_examples/ml/bottleneck/CMakeLists.txt b/programming_examples/ml/bottleneck/CMakeLists.txt index 4b897cb29c..c7db0e9c5c 100644 --- a/programming_examples/ml/bottleneck/CMakeLists.txt +++ b/programming_examples/ml/bottleneck/CMakeLists.txt @@ -25,7 +25,7 @@ else() set(BOOST_ROOT C:/Technical/thirdParty/boost_1_83_0 CACHE STRING "Path to Boost install") set(OpenCV_DIR C:/Technical/thirdParty/opencv/build CACHE STRING "Path to OpenCV install") set(XRT_INC_DIR C:/Technical/XRT/src/runtime_src/core/include CACHE STRING "Path to XRT cloned repo") - set(XRT_LIB_DIR C:/Technical/xrtIPUfromDLL CACHE STRING "Path to xrt_coreutil.lib") + set(XRT_LIB_DIR C:/Technical/xrtNPUfromDLL CACHE STRING "Path to xrt_coreutil.lib") endif () set(EDGEDETECT_WIDTH 1920 CACHE STRING "image width") diff --git a/programming_examples/ml/bottleneck/Makefile b/programming_examples/ml/bottleneck/Makefile index 47ca6a78f7..0133e02c7a 100755 --- a/programming_examples/ml/bottleneck/Makefile +++ b/programming_examples/ml/bottleneck/Makefile @@ -16,7 +16,7 @@ build/${mlirFileName}.mlir: aie2.py python3 $< > $@ insts.txt: build/${mlirFileName}.mlir - aiecc.py -v --aie-only-generate-ipu --ipu-insts-name=$@ $< + aiecc.py -v --aie-only-generate-npu --npu-insts-name=$@ $< build/conv2dk1.o: ../../../aie_kernels/aie2/conv2dk1.cc xchesscc -d ${CHESSCC2_FLAGS} -DINT8_ACT -c $< -o $@ @@ -28,8 +28,8 @@ build/conv2dk1_skip.o: ../../../aie_kernels/aie2/conv2dk1_skip.cc xchesscc -d ${CHESSCC2_FLAGS} -DINT8_ACT -c $< -o $@ build/final.xclbin: build/${mlirFileName}.mlir - cd build && aiecc.py -v --aie-generate-cdo 
--aie-generate-ipu --no-compile-host \ - --xclbin-name=${@F} --ipu-insts-name=insts.txt $(<:%=../%) + cd build && aiecc.py -v --aie-generate-cdo --aie-generate-npu --no-compile-host \ + --xclbin-name=${@F} --npu-insts-name=insts.txt $(<:%=../%) clean: rm -rf build *.elf* *.lst *.bif ${mlirFileName}.mlir.prj log .xclbin sim \ diff --git a/programming_examples/ml/bottleneck/aie2.py b/programming_examples/ml/bottleneck/aie2.py index a488ae8ded..ac349259f4 100644 --- a/programming_examples/ml/bottleneck/aie2.py +++ b/programming_examples/ml/bottleneck/aie2.py @@ -38,7 +38,7 @@ def bottleneck4AIEs(): with mlir_mod_ctx() as ctx: - @device(AIEDevice.ipu) + @device(AIEDevice.npu) def deviceBody(): # define types @@ -543,9 +543,9 @@ def sequence(inputFromL3, weightsFromL3, outputToL3): # EVENTS_CORE_PORT_RUNNING_0 (0x4B) # Trace_Event0 (4 slots) - ipu_write32(0, 4, 0x340E0, 0x4B222125) + npu_write32(0, 4, 0x340E0, 0x4B222125) # Trace_Event1 (4 slots) - ipu_write32(0, 4, 0x340E4, 0x2D2C1A4F) + npu_write32(0, 4, 0x340E4, 0x2D2C1A4F) # Event slots as configured above: # 0: Kernel executes vector instruction @@ -559,13 +559,13 @@ def sequence(inputFromL3, weightsFromL3, outputToL3): # Stream_Switch_Event_Port_Selection_0 # This is necessary to capture the Port_Running_0 and Port_Running_1 events - ipu_write32(0, 4, 0x3FF00, 0x121) + npu_write32(0, 4, 0x3FF00, 0x121) # Trace_Control0: Define trace start and stop triggers. Set start event TRUE. - ipu_write32(0, 4, 0x340D0, 0x10000) + npu_write32(0, 4, 0x340D0, 0x10000) # Start trace copy out. 
- ipu_writebd_shimtile( + npu_writebd_shimtile( bd_id=3, buffer_length=trace_sz_in_i32s, buffer_offset=acitivationsOutSize32b, @@ -593,45 +593,45 @@ def sequence(inputFromL3, weightsFromL3, outputToL3): use_next_bd=0, valid_bd=1, ) - ipu_write32(0, 2, 0x1D20C, 0x3) + npu_write32(0, 2, 0x1D20C, 0x3) # write RTP parameters - IpuWriteRTPOp( + NpuWriteRTPOp( "rtpComputeTile2", col=0, row=2, index=0, value=1 ) # scale - IpuWriteRTPOp( + NpuWriteRTPOp( "rtpComputeTile3", col=0, row=3, index=0, value=1 ) # scale - IpuWriteRTPOp( + NpuWriteRTPOp( "rtpComputeTile5", col=0, row=5, index=0, value=1 ) # scale - IpuWriteRTPOp( + NpuWriteRTPOp( "rtpComputeTile4", col=0, row=4, index=0, value=1 ) # scale: conv1x1 with the same scale as the input so we match the scaling factor of output after conv1x1 and the initial input - IpuWriteRTPOp( + NpuWriteRTPOp( "rtpComputeTile4", col=0, row=4, index=1, value=0 ) # skip_scale - ipu_dma_memcpy_nd( + npu_dma_memcpy_nd( metadata="inOF_act_L3L2", bd_id=0, mem=inputFromL3, sizes=[1, 1, 1, activationsInSize32b], ) - ipu_dma_memcpy_nd( + npu_dma_memcpy_nd( metadata="outOFL2L3", bd_id=2, mem=outputToL3, sizes=[1, 1, 1, acitivationsOutSize32b], ) - ipu_dma_memcpy_nd( + npu_dma_memcpy_nd( metadata="inOF_wts_0_L3L2", bd_id=1, mem=weightsFromL3, sizes=[1, 1, 1, totalWeightsSize32b], ) - ipu_sync(column=0, row=0, direction=0, channel=0) + npu_sync(column=0, row=0, direction=0, channel=0) print(ctx.module) diff --git a/programming_examples/ml/bottleneck/run.lit b/programming_examples/ml/bottleneck/run.lit index 8a6024d66e..7d6f2abcc4 100644 --- a/programming_examples/ml/bottleneck/run.lit +++ b/programming_examples/ml/bottleneck/run.lit @@ -7,6 +7,6 @@ // RUN: xchesscc_wrapper aie2 -I %aietools/include -DBIT_WIDTH=8 -DUINT8_ACT -c %S/../../../aie_kernels/aie2/conv2dk3.cc -o conv2dk3.o // RUN: xchesscc_wrapper aie2 -I %aietools/include -DBIT_WIDTH=8 -DINT8_ACT -c %S/../../../aie_kernels/aie2/conv2dk1_skip.cc -o conv2dk1_skip.o // RUN: %python 
%S/aie2.py | aie-opt -cse -canonicalize -o ./aie.mlir -// RUN: %python aiecc.py --xbridge --aie-generate-cdo --aie-generate-ipu --no-compile-host --xclbin-name=aie.xclbin --ipu-insts-name=insts.txt ./aie.mlir -// RUN: %run_on_ipu %python %S/test.py -x aie.xclbin -i insts.txt -k MLIR_AIE | FileCheck %s +// RUN: %python aiecc.py --xbridge --aie-generate-cdo --aie-generate-npu --no-compile-host --xclbin-name=aie.xclbin --npu-insts-name=insts.txt ./aie.mlir +// RUN: %run_on_npu %python %S/test.py -x aie.xclbin -i insts.txt -k MLIR_AIE | FileCheck %s // CHECK: PASS! \ No newline at end of file diff --git a/programming_examples/ml/conv2d/CMakeLists.txt b/programming_examples/ml/conv2d/CMakeLists.txt index 4b897cb29c..c7db0e9c5c 100644 --- a/programming_examples/ml/conv2d/CMakeLists.txt +++ b/programming_examples/ml/conv2d/CMakeLists.txt @@ -25,7 +25,7 @@ else() set(BOOST_ROOT C:/Technical/thirdParty/boost_1_83_0 CACHE STRING "Path to Boost install") set(OpenCV_DIR C:/Technical/thirdParty/opencv/build CACHE STRING "Path to OpenCV install") set(XRT_INC_DIR C:/Technical/XRT/src/runtime_src/core/include CACHE STRING "Path to XRT cloned repo") - set(XRT_LIB_DIR C:/Technical/xrtIPUfromDLL CACHE STRING "Path to xrt_coreutil.lib") + set(XRT_LIB_DIR C:/Technical/xrtNPUfromDLL CACHE STRING "Path to xrt_coreutil.lib") endif () set(EDGEDETECT_WIDTH 1920 CACHE STRING "image width") diff --git a/programming_examples/ml/conv2d/Makefile b/programming_examples/ml/conv2d/Makefile index 0f4c925ed3..c5f3576a1b 100755 --- a/programming_examples/ml/conv2d/Makefile +++ b/programming_examples/ml/conv2d/Makefile @@ -18,14 +18,14 @@ build/${mlirFileName}.mlir: aie2.py insts.txt: build/${mlirFileName}.mlir - aiecc.py -v --aie-only-generate-ipu --ipu-insts-name=$@ $< + aiecc.py -v --aie-only-generate-npu --npu-insts-name=$@ $< build/conv2dk1_i8.o: ../../../aie_kernels/aie2/conv2dk1_i8.cc xchesscc -d ${CHESSCC2_FLAGS} -DINT8_ACT -c $< -o $@ build/final.xclbin: build/${mlirFileName}.mlir - cd build 
&& aiecc.py -v --aie-generate-cdo --aie-generate-ipu --no-compile-host \ - --xclbin-name=${@F} --ipu-insts-name=insts.txt $(<:%=../%) + cd build && aiecc.py -v --aie-generate-cdo --aie-generate-npu --no-compile-host \ + --xclbin-name=${@F} --npu-insts-name=insts.txt $(<:%=../%) clean: rm -rf build *.elf* *.lst *.bif ${mlirFileName}.mlir.prj log* *.xclbin sim \ diff --git a/programming_examples/ml/conv2d/aie2.py b/programming_examples/ml/conv2d/aie2.py index 74a2c38838..82584170cf 100644 --- a/programming_examples/ml/conv2d/aie2.py +++ b/programming_examples/ml/conv2d/aie2.py @@ -42,7 +42,7 @@ def conv2dk1(): with mlir_mod_ctx() as ctx: - @device(AIEDevice.ipu) + @device(AIEDevice.npu) def device_body(): actIn_ty = T.memref(actIn, T.i8()) @@ -162,14 +162,14 @@ def sequence(I, W, O): # BB <- Event to start trace capture # C <- Trace mode, 00=event=time, 01=event-PC, 10=execution # Configure so that "Event 1" (always true) causes tracing to start - ipu_write32( + npu_write32( column=compute_tile2_col, row=compute_tile2_row, address=0x340D0, value=0x00010000, ) # 0x340D4: Trace Control 1 - ipu_write32( + npu_write32( column=compute_tile2_col, row=compute_tile2_row, address=0x340D4, @@ -177,7 +177,7 @@ def sequence(I, W, O): ) # 0x340E0: Trace Event Group 1 (Which events to trace) # 0xAABBCCDD AA, BB, CC, DD <- four event slots - ipu_write32( + npu_write32( column=compute_tile2_col, row=compute_tile2_row, address=0x340E0, @@ -185,14 +185,14 @@ def sequence(I, W, O): ) # 0x340E4: Trace Event Group 2 (Which events to trace) # 0xAABBCCDD AA, BB, CC, DD <- four event slots - ipu_write32( + npu_write32( column=compute_tile2_col, row=compute_tile2_row, address=0x340E4, value=0x2D2C1A4F, ) - ipu_write32( + npu_write32( column=compute_tile2_col, row=compute_tile2_row, address=0x3FF00, @@ -203,7 +203,7 @@ def sequence(I, W, O): # out to host DDR memory trace_bd_id = 13 # use BD 13 for writing trace output from compute tile to DDR host memory output_size = bufOut - 
ipu_writebd_shimtile( + npu_writebd_shimtile( bd_id=trace_bd_id, buffer_length=trace_size, buffer_offset=output_size, @@ -232,29 +232,29 @@ def sequence(I, W, O): valid_bd=1, ) # Set start BD to our shim bd_Id (3) - ipu_write32(column=0, row=0, address=0x1D20C, value=trace_bd_id) + npu_write32(column=0, row=0, address=0x1D20C, value=trace_bd_id) - IpuWriteRTPOp("rtp2", col=0, row=2, index=0, value=10) + NpuWriteRTPOp("rtp2", col=0, row=2, index=0, value=10) - ipu_dma_memcpy_nd( + npu_dma_memcpy_nd( metadata="inOF_act_L3L2", bd_id=0, mem=I, sizes=[1, 1, 1, tensorSizeInInt32s], ) - ipu_dma_memcpy_nd( + npu_dma_memcpy_nd( metadata="outOFL2L3", bd_id=2, mem=O, sizes=[1, 1, 1, tensorSizeInInt32s], ) - ipu_dma_memcpy_nd( + npu_dma_memcpy_nd( metadata="inOF_wts_0_L3L2", bd_id=2, mem=W, sizes=[1, 1, 1, weightsInInt32s], ) - ipu_sync(column=0, row=0, direction=0, channel=0) + npu_sync(column=0, row=0, direction=0, channel=0) # print(ctx.module.operation.verify()) print(ctx.module) diff --git a/programming_examples/ml/conv2d/run.lit b/programming_examples/ml/conv2d/run.lit index 349e45f9bc..59c3c8b031 100644 --- a/programming_examples/ml/conv2d/run.lit +++ b/programming_examples/ml/conv2d/run.lit @@ -5,6 +5,6 @@ // // RUN: xchesscc_wrapper aie2 -I %aietools/include -DBIT_WIDTH=8 -DINT8_ACT -c %S/../../../aie_kernels/aie2/conv2dk1_i8.cc -o conv2dk1_i8.o // RUN: %python %S/aie2.py | aie-opt -cse -canonicalize -o ./aie.mlir -// RUN: %python aiecc.py --xbridge --aie-generate-cdo --aie-generate-ipu --no-compile-host --xclbin-name=aie.xclbin --ipu-insts-name=insts.txt ./aie.mlir -// RUN: %run_on_ipu %python %S/test.py -x aie.xclbin -i insts.txt -k MLIR_AIE | FileCheck %s +// RUN: %python aiecc.py --xbridge --aie-generate-cdo --aie-generate-npu --no-compile-host --xclbin-name=aie.xclbin --npu-insts-name=insts.txt ./aie.mlir +// RUN: %run_on_npu %python %S/test.py -x aie.xclbin -i insts.txt -k MLIR_AIE | FileCheck %s // CHECK: PASS! 
\ No newline at end of file diff --git a/programming_examples/ml/conv2d_fused_relu/CMakeLists.txt b/programming_examples/ml/conv2d_fused_relu/CMakeLists.txt index 4b897cb29c..c7db0e9c5c 100644 --- a/programming_examples/ml/conv2d_fused_relu/CMakeLists.txt +++ b/programming_examples/ml/conv2d_fused_relu/CMakeLists.txt @@ -25,7 +25,7 @@ else() set(BOOST_ROOT C:/Technical/thirdParty/boost_1_83_0 CACHE STRING "Path to Boost install") set(OpenCV_DIR C:/Technical/thirdParty/opencv/build CACHE STRING "Path to OpenCV install") set(XRT_INC_DIR C:/Technical/XRT/src/runtime_src/core/include CACHE STRING "Path to XRT cloned repo") - set(XRT_LIB_DIR C:/Technical/xrtIPUfromDLL CACHE STRING "Path to xrt_coreutil.lib") + set(XRT_LIB_DIR C:/Technical/xrtNPUfromDLL CACHE STRING "Path to xrt_coreutil.lib") endif () set(EDGEDETECT_WIDTH 1920 CACHE STRING "image width") diff --git a/programming_examples/ml/conv2d_fused_relu/Makefile b/programming_examples/ml/conv2d_fused_relu/Makefile index 5911238a7a..0ee3a81d07 100755 --- a/programming_examples/ml/conv2d_fused_relu/Makefile +++ b/programming_examples/ml/conv2d_fused_relu/Makefile @@ -17,14 +17,14 @@ build/${mlirFileName}.mlir: aie2.py insts.txt: build/${mlirFileName}.mlir - aiecc.py -v --aie-only-generate-ipu --ipu-insts-name=$@ $< + aiecc.py -v --aie-only-generate-npu --npu-insts-name=$@ $< build/conv2dk1.o: ../../../aie_kernels/aie2/conv2dk1.cc xchesscc -d ${CHESSCC2_FLAGS} -DINT8_ACT -c $< -o $@ build/final.xclbin: build/${mlirFileName}.mlir - cd build && aiecc.py -v --aie-generate-cdo --aie-generate-ipu --no-compile-host \ - --xclbin-name=${@F} --ipu-insts-name=insts.txt $(<:%=../%) + cd build && aiecc.py -v --aie-generate-cdo --aie-generate-npu --no-compile-host \ + --xclbin-name=${@F} --npu-insts-name=insts.txt $(<:%=../%) clean: rm -rf build *.elf* *.lst *.bif ${mlirFileName}.mlir.prj log* *.xclbin sim \ diff --git a/programming_examples/ml/conv2d_fused_relu/aie2.py b/programming_examples/ml/conv2d_fused_relu/aie2.py index 
be0167e3b4..13a59f0934 100644 --- a/programming_examples/ml/conv2d_fused_relu/aie2.py +++ b/programming_examples/ml/conv2d_fused_relu/aie2.py @@ -42,7 +42,7 @@ def conv2dk1(): with mlir_mod_ctx() as ctx: - @device(AIEDevice.ipu) + @device(AIEDevice.npu) def device_body(): actIn_ty = T.memref(actIn, T.i8()) @@ -162,14 +162,14 @@ def sequence(I, W, O): # BB <- Event to start trace capture # C <- Trace mode, 00=event=time, 01=event-PC, 10=execution # Configure so that "Event 1" (always true) causes tracing to start - ipu_write32( + npu_write32( column=compute_tile2_col, row=compute_tile2_row, address=0x340D0, value=0x00010000, ) # 0x340D4: Trace Control 1 - ipu_write32( + npu_write32( column=compute_tile2_col, row=compute_tile2_row, address=0x340D4, @@ -177,7 +177,7 @@ def sequence(I, W, O): ) # 0x340E0: Trace Event Group 1 (Which events to trace) # 0xAABBCCDD AA, BB, CC, DD <- four event slots - ipu_write32( + npu_write32( column=compute_tile2_col, row=compute_tile2_row, address=0x340E0, @@ -185,14 +185,14 @@ def sequence(I, W, O): ) # 0x340E4: Trace Event Group 2 (Which events to trace) # 0xAABBCCDD AA, BB, CC, DD <- four event slots - ipu_write32( + npu_write32( column=compute_tile2_col, row=compute_tile2_row, address=0x340E4, value=0x2D2C1A4F, ) - ipu_write32( + npu_write32( column=compute_tile2_col, row=compute_tile2_row, address=0x3FF00, @@ -203,7 +203,7 @@ def sequence(I, W, O): # out to host DDR memory trace_bd_id = 13 # use BD 13 for writing trace output from compute tile to DDR host memory output_size = bufOut - ipu_writebd_shimtile( + npu_writebd_shimtile( bd_id=trace_bd_id, buffer_length=trace_size, buffer_offset=output_size, @@ -232,29 +232,29 @@ def sequence(I, W, O): valid_bd=1, ) # Set start BD to our shim bd_Id (3) - ipu_write32(column=0, row=0, address=0x1D20C, value=trace_bd_id) + npu_write32(column=0, row=0, address=0x1D20C, value=trace_bd_id) - IpuWriteRTPOp("rtp2", col=0, row=2, index=0, value=1) + NpuWriteRTPOp("rtp2", col=0, row=2, index=0, 
value=1) - ipu_dma_memcpy_nd( + npu_dma_memcpy_nd( metadata="inOF_act_L3L2", bd_id=0, mem=I, sizes=[1, 1, 1, tensorSizeInInt32s], ) - ipu_dma_memcpy_nd( + npu_dma_memcpy_nd( metadata="outOFL2L3", bd_id=2, mem=O, sizes=[1, 1, 1, tensorSizeInInt32s], ) - ipu_dma_memcpy_nd( + npu_dma_memcpy_nd( metadata="inOF_wts_0_L3L2", bd_id=2, mem=W, sizes=[1, 1, 1, weightsInInt32s], ) - ipu_sync(column=0, row=0, direction=0, channel=0) + npu_sync(column=0, row=0, direction=0, channel=0) # print(ctx.module.operation.verify()) print(ctx.module) diff --git a/programming_examples/ml/conv2d_fused_relu/run.lit b/programming_examples/ml/conv2d_fused_relu/run.lit index cfddde9013..07d9b0b5b7 100644 --- a/programming_examples/ml/conv2d_fused_relu/run.lit +++ b/programming_examples/ml/conv2d_fused_relu/run.lit @@ -5,6 +5,6 @@ // // RUN: xchesscc_wrapper aie2 -I %aietools/include -DINT8_ACT -DBIT_WIDTH=8 -c %S/../../../aie_kernels/aie2/conv2dk1.cc -o conv2dk1.o // RUN: %python %S/aie2.py | aie-opt -cse -canonicalize -o ./aie.mlir -// RUN: %python aiecc.py --xbridge --aie-generate-cdo --aie-generate-ipu --no-compile-host --xclbin-name=aie.xclbin --ipu-insts-name=insts.txt ./aie.mlir -// RUN: %run_on_ipu %python %S/test.py -x aie.xclbin -i insts.txt -k MLIR_AIE | FileCheck %s +// RUN: %python aiecc.py --xbridge --aie-generate-cdo --aie-generate-npu --no-compile-host --xclbin-name=aie.xclbin --npu-insts-name=insts.txt ./aie.mlir +// RUN: %run_on_npu %python %S/test.py -x aie.xclbin -i insts.txt -k MLIR_AIE | FileCheck %s // CHECK: PASS! 
\ No newline at end of file diff --git a/programming_examples/ml/eltwise_add/CMakeLists.txt b/programming_examples/ml/eltwise_add/CMakeLists.txt index c4ca0825d4..20f5d8a4a3 100644 --- a/programming_examples/ml/eltwise_add/CMakeLists.txt +++ b/programming_examples/ml/eltwise_add/CMakeLists.txt @@ -27,7 +27,7 @@ if (NOT WSL) else() set(BOOST_ROOT C:/Technical/thirdParty/boost_1_83_0 CACHE STRING "Path to Boost install") set(XRT_INC_DIR C:/Technical/XRT/src/runtime_src/core/include CACHE STRING "Path to XRT cloned repo") - set(XRT_LIB_DIR C:/Technical/xrtIPUfromDLL CACHE STRING "Path to xrt_coreutil.lib") + set(XRT_LIB_DIR C:/Technical/xrtNPUfromDLL CACHE STRING "Path to xrt_coreutil.lib") endif() set(TARGET_NAME test CACHE STRING "Target to be built") diff --git a/programming_examples/ml/eltwise_add/Makefile b/programming_examples/ml/eltwise_add/Makefile index e0bb8ecc2c..702bd770ff 100644 --- a/programming_examples/ml/eltwise_add/Makefile +++ b/programming_examples/ml/eltwise_add/Makefile @@ -34,13 +34,13 @@ build/aie_trace.mlir: aie2.py build/final.xclbin: build/aie.mlir build/add.o mkdir -p ${@D} - cd ${@D} && aiecc.py --aie-generate-cdo --aie-generate-ipu --no-compile-host \ - --xclbin-name=${@F} --ipu-insts-name=insts.txt ${ $@ insts.txt: build/${mlirFileName}.mlir - aiecc.py -v --aie-only-generate-ipu --ipu-insts-name=$@ $< + aiecc.py -v --aie-only-generate-npu --npu-insts-name=$@ $< build/conv2dk1_i8.o: ../../../../aie_kernels/aie2/conv2dk1.cc xchesscc -d ${CHESSCC2_FLAGS} -DINT8_ACT -c $< -o $@ @@ -35,8 +35,8 @@ build/conv2dk1_skip.o: ../../../../aie_kernels/aie2/conv2dk1_skip.cc xchesscc -d ${CHESSCC2_FLAGS} -DUINT8_ACT -c $< -o $@ build/final.xclbin: build/${mlirFileName}.mlir - cd build && aiecc.py -v --aie-generate-cdo --aie-generate-ipu --no-compile-host \ - --xclbin-name=${@F} --ipu-insts-name=insts.txt $(<:%=../%) + cd build && aiecc.py -v --aie-generate-cdo --aie-generate-npu --no-compile-host \ + --xclbin-name=${@F} --npu-insts-name=insts.txt 
$(<:%=../%) clean: rm -rf build *.elf* *.lst *.bif log* ${mlirFileName}.mlir.prj *.xclbin sim \ diff --git a/programming_examples/ml/resnet/layers_conv2_x/aie.mlir b/programming_examples/ml/resnet/layers_conv2_x/aie.mlir index ccc04efb9a..103cbbbcbe 100755 --- a/programming_examples/ml/resnet/layers_conv2_x/aie.mlir +++ b/programming_examples/ml/resnet/layers_conv2_x/aie.mlir @@ -6,7 +6,7 @@ //===----------------------------------------------------------------------===// module { -aie.device(ipu) { +aie.device(npu) { //shim %tile00 = aie.tile(0, 0) @@ -909,9 +909,9 @@ aie.device(ipu) { // Trace_Event0 (4 slots) - aiex.ipu.write32 { column = 2 : i32, row = 4 : i32, address = 0x340E0 : ui32, value = 0x4B222125 : ui32 } + aiex.npu.write32 { column = 2 : i32, row = 4 : i32, address = 0x340E0 : ui32, value = 0x4B222125 : ui32 } // Trace_Event1 (4 slots) - aiex.ipu.write32 { column = 2 : i32, row = 4 : i32, address = 0x340E4 : ui32, value = 0x2D2C1A4F : ui32 } + aiex.npu.write32 { column = 2 : i32, row = 4 : i32, address = 0x340E4 : ui32, value = 0x2D2C1A4F : ui32 } // Event slots as configured above: // 0: Kernel executes vector instruction @@ -925,13 +925,13 @@ aie.device(ipu) { // Stream_Switch_Event_Port_Selection_0 // This is necessary to capture the Port_Running_0 and Port_Running_1 events - aiex.ipu.write32 { column = 2 : i32, row = 4 : i32, address = 0x3FF00 : ui32, value = 0x121 : ui32 } + aiex.npu.write32 { column = 2 : i32, row = 4 : i32, address = 0x3FF00 : ui32, value = 0x121 : ui32 } // Trace_Control0: Define trace start and stop triggers. Set start event TRUE. - aiex.ipu.write32 { column = 2 : i32, row = 4 : i32, address = 0x340D0 : ui32, value = 0x10000 : ui32 } + aiex.npu.write32 { column = 2 : i32, row = 4 : i32, address = 0x340D0 : ui32, value = 0x10000 : ui32 } // Start trace copy out. 
- aiex.ipu.writebd_shimtile { bd_id = 3 : i32, + aiex.npu.writebd_shimtile { bd_id = 3 : i32, buffer_length = 16384 : i32, buffer_offset = 262144 : i32, enable_packet = 0 : i32, @@ -965,30 +965,30 @@ aie.device(ipu) { next_bd = 0 : i32, use_next_bd = 0 : i32, valid_bd = 1 : i32} - aiex.ipu.write32 { column = 0 : i32, row = 0 : i32, address = 0x1D20C : ui32, value = 0x3 : ui32 } + aiex.npu.write32 { column = 0 : i32, row = 0 : i32, address = 0x1D20C : ui32, value = 0x3 : ui32 } //End trace dump - aiex.ipu.rtp_write(0, 2, 0, 1) { buffer_sym_name = "rtp2" } - aiex.ipu.rtp_write(0, 3, 0, 1) { buffer_sym_name = "rtp3" } - aiex.ipu.rtp_write(0, 5, 0, 1) { buffer_sym_name = "rtp4" } - aiex.ipu.rtp_write(0, 4, 0, 1) { buffer_sym_name = "rtp5" } - aiex.ipu.rtp_write(0, 4, 1, 0) { buffer_sym_name = "rtp5" } - aiex.ipu.rtp_write(0, 4, 2, 1) { buffer_sym_name = "rtp5" } + aiex.npu.rtp_write(0, 2, 0, 1) { buffer_sym_name = "rtp2" } + aiex.npu.rtp_write(0, 3, 0, 1) { buffer_sym_name = "rtp3" } + aiex.npu.rtp_write(0, 5, 0, 1) { buffer_sym_name = "rtp4" } + aiex.npu.rtp_write(0, 4, 0, 1) { buffer_sym_name = "rtp5" } + aiex.npu.rtp_write(0, 4, 1, 0) { buffer_sym_name = "rtp5" } + aiex.npu.rtp_write(0, 4, 2, 1) { buffer_sym_name = "rtp5" } - aiex.ipu.rtp_write(1, 5, 0, 1) { buffer_sym_name = "rtp15" } - aiex.ipu.rtp_write(1, 4, 0, 1) { buffer_sym_name = "rtp14" } - aiex.ipu.rtp_write(1, 2, 0, 1) { buffer_sym_name = "rtp12" } - aiex.ipu.rtp_write(1, 3, 0, 1) { buffer_sym_name = "rtp13" } - aiex.ipu.rtp_write(1, 3, 1, 0) { buffer_sym_name = "rtp13" } + aiex.npu.rtp_write(1, 5, 0, 1) { buffer_sym_name = "rtp15" } + aiex.npu.rtp_write(1, 4, 0, 1) { buffer_sym_name = "rtp14" } + aiex.npu.rtp_write(1, 2, 0, 1) { buffer_sym_name = "rtp12" } + aiex.npu.rtp_write(1, 3, 0, 1) { buffer_sym_name = "rtp13" } + aiex.npu.rtp_write(1, 3, 1, 0) { buffer_sym_name = "rtp13" } - aiex.ipu.rtp_write(2, 2, 0, 1) { buffer_sym_name = "rtp22" } - aiex.ipu.rtp_write(2, 3, 0, 1) { buffer_sym_name = "rtp23" } 
- aiex.ipu.rtp_write(2, 5, 0, 1) { buffer_sym_name = "rtp25" } - aiex.ipu.rtp_write(2, 4, 0, 1) { buffer_sym_name = "rtp24" } - aiex.ipu.rtp_write(2, 4, 1, 0) { buffer_sym_name = "rtp24" } + aiex.npu.rtp_write(2, 2, 0, 1) { buffer_sym_name = "rtp22" } + aiex.npu.rtp_write(2, 3, 0, 1) { buffer_sym_name = "rtp23" } + aiex.npu.rtp_write(2, 5, 0, 1) { buffer_sym_name = "rtp25" } + aiex.npu.rtp_write(2, 4, 0, 1) { buffer_sym_name = "rtp24" } + aiex.npu.rtp_write(2, 4, 1, 0) { buffer_sym_name = "rtp24" } %c0 = arith.constant 0 : i32 %c1 = arith.constant 1 : i32 @@ -1000,13 +1000,13 @@ aie.device(ipu) { %total_wts_3_off = arith.constant 35840 : i64 //dma_memcpy_nd ([offset in 32b words][length in 32b words][stride in 32b words]) - aiex.ipu.dma_memcpy_nd(0, 0, %in0[0, 0, 0, 0][1, 1, 1, %act_in][0, 0, 0]) {id = 0 : i64, metadata = @inOF_act_L3L2} : memref<16384xi32> - aiex.ipu.dma_memcpy_nd(0, 0, %out[0, 0, 0, 0][1, 1, 1, %act_out][0, 0, 0]) {id = 2 : i64, metadata = @outOFL2L3} : memref<65536xi32> - aiex.ipu.dma_memcpy_nd(0, 0, %wts0[0, 0, 0, 0][1, 1, 1, %total_wts][0, 0, 0]) {id = 1 : i64, metadata = @inOF_wts_0_L3L2} : memref<53248xi32> - aiex.ipu.dma_memcpy_nd(0, 0, %wts0[0, 0, 0, %total_wts][1, 1, 1, %total_wts_2][0, 0, 0]) {id = 1 : i64, metadata = @inOF_wts_1_L3L2} : memref<53248xi32> - aiex.ipu.dma_memcpy_nd(0, 0, %wts0[0, 0, 0, %total_wts_3_off][1, 1, 1, %total_wts_3][0, 0, 0]) {id = 1 : i64, metadata = @inOF_wts_2_L3L2} : memref<53248xi32> + aiex.npu.dma_memcpy_nd(0, 0, %in0[0, 0, 0, 0][1, 1, 1, %act_in][0, 0, 0]) {id = 0 : i64, metadata = @inOF_act_L3L2} : memref<16384xi32> + aiex.npu.dma_memcpy_nd(0, 0, %out[0, 0, 0, 0][1, 1, 1, %act_out][0, 0, 0]) {id = 2 : i64, metadata = @outOFL2L3} : memref<65536xi32> + aiex.npu.dma_memcpy_nd(0, 0, %wts0[0, 0, 0, 0][1, 1, 1, %total_wts][0, 0, 0]) {id = 1 : i64, metadata = @inOF_wts_0_L3L2} : memref<53248xi32> + aiex.npu.dma_memcpy_nd(0, 0, %wts0[0, 0, 0, %total_wts][1, 1, 1, %total_wts_2][0, 0, 0]) {id = 1 : i64, metadata = 
@inOF_wts_1_L3L2} : memref<53248xi32> + aiex.npu.dma_memcpy_nd(0, 0, %wts0[0, 0, 0, %total_wts_3_off][1, 1, 1, %total_wts_3][0, 0, 0]) {id = 1 : i64, metadata = @inOF_wts_2_L3L2} : memref<53248xi32> - aiex.ipu.sync {channel = 0 : i32, column = 1 : i32, column_num = 1 : i32, direction = 0 : i32, row = 0 : i32, row_num = 1 : i32} + aiex.npu.sync {channel = 0 : i32, column = 1 : i32, column_num = 1 : i32, direction = 0 : i32, row = 0 : i32, row_num = 1 : i32} return } diff --git a/programming_examples/ml/resnet/layers_conv2_x/aie2.py b/programming_examples/ml/resnet/layers_conv2_x/aie2.py index 46fa10030f..ea14ca60e2 100755 --- a/programming_examples/ml/resnet/layers_conv2_x/aie2.py +++ b/programming_examples/ml/resnet/layers_conv2_x/aie2.py @@ -34,7 +34,7 @@ def resnet_conv_x(): with mlir_mod_ctx() as ctx: - @device(AIEDevice.ipu) + @device(AIEDevice.npu) def deviceBody(): # define types @@ -836,52 +836,52 @@ def sequence(inputFromL3, weightsFromL3, outputToL3): for c, col in enumerate(rtp_name): for r, row in enumerate(col): - IpuWriteRTPOp(row, col=c, row=r + 2, index=0, value=1) # scale + npuWriteRTPOp(row, col=c, row=r + 2, index=0, value=1) # scale - IpuWriteRTPOp("rtpComputeTile04", col=0, row=4, index=0, value=0) - IpuWriteRTPOp("rtpComputeTile04", col=0, row=4, index=0, value=1) + npuWriteRTPOp("rtpComputeTile04", col=0, row=4, index=0, value=0) + npuWriteRTPOp("rtpComputeTile04", col=0, row=4, index=0, value=1) - IpuWriteRTPOp("rtpComputeTile13", col=1, row=3, index=0, value=0) + npuWriteRTPOp("rtpComputeTile13", col=1, row=3, index=0, value=0) - IpuWriteRTPOp("rtpComputeTile24", col=2, row=4, index=0, value=0) + npuWriteRTPOp("rtpComputeTile24", col=2, row=4, index=0, value=0) # # # write RTP parameters - # IpuWriteRTPOp( + # npuWriteRTPOp( # "rtpComputeTile02", col=0, row=2, index=0, value=1 # ) # scale - # IpuWriteRTPOp( + # npuWriteRTPOp( # "rtpComputeTile03", col=0, row=3, index=0, value=1 # ) # scale - # IpuWriteRTPOp( + # npuWriteRTPOp( # 
"rtpComputeTile05", col=0, row=5, index=0, value=1 # ) # scale - # IpuWriteRTPOp( + # npuWriteRTPOp( # "rtpComputeTile04", col=0, row=4, index=0, value=1 # ) # scale: conv1x1 with the same scale as the input so we match the scaling factor of output after conv1x1 and the initial input - # IpuWriteRTPOp( + # npuWriteRTPOp( # "rtpComputeTile04", col=0, row=4, index=1, value=0 # ) # skip_scale - ipu_dma_memcpy_nd( + npu_dma_memcpy_nd( metadata="act1_00_02_01", bd_id=0, mem=inputFromL3, sizes=[1, 1, 1, activationsInSize32b], ) - ipu_dma_memcpy_nd( + npu_dma_memcpy_nd( metadata="outOFL2L3", bd_id=2, mem=outputToL3, sizes=[1, 1, 1, acitivationsOutSize32b], ) - ipu_dma_memcpy_nd( + npu_dma_memcpy_nd( metadata="wts_0_L3L2", bd_id=1, mem=weightsFromL3, sizes=[1, 1, 1, totalWeightsSize32b_init], ) - ipu_dma_memcpy_nd( + npu_dma_memcpy_nd( metadata="wts_1_L3L2", bd_id=1, mem=weightsFromL3, @@ -889,7 +889,7 @@ def sequence(inputFromL3, weightsFromL3, outputToL3): sizes=[1, 1, 1, totalWeightsSize32b_rest], ) - ipu_dma_memcpy_nd( + npu_dma_memcpy_nd( metadata="wts_2_L3L2", bd_id=1, mem=weightsFromL3, @@ -902,7 +902,7 @@ def sequence(inputFromL3, weightsFromL3, outputToL3): sizes=[1, 1, 1, totalWeightsSize32b_rest], ) - ipu_sync(column=1, row=0, direction=0, channel=0) + npu_sync(column=1, row=0, direction=0, channel=0) res = ctx.module.operation.verify() if res == True: diff --git a/programming_examples/ml/resnet/layers_conv2_x/run.lit b/programming_examples/ml/resnet/layers_conv2_x/run.lit index 6496daafe7..394c68f9ca 100755 --- a/programming_examples/ml/resnet/layers_conv2_x/run.lit +++ b/programming_examples/ml/resnet/layers_conv2_x/run.lit @@ -9,6 +9,6 @@ // RUN: xchesscc_wrapper aie2 -I %aietools/include -DBIT_WIDTH=8 -DSCALAR -DUINT8_ACT -c %S/../../../../aie_kernels/aie2/conv2dk1.cc -o conv2dk1_ui8.o // RUN: xchesscc_wrapper aie2 -I %aietools/include -DBIT_WIDTH=8 -DSCALAR -DUINT8_ACT -c %S/../../../../aie_kernels/aie2/conv2dk1_skip.cc -o conv2dk1_skip.o // RUN: %python 
%S/aie2.py | aie-opt -cse -canonicalize -o ./aie.mlir -// RUN: %python aiecc.py --xbridge --aie-generate-cdo --aie-generate-ipu --no-compile-host --xclbin-name=aie.xclbin --ipu-insts-name=insts.txt ./aie.mlir -// RUN: %run_on_ipu %python %S/test.py -x aie.xclbin -i insts.txt -k MLIR_AIE | FileCheck %s +// RUN: %python aiecc.py --xbridge --aie-generate-cdo --aie-generate-npu --no-compile-host --xclbin-name=aie.xclbin --npu-insts-name=insts.txt ./aie.mlir +// RUN: %run_on_npu %python %S/test.py -x aie.xclbin -i insts.txt -k MLIR_AIE | FileCheck %s // CHECK: PASS! diff --git a/programming_examples/ml/softmax/CMakeLists.txt b/programming_examples/ml/softmax/CMakeLists.txt index c4ca0825d4..20f5d8a4a3 100644 --- a/programming_examples/ml/softmax/CMakeLists.txt +++ b/programming_examples/ml/softmax/CMakeLists.txt @@ -27,7 +27,7 @@ if (NOT WSL) else() set(BOOST_ROOT C:/Technical/thirdParty/boost_1_83_0 CACHE STRING "Path to Boost install") set(XRT_INC_DIR C:/Technical/XRT/src/runtime_src/core/include CACHE STRING "Path to XRT cloned repo") - set(XRT_LIB_DIR C:/Technical/xrtIPUfromDLL CACHE STRING "Path to xrt_coreutil.lib") + set(XRT_LIB_DIR C:/Technical/xrtNPUfromDLL CACHE STRING "Path to xrt_coreutil.lib") endif() set(TARGET_NAME test CACHE STRING "Target to be built") diff --git a/programming_examples/ml/softmax/Makefile b/programming_examples/ml/softmax/Makefile index 3a0a4dcc44..87760b77d1 100755 --- a/programming_examples/ml/softmax/Makefile +++ b/programming_examples/ml/softmax/Makefile @@ -43,12 +43,12 @@ build/aie_trace.mlir: aie2.py build/final.xclbin: build/aie.mlir build/kernels.a mkdir -p ${@D} cd ${@D} && aiecc.py --aie-generate-cdo --no-compile-host --xclbin-name=${@F} \ - --aie-generate-ipu --ipu-insts-name=insts.txt $(<:%=../%) + --aie-generate-npu --npu-insts-name=insts.txt $(<:%=../%) build/final_trace.xclbin: build/aie_trace.mlir build/kernels.a mkdir -p ${@D} cd ${@D} && aiecc.py --aie-generate-cdo --no-compile-host --xclbin-name=${@F} \ - 
--aie-generate-ipu --ipu-insts-name=insts.txt $(<:%=../%) + --aie-generate-npu --npu-insts-name=insts.txt $(<:%=../%) ${targetname}.exe: test.cpp diff --git a/programming_examples/ml/softmax/aie2.py b/programming_examples/ml/softmax/aie2.py index 70873eebb6..3457191b24 100755 --- a/programming_examples/ml/softmax/aie2.py +++ b/programming_examples/ml/softmax/aie2.py @@ -33,7 +33,7 @@ def vector_softmax(trace_size): tiles = N_div_n // n_cores buffer_depth = 2 - @device(AIEDevice.ipu) + @device(AIEDevice.npu) def device_body(): memRef_ty = T.memref(n, T.bf16()) @@ -122,13 +122,13 @@ def sequence(A, C): offset=N_in_bytes, ) - ipu_dma_memcpy_nd( + npu_dma_memcpy_nd( metadata="outC", bd_id=0, mem=C, sizes=[1, 1, 1, C_sz_in_i32s] ) - ipu_dma_memcpy_nd( + npu_dma_memcpy_nd( metadata="inA", bd_id=1, mem=A, sizes=[1, 1, 1, A_sz_in_i32s] ) - ipu_sync(column=0, row=0, direction=0, channel=0) + npu_sync(column=0, row=0, direction=0, channel=0) try: diff --git a/programming_examples/ml/softmax/run.lit b/programming_examples/ml/softmax/run.lit index 54c7ccff98..42441e898a 100644 --- a/programming_examples/ml/softmax/run.lit +++ b/programming_examples/ml/softmax/run.lit @@ -9,7 +9,7 @@ // RUN: xchesscc_wrapper aie2 -I %aietools/include -c %S/softmax.cc -o softmax.o // RUN: ar rvs kernels.a dut.o lut_based_ops.o softmax.o // RUN: %python %S/aie2.py | aie-opt -cse -canonicalize -o ./aie.mlir -// RUN: %python aiecc.py --xbridge --aie-generate-cdo --aie-generate-ipu --no-compile-host --xclbin-name=aie.xclbin --ipu-insts-name=insts.txt ./aie.mlir +// RUN: %python aiecc.py --xbridge --aie-generate-cdo --aie-generate-npu --no-compile-host --xclbin-name=aie.xclbin --npu-insts-name=insts.txt ./aie.mlir // RUN: g++-13 %S/test.cpp -o test.exe -std=c++23 -Wall -I%S/../../../runtime_lib/test_lib %S/../../../runtime_lib/test_lib/test_utils.cpp %xrt_flags -lrt -lstdc++ -lboost_program_options -lboost_filesystem -// RUN: %run_on_ipu ./test.exe -x aie.xclbin -k MLIR_AIE -i insts.txt | FileCheck 
%s +// RUN: %run_on_npu ./test.exe -x aie.xclbin -k MLIR_AIE -i insts.txt | FileCheck %s // CHECK: PASS! diff --git a/programming_examples/ml/weight_expand/CMakeLists.txt b/programming_examples/ml/weight_expand/CMakeLists.txt index c4ca0825d4..20f5d8a4a3 100644 --- a/programming_examples/ml/weight_expand/CMakeLists.txt +++ b/programming_examples/ml/weight_expand/CMakeLists.txt @@ -27,7 +27,7 @@ if (NOT WSL) else() set(BOOST_ROOT C:/Technical/thirdParty/boost_1_83_0 CACHE STRING "Path to Boost install") set(XRT_INC_DIR C:/Technical/XRT/src/runtime_src/core/include CACHE STRING "Path to XRT cloned repo") - set(XRT_LIB_DIR C:/Technical/xrtIPUfromDLL CACHE STRING "Path to xrt_coreutil.lib") + set(XRT_LIB_DIR C:/Technical/xrtNPUfromDLL CACHE STRING "Path to xrt_coreutil.lib") endif() set(TARGET_NAME test CACHE STRING "Target to be built") diff --git a/programming_examples/ml/weight_expand/Makefile b/programming_examples/ml/weight_expand/Makefile index 641b4902b3..b4967596fb 100755 --- a/programming_examples/ml/weight_expand/Makefile +++ b/programming_examples/ml/weight_expand/Makefile @@ -23,7 +23,7 @@ build/aie.mlir: aie2.py build/final.xclbin: build/aie.mlir build/expand.o mkdir -p ${@D} cd ${@D} && aiecc.py --aie-generate-cdo --no-compile-host --xclbin-name=${@F} \ - --aie-generate-ipu --ipu-insts-name=insts.txt $(<:%=../%) + --aie-generate-npu --npu-insts-name=insts.txt $(<:%=../%) ${targetname}.exe: test.cpp rm -rf _build diff --git a/programming_examples/ml/weight_expand/aie2.py b/programming_examples/ml/weight_expand/aie2.py index 3ca1f7aee3..32fe95429f 100755 --- a/programming_examples/ml/weight_expand/aie2.py +++ b/programming_examples/ml/weight_expand/aie2.py @@ -45,7 +45,7 @@ def my_expand(): with mlir_mod_ctx() as ctx: - @device(AIEDevice.ipu) + @device(AIEDevice.npu) def device_body(): memRef_i_ty = T.memref( input_buffer_size_bytes, T.i8() @@ -91,13 +91,13 @@ def core_body(): @FuncOp.from_py_func(tensor_ty, tensor_ty) def sequence(A, C): - 
ipu_dma_memcpy_nd( + npu_dma_memcpy_nd( metadata="outB", bd_id=0, mem=C, sizes=[1, 1, 1, B_sz_in_i32s] ) - ipu_dma_memcpy_nd( + npu_dma_memcpy_nd( metadata="inA", bd_id=1, mem=A, sizes=[1, 1, 1, A_sz_in_i32s] ) - ipu_sync(column=0, row=0, direction=0, channel=0) + npu_sync(column=0, row=0, direction=0, channel=0) print(ctx.module) diff --git a/programming_examples/utils/README.md b/programming_examples/utils/README.md index 9dc2731012..1d59d46e08 100644 --- a/programming_examples/utils/README.md +++ b/programming_examples/utils/README.md @@ -54,7 +54,7 @@ The parse script create a temporary directory `tmpTrace` performs the following We prepend `0x` before each hex line and save it `prep.` since the `hwfrontend` utility expects it. ### 2. Parse MLIR to build event table -The MLIR parser is pretty rudimentary as it scans the source mlir file looking for `aiex.ipu.write32` calls and does a pattern match for trace unit config address and then grab the hex events, which it looks up from an internal table to provide waveform labels. It would be better to use an MLIR pass that already has the config information and cross reference it with a more official event-to-label lookup table instead. +The MLIR parser is pretty rudimentary as it scans the source mlir file looking for `aiex.npu.write32` calls and does a pattern match for trace unit config address and then grab the hex events, which it looks up from an internal table to provide waveform labels. It would be better to use an MLIR pass that already has the config information and cross reference it with a more official event-to-label lookup table instead. ### 3. Create .target file Create a dummy file (`.target`) in the `tmpTrace` with the file content 'hw' since `hwfrontend` utility expects it. 
diff --git a/programming_examples/utils/parse_eventIR.py b/programming_examples/utils/parse_eventIR.py index b7c989ca3c..b41ff9c74a 100755 --- a/programming_examples/utils/parse_eventIR.py +++ b/programming_examples/utils/parse_eventIR.py @@ -594,9 +594,9 @@ def parse_mlir_trace_events(lines): # TODO Need to check if this line is commented out, check for // ? (harder to check of /* */) # TODO Need to support value in hex with 0x or decimal - # pattern = r"AIEX.ipu.write32\s*\{\s*(\w+)\s*=\s*(\d+)\s*:\s*\w+\s*,\s*(\w+)\s*=\s*(\d+)\s*:\s*\w+\s*,\s*(\w+)\s*=\s*(\w+)\s*:\s*\w+\s*,\s*(\w+)\s*=\s*(\w+)\s*:\s*\w+\s*\}" - # pattern = r"AIEX.ipu.write32\s*\{\s*(\w+)\s*=\s*(0x)?(\w+)\s*:\s*\w+\s*,\s*(\w+)\s*=\s*(0x)?(\w+)\s*:\s*\w+\s*,\s*(\w+)\s*=\s*(0x)?(\w+)\s*:\s*\w+\s*,\s*(\w+)\s*=\s*(0x)?(\w+)\s*:\s*\w+\s*\}" - pattern = r"aiex.ipu.write32\s*\{\s*(\w+)\s*=\s*(0x)?(\w+)\s*:\s*\w+\s*,\s*(\w+)\s*=\s*(0x)?(\w+)\s*:\s*\w+\s*,\s*(\w+)\s*=\s*(0x)?(\w+)\s*:\s*\w+\s*,\s*(\w+)\s*=\s*(0x)?(\w+)\s*:\s*\w+\s*\}" + # pattern = r"AIEX.npu.write32\s*\{\s*(\w+)\s*=\s*(\d+)\s*:\s*\w+\s*,\s*(\w+)\s*=\s*(\d+)\s*:\s*\w+\s*,\s*(\w+)\s*=\s*(\w+)\s*:\s*\w+\s*,\s*(\w+)\s*=\s*(\w+)\s*:\s*\w+\s*\}" + # pattern = r"AIEX.npu.write32\s*\{\s*(\w+)\s*=\s*(0x)?(\w+)\s*:\s*\w+\s*,\s*(\w+)\s*=\s*(0x)?(\w+)\s*:\s*\w+\s*,\s*(\w+)\s*=\s*(0x)?(\w+)\s*:\s*\w+\s*,\s*(\w+)\s*=\s*(0x)?(\w+)\s*:\s*\w+\s*\}" + pattern = r"aiex.npu.write32\s*\{\s*(\w+)\s*=\s*(0x)?(\w+)\s*:\s*\w+\s*,\s*(\w+)\s*=\s*(0x)?(\w+)\s*:\s*\w+\s*,\s*(\w+)\s*=\s*(0x)?(\w+)\s*:\s*\w+\s*,\s*(\w+)\s*=\s*(0x)?(\w+)\s*:\s*\w+\s*\}" pid_events = list() for t in range(NumTraceTypes): diff --git a/programming_examples/utils/parse_trace.py b/programming_examples/utils/parse_trace.py index ed45353f31..9d2cd144a6 100755 --- a/programming_examples/utils/parse_trace.py +++ b/programming_examples/utils/parse_trace.py @@ -582,9 +582,9 @@ def parse_mlir_trace_events(lines): # TODO Need to check if this line is commented out, check for // ? 
(harder to check of /* */) # TODO Need to support value in hex with 0x or decimal - # pattern = r"AIEX.ipu.write32\s*\{\s*(\w+)\s*=\s*(\d+)\s*:\s*\w+\s*,\s*(\w+)\s*=\s*(\d+)\s*:\s*\w+\s*,\s*(\w+)\s*=\s*(\w+)\s*:\s*\w+\s*,\s*(\w+)\s*=\s*(\w+)\s*:\s*\w+\s*\}" - # pattern = r"AIEX.ipu.write32\s*\{\s*(\w+)\s*=\s*(0x)?(\w+)\s*:\s*\w+\s*,\s*(\w+)\s*=\s*(0x)?(\w+)\s*:\s*\w+\s*,\s*(\w+)\s*=\s*(0x)?(\w+)\s*:\s*\w+\s*,\s*(\w+)\s*=\s*(0x)?(\w+)\s*:\s*\w+\s*\}" - pattern = r"aiex.ipu.write32\s*\{\s*(\w+)\s*=\s*(0x)?(\w+)\s*:\s*\w+\s*,\s*(\w+)\s*=\s*(0x)?(\w+)\s*:\s*\w+\s*,\s*(\w+)\s*=\s*(0x)?(\w+)\s*:\s*\w+\s*,\s*(\w+)\s*=\s*(0x)?(\w+)\s*:\s*\w+\s*\}" + # pattern = r"AIEX.npu.write32\s*\{\s*(\w+)\s*=\s*(\d+)\s*:\s*\w+\s*,\s*(\w+)\s*=\s*(\d+)\s*:\s*\w+\s*,\s*(\w+)\s*=\s*(\w+)\s*:\s*\w+\s*,\s*(\w+)\s*=\s*(\w+)\s*:\s*\w+\s*\}" + # pattern = r"AIEX.npu.write32\s*\{\s*(\w+)\s*=\s*(0x)?(\w+)\s*:\s*\w+\s*,\s*(\w+)\s*=\s*(0x)?(\w+)\s*:\s*\w+\s*,\s*(\w+)\s*=\s*(0x)?(\w+)\s*:\s*\w+\s*,\s*(\w+)\s*=\s*(0x)?(\w+)\s*:\s*\w+\s*\}" + pattern = r"aiex.npu.write32\s*\{\s*(\w+)\s*=\s*(0x)?(\w+)\s*:\s*\w+\s*,\s*(\w+)\s*=\s*(0x)?(\w+)\s*:\s*\w+\s*,\s*(\w+)\s*=\s*(0x)?(\w+)\s*:\s*\w+\s*,\s*(\w+)\s*=\s*(0x)?(\w+)\s*:\s*\w+\s*\}" pid_events = list() for t in range(NumTraceTypes): diff --git a/programming_examples/vision/color_detect/CMakeLists.txt b/programming_examples/vision/color_detect/CMakeLists.txt index d850efcad5..f743789b61 100644 --- a/programming_examples/vision/color_detect/CMakeLists.txt +++ b/programming_examples/vision/color_detect/CMakeLists.txt @@ -25,7 +25,7 @@ else() set(BOOST_ROOT C:/Technical/thirdParty/boost_1_83_0 CACHE STRING "Path to Boost install") set(OpenCV_DIR C:/Technical/thirdParty/opencv/build CACHE STRING "Path to OpenCV install") set(XRT_INC_DIR C:/Technical/XRT/src/runtime_src/core/include CACHE STRING "Path to XRT cloned repo") - set(XRT_LIB_DIR C:/Technical/xrtIPUfromDLL CACHE STRING "Path to xrt_coreutil.lib") + set(XRT_LIB_DIR C:/Technical/xrtNPUfromDLL CACHE 
STRING "Path to xrt_coreutil.lib") endif () set(COLORDETECT_WIDTH 1920 CACHE STRING "image width") diff --git a/programming_examples/vision/color_detect/Makefile b/programming_examples/vision/color_detect/Makefile index c8feea4cb6..ffb8ca55d1 100755 --- a/programming_examples/vision/color_detect/Makefile +++ b/programming_examples/vision/color_detect/Makefile @@ -39,8 +39,8 @@ build/aie2_lineBased_8b_${COLORDETECT_WIDTH}.mlir: aie2_colorDetect.py build/final_${COLORDETECT_WIDTH}.xclbin: build/aie2_lineBased_8b_${COLORDETECT_WIDTH}.mlir build/rgba2hue.cc.o build/threshold.cc.o build/combined_bitwiseOR_gray2rgba_bitwiseAND.a mkdir -p ${@D} - cd ${@D} && aiecc.py --aie-generate-cdo --aie-generate-ipu --no-compile-host \ - --xclbin-name=${@F} --ipu-insts-name=insts.txt $(<:%=../%) + cd ${@D} && aiecc.py --aie-generate-cdo --aie-generate-npu --no-compile-host \ + --xclbin-name=${@F} --npu-insts-name=insts.txt $(<:%=../%) build/${targetname}.exe: test.cpp mkdir -p ${@D} diff --git a/programming_examples/vision/color_detect/README.md b/programming_examples/vision/color_detect/README.md index 33d41a2339..f2f24dbea6 100644 --- a/programming_examples/vision/color_detect/README.md +++ b/programming_examples/vision/color_detect/README.md @@ -12,7 +12,7 @@ The Color Detect pipeline design consists of the following blocks arranged in a pipeline fashion for the detecting of 2 colors in a sequence of images : `rgba2hue`, `threshold`, `threshold`, `bitwiseOR`, `gray2rgba`, `bitwiseAND`. -The pipeline is mapped onto a single column of the ipu device, with one Shim tile (0, 0), one Mem tile (0, 1) and four AIE compute tiles (0, 2) through (0, 5). As shown in the image below, the `rgba2hue`, and the two `threshold` kernels are each mapped onto one compute tile, while `bitwiseOR`, `gray2rgba` and `bitwiseAND` are mapped together on AIE tile (0, 5). 
+The pipeline is mapped onto a single column of the npu device, with one Shim tile (0, 0), one Mem tile (0, 1) and four AIE compute tiles (0, 2) through (0, 5). As shown in the image below, the `rgba2hue`, and the two `threshold` kernels are each mapped onto one compute tile, while `bitwiseOR`, `gray2rgba` and `bitwiseAND` are mapped together on AIE tile (0, 5).

./aie.mlir -// RUN: %python aiecc.py --xbridge --aie-generate-cdo --aie-generate-ipu --no-compile-host --xclbin-name=aie.xclbin --ipu-insts-name=insts.txt ./aie.mlir +// RUN: %python aiecc.py --xbridge --aie-generate-cdo --aie-generate-npu --no-compile-host --xclbin-name=aie.xclbin --npu-insts-name=insts.txt ./aie.mlir // RUN: g++ %S/test.cpp -o test.exe -std=c++23 -Wall -DCOLORDETECT_WIDTH=1920 -DCOLORDETECT_HEIGHT=1080 -I%S/../../../runtime_lib/test_lib %S/../../../runtime_lib/test_lib/test_utils.cpp -I%S/../../utils %S/../../utils/OpenCVUtils.cpp %xrt_flags %opencv_flags -lrt -lstdc++ -lboost_program_options -lboost_filesystem -// RUN: %run_on_ipu ./test.exe -x aie.xclbin -k MLIR_AIE -i insts.txt | FileCheck %s +// RUN: %run_on_npu ./test.exe -x aie.xclbin -k MLIR_AIE -i insts.txt | FileCheck %s // CHECK: PASS! diff --git a/programming_examples/vision/color_threshold/CMakeLists.txt b/programming_examples/vision/color_threshold/CMakeLists.txt index 040bc74533..f630f55106 100644 --- a/programming_examples/vision/color_threshold/CMakeLists.txt +++ b/programming_examples/vision/color_threshold/CMakeLists.txt @@ -25,7 +25,7 @@ else() set(BOOST_ROOT C:/Technical/thirdParty/boost_1_83_0 CACHE STRING "Path to Boost install") set(OpenCV_DIR C:/Technical/thirdParty/opencv/build CACHE STRING "Path to OpenCV install") set(XRT_INC_DIR C:/Technical/XRT/src/runtime_src/core/include CACHE STRING "Path to XRT cloned repo") - set(XRT_LIB_DIR C:/Technical/xrtIPUfromDLL CACHE STRING "Path to xrt_coreutil.lib") + set(XRT_LIB_DIR C:/Technical/xrtNPUfromDLL CACHE STRING "Path to xrt_coreutil.lib") endif () set(COLORTHRESHOLD_WIDTH 128 CACHE STRING "image width") diff --git a/programming_examples/vision/color_threshold/Makefile b/programming_examples/vision/color_threshold/Makefile index 286f342b08..69958f4c2e 100644 --- a/programming_examples/vision/color_threshold/Makefile +++ b/programming_examples/vision/color_threshold/Makefile @@ -36,8 +36,8 @@ 
build/aie2_${COLORTHRESHOLD_WIDTH}.mlir: aie2_colorThreshold.py build/final_${COLORTHRESHOLD_WIDTH}.xclbin: build/aie2_${COLORTHRESHOLD_WIDTH}.mlir build/threshold.cc.o mkdir -p ${@D} - cd ${@D} && aiecc.py --aie-generate-cdo --aie-generate-ipu --no-compile-host \ - --xclbin-name=${@F} --ipu-insts-name=insts.txt $(<:%=../%) + cd ${@D} && aiecc.py --aie-generate-cdo --aie-generate-npu --no-compile-host \ + --xclbin-name=${@F} --npu-insts-name=insts.txt $(<:%=../%) ${targetname}.exe: test.cpp rm -rf _build diff --git a/programming_examples/vision/color_threshold/README.md b/programming_examples/vision/color_threshold/README.md index fbab6235cc..ad8613544a 100644 --- a/programming_examples/vision/color_threshold/README.md +++ b/programming_examples/vision/color_threshold/README.md @@ -12,7 +12,7 @@ The Color Threshold pipeline design consists of 4 threshold blocks in separate AIE tiles that process a different region of an input image, as shown in the image below. -The pipeline is mapped onto a single column of the ipu device, with one Shim tile (0, 0), one Mem tile (0, 1) and four AIE compute tiles (0, 2) through (0, 5). +The pipeline is mapped onto a single column of the npu device, with one Shim tile (0, 0), one Mem tile (0, 1) and four AIE compute tiles (0, 2) through (0, 5).

./aie.mlir -// RUN: %python aiecc.py --xbridge --aie-generate-cdo --aie-generate-ipu --no-compile-host --xclbin-name=aie.xclbin --ipu-insts-name=insts.txt ./aie.mlir +// RUN: %python aiecc.py --xbridge --aie-generate-cdo --aie-generate-npu --no-compile-host --xclbin-name=aie.xclbin --npu-insts-name=insts.txt ./aie.mlir // RUN: g++ %S/test.cpp -o test.exe -std=c++23 -Wall -DCOLORTHRESHOLD_WIDTH=1920 -DCOLORTHRESHOLD_HEIGHT=1080 -I%S/../../../runtime_lib/test_lib %S/../../../runtime_lib/test_lib/test_utils.cpp -I%S/../../utils %S/../../utils/OpenCVUtils.cpp %xrt_flags %opencv_flags -lrt -lstdc++ -lboost_program_options -lboost_filesystem -// RUN: %run_on_ipu ./test.exe -x aie.xclbin -k MLIR_AIE -i insts.txt | FileCheck %s +// RUN: %run_on_npu ./test.exe -x aie.xclbin -k MLIR_AIE -i insts.txt | FileCheck %s // CHECK: PASS! diff --git a/programming_examples/vision/edge_detect/CMakeLists.txt b/programming_examples/vision/edge_detect/CMakeLists.txt index 59fe331831..c0ceb81739 100644 --- a/programming_examples/vision/edge_detect/CMakeLists.txt +++ b/programming_examples/vision/edge_detect/CMakeLists.txt @@ -25,7 +25,7 @@ else() set(BOOST_ROOT C:/Technical/thirdParty/boost_1_83_0 CACHE STRING "Path to Boost install") set(OpenCV_DIR C:/Technical/thirdParty/opencv/build CACHE STRING "Path to OpenCV install") set(XRT_INC_DIR C:/Technical/XRT/src/runtime_src/core/include CACHE STRING "Path to XRT cloned repo") - set(XRT_LIB_DIR C:/Technical/xrtIPUfromDLL CACHE STRING "Path to xrt_coreutil.lib") + set(XRT_LIB_DIR C:/Technical/xrtNPUfromDLL CACHE STRING "Path to xrt_coreutil.lib") endif () set(EDGEDETECT_WIDTH 1920 CACHE STRING "image width") diff --git a/programming_examples/vision/edge_detect/Makefile b/programming_examples/vision/edge_detect/Makefile index d40e606e63..71c2012432 100755 --- a/programming_examples/vision/edge_detect/Makefile +++ b/programming_examples/vision/edge_detect/Makefile @@ -39,8 +39,8 @@ build/aie2_lineBased_8b_${EDGEDETECT_WIDTH}.mlir: 
aie2_edgeDetect.py build/final_${EDGEDETECT_WIDTH}.xclbin: build/aie2_lineBased_8b_${EDGEDETECT_WIDTH}.mlir build/rgba2gray.cc.o build/gray2rgba.cc.o build/filter2d.cc.o build/threshold.cc.o build/addWeighted.cc.o build/combined_gray2rgba_addWeighted.a mkdir -p ${@D} - cd ${@D} && aiecc.py --aie-generate-cdo --aie-generate-ipu --no-compile-host \ - --xclbin-name=${@F} --ipu-insts-name=insts.txt $(<:%=../%) + cd ${@D} && aiecc.py --aie-generate-cdo --aie-generate-npu --no-compile-host \ + --xclbin-name=${@F} --npu-insts-name=insts.txt $(<:%=../%) ${targetname}.exe: test.cpp rm -rf _build diff --git a/programming_examples/vision/edge_detect/README.md b/programming_examples/vision/edge_detect/README.md index 2450f019ec..26f2d4aff8 100644 --- a/programming_examples/vision/edge_detect/README.md +++ b/programming_examples/vision/edge_detect/README.md @@ -12,7 +12,7 @@ The Edge Detect pipeline design consists of the following blocks arranged in a pipeline fashion for the detection of edges in a sequence of images : `rgba2gray`, `filter2D`, `threshold`, `gray2rgba`, `addWeighted`. -The pipeline is mapped onto a single column of the ipu device, with one Shim tile (0, 0), one Mem tile (0, 1) and four AIE compute tiles (0, 2) through (0, 5). As shown in the image below, the `rgba2gray`, `filter2D` and `threshold` kernels are each mapped onto one compute tile, while `gray2rgba` and `addWeighted` are mapped together on AIE tile (0, 5). +The pipeline is mapped onto a single column of the npu device, with one Shim tile (0, 0), one Mem tile (0, 1) and four AIE compute tiles (0, 2) through (0, 5). As shown in the image below, the `rgba2gray`, `filter2D` and `threshold` kernels are each mapped onto one compute tile, while `gray2rgba` and `addWeighted` are mapped together on AIE tile (0, 5).

./aie.mlir -// RUN: %python aiecc.py --xbridge --aie-generate-cdo --aie-generate-ipu --no-compile-host --xclbin-name=aie.xclbin --ipu-insts-name=insts.txt ./aie.mlir +// RUN: %python aiecc.py --xbridge --aie-generate-cdo --aie-generate-npu --no-compile-host --xclbin-name=aie.xclbin --npu-insts-name=insts.txt ./aie.mlir // RUN: g++ %S/test.cpp -o test.exe -std=c++23 -Wall -DEDGEDETECT_WIDTH=1920 -DEDGEDETECT_HEIGHT=1080 -I%S/../../../runtime_lib/test_lib %S/../../../runtime_lib/test_lib/test_utils.cpp -I%S/../../utils %S/../../utils/OpenCVUtils.cpp %xrt_flags %opencv_flags -lrt -lstdc++ -lboost_program_options -lboost_filesystem -// RUN: %run_on_ipu ./test.exe -x aie.xclbin -k MLIR_AIE -i insts.txt | FileCheck %s +// RUN: %run_on_npu ./test.exe -x aie.xclbin -k MLIR_AIE -i insts.txt | FileCheck %s // CHECK: PASS! diff --git a/programming_examples/vision/vision_passthrough/CMakeLists.txt b/programming_examples/vision/vision_passthrough/CMakeLists.txt index 7ba68b268b..a2bb8ac761 100644 --- a/programming_examples/vision/vision_passthrough/CMakeLists.txt +++ b/programming_examples/vision/vision_passthrough/CMakeLists.txt @@ -28,7 +28,7 @@ else() set(BOOST_ROOT C:/Technical/thirdParty/boost_1_83_0 CACHE STRING "Path to Boost install") set(OpenCV_DIR C:/Technical/thirdParty/opencv/build CACHE STRING "Path to OpenCV install") set(XRT_INC_DIR C:/Technical/XRT/src/runtime_src/core/include CACHE STRING "Path to XRT cloned repo") - set(XRT_LIB_DIR C:/Technical/xrtIPUfromDLL CACHE STRING "Path to xrt_coreutil.lib") + set(XRT_LIB_DIR C:/Technical/xrtNPUfromDLL CACHE STRING "Path to xrt_coreutil.lib") endif () set(PASSTHROUGH_WIDTH 1920 CACHE STRING "image width") diff --git a/programming_examples/vision/vision_passthrough/Makefile b/programming_examples/vision/vision_passthrough/Makefile index f07d90fda2..1ae853d942 100644 --- a/programming_examples/vision/vision_passthrough/Makefile +++ b/programming_examples/vision/vision_passthrough/Makefile @@ -32,8 +32,8 @@ 
build/passThrough.cc.o: passThrough.cc build/final_${PASSTHROUGH_WIDTH}.xclbin: build/aie2_lineBased_8b_${PASSTHROUGH_WIDTH}.mlir build/passThrough.cc.o mkdir -p ${@D} - cd ${@D} && aiecc.py --aie-generate-cdo --aie-generate-ipu --no-compile-host \ - --xclbin-name=${@F} --ipu-insts-name=insts.txt $(<:%=../%) + cd ${@D} && aiecc.py --aie-generate-cdo --aie-generate-npu --no-compile-host \ + --xclbin-name=${@F} --npu-insts-name=insts.txt $(<:%=../%) ${targetname}.exe: test.cpp rm -rf _build diff --git a/programming_examples/vision/vision_passthrough/aie2.py b/programming_examples/vision/vision_passthrough/aie2.py index 5422f803d1..920d109cfa 100644 --- a/programming_examples/vision/vision_passthrough/aie2.py +++ b/programming_examples/vision/vision_passthrough/aie2.py @@ -29,7 +29,7 @@ def passThroughAIE2(): with mlir_mod_ctx() as ctx: - @device(AIEDevice.ipu) + @device(AIEDevice.npu) def device_body(): # define types line_ty = T.memref(lineWidthInBytes, T.ui8()) @@ -101,9 +101,9 @@ def sequence(inTensor, notUsed, outTensor): # EVENTS_CORE_PORT_RUNNING_0 (0x4B) # Trace_Event0 (4 slots) - IpuWrite32(0, 2, 0x340E0, 0x4B222125) + NpuWrite32(0, 2, 0x340E0, 0x4B222125) # Trace_Event1 (4 slots) - IpuWrite32(0, 2, 0x340E4, 0x2D2C1A4F) + NpuWrite32(0, 2, 0x340E4, 0x2D2C1A4F) # Event slots as configured above: # 0: Kernel executes vector instruction @@ -117,13 +117,13 @@ def sequence(inTensor, notUsed, outTensor): # Stream_Switch_Event_Port_Selection_0 # This is necessary to capture the Port_Running_0 and Port_Running_1 events - IpuWrite32(0, 2, 0x3FF00, 0x121) + NpuWrite32(0, 2, 0x3FF00, 0x121) # Trace_Control0: Define trace start and stop triggers. Set start event TRUE. - IpuWrite32(0, 2, 0x340D0, 0x10000) + NpuWrite32(0, 2, 0x340D0, 0x10000) # Start trace copy out. 
- IpuWriteBdShimTile( + NpuWriteBdShimTile( bd_id=3, buffer_length=traceSizeInBytes, buffer_offset=tensorSize, @@ -151,21 +151,21 @@ def sequence(inTensor, notUsed, outTensor): use_next_bd=0, valid_bd=1, ) - IpuWrite32(0, 0, 0x1D20C, 0x3) + NpuWrite32(0, 0, 0x1D20C, 0x3) - ipu_dma_memcpy_nd( + npu_dma_memcpy_nd( metadata="in", bd_id=1, mem=inTensor, sizes=[1, 1, 1, tensorSizeInInt32s], ) - ipu_dma_memcpy_nd( + npu_dma_memcpy_nd( metadata="out", bd_id=0, mem=outTensor, sizes=[1, 1, 1, tensorSizeInInt32s], ) - ipu_sync(column=0, row=0, direction=0, channel=0) + npu_sync(column=0, row=0, direction=0, channel=0) print(ctx.module) diff --git a/programming_examples/vision/vision_passthrough/aie2_lineBased_8b_1080.mlir b/programming_examples/vision/vision_passthrough/aie2_lineBased_8b_1080.mlir index 3c547e4016..0621e0b622 100644 --- a/programming_examples/vision/vision_passthrough/aie2_lineBased_8b_1080.mlir +++ b/programming_examples/vision/vision_passthrough/aie2_lineBased_8b_1080.mlir @@ -12,7 +12,7 @@ // AIE tiles, buffers, and communication in an AI Engine design module @passThroughLine_aie2 { - aie.device(ipu) { + aie.device(npu) { // declare kernel external kernel function func.func private @passThroughLine(%in: memref<1920xui8>, %out: memref<1920xui8>, %tilewidth: i32) -> () @@ -53,9 +53,9 @@ module @passThroughLine_aie2 { %tilewidth = arith.constant 480 : i64 // in 32b words so tileWidth/4 //dma_memcpy_nd ([offset in 32b words][length in 32b words][stride in 32b words]) - aiex.ipu.dma_memcpy_nd (0, 0, %in[%c0, %c0, %c0, %c0][%c1, %c1, %tileheight, %tilewidth][%c0, %c0, %tilewidth]) { metadata = @inOF, id = 1 : i64 } : memref<518400xi32> - aiex.ipu.dma_memcpy_nd (0, 0, %out[%c0, %c0, %c0, %c0][%c1, %c1, %tileheight, %tilewidth][%c0, %c0, %tilewidth]) { metadata = @outOF, id = 0 : i64 } : memref<518400xi32> - aiex.ipu.sync {channel = 0 : i32, column = 0 : i32, column_num = 1 : i32, direction = 0 : i32, row = 0 : i32, row_num = 1 : i32} + aiex.npu.dma_memcpy_nd (0, 
0, %in[%c0, %c0, %c0, %c0][%c1, %c1, %tileheight, %tilewidth][%c0, %c0, %tilewidth]) { metadata = @inOF, id = 1 : i64 } : memref<518400xi32> + aiex.npu.dma_memcpy_nd (0, 0, %out[%c0, %c0, %c0, %c0][%c1, %c1, %tileheight, %tilewidth][%c0, %c0, %tilewidth]) { metadata = @outOF, id = 0 : i64 } : memref<518400xi32> + aiex.npu.sync {channel = 0 : i32, column = 0 : i32, column_num = 1 : i32, direction = 0 : i32, row = 0 : i32, row_num = 1 : i32} return } } diff --git a/programming_examples/vision/vision_passthrough/aie2_lineBased_8b_8k.mlir b/programming_examples/vision/vision_passthrough/aie2_lineBased_8b_8k.mlir index d07ba213c4..c2c31b0d9b 100644 --- a/programming_examples/vision/vision_passthrough/aie2_lineBased_8b_8k.mlir +++ b/programming_examples/vision/vision_passthrough/aie2_lineBased_8b_8k.mlir @@ -12,7 +12,7 @@ // AIE tiles, buffers, and communication in an AI Engine design module @passThroughLine_aie2 { - aie.device(ipu) { + aie.device(npu) { // declare kernel external kernel function func.func private @passThroughLine(%in: memref<7680xui8>, %out: memref<7680xui8>, %tilewidth: i32) -> () @@ -54,9 +54,9 @@ module @passThroughLine_aie2 { %totalLenRGBA = arith.constant 2073600 : i64 //dma_memcpy_nd ([offset in 32b words][length in 32b words][stride in 32b words]) - aiex.ipu.dma_memcpy_nd (0, 0, %in[%c0, %c0, %c0, %c0][%c1, %c1, %c1, %totalLenRGBA][%c0, %c0, %c0]) { metadata = @inOF, id = 1 : i64 } : memref<2073600xi32> - aiex.ipu.dma_memcpy_nd (0, 0, %out[%c0, %c0, %c0, %c0][%c1, %c1, %c1, %totalLenRGBA][%c0, %c0, %c0]) { metadata = @outOF, id = 0 : i64 } : memref<2073600xi32> - aiex.ipu.sync {channel = 0 : i32, column = 0 : i32, column_num = 1 : i32, direction = 0 : i32, row = 0 : i32, row_num = 1 : i32} + aiex.npu.dma_memcpy_nd (0, 0, %in[%c0, %c0, %c0, %c0][%c1, %c1, %c1, %totalLenRGBA][%c0, %c0, %c0]) { metadata = @inOF, id = 1 : i64 } : memref<2073600xi32> + aiex.npu.dma_memcpy_nd (0, 0, %out[%c0, %c0, %c0, %c0][%c1, %c1, %c1, %totalLenRGBA][%c0, %c0, %c0]) 
{ metadata = @outOF, id = 0 : i64 } : memref<2073600xi32> + aiex.npu.sync {channel = 0 : i32, column = 0 : i32, column_num = 1 : i32, direction = 0 : i32, row = 0 : i32, row_num = 1 : i32} return } } diff --git a/programming_examples/vision/vision_passthrough/aie2_lineBased_8b_tiny.mlir b/programming_examples/vision/vision_passthrough/aie2_lineBased_8b_tiny.mlir index 13f7417166..dd66475ca5 100644 --- a/programming_examples/vision/vision_passthrough/aie2_lineBased_8b_tiny.mlir +++ b/programming_examples/vision/vision_passthrough/aie2_lineBased_8b_tiny.mlir @@ -12,7 +12,7 @@ // AIE tiles, buffers, and communication in an AI Engine design module @passThroughLine_aie2 { - aie.device(ipu) { + aie.device(npu) { // declare kernel external kernel function func.func private @passThroughLine(%in: memref<512xui8>, %out: memref<512xui8>, %tilewidth: i32) -> () @@ -53,9 +53,9 @@ module @passThroughLine_aie2 { %tilewidth = arith.constant 128 : i64 // in 32b words so tileWidth/4 //dma_memcpy_nd ([offset in 32b words][length in 32b words][stride in 32b words]) - aiex.ipu.dma_memcpy_nd (0, 0, %in[%c0, %c0, %c0, %c0][%c1, %c1, %tileheight, %tilewidth][%c0, %c0, %tilewidth]) { metadata = @inOF, id = 1 : i64 } : memref<1152xi32> - aiex.ipu.dma_memcpy_nd (0, 0, %out[%c0, %c0, %c0, %c0][%c1, %c1, %tileheight, %tilewidth][%c0, %c0, %tilewidth]) { metadata = @outOF, id = 0 : i64 } : memref<1152xi32> - aiex.ipu.sync {channel = 0 : i32, column = 0 : i32, column_num = 1 : i32, direction = 0 : i32, row = 0 : i32, row_num = 1 : i32} + aiex.npu.dma_memcpy_nd (0, 0, %in[%c0, %c0, %c0, %c0][%c1, %c1, %tileheight, %tilewidth][%c0, %c0, %tilewidth]) { metadata = @inOF, id = 1 : i64 } : memref<1152xi32> + aiex.npu.dma_memcpy_nd (0, 0, %out[%c0, %c0, %c0, %c0][%c1, %c1, %tileheight, %tilewidth][%c0, %c0, %tilewidth]) { metadata = @outOF, id = 0 : i64 } : memref<1152xi32> + aiex.npu.sync {channel = 0 : i32, column = 0 : i32, column_num = 1 : i32, direction = 0 : i32, row = 0 : i32, row_num = 1 : i32} 
return } } diff --git a/programming_examples/vision/vision_passthrough/run.lit b/programming_examples/vision/vision_passthrough/run.lit index 5093e3c80c..58f914861c 100644 --- a/programming_examples/vision/vision_passthrough/run.lit +++ b/programming_examples/vision/vision_passthrough/run.lit @@ -5,7 +5,7 @@ // // RUN: xchesscc_wrapper aie2 -I %aietools/include -DBIT_WIDTH=8 -c %S/../../../aie_kernels/generic/passThrough.cc -o passThrough.cc.o // RUN: %python %S/aie2.py 1920 1080 | aie-opt -cse -canonicalize -o ./aie.mlir -// RUN: %python aiecc.py --xbridge --aie-generate-cdo --aie-generate-ipu --no-compile-host --xclbin-name=aie.xclbin --ipu-insts-name=insts.txt ./aie.mlir +// RUN: %python aiecc.py --xbridge --aie-generate-cdo --aie-generate-npu --no-compile-host --xclbin-name=aie.xclbin --npu-insts-name=insts.txt ./aie.mlir // RUN: g++ %S/test.cpp -o test.exe -std=c++23 -Wall -DPASSTHROUGH_WIDTH=1920 -DPASSTHROUGH_HEIGHT=1080 -I%S/../../../runtime_lib/test_lib %S/../../../runtime_lib/test_lib/test_utils.cpp -I%S/../../utils %S/../../utils/OpenCVUtils.cpp %xrt_flags %opencv_flags -lrt -lstdc++ -lboost_program_options -lboost_filesystem -// RUN: %run_on_ipu ./test.exe -x aie.xclbin -k MLIR_AIE -i insts.txt | FileCheck %s +// RUN: %run_on_npu ./test.exe -x aie.xclbin -k MLIR_AIE -i insts.txt | FileCheck %s // CHECK: PASS! 
diff --git a/python/AIEMLIRModule.cpp b/python/AIEMLIRModule.cpp index da4dd754c6..588f96fae0 100644 --- a/python/AIEMLIRModule.cpp +++ b/python/AIEMLIRModule.cpp @@ -107,11 +107,11 @@ PYBIND11_MODULE(_aie, m) { "enable_cores"_a = true); m.def( - "ipu_instgen", + "npu_instgen", [&stealCStr](MlirOperation op) { - py::str ipuInstructions = stealCStr(aieTranslateToIPU(op)); + py::str npuInstructions = stealCStr(aieTranslateToNPU(op)); auto individualInstructions = - ipuInstructions.attr("split")().cast(); + npuInstructions.attr("split")().cast(); for (size_t i = 0; i < individualInstructions.size(); ++i) individualInstructions[i] = individualInstructions[i].attr("strip")(); return individualInstructions; diff --git a/python/XRTModule.cpp b/python/XRTModule.cpp index 9396f2073e..fb200f6650 100644 --- a/python/XRTModule.cpp +++ b/python/XRTModule.cpp @@ -25,8 +25,8 @@ namespace py = pybind11; using namespace py::literals; -// group_id 0 is for ipu instructions -// group_id 1 is for number of ipu instructions +// group_id 0 is for npu instructions +// group_id 1 is for number of npu instructions // host side buffers/args follow starting from position 2 // see aiecc.main.emit_design_kernel_json constexpr size_t HOST_BUFFERS_START_IDX = 2; @@ -44,14 +44,14 @@ class PyXCLBin { kernel = std::make_unique(*context, kernelName); } - void loadIPUInstructions(const std::vector &insts) { - ipuInstructions = + void loadNPUInstructions(const std::vector &insts) { + npuInstructions = std::make_unique(*device, insts.size() * sizeof(uint32_t), XCL_BO_FLAGS_CACHEABLE, kernel->group_id(0)); - uint32_t *bufInstr = ipuInstructions->map(); + uint32_t *bufInstr = npuInstructions->map(); for (size_t i = 0; i < insts.size(); ++i) bufInstr[i] = insts.at(i); - ipuInstructions->sync(XCL_BO_SYNC_BO_TO_DEVICE); + npuInstructions->sync(XCL_BO_SYNC_BO_TO_DEVICE); } template @@ -107,17 +107,17 @@ class PyXCLBin { void run() { run_ = std::make_unique(*kernel); - run_->set_arg(0, *ipuInstructions); - 
run_->set_arg(1, ipuInstructions->size()); + run_->set_arg(0, *npuInstructions); + run_->set_arg(1, npuInstructions->size()); for (size_t i = 0; i < buffers.size(); ++i) run_->set_arg(HOST_BUFFERS_START_IDX + i, *buffers[i]); run_->start(); } - void _runOnlyIpuInstructions() { + void _runOnlyNpuInstructions() { run_ = std::make_unique(*kernel); - run_->set_arg(0, *ipuInstructions); - run_->set_arg(1, ipuInstructions->size()); + run_->set_arg(0, *npuInstructions); + run_->set_arg(1, npuInstructions->size()); run_->start(); } @@ -133,7 +133,7 @@ class PyXCLBin { std::unique_ptr device; std::unique_ptr context; std::unique_ptr kernel; - std::unique_ptr ipuInstructions; + std::unique_ptr npuInstructions; std::vector> buffers; @@ -145,11 +145,11 @@ PYBIND11_MODULE(_xrt, m) { py::class_(m, "XCLBin", py::module_local()) .def(py::init(), "xclbin_path"_a, "kernel_name"_a, "device_index"_a = 0) - .def("load_ipu_instructions", &PyXCLBin::loadIPUInstructions, "insts"_a) + .def("load_npu_instructions", &PyXCLBin::loadNPUInstructions, "insts"_a) .def("sync_buffers_to_device", &PyXCLBin::syncBuffersToDevice) .def("sync_buffers_from_device", &PyXCLBin::syncBuffersFromDevice) .def("run", &PyXCLBin::run) - .def("_run_only_ipu_instructions", &PyXCLBin::_runOnlyIpuInstructions) + .def("_run_only_npu_instructions", &PyXCLBin::_runOnlyNpuInstructions) .def("wait", &PyXCLBin::wait, "timeout"_a = py::none()) .def( "mmap_buffers", diff --git a/python/_mlir_libs/_aie.pyi b/python/_mlir_libs/_aie.pyi index c37cf64e08..ad7497117c 100644 --- a/python/_mlir_libs/_aie.pyi +++ b/python/_mlir_libs/_aie.pyi @@ -11,7 +11,7 @@ __all__ = [ "generate_bcf", "generate_cdo", "generate_xaie", - "ipu_instgen", + "npu_instgen", "register_dialect", "translate_aie_vec_to_cpp", "translate_mlir_to_llvmir", @@ -31,7 +31,7 @@ def generate_cdo( enable_cores: bool = True, ) -> None: ... def generate_xaie(module: Operation) -> str: ... -def ipu_instgen(module: Operation) -> list: ... 
+def npu_instgen(module: Operation) -> list: ... def register_dialect(registry: DialectRegistry) -> None: ... def translate_aie_vec_to_cpp(module: Operation, aieml: bool = False) -> str: ... def translate_mlir_to_llvmir(module: Operation) -> str: ... diff --git a/python/_mlir_libs/_xrt.pyi b/python/_mlir_libs/_xrt.pyi index b912f76738..d08862a4a5 100644 --- a/python/_mlir_libs/_xrt.pyi +++ b/python/_mlir_libs/_xrt.pyi @@ -8,8 +8,8 @@ class XCLBin: self, xclbin_path: str, kernel_name: str, device_index: int = 0 ) -> None: ... def _get_buffer_host_address(self, arg0: int) -> int: ... - def _run_only_ipu_instructions(self) -> None: ... - def load_ipu_instructions(self, insts: list[int]) -> None: ... + def _run_only_npu_instructions(self) -> None: ... + def load_npu_instructions(self, insts: list[int]) -> None: ... def mmap_buffers( self, shapes: list[list[int]], np_format: typing.Any ) -> list[memoryview]: ... diff --git a/python/compiler/aiecc/cl_arguments.py b/python/compiler/aiecc/cl_arguments.py index 4979674955..a407dda971 100644 --- a/python/compiler/aiecc/cl_arguments.py +++ b/python/compiler/aiecc/cl_arguments.py @@ -195,26 +195,26 @@ def parse_args(args=None): help="Show progress visualization", ) parser.add_argument( - "--aie-generate-ipu", - dest="ipu", + "--aie-generate-npu", + dest="npu", default=False, action="store_const", const=True, - help="Generate ipu instruction stream", + help="Generate npu instruction stream", ) parser.add_argument( - "--aie-only-generate-ipu", - dest="only_ipu", + "--aie-only-generate-npu", + dest="only_npu", default=False, action="store_const", const=True, - help="Generate ipu instruction stream only", + help="Generate npu instruction stream only", ) parser.add_argument( - "--ipu-insts-name", + "--npu-insts-name", dest="insts_name", - default="ipu_insts.txt", - help="Output instructions filename for IPU target", + default="npu_insts.txt", + help="Output instructions filename for NPU target", ) parser.add_argument( 
"--aie-generate-cdo", diff --git a/python/compiler/aiecc/main.py b/python/compiler/aiecc/main.py index b687c978dc..35a268d245 100644 --- a/python/compiler/aiecc/main.py +++ b/python/compiler/aiecc/main.py @@ -89,7 +89,7 @@ CREATE_PATH_FINDER_FLOWS = Pipeline().Nested( "aie.device", Pipeline().add_pass("aie-create-pathfinder-flows") ) -DMA_TO_IPU = Pipeline().Nested("aie.device", Pipeline().add_pass("aie-dma-to-ipu")) +DMA_TO_NPU = Pipeline().Nested("aie.device", Pipeline().add_pass("aie-dma-to-npu")) async def read_file_async(file_path: str) -> str: @@ -1013,14 +1013,14 @@ async def run_flow(self): exit(-3) aie_peano_target = aie_target.lower() + "-none-elf" - # Optionally generate insts.txt for IPU instruction stream - if opts.ipu or opts.only_ipu: - generated_insts_mlir = self.prepend_tmp("generated_ipu_insts.mlir") + # Optionally generate insts.txt for NPU instruction stream + if opts.npu or opts.only_npu: + generated_insts_mlir = self.prepend_tmp("generated_npu_insts.mlir") await self.do_call( progress_bar.task, [ "aie-opt", - "--aie-dma-to-ipu", + "--aie-dma-to-npu", file_with_addresses, "-o", generated_insts_mlir, @@ -1030,13 +1030,13 @@ async def run_flow(self): progress_bar.task, [ "aie-translate", - "--aie-ipu-instgen", + "--aie-npu-instgen", generated_insts_mlir, "-o", opts.insts_name, ], ) - if opts.only_ipu: + if opts.only_npu: return chess_intrinsic_wrapper_ll_path = await self.prepare_for_chesshack( diff --git a/python/dialects/aie.py b/python/dialects/aie.py index ffdb1b46ee..35702449fa 100644 --- a/python/dialects/aie.py +++ b/python/dialects/aie.py @@ -21,7 +21,7 @@ generate_bcf, generate_cdo, generate_xaie, - ipu_instgen, + npu_instgen, register_dialect, translate_aie_vec_to_cpp, translate_mlir_to_llvmir, @@ -617,7 +617,7 @@ def find_neighbors(tile, device=None, logical=True): if device is None: device = find_parent_of_type(lambda op: isinstance(op, DeviceOp)) - assert int(device.device) == int(AIEDevice.ipu), "only ipu supported" + assert 
int(device.device) == int(AIEDevice.npu), "only npu supported" neighbors = {} col, row = map(int, (tile.col, tile.row)) diff --git a/python/dialects/aiex.py b/python/dialects/aiex.py index 685c91ccfd..1c8d59ac2d 100644 --- a/python/dialects/aiex.py +++ b/python/dialects/aiex.py @@ -32,11 +32,11 @@ # Comes from _aie register_dialect(get_dialect_registry()) -ipu_sync = partial(ipu_sync, column_num=1, row_num=1) +npu_sync = partial(npu_sync, column_num=1, row_num=1) -class IpuDmaMemcpyNd(IpuDmaMemcpyNdOp): - """Specialize IpuDmaMemcpyNdOp class constructor to take python integers""" +class NpuDmaMemcpyNd(NpuDmaMemcpyNdOp): + """Specialize NpuDmaMemcpyNdOp class constructor to take python integers""" def __init__( self, @@ -77,7 +77,7 @@ def __init__( ) -ipu_dma_memcpy_nd = IpuDmaMemcpyNd +npu_dma_memcpy_nd = NpuDmaMemcpyNd _PROLOG = [ @@ -119,8 +119,8 @@ def _get_prolog(): return _PROLOG[:] -# based on https://github.com/Xilinx/mlir-aie/blob/cb232a43383ef3b8efd8b408545c9b74885578ad/lib/Targets/AIETargetIPU.cpp -def _ipu_sync(column, row=0, direction=0, channel=0, column_num=1, row_num=1): +# based on https://github.com/Xilinx/mlir-aie/blob/cb232a43383ef3b8efd8b408545c9b74885578ad/lib/Targets/AIETargetNPU.cpp +def _npu_sync(column, row=0, direction=0, channel=0, column_num=1, row_num=1): if isinstance(channel, IntegerAttr): channel = int(channel) words = [None] * 2 @@ -137,7 +137,7 @@ def _ipu_sync(column, row=0, direction=0, channel=0, column_num=1, row_num=1): return words -def _ipu_write32(column, row, address, value): +def _npu_write32(column, row, address, value): words = [None] * 3 op_code = 2 words[0] = (op_code & 0xFF) << 24 @@ -149,7 +149,7 @@ def _ipu_write32(column, row, address, value): return words -def _ipu_shimtile_push_queue(channel_dir, channel_index, column, bd_id, repeats=0): +def _npu_shimtile_push_queue(channel_dir, channel_index, column, bd_id, repeats=0): if isinstance(channel_index, IntegerAttr): channel_index = int(channel_index) if channel_dir 
== DMAChannelDir.MM2S: @@ -165,7 +165,7 @@ def _ipu_shimtile_push_queue(channel_dir, channel_index, column, bd_id, repeats= value |= XAIEMLGBL_NOC_MODULE_DMA_S2MM_0_TASK_QUEUE_ENABLE_TOKEN_ISSUE_MASK row = 0 - return _ipu_write32(column, row, address, value) + return _npu_write32(column, row, address, value) # based on ExecWriteBdExtendShimTileOpt @ dpufw/src/include/RunInstOpt.h:666 @@ -181,14 +181,14 @@ def _exec_write_bd_extend_shim_tile_opt(iptr, tensor_addr): write_addr = SHIM_DMA_BD0_BASE_ADDR + (bd_id * SHIM_BD_OFFSET) row = 0 words = [ - *_ipu_write32(column, row, write_addr, iptr[2]), - *_ipu_write32(column, row, write_addr + 4, word3), - *_ipu_write32(column, row, write_addr + 8, word4), - *_ipu_write32(column, row, write_addr + 12, iptr[5]), - *_ipu_write32(column, row, write_addr + 16, iptr[6]), - *_ipu_write32(column, row, write_addr + 20, iptr[7]), - *_ipu_write32(column, row, write_addr + 24, iptr[8]), - *_ipu_write32(column, row, write_addr + 28, iptr[9]), + *_npu_write32(column, row, write_addr, iptr[2]), + *_npu_write32(column, row, write_addr + 4, word3), + *_npu_write32(column, row, write_addr + 8, word4), + *_npu_write32(column, row, write_addr + 12, iptr[5]), + *_npu_write32(column, row, write_addr + 16, iptr[6]), + *_npu_write32(column, row, write_addr + 20, iptr[7]), + *_npu_write32(column, row, write_addr + 24, iptr[8]), + *_npu_write32(column, row, write_addr + 28, iptr[9]), ] return words @@ -202,14 +202,14 @@ def _update_tensor_addr_shim_tile(column, bd_id, tensor_addr, buffer_offset=0): write_addr = SHIM_DMA_BD0_BASE_ADDR + (bd_id * SHIM_BD_OFFSET) row = 0 words = [ - *_ipu_write32(column, row, write_addr + 4, word3), - *_ipu_write32(column, row, write_addr + 8, word4), + *_npu_write32(column, row, write_addr + 4, word3), + *_npu_write32(column, row, write_addr + 8, word4), ] return words # corresponds to ExecWriteBdExtendShimTileOpt -def _ipu_writebd_shimtile( +def _npu_writebd_shimtile( column, bd_id, buffer_length, @@ -304,26 +304,26 
@@ def _ipu_writebd_shimtile( return words -def _ipu_noop(): +def _npu_noop(): words = [None] * 1 op_code = 0 words[0] = (op_code & 0xFF) << 24 return words -def _ipu_core_enable(column, row): +def _npu_core_enable(column, row): # note this clears the reset bit - return _ipu_write32(column, row, XAIEMLGBL_CORE_MODULE_CORE_CONTROL, 1) + return _npu_write32(column, row, XAIEMLGBL_CORE_MODULE_CORE_CONTROL, 1) -class ipu: - noop = _ipu_noop - write32 = _ipu_write32 - shimtile_push_queue = _ipu_shimtile_push_queue - writebd_shimtile = _ipu_writebd_shimtile - sync = _ipu_sync +class npu: + noop = _npu_noop + write32 = _npu_write32 + shimtile_push_queue = _npu_shimtile_push_queue + writebd_shimtile = _npu_writebd_shimtile + sync = _npu_sync get_prolog = _get_prolog - enable_cores = _ipu_core_enable + enable_cores = _npu_core_enable _exec_write_bd_extend_shim_tile_opt = _exec_write_bd_extend_shim_tile_opt _update_tensor_addr_shim_tile = _update_tensor_addr_shim_tile diff --git a/python/utils/README.md b/python/utils/README.md index b3223ed697..9e54561aa1 100644 --- a/python/utils/README.md +++ b/python/utils/README.md @@ -55,7 +55,7 @@ Test/ Host code utilities. * This function abstracts a number of python functions for configuring a core tile and an associated shim tile. It does not define the trace packet routing betweent he two however. To better appreciate what this wrapper function does, we need to delve more deeply into the details on how trace units are configured. -Within the `func.func @sequence` block, we add a set of configuration register writes (`aiex.ipu.write32`) to configure the tile trace units and the shimDMA. +Within the `func.func @sequence` block, we add a set of configuration register writes (`aiex.npu.write32`) to configure the tile trace units and the shimDMA. ### How to configure wrapper and default values The minimum function call we need is: ```python @@ -105,14 +105,14 @@ The table below describes the general trace control registers. 
in C/C++ ```c++ // Start event = 1, Stop event = 0, Mode = event-time -aiex.ipu.write32 { column = 0 : i32, row = 4 : i32, address = 0x340D0 : ui32, value = 0x10000 : ui32 } -aiex.ipu.write32 { column = 0 : i32, row = 4 : i32, address = 0x340D4 : ui32, value = 0x0 : ui32 } +aiex.npu.write32 { column = 0 : i32, row = 4 : i32, address = 0x340D0 : ui32, value = 0x10000 : ui32 } +aiex.npu.write32 { column = 0 : i32, row = 4 : i32, address = 0x340D4 : ui32, value = 0x0 : ui32 } ``` in Python ```python # Start event = 1, Stop event = 0, Mode = event-time -ipu_write32(column=0, row=4, address=0x340D0, value=pack4bytes(stop, start, 0, 0),) -ipu_write32(column=0, row=4, address=0x340D4, value=0,) +npu_write32(column=0, row=4, address=0x340D0, value=pack4bytes(stop, start, 0, 0),) +npu_write32(column=0, row=4, address=0x340D4, value=0,) ``` The table below describes which events the trace hardware monitors. @@ -160,7 +160,7 @@ in C/C++ // Core Instruction - Event 0 (0x21) // Core Instruction - Event 1 (0x22) // Core Port Running 0 (0x4B) -aiex.ipu.write32 { column = 0 : i32, row = 4 : i32, address = 0x340E0 : ui32, value = 0x4B222125 : ui32 } +aiex.npu.write32 { column = 0 : i32, row = 4 : i32, address = 0x340E0 : ui32, value = 0x4B222125 : ui32 } // Events 4-7 monitored // ------------------------ @@ -168,13 +168,13 @@ aiex.ipu.write32 { column = 0 : i32, row = 4 : i32, address = 0x340E0 : ui32, va // Lock stalls (0x1A) // Lock acquire requests (0x2C) // Lock release requests (0x2D) -aiex.ipu.write32 { column = 0 : i32, row = 4 : i32, address = 0x340E4 : ui32, value = 0x2D2C1A4F : ui32 } +aiex.npu.write32 { column = 0 : i32, row = 4 : i32, address = 0x340E4 : ui32, value = 0x2D2C1A4F : ui32 } ``` in Python ```python # events=[0x4B, 0x22, 0x21, 0x25, 0x2D, 0x2C, 0x1A, 0x4F] -ipu_write32(column=0, row=4, address=0x340E0, value=*events[0:4],) -ipu_write32(column=0, row=4, address=0x340E4, value=*events[4:8],) +npu_write32(column=0, row=4, address=0x340E0, value=*events[0:4],) 
+npu_write32(column=0, row=4, address=0x340E4, value=*events[4:8],) ``` Some configurations like the Port Running 0/1 events are further configured by a secondary configuration register. In this case, we route the port activity from the stream switch to Port running 0 or 1. @@ -204,7 +204,7 @@ in C/C++ // Stream_Switch_Event_Port_Selection_0 // This is necessary to capture the Port_Running_0 and Port_Running_1 events // Port 0 - Master/ID=1, Port 1 - Slave/ID=1 -aiex.ipu.write32 { column = 0 : i32, row = 4 : i32, address = 0x3FF00 : ui32, value = 0x121 : ui32 } +aiex.npu.write32 { column = 0 : i32, row = 4 : i32, address = 0x3FF00 : ui32, value = 0x121 : ui32 } ``` in Python ```python @@ -214,8 +214,8 @@ in Python # def slave(port): # return port -ipu_write32(column=0, row=4, address=0x3FF00, value=pack4bytes(0, 0, slave(1), master(1)),) # port 1 is FIFO0? -ipu_write32(column=0, row=4, address=0x3FF04, value=pack4bytes(0, 0, 0, 0),) +npu_write32(column=0, row=4, address=0x3FF00, value=pack4bytes(0, 0, slave(1), master(1)),) # port 1 is FIFO0? 
+npu_write32(column=0, row=4, address=0x3FF04, value=pack4bytes(0, 0, 0, 0),) ``` ### Configure shimDMA @@ -239,7 +239,7 @@ An example ddr_id to inout buffer mapping is below: in C/C++ ```c++ -aiex.ipu.writebd_shimtile { bd_id = 3 : i32, +aiex.npu.writebd_shimtile { bd_id = 3 : i32, buffer_length = 16384 : i32, buffer_offset = 262144 : i32, enable_packet = 0 : i32, @@ -274,11 +274,11 @@ aiex.ipu.writebd_shimtile { bd_id = 3 : i32, use_next_bd = 0 : i32, valid_bd = 1 : i32} // Set start BD to our shim bd_Id (3) -aiex.ipu.write32 { column = 0 : i32, row = 0 : i32, address = 0x1D20C : ui32, value = 0x3 : ui32 } +aiex.npu.write32 { column = 0 : i32, row = 0 : i32, address = 0x1D20C : ui32, value = 0x3 : ui32 } ``` in Python ```python -ipu_writebd_shimtile( +npu_writebd_shimtile( bd_id=3, buffer_length=16384, buffer_offset=262144, diff --git a/python/utils/trace.py b/python/utils/trace.py index 2f60b587da..510d7feae7 100644 --- a/python/utils/trace.py +++ b/python/utils/trace.py @@ -93,7 +93,7 @@ def configure_simple_tracing_aie2( # BB <- Event to start trace capture # C <- Trace mode, 00=event=time, 01=event-PC, 10=execution # Configure so that "Event 1" (always true) causes tracing to start - ipu_write32( + npu_write32( column=int(tile.col), row=int(tile.row), address=0x340D0, @@ -102,7 +102,7 @@ def configure_simple_tracing_aie2( # 0x340D4: Trace Control 1 # This is used to control packet routing. For the moment # only deal with the simple case of circuit routing. 
- ipu_write32( + npu_write32( column=int(tile.col), row=int(tile.row), address=0x340D4, @@ -110,7 +110,7 @@ def configure_simple_tracing_aie2( ) # 0x340E0: Trace Event Group 1 (Which events to trace) # 0xAABBCCDD AA, BB, CC, DD <- four event slots - ipu_write32( + npu_write32( column=int(tile.col), row=int(tile.row), address=0x340E0, @@ -118,7 +118,7 @@ def configure_simple_tracing_aie2( ) # 0x340E4: Trace Event Group 2 (Which events to trace) # 0xAABBCCDD AA, BB, CC, DD <- four event slots - ipu_write32( + npu_write32( column=int(tile.col), row=int(tile.row), address=0x340E4, @@ -132,13 +132,13 @@ def master(port): def slave(port): return port - ipu_write32( + npu_write32( column=int(tile.col), row=int(tile.row), address=0x3FF00, value=pack4bytes(0, 0, slave(1), master(1)), # port 1 is FIFO0? ) - ipu_write32( + npu_write32( column=int(tile.col), row=int(tile.row), address=0x3FF04, @@ -147,7 +147,7 @@ def slave(port): # Configure a buffer descriptor to write tracing information that has been routed into this shim tile # out to host DDR memory - ipu_writebd_shimtile( + npu_writebd_shimtile( bd_id=bd_id, buffer_length=size, buffer_offset=offset, @@ -176,7 +176,7 @@ def slave(port): valid_bd=1, ) # configure S2MM channel - ipu_write32( + npu_write32( column=int(shim.col), row=int(shim.row), address=0x1D204 if channel == 0 else 0x1D20C, diff --git a/test/Conversion/DmaToIpu/aiert_insts.mlir b/test/Conversion/DmaToNpu/aiert_insts.mlir similarity index 83% rename from test/Conversion/DmaToIpu/aiert_insts.mlir rename to test/Conversion/DmaToNpu/aiert_insts.mlir index ce82a1443e..bfcbe334ee 100644 --- a/test/Conversion/DmaToIpu/aiert_insts.mlir +++ b/test/Conversion/DmaToNpu/aiert_insts.mlir @@ -6,14 +6,14 @@ // //===----------------------------------------------------------------------===// -// RUN: aie-opt --aie-dma-to-ipu %s | FileCheck %s -// CHECK: aiex.ipu.writebd_shimtile {bd_id = 1 : i32, buffer_length = 32 : i32, buffer_offset = 0 : i32, column = 0 : i32, 
column_num = 1 : i32, d0_size = 0 : i32, d0_stride = 0 : i32, d1_size = 0 : i32, d1_stride = 0 : i32, d2_stride = 0 : i32, ddr_id = 2 : i32, enable_packet = 0 : i32, iteration_current = 0 : i32, iteration_size = 0 : i32, iteration_stride = 0 : i32, lock_acq_enable = 0 : i32, lock_acq_id = 0 : i32, lock_acq_val = 0 : i32, lock_rel_id = 0 : i32, lock_rel_val = 0 : i32, next_bd = 0 : i32, out_of_order_id = 0 : i32, packet_id = 0 : i32, packet_type = 0 : i32, use_next_bd = 0 : i32, valid_bd = 1 : i32} -// CHECK: aiex.ipu.write32 {address = 119300 : ui32, column = 0 : i32, row = 0 : i32, value = 2147483649 : ui32} -// CHECK: aiex.ipu.writebd_shimtile {bd_id = 0 : i32, buffer_length = 32 : i32, buffer_offset = 128 : i32, column = 0 : i32, column_num = 1 : i32, d0_size = 8 : i32, d0_stride = 0 : i32, d1_size = 2 : i32, d1_stride = 7 : i32, d2_stride = 15 : i32, ddr_id = 0 : i32, enable_packet = 0 : i32, iteration_current = 0 : i32, iteration_size = 0 : i32, iteration_stride = 0 : i32, lock_acq_enable = 0 : i32, lock_acq_id = 0 : i32, lock_acq_val = 0 : i32, lock_rel_id = 0 : i32, lock_rel_val = 0 : i32, next_bd = 0 : i32, out_of_order_id = 0 : i32, packet_id = 0 : i32, packet_type = 0 : i32, use_next_bd = 0 : i32, valid_bd = 1 : i32} -// CHECK: aiex.ipu.write32 {address = 119316 : ui32, column = 0 : i32, row = 0 : i32, value = 0 : ui32} +// RUN: aie-opt --aie-dma-to-npu %s | FileCheck %s +// CHECK: aiex.npu.writebd_shimtile {bd_id = 1 : i32, buffer_length = 32 : i32, buffer_offset = 0 : i32, column = 0 : i32, column_num = 1 : i32, d0_size = 0 : i32, d0_stride = 0 : i32, d1_size = 0 : i32, d1_stride = 0 : i32, d2_stride = 0 : i32, ddr_id = 2 : i32, enable_packet = 0 : i32, iteration_current = 0 : i32, iteration_size = 0 : i32, iteration_stride = 0 : i32, lock_acq_enable = 0 : i32, lock_acq_id = 0 : i32, lock_acq_val = 0 : i32, lock_rel_id = 0 : i32, lock_rel_val = 0 : i32, next_bd = 0 : i32, out_of_order_id = 0 : i32, packet_id = 0 : i32, packet_type = 0 : i32, use_next_bd 
= 0 : i32, valid_bd = 1 : i32} +// CHECK: aiex.npu.write32 {address = 119300 : ui32, column = 0 : i32, row = 0 : i32, value = 2147483649 : ui32} +// CHECK: aiex.npu.writebd_shimtile {bd_id = 0 : i32, buffer_length = 32 : i32, buffer_offset = 128 : i32, column = 0 : i32, column_num = 1 : i32, d0_size = 8 : i32, d0_stride = 0 : i32, d1_size = 2 : i32, d1_stride = 7 : i32, d2_stride = 15 : i32, ddr_id = 0 : i32, enable_packet = 0 : i32, iteration_current = 0 : i32, iteration_size = 0 : i32, iteration_stride = 0 : i32, lock_acq_enable = 0 : i32, lock_acq_id = 0 : i32, lock_acq_val = 0 : i32, lock_rel_id = 0 : i32, lock_rel_val = 0 : i32, next_bd = 0 : i32, out_of_order_id = 0 : i32, packet_id = 0 : i32, packet_type = 0 : i32, use_next_bd = 0 : i32, valid_bd = 1 : i32} +// CHECK: aiex.npu.write32 {address = 119316 : ui32, column = 0 : i32, row = 0 : i32, value = 0 : ui32} module { - aie.device(ipu) { + aie.device(npu) { memref.global "public" @of_toMem : memref<32xi32> memref.global "public" @of_fromMem : memref<32xi32> func.func @sequence(%in : memref<4x2x8xi32>, %buf : memref<32xi32>, %out : memref<64xi32>) { @@ -24,8 +24,8 @@ module { %c8 = arith.constant 8 : i64 %c16 = arith.constant 16 : i64 %c32 = arith.constant 32 : i64 - aiex.ipu.dma_memcpy_nd(0, 0, %out[%c0,%c0,%c0,%c0][%c1,%c1,%c1,%c32][%c0,%c0,%c0]) { metadata = @of_toMem, id = 1 : i64 } : memref<64xi32> - aiex.ipu.dma_memcpy_nd(0, 0, %in[%c0,%c2,%c0,%c0][%c1,%c2,%c2,%c8][%c0,%c16,%c8]) { metadata = @of_fromMem, id = 0 : i64 } : memref<4x2x8xi32> + aiex.npu.dma_memcpy_nd(0, 0, %out[%c0,%c0,%c0,%c0][%c1,%c1,%c1,%c32][%c0,%c0,%c0]) { metadata = @of_toMem, id = 1 : i64 } : memref<64xi32> + aiex.npu.dma_memcpy_nd(0, 0, %in[%c0,%c2,%c0,%c0][%c1,%c2,%c2,%c8][%c0,%c16,%c8]) { metadata = @of_fromMem, id = 0 : i64 } : memref<4x2x8xi32> return } aie.shim_dma_allocation @of_fromMem (MM2S, 0, 0) diff --git a/test/Conversion/DmaToIpu/bad_rtp_write.mlir b/test/Conversion/DmaToNpu/bad_rtp_write.mlir similarity index 62% 
rename from test/Conversion/DmaToIpu/bad_rtp_write.mlir rename to test/Conversion/DmaToNpu/bad_rtp_write.mlir index a28466af13..746df29273 100644 --- a/test/Conversion/DmaToIpu/bad_rtp_write.mlir +++ b/test/Conversion/DmaToNpu/bad_rtp_write.mlir @@ -6,13 +6,13 @@ // //===----------------------------------------------------------------------===// -// RUN: aie-opt --aie-dma-to-ipu -verify-diagnostics %s +// RUN: aie-opt --aie-dma-to-npu -verify-diagnostics %s -aie.device(ipu) { +aie.device(npu) { func.func @sequence() { - // expected-error@+2 {{'aiex.ipu.rtp_write' op RTP buffer address cannot be found. Has an RTP buffer been allocated?}} - // expected-error@+1 {{failed to legalize operation 'aiex.ipu.rtp_write' that was explicitly marked illegal}} - aiex.ipu.rtp_write(0, 2, 4, 99) { buffer_sym_name = "RTP" } + // expected-error@+2 {{'aiex.npu.rtp_write' op RTP buffer address cannot be found. Has an RTP buffer been allocated?}} + // expected-error@+1 {{failed to legalize operation 'aiex.npu.rtp_write' that was explicitly marked illegal}} + aiex.npu.rtp_write(0, 2, 4, 99) { buffer_sym_name = "RTP" } return } } diff --git a/test/Conversion/DmaToIpu/dma_to_ipu.mlir b/test/Conversion/DmaToNpu/dma_to_npu.mlir similarity index 73% rename from test/Conversion/DmaToIpu/dma_to_ipu.mlir rename to test/Conversion/DmaToNpu/dma_to_npu.mlir index d86de2acce..059766fe7c 100644 --- a/test/Conversion/DmaToIpu/dma_to_ipu.mlir +++ b/test/Conversion/DmaToNpu/dma_to_npu.mlir @@ -1,4 +1,4 @@ -//===- dma_to_ipu.mlir -----------------------------------------*- MLIR -*-===// +//===- dma_to_npu.mlir -----------------------------------------*- MLIR -*-===// // // This file is licensed under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. 
@@ -8,22 +8,22 @@ // //===----------------------------------------------------------------------===// -// RUN: aie-opt --split-input-file -aie-dma-to-ipu %s | FileCheck %s +// RUN: aie-opt --split-input-file -aie-dma-to-npu %s | FileCheck %s // TODO - more // CHECK-LABEL: dma_memcpy_nd_0 -// CHECK: aiex.ipu.writebd_shimtile +// CHECK: aiex.npu.writebd_shimtile // CHECK-SAME: ddr_id = 0 : i32 // CHECK-SAME: valid_bd = 1 : i32 -// CHECK: aiex.ipu.writebd_shimtile +// CHECK: aiex.npu.writebd_shimtile // CHECK-SAME: ddr_id = 1 : i32 module { - aie.device(ipu) { + aie.device(npu) { memref.global "public" @toMem : memref<16xi32> memref.global "public" @fromMem : memref<16xi32> func.func @dma_memcpy_nd_0(%arg0: memref<16xi32>, %arg1: memref<16xi32>) { - aiex.ipu.dma_memcpy_nd(0, 0, %arg0[0, 0, 0, 0][1, 1, 16, 16][0, 0, 64]) { metadata = @toMem, id = 1 : i64 } : memref<16xi32> - aiex.ipu.dma_memcpy_nd(0, 1, %arg1[0, 0, 0, 16][1, 1, 16, 16][0, 0, 64]) { metadata = @fromMem, id = 0 : i64 } : memref<16xi32> + aiex.npu.dma_memcpy_nd(0, 0, %arg0[0, 0, 0, 0][1, 1, 16, 16][0, 0, 64]) { metadata = @toMem, id = 1 : i64 } : memref<16xi32> + aiex.npu.dma_memcpy_nd(0, 1, %arg1[0, 0, 0, 16][1, 1, 16, 16][0, 0, 64]) { metadata = @fromMem, id = 0 : i64 } : memref<16xi32> return } aie.shim_dma_allocation @fromMem (MM2S, 0, 0) @@ -34,11 +34,11 @@ module { // ----- // CHECK-LABEL: dma_wait_s2mm -// CHECK: aiex.ipu.writebd_shimtile +// CHECK: aiex.npu.writebd_shimtile // CHECK-SAME: ddr_id = 0 : i32 // CHECK-SAME: valid_bd = 1 : i32 -// CHECK: aiex.ipu.write32 -// CHECK: aiex.ipu.sync +// CHECK: aiex.npu.write32 +// CHECK: aiex.npu.sync // CHECK-SAME: channel = 0 : i32 // CHECK-SAME: column = 0 : i32 // CHECK-SAME: column_num = 1 : i32 @@ -46,11 +46,11 @@ module { // CHECK-SAME: row = 0 : i32 // CHECK-SAME: row_num = 1 : i32 module { - aie.device(ipu) { + aie.device(npu) { memref.global "public" @toMem : memref<16xi32> func.func @dma_wait_s2mm(%arg0: memref<16xi32>, %arg1: memref<16xi32>) { 
- aiex.ipu.dma_memcpy_nd(0, 0, %arg0[0, 0, 0, 0][1, 1, 16, 16][0, 0, 64]) { metadata = @toMem, id = 1 : i64 } : memref<16xi32> - aiex.ipu.dma_wait {symbol = @toMem} + aiex.npu.dma_memcpy_nd(0, 0, %arg0[0, 0, 0, 0][1, 1, 16, 16][0, 0, 64]) { metadata = @toMem, id = 1 : i64 } : memref<16xi32> + aiex.npu.dma_wait {symbol = @toMem} return } aie.shim_dma_allocation @toMem (S2MM, 0, 0) @@ -60,11 +60,11 @@ module { // ----- // CHECK-LABEL: dma_wait_mm2s -// CHECK: aiex.ipu.writebd_shimtile +// CHECK: aiex.npu.writebd_shimtile // CHECK-SAME: ddr_id = 0 : i32 // CHECK-SAME: valid_bd = 1 : i32 -// CHECK: aiex.ipu.write32 -// CHECK: aiex.ipu.sync +// CHECK: aiex.npu.write32 +// CHECK: aiex.npu.sync // CHECK-SAME: channel = 1 : i32 // CHECK-SAME: column = 1 : i32 // CHECK-SAME: column_num = 1 : i32 @@ -72,11 +72,11 @@ module { // CHECK-SAME: row = 0 : i32 // CHECK-SAME: row_num = 1 : i32 module { - aie.device(ipu) { + aie.device(npu) { memref.global "public" @toMem : memref<16xi32> func.func @dma_wait_mm2s(%arg0: memref<16xi32>, %arg1: memref<16xi32>) { - aiex.ipu.dma_memcpy_nd(0, 0, %arg0[0, 0, 0, 0][1, 1, 16, 16][0, 0, 64]) { metadata = @toMem, id = 1 : i64 } : memref<16xi32> - aiex.ipu.dma_wait {symbol = @toMem} + aiex.npu.dma_memcpy_nd(0, 0, %arg0[0, 0, 0, 0][1, 1, 16, 16][0, 0, 64]) { metadata = @toMem, id = 1 : i64 } : memref<16xi32> + aiex.npu.dma_wait {symbol = @toMem} return } aie.shim_dma_allocation @toMem (MM2S, 1, 1) diff --git a/test/Conversion/DmaToIpu/dma_to_ipu_invalid.mlir b/test/Conversion/DmaToNpu/dma_to_npu_invalid.mlir similarity index 71% rename from test/Conversion/DmaToIpu/dma_to_ipu_invalid.mlir rename to test/Conversion/DmaToNpu/dma_to_npu_invalid.mlir index 89eff26d44..31ed2ed019 100644 --- a/test/Conversion/DmaToIpu/dma_to_ipu_invalid.mlir +++ b/test/Conversion/DmaToNpu/dma_to_npu_invalid.mlir @@ -1,4 +1,4 @@ -//===- dma_to_ipu_invalid.mlir ---------------------------------*- MLIR -*-===// +//===- dma_to_npu_invalid.mlir 
---------------------------------*- MLIR -*-===// // // This file is licensed under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. @@ -8,15 +8,15 @@ // //===----------------------------------------------------------------------===// -// RUN: aie-opt --split-input-file --aie-dma-to-ipu --verify-diagnostics %s +// RUN: aie-opt --split-input-file --aie-dma-to-npu --verify-diagnostics %s module { - aie.device(ipu) { + aie.device(npu) { memref.global "public" @toMem : memref<16xi32> func.func @sequence() { - // expected-error@+2 {{failed to legalize operation 'aiex.ipu.dma_wait' that was explicitly marked illegal}} + // expected-error@+2 {{failed to legalize operation 'aiex.npu.dma_wait' that was explicitly marked illegal}} // expected-error@+1 {{couldn't find shim_dma_allocation op}} - aiex.ipu.dma_wait {symbol = @toMem} + aiex.npu.dma_wait {symbol = @toMem} return } } diff --git a/test/Conversion/DmaToIpu/dma_to_ipu_issue_token.mlir b/test/Conversion/DmaToNpu/dma_to_npu_issue_token.mlir similarity index 72% rename from test/Conversion/DmaToIpu/dma_to_ipu_issue_token.mlir rename to test/Conversion/DmaToNpu/dma_to_npu_issue_token.mlir index 4eb5b02bdc..d73195973b 100644 --- a/test/Conversion/DmaToIpu/dma_to_ipu_issue_token.mlir +++ b/test/Conversion/DmaToNpu/dma_to_npu_issue_token.mlir @@ -1,4 +1,4 @@ -//===- dma_to_ipu_issue_token.mlir -----------------------------*- MLIR -*-===// +//===- dma_to_npu_issue_token.mlir -----------------------------*- MLIR -*-===// // // This file is licensed under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. 
@@ -8,26 +8,26 @@ // //===----------------------------------------------------------------------===// -// RUN: aie-opt -aie-dma-to-ipu %s | FileCheck %s +// RUN: aie-opt -aie-dma-to-npu %s | FileCheck %s // TODO - more // CHECK-LABEL: test1 -// CHECK: aiex.ipu.writebd_shimtile +// CHECK: aiex.npu.writebd_shimtile // CHECK-SAME: ddr_id = 0 : i32 // CHECK-SAME: valid_bd = 1 : i32 -// CHECK: aiex.ipu.write32 +// CHECK: aiex.npu.write32 // CHECK-SAME: value = 2147483649 -// CHECK: aiex.ipu.writebd_shimtile +// CHECK: aiex.npu.writebd_shimtile // CHECK-SAME: ddr_id = 1 : i32 -// CHECK: aiex.ipu.write32 +// CHECK: aiex.npu.write32 // CHECK-SAME: value = 0 module { - aie.device(ipu) { + aie.device(npu) { memref.global "public" @toMem : memref<16xi32> memref.global "public" @fromMem : memref<16xi32> func.func @test1(%arg0: memref<16xi32>, %arg1: memref<16xi32>) { - aiex.ipu.dma_memcpy_nd(0, 0, %arg0[0, 0, 0, 0][1, 1, 16, 16][0, 0, 64]) { metadata = @toMem, id = 1 : i64, issue_token = true } : memref<16xi32> - aiex.ipu.dma_memcpy_nd(0, 1, %arg1[0, 0, 0, 16][1, 1, 16, 16][0, 0, 64]) { metadata = @fromMem, id = 0 : i64 } : memref<16xi32> + aiex.npu.dma_memcpy_nd(0, 0, %arg0[0, 0, 0, 0][1, 1, 16, 16][0, 0, 64]) { metadata = @toMem, id = 1 : i64, issue_token = true } : memref<16xi32> + aiex.npu.dma_memcpy_nd(0, 1, %arg1[0, 0, 0, 16][1, 1, 16, 16][0, 0, 64]) { metadata = @fromMem, id = 0 : i64 } : memref<16xi32> return } aie.shim_dma_allocation @fromMem (MM2S, 0, 0) diff --git a/test/Conversion/DmaToIpu/push_to_queue.mlir b/test/Conversion/DmaToNpu/push_to_queue.mlir similarity index 69% rename from test/Conversion/DmaToIpu/push_to_queue.mlir rename to test/Conversion/DmaToNpu/push_to_queue.mlir index 841d9e7a0f..4c45c90e1d 100644 --- a/test/Conversion/DmaToIpu/push_to_queue.mlir +++ b/test/Conversion/DmaToNpu/push_to_queue.mlir @@ -6,17 +6,17 @@ // //===----------------------------------------------------------------------===// -// RUN: aie-opt --aie-dma-to-ipu %s | FileCheck 
%s -// CHECK: aiex.ipu.write32 {address = 119308 : ui32, column = 0 : i32, row = 0 : i32, value = 2147483651 : ui32} -// CHECK: aiex.ipu.write32 {address = 119316 : ui32, column = 2 : i32, row = 0 : i32, value = 196610 : ui32} +// RUN: aie-opt --aie-dma-to-npu %s | FileCheck %s +// CHECK: aiex.npu.write32 {address = 119308 : ui32, column = 0 : i32, row = 0 : i32, value = 2147483651 : ui32} +// CHECK: aiex.npu.write32 {address = 119316 : ui32, column = 2 : i32, row = 0 : i32, value = 196610 : ui32} module { - aie.device(ipu) { + aie.device(npu) { memref.global "public" @toMem : memref<32xi32> memref.global "public" @fromMem : memref<32xi32> func.func @sequence() { - aiex.ipu.shimtile_push_queue {metadata = @toMem, issue_token = true, repeat_count = 0 : i32, bd_id = 3 : i32 } - aiex.ipu.shimtile_push_queue {metadata = @fromMem, issue_token = false, repeat_count = 3 : i32, bd_id = 2 : i32 } + aiex.npu.shimtile_push_queue {metadata = @toMem, issue_token = true, repeat_count = 0 : i32, bd_id = 3 : i32 } + aiex.npu.shimtile_push_queue {metadata = @fromMem, issue_token = false, repeat_count = 3 : i32, bd_id = 2 : i32 } return } aie.shim_dma_allocation @fromMem (MM2S, 0, 2) diff --git a/test/Conversion/DmaToIpu/rtp_write.mlir b/test/Conversion/DmaToNpu/rtp_write.mlir similarity index 66% rename from test/Conversion/DmaToIpu/rtp_write.mlir rename to test/Conversion/DmaToNpu/rtp_write.mlir index 9aba5ad4e7..26f2876b95 100644 --- a/test/Conversion/DmaToIpu/rtp_write.mlir +++ b/test/Conversion/DmaToNpu/rtp_write.mlir @@ -6,19 +6,19 @@ // //===----------------------------------------------------------------------===// -// RUN: aie-opt --aie-dma-to-ipu %s | FileCheck %s -// CHECK: aiex.ipu.write32 {address = 1536 : ui32, column = 2 : i32, row = 3 : i32, value = 50 : ui32} -// CHECK: aiex.ipu.write32 {address = 3216 : ui32, column = 0 : i32, row = 2 : i32, value = 99 : ui32} +// RUN: aie-opt --aie-dma-to-npu %s | FileCheck %s +// CHECK: aiex.npu.write32 {address = 1536 : ui32, 
column = 2 : i32, row = 3 : i32, value = 50 : ui32} +// CHECK: aiex.npu.write32 {address = 3216 : ui32, column = 0 : i32, row = 2 : i32, value = 99 : ui32} module { - aie.device(ipu) { + aie.device(npu) { %0 = aie.tile(2, 3) %1 = aie.buffer(%0) {address = 1536 : i32, sym_name = "rtp"} : memref<16xi32> %2 = aie.tile(0, 2) %3 = aie.buffer(%2) {address = 3200 : i32, sym_name = "RTP"} : memref<16xi32> func.func @sequence() { - aiex.ipu.rtp_write(2, 3, 0, 50) { buffer_sym_name = "rtp" } - aiex.ipu.rtp_write(0, 2, 4, 99) { buffer_sym_name = "RTP" } + aiex.npu.rtp_write(2, 3, 0, 50) { buffer_sym_name = "rtp" } + aiex.npu.rtp_write(0, 2, 4, 99) { buffer_sym_name = "RTP" } return } } diff --git a/test/Passes/assign-bd-ids/bad_bd_assignments.mlir b/test/Passes/assign-bd-ids/bad_bd_assignments.mlir index 71ab96951f..9ab8036f48 100644 --- a/test/Passes/assign-bd-ids/bad_bd_assignments.mlir +++ b/test/Passes/assign-bd-ids/bad_bd_assignments.mlir @@ -11,7 +11,7 @@ // RUN: aie-opt --verify-diagnostics --split-input-file %s module { - aie.device(ipu) { + aie.device(npu) { %tile_0_2 = aie.tile(0, 2) %double_buffer = aie.buffer(%tile_0_2) : memref<32xi32> %lock_Y = aie.lock(%tile_0_2) {init = 0 : i32} @@ -30,7 +30,7 @@ module { // ----- module { - aie.device(ipu) { + aie.device(npu) { %tile_0_2 = aie.tile(0, 2) %double_buffer = aie.buffer(%tile_0_2) : memref<32xi32> %lock_X = aie.lock(%tile_0_2) {init = 0 : i32} @@ -49,7 +49,7 @@ module { // ----- module { - aie.device(ipu) { + aie.device(npu) { %tile_0_1 = aie.tile(0, 1) %buffer_0_1 = aie.buffer(%tile_0_1) : memref<32xi32> %memtile_dma_0_1 = aie.memtile_dma(%tile_0_1) { @@ -69,7 +69,7 @@ module { // ----- module { - aie.device(ipu) { + aie.device(npu) { %tile_0_1 = aie.tile(0, 1) %memtile_dma_0_1 = aie.memtile_dma(%tile_0_1) { %lock_0_1 = aie.lock(%tile_0_1) {init = 1 : i32} @@ -90,7 +90,7 @@ module { // ----- module { - aie.device(ipu) { + aie.device(npu) { %tile_0_1 = aie.tile(0, 1) %memtile_dma_0_1 = aie.memtile_dma(%tile_0_1) { 
%lock_0_1 = aie.lock(%tile_0_1) {init = 1 : i32} @@ -110,7 +110,7 @@ module { // ----- module { - aie.device(ipu) { + aie.device(npu) { %tile_0_1 = aie.tile(0, 1) %memtile_dma_0_1 = aie.memtile_dma(%tile_0_1) { %lock_0_1 = aie.lock(%tile_0_1) {init = 1 : i32} diff --git a/test/Passes/assign-bd-ids/basic.mlir b/test/Passes/assign-bd-ids/basic.mlir index b306c3053e..8862c5c09d 100644 --- a/test/Passes/assign-bd-ids/basic.mlir +++ b/test/Passes/assign-bd-ids/basic.mlir @@ -10,7 +10,7 @@ // RUN: aie-opt --aie-assign-bd-ids --split-input-file %s | FileCheck %s -// CHECK-LABEL: aie.device(ipu) { +// CHECK-LABEL: aie.device(npu) { // CHECK: %[[VAL_0:.*]] = aie.tile(0, 0) // CHECK: %[[VAL_1:.*]] = aie.tile(0, 1) // CHECK: %[[VAL_2:.*]] = aie.tile(0, 2) @@ -30,7 +30,7 @@ // CHECK: aie.dma_bd(%[[VAL_4]] : memref<32xi32>) {bd_id = 25 : i32} module { - aie.device(ipu) { + aie.device(npu) { %tile_0_0 = aie.tile(0, 0) %tile_0_1 = aie.tile(0, 1) %tile_0_2 = aie.tile(0, 2) diff --git a/test/Passes/assign-bd-ids/user_assigned.mlir b/test/Passes/assign-bd-ids/user_assigned.mlir index 777c07f7b8..c41d3aa7d3 100644 --- a/test/Passes/assign-bd-ids/user_assigned.mlir +++ b/test/Passes/assign-bd-ids/user_assigned.mlir @@ -10,7 +10,7 @@ // RUN: aie-opt --aie-assign-bd-ids --split-input-file %s | FileCheck %s -// CHECK-LABEL: aie.device(ipu) { +// CHECK-LABEL: aie.device(npu) { // CHECK: %[[VAL_0:.*]] = aie.tile(0, 0) // CHECK: %[[VAL_1:.*]] = aie.tile(0, 1) // CHECK: %[[VAL_2:.*]] = aie.tile(0, 2) @@ -28,7 +28,7 @@ // CHECK: aie.dma_bd(%[[VAL_4]] : memref<32xi32>) {bd_id = 25 : i32} module { - aie.device(ipu) { + aie.device(npu) { %tile_0_0 = aie.tile(0, 0) %tile_0_1 = aie.tile(0, 1) %tile_0_2 = aie.tile(0, 2) @@ -153,7 +153,7 @@ module @aie_module { // ----- -// CHECK-LABEL: aie.device(ipu) { +// CHECK-LABEL: aie.device(npu) { // CHECK: %[[VAL_0:.*]] = aie.tile(0, 0) // CHECK: %[[VAL_1:.*]] = aie.tile(0, 1) // CHECK: %[[VAL_2:.*]] = aie.tile(0, 2) @@ -171,7 +171,7 @@ module @aie_module { 
// CHECK: aie.dma_bd(%[[VAL_4]] : memref<32xi32>) {bd_id = 25 : i32} module { - aie.device(ipu) { + aie.device(npu) { %tile_0_0 = aie.tile(0, 0) %tile_0_1 = aie.tile(0, 1) %tile_0_2 = aie.tile(0, 2) diff --git a/test/Targets/AIETargetHSA/input_with_addresses.mlir b/test/Targets/AIETargetHSA/input_with_addresses.mlir index 1cf762054b..1efd284c53 100644 --- a/test/Targets/AIETargetHSA/input_with_addresses.mlir +++ b/test/Targets/AIETargetHSA/input_with_addresses.mlir @@ -46,9 +46,9 @@ module { aie.shim_dma_allocation @out0(S2MM, 0, 6) func.func @sequence(%arg0: memref<64xi32>, %arg1: memref<32xi32>, %arg2: memref<64xi32>) { - aiex.ipu.dma_memcpy_nd(0, 0, %arg2[0, 0, 0, 0][1, 1, 1, 64][0, 0, 0]) {id = 0 : i64, metadata = @out0} : memref<64xi32> - aiex.ipu.dma_memcpy_nd(0, 0, %arg0[0, 0, 0, 0][1, 1, 1, 64][0, 0, 0]) {id = 1 : i64, metadata = @in0} : memref<64xi32> - aiex.ipu.sync {channel = 0 : i32, column = 0 : i32, column_num = 1 : i32, direction = 0 : i32, row = 0 : i32, row_num = 1 : i32} + aiex.npu.dma_memcpy_nd(0, 0, %arg2[0, 0, 0, 0][1, 1, 1, 64][0, 0, 0]) {id = 0 : i64, metadata = @out0} : memref<64xi32> + aiex.npu.dma_memcpy_nd(0, 0, %arg0[0, 0, 0, 0][1, 1, 1, 64][0, 0, 0]) {id = 1 : i64, metadata = @in0} : memref<64xi32> + aiex.npu.sync {channel = 0 : i32, column = 0 : i32, column_num = 1 : i32, direction = 0 : i32, row = 0 : i32, row_num = 1 : i32} return } } diff --git a/test/Targets/IPU/ipu_instgen.mlir b/test/Targets/NPU/npu_instgen.mlir similarity index 90% rename from test/Targets/IPU/ipu_instgen.mlir rename to test/Targets/NPU/npu_instgen.mlir index d1c2ef3c6a..4fd9636197 100644 --- a/test/Targets/IPU/ipu_instgen.mlir +++ b/test/Targets/NPU/npu_instgen.mlir @@ -1,4 +1,4 @@ -//===- ipu_instgen.mlir ----------------------------------------*- MLIR -*-===// +//===- npu_instgen.mlir ----------------------------------------*- MLIR -*-===// // // This file is licensed under the Apache License v2.0 with LLVM Exceptions. 
// See https://llvm.org/LICENSE.txt for license information. @@ -8,9 +8,9 @@ // //===----------------------------------------------------------------------===// -// RUN: aie-translate --aie-ipu-instgen %s | FileCheck %s +// RUN: aie-translate --aie-npu-instgen %s | FileCheck %s module { - aie.device(ipu) { + aie.device(npu) { func.func @test0(%arg0: memref<16xf32>, %arg1: memref<16xf32>) { // look for the prolog. @@ -48,7 +48,7 @@ module { // CHECK: 00000009 // CHECK: 2CD0000C // CHECK: 2E107041 - aiex.ipu.writebd_shimtile { bd_id = 6 : i32, + aiex.npu.writebd_shimtile { bd_id = 6 : i32, buffer_length = 1 : i32, buffer_offset = 2 : i32, enable_packet = 0 : i32, @@ -77,10 +77,10 @@ module { // CHECK: 02030400 // CHECK: ABC00DEF // CHECK: 00000042 - aiex.ipu.write32 { column = 3 : i32, row = 4 : i32, address = 0xabc00def : ui32, value = 0x42 : ui32 } + aiex.npu.write32 { column = 3 : i32, row = 4 : i32, address = 0xabc00def : ui32, value = 0x42 : ui32 } // CHECK: 03030401 // CHECK: 05010200 - aiex.ipu.sync { column = 3 : i32, row = 4 : i32, direction = 1 : i32, channel = 5 : i32, column_num = 1 : i32, row_num = 2 : i32 } + aiex.npu.sync { column = 3 : i32, row = 4 : i32, direction = 1 : i32, channel = 5 : i32, column_num = 1 : i32, row_num = 2 : i32 } return } } diff --git a/test/aie2xclbin/simple_xclbin.mlir b/test/aie2xclbin/simple_xclbin.mlir index 09e9dcaa6b..55c6aa8ec9 100644 --- a/test/aie2xclbin/simple_xclbin.mlir +++ b/test/aie2xclbin/simple_xclbin.mlir @@ -19,7 +19,7 @@ // PEANO-NOT: xchesscc_wrapper module { - aie.device(ipu) { + aie.device(npu) { %12 = aie.tile(1, 2) %buf = aie.buffer(%12) : memref<256xi32> %4 = aie.core(%12) { diff --git a/test/aiecc/simple_xclbin.mlir b/test/aiecc/simple_xclbin.mlir index 880225b0da..bec65be208 100644 --- a/test/aiecc/simple_xclbin.mlir +++ b/test/aiecc/simple_xclbin.mlir @@ -11,8 +11,8 @@ // REQUIRES: chess // REQUIRES: peano -// RUN: %PYTHON aiecc.py --xchesscc --no-link -nv --aie-generate-cdo --aie-generate-ipu 
--no-compile-host --xclbin-name=aie.xclbin --ipu-insts-name=insts.txt %s | FileCheck %s --check-prefix=XCHESSCC -// RUN: %PYTHON aiecc.py --no-xchesscc --no-link -nv --aie-generate-cdo --aie-generate-ipu --no-compile-host --xclbin-name=aie.xclbin --ipu-insts-name=insts.txt %s | FileCheck %s --check-prefix=PEANO +// RUN: %PYTHON aiecc.py --xchesscc --no-link -nv --aie-generate-cdo --aie-generate-npu --no-compile-host --xclbin-name=aie.xclbin --npu-insts-name=insts.txt %s | FileCheck %s --check-prefix=XCHESSCC +// RUN: %PYTHON aiecc.py --no-xchesscc --no-link -nv --aie-generate-cdo --aie-generate-npu --no-compile-host --xclbin-name=aie.xclbin --npu-insts-name=insts.txt %s | FileCheck %s --check-prefix=PEANO // Note that llc determines the architecture from the llvm IR. // XCHESSCC-NOT: {{^[^ ]*llc}} @@ -27,7 +27,7 @@ // PEANO: xclbinutil module { - aie.device(ipu) { + aie.device(npu) { %12 = aie.tile(1, 2) %buf = aie.buffer(%12) : memref<256xi32> %4 = aie.core(%12) { diff --git a/test/assign-buffer-addresses/bad_alignment.mlir b/test/assign-buffer-addresses/bad_alignment.mlir index b9c2b83d7e..4b5ca8ffce 100644 --- a/test/assign-buffer-addresses/bad_alignment.mlir +++ b/test/assign-buffer-addresses/bad_alignment.mlir @@ -11,7 +11,7 @@ // RUN: aie-opt --verify-diagnostics --split-input-file %s module { - aie.device(ipu) { + aie.device(npu) { %tile_0_1 = aie.tile(0, 1) %memtile_dma_0_1 = aie.memtile_dma(%tile_0_1) { %lock_0_1 = aie.lock(%tile_0_1) {init = 1 : i32} @@ -31,7 +31,7 @@ module { // ----- module { - aie.device(ipu) { + aie.device(npu) { %tile_0_1 = aie.tile(0, 1) %memtile_dma_0_1 = aie.memtile_dma(%tile_0_1) { %lock_0_1 = aie.lock(%tile_0_1) {init = 1 : i32} @@ -56,7 +56,7 @@ module { // prevent false-positives/false-negatives (I think). 
module { - aie.device(ipu) { + aie.device(npu) { %tile_0_1 = aie.tile(0, 1) %memtile_dma_0_1 = aie.memtile_dma(%tile_0_1) { %lock_0_1 = aie.lock(%tile_0_1) {init = 1 : i32} @@ -77,7 +77,7 @@ module { // ----- module { - aie.device(ipu) { + aie.device(npu) { %tile_0_1 = aie.tile(0, 1) %memtile_dma_0_1 = aie.memtile_dma(%tile_0_1) { %lock_0_1 = aie.lock(%tile_0_1) {init = 1 : i32} diff --git a/test/dialect/AIE/bad_cascade.mlir b/test/dialect/AIE/bad_cascade.mlir index 21adc09a48..c204d79af1 100644 --- a/test/dialect/AIE/bad_cascade.mlir +++ b/test/dialect/AIE/bad_cascade.mlir @@ -31,7 +31,7 @@ aie.device(xcve2802) { // CHECK: error{{.*}}'aie.cascade_flow' op shimTile row has no cascade stream interface -aie.device(ipu) { +aie.device(npu) { %t10 = aie.tile(1, 0) %t20 = aie.tile(2, 0) aie.cascade_flow(%t10, %t20) @@ -41,7 +41,7 @@ aie.device(ipu) { // CHECK: error{{.*}}'aie.cascade_flow' op memTile row has no cascade stream interface -aie.device(ipu) { +aie.device(npu) { %t11 = aie.tile(1, 1) %t21 = aie.tile(2, 1) aie.cascade_flow(%t11, %t21) @@ -87,7 +87,7 @@ aie.device(xcve2802) { // CHECK: error{{.*}}'aie.configure_cascade' op memTile row has no cascade stream interface -aie.device(ipu) { +aie.device(npu) { %t11 = aie.tile(1, 1) aie.configure_cascade(%t11, North, West) } diff --git a/test/dialect/AIE/bad_dma_op.mlir b/test/dialect/AIE/bad_dma_op.mlir index c8338ae838..9ba149c65e 100644 --- a/test/dialect/AIE/bad_dma_op.mlir +++ b/test/dialect/AIE/bad_dma_op.mlir @@ -12,7 +12,7 @@ // CHECK: error: 'aie.dma' op DMAOp can only appear in single block region module { - aie.device(ipu) { + aie.device(npu) { %tile_0_1 = aie.tile(0, 1) %objFifo_in0_cons_buff_0 = aie.buffer(%tile_0_1) {address = 0 : i32} : memref<16xi32> %objFifo_in0_cons_prod_lock = aie.lock(%tile_0_1, 0) {init = 2 : i32} diff --git a/test/dialect/AIE/badshimtiledma.mlir b/test/dialect/AIE/badshimtiledma.mlir index 7edde144f0..da396798f0 100644 --- a/test/dialect/AIE/badshimtiledma.mlir +++ 
b/test/dialect/AIE/badshimtiledma.mlir @@ -12,7 +12,7 @@ // CHECK: error{{.*}}'aie.shim_dma' op uses more input channels than available on this tile module @test { - aie.device(ipu) { + aie.device(npu) { %t00 = aie.tile(0, 0) %buf_e = aie.external_buffer : memref<256xi32> diff --git a/test/dialect/AIE/badtiledma4.mlir b/test/dialect/AIE/badtiledma4.mlir index 7d2cf2b9ce..6c498c62f4 100644 --- a/test/dialect/AIE/badtiledma4.mlir +++ b/test/dialect/AIE/badtiledma4.mlir @@ -12,7 +12,7 @@ // CHECK: error{{.*}}'aie.mem' op uses more output channels than available on this tile module @test { - aie.device(ipu) { + aie.device(npu) { %t03 = aie.tile(0, 3) %buf_e = aie.buffer(%t03) : memref<256xi32> diff --git a/test/dialect/AIE/buffer.mlir b/test/dialect/AIE/buffer.mlir index d522f08dd4..a75392c5a3 100644 --- a/test/dialect/AIE/buffer.mlir +++ b/test/dialect/AIE/buffer.mlir @@ -11,7 +11,7 @@ // RUN: aie-opt --aie-standard-lowering %s | FileCheck %s module { - aie.device(ipu) { + aie.device(npu) { %t33 = aie.tile(3, 3) %t42 = aie.tile(4, 2) %t44 = aie.tile(4, 4) diff --git a/test/dialect/AIEX/bad_ipu_nd.mlir b/test/dialect/AIEX/bad_npu_nd.mlir similarity index 78% rename from test/dialect/AIEX/bad_ipu_nd.mlir rename to test/dialect/AIEX/bad_npu_nd.mlir index ebd1715062..45ec8e0dd6 100644 --- a/test/dialect/AIEX/bad_ipu_nd.mlir +++ b/test/dialect/AIEX/bad_npu_nd.mlir @@ -1,4 +1,4 @@ -//===- bad_ipu_nd.mlir -----------------------------------------*- MLIR -*-===// +//===- bad_npu_nd.mlir -----------------------------------------*- MLIR -*-===// // // This file is licensed under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. 
@@ -12,14 +12,14 @@ // RUN: aie-opt --split-input-file --verify-diagnostics %s module { - aie.device(ipu) { - func.func @bad_ipu_nd_length(%in : memref<1920x1080xi32>, %buf : memref<32xi32>, %out : memref<1920x1080xi32>) { + aie.device(npu) { + func.func @bad_npu_nd_length(%in : memref<1920x1080xi32>, %buf : memref<32xi32>, %out : memref<1920x1080xi32>) { %c0 = arith.constant 0 : i64 %c1 = arith.constant 1 : i64 %c1920 = arith.constant 1920 : i64 %c1080 = arith.constant 1080 : i64 // expected-error@+1 {{Size 0 exceeds the [0:1023] range}} - aiex.ipu.dma_memcpy_nd (0, 0, %in[%c0,%c0,%c0,%c0][%c1,%c1,%c1080,%c1920][%c0,%c0,%c1920]) { metadata = @of_fromMem, id = 0 : i64 } : memref<1920x1080xi32> + aiex.npu.dma_memcpy_nd (0, 0, %in[%c0,%c0,%c0,%c0][%c1,%c1,%c1080,%c1920][%c0,%c0,%c1920]) { metadata = @of_fromMem, id = 0 : i64 } : memref<1920x1080xi32> return } aie.shim_dma_allocation @of_fromMem (MM2S, 0, 0) @@ -29,8 +29,8 @@ module { // ----- module { - aie.device(ipu) { - func.func @bad_ipu_nd_repeat(%in : memref<128x4x2x8xi32>, %buf : memref<32xi32>, %out : memref<8192xi32>) { + aie.device(npu) { + func.func @bad_npu_nd_repeat(%in : memref<128x4x2x8xi32>, %buf : memref<32xi32>, %out : memref<8192xi32>) { %c0 = arith.constant 0 : i64 %c1 = arith.constant 1 : i64 %c2 = arith.constant 2 : i64 @@ -40,7 +40,7 @@ module { %c32 = arith.constant 32 : i64 %c128 = arith.constant 128 : i64 // expected-error@+1 {{Size 3 exceeds the [1:64] range}} - aiex.ipu.dma_memcpy_nd (0, 0, %in[%c0,%c0,%c0,%c0][%c128,%c2,%c2,%c8][%c0,%c16,%c8]) { metadata = @of_fromMem, id = 0 : i64 } : memref<128x4x2x8xi32> + aiex.npu.dma_memcpy_nd (0, 0, %in[%c0,%c0,%c0,%c0][%c128,%c2,%c2,%c8][%c0,%c16,%c8]) { metadata = @of_fromMem, id = 0 : i64 } : memref<128x4x2x8xi32> return } aie.shim_dma_allocation @of_fromMem (MM2S, 0, 0) @@ -50,14 +50,14 @@ module { // ----- module { - aie.device(ipu) { - func.func @bad_ipu_nd_stride(%in : memref<8388608xi32>, %buf : memref<32xi32>, %out : memref<8388608xi32>) { 
+ aie.device(npu) { + func.func @bad_npu_nd_stride(%in : memref<8388608xi32>, %buf : memref<32xi32>, %out : memref<8388608xi32>) { %c0 = arith.constant 0 : i64 %c1 = arith.constant 1 : i64 %c2 = arith.constant 2 : i64 %c2097152 = arith.constant 2097152 : i64 // expected-error@+1 {{Stride 1 exceeds the [1:1M] range}} - aiex.ipu.dma_memcpy_nd (0, 0, %in[%c0,%c0,%c0,%c0][%c1,%c1,%c2,%c2][%c0,%c0,%c2097152]) { metadata = @of_fromMem, id = 0 : i64 } : memref<8388608xi32> + aiex.npu.dma_memcpy_nd (0, 0, %in[%c0,%c0,%c0,%c0][%c1,%c1,%c2,%c2][%c0,%c0,%c2097152]) { metadata = @of_fromMem, id = 0 : i64 } : memref<8388608xi32> return } aie.shim_dma_allocation @of_fromMem (MM2S, 0, 0) @@ -67,14 +67,14 @@ module { // ----- module { - aie.device(ipu) { - func.func @bad_ipu_nd_type(%in : memref<1920x1080xi8>, %buf : memref<32xi32>, %out : memref<1920x1080xi8>) { + aie.device(npu) { + func.func @bad_npu_nd_type(%in : memref<1920x1080xi8>, %buf : memref<32xi32>, %out : memref<1920x1080xi8>) { %c0 = arith.constant 0 : i64 %c1 = arith.constant 1 : i64 %c1920 = arith.constant 1920 : i64 %c1080 = arith.constant 1080 : i64 // expected-error@+1 {{must be used with memref type with element width 32.}} - aiex.ipu.dma_memcpy_nd (0, 0, %in[%c0,%c0,%c0,%c0][%c1,%c1,%c1080,%c1920][%c0,%c0,%c1920]) { metadata = @of_fromMem, id = 0 : i64 } : memref<1920x1080xi8> + aiex.npu.dma_memcpy_nd (0, 0, %in[%c0,%c0,%c0,%c0][%c1,%c1,%c1080,%c1920][%c0,%c0,%c1920]) { metadata = @of_fromMem, id = 0 : i64 } : memref<1920x1080xi8> return } aie.shim_dma_allocation @of_fromMem (MM2S, 0, 0) diff --git a/test/dialect/AIEX/bad_ipu_push_queue.mlir b/test/dialect/AIEX/bad_npu_push_queue.mlir similarity index 82% rename from test/dialect/AIEX/bad_ipu_push_queue.mlir rename to test/dialect/AIEX/bad_npu_push_queue.mlir index 49feece90d..64a11960ea 100644 --- a/test/dialect/AIEX/bad_ipu_push_queue.mlir +++ b/test/dialect/AIEX/bad_npu_push_queue.mlir @@ -1,4 +1,4 @@ -//===- bad_ipu_push_queue_bd.mlir 
------------------------------*- MLIR -*-===// +//===- bad_npu_push_queue_bd.mlir ------------------------------*- MLIR -*-===// // // This file is licensed under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. @@ -12,10 +12,10 @@ // RUN: aie-opt --split-input-file --verify-diagnostics %s module { - aie.device(ipu) { + aie.device(npu) { func.func @bad_bd_id(%in : memref<128x4x2x8xi32>, %buf : memref<32xi32>, %out : memref<8192xi32>) { // expected-error@+1 {{BD ID exceeds the maximum ID.}} - aiex.ipu.shimtile_push_queue {metadata = @of_fromMem, issue_token = false, repeat_count = 3 : i32, bd_id = 28 : i32 } + aiex.npu.shimtile_push_queue {metadata = @of_fromMem, issue_token = false, repeat_count = 3 : i32, bd_id = 28 : i32 } return } aie.shim_dma_allocation @of_fromMem (MM2S, 0, 0) @@ -25,10 +25,10 @@ module { // ----- module { - aie.device(ipu) { + aie.device(npu) { func.func @bad_repeat_count(%in : memref<128x4x2x8xi32>, %buf : memref<32xi32>, %out : memref<8192xi32>) { // expected-error@+1 {{Repeat count exceeds the [0:255] range.}} - aiex.ipu.shimtile_push_queue {metadata = @of_fromMem, issue_token = false, repeat_count = 384 : i32, bd_id = 8 : i32 } + aiex.npu.shimtile_push_queue {metadata = @of_fromMem, issue_token = false, repeat_count = 384 : i32, bd_id = 8 : i32 } return } aie.shim_dma_allocation @of_fromMem (MM2S, 0, 0) diff --git a/test/dialect/AIEX/bad_ipu_write_bd.mlir b/test/dialect/AIEX/bad_npu_write_bd.mlir similarity index 90% rename from test/dialect/AIEX/bad_ipu_write_bd.mlir rename to test/dialect/AIEX/bad_npu_write_bd.mlir index f653614c8d..fdc9b425cc 100644 --- a/test/dialect/AIEX/bad_ipu_write_bd.mlir +++ b/test/dialect/AIEX/bad_npu_write_bd.mlir @@ -1,4 +1,4 @@ -//===- bad_ipu_write_bd_bd.mlir --------------------------------*- MLIR -*-===// +//===- bad_npu_write_bd_bd.mlir --------------------------------*- MLIR -*-===// // // This file is licensed under the Apache License v2.0 with 
LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. @@ -12,10 +12,10 @@ // RUN: aie-opt --split-input-file --verify-diagnostics %s module { - aie.device(ipu) { + aie.device(npu) { func.func @bad_bd_id(%in : memref<128x4x2x8xi32>, %buf : memref<32xi32>, %out : memref<8192xi32>) { // expected-error@+1 {{BD ID exceeds the maximum ID.}} - aiex.ipu.writebd_shimtile {bd_id = 17 : i32, buffer_length = 32 : i32, buffer_offset = 128 : i32, column = 0 : i32, column_num = 1 : i32, d0_stride = 0 : i32, d0_size = 8 : i32, d1_stride = 7 : i32, d1_size = 2 : i32, d2_stride = 15 : i32, ddr_id = 0 : i32, enable_packet = 0 : i32, iteration_current = 0 : i32, iteration_stride = 0 : i32, iteration_size = 0 : i32, lock_acq_enable = 0 : i32, lock_acq_id = 0 : i32, lock_acq_val = 0 : i32, lock_rel_id = 0 : i32, lock_rel_val = 0 : i32, next_bd = 0 : i32, out_of_order_id = 0 : i32, packet_id = 0 : i32, packet_type = 0 : i32, use_next_bd = 0 : i32, valid_bd = 1 : i32} + aiex.npu.writebd_shimtile {bd_id = 17 : i32, buffer_length = 32 : i32, buffer_offset = 128 : i32, column = 0 : i32, column_num = 1 : i32, d0_stride = 0 : i32, d0_size = 8 : i32, d1_stride = 7 : i32, d1_size = 2 : i32, d2_stride = 15 : i32, ddr_id = 0 : i32, enable_packet = 0 : i32, iteration_current = 0 : i32, iteration_stride = 0 : i32, iteration_size = 0 : i32, lock_acq_enable = 0 : i32, lock_acq_id = 0 : i32, lock_acq_val = 0 : i32, lock_rel_id = 0 : i32, lock_rel_val = 0 : i32, next_bd = 0 : i32, out_of_order_id = 0 : i32, packet_id = 0 : i32, packet_type = 0 : i32, use_next_bd = 0 : i32, valid_bd = 1 : i32} return } aie.shim_dma_allocation @of_fromMem (MM2S, 0, 0) @@ -25,10 +25,10 @@ module { // ----- module { - aie.device(ipu) { + aie.device(npu) { func.func @bad_iteration_size(%in : memref<128x4x2x8xi32>, %buf : memref<32xi32>, %out : memref<8192xi32>) { // expected-error@+1 {{Iteration Size exceeds the [0:63] range.}} - aiex.ipu.writebd_shimtile {bd_id = 7 : i32, buffer_length = 32 : i32, 
buffer_offset = 128 : i32, column = 0 : i32, column_num = 1 : i32, d0_stride = 0 : i32, d0_size = 8 : i32, d1_stride = 7 : i32, d1_size = 4 : i32, d2_stride = 15 : i32, ddr_id = 0 : i32, enable_packet = 0 : i32, iteration_current = 0 : i32, iteration_stride = 1024 : i32, iteration_size = 128 : i32, lock_acq_enable = 0 : i32, lock_acq_id = 0 : i32, lock_acq_val = 0 : i32, lock_rel_id = 0 : i32, lock_rel_val = 0 : i32, next_bd = 0 : i32, out_of_order_id = 0 : i32, packet_id = 0 : i32, packet_type = 0 : i32, use_next_bd = 0 : i32, valid_bd = 1 : i32} + aiex.npu.writebd_shimtile {bd_id = 7 : i32, buffer_length = 32 : i32, buffer_offset = 128 : i32, column = 0 : i32, column_num = 1 : i32, d0_stride = 0 : i32, d0_size = 8 : i32, d1_stride = 7 : i32, d1_size = 4 : i32, d2_stride = 15 : i32, ddr_id = 0 : i32, enable_packet = 0 : i32, iteration_current = 0 : i32, iteration_stride = 1024 : i32, iteration_size = 128 : i32, lock_acq_enable = 0 : i32, lock_acq_id = 0 : i32, lock_acq_val = 0 : i32, lock_rel_id = 0 : i32, lock_rel_val = 0 : i32, next_bd = 0 : i32, out_of_order_id = 0 : i32, packet_id = 0 : i32, packet_type = 0 : i32, use_next_bd = 0 : i32, valid_bd = 1 : i32} return } aie.shim_dma_allocation @of_fromMem (MM2S, 0, 0) @@ -38,10 +38,10 @@ module { // ----- module { - aie.device(ipu) { + aie.device(npu) { func.func @bad_stride(%in : memref<128x4x2x8xi32>, %buf : memref<32xi32>, %out : memref<8192xi32>) { // expected-error@+1 {{D0 Stride exceeds the [0:1M-1] range.}} - aiex.ipu.writebd_shimtile {bd_id = 2 : i32, buffer_length = 32 : i32, buffer_offset = 128 : i32, column = 0 : i32, column_num = 1 : i32, d0_stride = 2097356 : i32, d0_size = 8 : i32, d1_stride = 7 : i32, d1_size = 2 : i32, d2_stride = 15 : i32, ddr_id = 0 : i32, enable_packet = 0 : i32, iteration_current = 0 : i32, iteration_stride = 0 : i32, iteration_size = 0 : i32, lock_acq_enable = 0 : i32, lock_acq_id = 0 : i32, lock_acq_val = 0 : i32, lock_rel_id = 0 : i32, lock_rel_val = 0 : i32, next_bd = 0 : 
i32, out_of_order_id = 0 : i32, packet_id = 0 : i32, packet_type = 0 : i32, use_next_bd = 0 : i32, valid_bd = 1 : i32} + aiex.npu.writebd_shimtile {bd_id = 2 : i32, buffer_length = 32 : i32, buffer_offset = 128 : i32, column = 0 : i32, column_num = 1 : i32, d0_stride = 2097356 : i32, d0_size = 8 : i32, d1_stride = 7 : i32, d1_size = 2 : i32, d2_stride = 15 : i32, ddr_id = 0 : i32, enable_packet = 0 : i32, iteration_current = 0 : i32, iteration_stride = 0 : i32, iteration_size = 0 : i32, lock_acq_enable = 0 : i32, lock_acq_id = 0 : i32, lock_acq_val = 0 : i32, lock_rel_id = 0 : i32, lock_rel_val = 0 : i32, next_bd = 0 : i32, out_of_order_id = 0 : i32, packet_id = 0 : i32, packet_type = 0 : i32, use_next_bd = 0 : i32, valid_bd = 1 : i32} return } aie.shim_dma_allocation @of_fromMem (MM2S, 0, 0) @@ -51,10 +51,10 @@ module { // ----- module { - aie.device(ipu) { + aie.device(npu) { func.func @bad_size(%in : memref<128x4x2x8xi32>, %buf : memref<32xi32>, %out : memref<8192xi32>) { // expected-error@+1 {{D1 Size exceeds the [0:1023] range.}} - aiex.ipu.writebd_shimtile {bd_id = 7 : i32, buffer_length = 32 : i32, buffer_offset = 128 : i32, column = 0 : i32, column_num = 1 : i32, d0_stride = 0 : i32, d0_size = 8 : i32, d1_stride = 7 : i32, d1_size = 1024 : i32, d2_stride = 15 : i32, ddr_id = 0 : i32, enable_packet = 0 : i32, iteration_current = 0 : i32, iteration_stride = 0 : i32, iteration_size = 0 : i32, lock_acq_enable = 0 : i32, lock_acq_id = 0 : i32, lock_acq_val = 0 : i32, lock_rel_id = 0 : i32, lock_rel_val = 0 : i32, next_bd = 0 : i32, out_of_order_id = 0 : i32, packet_id = 0 : i32, packet_type = 0 : i32, use_next_bd = 0 : i32, valid_bd = 1 : i32} + aiex.npu.writebd_shimtile {bd_id = 7 : i32, buffer_length = 32 : i32, buffer_offset = 128 : i32, column = 0 : i32, column_num = 1 : i32, d0_stride = 0 : i32, d0_size = 8 : i32, d1_stride = 7 : i32, d1_size = 1024 : i32, d2_stride = 15 : i32, ddr_id = 0 : i32, enable_packet = 0 : i32, iteration_current = 0 : i32, 
iteration_stride = 0 : i32, iteration_size = 0 : i32, lock_acq_enable = 0 : i32, lock_acq_id = 0 : i32, lock_acq_val = 0 : i32, lock_rel_id = 0 : i32, lock_rel_val = 0 : i32, next_bd = 0 : i32, out_of_order_id = 0 : i32, packet_id = 0 : i32, packet_type = 0 : i32, use_next_bd = 0 : i32, valid_bd = 1 : i32} return } aie.shim_dma_allocation @of_fromMem (MM2S, 0, 0) diff --git a/test/dialect/AIEX/invalid.mlir b/test/dialect/AIEX/invalid.mlir index 9b57d84b70..7a1a1fcc3b 100644 --- a/test/dialect/AIEX/invalid.mlir +++ b/test/dialect/AIEX/invalid.mlir @@ -10,10 +10,10 @@ // RUN: aie-opt --split-input-file --verify-diagnostics %s -aie.device(ipu) { - func.func @ipu_dma_wait_no_symbol() { - // expected-error@+1 {{'aiex.ipu.dma_wait' op couldn't find symbol in parent device}} - aiex.ipu.dma_wait {symbol = @out0} +aie.device(npu) { + func.func @npu_dma_wait_no_symbol() { + // expected-error@+1 {{'aiex.npu.dma_wait' op couldn't find symbol in parent device}} + aiex.npu.dma_wait {symbol = @out0} return } } diff --git a/test/dialect/AIEX/roundtrip.mlir b/test/dialect/AIEX/roundtrip.mlir index 27611d5914..a7c698db09 100644 --- a/test/dialect/AIEX/roundtrip.mlir +++ b/test/dialect/AIEX/roundtrip.mlir @@ -10,21 +10,21 @@ // RUN: aie-opt --split-input-file %s | FileCheck %s -// CHECK-LABEL: func.func @ipu_dma_wait -// CHECK: aiex.ipu.dma_wait {symbol = @out0} -aie.device(ipu) { +// CHECK-LABEL: func.func @npu_dma_wait +// CHECK: aiex.npu.dma_wait {symbol = @out0} +aie.device(npu) { memref.global "public" @out0 : memref<16xi32> - func.func @ipu_dma_wait() { - aiex.ipu.dma_wait {symbol = @out0} + func.func @npu_dma_wait() { + aiex.npu.dma_wait {symbol = @out0} return } } // ----- -// CHECK-LABEL: func.func @ipu_dma_wait_no_device -// CHECK: aiex.ipu.dma_wait {symbol = @out0} -func.func @ipu_dma_wait_no_device() { - aiex.ipu.dma_wait {symbol = @out0} +// CHECK-LABEL: func.func @npu_dma_wait_no_device +// CHECK: aiex.npu.dma_wait {symbol = @out0} +func.func @npu_dma_wait_no_device() { 
+ aiex.npu.dma_wait {symbol = @out0} return } diff --git a/test/lit.cfg.py b/test/lit.cfg.py index 474009e5c6..4eef6329da 100644 --- a/test/lit.cfg.py +++ b/test/lit.cfg.py @@ -54,7 +54,7 @@ # for xchesscc_wrapper llvm_config.with_environment("AIETOOLS", config.vitis_aietools_dir) -run_on_ipu = "echo" +run_on_npu = "echo" xrt_flags = "" # Not using run_on_board anymore, need more specific per-platform commands @@ -144,8 +144,8 @@ aie_model = m.group(2) print("\tmodel:", aie_model) config.available_features.add("ryzen_ai") - run_on_ipu = ( - f"flock /tmp/ipu.lock {config.aie_src_root}/utils/run_on_ipu.sh" + run_on_npu = ( + f"flock /tmp/npu.lock {config.aie_src_root}/utils/run_on_npu.sh" ) except: print("Failed to run xbutil") @@ -153,7 +153,7 @@ else: print("xrt not found") -config.substitutions.append(("%run_on_ipu", run_on_ipu)) +config.substitutions.append(("%run_on_npu", run_on_npu)) config.substitutions.append(("%xrt_flags", xrt_flags)) config.substitutions.append(("%XRT_DIR", config.xrt_dir)) diff --git a/test/lower-to-standard/aiex_standard_lowering.mlir b/test/lower-to-standard/aiex_standard_lowering.mlir index 639dbc1e83..6a0cd0b545 100644 --- a/test/lower-to-standard/aiex_standard_lowering.mlir +++ b/test/lower-to-standard/aiex_standard_lowering.mlir @@ -11,14 +11,14 @@ // RUN: aie-opt --split-input-file --aiex-standard-lowering %s | FileCheck %s // CHECK-LABEL: dma_and_wait -// CHECK-NOT: aiex.ipu.dma_memcpy_nd -// CHECK-NOT: aiex.ipu.dma_wait +// CHECK-NOT: aiex.npu.dma_memcpy_nd +// CHECK-NOT: aiex.npu.dma_wait module { - aie.device(ipu) { + aie.device(npu) { memref.global "public" @toMem : memref<16xi32> func.func @dma_and_wait(%arg0: memref<16xi32>, %arg1: memref<16xi32>) { - aiex.ipu.dma_memcpy_nd(0, 0, %arg0[0, 0, 0, 0][1, 1, 16, 16][0, 0, 64]) { metadata = @toMem, id = 1 : i64 } : memref<16xi32> - aiex.ipu.dma_wait {symbol = @toMem} + aiex.npu.dma_memcpy_nd(0, 0, %arg0[0, 0, 0, 0][1, 1, 16, 16][0, 0, 64]) { metadata = @toMem, id = 1 : i64 } : 
memref<16xi32> + aiex.npu.dma_wait {symbol = @toMem} return } aie.shim_dma_allocation @toMem (MM2S, 1, 1) diff --git a/test/ipu-xrt/add_256_using_dma_op_no_double_buffering/aie.mlir b/test/npu-xrt/add_256_using_dma_op_no_double_buffering/aie.mlir similarity index 95% rename from test/ipu-xrt/add_256_using_dma_op_no_double_buffering/aie.mlir rename to test/npu-xrt/add_256_using_dma_op_no_double_buffering/aie.mlir index e44add4a05..89bda05890 100644 --- a/test/ipu-xrt/add_256_using_dma_op_no_double_buffering/aie.mlir +++ b/test/npu-xrt/add_256_using_dma_op_no_double_buffering/aie.mlir @@ -7,7 +7,7 @@ //===----------------------------------------------------------------------===// module { - aie.device(ipu) { + aie.device(npu) { %tile_0_0 = aie.tile(0, 0) %tile_0_1 = aie.tile(0, 1) %tile_0_2 = aie.tile(0, 2) @@ -100,9 +100,9 @@ module { aie.shim_dma_allocation @this_just_creates_a_symbol_and_the_type_means_nothing_in(MM2S, 0, 0) aie.shim_dma_allocation @this_just_creates_a_symbol_and_the_type_means_nothing_out(S2MM, 0, 0) func.func @bobsyouruncle(%arg0: memref<64xi32>, %arg1: memref<32xi32>, %arg2: memref<64xi32>) { - aiex.ipu.dma_memcpy_nd(0, 0, %arg0[0, 0, 0, 0][1, 1, 1, 64][0, 0, 0]) {id = 0 : i64, metadata = @this_just_creates_a_symbol_and_the_type_means_nothing_in} : memref<64xi32> - aiex.ipu.dma_memcpy_nd(0, 0, %arg2[0, 0, 0, 0][1, 1, 1, 64][0, 0, 0]) {id = 1 : i64, metadata = @this_just_creates_a_symbol_and_the_type_means_nothing_out} : memref<64xi32> - aiex.ipu.sync {channel = 0 : i32, column = 0 : i32, column_num = 1 : i32, direction = 0 : i32, row = 0 : i32, row_num = 1 : i32} + aiex.npu.dma_memcpy_nd(0, 0, %arg0[0, 0, 0, 0][1, 1, 1, 64][0, 0, 0]) {id = 0 : i64, metadata = @this_just_creates_a_symbol_and_the_type_means_nothing_in} : memref<64xi32> + aiex.npu.dma_memcpy_nd(0, 0, %arg2[0, 0, 0, 0][1, 1, 1, 64][0, 0, 0]) {id = 1 : i64, metadata = @this_just_creates_a_symbol_and_the_type_means_nothing_out} : memref<64xi32> + aiex.npu.sync {channel = 0 : i32, 
column = 0 : i32, column_num = 1 : i32, direction = 0 : i32, row = 0 : i32, row_num = 1 : i32} return } } diff --git a/test/ipu-xrt/add_256_using_dma_op_no_double_buffering/run.lit b/test/npu-xrt/add_256_using_dma_op_no_double_buffering/run.lit similarity index 70% rename from test/ipu-xrt/add_256_using_dma_op_no_double_buffering/run.lit rename to test/npu-xrt/add_256_using_dma_op_no_double_buffering/run.lit index 5d29ef1058..67cf187967 100644 --- a/test/ipu-xrt/add_256_using_dma_op_no_double_buffering/run.lit +++ b/test/npu-xrt/add_256_using_dma_op_no_double_buffering/run.lit @@ -7,7 +7,7 @@ // RUN: aie-translate --aie-generate-cdo aie.mlir.prj/input_physical.mlir // RUN: cp *.elf aie.mlir.prj/ // RUN: cp *.bin aie.mlir.prj/ -// RUN: %python aiecc.py --no-aiesim --aie-generate-ipu --aie-generate-xclbin --no-compile-host --xclbin-name=aie.xclbin --ipu-insts-name=insts.txt %S/aie.mlir +// RUN: %python aiecc.py --no-aiesim --aie-generate-npu --aie-generate-xclbin --no-compile-host --xclbin-name=aie.xclbin --npu-insts-name=insts.txt %S/aie.mlir // RUN: clang %S/test.cpp -o test.exe -std=c++11 -Wall %xrt_flags -lrt -lstdc++ -// RUN: %run_on_ipu ./test.exe aie.xclbin | FileCheck %s +// RUN: %run_on_npu ./test.exe aie.xclbin | FileCheck %s // CHECK: PASS! 
diff --git a/test/ipu-xrt/add_256_using_dma_op_no_double_buffering/test.cpp b/test/npu-xrt/add_256_using_dma_op_no_double_buffering/test.cpp similarity index 100% rename from test/ipu-xrt/add_256_using_dma_op_no_double_buffering/test.cpp rename to test/npu-xrt/add_256_using_dma_op_no_double_buffering/test.cpp diff --git a/test/ipu-xrt/add_314_using_dma_op/aie.mlir b/test/npu-xrt/add_314_using_dma_op/aie.mlir similarity index 97% rename from test/ipu-xrt/add_314_using_dma_op/aie.mlir rename to test/npu-xrt/add_314_using_dma_op/aie.mlir index 646f263804..37ef98c47c 100644 --- a/test/ipu-xrt/add_314_using_dma_op/aie.mlir +++ b/test/npu-xrt/add_314_using_dma_op/aie.mlir @@ -7,7 +7,7 @@ //===----------------------------------------------------------------------===// module { - aie.device(ipu) { + aie.device(npu) { memref.global "public" @objFifo_in0 : memref<16xi32> memref.global "public" @objFifo_out0 : memref<16xi32> @@ -65,9 +65,9 @@ module { %c0_i64 = arith.constant 0 : i64 %c1_i64 = arith.constant 1 : i64 %c64_i64 = arith.constant 64 : i64 - aiex.ipu.dma_memcpy_nd(0, 0, %arg0[%c0_i64, %c0_i64, %c0_i64, %c0_i64][%c1_i64, %c1_i64, %c1_i64, %c64_i64][%c0_i64, %c0_i64, %c0_i64]) {id = 0 : i64, metadata = @objFifo_in0} : memref<64xi32> - aiex.ipu.dma_memcpy_nd(0, 0, %arg2[%c0_i64, %c0_i64, %c0_i64, %c0_i64][%c1_i64, %c1_i64, %c1_i64, %c64_i64][%c0_i64, %c0_i64, %c0_i64]) {id = 1 : i64, metadata = @objFifo_out0} : memref<64xi32> - aiex.ipu.sync {channel = 0 : i32, column = 0 : i32, column_num = 1 : i32, direction = 0 : i32, row = 0 : i32, row_num = 1 : i32} + aiex.npu.dma_memcpy_nd(0, 0, %arg0[%c0_i64, %c0_i64, %c0_i64, %c0_i64][%c1_i64, %c1_i64, %c1_i64, %c64_i64][%c0_i64, %c0_i64, %c0_i64]) {id = 0 : i64, metadata = @objFifo_in0} : memref<64xi32> + aiex.npu.dma_memcpy_nd(0, 0, %arg2[%c0_i64, %c0_i64, %c0_i64, %c0_i64][%c1_i64, %c1_i64, %c1_i64, %c64_i64][%c0_i64, %c0_i64, %c0_i64]) {id = 1 : i64, metadata = @objFifo_out0} : memref<64xi32> + aiex.npu.sync {channel = 0 : 
i32, column = 0 : i32, column_num = 1 : i32, direction = 0 : i32, row = 0 : i32, row_num = 1 : i32} return } diff --git a/test/ipu-xrt/add_314_using_dma_op/run.lit b/test/npu-xrt/add_314_using_dma_op/run.lit similarity index 70% rename from test/ipu-xrt/add_314_using_dma_op/run.lit rename to test/npu-xrt/add_314_using_dma_op/run.lit index 23c3e076c9..5329b2789e 100644 --- a/test/ipu-xrt/add_314_using_dma_op/run.lit +++ b/test/npu-xrt/add_314_using_dma_op/run.lit @@ -7,8 +7,8 @@ // RUN: aie-translate --aie-generate-cdo aie.mlir.prj/input_physical.mlir // RUN: cp *.elf aie.mlir.prj/ // RUN: cp *.bin aie.mlir.prj/ -// RUN: %python aiecc.py --no-aiesim --aie-generate-ipu --aie-generate-xclbin --no-compile-host --xclbin-name=aie.xclbin --ipu-insts-name=insts.txt %S/aie.mlir +// RUN: %python aiecc.py --no-aiesim --aie-generate-npu --aie-generate-xclbin --no-compile-host --xclbin-name=aie.xclbin --npu-insts-name=insts.txt %S/aie.mlir // RUN: clang %S/test.cpp -o test.exe -std=c++11 -Wall %xrt_flags -lrt -lstdc++ -// RUN: %run_on_ipu ./test.exe aie.xclbin | FileCheck %s +// RUN: %run_on_npu ./test.exe aie.xclbin | FileCheck %s // CHECK: PASS! 
diff --git a/test/ipu-xrt/add_314_using_dma_op/test.cpp b/test/npu-xrt/add_314_using_dma_op/test.cpp similarity index 100% rename from test/ipu-xrt/add_314_using_dma_op/test.cpp rename to test/npu-xrt/add_314_using_dma_op/test.cpp diff --git a/test/ipu-xrt/add_one_objFifo/CMakeLists.txt b/test/npu-xrt/add_one_objFifo/CMakeLists.txt similarity index 96% rename from test/ipu-xrt/add_one_objFifo/CMakeLists.txt rename to test/npu-xrt/add_one_objFifo/CMakeLists.txt index ad13460e2b..c400599ea1 100644 --- a/test/ipu-xrt/add_one_objFifo/CMakeLists.txt +++ b/test/npu-xrt/add_one_objFifo/CMakeLists.txt @@ -22,7 +22,7 @@ if (NOT WSL) else() set(BOOST_ROOT C:/Technical/thirdParty/boost_1_83_0 CACHE STRING "Path to Boost install") set(XRT_INC_DIR C:/Technical/XRT/src/runtime_src/core/include CACHE STRING "Path to XRT cloned repo") - set(XRT_LIB_DIR C:/Technical/xrtIPUfromDLL CACHE STRING "Path to xrt_coreutil.lib") + set(XRT_LIB_DIR C:/Technical/xrtNPUfromDLL CACHE STRING "Path to xrt_coreutil.lib") endif() set(TARGET_NAME test CACHE STRING "Target to be built") diff --git a/test/ipu-xrt/add_one_objFifo/Makefile b/test/npu-xrt/add_one_objFifo/Makefile similarity index 91% rename from test/ipu-xrt/add_one_objFifo/Makefile rename to test/npu-xrt/add_one_objFifo/Makefile index 9fd67f862e..ce9d9338b3 100644 --- a/test/ipu-xrt/add_one_objFifo/Makefile +++ b/test/npu-xrt/add_one_objFifo/Makefile @@ -7,7 +7,7 @@ all: build/final.xclbin build/insts.txt build/final.xclbin: aie.mlir mkdir -p ${@D} cd ${@D} && aiecc.py --aie-generate-cdo --no-compile-host --xclbin-name=${@F} \ - --aie-generate-ipu --ipu-insts-name=insts.txt $(<:%=../%) + --aie-generate-npu --npu-insts-name=insts.txt $(<:%=../%) ${targetname}.exe: test.cpp rm -rf _build diff --git a/test/ipu-xrt/add_one_objFifo/aie.mlir b/test/npu-xrt/add_one_objFifo/aie.mlir similarity index 91% rename from test/ipu-xrt/add_one_objFifo/aie.mlir rename to test/npu-xrt/add_one_objFifo/aie.mlir index 3b55edb0d7..137f8b03d8 100644 --- 
a/test/ipu-xrt/add_one_objFifo/aie.mlir +++ b/test/npu-xrt/add_one_objFifo/aie.mlir @@ -6,7 +6,7 @@ //===----------------------------------------------------------------------===// module { - aie.device(ipu) { + aie.device(npu) { %t00 = aie.tile(0, 0) %t01 = aie.tile(0, 1) %t02 = aie.tile(0, 2) @@ -44,9 +44,9 @@ module { %c0 = arith.constant 0 : i64 %c1 = arith.constant 1 : i64 %c64 = arith.constant 64 : i64 - aiex.ipu.dma_memcpy_nd (0, 0, %out[%c0,%c0,%c0,%c0][%c1,%c1,%c1,%c64][%c0,%c0,%c0]) { metadata = @objFifo_out0, id = 1 : i64 } : memref<64xi32> - aiex.ipu.dma_memcpy_nd (0, 0, %in[%c0,%c0,%c0,%c0][%c1,%c1,%c1,%c64][%c0,%c0,%c0]) { metadata = @objFifo_in0, id = 0 : i64 } : memref<64xi32> - aiex.ipu.sync { column = 0 : i32, row = 0 : i32, direction = 0 : i32, channel = 0 : i32, column_num = 1 : i32, row_num = 1 : i32 } + aiex.npu.dma_memcpy_nd (0, 0, %out[%c0,%c0,%c0,%c0][%c1,%c1,%c1,%c64][%c0,%c0,%c0]) { metadata = @objFifo_out0, id = 1 : i64 } : memref<64xi32> + aiex.npu.dma_memcpy_nd (0, 0, %in[%c0,%c0,%c0,%c0][%c1,%c1,%c1,%c64][%c0,%c0,%c0]) { metadata = @objFifo_in0, id = 0 : i64 } : memref<64xi32> + aiex.npu.sync { column = 0 : i32, row = 0 : i32, direction = 0 : i32, channel = 0 : i32, column_num = 1 : i32, row_num = 1 : i32 } return } } diff --git a/test/ipu-xrt/add_one_objFifo/run.lit b/test/npu-xrt/add_one_objFifo/run.lit similarity index 75% rename from test/ipu-xrt/add_one_objFifo/run.lit rename to test/npu-xrt/add_one_objFifo/run.lit index 632a421a4d..a137e2e4ae 100644 --- a/test/ipu-xrt/add_one_objFifo/run.lit +++ b/test/npu-xrt/add_one_objFifo/run.lit @@ -3,8 +3,8 @@ // // REQUIRES: ryzen_ai // -// RUN: %python aiecc.py --no-aiesim --aie-generate-cdo --aie-generate-ipu --no-compile-host --xclbin-name=aie.xclbin --ipu-insts-name=insts.txt %S/aie.mlir +// RUN: %python aiecc.py --no-aiesim --aie-generate-cdo --aie-generate-npu --no-compile-host --xclbin-name=aie.xclbin --npu-insts-name=insts.txt %S/aie.mlir // RUN: clang %S/test.cpp -o test.exe 
-std=c++11 -Wall %xrt_flags -lrt -lstdc++ -lboost_program_options -lboost_filesystem -// RUN: %run_on_ipu ./test.exe -x aie.xclbin -k MLIR_AIE -i insts.txt | FileCheck %s +// RUN: %run_on_npu ./test.exe -x aie.xclbin -k MLIR_AIE -i insts.txt | FileCheck %s // CHECK: PASS! diff --git a/test/ipu-xrt/add_one_objFifo/run.sh b/test/npu-xrt/add_one_objFifo/run.sh similarity index 100% rename from test/ipu-xrt/add_one_objFifo/run.sh rename to test/npu-xrt/add_one_objFifo/run.sh diff --git a/test/ipu-xrt/add_one_objFifo/test.cpp b/test/npu-xrt/add_one_objFifo/test.cpp similarity index 100% rename from test/ipu-xrt/add_one_objFifo/test.cpp rename to test/npu-xrt/add_one_objFifo/test.cpp diff --git a/test/ipu-xrt/add_one_using_dma/aie.mlir b/test/npu-xrt/add_one_using_dma/aie.mlir similarity index 97% rename from test/ipu-xrt/add_one_using_dma/aie.mlir rename to test/npu-xrt/add_one_using_dma/aie.mlir index 058ae034bc..8647f6b710 100644 --- a/test/ipu-xrt/add_one_using_dma/aie.mlir +++ b/test/npu-xrt/add_one_using_dma/aie.mlir @@ -7,7 +7,7 @@ //===----------------------------------------------------------------------===// module { - aie.device(ipu) { + aie.device(npu) { memref.global "public" @objFifo_in0 : memref<16xi32> memref.global "public" @objFifo_in0_cons : memref<16xi32> memref.global "public" @objFifo_in1 : memref<8xi32> @@ -76,9 +76,9 @@ module { %c0_i64 = arith.constant 0 : i64 %c1_i64 = arith.constant 1 : i64 %c64_i64 = arith.constant 64 : i64 - aiex.ipu.dma_memcpy_nd(0, 0, %arg0[%c0_i64, %c0_i64, %c0_i64, %c0_i64] [%c1_i64, %c1_i64, %c1_i64, %c64_i64] [%c0_i64, %c0_i64, %c0_i64]) {id = 0 : i64, metadata = @objFifo_in0} : memref<64xi32> - aiex.ipu.dma_memcpy_nd(0, 0, %arg2[%c0_i64, %c0_i64, %c0_i64, %c0_i64] [%c1_i64, %c1_i64, %c1_i64, %c64_i64] [%c0_i64, %c0_i64, %c0_i64]) {id = 1 : i64, metadata = @objFifo_out0} : memref<64xi32> - aiex.ipu.sync {channel = 0 : i32, column = 0 : i32, column_num = 1 : i32, direction = 0 : i32, row = 0 : i32, row_num = 1 : i32} + 
aiex.npu.dma_memcpy_nd(0, 0, %arg0[%c0_i64, %c0_i64, %c0_i64, %c0_i64] [%c1_i64, %c1_i64, %c1_i64, %c64_i64] [%c0_i64, %c0_i64, %c0_i64]) {id = 0 : i64, metadata = @objFifo_in0} : memref<64xi32> + aiex.npu.dma_memcpy_nd(0, 0, %arg2[%c0_i64, %c0_i64, %c0_i64, %c0_i64] [%c1_i64, %c1_i64, %c1_i64, %c64_i64] [%c0_i64, %c0_i64, %c0_i64]) {id = 1 : i64, metadata = @objFifo_out0} : memref<64xi32> + aiex.npu.sync {channel = 0 : i32, column = 0 : i32, column_num = 1 : i32, direction = 0 : i32, row = 0 : i32, row_num = 1 : i32} return } diff --git a/test/ipu-xrt/add_one_using_dma/run.lit b/test/npu-xrt/add_one_using_dma/run.lit similarity index 75% rename from test/ipu-xrt/add_one_using_dma/run.lit rename to test/npu-xrt/add_one_using_dma/run.lit index 632a421a4d..a137e2e4ae 100644 --- a/test/ipu-xrt/add_one_using_dma/run.lit +++ b/test/npu-xrt/add_one_using_dma/run.lit @@ -3,8 +3,8 @@ // // REQUIRES: ryzen_ai // -// RUN: %python aiecc.py --no-aiesim --aie-generate-cdo --aie-generate-ipu --no-compile-host --xclbin-name=aie.xclbin --ipu-insts-name=insts.txt %S/aie.mlir +// RUN: %python aiecc.py --no-aiesim --aie-generate-cdo --aie-generate-npu --no-compile-host --xclbin-name=aie.xclbin --npu-insts-name=insts.txt %S/aie.mlir // RUN: clang %S/test.cpp -o test.exe -std=c++11 -Wall %xrt_flags -lrt -lstdc++ -lboost_program_options -lboost_filesystem -// RUN: %run_on_ipu ./test.exe -x aie.xclbin -k MLIR_AIE -i insts.txt | FileCheck %s +// RUN: %run_on_npu ./test.exe -x aie.xclbin -k MLIR_AIE -i insts.txt | FileCheck %s // CHECK: PASS! 
diff --git a/test/ipu-xrt/add_one_using_dma/test.cpp b/test/npu-xrt/add_one_using_dma/test.cpp similarity index 100% rename from test/ipu-xrt/add_one_using_dma/test.cpp rename to test/npu-xrt/add_one_using_dma/test.cpp diff --git a/test/ipu-xrt/cascade_flows/CMakeLists.txt b/test/npu-xrt/cascade_flows/CMakeLists.txt similarity index 96% rename from test/ipu-xrt/cascade_flows/CMakeLists.txt rename to test/npu-xrt/cascade_flows/CMakeLists.txt index 257e7ca075..aafc542dde 100644 --- a/test/ipu-xrt/cascade_flows/CMakeLists.txt +++ b/test/npu-xrt/cascade_flows/CMakeLists.txt @@ -22,7 +22,7 @@ if (NOT WSL) else() set(BOOST_ROOT C:/Technical/thirdParty/boost_1_83_0 CACHE STRING "Path to Boost install") set(XRT_INC_DIR C:/Technical/XRT/src/runtime_src/core/include CACHE STRING "Path to XRT cloned repo") - set(XRT_LIB_DIR C:/Technical/xrtIPUfromDLL CACHE STRING "Path to xrt_coreutil.lib") + set(XRT_LIB_DIR C:/Technical/xrtNPUfromDLL CACHE STRING "Path to xrt_coreutil.lib") endif() set(TARGET_NAME test CACHE STRING "Target to be built") diff --git a/test/ipu-xrt/cascade_flows/Makefile b/test/npu-xrt/cascade_flows/Makefile similarity index 95% rename from test/ipu-xrt/cascade_flows/Makefile rename to test/npu-xrt/cascade_flows/Makefile index 6c88c72d19..ef6b2cf5a0 100644 --- a/test/ipu-xrt/cascade_flows/Makefile +++ b/test/npu-xrt/cascade_flows/Makefile @@ -19,7 +19,7 @@ build/%.o: %.cc build/final.xclbin: aie.mlir build/kernel1.o build/kernel2.o build/kernel3.o mkdir -p ${@D} cd ${@D} && aiecc.py --aie-generate-cdo --no-compile-host --xclbin-name=${@F} \ - --aie-generate-ipu --ipu-insts-name=insts.txt $(<:%=../%) + --aie-generate-npu --npu-insts-name=insts.txt $(<:%=../%) ${targetname}.exe: test.cpp rm -rf _build diff --git a/test/ipu-xrt/cascade_flows/aie.mlir b/test/npu-xrt/cascade_flows/aie.mlir similarity index 92% rename from test/ipu-xrt/cascade_flows/aie.mlir rename to test/npu-xrt/cascade_flows/aie.mlir index 967c3ecedc..e5b98481b5 100644 --- 
a/test/ipu-xrt/cascade_flows/aie.mlir +++ b/test/npu-xrt/cascade_flows/aie.mlir @@ -6,7 +6,7 @@ //===----------------------------------------------------------------------===// module { - aie.device(ipu) { + aie.device(npu) { %t00 = aie.tile(0, 0) %t01 = aie.tile(0, 1) %t03 = aie.tile(0, 3) @@ -60,9 +60,9 @@ module { %c0 = arith.constant 0 : i64 %c1 = arith.constant 1 : i64 %c64 = arith.constant 64 : i64 - aiex.ipu.dma_memcpy_nd (0, 0, %out[%c0,%c0,%c0,%c0][%c1,%c1,%c1,%c64][%c0,%c0,%c0]) { metadata = @objFifo_out0, id = 1 : i64 } : memref<64xi32> - aiex.ipu.dma_memcpy_nd (0, 0, %in[%c0,%c0,%c0,%c0][%c1,%c1,%c1,%c64][%c0,%c0,%c0]) { metadata = @objFifo_in0, id = 0 : i64 } : memref<64xi32> - aiex.ipu.sync { column = 0 : i32, row = 0 : i32, direction = 0 : i32, channel = 0 : i32, column_num = 1 : i32, row_num = 1 : i32 } + aiex.npu.dma_memcpy_nd (0, 0, %out[%c0,%c0,%c0,%c0][%c1,%c1,%c1,%c64][%c0,%c0,%c0]) { metadata = @objFifo_out0, id = 1 : i64 } : memref<64xi32> + aiex.npu.dma_memcpy_nd (0, 0, %in[%c0,%c0,%c0,%c0][%c1,%c1,%c1,%c64][%c0,%c0,%c0]) { metadata = @objFifo_in0, id = 0 : i64 } : memref<64xi32> + aiex.npu.sync { column = 0 : i32, row = 0 : i32, direction = 0 : i32, channel = 0 : i32, column_num = 1 : i32, row_num = 1 : i32 } return } } diff --git a/test/ipu-xrt/cascade_flows/kernel1.cc b/test/npu-xrt/cascade_flows/kernel1.cc similarity index 100% rename from test/ipu-xrt/cascade_flows/kernel1.cc rename to test/npu-xrt/cascade_flows/kernel1.cc diff --git a/test/ipu-xrt/cascade_flows/kernel2.cc b/test/npu-xrt/cascade_flows/kernel2.cc similarity index 100% rename from test/ipu-xrt/cascade_flows/kernel2.cc rename to test/npu-xrt/cascade_flows/kernel2.cc diff --git a/test/ipu-xrt/cascade_flows/kernel3.cc b/test/npu-xrt/cascade_flows/kernel3.cc similarity index 100% rename from test/ipu-xrt/cascade_flows/kernel3.cc rename to test/npu-xrt/cascade_flows/kernel3.cc diff --git a/test/ipu-xrt/cascade_flows/run.lit b/test/npu-xrt/cascade_flows/run.lit similarity index 
83% rename from test/ipu-xrt/cascade_flows/run.lit rename to test/npu-xrt/cascade_flows/run.lit index 6581a3a212..c3b2945605 100644 --- a/test/ipu-xrt/cascade_flows/run.lit +++ b/test/npu-xrt/cascade_flows/run.lit @@ -6,7 +6,7 @@ // RUN: xchesscc_wrapper aie2 -I %aietools/include -c %S/kernel1.cc -o ./kernel1.o // RUN: xchesscc_wrapper aie2 -I %aietools/include -c %S/kernel2.cc -o ./kernel2.o // RUN: xchesscc_wrapper aie2 -I %aietools/include -c %S/kernel3.cc -o ./kernel3.o -// RUN: %python aiecc.py --no-aiesim --aie-generate-cdo --aie-generate-ipu --no-compile-host --xclbin-name=aie.xclbin --ipu-insts-name=insts.txt %S/aie.mlir +// RUN: %python aiecc.py --no-aiesim --aie-generate-cdo --aie-generate-npu --no-compile-host --xclbin-name=aie.xclbin --npu-insts-name=insts.txt %S/aie.mlir // RUN: clang %S/test.cpp -o test.exe -std=c++11 -Wall %xrt_flags -lrt -lstdc++ -lboost_program_options -lboost_filesystem -// RUN: %run_on_ipu ./test.exe -x aie.xclbin -k MLIR_AIE -i insts.txt | FileCheck %s +// RUN: %run_on_npu ./test.exe -x aie.xclbin -k MLIR_AIE -i insts.txt | FileCheck %s // CHECK: PASS! 
diff --git a/test/ipu-xrt/cascade_flows/test.cpp b/test/npu-xrt/cascade_flows/test.cpp similarity index 100% rename from test/ipu-xrt/cascade_flows/test.cpp rename to test/npu-xrt/cascade_flows/test.cpp diff --git a/test/ipu-xrt/e2e/conftest.py b/test/npu-xrt/e2e/conftest.py similarity index 96% rename from test/ipu-xrt/e2e/conftest.py rename to test/npu-xrt/e2e/conftest.py index 7e27c4421b..c2f981380f 100644 --- a/test/ipu-xrt/e2e/conftest.py +++ b/test/npu-xrt/e2e/conftest.py @@ -8,7 +8,7 @@ @pytest.fixture(autouse=True) def run_around_tests(): subprocess.check_call( - [str(Path(__file__).parent.parent.parent.parent / "utils" / "reset_ipu.sh")] + [str(Path(__file__).parent.parent.parent.parent / "utils" / "reset_npu.sh")] ) yield diff --git a/test/ipu-xrt/e2e/lit.local.cfg b/test/npu-xrt/e2e/lit.local.cfg similarity index 100% rename from test/ipu-xrt/e2e/lit.local.cfg rename to test/npu-xrt/e2e/lit.local.cfg diff --git a/test/ipu-xrt/e2e/pytest.ini b/test/npu-xrt/e2e/pytest.ini similarity index 100% rename from test/ipu-xrt/e2e/pytest.ini rename to test/npu-xrt/e2e/pytest.ini diff --git a/test/ipu-xrt/e2e/run_all_tests_one_by_one.sh b/test/npu-xrt/e2e/run_all_tests_one_by_one.sh similarity index 100% rename from test/ipu-xrt/e2e/run_all_tests_one_by_one.sh rename to test/npu-xrt/e2e/run_all_tests_one_by_one.sh diff --git a/test/ipu-xrt/e2e/test_add_256_using_dma_op_no_double_buffering.py b/test/npu-xrt/e2e/test_add_256_using_dma_op_no_double_buffering.py similarity index 93% rename from test/ipu-xrt/e2e/test_add_256_using_dma_op_no_double_buffering.py rename to test/npu-xrt/e2e/test_add_256_using_dma_op_no_double_buffering.py index 8af3ee9cf5..fc57e8b0f1 100644 --- a/test/ipu-xrt/e2e/test_add_256_using_dma_op_no_double_buffering.py +++ b/test/npu-xrt/e2e/test_add_256_using_dma_op_no_double_buffering.py @@ -8,7 +8,7 @@ from pathlib import Path import random -from aie.compiler.aiecc.main import DMA_TO_IPU +from aie.compiler.aiecc.main import DMA_TO_NPU from 
aie.compiler.util import compile_without_vectorization, make_xclbin from aie.dialects import aie, aiex from aie.dialects.aie import ( @@ -16,7 +16,7 @@ DMAChannelDir, LockAction, WireBundle, - ipu_instgen, + npu_instgen, ) from aie.dialects.scf import for_ as range_, yield_ from aie.extras.dialects.ext import arith, func, memref @@ -47,8 +47,8 @@ def test_add_256_using_dma_op_no_double_buffering(ctx: MLIRContext, workdir: Pat LEN = 128 LOCAL_MEM_SIZE = 32 - @aie.device(AIEDevice.ipu) - def ipu(): + @aie.device(AIEDevice.npu) + def npu(): tile_0_0 = aie.tile(0, 0) tile_0_1 = aie.tile(0, 1) tile_0_2 = aie.tile(0, 2) @@ -115,7 +115,7 @@ def bobsyouruncle( _arg1: T.memref(1, T.i32()), arg2: T.memref(LEN, T.i32()), ): - aiex.ipu_dma_memcpy_nd( + aiex.npu_dma_memcpy_nd( this_is_meaningless_1.sym_name.value, 0, arg0, @@ -123,7 +123,7 @@ def bobsyouruncle( [1, 1, 1, LEN], [0, 0, 0], ) - aiex.ipu_dma_memcpy_nd( + aiex.npu_dma_memcpy_nd( this_is_meaningless_2.sym_name.value, 1, arg2, @@ -132,7 +132,7 @@ def bobsyouruncle( [0, 0, 0], ) - aiex.ipu_sync( + aiex.npu_sync( channel=0, column=0, column_num=1, direction=0, row=0, row_num=1 ) @@ -188,12 +188,12 @@ def dma2(): aie.end() compile_without_vectorization(ctx.module, workdir) - generated_ipu_insts = run_pipeline(ctx.module, DMA_TO_IPU) - ipu_insts = [int(inst, 16) for inst in ipu_instgen(generated_ipu_insts.operation)] + generated_npu_insts = run_pipeline(ctx.module, DMA_TO_NPU) + npu_insts = [int(inst, 16) for inst in npu_instgen(generated_npu_insts.operation)] xclbin_path = make_xclbin(ctx.module, workdir) - with FileLock("/tmp/ipu.lock"): + with FileLock("/tmp/npu.lock"): xclbin = XCLBin(xclbin_path, "MLIR_AIE") - xclbin.load_ipu_instructions(ipu_insts) + xclbin.load_npu_instructions(npu_insts) views = xclbin.mmap_buffers([(LEN,), (LEN,), (LEN,)], np.int32) wrap_A = np.asarray(views[0]) diff --git a/test/ipu-xrt/e2e/test_locks.py b/test/npu-xrt/e2e/test_locks.py similarity index 93% rename from 
test/ipu-xrt/e2e/test_locks.py rename to test/npu-xrt/e2e/test_locks.py index 3f50bf1da6..4ecc07a095 100644 --- a/test/ipu-xrt/e2e/test_locks.py +++ b/test/npu-xrt/e2e/test_locks.py @@ -42,10 +42,10 @@ def test_one_global(ctx: MLIRContext, workdir: Path): iv = np.random.randint(0, 10, (K,), dtype=np.int32) column = 2 - ipu_insts = aiex.ipu.get_prolog() + npu_insts = aiex.npu.get_prolog() - @aie.device(AIEDevice.ipu) - def ipu(): + @aie.device(AIEDevice.npu) + def npu(): # TODO(max): figure this annoying thing out... if column != 0: _dummy_tile = aie.tile(0, 2) @@ -112,8 +112,8 @@ def memtile_dma(): ddr_id = 0 bd_id = 0 - ipu_insts.extend( - aiex.ipu.writebd_shimtile( + npu_insts.extend( + aiex.npu.writebd_shimtile( column=column, bd_id=bd_id, buffer_length=K, @@ -121,16 +121,16 @@ def memtile_dma(): ddr_id=ddr_id, ) ) - ipu_insts.extend( - aiex.ipu.shimtile_push_queue( + npu_insts.extend( + aiex.npu.shimtile_push_queue( channel_dir=S2MM, channel_index=flow_to_shim.dest_channel, column=column, bd_id=bd_id, ) ) - ipu_insts.extend( - aiex.ipu.sync( + npu_insts.extend( + aiex.npu.sync( channel=flow_to_shim.dest_channel, column=column, direction=0, @@ -140,9 +140,9 @@ def memtile_dma(): compile_without_vectorization(ctx.module, workdir) xclbin_path = make_xclbin(ctx.module, workdir) - with FileLock("/tmp/ipu.lock"): + with FileLock("/tmp/npu.lock"): xclbin = XCLBin(xclbin_path, "MLIR_AIE") - xclbin.load_ipu_instructions(ipu_insts) + xclbin.load_npu_instructions(npu_insts) [c] = xclbin.mmap_buffers([(K,)], np.int32) wrap_C = np.asarray(c) C = np.zeros((K,), dtype=np.int32) @@ -166,10 +166,10 @@ def test_threesome(ctx: MLIRContext, workdir: Path): iv1 = np.random.randint(0, 10, (K,), dtype=np.int32) iv2 = np.random.randint(0, 10, (K,), dtype=np.int32) - ipu_insts = aiex.ipu.get_prolog() + npu_insts = aiex.npu.get_prolog() - @aie.device(AIEDevice.ipu) - def ipu(): + @aie.device(AIEDevice.npu) + def npu(): _dummy_tile = aie.tile(0, 2) tile_1_2 = aie.tile(1, 2) 
global_weight_1_2 = memref.global_(initial_value=iv1) @@ -249,8 +249,8 @@ def memtile_dma(): ddr_id = 0 bd_id = 0 - ipu_insts.extend( - aiex.ipu.writebd_shimtile( + npu_insts.extend( + aiex.npu.writebd_shimtile( column=shim_tile_column, bd_id=bd_id, buffer_length=K, @@ -258,16 +258,16 @@ def memtile_dma(): ddr_id=ddr_id, ) ) - ipu_insts.extend( - aiex.ipu.shimtile_push_queue( + npu_insts.extend( + aiex.npu.shimtile_push_queue( channel_dir=S2MM, channel_index=flow_to_shim.dest_channel, column=shim_tile_column, bd_id=bd_id, ) ) - ipu_insts.extend( - aiex.ipu.sync( + npu_insts.extend( + aiex.npu.sync( channel=flow_to_shim.dest_channel, column=shim_tile_column, direction=0, @@ -277,9 +277,9 @@ def memtile_dma(): compile_without_vectorization(ctx.module, workdir) xclbin_path = make_xclbin(ctx.module, workdir) - with FileLock("/tmp/ipu.lock"): + with FileLock("/tmp/npu.lock"): xclbin = XCLBin(xclbin_path, "MLIR_AIE") - xclbin.load_ipu_instructions(ipu_insts) + xclbin.load_npu_instructions(npu_insts) [c] = xclbin.mmap_buffers([(K,)], np.int32) wrap_C = np.asarray(c) C = np.zeros((K,), dtype=np.int32) @@ -305,10 +305,10 @@ def test_foursome(ctx: MLIRContext, workdir: Path): iv2 = np.random.randint(0, 10, (K,), dtype=np.int32) iv3 = np.random.randint(0, 10, (K,), dtype=np.int32) - ipu_insts = aiex.ipu.get_prolog() + npu_insts = aiex.npu.get_prolog() - @aie.device(AIEDevice.ipu) - def ipu(): + @aie.device(AIEDevice.npu) + def npu(): _dummy_tile = aie.tile(0, 2) tile_1_3 = aie.tile(1, 3) @@ -407,8 +407,8 @@ def memtile_dma(): ddr_id = 0 bd_id = 0 - ipu_insts.extend( - aiex.ipu.writebd_shimtile( + npu_insts.extend( + aiex.npu.writebd_shimtile( column=shim_tile_column, bd_id=bd_id, buffer_length=K, @@ -416,16 +416,16 @@ def memtile_dma(): ddr_id=ddr_id, ) ) - ipu_insts.extend( - aiex.ipu.shimtile_push_queue( + npu_insts.extend( + aiex.npu.shimtile_push_queue( channel_dir=S2MM, channel_index=flow_to_shim.dest_channel, column=shim_tile_column, bd_id=bd_id, ) ) - ipu_insts.extend( 
- aiex.ipu.sync( + npu_insts.extend( + aiex.npu.sync( channel=flow_to_shim.dest_channel, column=shim_tile_column, direction=0, @@ -435,9 +435,9 @@ def memtile_dma(): compile_without_vectorization(ctx.module, workdir) xclbin_path = make_xclbin(ctx.module, workdir) - with FileLock("/tmp/ipu.lock"): + with FileLock("/tmp/npu.lock"): xclbin = XCLBin(xclbin_path, "MLIR_AIE") - xclbin.load_ipu_instructions(ipu_insts) + xclbin.load_npu_instructions(npu_insts) [c] = xclbin.mmap_buffers([(K,)], np.int32) wrap_C = np.asarray(c) C = np.zeros((K,), dtype=np.int32) diff --git a/test/ipu-xrt/e2e/test_manual_dpu_args.py b/test/npu-xrt/e2e/test_manual_dpu_args.py similarity index 88% rename from test/ipu-xrt/e2e/test_manual_dpu_args.py rename to test/npu-xrt/e2e/test_manual_dpu_args.py index 3016384071..9b372e439e 100644 --- a/test/ipu-xrt/e2e/test_manual_dpu_args.py +++ b/test/npu-xrt/e2e/test_manual_dpu_args.py @@ -54,8 +54,8 @@ def test_manual_args(ctx: MLIRContext, workdir: Path): iters = 10 loop = False - @aie.device(AIEDevice.ipu) - def ipu(): + @aie.device(AIEDevice.npu) + def npu(): tile_0_0 = aie.tile(0, 0) tile_0_1 = aie.tile(0, 1) tile_0_2 = aie.tile(0, 2) @@ -115,29 +115,29 @@ def dma6(): kernel_json = emit_design_kernel_json(buffer_args=buffer_args) xclbin_path = make_xclbin(ctx.module, workdir, kernel_json=kernel_json) - with FileLock("/tmp/ipu.lock"): + with FileLock("/tmp/npu.lock"): xclbin = XCLBin(xclbin_path, "MLIR_AIE") views = xclbin.mmap_buffers([(K,)] * iters, np.int32) col = 0 channel_index = 0 - ipu_insts = aiex.ipu.get_prolog() + npu_insts = aiex.npu.get_prolog() for bd_id in range(iters): - writebd_shimtile_insts = aiex.ipu.writebd_shimtile( + writebd_shimtile_insts = aiex.npu.writebd_shimtile( col, bd_id, buffer_length=K ) - ipu_insts.extend( - aiex.ipu._exec_write_bd_extend_shim_tile_opt( + npu_insts.extend( + aiex.npu._exec_write_bd_extend_shim_tile_opt( writebd_shimtile_insts, tensor_addr=xclbin._get_buffer_host_address(bd_id), ) ) - 
ipu_insts.extend( - aiex.ipu.shimtile_push_queue(S2MM, channel_index, col, bd_id) + npu_insts.extend( + aiex.npu.shimtile_push_queue(S2MM, channel_index, col, bd_id) ) - ipu_insts.extend(aiex.ipu.sync(column=col)) + npu_insts.extend(aiex.npu.sync(column=col)) - xclbin.load_ipu_instructions(ipu_insts) + xclbin.load_npu_instructions(npu_insts) wraps = list(map(np.asarray, views)) @@ -161,8 +161,8 @@ def test_manual_args_with_offset(ctx: MLIRContext, workdir: Path): iters = 10 loop = False - @aie.device(AIEDevice.ipu) - def ipu(): + @aie.device(AIEDevice.npu) + def npu(): tile_0_0 = aie.tile(0, 0) tile_0_1 = aie.tile(0, 1) tile_0_2 = aie.tile(0, 2) @@ -222,30 +222,30 @@ def dma6(): kernel_json = emit_design_kernel_json(buffer_args=buffer_args) xclbin_path = make_xclbin(ctx.module, workdir, kernel_json=kernel_json) - with FileLock("/tmp/ipu.lock"): + with FileLock("/tmp/npu.lock"): xclbin = XCLBin(xclbin_path, "MLIR_AIE") views = xclbin.mmap_buffers([(K * iters,)] * iters, np.int32) col = 0 channel_index = 0 - ipu_insts = aiex.ipu.get_prolog() + npu_insts = aiex.npu.get_prolog() for i in range(iters): bd_id = i - writebd_shimtile_insts = aiex.ipu.writebd_shimtile( + writebd_shimtile_insts = aiex.npu.writebd_shimtile( col, bd_id, buffer_length=K, buffer_offset=K * i ) - ipu_insts.extend( - aiex.ipu._exec_write_bd_extend_shim_tile_opt( + npu_insts.extend( + aiex.npu._exec_write_bd_extend_shim_tile_opt( writebd_shimtile_insts, tensor_addr=xclbin._get_buffer_host_address(i), ) ) - ipu_insts.extend( - aiex.ipu.shimtile_push_queue(S2MM, channel_index, col, bd_id) + npu_insts.extend( + aiex.npu.shimtile_push_queue(S2MM, channel_index, col, bd_id) ) - ipu_insts.extend(aiex.ipu.sync(column=col)) + npu_insts.extend(aiex.npu.sync(column=col)) - xclbin.load_ipu_instructions(ipu_insts) + xclbin.load_npu_instructions(npu_insts) wraps = list(map(np.asarray, views)) @@ -268,8 +268,8 @@ def test_manual_args_with_different_cols(ctx: MLIRContext, workdir: Path): RANDOM_WEIGHT = 
np.random.randint(0, 10, (K,), dtype=np.int32) cols = [0, 1, 2, 3] - @aie.device(AIEDevice.ipu) - def ipu(): + @aie.device(AIEDevice.npu) + def npu(): for c in cols: tile_c_0 = aie.tile(c, 0) tile_c_2 = aie.tile(c, 2) @@ -306,29 +306,29 @@ def dma3(): kernel_json = emit_design_kernel_json(buffer_args=buffer_args) xclbin_path = make_xclbin(ctx.module, workdir, kernel_json=kernel_json) - with FileLock("/tmp/ipu.lock"): + with FileLock("/tmp/npu.lock"): xclbin = XCLBin(xclbin_path, "MLIR_AIE") views = xclbin.mmap_buffers([(K,)] * len(cols), np.int32) bd_id = 0 channel_index = 0 - ipu_insts = aiex.ipu.get_prolog() + npu_insts = aiex.npu.get_prolog() for col in cols: - writebd_shimtile_insts = aiex.ipu.writebd_shimtile( + writebd_shimtile_insts = aiex.npu.writebd_shimtile( col, bd_id, buffer_length=K ) - ipu_insts.extend( - aiex.ipu._exec_write_bd_extend_shim_tile_opt( + npu_insts.extend( + aiex.npu._exec_write_bd_extend_shim_tile_opt( writebd_shimtile_insts, tensor_addr=xclbin._get_buffer_host_address(col), ) ) - ipu_insts.extend( - aiex.ipu.shimtile_push_queue(S2MM, channel_index, col, bd_id) + npu_insts.extend( + aiex.npu.shimtile_push_queue(S2MM, channel_index, col, bd_id) ) - ipu_insts.extend(aiex.ipu.sync(column=col)) + npu_insts.extend(aiex.npu.sync(column=col)) - xclbin.load_ipu_instructions(ipu_insts) + xclbin.load_npu_instructions(npu_insts) wraps = list(map(np.asarray, views)) @@ -353,8 +353,8 @@ def test_manual_args_with_shim_dma(ctx: MLIRContext, workdir: Path): iters = 21 - @aie.device(AIEDevice.ipu) - def ipu(): + @aie.device(AIEDevice.npu) + def npu(): if 0 not in cols: tile_dummy = aie.tile(0, 3) for c in cols: @@ -408,20 +408,20 @@ def dma(): kernel_json = emit_design_kernel_json(buffer_args=buffer_args) xclbin_path = make_xclbin(ctx.module, workdir, kernel_json=kernel_json) - with FileLock("/tmp/ipu.lock"): + with FileLock("/tmp/npu.lock"): xclbin = XCLBin(xclbin_path, "MLIR_AIE") views = xclbin.mmap_buffers([(K,)] * len(cols), np.int32) bd_id = 0 - 
ipu_insts = aiex.ipu.get_prolog() + npu_insts = aiex.npu.get_prolog() for i, col in enumerate(cols): - update_addrs = aiex.ipu._update_tensor_addr_shim_tile( + update_addrs = aiex.npu._update_tensor_addr_shim_tile( col, bd_id, tensor_addr=xclbin._get_buffer_host_address(i) ) - ipu_insts.extend(update_addrs) - ipu_insts.extend(aiex.ipu.enable_cores(col, compute_tile_row)) + npu_insts.extend(update_addrs) + npu_insts.extend(aiex.npu.enable_cores(col, compute_tile_row)) - xclbin.load_ipu_instructions(ipu_insts) + xclbin.load_npu_instructions(npu_insts) wraps = list(map(np.asarray, views)) diff --git a/test/ipu-xrt/e2e/test_nonsquare_matrix_mult.py b/test/npu-xrt/e2e/test_nonsquare_matrix_mult.py similarity index 91% rename from test/ipu-xrt/e2e/test_nonsquare_matrix_mult.py rename to test/npu-xrt/e2e/test_nonsquare_matrix_mult.py index 20c5998709..0489b46381 100644 --- a/test/ipu-xrt/e2e/test_nonsquare_matrix_mult.py +++ b/test/npu-xrt/e2e/test_nonsquare_matrix_mult.py @@ -45,10 +45,10 @@ def test_nonsquare_matrix_mult(ctx: MLIRContext, workdir: Path): M, K, N = 16, 32, 16 - ipu_insts = aiex.ipu.get_prolog() + npu_insts = aiex.npu.get_prolog() - @aie.device(AIEDevice.ipu) - def ipu(): + @aie.device(AIEDevice.npu) + def npu(): tile_0_0 = aie.tile(0, 0) tile_0_1 = aie.tile(0, 1) tile_0_2 = aie.tile(0, 2) @@ -91,8 +91,8 @@ def ipu(): channel_index = 0 ddr_id = 0 bd_id = 0 - ipu_insts.extend( - aiex.ipu.writebd_shimtile( + npu_insts.extend( + aiex.npu.writebd_shimtile( col, bd_id, buffer_length=M * K, @@ -100,14 +100,14 @@ def ipu(): ddr_id=ddr_id, ) ) - ipu_insts.extend(aiex.ipu.shimtile_push_queue(MM2S, channel_index, col, bd_id)) + npu_insts.extend(aiex.npu.shimtile_push_queue(MM2S, channel_index, col, bd_id)) # in B channel_index = 1 ddr_id = 1 bd_id += 1 - ipu_insts.extend( - aiex.ipu.writebd_shimtile( + npu_insts.extend( + aiex.npu.writebd_shimtile( col, bd_id, buffer_length=K * N, @@ -115,14 +115,14 @@ def ipu(): ddr_id=ddr_id, ) ) - 
ipu_insts.extend(aiex.ipu.shimtile_push_queue(MM2S, channel_index, col, bd_id)) + npu_insts.extend(aiex.npu.shimtile_push_queue(MM2S, channel_index, col, bd_id)) # out C channel_index = 0 ddr_id = 2 bd_id += 1 - ipu_insts.extend( - aiex.ipu.writebd_shimtile( + npu_insts.extend( + aiex.npu.writebd_shimtile( col, bd_id, buffer_length=M * N, @@ -130,9 +130,9 @@ def ipu(): ddr_id=ddr_id, ) ) - ipu_insts.extend(aiex.ipu.shimtile_push_queue(S2MM, channel_index, col, bd_id)) - ipu_insts.extend( - aiex.ipu.sync( + npu_insts.extend(aiex.npu.shimtile_push_queue(S2MM, channel_index, col, bd_id)) + npu_insts.extend( + aiex.npu.sync( channel=0, column=0, column_num=1, @@ -235,9 +235,9 @@ def core(): compile_without_vectorization(ctx.module, workdir) xclbin_path = make_xclbin(ctx.module, workdir) - with FileLock("/tmp/ipu.lock"): + with FileLock("/tmp/npu.lock"): xclbin = XCLBin(xclbin_path, "MLIR_AIE") - xclbin.load_ipu_instructions(ipu_insts) + xclbin.load_npu_instructions(npu_insts) views = xclbin.mmap_buffers([(M, K), (K, N), (M, N)], np.int32) wrap_A = np.asarray(views[0]) @@ -268,10 +268,10 @@ def core(): def test_nonsquare_matrix_mult_sugar(ctx: MLIRContext, workdir: Path): M, K, N = 16, 32, 16 - ipu_insts = aiex.ipu.get_prolog() + npu_insts = aiex.npu.get_prolog() - @aie.device(AIEDevice.ipu) - def ipu(): + @aie.device(AIEDevice.npu) + def npu(): tile_0_0 = aie.tile(0, 0) tile_0_1 = aie.tile(0, 1) tile_0_2 = aie.tile(0, 2) @@ -309,8 +309,8 @@ def ipu(): channel_index = 0 ddr_id = 0 bd_id = 0 - ipu_insts.extend( - aiex.ipu.writebd_shimtile( + npu_insts.extend( + aiex.npu.writebd_shimtile( col, bd_id, buffer_length=M * K, @@ -318,14 +318,14 @@ def ipu(): ddr_id=ddr_id, ) ) - ipu_insts.extend(aiex.ipu.shimtile_push_queue(MM2S, channel_index, col, bd_id)) + npu_insts.extend(aiex.npu.shimtile_push_queue(MM2S, channel_index, col, bd_id)) # in B channel_index = 1 ddr_id = 1 bd_id += 1 - ipu_insts.extend( - aiex.ipu.writebd_shimtile( + npu_insts.extend( + 
aiex.npu.writebd_shimtile( col, bd_id, buffer_length=K * N, @@ -333,14 +333,14 @@ def ipu(): ddr_id=ddr_id, ) ) - ipu_insts.extend(aiex.ipu.shimtile_push_queue(MM2S, channel_index, col, bd_id)) + npu_insts.extend(aiex.npu.shimtile_push_queue(MM2S, channel_index, col, bd_id)) # out C channel_index = 0 ddr_id = 2 bd_id += 1 - ipu_insts.extend( - aiex.ipu.writebd_shimtile( + npu_insts.extend( + aiex.npu.writebd_shimtile( col, bd_id, buffer_length=M * N, @@ -348,9 +348,9 @@ def ipu(): ddr_id=ddr_id, ) ) - ipu_insts.extend(aiex.ipu.shimtile_push_queue(S2MM, channel_index, col, bd_id)) - ipu_insts.extend( - aiex.ipu.sync( + npu_insts.extend(aiex.npu.shimtile_push_queue(S2MM, channel_index, col, bd_id)) + npu_insts.extend( + aiex.npu.sync( channel=0, column=0, column_num=1, @@ -417,9 +417,9 @@ def core(): compile_without_vectorization(ctx.module, workdir) xclbin_path = make_xclbin(ctx.module, workdir) - with FileLock("/tmp/ipu.lock"): + with FileLock("/tmp/npu.lock"): xclbin = XCLBin(xclbin_path, "MLIR_AIE") - xclbin.load_ipu_instructions(ipu_insts) + xclbin.load_npu_instructions(npu_insts) views = xclbin.mmap_buffers([(M, K), (K, N), (M, N)], np.int32) wrap_A = np.asarray(views[0]) diff --git a/test/ipu-xrt/e2e/test_nonsquare_matrix_mult_vectorized.py b/test/npu-xrt/e2e/test_nonsquare_matrix_mult_vectorized.py similarity index 94% rename from test/ipu-xrt/e2e/test_nonsquare_matrix_mult_vectorized.py rename to test/npu-xrt/e2e/test_nonsquare_matrix_mult_vectorized.py index ae1079fd4e..816ddba151 100644 --- a/test/ipu-xrt/e2e/test_nonsquare_matrix_mult_vectorized.py +++ b/test/npu-xrt/e2e/test_nonsquare_matrix_mult_vectorized.py @@ -64,11 +64,11 @@ def matmul_i32_i32( def test_nonsquare_matrix_mult_vectorized(ctx: MLIRContext, workdir: Path): - ipu_insts = aiex.ipu.get_prolog() + npu_insts = aiex.npu.get_prolog() mod_aie = ExplicitlyManagedModule() - @aie.device(AIEDevice.ipu) - def ipu(): + @aie.device(AIEDevice.npu) + def npu(): matmul_i32_i32.emit(decl=True) tile_0_0 = 
aie.tile(0, 0) tile_0_1 = aie.tile(0, 1) @@ -112,8 +112,8 @@ def ipu(): channel_index = 0 ddr_id = 0 bd_id = 0 - ipu_insts.extend( - aiex.ipu.writebd_shimtile( + npu_insts.extend( + aiex.npu.writebd_shimtile( col, bd_id, buffer_length=M * K, @@ -121,14 +121,14 @@ def ipu(): ddr_id=ddr_id, ) ) - ipu_insts.extend(aiex.ipu.shimtile_push_queue(MM2S, channel_index, col, bd_id)) + npu_insts.extend(aiex.npu.shimtile_push_queue(MM2S, channel_index, col, bd_id)) # in B channel_index = 1 ddr_id = 1 bd_id += 1 - ipu_insts.extend( - aiex.ipu.writebd_shimtile( + npu_insts.extend( + aiex.npu.writebd_shimtile( col, bd_id, buffer_length=K * N, @@ -136,14 +136,14 @@ def ipu(): ddr_id=ddr_id, ) ) - ipu_insts.extend(aiex.ipu.shimtile_push_queue(MM2S, channel_index, col, bd_id)) + npu_insts.extend(aiex.npu.shimtile_push_queue(MM2S, channel_index, col, bd_id)) # out C channel_index = 0 ddr_id = 2 bd_id += 1 - ipu_insts.extend( - aiex.ipu.writebd_shimtile( + npu_insts.extend( + aiex.npu.writebd_shimtile( col, bd_id, buffer_length=M * N, @@ -151,9 +151,9 @@ def ipu(): ddr_id=ddr_id, ) ) - ipu_insts.extend(aiex.ipu.shimtile_push_queue(S2MM, channel_index, col, bd_id)) - ipu_insts.extend( - aiex.ipu.sync( + npu_insts.extend(aiex.npu.shimtile_push_queue(S2MM, channel_index, col, bd_id)) + npu_insts.extend( + aiex.npu.sync( channel=0, column=0, column_num=1, @@ -327,9 +327,9 @@ def super_vectorize(target: any_op_t()): compile_with_vectorization(mod_aie, mod_aievec, workdir) xclbin_path = make_xclbin(mod_aie, workdir) - with FileLock("/tmp/ipu.lock"): + with FileLock("/tmp/npu.lock"): xclbin = XCLBin(xclbin_path, "MLIR_AIE") - xclbin.load_ipu_instructions(ipu_insts) + xclbin.load_npu_instructions(npu_insts) views = xclbin.mmap_buffers([(M, K), (K, N), (M, N)], np.int32) wrap_A = np.asarray(views[0]) @@ -359,11 +359,11 @@ def super_vectorize(target: any_op_t()): def test_nonsquare_matrix_mult_vectorized_sugar(ctx: MLIRContext, workdir: Path): - ipu_insts = aiex.ipu.get_prolog() + npu_insts = 
aiex.npu.get_prolog() mod_aie = ExplicitlyManagedModule() - @aie.device(AIEDevice.ipu) - def ipu(): + @aie.device(AIEDevice.npu) + def npu(): matmul_i32_i32.emit(decl=True) tile_0_0 = aie.tile(0, 0) tile_0_1 = aie.tile(0, 1) @@ -402,8 +402,8 @@ def ipu(): channel_index = 0 ddr_id = 0 bd_id = 0 - ipu_insts.extend( - aiex.ipu.writebd_shimtile( + npu_insts.extend( + aiex.npu.writebd_shimtile( col, bd_id, buffer_length=M * K, @@ -411,14 +411,14 @@ def ipu(): ddr_id=ddr_id, ) ) - ipu_insts.extend(aiex.ipu.shimtile_push_queue(MM2S, channel_index, col, bd_id)) + npu_insts.extend(aiex.npu.shimtile_push_queue(MM2S, channel_index, col, bd_id)) # in B channel_index = 1 ddr_id = 1 bd_id += 1 - ipu_insts.extend( - aiex.ipu.writebd_shimtile( + npu_insts.extend( + aiex.npu.writebd_shimtile( col, bd_id, buffer_length=K * N, @@ -426,14 +426,14 @@ def ipu(): ddr_id=ddr_id, ) ) - ipu_insts.extend(aiex.ipu.shimtile_push_queue(MM2S, channel_index, col, bd_id)) + npu_insts.extend(aiex.npu.shimtile_push_queue(MM2S, channel_index, col, bd_id)) # out C channel_index = 0 ddr_id = 2 bd_id += 1 - ipu_insts.extend( - aiex.ipu.writebd_shimtile( + npu_insts.extend( + aiex.npu.writebd_shimtile( col, bd_id, buffer_length=M * N, @@ -441,9 +441,9 @@ def ipu(): ddr_id=ddr_id, ) ) - ipu_insts.extend(aiex.ipu.shimtile_push_queue(S2MM, channel_index, col, bd_id)) - ipu_insts.extend( - aiex.ipu.sync( + npu_insts.extend(aiex.npu.shimtile_push_queue(S2MM, channel_index, col, bd_id)) + npu_insts.extend( + aiex.npu.sync( channel=0, column=0, column_num=1, @@ -579,9 +579,9 @@ def super_vectorize(target: any_op_t()): ) compile_with_vectorization(mod_aie, mod_aievec, workdir) xclbin_path = make_xclbin(mod_aie, workdir) - with FileLock("/tmp/ipu.lock"): + with FileLock("/tmp/npu.lock"): xclbin = XCLBin(xclbin_path, "MLIR_AIE") - xclbin.load_ipu_instructions(ipu_insts) + xclbin.load_npu_instructions(npu_insts) views = xclbin.mmap_buffers([(M, K), (K, N), (M, N)], np.int32) wrap_A = np.asarray(views[0]) diff --git 
a/test/ipu-xrt/e2e/test_offsets_sizes_strides.py b/test/npu-xrt/e2e/test_offsets_sizes_strides.py similarity index 92% rename from test/ipu-xrt/e2e/test_offsets_sizes_strides.py rename to test/npu-xrt/e2e/test_offsets_sizes_strides.py index 1262d59bf3..ecae0293ba 100644 --- a/test/ipu-xrt/e2e/test_offsets_sizes_strides.py +++ b/test/npu-xrt/e2e/test_offsets_sizes_strides.py @@ -54,10 +54,10 @@ def test_offsets_sizes_strides(ctx: MLIRContext, workdir: Path): tile_m_B, tile_n_B = M // tile_rows_B, N // tile_cols_B tile_m_C, tile_n_C = M // tile_rows_C, N // tile_cols_C - ipu_insts = aiex.ipu.get_prolog() + npu_insts = aiex.npu.get_prolog() - @aie.device(AIEDevice.ipu) - def ipu(): + @aie.device(AIEDevice.npu) + def npu(): tile_0_0 = aie.tile(0, 0) tile_0_1 = aie.tile(0, 1) tile_0_2 = aie.tile(0, 2) @@ -102,8 +102,8 @@ def ipu(): channel_index = 0 ddr_id = 0 for i, bd_id in enumerate(range(4)): - ipu_insts.extend( - aiex.ipu.writebd_shimtile( + npu_insts.extend( + aiex.npu.writebd_shimtile( col, bd_id, 64, @@ -115,16 +115,16 @@ def ipu(): d0_stride=1, ) ) - ipu_insts.extend( - aiex.ipu.shimtile_push_queue(MM2S, channel_index, col, bd_id) + npu_insts.extend( + aiex.npu.shimtile_push_queue(MM2S, channel_index, col, bd_id) ) # in B channel_index = 1 ddr_id = 1 for i, bd_id in enumerate(range(bd_id + 1, bd_id + 1 + 4)): - ipu_insts.extend( - aiex.ipu.writebd_shimtile( + npu_insts.extend( + aiex.npu.writebd_shimtile( col, bd_id, 64, @@ -136,16 +136,16 @@ def ipu(): d0_stride=1, ) ) - ipu_insts.extend( - aiex.ipu.shimtile_push_queue(MM2S, channel_index, col, bd_id) + npu_insts.extend( + aiex.npu.shimtile_push_queue(MM2S, channel_index, col, bd_id) ) # out C channel_index = 0 ddr_id = 2 for i, bd_id in enumerate(range(bd_id + 1, bd_id + 1 + 4)): - ipu_insts.extend( - aiex.ipu.writebd_shimtile( + npu_insts.extend( + aiex.npu.writebd_shimtile( col, bd_id, 64, @@ -157,11 +157,11 @@ def ipu(): d0_stride=1, ) ) - ipu_insts.extend( - aiex.ipu.shimtile_push_queue(S2MM, 
channel_index, col, bd_id) + npu_insts.extend( + aiex.npu.shimtile_push_queue(S2MM, channel_index, col, bd_id) ) - ipu_insts.extend( - aiex.ipu.sync( + npu_insts.extend( + aiex.npu.sync( channel=0, column=0, column_num=1, direction=0, row=0, row_num=1 ) ) @@ -257,9 +257,9 @@ def core(): compile_without_vectorization(ctx.module, workdir) xclbin_path = make_xclbin(ctx.module, workdir) - with FileLock("/tmp/ipu.lock"): + with FileLock("/tmp/npu.lock"): xclbin = XCLBin(xclbin_path, "MLIR_AIE") - xclbin.load_ipu_instructions(ipu_insts) + xclbin.load_npu_instructions(npu_insts) views = xclbin.mmap_buffers([(M, N), (M, N), (M, N)], np.int32) wrap_A = np.asarray(views[0]) diff --git a/test/ipu-xrt/e2e/test_repeat_count.py b/test/npu-xrt/e2e/test_repeat_count.py similarity index 90% rename from test/ipu-xrt/e2e/test_repeat_count.py rename to test/npu-xrt/e2e/test_repeat_count.py index c769770283..e350931920 100644 --- a/test/ipu-xrt/e2e/test_repeat_count.py +++ b/test/npu-xrt/e2e/test_repeat_count.py @@ -53,10 +53,10 @@ def test_repeat_count(ctx: MLIRContext, workdir: Path): iters = 4 loop = False RANDOM_WEIGHT = np.random.randint(0, 10, (K,), dtype=np.int32) - ipu_insts = aiex.ipu.get_prolog() + npu_insts = aiex.npu.get_prolog() - @aie.device(AIEDevice.ipu) - def ipu(): + @aie.device(AIEDevice.npu) + def npu(): tile_0_0 = aie.tile(0, 0) tile_0_1 = aie.tile(0, 1) tile_0_2 = aie.tile(0, 2) @@ -109,8 +109,8 @@ def dma6(): ddr_id = 0 col = 0 for i, bd_id in enumerate(range(iters)): - ipu_insts.extend( - aiex.ipu.writebd_shimtile( + npu_insts.extend( + aiex.npu.writebd_shimtile( col, bd_id, buffer_length=K, @@ -118,11 +118,11 @@ def dma6(): ddr_id=ddr_id, ) ) - ipu_insts.extend( - aiex.ipu.shimtile_push_queue(S2MM, channel_index, col, bd_id) + npu_insts.extend( + aiex.npu.shimtile_push_queue(S2MM, channel_index, col, bd_id) ) - ipu_insts.extend( - aiex.ipu.sync( + npu_insts.extend( + aiex.npu.sync( channel=0, column=col, column_num=1, @@ -136,9 +136,9 @@ def dma6(): 
compile_without_vectorization(ctx.module, workdir) xclbin_path = make_xclbin(ctx.module, workdir) - with FileLock("/tmp/ipu.lock"): + with FileLock("/tmp/npu.lock"): xclbin = XCLBin(xclbin_path, "MLIR_AIE") - xclbin.load_ipu_instructions(ipu_insts) + xclbin.load_npu_instructions(npu_insts) views = xclbin.mmap_buffers([(iters * K,)], np.int32) wrap_C = np.asarray(views[0]) @@ -165,10 +165,10 @@ def test_no_loop(ctx: MLIRContext, workdir: Path): RANDOM_WEIGHT = np.ones((K,), dtype=np.int32) * random.randint(1, 100) col = 2 iters = 10 - ipu_insts = aiex.ipu.get_prolog() + npu_insts = aiex.npu.get_prolog() - @aie.device(AIEDevice.ipu) - def ipu(): + @aie.device(AIEDevice.npu) + def npu(): nonlocal col if col != 0: @@ -206,28 +206,28 @@ def dma3(): compile_without_vectorization(ctx.module, workdir) xclbin_path = make_xclbin(ctx.module, workdir) - with FileLock("/tmp/ipu.lock"): + with FileLock("/tmp/npu.lock"): xclbin = XCLBin(xclbin_path, "MLIR_AIE") views = xclbin.mmap_buffers([(K,)], np.int32) channel_index = 0 ddr_id = 0 bd_id = 0 - ipu_insts.extend( - aiex.ipu.writebd_shimtile( + npu_insts.extend( + aiex.npu.writebd_shimtile( col, bd_id, buffer_length=K, ddr_id=ddr_id, ) ) - ipu_insts.extend( - aiex.ipu.shimtile_push_queue( + npu_insts.extend( + aiex.npu.shimtile_push_queue( S2MM, channel_index, col, bd_id, repeats=iters - 1 ) ) - ipu_insts.extend( - aiex.ipu.sync( + npu_insts.extend( + aiex.npu.sync( channel=0, column=col, column_num=1, @@ -237,7 +237,7 @@ def dma3(): ) ) - xclbin.load_ipu_instructions(ipu_insts) + xclbin.load_npu_instructions(npu_insts) wraps = list(map(np.asarray, views)) diff --git a/test/ipu-xrt/e2e/test_shared_buffers_init_value.py b/test/npu-xrt/e2e/test_shared_buffers_init_value.py similarity index 95% rename from test/ipu-xrt/e2e/test_shared_buffers_init_value.py rename to test/npu-xrt/e2e/test_shared_buffers_init_value.py index 461031ebb3..e8fb6b4ebe 100644 --- a/test/ipu-xrt/e2e/test_shared_buffers_init_value.py +++ 
b/test/npu-xrt/e2e/test_shared_buffers_init_value.py @@ -38,10 +38,10 @@ def test_foursome(ctx: MLIRContext, workdir: Path): init_weights = [np.random.randint(0, 10, (K,), dtype=np.int32) for _ in range(7)] random_numbers = [random.randint(0, 10) for _ in range(7, 7 + 3)] - ipu_insts = aiex.ipu.get_prolog() + npu_insts = aiex.npu.get_prolog() - @aie.device(AIEDevice.ipu) - def ipu(): + @aie.device(AIEDevice.npu) + def npu(): _dummy_tile = aie.tile(0, 2) # west @@ -170,8 +170,8 @@ def memtile_dma(): ddr_id = 0 bd_id = 0 - ipu_insts.extend( - aiex.ipu.writebd_shimtile( + npu_insts.extend( + aiex.npu.writebd_shimtile( column=shim_tile_column, bd_id=bd_id, buffer_length=K, @@ -179,16 +179,16 @@ def memtile_dma(): ddr_id=ddr_id, ) ) - ipu_insts.extend( - aiex.ipu.shimtile_push_queue( + npu_insts.extend( + aiex.npu.shimtile_push_queue( channel_dir=S2MM, channel_index=flow_to_shim.dest_channel, column=shim_tile_column, bd_id=bd_id, ) ) - ipu_insts.extend( - aiex.ipu.sync( + npu_insts.extend( + aiex.npu.sync( channel=flow_to_shim.dest_channel, column=shim_tile_column, direction=S2MM, @@ -198,9 +198,9 @@ def memtile_dma(): compile_without_vectorization(ctx.module, workdir) xclbin_path = make_xclbin(ctx.module, workdir) - with FileLock("/tmp/ipu.lock"): + with FileLock("/tmp/npu.lock"): xclbin = XCLBin(xclbin_path, "MLIR_AIE") - xclbin.load_ipu_instructions(ipu_insts) + xclbin.load_npu_instructions(npu_insts) [c] = xclbin.mmap_buffers([(K,)], np.int32) wrap_C = np.asarray(c) C = np.zeros((K,), dtype=np.int32) diff --git a/test/ipu-xrt/e2e/test_square_matrix_mult.py b/test/npu-xrt/e2e/test_square_matrix_mult.py similarity index 91% rename from test/ipu-xrt/e2e/test_square_matrix_mult.py rename to test/npu-xrt/e2e/test_square_matrix_mult.py index 6f746fc490..b229c3a1a8 100644 --- a/test/ipu-xrt/e2e/test_square_matrix_mult.py +++ b/test/npu-xrt/e2e/test_square_matrix_mult.py @@ -45,10 +45,10 @@ def test_square_matrix_mult(ctx: MLIRContext, workdir: Path): M = N = 16 - ipu_insts 
= aiex.ipu.get_prolog() + npu_insts = aiex.npu.get_prolog() - @aie.device(AIEDevice.ipu) - def ipu(): + @aie.device(AIEDevice.npu) + def npu(): tile_0_0 = aie.tile(0, 0) tile_0_1 = aie.tile(0, 1) tile_0_2 = aie.tile(0, 2) @@ -91,8 +91,8 @@ def ipu(): channel_index = 0 ddr_id = 0 bd_id = 0 - ipu_insts.extend( - aiex.ipu.writebd_shimtile( + npu_insts.extend( + aiex.npu.writebd_shimtile( col, bd_id, buffer_length=M * N, @@ -100,14 +100,14 @@ def ipu(): ddr_id=ddr_id, ) ) - ipu_insts.extend(aiex.ipu.shimtile_push_queue(MM2S, channel_index, col, bd_id)) + npu_insts.extend(aiex.npu.shimtile_push_queue(MM2S, channel_index, col, bd_id)) # in B channel_index = 1 ddr_id = 1 bd_id += 1 - ipu_insts.extend( - aiex.ipu.writebd_shimtile( + npu_insts.extend( + aiex.npu.writebd_shimtile( col, bd_id, buffer_length=M * N, @@ -115,14 +115,14 @@ def ipu(): ddr_id=ddr_id, ) ) - ipu_insts.extend(aiex.ipu.shimtile_push_queue(MM2S, channel_index, col, bd_id)) + npu_insts.extend(aiex.npu.shimtile_push_queue(MM2S, channel_index, col, bd_id)) # out C channel_index = 0 ddr_id = 2 bd_id += 1 - ipu_insts.extend( - aiex.ipu.writebd_shimtile( + npu_insts.extend( + aiex.npu.writebd_shimtile( col, bd_id, buffer_length=M * N, @@ -130,9 +130,9 @@ def ipu(): ddr_id=ddr_id, ) ) - ipu_insts.extend(aiex.ipu.shimtile_push_queue(S2MM, channel_index, col, bd_id)) - ipu_insts.extend( - aiex.ipu.sync( + npu_insts.extend(aiex.npu.shimtile_push_queue(S2MM, channel_index, col, bd_id)) + npu_insts.extend( + aiex.npu.sync( channel=0, column=0, column_num=1, @@ -229,9 +229,9 @@ def core(): compile_without_vectorization(ctx.module, workdir) xclbin_path = make_xclbin(ctx.module, workdir) - with FileLock("/tmp/ipu.lock"): + with FileLock("/tmp/npu.lock"): xclbin = XCLBin(xclbin_path, "MLIR_AIE") - xclbin.load_ipu_instructions(ipu_insts) + xclbin.load_npu_instructions(npu_insts) views = xclbin.mmap_buffers([(M, N), (M, N), (M, N)], np.int32) wrap_A = np.asarray(views[0]) @@ -262,10 +262,10 @@ def core(): def 
test_square_matrix_mult_sugar(ctx: MLIRContext, workdir: Path): M = N = 16 - ipu_insts = aiex.ipu.get_prolog() + npu_insts = aiex.npu.get_prolog() - @aie.device(AIEDevice.ipu) - def ipu(): + @aie.device(AIEDevice.npu) + def npu(): tile_0_0 = aie.tile(0, 0) tile_0_1 = aie.tile(0, 1) tile_0_2 = aie.tile(0, 2) @@ -299,8 +299,8 @@ def ipu(): channel_index = 0 ddr_id = 0 bd_id = 0 - ipu_insts.extend( - aiex.ipu.writebd_shimtile( + npu_insts.extend( + aiex.npu.writebd_shimtile( col, bd_id, buffer_length=M * N, @@ -308,14 +308,14 @@ def ipu(): ddr_id=ddr_id, ) ) - ipu_insts.extend(aiex.ipu.shimtile_push_queue(MM2S, channel_index, col, bd_id)) + npu_insts.extend(aiex.npu.shimtile_push_queue(MM2S, channel_index, col, bd_id)) # in B channel_index = 1 ddr_id = 1 bd_id += 1 - ipu_insts.extend( - aiex.ipu.writebd_shimtile( + npu_insts.extend( + aiex.npu.writebd_shimtile( col, bd_id, buffer_length=M * N, @@ -323,14 +323,14 @@ def ipu(): ddr_id=ddr_id, ) ) - ipu_insts.extend(aiex.ipu.shimtile_push_queue(MM2S, channel_index, col, bd_id)) + npu_insts.extend(aiex.npu.shimtile_push_queue(MM2S, channel_index, col, bd_id)) # out C channel_index = 0 ddr_id = 2 bd_id += 1 - ipu_insts.extend( - aiex.ipu.writebd_shimtile( + npu_insts.extend( + aiex.npu.writebd_shimtile( col, bd_id, buffer_length=M * N, @@ -338,9 +338,9 @@ def ipu(): ddr_id=ddr_id, ) ) - ipu_insts.extend(aiex.ipu.shimtile_push_queue(S2MM, channel_index, col, bd_id)) - ipu_insts.extend( - aiex.ipu.sync( + npu_insts.extend(aiex.npu.shimtile_push_queue(S2MM, channel_index, col, bd_id)) + npu_insts.extend( + aiex.npu.sync( channel=0, column=0, column_num=1, @@ -397,9 +397,9 @@ def core(): compile_without_vectorization(ctx.module, workdir) xclbin_path = make_xclbin(ctx.module, workdir) - with FileLock("/tmp/ipu.lock"): + with FileLock("/tmp/npu.lock"): xclbin = XCLBin(xclbin_path, "MLIR_AIE") - xclbin.load_ipu_instructions(ipu_insts) + xclbin.load_npu_instructions(npu_insts) views = xclbin.mmap_buffers([(M, N), (M, N), (M, N)], 
np.int32) wrap_A = np.asarray(views[0]) diff --git a/test/ipu-xrt/e2e/test_square_matrix_mult_vectorized.py b/test/npu-xrt/e2e/test_square_matrix_mult_vectorized.py similarity index 94% rename from test/ipu-xrt/e2e/test_square_matrix_mult_vectorized.py rename to test/npu-xrt/e2e/test_square_matrix_mult_vectorized.py index b11e4463f8..55a8feeb3c 100644 --- a/test/ipu-xrt/e2e/test_square_matrix_mult_vectorized.py +++ b/test/npu-xrt/e2e/test_square_matrix_mult_vectorized.py @@ -64,12 +64,12 @@ def matmul_i32_i32( def test_square_matrix_mult_vectorized(ctx: MLIRContext, workdir: Path): - ipu_insts = aiex.ipu.get_prolog() + npu_insts = aiex.npu.get_prolog() mod_aie = ExplicitlyManagedModule() - @aie.device(AIEDevice.ipu) - def ipu(): + @aie.device(AIEDevice.npu) + def npu(): matmul_i32_i32.emit(decl=True) tile_0_0 = aie.tile(0, 0) tile_0_1 = aie.tile(0, 1) @@ -113,8 +113,8 @@ def ipu(): channel_index = 0 ddr_id = 0 bd_id = 0 - ipu_insts.extend( - aiex.ipu.writebd_shimtile( + npu_insts.extend( + aiex.npu.writebd_shimtile( col, bd_id, buffer_length=M * N, @@ -122,14 +122,14 @@ def ipu(): ddr_id=ddr_id, ) ) - ipu_insts.extend(aiex.ipu.shimtile_push_queue(MM2S, channel_index, col, bd_id)) + npu_insts.extend(aiex.npu.shimtile_push_queue(MM2S, channel_index, col, bd_id)) # in B channel_index = 1 ddr_id = 1 bd_id += 1 - ipu_insts.extend( - aiex.ipu.writebd_shimtile( + npu_insts.extend( + aiex.npu.writebd_shimtile( col, bd_id, buffer_length=M * N, @@ -137,14 +137,14 @@ def ipu(): ddr_id=ddr_id, ) ) - ipu_insts.extend(aiex.ipu.shimtile_push_queue(MM2S, channel_index, col, bd_id)) + npu_insts.extend(aiex.npu.shimtile_push_queue(MM2S, channel_index, col, bd_id)) # out C channel_index = 0 ddr_id = 2 bd_id += 1 - ipu_insts.extend( - aiex.ipu.writebd_shimtile( + npu_insts.extend( + aiex.npu.writebd_shimtile( col, bd_id, buffer_length=M * N, @@ -152,9 +152,9 @@ def ipu(): ddr_id=ddr_id, ) ) - ipu_insts.extend(aiex.ipu.shimtile_push_queue(S2MM, channel_index, col, bd_id)) - 
ipu_insts.extend( - aiex.ipu.sync( + npu_insts.extend(aiex.npu.shimtile_push_queue(S2MM, channel_index, col, bd_id)) + npu_insts.extend( + aiex.npu.sync( channel=0, column=0, column_num=1, @@ -326,9 +326,9 @@ def super_vectorize(target: any_op_t()): compile_with_vectorization(mod_aie, mod_aievec, workdir) xclbin_path = make_xclbin(mod_aie, workdir) - with FileLock("/tmp/ipu.lock"): + with FileLock("/tmp/npu.lock"): xclbin = XCLBin(xclbin_path, "MLIR_AIE") - xclbin.load_ipu_instructions(ipu_insts) + xclbin.load_npu_instructions(npu_insts) views = xclbin.mmap_buffers([(M, N), (M, N), (M, N)], np.int32) wrap_A = np.asarray(views[0]) @@ -358,11 +358,11 @@ def super_vectorize(target: any_op_t()): def test_square_matrix_mult_vectorized_sugar(ctx: MLIRContext, workdir: Path): - ipu_insts = aiex.ipu.get_prolog() + npu_insts = aiex.npu.get_prolog() mod_aie = ExplicitlyManagedModule() - @aie.device(AIEDevice.ipu) - def ipu(): + @aie.device(AIEDevice.npu) + def npu(): matmul_i32_i32.emit(decl=True) tile_0_0 = aie.tile(0, 0) tile_0_1 = aie.tile(0, 1) @@ -401,8 +401,8 @@ def ipu(): channel_index = 0 ddr_id = 0 bd_id = 0 - ipu_insts.extend( - aiex.ipu.writebd_shimtile( + npu_insts.extend( + aiex.npu.writebd_shimtile( col, bd_id, buffer_length=M * N, @@ -410,14 +410,14 @@ def ipu(): ddr_id=ddr_id, ) ) - ipu_insts.extend(aiex.ipu.shimtile_push_queue(MM2S, channel_index, col, bd_id)) + npu_insts.extend(aiex.npu.shimtile_push_queue(MM2S, channel_index, col, bd_id)) # in B channel_index = 1 ddr_id = 1 bd_id += 1 - ipu_insts.extend( - aiex.ipu.writebd_shimtile( + npu_insts.extend( + aiex.npu.writebd_shimtile( col, bd_id, buffer_length=M * N, @@ -425,14 +425,14 @@ def ipu(): ddr_id=ddr_id, ) ) - ipu_insts.extend(aiex.ipu.shimtile_push_queue(MM2S, channel_index, col, bd_id)) + npu_insts.extend(aiex.npu.shimtile_push_queue(MM2S, channel_index, col, bd_id)) # out C channel_index = 0 ddr_id = 2 bd_id += 1 - ipu_insts.extend( - aiex.ipu.writebd_shimtile( + npu_insts.extend( + 
aiex.npu.writebd_shimtile( col, bd_id, buffer_length=M * N, @@ -440,9 +440,9 @@ def ipu(): ddr_id=ddr_id, ) ) - ipu_insts.extend(aiex.ipu.shimtile_push_queue(S2MM, channel_index, col, bd_id)) - ipu_insts.extend( - aiex.ipu.sync( + npu_insts.extend(aiex.npu.shimtile_push_queue(S2MM, channel_index, col, bd_id)) + npu_insts.extend( + aiex.npu.sync( channel=0, column=0, column_num=1, @@ -581,9 +581,9 @@ def super_vectorize(target: any_op_t()): compile_with_vectorization(mod_aie, mod_aievec, workdir) xclbin_path = make_xclbin(mod_aie, workdir) - with FileLock("/tmp/ipu.lock"): + with FileLock("/tmp/npu.lock"): xclbin = XCLBin(xclbin_path, "MLIR_AIE") - xclbin.load_ipu_instructions(ipu_insts) + xclbin.load_npu_instructions(npu_insts) views = xclbin.mmap_buffers([(M, N), (M, N), (M, N)], np.int32) wrap_A = np.asarray(views[0]) diff --git a/test/ipu-xrt/e2e/test_tiled_matrix_add.py b/test/npu-xrt/e2e/test_tiled_matrix_add.py similarity index 92% rename from test/ipu-xrt/e2e/test_tiled_matrix_add.py rename to test/npu-xrt/e2e/test_tiled_matrix_add.py index 00755104ed..21d90f3231 100644 --- a/test/ipu-xrt/e2e/test_tiled_matrix_add.py +++ b/test/npu-xrt/e2e/test_tiled_matrix_add.py @@ -47,10 +47,10 @@ def test_tiled_matrix_add(ctx: MLIRContext, workdir: Path): _, _, (d1_size, d1_stride), (d0_size, d0_stride) = tiling_calculator_n_tiles( M, N, n_tile_rows=n_tile_rows, n_tile_cols=n_tile_cols ) - ipu_insts = aiex.ipu.get_prolog() + npu_insts = aiex.npu.get_prolog() - @aie.device(AIEDevice.ipu) - def ipu(): + @aie.device(AIEDevice.npu) + def npu(): tile_0_0 = aie.tile(0, 0) tile_0_1 = aie.tile(0, 1) tile_0_2 = aie.tile(0, 2) @@ -100,8 +100,8 @@ def ipu(): channel_index = 0 ddr_id = 0 for i, bd_id in enumerate(range(4)): - ipu_insts.extend( - aiex.ipu.writebd_shimtile( + npu_insts.extend( + aiex.npu.writebd_shimtile( col, bd_id, tile_rows * tile_cols, @@ -113,16 +113,16 @@ def ipu(): d0_stride=d0_stride, ) ) - ipu_insts.extend( - aiex.ipu.shimtile_push_queue(MM2S, channel_index, 
col, bd_id) + npu_insts.extend( + aiex.npu.shimtile_push_queue(MM2S, channel_index, col, bd_id) ) # in B channel_index = 1 ddr_id = 1 for i, bd_id in enumerate(range(bd_id + 1, bd_id + 1 + 4)): - ipu_insts.extend( - aiex.ipu.writebd_shimtile( + npu_insts.extend( + aiex.npu.writebd_shimtile( col, bd_id, tile_rows * tile_cols, @@ -134,16 +134,16 @@ def ipu(): d0_stride=d0_stride, ) ) - ipu_insts.extend( - aiex.ipu.shimtile_push_queue(MM2S, channel_index, col, bd_id) + npu_insts.extend( + aiex.npu.shimtile_push_queue(MM2S, channel_index, col, bd_id) ) # out C channel_index = 0 ddr_id = 2 for i, bd_id in enumerate(range(bd_id + 1, bd_id + 1 + 4)): - ipu_insts.extend( - aiex.ipu.writebd_shimtile( + npu_insts.extend( + aiex.npu.writebd_shimtile( col, bd_id, tile_rows * tile_cols, @@ -155,11 +155,11 @@ def ipu(): d0_stride=d0_stride, ) ) - ipu_insts.extend( - aiex.ipu.shimtile_push_queue(S2MM, channel_index, col, bd_id) + npu_insts.extend( + aiex.npu.shimtile_push_queue(S2MM, channel_index, col, bd_id) ) - ipu_insts.extend( - aiex.ipu.sync( + npu_insts.extend( + aiex.npu.sync( channel=0, column=0, column_num=1, direction=0, row=0, row_num=1 ) ) @@ -258,9 +258,9 @@ def core(): compile_without_vectorization(ctx.module, workdir) xclbin_path = make_xclbin(ctx.module, workdir) - with FileLock("/tmp/ipu.lock"): + with FileLock("/tmp/npu.lock"): xclbin = XCLBin(xclbin_path, "MLIR_AIE") - xclbin.load_ipu_instructions(ipu_insts) + xclbin.load_npu_instructions(npu_insts) views = xclbin.mmap_buffers([(M, N), (M, N), (M, N)], np.int32) wrap_A = np.asarray(views[0]) @@ -291,10 +291,10 @@ def test_matrix_add_sugar(ctx: MLIRContext, workdir: Path): _, _, (d1_size, d1_stride), (d0_size, d0_stride) = tiling_calculator_n_tiles( M, N, n_tile_rows=n_tile_rows, n_tile_cols=n_tile_cols ) - ipu_insts = aiex.ipu.get_prolog() + npu_insts = aiex.npu.get_prolog() - @aie.device(AIEDevice.ipu) - def ipu(): + @aie.device(AIEDevice.npu) + def npu(): shim_tile_0_0 = aie.tile(0, 0) mem_tile_0_1 = 
aie.tile(0, 1) compute_tile_0_2 = aie.tile(0, 2) @@ -359,8 +359,8 @@ def ipu(): # in A ddr_id = 0 for i, bd_id in enumerate(range(4)): - ipu_insts.extend( - aiex.ipu.writebd_shimtile( + npu_insts.extend( + aiex.npu.writebd_shimtile( col, bd_id, tile_rows * tile_cols, @@ -372,8 +372,8 @@ def ipu(): d0_stride=d0_stride, ) ) - ipu_insts.extend( - aiex.ipu.shimtile_push_queue( + npu_insts.extend( + aiex.npu.shimtile_push_queue( MM2S, input_a_tile_0_0_to_tile_0_1.source_channel, col, bd_id ) ) @@ -381,8 +381,8 @@ def ipu(): # in B ddr_id = 1 for i, bd_id in enumerate(range(bd_id + 1, bd_id + 1 + 4)): - ipu_insts.extend( - aiex.ipu.writebd_shimtile( + npu_insts.extend( + aiex.npu.writebd_shimtile( col, bd_id, tile_rows * tile_cols, @@ -394,8 +394,8 @@ def ipu(): d0_stride=d0_stride, ) ) - ipu_insts.extend( - aiex.ipu.shimtile_push_queue( + npu_insts.extend( + aiex.npu.shimtile_push_queue( MM2S, input_b_tile_0_0_to_tile_0_1.source_channel, col, bd_id ) ) @@ -403,8 +403,8 @@ def ipu(): # out C ddr_id = 2 for i, bd_id in enumerate(range(bd_id + 1, bd_id + 1 + 4)): - ipu_insts.extend( - aiex.ipu.writebd_shimtile( + npu_insts.extend( + aiex.npu.writebd_shimtile( col, bd_id, tile_rows * tile_cols, @@ -416,13 +416,13 @@ def ipu(): d0_stride=d0_stride, ) ) - ipu_insts.extend( - aiex.ipu.shimtile_push_queue( + npu_insts.extend( + aiex.npu.shimtile_push_queue( S2MM, output_c_tile_0_1_to_tile_0_0.dest_channel, col, bd_id ) ) - ipu_insts.extend( - aiex.ipu.sync( + npu_insts.extend( + aiex.npu.sync( channel=0, column=0, column_num=1, direction=0, row=0, row_num=1 ) ) @@ -496,9 +496,9 @@ def core(): compile_without_vectorization(ctx.module, workdir) xclbin_path = make_xclbin(ctx.module, workdir) - with FileLock("/tmp/ipu.lock"): + with FileLock("/tmp/npu.lock"): xclbin = XCLBin(xclbin_path, "MLIR_AIE") - xclbin.load_ipu_instructions(ipu_insts) + xclbin.load_npu_instructions(npu_insts) views = xclbin.mmap_buffers([(M, N), (M, N), (M, N)], np.int32) wrap_A = np.asarray(views[0]) diff 
--git a/test/ipu-xrt/e2e/test_tiled_nonsquare_spatial_tile_matrix_mult.py b/test/npu-xrt/e2e/test_tiled_nonsquare_spatial_tile_matrix_mult.py similarity index 93% rename from test/ipu-xrt/e2e/test_tiled_nonsquare_spatial_tile_matrix_mult.py rename to test/npu-xrt/e2e/test_tiled_nonsquare_spatial_tile_matrix_mult.py index 1b19015d33..4e5e41615b 100644 --- a/test/ipu-xrt/e2e/test_tiled_nonsquare_spatial_tile_matrix_mult.py +++ b/test/npu-xrt/e2e/test_tiled_nonsquare_spatial_tile_matrix_mult.py @@ -70,7 +70,7 @@ def shim_tensor_slice( M, N, n_tile_rows=n_tile_rows, n_tile_cols=n_tile_cols ) - ipu_insts = aiex.ipu.writebd_shimtile( + npu_insts = aiex.npu.writebd_shimtile( column=column, bd_id=bd_id, ddr_id=ddr_id, @@ -81,23 +81,23 @@ def shim_tensor_slice( d0_size=d0_size, d0_stride=d0_stride, ) - ipu_insts.extend( - aiex.ipu.shimtile_push_queue(channel_dir, channel_index, column, bd_id=bd_id) + npu_insts.extend( + aiex.npu.shimtile_push_queue(channel_dir, channel_index, column, bd_id=bd_id) ) - return ipu_insts + return npu_insts def shim_bd(direction, channel, buffer_length, column=0, bd_id=0, ddr_id=0): - ipu_insts = [] - ipu_insts.extend( - aiex.ipu.writebd_shimtile( + npu_insts = [] + npu_insts.extend( + aiex.npu.writebd_shimtile( column=column, bd_id=bd_id, ddr_id=ddr_id, buffer_length=buffer_length ) ) - ipu_insts.extend( - aiex.ipu.shimtile_push_queue(direction, channel, column, bd_id=bd_id) + npu_insts.extend( + aiex.npu.shimtile_push_queue(direction, channel, column, bd_id=bd_id) ) - return ipu_insts + return npu_insts def test_tiled_nonsquare_tile_spatial_2x2(ctx: MLIRContext, workdir: Path): @@ -136,10 +136,10 @@ def test_tiled_nonsquare_tile_spatial_2x2(ctx: MLIRContext, workdir: Path): M, N, n_tile_rows=tile_rows_C, n_tile_cols=tile_cols_C ) - ipu_insts = aiex.ipu.get_prolog() + npu_insts = aiex.npu.get_prolog() - @aie.device(AIEDevice.ipu) - def ipu(): + @aie.device(AIEDevice.npu) + def npu(): # col a0 (top row of matrix products) tiles = np.empty((5, 
6), dtype=object) for col in [0, 1]: @@ -167,17 +167,17 @@ def ipu(): # fmt: off column = 0 # broadcast a0 - ipu_insts.extend(shim_tensor_slice(M, N, tile_rows_A, tile_cols_A, 0, column, MM2S, broadcast_a0_flow_ep.source_channel, 0, 0)) + npu_insts.extend(shim_tensor_slice(M, N, tile_rows_A, tile_cols_A, 0, column, MM2S, broadcast_a0_flow_ep.source_channel, 0, 0)) # broadcast b0 - ipu_insts.extend(shim_tensor_slice(M, N, tile_rows_B, tile_cols_B, 0, column, MM2S, broadcast_b0_flow_ep.source_channel, 1, 1)) + npu_insts.extend(shim_tensor_slice(M, N, tile_rows_B, tile_cols_B, 0, column, MM2S, broadcast_b0_flow_ep.source_channel, 1, 1)) column = 1 # broadcast a1 - ipu_insts.extend( + npu_insts.extend( shim_tensor_slice(M, N, tile_rows_A, tile_cols_A, d1_size_A * d1_stride_A, column, MM2S, broadcast_a1_flow_ep.source_channel, 0, 0) ) # broadcast b1 - ipu_insts.extend( + npu_insts.extend( shim_tensor_slice(M, N, tile_rows_B, tile_cols_B, d0_size_B * d0_stride_B, column, MM2S, broadcast_b1_flow_ep.source_channel, 1, 1) ) # fmt: on @@ -339,15 +339,15 @@ def memtile_dma_c_1(): # fmt: off for i, (column, channel, bd_id) in enumerate(channels): - ipu_insts.extend(shim_tensor_slice(M, N, tile_rows_C, tile_cols_C, offsets[i], column, S2MM, channel, bd_id, 2)) - ipu_insts.extend(aiex.ipu.sync(channel=channel, column=column)) + npu_insts.extend(shim_tensor_slice(M, N, tile_rows_C, tile_cols_C, offsets[i], column, S2MM, channel, bd_id, 2)) + npu_insts.extend(aiex.npu.sync(channel=channel, column=column)) # fmt: on compile_without_vectorization(ctx.module, workdir) xclbin_path = make_xclbin(ctx.module, workdir) - with FileLock("/tmp/ipu.lock"): + with FileLock("/tmp/npu.lock"): xclbin = XCLBin(xclbin_path, "MLIR_AIE") - xclbin.load_ipu_instructions(ipu_insts) + xclbin.load_npu_instructions(npu_insts) views = xclbin.mmap_buffers([(M, N), (M, N), (M, N)], np.int32) wrap_A = np.asarray(views[0]) @@ -453,7 +453,7 @@ def test_tiled_nonsquare_tile_spatial_2x2_vectorized(ctx: 
MLIRContext, workdir: M, N, n_tile_rows=tile_rows_C, n_tile_cols=tile_cols_C ) - ipu_insts = aiex.ipu.get_prolog() + npu_insts = aiex.npu.get_prolog() mod_aievec = ExplicitlyManagedModule() kernel = matmul_i32_i32_already_vectorized.emit(force=True) @@ -462,8 +462,8 @@ def test_tiled_nonsquare_tile_spatial_2x2_vectorized(ctx: MLIRContext, workdir: mod_aie = ExplicitlyManagedModule() - @aie.device(AIEDevice.ipu) - def ipu(): + @aie.device(AIEDevice.npu) + def npu(): matmul_i32_i32_already_vectorized.emit(decl=True) # col a0 (top row of matrix products) tiles = np.empty((5, 6), dtype=object) @@ -492,17 +492,17 @@ def ipu(): # fmt: off column = 0 # broadcast a0 - ipu_insts.extend(shim_tensor_slice(M, N, tile_rows_A, tile_cols_A, 0, column, MM2S, broadcast_a0_flow_ep.source_channel, 0, 0)) + npu_insts.extend(shim_tensor_slice(M, N, tile_rows_A, tile_cols_A, 0, column, MM2S, broadcast_a0_flow_ep.source_channel, 0, 0)) # broadcast b0 - ipu_insts.extend(shim_tensor_slice(M, N, tile_rows_B, tile_cols_B, 0, column, MM2S, broadcast_b0_flow_ep.source_channel, 1, 1)) + npu_insts.extend(shim_tensor_slice(M, N, tile_rows_B, tile_cols_B, 0, column, MM2S, broadcast_b0_flow_ep.source_channel, 1, 1)) column = 1 # broadcast a1 - ipu_insts.extend( + npu_insts.extend( shim_tensor_slice(M, N, tile_rows_A, tile_cols_A, d1_size_A * d1_stride_A, column, MM2S, broadcast_a1_flow_ep.source_channel, 0, 0) ) # broadcast b1 - ipu_insts.extend( + npu_insts.extend( shim_tensor_slice(M, N, tile_rows_B, tile_cols_B, d0_size_B * d0_stride_B, column, MM2S, broadcast_b1_flow_ep.source_channel, 1, 1) ) # fmt: on @@ -664,8 +664,8 @@ def memtile_dma_c_1(): # fmt: off for i, (column, channel, bd_id) in enumerate(channels): - ipu_insts.extend(shim_tensor_slice(M, N, tile_rows_C, tile_cols_C, offsets[i], column, S2MM, channel, bd_id, 2)) - ipu_insts.extend(aiex.ipu.sync(channel=channel, column=column)) + npu_insts.extend(shim_tensor_slice(M, N, tile_rows_C, tile_cols_C, offsets[i], column, S2MM, channel, 
bd_id, 2)) + npu_insts.extend(aiex.npu.sync(channel=channel, column=column)) # fmt: on mod_aie = mod_aie.finish() @@ -673,9 +673,9 @@ def memtile_dma_c_1(): compile_with_vectorization(mod_aie, mod_aievec, workdir) xclbin_path = make_xclbin(mod_aie, workdir) - with FileLock("/tmp/ipu.lock"): + with FileLock("/tmp/npu.lock"): xclbin = XCLBin(xclbin_path, "MLIR_AIE") - xclbin.load_ipu_instructions(ipu_insts) + xclbin.load_npu_instructions(npu_insts) views = xclbin.mmap_buffers([(M, N), (M, N), (M, N)], np.int32) wrap_A = np.asarray(views[0]) @@ -712,8 +712,8 @@ def test_tiled_nonsquare_tile_spatial_4x4_weight_stationary_v1( dest_channels = {} - @aie.device(AIEDevice.ipu) - def ipu(): + @aie.device(AIEDevice.npu) + def npu(): tiles = TileArray(cols, rows) for i, ((col, row), t) in enumerate(tiles[:, 2:]): b = aie.buffer( @@ -784,28 +784,28 @@ def memtile_dma(): kernel_json = emit_design_kernel_json(buffer_args=buffer_args) xclbin_path = make_xclbin(ctx.module, workdir, kernel_json=kernel_json) - with FileLock("/tmp/ipu.lock"): + with FileLock("/tmp/npu.lock"): xclbin = XCLBin(xclbin_path, "MLIR_AIE") views = xclbin.mmap_buffers([(K,)] * len(cols), np.int32) - ipu_insts = aiex.ipu.get_prolog() + npu_insts = aiex.npu.get_prolog() bd_id = 0 for col in cols: dest_channel = dest_channels[col] - writebd_shimtile_insts = aiex.ipu.writebd_shimtile( + writebd_shimtile_insts = aiex.npu.writebd_shimtile( col, bd_id, buffer_length=K ) - ipu_insts.extend( - aiex.ipu._exec_write_bd_extend_shim_tile_opt( + npu_insts.extend( + aiex.npu._exec_write_bd_extend_shim_tile_opt( writebd_shimtile_insts, tensor_addr=xclbin._get_buffer_host_address(col), ) ) - ipu_insts.extend( - aiex.ipu.shimtile_push_queue(S2MM, dest_channel, col, bd_id) + npu_insts.extend( + aiex.npu.shimtile_push_queue(S2MM, dest_channel, col, bd_id) ) - ipu_insts.extend(aiex.ipu.sync(column=col)) - xclbin.load_ipu_instructions(ipu_insts) + npu_insts.extend(aiex.npu.sync(column=col)) + 
xclbin.load_npu_instructions(npu_insts) wraps = list(map(np.asarray, views)) @@ -826,8 +826,8 @@ def test_double_pump_single_buffer(ctx: MLIRContext, workdir: Path): source_channels = {} # dest_channels = {} - @aie.device(AIEDevice.ipu) - def ipu(): + @aie.device(AIEDevice.npu) + def npu(): tiles = TileArray(cols=[0], rows=[0, 1, 2]) buffer = tiles[0, 2].buffer([(K,)], [T.i32()], "double_buffer") @@ -969,27 +969,27 @@ def memtile_dma(): kernel_json = emit_design_kernel_json(buffer_args=buffer_args) xclbin_path = make_xclbin(ctx.module, workdir, kernel_json=kernel_json) - with FileLock("/tmp/ipu.lock"): + with FileLock("/tmp/npu.lock"): xclbin = XCLBin(xclbin_path, "MLIR_AIE") views = xclbin.mmap_buffers([(K,)] * 2, np.int32) - ipu_insts = aiex.ipu.get_prolog() + npu_insts = aiex.npu.get_prolog() col = 0 for bd_id, player in enumerate(["player_a", "player_b"]): source_channel = source_channels[player] - writebd_shimtile_insts = aiex.ipu.writebd_shimtile( + writebd_shimtile_insts = aiex.npu.writebd_shimtile( col, bd_id, buffer_length=K ) - ipu_insts.extend( - aiex.ipu._exec_write_bd_extend_shim_tile_opt( + npu_insts.extend( + aiex.npu._exec_write_bd_extend_shim_tile_opt( writebd_shimtile_insts, tensor_addr=xclbin._get_buffer_host_address(col), ) ) - ipu_insts.extend( - aiex.ipu.shimtile_push_queue(MM2S, source_channel, col, bd_id) + npu_insts.extend( + aiex.npu.shimtile_push_queue(MM2S, source_channel, col, bd_id) ) - xclbin.load_ipu_instructions(ipu_insts) + xclbin.load_npu_instructions(npu_insts) wraps = list(map(np.asarray, views)) diff --git a/test/ipu-xrt/e2e/test_tiled_nonsquare_tile_matrix_mult.py b/test/npu-xrt/e2e/test_tiled_nonsquare_tile_matrix_mult.py similarity index 91% rename from test/ipu-xrt/e2e/test_tiled_nonsquare_tile_matrix_mult.py rename to test/npu-xrt/e2e/test_tiled_nonsquare_tile_matrix_mult.py index fdce41d8ae..cc460b6122 100644 --- a/test/ipu-xrt/e2e/test_tiled_nonsquare_tile_matrix_mult.py +++ 
b/test/npu-xrt/e2e/test_tiled_nonsquare_tile_matrix_mult.py @@ -79,10 +79,10 @@ def test_tiled_nonsquare_tile_matrix_mult(ctx: MLIRContext, workdir: Path): M, N, n_tile_rows=tile_rows_C, n_tile_cols=tile_cols_C ) - ipu_insts = aiex.ipu.get_prolog() + npu_insts = aiex.npu.get_prolog() - @aie.device(AIEDevice.ipu) - def ipu(): + @aie.device(AIEDevice.npu) + def npu(): tile_0_0 = aie.tile(0, 0) tile_0_1 = aie.tile(0, 1) tile_0_2 = aie.tile(0, 2) @@ -130,8 +130,8 @@ def ipu(): 0 + d1_size_A * d1_stride_A, ] for i, bd_id in enumerate(range(2)): - ipu_insts.extend( - aiex.ipu.writebd_shimtile( + npu_insts.extend( + aiex.npu.writebd_shimtile( col, bd_id, buffer_length=tile_m_A * tile_n_A, @@ -139,16 +139,16 @@ def ipu(): ddr_id=ddr_id, ) ) - ipu_insts.extend( - aiex.ipu.shimtile_push_queue(MM2S, channel_index, col, bd_id) + npu_insts.extend( + aiex.npu.shimtile_push_queue(MM2S, channel_index, col, bd_id) ) # in B channel_index = 1 ddr_id = 1 for bd_id in range(bd_id + 1, bd_id + 1 + 4, 2): - ipu_insts.extend( - aiex.ipu.writebd_shimtile( + npu_insts.extend( + aiex.npu.writebd_shimtile( col, bd_id, buffer_length=tile_m_B * tile_n_B, @@ -160,13 +160,13 @@ def ipu(): d0_stride=d0_stride_B, ) ) - ipu_insts.extend( - aiex.ipu.shimtile_push_queue(MM2S, channel_index, col, bd_id) + npu_insts.extend( + aiex.npu.shimtile_push_queue(MM2S, channel_index, col, bd_id) ) bd_id += 1 # B tiles are "tall" so need to offset by cols (i.e. 
d0 dim) - ipu_insts.extend( - aiex.ipu.writebd_shimtile( + npu_insts.extend( + aiex.npu.writebd_shimtile( col, bd_id, buffer_length=tile_m_B * tile_n_B, @@ -178,8 +178,8 @@ def ipu(): d0_stride=d0_stride_B, ) ) - ipu_insts.extend( - aiex.ipu.shimtile_push_queue(MM2S, channel_index, col, bd_id) + npu_insts.extend( + aiex.npu.shimtile_push_queue(MM2S, channel_index, col, bd_id) ) # out C @@ -193,8 +193,8 @@ def ipu(): ] for i, bd_id in enumerate(range(bd_id + 1, bd_id + 1 + 4)): - ipu_insts.extend( - aiex.ipu.writebd_shimtile( + npu_insts.extend( + aiex.npu.writebd_shimtile( col, bd_id, buffer_length=tile_m_C * tile_n_C, @@ -206,11 +206,11 @@ def ipu(): d0_stride=d0_stride_C, ) ) - ipu_insts.extend( - aiex.ipu.shimtile_push_queue(S2MM, channel_index, col, bd_id) + npu_insts.extend( + aiex.npu.shimtile_push_queue(S2MM, channel_index, col, bd_id) ) - ipu_insts.extend( - aiex.ipu.sync( + npu_insts.extend( + aiex.npu.sync( channel=0, column=0, column_num=1, direction=0, row=0, row_num=1 ) ) @@ -312,9 +312,9 @@ def core(): compile_without_vectorization(ctx.module, workdir) xclbin_path = make_xclbin(ctx.module, workdir) - with FileLock("/tmp/ipu.lock"): + with FileLock("/tmp/npu.lock"): xclbin = XCLBin(xclbin_path, "MLIR_AIE") - xclbin.load_ipu_instructions(ipu_insts) + xclbin.load_npu_instructions(npu_insts) views = xclbin.mmap_buffers([(M, N), (M, N), (M, N)], np.int32) wrap_A = np.asarray(views[0]) @@ -378,10 +378,10 @@ def test_tiled_nonsquare_tile_matrix_mult_sugar(ctx: MLIRContext, workdir: Path) M, N, n_tile_rows=tile_rows_C, n_tile_cols=tile_cols_C ) - ipu_insts = aiex.ipu.get_prolog() + npu_insts = aiex.npu.get_prolog() - @aie.device(AIEDevice.ipu) - def ipu(): + @aie.device(AIEDevice.npu) + def npu(): tile_0_0 = aie.tile(0, 0) tile_0_1 = aie.tile(0, 1) tile_0_2 = aie.tile(0, 2) @@ -424,8 +424,8 @@ def ipu(): 0 + d1_size_A * d1_stride_A, ] for i, bd_id in enumerate(range(2)): - ipu_insts.extend( - aiex.ipu.writebd_shimtile( + npu_insts.extend( + 
aiex.npu.writebd_shimtile( col, bd_id, buffer_length=tile_m_A * tile_n_A, @@ -433,16 +433,16 @@ def ipu(): ddr_id=ddr_id, ) ) - ipu_insts.extend( - aiex.ipu.shimtile_push_queue(MM2S, channel_index, col, bd_id) + npu_insts.extend( + aiex.npu.shimtile_push_queue(MM2S, channel_index, col, bd_id) ) # in B channel_index = 1 ddr_id = 1 for bd_id in range(bd_id + 1, bd_id + 1 + 4, 2): - ipu_insts.extend( - aiex.ipu.writebd_shimtile( + npu_insts.extend( + aiex.npu.writebd_shimtile( col, bd_id, buffer_length=tile_m_B * tile_n_B, @@ -454,13 +454,13 @@ def ipu(): d0_stride=d0_stride_B, ) ) - ipu_insts.extend( - aiex.ipu.shimtile_push_queue(MM2S, channel_index, col, bd_id) + npu_insts.extend( + aiex.npu.shimtile_push_queue(MM2S, channel_index, col, bd_id) ) bd_id += 1 # B tiles are "tall" so need to offset by cols (i.e. d0 dim) - ipu_insts.extend( - aiex.ipu.writebd_shimtile( + npu_insts.extend( + aiex.npu.writebd_shimtile( col, bd_id, buffer_length=tile_m_B * tile_n_B, @@ -472,8 +472,8 @@ def ipu(): d0_stride=d0_stride_B, ) ) - ipu_insts.extend( - aiex.ipu.shimtile_push_queue(MM2S, channel_index, col, bd_id) + npu_insts.extend( + aiex.npu.shimtile_push_queue(MM2S, channel_index, col, bd_id) ) # out C @@ -487,8 +487,8 @@ def ipu(): ] for i, bd_id in enumerate(range(bd_id + 1, bd_id + 1 + 4)): - ipu_insts.extend( - aiex.ipu.writebd_shimtile( + npu_insts.extend( + aiex.npu.writebd_shimtile( col, bd_id, buffer_length=tile_m_C * tile_n_C, @@ -500,11 +500,11 @@ def ipu(): d0_stride=d0_stride_C, ) ) - ipu_insts.extend( - aiex.ipu.shimtile_push_queue(S2MM, channel_index, col, bd_id) + npu_insts.extend( + aiex.npu.shimtile_push_queue(S2MM, channel_index, col, bd_id) ) - ipu_insts.extend( - aiex.ipu.sync( + npu_insts.extend( + aiex.npu.sync( channel=0, column=0, column_num=1, direction=0, row=0, row_num=1 ) ) @@ -570,9 +570,9 @@ def core(): compile_without_vectorization(ctx.module, workdir) xclbin_path = make_xclbin(ctx.module, workdir) - with FileLock("/tmp/ipu.lock"): + with 
FileLock("/tmp/npu.lock"): xclbin = XCLBin(xclbin_path, "MLIR_AIE") - xclbin.load_ipu_instructions(ipu_insts) + xclbin.load_npu_instructions(npu_insts) views = xclbin.mmap_buffers([(M, N), (M, N), (M, N)], np.int32) wrap_A = np.asarray(views[0]) diff --git a/test/ipu-xrt/e2e/test_tiled_nonsquare_tile_matrix_mult_vectorized.py b/test/npu-xrt/e2e/test_tiled_nonsquare_tile_matrix_mult_vectorized.py similarity index 92% rename from test/ipu-xrt/e2e/test_tiled_nonsquare_tile_matrix_mult_vectorized.py rename to test/npu-xrt/e2e/test_tiled_nonsquare_tile_matrix_mult_vectorized.py index 036400fb1a..3d5b85c45b 100644 --- a/test/ipu-xrt/e2e/test_tiled_nonsquare_tile_matrix_mult_vectorized.py +++ b/test/npu-xrt/e2e/test_tiled_nonsquare_tile_matrix_mult_vectorized.py @@ -101,12 +101,12 @@ def test_tiled_nonsquare_tile_matrix_mult_vectorized(ctx: MLIRContext, workdir: M, N, n_tile_rows=tile_rows_C, n_tile_cols=tile_cols_C ) - ipu_insts = aiex.ipu.get_prolog() + npu_insts = aiex.npu.get_prolog() mod_aie = ExplicitlyManagedModule() - @aie.device(AIEDevice.ipu) - def ipu(): + @aie.device(AIEDevice.npu) + def npu(): matmul_i32_i32.emit(decl=True) tile_0_0 = aie.tile(0, 0) tile_0_1 = aie.tile(0, 1) @@ -155,8 +155,8 @@ def ipu(): 0 + d1_size_A * d1_stride_A, ] for i, bd_id in enumerate(range(2)): - ipu_insts.extend( - aiex.ipu.writebd_shimtile( + npu_insts.extend( + aiex.npu.writebd_shimtile( col, bd_id, buffer_length=tile_m_A * tile_n_A, @@ -164,16 +164,16 @@ def ipu(): ddr_id=ddr_id, ) ) - ipu_insts.extend( - aiex.ipu.shimtile_push_queue(MM2S, channel_index, col, bd_id) + npu_insts.extend( + aiex.npu.shimtile_push_queue(MM2S, channel_index, col, bd_id) ) # in B channel_index = 1 ddr_id = 1 for bd_id in range(bd_id + 1, bd_id + 1 + 4, 2): - ipu_insts.extend( - aiex.ipu.writebd_shimtile( + npu_insts.extend( + aiex.npu.writebd_shimtile( col, bd_id, buffer_length=tile_m_B * tile_n_B, @@ -185,13 +185,13 @@ def ipu(): d0_stride=d0_stride_B, ) ) - ipu_insts.extend( - 
aiex.ipu.shimtile_push_queue(MM2S, channel_index, col, bd_id) + npu_insts.extend( + aiex.npu.shimtile_push_queue(MM2S, channel_index, col, bd_id) ) bd_id += 1 # B tiles are "tall" so need to offset by cols (i.e. d0 dim) - ipu_insts.extend( - aiex.ipu.writebd_shimtile( + npu_insts.extend( + aiex.npu.writebd_shimtile( col, bd_id, buffer_length=tile_m_B * tile_n_B, @@ -203,8 +203,8 @@ def ipu(): d0_stride=d0_stride_B, ) ) - ipu_insts.extend( - aiex.ipu.shimtile_push_queue(MM2S, channel_index, col, bd_id) + npu_insts.extend( + aiex.npu.shimtile_push_queue(MM2S, channel_index, col, bd_id) ) # out C @@ -218,8 +218,8 @@ def ipu(): ] for i, bd_id in enumerate(range(bd_id + 1, bd_id + 1 + 4)): - ipu_insts.extend( - aiex.ipu.writebd_shimtile( + npu_insts.extend( + aiex.npu.writebd_shimtile( col, bd_id, buffer_length=tile_m_C * tile_n_C, @@ -231,11 +231,11 @@ def ipu(): d0_stride=d0_stride_C, ) ) - ipu_insts.extend( - aiex.ipu.shimtile_push_queue(S2MM, channel_index, col, bd_id) + npu_insts.extend( + aiex.npu.shimtile_push_queue(S2MM, channel_index, col, bd_id) ) - ipu_insts.extend( - aiex.ipu.sync( + npu_insts.extend( + aiex.npu.sync( channel=0, column=0, column_num=1, @@ -408,9 +408,9 @@ def super_vectorize(target: any_op_t()): compile_with_vectorization(mod_aie, mod_aievec, workdir) xclbin_path = make_xclbin(mod_aie, workdir) - with FileLock("/tmp/ipu.lock"): + with FileLock("/tmp/npu.lock"): xclbin = XCLBin(xclbin_path, "MLIR_AIE") - xclbin.load_ipu_instructions(ipu_insts) + xclbin.load_npu_instructions(npu_insts) views = xclbin.mmap_buffers([(M, N), (M, N), (M, N)], np.int32) wrap_A = np.asarray(views[0]) @@ -466,12 +466,12 @@ def test_tiled_nonsquare_tile_matrix_mult_vectorized_sugar( M, N, n_tile_rows=tile_rows_C, n_tile_cols=tile_cols_C ) - ipu_insts = aiex.ipu.get_prolog() + npu_insts = aiex.npu.get_prolog() mod_aie = ExplicitlyManagedModule() - @aie.device(AIEDevice.ipu) - def ipu(): + @aie.device(AIEDevice.npu) + def npu(): matmul_i32_i32.emit(decl=True) tile_0_0 = 
aie.tile(0, 0) tile_0_1 = aie.tile(0, 1) @@ -515,8 +515,8 @@ def ipu(): 0 + d1_size_A * d1_stride_A, ] for i, bd_id in enumerate(range(2)): - ipu_insts.extend( - aiex.ipu.writebd_shimtile( + npu_insts.extend( + aiex.npu.writebd_shimtile( col, bd_id, buffer_length=tile_m_A * tile_n_A, @@ -524,8 +524,8 @@ def ipu(): ddr_id=ddr_id, ) ) - ipu_insts.extend( - aiex.ipu.shimtile_push_queue(MM2S, channel_index, col, bd_id) + npu_insts.extend( + aiex.npu.shimtile_push_queue(MM2S, channel_index, col, bd_id) ) # in B @@ -533,8 +533,8 @@ def ipu(): col = 0 ddr_id = 1 for bd_id in range(bd_id + 1, bd_id + 1 + 4, 2): - ipu_insts.extend( - aiex.ipu.writebd_shimtile( + npu_insts.extend( + aiex.npu.writebd_shimtile( col, bd_id, buffer_length=tile_m_B * tile_n_B, @@ -546,13 +546,13 @@ def ipu(): d0_stride=d0_stride_B, ) ) - ipu_insts.extend( - aiex.ipu.shimtile_push_queue(MM2S, channel_index, col, bd_id) + npu_insts.extend( + aiex.npu.shimtile_push_queue(MM2S, channel_index, col, bd_id) ) bd_id += 1 # B tiles are "tall" so need to offset by cols (i.e. 
d0 dim) - ipu_insts.extend( - aiex.ipu.writebd_shimtile( + npu_insts.extend( + aiex.npu.writebd_shimtile( col, bd_id, buffer_length=tile_m_B * tile_n_B, @@ -564,8 +564,8 @@ def ipu(): d0_stride=d0_stride_B, ) ) - ipu_insts.extend( - aiex.ipu.shimtile_push_queue(MM2S, channel_index, col, bd_id) + npu_insts.extend( + aiex.npu.shimtile_push_queue(MM2S, channel_index, col, bd_id) ) # out C @@ -580,8 +580,8 @@ def ipu(): ] for i, bd_id in enumerate(range(bd_id + 1, bd_id + 1 + 4)): - ipu_insts.extend( - aiex.ipu.writebd_shimtile( + npu_insts.extend( + aiex.npu.writebd_shimtile( col, bd_id, buffer_length=tile_m_C * tile_n_C, @@ -593,11 +593,11 @@ def ipu(): d0_stride=d0_stride_C, ) ) - ipu_insts.extend( - aiex.ipu.shimtile_push_queue(S2MM, channel_index, col, bd_id) + npu_insts.extend( + aiex.npu.shimtile_push_queue(S2MM, channel_index, col, bd_id) ) - ipu_insts.extend( - aiex.ipu.sync( + npu_insts.extend( + aiex.npu.sync( channel=0, column=0, column_num=1, @@ -736,9 +736,9 @@ def super_vectorize(target: any_op_t()): compile_with_vectorization(mod_aie, mod_aievec, workdir) xclbin_path = make_xclbin(mod_aie, workdir) - with FileLock("/tmp/ipu.lock"): + with FileLock("/tmp/npu.lock"): xclbin = XCLBin(xclbin_path, "MLIR_AIE") - xclbin.load_ipu_instructions(ipu_insts) + xclbin.load_npu_instructions(npu_insts) views = xclbin.mmap_buffers([(M, N), (M, N), (M, N)], np.int32) wrap_A = np.asarray(views[0]) @@ -828,12 +828,12 @@ def test_tiled_nonsquare_tile_matrix_mult_vectorized_sugar_already_vectorized( M, N, n_tile_rows=tile_rows_C, n_tile_cols=tile_cols_C ) - ipu_insts = aiex.ipu.get_prolog() + npu_insts = aiex.npu.get_prolog() mod_aie = ExplicitlyManagedModule() - @aie.device(AIEDevice.ipu) - def ipu(): + @aie.device(AIEDevice.npu) + def npu(): matmul_i32_i32_already_vectorized.emit(decl=True) tile_0_0 = aie.tile(0, 0) tile_0_1 = aie.tile(0, 1) @@ -877,8 +877,8 @@ def ipu(): 0 + d1_size_A * d1_stride_A, ] for i, bd_id in enumerate(range(2)): - ipu_insts.extend( - 
aiex.ipu.writebd_shimtile( + npu_insts.extend( + aiex.npu.writebd_shimtile( col, bd_id, buffer_length=tile_m_A * tile_n_A, @@ -886,15 +886,15 @@ def ipu(): ddr_id=ddr_id, ) ) - ipu_insts.extend(aiex.ipu.write32(MM2S, channel_index, col, bd_id)) + npu_insts.extend(aiex.npu.write32(MM2S, channel_index, col, bd_id)) # in B channel_index = 1 col = 0 ddr_id = 1 for bd_id in range(bd_id + 1, bd_id + 1 + 4, 2): - ipu_insts.extend( - aiex.ipu.writebd_shimtile( + npu_insts.extend( + aiex.npu.writebd_shimtile( col, bd_id, buffer_length=tile_m_B * tile_n_B, @@ -906,11 +906,11 @@ def ipu(): d0_stride=d0_stride_B, ) ) - ipu_insts.extend(aiex.ipu.write32(MM2S, channel_index, col, bd_id)) + npu_insts.extend(aiex.npu.write32(MM2S, channel_index, col, bd_id)) bd_id += 1 # B tiles are "tall" so need to offset by cols (i.e. d0 dim) - ipu_insts.extend( - aiex.ipu.writebd_shimtile( + npu_insts.extend( + aiex.npu.writebd_shimtile( col, bd_id, buffer_length=tile_m_B * tile_n_B, @@ -922,7 +922,7 @@ def ipu(): d0_stride=d0_stride_B, ) ) - ipu_insts.extend(aiex.ipu.write32(MM2S, channel_index, col, bd_id)) + npu_insts.extend(aiex.npu.write32(MM2S, channel_index, col, bd_id)) # out C channel_index = 0 @@ -936,8 +936,8 @@ def ipu(): ] for i, bd_id in enumerate(range(bd_id + 1, bd_id + 1 + 4)): - ipu_insts.extend( - aiex.ipu.writebd_shimtile( + npu_insts.extend( + aiex.npu.writebd_shimtile( col, bd_id, buffer_length=tile_m_C * tile_n_C, @@ -949,9 +949,9 @@ def ipu(): d0_stride=d0_stride_C, ) ) - ipu_insts.extend(aiex.ipu.write32(S2MM, channel_index, col, bd_id)) - ipu_insts.extend( - aiex.ipu.sync( + npu_insts.extend(aiex.npu.write32(S2MM, channel_index, col, bd_id)) + npu_insts.extend( + aiex.npu.sync( channel=0, column=0, column_num=1, @@ -1030,9 +1030,9 @@ def core(): compile_with_vectorization(mod_aie, mod_aievec, workdir) xclbin_path = make_xclbin(mod_aie, workdir) - with FileLock("/tmp/ipu.lock"): + with FileLock("/tmp/npu.lock"): xclbin = XCLBin(xclbin_path, "MLIR_AIE") - 
xclbin.load_ipu_instructions(ipu_insts) + xclbin.load_npu_instructions(npu_insts) wrap_A, wrap_B, wrap_C = map( np.asarray, xclbin.mmap_buffers([(M, N), (M, N), (M, N)], np.int32) ) diff --git a/test/ipu-xrt/e2e/test_tiled_vec_add.py b/test/npu-xrt/e2e/test_tiled_vec_add.py similarity index 90% rename from test/ipu-xrt/e2e/test_tiled_vec_add.py rename to test/npu-xrt/e2e/test_tiled_vec_add.py index ab0cd13769..ff8c1e77d1 100644 --- a/test/ipu-xrt/e2e/test_tiled_vec_add.py +++ b/test/npu-xrt/e2e/test_tiled_vec_add.py @@ -48,10 +48,10 @@ def test_vec_add(ctx: MLIRContext, workdir: Path): tiles = 4 k = K // tiles - ipu_insts = aiex.ipu.get_prolog() + npu_insts = aiex.npu.get_prolog() - @aie.device(AIEDevice.ipu) - def ipu(): + @aie.device(AIEDevice.npu) + def npu(): tile_0_0 = aie.tile(0, 0) tile_0_1 = aie.tile(0, 1) tile_0_2 = aie.tile(0, 2) @@ -95,8 +95,8 @@ def ipu(): ddr_id = 0 offsets = list(range(0, K, k)) for i, bd_id in enumerate(range(tiles)): - ipu_insts.extend( - aiex.ipu.writebd_shimtile( + npu_insts.extend( + aiex.npu.writebd_shimtile( col, bd_id, buffer_length=k, @@ -104,16 +104,16 @@ def ipu(): ddr_id=ddr_id, ) ) - ipu_insts.extend( - aiex.ipu.shimtile_push_queue(MM2S, channel_index, col, bd_id) + npu_insts.extend( + aiex.npu.shimtile_push_queue(MM2S, channel_index, col, bd_id) ) # in B channel_index = 1 ddr_id = 1 for i, bd_id in enumerate(range(bd_id + 1, bd_id + 1 + tiles)): - ipu_insts.extend( - aiex.ipu.writebd_shimtile( + npu_insts.extend( + aiex.npu.writebd_shimtile( col, bd_id, buffer_length=k, @@ -121,16 +121,16 @@ def ipu(): ddr_id=ddr_id, ) ) - ipu_insts.extend( - aiex.ipu.shimtile_push_queue(MM2S, channel_index, col, bd_id) + npu_insts.extend( + aiex.npu.shimtile_push_queue(MM2S, channel_index, col, bd_id) ) # out C channel_index = 0 ddr_id = 2 for i, bd_id in enumerate(range(bd_id + 1, bd_id + 1 + tiles)): - ipu_insts.extend( - aiex.ipu.writebd_shimtile( + npu_insts.extend( + aiex.npu.writebd_shimtile( col, bd_id, buffer_length=k, @@ 
-138,11 +138,11 @@ def ipu(): ddr_id=ddr_id, ) ) - ipu_insts.extend( - aiex.ipu.shimtile_push_queue(S2MM, channel_index, col, bd_id) + npu_insts.extend( + aiex.npu.shimtile_push_queue(S2MM, channel_index, col, bd_id) ) - ipu_insts.extend( - aiex.ipu.sync( + npu_insts.extend( + aiex.npu.sync( channel=0, column=0, column_num=1, @@ -241,9 +241,9 @@ def core(): compile_without_vectorization(ctx.module, workdir) xclbin_path = make_xclbin(ctx.module, workdir) - with FileLock("/tmp/ipu.lock"): + with FileLock("/tmp/npu.lock"): xclbin = XCLBin(xclbin_path, "MLIR_AIE") - xclbin.load_ipu_instructions(ipu_insts) + xclbin.load_npu_instructions(npu_insts) views = xclbin.mmap_buffers([(K,), (K,), (K,)], np.int32) wrap_A = np.asarray(views[0]) @@ -278,10 +278,10 @@ def test_vec_add_sugar(ctx: MLIRContext, workdir: Path): tiles = 4 k = K // tiles - ipu_insts = aiex.ipu.get_prolog() + npu_insts = aiex.npu.get_prolog() - @aie.device(AIEDevice.ipu) - def ipu(): + @aie.device(AIEDevice.npu) + def npu(): tile_0_0 = aie.tile(0, 0) tile_0_1 = aie.tile(0, 1) tile_0_2 = aie.tile(0, 2) @@ -316,8 +316,8 @@ def ipu(): ddr_id = 0 offsets = list(range(0, K, k)) for i, bd_id in enumerate(range(tiles)): - ipu_insts.extend( - aiex.ipu.writebd_shimtile( + npu_insts.extend( + aiex.npu.writebd_shimtile( col, bd_id, buffer_length=k, @@ -325,8 +325,8 @@ def ipu(): ddr_id=ddr_id, ) ) - ipu_insts.extend( - aiex.ipu.shimtile_push_queue(MM2S, channel_index, col, bd_id) + npu_insts.extend( + aiex.npu.shimtile_push_queue(MM2S, channel_index, col, bd_id) ) # in B @@ -334,8 +334,8 @@ def ipu(): col = 0 ddr_id = 1 for i, bd_id in enumerate(range(bd_id + 1, bd_id + 1 + tiles)): - ipu_insts.extend( - aiex.ipu.writebd_shimtile( + npu_insts.extend( + aiex.npu.writebd_shimtile( col, bd_id, buffer_length=k, @@ -343,8 +343,8 @@ def ipu(): ddr_id=ddr_id, ) ) - ipu_insts.extend( - aiex.ipu.shimtile_push_queue(MM2S, channel_index, col, bd_id) + npu_insts.extend( + aiex.npu.shimtile_push_queue(MM2S, channel_index, col, 
bd_id) ) # out C @@ -352,8 +352,8 @@ def ipu(): col = 0 ddr_id = 2 for i, bd_id in enumerate(range(bd_id + 1, bd_id + 1 + tiles)): - ipu_insts.extend( - aiex.ipu.writebd_shimtile( + npu_insts.extend( + aiex.npu.writebd_shimtile( col, bd_id, buffer_length=k, @@ -361,11 +361,11 @@ def ipu(): ddr_id=ddr_id, ) ) - ipu_insts.extend( - aiex.ipu.shimtile_push_queue(S2MM, channel_index, col, bd_id) + npu_insts.extend( + aiex.npu.shimtile_push_queue(S2MM, channel_index, col, bd_id) ) - ipu_insts.extend( - aiex.ipu.sync( + npu_insts.extend( + aiex.npu.sync( channel=0, column=0, column_num=1, @@ -422,9 +422,9 @@ def core(): compile_without_vectorization(ctx.module, workdir) xclbin_path = make_xclbin(ctx.module, workdir) - with FileLock("/tmp/ipu.lock"): + with FileLock("/tmp/npu.lock"): xclbin = XCLBin(xclbin_path, "MLIR_AIE") - xclbin.load_ipu_instructions(ipu_insts) + xclbin.load_npu_instructions(npu_insts) views = xclbin.mmap_buffers([(K,), (K,), (K,)], np.int32) wrap_A = np.asarray(views[0]) diff --git a/test/ipu-xrt/e2e/test_tiled_vec_add_vectorized.py b/test/npu-xrt/e2e/test_tiled_vec_add_vectorized.py similarity index 92% rename from test/ipu-xrt/e2e/test_tiled_vec_add_vectorized.py rename to test/npu-xrt/e2e/test_tiled_vec_add_vectorized.py index d0990390ae..bcc8beb2be 100644 --- a/test/ipu-xrt/e2e/test_tiled_vec_add_vectorized.py +++ b/test/npu-xrt/e2e/test_tiled_vec_add_vectorized.py @@ -66,11 +66,11 @@ def vec_add_i32_i32( def test_vec_add_vectorized(ctx: MLIRContext, workdir: Path): - ipu_insts = aiex.ipu.get_prolog() + npu_insts = aiex.npu.get_prolog() mod_aie = ExplicitlyManagedModule() - @aie.device(AIEDevice.ipu) - def ipu(): + @aie.device(AIEDevice.npu) + def npu(): vec_add_i32_i32.emit(decl=True) tile_0_0 = aie.tile(0, 0) tile_0_1 = aie.tile(0, 1) @@ -115,8 +115,8 @@ def ipu(): ddr_id = 0 offsets = list(range(0, K, k)) for i, bd_id in enumerate(range(tiles)): - ipu_insts.extend( - aiex.ipu.writebd_shimtile( + npu_insts.extend( + aiex.npu.writebd_shimtile( 
col, bd_id, buffer_length=k, @@ -124,8 +124,8 @@ def ipu(): ddr_id=ddr_id, ) ) - ipu_insts.extend( - aiex.ipu.shimtile_push_queue(MM2S, channel_index, col, bd_id) + npu_insts.extend( + aiex.npu.shimtile_push_queue(MM2S, channel_index, col, bd_id) ) # in B @@ -133,8 +133,8 @@ def ipu(): col = 0 ddr_id = 1 for i, bd_id in enumerate(range(bd_id + 1, bd_id + 1 + tiles)): - ipu_insts.extend( - aiex.ipu.writebd_shimtile( + npu_insts.extend( + aiex.npu.writebd_shimtile( col, bd_id, buffer_length=k, @@ -142,8 +142,8 @@ def ipu(): ddr_id=ddr_id, ) ) - ipu_insts.extend( - aiex.ipu.shimtile_push_queue(MM2S, channel_index, col, bd_id) + npu_insts.extend( + aiex.npu.shimtile_push_queue(MM2S, channel_index, col, bd_id) ) # out C @@ -151,8 +151,8 @@ def ipu(): col = 0 ddr_id = 2 for i, bd_id in enumerate(range(bd_id + 1, bd_id + 1 + tiles)): - ipu_insts.extend( - aiex.ipu.writebd_shimtile( + npu_insts.extend( + aiex.npu.writebd_shimtile( col, bd_id, buffer_length=k, @@ -160,11 +160,11 @@ def ipu(): ddr_id=ddr_id, ) ) - ipu_insts.extend( - aiex.ipu.shimtile_push_queue(S2MM, channel_index, col, bd_id) + npu_insts.extend( + aiex.npu.shimtile_push_queue(S2MM, channel_index, col, bd_id) ) - ipu_insts.extend( - aiex.ipu.sync( + npu_insts.extend( + aiex.npu.sync( channel=0, column=0, column_num=1, @@ -328,9 +328,9 @@ def super_vectorize(target: any_op_t()): ) compile_with_vectorization(mod_aie, mod_aievec, workdir) xclbin_path = make_xclbin(mod_aie, workdir) - with FileLock("/tmp/ipu.lock"): + with FileLock("/tmp/npu.lock"): xclbin = XCLBin(xclbin_path, "MLIR_AIE") - xclbin.load_ipu_instructions(ipu_insts) + xclbin.load_npu_instructions(npu_insts) views = xclbin.mmap_buffers([(K,), (K,), (K,)], np.int32) wrap_A = np.asarray(views[0]) @@ -359,11 +359,11 @@ def super_vectorize(target: any_op_t()): def test_vec_add_vectorized_sugar(ctx: MLIRContext, workdir: Path): - ipu_insts = aiex.ipu.get_prolog() + npu_insts = aiex.npu.get_prolog() mod_aie = ExplicitlyManagedModule() - 
@aie.device(AIEDevice.ipu) - def ipu(): + @aie.device(AIEDevice.npu) + def npu(): vec_add_i32_i32.emit(decl=True) tile_0_0 = aie.tile(0, 0) tile_0_1 = aie.tile(0, 1) @@ -399,8 +399,8 @@ def ipu(): ddr_id = 0 offsets = list(range(0, K, k)) for i, bd_id in enumerate(range(tiles)): - ipu_insts.extend( - aiex.ipu.writebd_shimtile( + npu_insts.extend( + aiex.npu.writebd_shimtile( col, bd_id, buffer_length=k, @@ -408,8 +408,8 @@ def ipu(): ddr_id=ddr_id, ) ) - ipu_insts.extend( - aiex.ipu.shimtile_push_queue(MM2S, channel_index, col, bd_id) + npu_insts.extend( + aiex.npu.shimtile_push_queue(MM2S, channel_index, col, bd_id) ) # in B @@ -417,8 +417,8 @@ def ipu(): col = 0 ddr_id = 1 for i, bd_id in enumerate(range(bd_id + 1, bd_id + 1 + tiles)): - ipu_insts.extend( - aiex.ipu.writebd_shimtile( + npu_insts.extend( + aiex.npu.writebd_shimtile( col, bd_id, buffer_length=k, @@ -426,8 +426,8 @@ def ipu(): ddr_id=ddr_id, ) ) - ipu_insts.extend( - aiex.ipu.shimtile_push_queue(MM2S, channel_index, col, bd_id) + npu_insts.extend( + aiex.npu.shimtile_push_queue(MM2S, channel_index, col, bd_id) ) # out C @@ -435,8 +435,8 @@ def ipu(): col = 0 ddr_id = 2 for i, bd_id in enumerate(range(bd_id + 1, bd_id + 1 + tiles)): - ipu_insts.extend( - aiex.ipu.writebd_shimtile( + npu_insts.extend( + aiex.npu.writebd_shimtile( col, bd_id, buffer_length=k, @@ -444,11 +444,11 @@ def ipu(): ddr_id=ddr_id, ) ) - ipu_insts.extend( - aiex.ipu.shimtile_push_queue(S2MM, channel_index, col, bd_id) + npu_insts.extend( + aiex.npu.shimtile_push_queue(S2MM, channel_index, col, bd_id) ) - ipu_insts.extend( - aiex.ipu.sync( + npu_insts.extend( + aiex.npu.sync( channel=0, column=0, column_num=1, @@ -574,9 +574,9 @@ def super_vectorize(target: any_op_t()): compile_with_vectorization(mod_aie, mod_aievec, workdir) xclbin_path = make_xclbin(mod_aie, workdir) - with FileLock("/tmp/ipu.lock"): + with FileLock("/tmp/npu.lock"): xclbin = XCLBin(xclbin_path, "MLIR_AIE") - xclbin.load_ipu_instructions(ipu_insts) + 
xclbin.load_npu_instructions(npu_insts) views = xclbin.mmap_buffers([(K,), (K,), (K,)], np.int32) wrap_A = np.asarray(views[0]) diff --git a/test/ipu-xrt/e2e/test_vec_dot.py b/test/npu-xrt/e2e/test_vec_dot.py similarity index 90% rename from test/ipu-xrt/e2e/test_vec_dot.py rename to test/npu-xrt/e2e/test_vec_dot.py index 7a2012a1d0..f111316692 100644 --- a/test/ipu-xrt/e2e/test_vec_dot.py +++ b/test/npu-xrt/e2e/test_vec_dot.py @@ -52,10 +52,10 @@ def test_vec_dot(ctx: MLIRContext, workdir: Path): tiles = 4 k = K // tiles - ipu_insts = aiex.ipu.get_prolog() + npu_insts = aiex.npu.get_prolog() - @aie.device(AIEDevice.ipu) - def ipu(): + @aie.device(AIEDevice.npu) + def npu(): tile_0_0 = aie.tile(0, 0) tile_0_1 = aie.tile(0, 1) tile_0_2 = aie.tile(0, 2) @@ -99,8 +99,8 @@ def ipu(): ddr_id = 0 offsets = list(range(0, K, k)) for i, bd_id in enumerate(range(tiles)): - ipu_insts.extend( - aiex.ipu.writebd_shimtile( + npu_insts.extend( + aiex.npu.writebd_shimtile( col, bd_id, buffer_length=k, @@ -108,8 +108,8 @@ def ipu(): ddr_id=ddr_id, ) ) - ipu_insts.extend( - aiex.ipu.shimtile_push_queue(MM2S, channel_index, col, bd_id) + npu_insts.extend( + aiex.npu.shimtile_push_queue(MM2S, channel_index, col, bd_id) ) # in B @@ -117,8 +117,8 @@ def ipu(): col = 0 ddr_id = 1 for i, bd_id in enumerate(range(bd_id + 1, bd_id + 1 + tiles)): - ipu_insts.extend( - aiex.ipu.writebd_shimtile( + npu_insts.extend( + aiex.npu.writebd_shimtile( col, bd_id, buffer_length=k, @@ -126,8 +126,8 @@ def ipu(): ddr_id=ddr_id, ) ) - ipu_insts.extend( - aiex.ipu.shimtile_push_queue(MM2S, channel_index, col, bd_id) + npu_insts.extend( + aiex.npu.shimtile_push_queue(MM2S, channel_index, col, bd_id) ) # out C @@ -135,8 +135,8 @@ def ipu(): col = 0 ddr_id = 2 for i, bd_id in enumerate(range(bd_id + 1, bd_id + 1 + tiles)): - ipu_insts.extend( - aiex.ipu.writebd_shimtile( + npu_insts.extend( + aiex.npu.writebd_shimtile( col, bd_id, buffer_length=1, @@ -144,11 +144,11 @@ def ipu(): ddr_id=ddr_id, ) ) - 
ipu_insts.extend( - aiex.ipu.shimtile_push_queue(S2MM, channel_index, col, bd_id) + npu_insts.extend( + aiex.npu.shimtile_push_queue(S2MM, channel_index, col, bd_id) ) - ipu_insts.extend( - aiex.ipu.sync( + npu_insts.extend( + aiex.npu.sync( channel=0, column=0, column_num=1, @@ -252,9 +252,9 @@ def core(): compile_without_vectorization(ctx.module, workdir) xclbin_path = make_xclbin(ctx.module, workdir) - with FileLock("/tmp/ipu.lock"): + with FileLock("/tmp/npu.lock"): xclbin = XCLBin(xclbin_path, "MLIR_AIE") - xclbin.load_ipu_instructions(ipu_insts) + xclbin.load_npu_instructions(npu_insts) views = xclbin.mmap_buffers([(K,), (K,), (tiles,)], np.int32) wrap_A = np.asarray(views[0]) @@ -289,10 +289,10 @@ def test_vec_dot_sugar(ctx: MLIRContext, workdir: Path): tiles = 4 k = K // tiles - ipu_insts = aiex.ipu.get_prolog() + npu_insts = aiex.npu.get_prolog() - @aie.device(AIEDevice.ipu) - def ipu(): + @aie.device(AIEDevice.npu) + def npu(): tile_0_0 = aie.tile(0, 0) tile_0_1 = aie.tile(0, 1) tile_0_2 = aie.tile(0, 2) @@ -327,8 +327,8 @@ def ipu(): ddr_id = 0 offsets = list(range(0, K, k)) for i, bd_id in enumerate(range(tiles)): - ipu_insts.extend( - aiex.ipu.writebd_shimtile( + npu_insts.extend( + aiex.npu.writebd_shimtile( col, bd_id, buffer_length=k, @@ -336,8 +336,8 @@ def ipu(): ddr_id=ddr_id, ) ) - ipu_insts.extend( - aiex.ipu.shimtile_push_queue(MM2S, channel_index, col, bd_id) + npu_insts.extend( + aiex.npu.shimtile_push_queue(MM2S, channel_index, col, bd_id) ) # in B @@ -345,8 +345,8 @@ def ipu(): col = 0 ddr_id = 1 for i, bd_id in enumerate(range(bd_id + 1, bd_id + 1 + tiles)): - ipu_insts.extend( - aiex.ipu.writebd_shimtile( + npu_insts.extend( + aiex.npu.writebd_shimtile( col, bd_id, buffer_length=k, @@ -354,8 +354,8 @@ def ipu(): ddr_id=ddr_id, ) ) - ipu_insts.extend( - aiex.ipu.shimtile_push_queue(MM2S, channel_index, col, bd_id) + npu_insts.extend( + aiex.npu.shimtile_push_queue(MM2S, channel_index, col, bd_id) ) # out C @@ -363,8 +363,8 @@ def ipu(): 
col = 0 ddr_id = 2 for i, bd_id in enumerate(range(bd_id + 1, bd_id + 1 + tiles)): - ipu_insts.extend( - aiex.ipu.writebd_shimtile( + npu_insts.extend( + aiex.npu.writebd_shimtile( col, bd_id, buffer_length=1, @@ -372,11 +372,11 @@ def ipu(): ddr_id=ddr_id, ) ) - ipu_insts.extend( - aiex.ipu.shimtile_push_queue(S2MM, channel_index, col, bd_id) + npu_insts.extend( + aiex.npu.shimtile_push_queue(S2MM, channel_index, col, bd_id) ) - ipu_insts.extend( - aiex.ipu.sync( + npu_insts.extend( + aiex.npu.sync( channel=0, column=0, column_num=1, @@ -440,9 +440,9 @@ def core(): compile_without_vectorization(ctx.module, workdir) xclbin_path = make_xclbin(ctx.module, workdir) - with FileLock("/tmp/ipu.lock"): + with FileLock("/tmp/npu.lock"): xclbin = XCLBin(xclbin_path, "MLIR_AIE") - xclbin.load_ipu_instructions(ipu_insts) + xclbin.load_npu_instructions(npu_insts) views = xclbin.mmap_buffers([(K,), (K,), (tiles,)], np.int32) wrap_A = np.asarray(views[0]) diff --git a/test/ipu-xrt/e2e/tiled_matrix_add.ipynb b/test/npu-xrt/e2e/tiled_matrix_add.ipynb similarity index 91% rename from test/ipu-xrt/e2e/tiled_matrix_add.ipynb rename to test/npu-xrt/e2e/tiled_matrix_add.ipynb index 1d9a6f1c59..0c9a2247ac 100644 --- a/test/ipu-xrt/e2e/tiled_matrix_add.ipynb +++ b/test/npu-xrt/e2e/tiled_matrix_add.ipynb @@ -94,7 +94,7 @@ "_, _, (d1_size, d1_stride), (d0_size, d0_stride) = tiling_calculator_n_tiles(\n", " M, N, n_tile_rows=n_tile_rows, n_tile_cols=n_tile_cols\n", ")\n", - "ipu_insts = aiex.ipu.get_prolog()" + "npu_insts = aiex.npu.get_prolog()" ] }, { @@ -112,8 +112,8 @@ "metadata": {}, "outputs": [], "source": [ - "@aie.device(AIEDevice.ipu)\n", - "def ipu(): # function name isn't load-bearing\n", + "@aie.device(AIEDevice.npu)\n", + "def npu(): # function name isn't load-bearing\n", "\n", " # tiles that will participate\n", " shim_tile_0_0 = aie.tile(0, 0)\n", @@ -193,8 +193,8 @@ " # (yes this is a weird naming/assignment but it's due to a hack in implementation...)\n", " ddr_id = 0\n", 
" for i, bd_id in enumerate(range(4)):\n", - " ipu_insts.extend(\n", - " aiex.ipu.writebd_shimtile(\n", + " npu_insts.extend(\n", + " aiex.npu.writebd_shimtile(\n", " col,\n", " bd_id,\n", " tile_rows * tile_cols,\n", @@ -206,8 +206,8 @@ " d0_stride=d0_stride,\n", " )\n", " )\n", - " ipu_insts.extend(\n", - " aiex.ipu.shimtile_push_queue(\n", + " npu_insts.extend(\n", + " aiex.npu.shimtile_push_queue(\n", " MM2S, input_a_tile_0_0_to_tile_0_1.source_channel, col, bd_id\n", " )\n", " )\n", @@ -215,8 +215,8 @@ " # in B\n", " ddr_id = 1\n", " for i, bd_id in enumerate(range(bd_id + 1, bd_id + 1 + 4)):\n", - " ipu_insts.extend(\n", - " aiex.ipu.writebd_shimtile(\n", + " npu_insts.extend(\n", + " aiex.npu.writebd_shimtile(\n", " col,\n", " bd_id,\n", " tile_rows * tile_cols,\n", @@ -228,8 +228,8 @@ " d0_stride=d0_stride,\n", " )\n", " )\n", - " ipu_insts.extend(\n", - " aiex.ipu.shimtile_push_queue(\n", + " npu_insts.extend(\n", + " aiex.npu.shimtile_push_queue(\n", " MM2S, input_b_tile_0_0_to_tile_0_1.source_channel, col, bd_id\n", " )\n", " )\n", @@ -237,8 +237,8 @@ " # out C\n", " ddr_id = 2\n", " for i, bd_id in enumerate(range(bd_id + 1, bd_id + 1 + 4)):\n", - " ipu_insts.extend(\n", - " aiex.ipu.writebd_shimtile(\n", + " npu_insts.extend(\n", + " aiex.npu.writebd_shimtile(\n", " bd_id,\n", " tile_rows * tile_cols,\n", " offsets[i],\n", @@ -249,13 +249,13 @@ " d0_stride=d0_stride,\n", " )\n", " )\n", - " ipu_insts.extend(\n", - " aiex.ipu.shimtile_push_queue(\n", + " npu_insts.extend(\n", + " aiex.npu.shimtile_push_queue(\n", " S2MM, output_c_tile_0_1_to_tile_0_0.dest_channel, col, bd_id\n", " )\n", " )\n", - " ipu_insts.extend(\n", - " aiex.ipu.sync(\n", + " npu_insts.extend(\n", + " aiex.npu.sync(\n", " channel=0, column=0, column_num=1, direction=0, row=0, row_num=1\n", " )\n", " )\n", @@ -365,7 +365,7 @@ "output_type": "stream", "text": [ "module {\n", - " aie.device(ipu) {\n", + " aie.device(npu) {\n", " %tile_0_0 = aie.tile(0, 0)\n", " %tile_0_1 = aie.tile(0, 
1)\n", " %tile_0_2 = aie.tile(0, 2)\n", @@ -376,34 +376,34 @@ " aie.flow(%tile_0_2, DMA : 0, %tile_0_1, DMA : 2)\n", " aie.flow(%tile_0_1, DMA : 2, %tile_0_0, DMA : 0)\n", " func.func @bobsyouruncle() {\n", - " aiex.ipu.writebd_shimtile {bd_id = 0 : i32, buffer_length = 64 : i32, buffer_offset = 0 : i32, column = 0 : i32, column_num = 1 : i32, d0_size = 8 : i32, d0_stride = 0 : i32, d1_size = 8 : i32, d1_stride = 15 : i32, d2_stride = 0 : i32, ddr_id = 0 : i32, enable_packet = 0 : i32, iteration_current = 0 : i32, iteration_size = 0 : i32, iteration_stride = 0 : i32, lock_acq_enable = 0 : i32, lock_acq_id = 0 : i32, lock_acq_val = 0 : i32, lock_rel_id = 0 : i32, lock_rel_val = 0 : i32, next_bd = 0 : i32, out_of_order_id = 0 : i32, packet_id = 0 : i32, packet_type = 0 : i32, use_next_bd = 0 : i32, valid_bd = 1 : i32}\n", - " aiex.ipu.write32 {address = 119316 : ui32, column = 0 : i32, row = 0 : i32, value = 0 : ui32}\n", - " aiex.ipu.writebd_shimtile {bd_id = 1 : i32, buffer_length = 64 : i32, buffer_offset = 32 : i32, column = 0 : i32, column_num = 1 : i32, d0_size = 8 : i32, d0_stride = 0 : i32, d1_size = 8 : i32, d1_stride = 15 : i32, d2_stride = 0 : i32, ddr_id = 0 : i32, enable_packet = 0 : i32, iteration_current = 0 : i32, iteration_size = 0 : i32, iteration_stride = 0 : i32, lock_acq_enable = 0 : i32, lock_acq_id = 0 : i32, lock_acq_val = 0 : i32, lock_rel_id = 0 : i32, lock_rel_val = 0 : i32, next_bd = 0 : i32, out_of_order_id = 0 : i32, packet_id = 0 : i32, packet_type = 0 : i32, use_next_bd = 0 : i32, valid_bd = 1 : i32}\n", - " aiex.ipu.write32 {address = 119316 : ui32, column = 0 : i32, row = 0 : i32, value = 1 : ui32}\n", - " aiex.ipu.writebd_shimtile {bd_id = 2 : i32, buffer_length = 64 : i32, buffer_offset = 512 : i32, column = 0 : i32, column_num = 1 : i32, d0_size = 8 : i32, d0_stride = 0 : i32, d1_size = 8 : i32, d1_stride = 15 : i32, d2_stride = 0 : i32, ddr_id = 0 : i32, enable_packet = 0 : i32, iteration_current = 0 : i32, iteration_size = 0 : 
i32, iteration_stride = 0 : i32, lock_acq_enable = 0 : i32, lock_acq_id = 0 : i32, lock_acq_val = 0 : i32, lock_rel_id = 0 : i32, lock_rel_val = 0 : i32, next_bd = 0 : i32, out_of_order_id = 0 : i32, packet_id = 0 : i32, packet_type = 0 : i32, use_next_bd = 0 : i32, valid_bd = 1 : i32}\n", - " aiex.ipu.write32 {address = 119316 : ui32, column = 0 : i32, row = 0 : i32, value = 2 : ui32}\n", - " aiex.ipu.writebd_shimtile {bd_id = 3 : i32, buffer_length = 64 : i32, buffer_offset = 544 : i32, column = 0 : i32, column_num = 1 : i32, d0_size = 8 : i32, d0_stride = 0 : i32, d1_size = 8 : i32, d1_stride = 15 : i32, d2_stride = 0 : i32, ddr_id = 0 : i32, enable_packet = 0 : i32, iteration_current = 0 : i32, iteration_size = 0 : i32, iteration_stride = 0 : i32, lock_acq_enable = 0 : i32, lock_acq_id = 0 : i32, lock_acq_val = 0 : i32, lock_rel_id = 0 : i32, lock_rel_val = 0 : i32, next_bd = 0 : i32, out_of_order_id = 0 : i32, packet_id = 0 : i32, packet_type = 0 : i32, use_next_bd = 0 : i32, valid_bd = 1 : i32}\n", - " aiex.ipu.write32 {address = 119316 : ui32, column = 0 : i32, row = 0 : i32, value = 3 : ui32}\n", - " aiex.ipu.writebd_shimtile {bd_id = 4 : i32, buffer_length = 64 : i32, buffer_offset = 0 : i32, column = 0 : i32, column_num = 1 : i32, d0_size = 8 : i32, d0_stride = 0 : i32, d1_size = 8 : i32, d1_stride = 15 : i32, d2_stride = 0 : i32, ddr_id = 1 : i32, enable_packet = 0 : i32, iteration_current = 0 : i32, iteration_size = 0 : i32, iteration_stride = 0 : i32, lock_acq_enable = 0 : i32, lock_acq_id = 0 : i32, lock_acq_val = 0 : i32, lock_rel_id = 0 : i32, lock_rel_val = 0 : i32, next_bd = 0 : i32, out_of_order_id = 0 : i32, packet_id = 0 : i32, packet_type = 0 : i32, use_next_bd = 0 : i32, valid_bd = 1 : i32}\n", - " aiex.ipu.write32 {address = 119324 : ui32, column = 0 : i32, row = 0 : i32, value = 4 : ui32}\n", - " aiex.ipu.writebd_shimtile {bd_id = 5 : i32, buffer_length = 64 : i32, buffer_offset = 32 : i32, column = 0 : i32, column_num = 1 : i32, d0_size = 
8 : i32, d0_stride = 0 : i32, d1_size = 8 : i32, d1_stride = 15 : i32, d2_stride = 0 : i32, ddr_id = 1 : i32, enable_packet = 0 : i32, iteration_current = 0 : i32, iteration_size = 0 : i32, iteration_stride = 0 : i32, lock_acq_enable = 0 : i32, lock_acq_id = 0 : i32, lock_acq_val = 0 : i32, lock_rel_id = 0 : i32, lock_rel_val = 0 : i32, next_bd = 0 : i32, out_of_order_id = 0 : i32, packet_id = 0 : i32, packet_type = 0 : i32, use_next_bd = 0 : i32, valid_bd = 1 : i32}\n", - " aiex.ipu.write32 {address = 119324 : ui32, column = 0 : i32, row = 0 : i32, value = 5 : ui32}\n", - " aiex.ipu.writebd_shimtile {bd_id = 6 : i32, buffer_length = 64 : i32, buffer_offset = 512 : i32, column = 0 : i32, column_num = 1 : i32, d0_size = 8 : i32, d0_stride = 0 : i32, d1_size = 8 : i32, d1_stride = 15 : i32, d2_stride = 0 : i32, ddr_id = 1 : i32, enable_packet = 0 : i32, iteration_current = 0 : i32, iteration_size = 0 : i32, iteration_stride = 0 : i32, lock_acq_enable = 0 : i32, lock_acq_id = 0 : i32, lock_acq_val = 0 : i32, lock_rel_id = 0 : i32, lock_rel_val = 0 : i32, next_bd = 0 : i32, out_of_order_id = 0 : i32, packet_id = 0 : i32, packet_type = 0 : i32, use_next_bd = 0 : i32, valid_bd = 1 : i32}\n", - " aiex.ipu.write32 {address = 119324 : ui32, column = 0 : i32, row = 0 : i32, value = 6 : ui32}\n", - " aiex.ipu.writebd_shimtile {bd_id = 7 : i32, buffer_length = 64 : i32, buffer_offset = 544 : i32, column = 0 : i32, column_num = 1 : i32, d0_size = 8 : i32, d0_stride = 0 : i32, d1_size = 8 : i32, d1_stride = 15 : i32, d2_stride = 0 : i32, ddr_id = 1 : i32, enable_packet = 0 : i32, iteration_current = 0 : i32, iteration_size = 0 : i32, iteration_stride = 0 : i32, lock_acq_enable = 0 : i32, lock_acq_id = 0 : i32, lock_acq_val = 0 : i32, lock_rel_id = 0 : i32, lock_rel_val = 0 : i32, next_bd = 0 : i32, out_of_order_id = 0 : i32, packet_id = 0 : i32, packet_type = 0 : i32, use_next_bd = 0 : i32, valid_bd = 1 : i32}\n", - " aiex.ipu.write32 {address = 119324 : ui32, column = 0 : i32, 
row = 0 : i32, value = 7 : ui32}\n", - " aiex.ipu.writebd_shimtile {bd_id = 8 : i32, buffer_length = 64 : i32, buffer_offset = 0 : i32, column = 0 : i32, column_num = 1 : i32, d0_size = 8 : i32, d0_stride = 0 : i32, d1_size = 8 : i32, d1_stride = 15 : i32, d2_stride = 0 : i32, ddr_id = 2 : i32, enable_packet = 0 : i32, iteration_current = 0 : i32, iteration_size = 0 : i32, iteration_stride = 0 : i32, lock_acq_enable = 0 : i32, lock_acq_id = 0 : i32, lock_acq_val = 0 : i32, lock_rel_id = 0 : i32, lock_rel_val = 0 : i32, next_bd = 0 : i32, out_of_order_id = 0 : i32, packet_id = 0 : i32, packet_type = 0 : i32, use_next_bd = 0 : i32, valid_bd = 1 : i32}\n", - " aiex.ipu.write32 {address = 119300 : ui32, column = 0 : i32, row = 0 : i32, value = 2147483656 : ui32}\n", - " aiex.ipu.sync {channel = 0 : i32, column = 0 : i32, column_num = 1 : i32, direction = 0 : i32, row = 0 : i32, row_num = 1 : i32}\n", - " aiex.ipu.writebd_shimtile {bd_id = 9 : i32, buffer_length = 64 : i32, buffer_offset = 32 : i32, column = 0 : i32, column_num = 1 : i32, d0_size = 8 : i32, d0_stride = 0 : i32, d1_size = 8 : i32, d1_stride = 15 : i32, d2_stride = 0 : i32, ddr_id = 2 : i32, enable_packet = 0 : i32, iteration_current = 0 : i32, iteration_size = 0 : i32, iteration_stride = 0 : i32, lock_acq_enable = 0 : i32, lock_acq_id = 0 : i32, lock_acq_val = 0 : i32, lock_rel_id = 0 : i32, lock_rel_val = 0 : i32, next_bd = 0 : i32, out_of_order_id = 0 : i32, packet_id = 0 : i32, packet_type = 0 : i32, use_next_bd = 0 : i32, valid_bd = 1 : i32}\n", - " aiex.ipu.write32 {address = 119300 : ui32, column = 0 : i32, row = 0 : i32, value = 2147483657 : ui32}\n", - " aiex.ipu.sync {channel = 0 : i32, column = 0 : i32, column_num = 1 : i32, direction = 0 : i32, row = 0 : i32, row_num = 1 : i32}\n", - " aiex.ipu.writebd_shimtile {bd_id = 10 : i32, buffer_length = 64 : i32, buffer_offset = 512 : i32, column = 0 : i32, column_num = 1 : i32, d0_size = 8 : i32, d0_stride = 0 : i32, d1_size = 8 : i32, d1_stride = 15 
: i32, d2_stride = 0 : i32, ddr_id = 2 : i32, enable_packet = 0 : i32, iteration_current = 0 : i32, iteration_size = 0 : i32, iteration_stride = 0 : i32, lock_acq_enable = 0 : i32, lock_acq_id = 0 : i32, lock_acq_val = 0 : i32, lock_rel_id = 0 : i32, lock_rel_val = 0 : i32, next_bd = 0 : i32, out_of_order_id = 0 : i32, packet_id = 0 : i32, packet_type = 0 : i32, use_next_bd = 0 : i32, valid_bd = 1 : i32}\n", - " aiex.ipu.write32 {address = 119300 : ui32, column = 0 : i32, row = 0 : i32, value = 2147483658 : ui32}\n", - " aiex.ipu.sync {channel = 0 : i32, column = 0 : i32, column_num = 1 : i32, direction = 0 : i32, row = 0 : i32, row_num = 1 : i32}\n", - " aiex.ipu.writebd_shimtile {bd_id = 11 : i32, buffer_length = 64 : i32, buffer_offset = 544 : i32, column = 0 : i32, column_num = 1 : i32, d0_size = 8 : i32, d0_stride = 0 : i32, d1_size = 8 : i32, d1_stride = 15 : i32, d2_stride = 0 : i32, ddr_id = 2 : i32, enable_packet = 0 : i32, iteration_current = 0 : i32, iteration_size = 0 : i32, iteration_stride = 0 : i32, lock_acq_enable = 0 : i32, lock_acq_id = 0 : i32, lock_acq_val = 0 : i32, lock_rel_id = 0 : i32, lock_rel_val = 0 : i32, next_bd = 0 : i32, out_of_order_id = 0 : i32, packet_id = 0 : i32, packet_type = 0 : i32, use_next_bd = 0 : i32, valid_bd = 1 : i32}\n", - " aiex.ipu.write32 {address = 119300 : ui32, column = 0 : i32, row = 0 : i32, value = 2147483659 : ui32}\n", - " aiex.ipu.sync {channel = 0 : i32, column = 0 : i32, column_num = 1 : i32, direction = 0 : i32, row = 0 : i32, row_num = 1 : i32}\n", + " aiex.npu.writebd_shimtile {bd_id = 0 : i32, buffer_length = 64 : i32, buffer_offset = 0 : i32, column = 0 : i32, column_num = 1 : i32, d0_size = 8 : i32, d0_stride = 0 : i32, d1_size = 8 : i32, d1_stride = 15 : i32, d2_stride = 0 : i32, ddr_id = 0 : i32, enable_packet = 0 : i32, iteration_current = 0 : i32, iteration_size = 0 : i32, iteration_stride = 0 : i32, lock_acq_enable = 0 : i32, lock_acq_id = 0 : i32, lock_acq_val = 0 : i32, lock_rel_id = 0 : i32, 
lock_rel_val = 0 : i32, next_bd = 0 : i32, out_of_order_id = 0 : i32, packet_id = 0 : i32, packet_type = 0 : i32, use_next_bd = 0 : i32, valid_bd = 1 : i32}\n", + " aiex.npu.write32 {address = 119316 : ui32, column = 0 : i32, row = 0 : i32, value = 0 : ui32}\n", + " aiex.npu.writebd_shimtile {bd_id = 1 : i32, buffer_length = 64 : i32, buffer_offset = 32 : i32, column = 0 : i32, column_num = 1 : i32, d0_size = 8 : i32, d0_stride = 0 : i32, d1_size = 8 : i32, d1_stride = 15 : i32, d2_stride = 0 : i32, ddr_id = 0 : i32, enable_packet = 0 : i32, iteration_current = 0 : i32, iteration_size = 0 : i32, iteration_stride = 0 : i32, lock_acq_enable = 0 : i32, lock_acq_id = 0 : i32, lock_acq_val = 0 : i32, lock_rel_id = 0 : i32, lock_rel_val = 0 : i32, next_bd = 0 : i32, out_of_order_id = 0 : i32, packet_id = 0 : i32, packet_type = 0 : i32, use_next_bd = 0 : i32, valid_bd = 1 : i32}\n", + " aiex.npu.write32 {address = 119316 : ui32, column = 0 : i32, row = 0 : i32, value = 1 : ui32}\n", + " aiex.npu.writebd_shimtile {bd_id = 2 : i32, buffer_length = 64 : i32, buffer_offset = 512 : i32, column = 0 : i32, column_num = 1 : i32, d0_size = 8 : i32, d0_stride = 0 : i32, d1_size = 8 : i32, d1_stride = 15 : i32, d2_stride = 0 : i32, ddr_id = 0 : i32, enable_packet = 0 : i32, iteration_current = 0 : i32, iteration_size = 0 : i32, iteration_stride = 0 : i32, lock_acq_enable = 0 : i32, lock_acq_id = 0 : i32, lock_acq_val = 0 : i32, lock_rel_id = 0 : i32, lock_rel_val = 0 : i32, next_bd = 0 : i32, out_of_order_id = 0 : i32, packet_id = 0 : i32, packet_type = 0 : i32, use_next_bd = 0 : i32, valid_bd = 1 : i32}\n", + " aiex.npu.write32 {address = 119316 : ui32, column = 0 : i32, row = 0 : i32, value = 2 : ui32}\n", + " aiex.npu.writebd_shimtile {bd_id = 3 : i32, buffer_length = 64 : i32, buffer_offset = 544 : i32, column = 0 : i32, column_num = 1 : i32, d0_size = 8 : i32, d0_stride = 0 : i32, d1_size = 8 : i32, d1_stride = 15 : i32, d2_stride = 0 : i32, ddr_id = 0 : i32, enable_packet = 0 
: i32, iteration_current = 0 : i32, iteration_size = 0 : i32, iteration_stride = 0 : i32, lock_acq_enable = 0 : i32, lock_acq_id = 0 : i32, lock_acq_val = 0 : i32, lock_rel_id = 0 : i32, lock_rel_val = 0 : i32, next_bd = 0 : i32, out_of_order_id = 0 : i32, packet_id = 0 : i32, packet_type = 0 : i32, use_next_bd = 0 : i32, valid_bd = 1 : i32}\n", + " aiex.npu.write32 {address = 119316 : ui32, column = 0 : i32, row = 0 : i32, value = 3 : ui32}\n", + " aiex.npu.writebd_shimtile {bd_id = 4 : i32, buffer_length = 64 : i32, buffer_offset = 0 : i32, column = 0 : i32, column_num = 1 : i32, d0_size = 8 : i32, d0_stride = 0 : i32, d1_size = 8 : i32, d1_stride = 15 : i32, d2_stride = 0 : i32, ddr_id = 1 : i32, enable_packet = 0 : i32, iteration_current = 0 : i32, iteration_size = 0 : i32, iteration_stride = 0 : i32, lock_acq_enable = 0 : i32, lock_acq_id = 0 : i32, lock_acq_val = 0 : i32, lock_rel_id = 0 : i32, lock_rel_val = 0 : i32, next_bd = 0 : i32, out_of_order_id = 0 : i32, packet_id = 0 : i32, packet_type = 0 : i32, use_next_bd = 0 : i32, valid_bd = 1 : i32}\n", + " aiex.npu.write32 {address = 119324 : ui32, column = 0 : i32, row = 0 : i32, value = 4 : ui32}\n", + " aiex.npu.writebd_shimtile {bd_id = 5 : i32, buffer_length = 64 : i32, buffer_offset = 32 : i32, column = 0 : i32, column_num = 1 : i32, d0_size = 8 : i32, d0_stride = 0 : i32, d1_size = 8 : i32, d1_stride = 15 : i32, d2_stride = 0 : i32, ddr_id = 1 : i32, enable_packet = 0 : i32, iteration_current = 0 : i32, iteration_size = 0 : i32, iteration_stride = 0 : i32, lock_acq_enable = 0 : i32, lock_acq_id = 0 : i32, lock_acq_val = 0 : i32, lock_rel_id = 0 : i32, lock_rel_val = 0 : i32, next_bd = 0 : i32, out_of_order_id = 0 : i32, packet_id = 0 : i32, packet_type = 0 : i32, use_next_bd = 0 : i32, valid_bd = 1 : i32}\n", + " aiex.npu.write32 {address = 119324 : ui32, column = 0 : i32, row = 0 : i32, value = 5 : ui32}\n", + " aiex.npu.writebd_shimtile {bd_id = 6 : i32, buffer_length = 64 : i32, buffer_offset = 512 
: i32, column = 0 : i32, column_num = 1 : i32, d0_size = 8 : i32, d0_stride = 0 : i32, d1_size = 8 : i32, d1_stride = 15 : i32, d2_stride = 0 : i32, ddr_id = 1 : i32, enable_packet = 0 : i32, iteration_current = 0 : i32, iteration_size = 0 : i32, iteration_stride = 0 : i32, lock_acq_enable = 0 : i32, lock_acq_id = 0 : i32, lock_acq_val = 0 : i32, lock_rel_id = 0 : i32, lock_rel_val = 0 : i32, next_bd = 0 : i32, out_of_order_id = 0 : i32, packet_id = 0 : i32, packet_type = 0 : i32, use_next_bd = 0 : i32, valid_bd = 1 : i32}\n", + " aiex.npu.write32 {address = 119324 : ui32, column = 0 : i32, row = 0 : i32, value = 6 : ui32}\n", + " aiex.npu.writebd_shimtile {bd_id = 7 : i32, buffer_length = 64 : i32, buffer_offset = 544 : i32, column = 0 : i32, column_num = 1 : i32, d0_size = 8 : i32, d0_stride = 0 : i32, d1_size = 8 : i32, d1_stride = 15 : i32, d2_stride = 0 : i32, ddr_id = 1 : i32, enable_packet = 0 : i32, iteration_current = 0 : i32, iteration_size = 0 : i32, iteration_stride = 0 : i32, lock_acq_enable = 0 : i32, lock_acq_id = 0 : i32, lock_acq_val = 0 : i32, lock_rel_id = 0 : i32, lock_rel_val = 0 : i32, next_bd = 0 : i32, out_of_order_id = 0 : i32, packet_id = 0 : i32, packet_type = 0 : i32, use_next_bd = 0 : i32, valid_bd = 1 : i32}\n", + " aiex.npu.write32 {address = 119324 : ui32, column = 0 : i32, row = 0 : i32, value = 7 : ui32}\n", + " aiex.npu.writebd_shimtile {bd_id = 8 : i32, buffer_length = 64 : i32, buffer_offset = 0 : i32, column = 0 : i32, column_num = 1 : i32, d0_size = 8 : i32, d0_stride = 0 : i32, d1_size = 8 : i32, d1_stride = 15 : i32, d2_stride = 0 : i32, ddr_id = 2 : i32, enable_packet = 0 : i32, iteration_current = 0 : i32, iteration_size = 0 : i32, iteration_stride = 0 : i32, lock_acq_enable = 0 : i32, lock_acq_id = 0 : i32, lock_acq_val = 0 : i32, lock_rel_id = 0 : i32, lock_rel_val = 0 : i32, next_bd = 0 : i32, out_of_order_id = 0 : i32, packet_id = 0 : i32, packet_type = 0 : i32, use_next_bd = 0 : i32, valid_bd = 1 : i32}\n", + " 
aiex.npu.write32 {address = 119300 : ui32, column = 0 : i32, row = 0 : i32, value = 2147483656 : ui32}\n", + " aiex.npu.sync {channel = 0 : i32, column = 0 : i32, column_num = 1 : i32, direction = 0 : i32, row = 0 : i32, row_num = 1 : i32}\n", + " aiex.npu.writebd_shimtile {bd_id = 9 : i32, buffer_length = 64 : i32, buffer_offset = 32 : i32, column = 0 : i32, column_num = 1 : i32, d0_size = 8 : i32, d0_stride = 0 : i32, d1_size = 8 : i32, d1_stride = 15 : i32, d2_stride = 0 : i32, ddr_id = 2 : i32, enable_packet = 0 : i32, iteration_current = 0 : i32, iteration_size = 0 : i32, iteration_stride = 0 : i32, lock_acq_enable = 0 : i32, lock_acq_id = 0 : i32, lock_acq_val = 0 : i32, lock_rel_id = 0 : i32, lock_rel_val = 0 : i32, next_bd = 0 : i32, out_of_order_id = 0 : i32, packet_id = 0 : i32, packet_type = 0 : i32, use_next_bd = 0 : i32, valid_bd = 1 : i32}\n", + " aiex.npu.write32 {address = 119300 : ui32, column = 0 : i32, row = 0 : i32, value = 2147483657 : ui32}\n", + " aiex.npu.sync {channel = 0 : i32, column = 0 : i32, column_num = 1 : i32, direction = 0 : i32, row = 0 : i32, row_num = 1 : i32}\n", + " aiex.npu.writebd_shimtile {bd_id = 10 : i32, buffer_length = 64 : i32, buffer_offset = 512 : i32, column = 0 : i32, column_num = 1 : i32, d0_size = 8 : i32, d0_stride = 0 : i32, d1_size = 8 : i32, d1_stride = 15 : i32, d2_stride = 0 : i32, ddr_id = 2 : i32, enable_packet = 0 : i32, iteration_current = 0 : i32, iteration_size = 0 : i32, iteration_stride = 0 : i32, lock_acq_enable = 0 : i32, lock_acq_id = 0 : i32, lock_acq_val = 0 : i32, lock_rel_id = 0 : i32, lock_rel_val = 0 : i32, next_bd = 0 : i32, out_of_order_id = 0 : i32, packet_id = 0 : i32, packet_type = 0 : i32, use_next_bd = 0 : i32, valid_bd = 1 : i32}\n", + " aiex.npu.write32 {address = 119300 : ui32, column = 0 : i32, row = 0 : i32, value = 2147483658 : ui32}\n", + " aiex.npu.sync {channel = 0 : i32, column = 0 : i32, column_num = 1 : i32, direction = 0 : i32, row = 0 : i32, row_num = 1 : i32}\n", + " 
aiex.npu.writebd_shimtile {bd_id = 11 : i32, buffer_length = 64 : i32, buffer_offset = 544 : i32, column = 0 : i32, column_num = 1 : i32, d0_size = 8 : i32, d0_stride = 0 : i32, d1_size = 8 : i32, d1_stride = 15 : i32, d2_stride = 0 : i32, ddr_id = 2 : i32, enable_packet = 0 : i32, iteration_current = 0 : i32, iteration_size = 0 : i32, iteration_stride = 0 : i32, lock_acq_enable = 0 : i32, lock_acq_id = 0 : i32, lock_acq_val = 0 : i32, lock_rel_id = 0 : i32, lock_rel_val = 0 : i32, next_bd = 0 : i32, out_of_order_id = 0 : i32, packet_id = 0 : i32, packet_type = 0 : i32, use_next_bd = 0 : i32, valid_bd = 1 : i32}\n", + " aiex.npu.write32 {address = 119300 : ui32, column = 0 : i32, row = 0 : i32, value = 2147483659 : ui32}\n", + " aiex.npu.sync {channel = 0 : i32, column = 0 : i32, column_num = 1 : i32, direction = 0 : i32, row = 0 : i32, row_num = 1 : i32}\n", " return\n", " }\n", " %memtile_dma_0_1 = aie.memtile_dma(%tile_0_1) {\n", @@ -525,7 +525,7 @@ "metadata": {}, "outputs": [], "source": [ - "# shim DMA as ipu instructions\n", + "# shim DMA as npu instructions\n", "compile_without_vectorization(ctx.module, workdir)\n", "xclbin_path = make_xclbin(ctx.module, workdir)" ] @@ -546,11 +546,11 @@ "outputs": [], "source": [ "# FileLock because this runs in CI where multiple jobs might be attempting to run (and the device isn't multi-tenant yet)\n", - "with FileLock(\"/tmp/ipu.lock\"):\n", + "with FileLock(\"/tmp/npu.lock\"):\n", " # XRT manager\n", " xclbin = XCLBin(xclbin_path, \"MLIR_AIE\")\n", " # configure shim dmas\n", - " xclbin.load_ipu_instructions(ipu_insts)\n", + " xclbin.load_npu_instructions(npu_insts)\n", "\n", " # initialize input operands and zero out output\n", " views = xclbin.mmap_buffers([(M, N), (M, N), (M, N)], np.int32)\n", diff --git a/test/ipu-xrt/e2e/tiled_nonsquare_tile_matrix_mult_vectorized.ipynb b/test/npu-xrt/e2e/tiled_nonsquare_tile_matrix_mult_vectorized.ipynb similarity index 95% rename from 
test/ipu-xrt/e2e/tiled_nonsquare_tile_matrix_mult_vectorized.ipynb rename to test/npu-xrt/e2e/tiled_nonsquare_tile_matrix_mult_vectorized.ipynb index d6b4248284..2b377f3fba 100644 --- a/test/ipu-xrt/e2e/tiled_nonsquare_tile_matrix_mult_vectorized.ipynb +++ b/test/npu-xrt/e2e/tiled_nonsquare_tile_matrix_mult_vectorized.ipynb @@ -442,10 +442,10 @@ " # A tiles are \"fat\" so need to offset by rows (i.e. d1 dim)\n", " 0 + d1_size_A * d1_stride_A,\n", " ]\n", - " ipu_insts = aiex.ipu.get_prolog()\n", + " npu_insts = aiex.npu.get_prolog()\n", " for i, bd_id in enumerate(range(2)):\n", - " ipu_insts.extend(\n", - " aiex.ipu.writebd_shimtile(\n", + " npu_insts.extend(\n", + " aiex.npu.writebd_shimtile(\n", " col,\n", " bd_id,\n", " buffer_length=tile_m_A * tile_n_A,\n", @@ -453,14 +453,14 @@ " ddr_id=ddr_id,\n", " )\n", " )\n", - " ipu_insts.extend(aiex.ipu.write32(MM2S, channel_index, col, bd_id))\n", + " npu_insts.extend(aiex.npu.write32(MM2S, channel_index, col, bd_id))\n", "\n", " # in B\n", " channel_index = 1\n", " ddr_id = 1\n", " for bd_id in range(bd_id + 1, bd_id + 1 + 4, 2):\n", - " ipu_insts.extend(\n", - " aiex.ipu.writebd_shimtile(\n", + " npu_insts.extend(\n", + " aiex.npu.writebd_shimtile(\n", " col,\n", " bd_id,\n", " buffer_length=tile_m_B * tile_n_B,\n", @@ -472,11 +472,11 @@ " d0_stride=d0_stride_B,\n", " )\n", " )\n", - " ipu_insts.extend(aiex.ipu.write32(MM2S, channel_index, col, bd_id))\n", + " npu_insts.extend(aiex.npu.write32(MM2S, channel_index, col, bd_id))\n", " bd_id += 1\n", " # B tiles are \"tall\" so need to offset by cols (i.e. 
d0 dim)\n", - " ipu_insts.extend(\n", - " aiex.ipu.writebd_shimtile(\n", + " npu_insts.extend(\n", + " aiex.npu.writebd_shimtile(\n", " col,\n", " bd_id,\n", " buffer_length=tile_m_B * tile_n_B,\n", @@ -488,7 +488,7 @@ " d0_stride=d0_stride_B,\n", " )\n", " )\n", - " ipu_insts.extend(aiex.ipu.write32(MM2S, channel_index, col, bd_id))\n", + " npu_insts.extend(aiex.npu.write32(MM2S, channel_index, col, bd_id))\n", "\n", " # out C\n", " channel_index = 0\n", @@ -501,8 +501,8 @@ " ]\n", "\n", " for i, bd_id in enumerate(range(bd_id + 1, bd_id + 1 + 4)):\n", - " ipu_insts.extend(\n", - " aiex.ipu.writebd_shimtile(\n", + " npu_insts.extend(\n", + " aiex.npu.writebd_shimtile(\n", " col,\n", " bd_id,\n", " buffer_length=tile_m_C * tile_n_C,\n", @@ -514,9 +514,9 @@ " d0_stride=d0_stride_C,\n", " )\n", " )\n", - " ipu_insts.extend(aiex.ipu.write32(S2MM, channel_index, col, bd_id))\n", - " ipu_insts.extend(\n", - " aiex.ipu.sync(\n", + " npu_insts.extend(aiex.npu.write32(S2MM, channel_index, col, bd_id))\n", + " npu_insts.extend(\n", + " aiex.npu.sync(\n", " channel=0,\n", " column=0,\n", " column_num=1,\n", @@ -526,7 +526,7 @@ " )\n", " )\n", "\n", - " return ipu_insts" + " return npu_insts" ] }, { @@ -559,8 +559,8 @@ }, "outputs": [], "source": [ - "@aie.device(AIEDevice.ipu)\n", - "def ipu():\n", + "@aie.device(AIEDevice.npu)\n", + "def npu():\n", " matmul_i32_i32.emit(decl=True)\n", " tile_0_0 = aie.tile(0, 0)\n", " tile_0_1 = aie.tile(0, 1)\n", @@ -675,10 +675,10 @@ "outputs": [], "source": [ "xclbin_path = make_xclbin(mod_aie, workdir)\n", - "with FileLock(\"/tmp/ipu.lock\"):\n", + "with FileLock(\"/tmp/npu.lock\"):\n", " xclbin = XCLBin(xclbin_path, \"MLIR_AIE\")\n", - " ipu_insts = command_control()\n", - " xclbin.load_ipu_instructions(ipu_insts)\n", + " npu_insts = command_control()\n", + " xclbin.load_npu_instructions(npu_insts)\n", "\n", " wrap_A, wrap_B, wrap_C = map(\n", " np.asarray, xclbin.mmap_buffers([(M, N), (M, N), (M, N)], np.int32)\n", diff --git 
a/test/ipu-xrt/e2e/util.py b/test/npu-xrt/e2e/util.py similarity index 100% rename from test/ipu-xrt/e2e/util.py rename to test/npu-xrt/e2e/util.py diff --git a/test/ipu-xrt/lit.local.cfg b/test/npu-xrt/lit.local.cfg similarity index 91% rename from test/ipu-xrt/lit.local.cfg rename to test/npu-xrt/lit.local.cfg index 04b92ba609..2d7aa71633 100644 --- a/test/ipu-xrt/lit.local.cfg +++ b/test/npu-xrt/lit.local.cfg @@ -7,7 +7,7 @@ config.suffixes = [".lit", ".py"] if "ryzen_ai" not in config.available_features: - config.unsupported = ["ipu-xrt"] + config.unsupported = ["npu-xrt"] else: config.unsupported = [] diff --git a/test/ipu-xrt/makefile-common b/test/npu-xrt/makefile-common similarity index 92% rename from test/ipu-xrt/makefile-common rename to test/npu-xrt/makefile-common index d9a0a69015..51e9a19245 100644 --- a/test/ipu-xrt/makefile-common +++ b/test/npu-xrt/makefile-common @@ -1,4 +1,4 @@ -# Contains common definitions used across the Makefiles of ipu-xrt tests. +# Contains common definitions used across the Makefiles of npu-xrt tests. 
# VITIS related variables VITIS_ROOT ?= $(shell realpath $(dir $(shell which vitis))/../) diff --git a/test/ipu-xrt/matrix_multiplication_using_dma/aie.mlir b/test/npu-xrt/matrix_multiplication_using_dma/aie.mlir similarity index 97% rename from test/ipu-xrt/matrix_multiplication_using_dma/aie.mlir rename to test/npu-xrt/matrix_multiplication_using_dma/aie.mlir index 01594e64cf..541b44ecea 100644 --- a/test/ipu-xrt/matrix_multiplication_using_dma/aie.mlir +++ b/test/npu-xrt/matrix_multiplication_using_dma/aie.mlir @@ -7,7 +7,7 @@ //===----------------------------------------------------------------------===// module { - aie.device(ipu) { + aie.device(npu) { memref.global "public" @inA : memref<64x32xi16> memref.global "public" @inA_cons : memref<64x32xi16> memref.global "public" @inB : memref<32x64xi16> @@ -111,12 +111,12 @@ module { %c64_i64 = arith.constant 64 : i64 %c32_i64 = arith.constant 32 : i64 %c4096_i64 = arith.constant 4096 : i64 - aiex.ipu.dma_memcpy_nd(0, 0, %arg2[%c0_i64, %c0_i64, %c0_i64, %c0_i64] [%c2_i64, %c2_i64, %c64_i64, %c32_i64] [%c4096_i64, %c32_i64, %c64_i64]) {id = 0 : i64, metadata = @outC} : memref<8192xi32> - aiex.ipu.dma_memcpy_nd(0, 0, %arg0[%c0_i64, %c0_i64, %c0_i64, %c0_i64] [%c2_i64, %c4_i64, %c64_i64, %c16_i64] [%c0_i64, %c16_i64, %c64_i64]) {id = 1 : i64, metadata = @inA} : memref<8192xi32> - aiex.ipu.dma_memcpy_nd(0, 0, %arg1[%c0_i64, %c0_i64, %c0_i64, %c0_i64] [%c2_i64, %c4_i64, %c32_i64, %c32_i64] [%c32_i64, %c2048_i64, %c64_i64]) {id = 2 : i64, metadata = @inB} : memref<8192xi32> - aiex.ipu.dma_memcpy_nd(0, 0, %arg0[%c0_i64, %c0_i64, %c0_i64, %c4096_i64] [%c2_i64, %c4_i64, %c64_i64, %c16_i64] [%c0_i64, %c16_i64, %c64_i64]) {id = 3 : i64, metadata = @inA} : memref<8192xi32> - aiex.ipu.dma_memcpy_nd(0, 0, %arg1[%c0_i64, %c0_i64, %c0_i64, %c0_i64] [%c2_i64, %c4_i64, %c32_i64, %c32_i64] [%c32_i64, %c2048_i64, %c64_i64]) {id = 4 : i64, metadata = @inB} : memref<8192xi32> - aiex.ipu.sync {channel = 0 : i32, column = 0 : i32, 
column_num = 1 : i32, direction = 0 : i32, row = 0 : i32, row_num = 1 : i32} + aiex.npu.dma_memcpy_nd(0, 0, %arg2[%c0_i64, %c0_i64, %c0_i64, %c0_i64] [%c2_i64, %c2_i64, %c64_i64, %c32_i64] [%c4096_i64, %c32_i64, %c64_i64]) {id = 0 : i64, metadata = @outC} : memref<8192xi32> + aiex.npu.dma_memcpy_nd(0, 0, %arg0[%c0_i64, %c0_i64, %c0_i64, %c0_i64] [%c2_i64, %c4_i64, %c64_i64, %c16_i64] [%c0_i64, %c16_i64, %c64_i64]) {id = 1 : i64, metadata = @inA} : memref<8192xi32> + aiex.npu.dma_memcpy_nd(0, 0, %arg1[%c0_i64, %c0_i64, %c0_i64, %c0_i64] [%c2_i64, %c4_i64, %c32_i64, %c32_i64] [%c32_i64, %c2048_i64, %c64_i64]) {id = 2 : i64, metadata = @inB} : memref<8192xi32> + aiex.npu.dma_memcpy_nd(0, 0, %arg0[%c0_i64, %c0_i64, %c0_i64, %c4096_i64] [%c2_i64, %c4_i64, %c64_i64, %c16_i64] [%c0_i64, %c16_i64, %c64_i64]) {id = 3 : i64, metadata = @inA} : memref<8192xi32> + aiex.npu.dma_memcpy_nd(0, 0, %arg1[%c0_i64, %c0_i64, %c0_i64, %c0_i64] [%c2_i64, %c4_i64, %c32_i64, %c32_i64] [%c32_i64, %c2048_i64, %c64_i64]) {id = 4 : i64, metadata = @inB} : memref<8192xi32> + aiex.npu.sync {channel = 0 : i32, column = 0 : i32, column_num = 1 : i32, direction = 0 : i32, row = 0 : i32, row_num = 1 : i32} return } diff --git a/test/ipu-xrt/matrix_multiplication_using_dma/mm.cc b/test/npu-xrt/matrix_multiplication_using_dma/mm.cc similarity index 100% rename from test/ipu-xrt/matrix_multiplication_using_dma/mm.cc rename to test/npu-xrt/matrix_multiplication_using_dma/mm.cc diff --git a/test/ipu-xrt/matrix_multiplication_using_dma/run-a2x.lit b/test/npu-xrt/matrix_multiplication_using_dma/run-a2x.lit similarity index 77% rename from test/ipu-xrt/matrix_multiplication_using_dma/run-a2x.lit rename to test/npu-xrt/matrix_multiplication_using_dma/run-a2x.lit index 483c7967b7..dd8a83ef1a 100644 --- a/test/ipu-xrt/matrix_multiplication_using_dma/run-a2x.lit +++ b/test/npu-xrt/matrix_multiplication_using_dma/run-a2x.lit @@ -4,8 +4,8 @@ // REQUIRES: ryzen_ai // // RUN: xchesscc_wrapper aie2 -I 
%aietools/include -c %S/mm.cc -o ./mm.o -// RUN: aie2xclbin --use-chess --xclbin-name=aie2.xclbin --ipu-insts-name=insts2.txt --tmpdir=aie2xclbin.prj -v %S/aie.mlir +// RUN: aie2xclbin --use-chess --xclbin-name=aie2.xclbin --npu-insts-name=insts2.txt --tmpdir=aie2xclbin.prj -v %S/aie.mlir // RUN: clang %S/test.cpp -o test.exe -std=c++11 -Wall %xrt_flags -lrt -lstdc++ -lboost_program_options -lboost_filesystem -// RUN: %run_on_ipu ./test.exe -x aie2.xclbin -k MLIR_AIE -i insts2.txt | FileCheck %s +// RUN: %run_on_npu ./test.exe -x aie2.xclbin -k MLIR_AIE -i insts2.txt | FileCheck %s // CHECK: PASS! diff --git a/test/ipu-xrt/matrix_multiplication_using_dma/run.lit b/test/npu-xrt/matrix_multiplication_using_dma/run.lit similarity index 78% rename from test/ipu-xrt/matrix_multiplication_using_dma/run.lit rename to test/npu-xrt/matrix_multiplication_using_dma/run.lit index ac347dcce6..850baf0a7d 100644 --- a/test/ipu-xrt/matrix_multiplication_using_dma/run.lit +++ b/test/npu-xrt/matrix_multiplication_using_dma/run.lit @@ -4,7 +4,7 @@ // REQUIRES: ryzen_ai, chess // // RUN: xchesscc_wrapper aie2 -I %aietools/include -c %S/mm.cc -o ./mm.o -// RUN: %python aiecc.py --xbridge --aie-generate-cdo --aie-generate-ipu --no-compile-host --xclbin-name=aie.xclbin --ipu-insts-name=insts.txt %S/aie.mlir +// RUN: %python aiecc.py --xbridge --aie-generate-cdo --aie-generate-npu --no-compile-host --xclbin-name=aie.xclbin --npu-insts-name=insts.txt %S/aie.mlir // RUN: clang %S/test.cpp -o test.exe -std=c++11 -Wall %xrt_flags -lrt -lstdc++ -lboost_program_options -lboost_filesystem -// RUN: %run_on_ipu ./test.exe -x aie.xclbin -k MLIR_AIE -i insts.txt | FileCheck %s +// RUN: %run_on_npu ./test.exe -x aie.xclbin -k MLIR_AIE -i insts.txt | FileCheck %s // CHECK: PASS! 
diff --git a/test/ipu-xrt/matrix_multiplication_using_dma/test.cpp b/test/npu-xrt/matrix_multiplication_using_dma/test.cpp similarity index 100% rename from test/ipu-xrt/matrix_multiplication_using_dma/test.cpp rename to test/npu-xrt/matrix_multiplication_using_dma/test.cpp diff --git a/test/ipu-xrt/matrix_multiplication_using_dma/zero.cc b/test/npu-xrt/matrix_multiplication_using_dma/zero.cc similarity index 100% rename from test/ipu-xrt/matrix_multiplication_using_dma/zero.cc rename to test/npu-xrt/matrix_multiplication_using_dma/zero.cc diff --git a/test/ipu-xrt/two_col/Makefile b/test/npu-xrt/two_col/Makefile similarity index 72% rename from test/ipu-xrt/two_col/Makefile rename to test/npu-xrt/two_col/Makefile index 9fe6d4d097..08c3e61293 100644 --- a/test/ipu-xrt/two_col/Makefile +++ b/test/npu-xrt/two_col/Makefile @@ -5,14 +5,14 @@ VPATH := $(VISION_KERNELS_VPATH_BASE)/threshold all: final.xclbin insts.txt: aie.mlir - aiecc.py -v --aie-only-generate-ipu --ipu-insts-name=$@ $< + aiecc.py -v --aie-only-generate-npu --npu-insts-name=$@ $< threshold.o: threshold.cc xchesscc -d ${CHESSCC2_FLAGS} -DBIT_WIDTH=8 -c $< -o $@ final.xclbin: aie.mlir threshold.o - aiecc.py -v --aie-generate-cdo --aie-generate-ipu --no-compile-host \ - --xclbin-name=$@ --ipu-insts-name=insts.txt $< + aiecc.py -v --aie-generate-cdo --aie-generate-npu --no-compile-host \ + --xclbin-name=$@ --npu-insts-name=insts.txt $< clean: rm -rf *.elf* *.bif aie.mlir.prj *.xclbin sim \ diff --git a/test/ipu-xrt/two_col/aie.mlir b/test/npu-xrt/two_col/aie.mlir similarity index 91% rename from test/ipu-xrt/two_col/aie.mlir rename to test/npu-xrt/two_col/aie.mlir index 10975fd06b..692ef5db0a 100644 --- a/test/ipu-xrt/two_col/aie.mlir +++ b/test/npu-xrt/two_col/aie.mlir @@ -1,5 +1,5 @@ module { - aie.device(ipu) { + aie.device(npu) { %0 = aie.tile(0, 0) %1 = aie.tile(0, 1) %2 = aie.tile(0, 2) @@ -123,17 +123,17 @@ module { %c0 = arith.constant 0 : i64 %c1 = arith.constant 1 : i64 %c2048 = arith.constant 
2048 : i64 - aiex.ipu.rtp_write(0, 2, 0, 50) { buffer_sym_name = "rtp0" } - aiex.ipu.rtp_write(0, 3, 0, 50) { buffer_sym_name = "rtp1" } - aiex.ipu.rtp_write(1, 4, 0, 50) { buffer_sym_name = "rtp2" } - aiex.ipu.rtp_write(1, 5, 0, 50) { buffer_sym_name = "rtp3" } - aiex.ipu.rtp_write(0, 2, 1, 0) { buffer_sym_name = "rtp0" } - aiex.ipu.rtp_write(0, 3, 1, 0) { buffer_sym_name = "rtp1" } - aiex.ipu.rtp_write(1, 4, 1, 0) { buffer_sym_name = "rtp2" } - aiex.ipu.rtp_write(1, 5, 1, 0) { buffer_sym_name = "rtp3" } - aiex.ipu.dma_memcpy_nd (0, 0, %out[%c0,%c0,%c0,%c0][%c1,%c1,%c1,%c2048][%c0,%c0,%c0]) { metadata = @objFifo_out0, id = 1 : i64 } : memref<2048xi32> - aiex.ipu.dma_memcpy_nd (0, 0, %in[%c0,%c0,%c0,%c0][%c1,%c1,%c1,%c2048][%c0,%c0,%c0]) { metadata = @objFifo_in0, id = 0 : i64 } : memref<2048xi32> - aiex.ipu.sync { column = 0 : i32, row = 0 : i32, direction = 0 : i32, channel = 0 : i32, column_num = 1 : i32, row_num = 1 : i32 } + aiex.npu.rtp_write(0, 2, 0, 50) { buffer_sym_name = "rtp0" } + aiex.npu.rtp_write(0, 3, 0, 50) { buffer_sym_name = "rtp1" } + aiex.npu.rtp_write(1, 4, 0, 50) { buffer_sym_name = "rtp2" } + aiex.npu.rtp_write(1, 5, 0, 50) { buffer_sym_name = "rtp3" } + aiex.npu.rtp_write(0, 2, 1, 0) { buffer_sym_name = "rtp0" } + aiex.npu.rtp_write(0, 3, 1, 0) { buffer_sym_name = "rtp1" } + aiex.npu.rtp_write(1, 4, 1, 0) { buffer_sym_name = "rtp2" } + aiex.npu.rtp_write(1, 5, 1, 0) { buffer_sym_name = "rtp3" } + aiex.npu.dma_memcpy_nd (0, 0, %out[%c0,%c0,%c0,%c0][%c1,%c1,%c1,%c2048][%c0,%c0,%c0]) { metadata = @objFifo_out0, id = 1 : i64 } : memref<2048xi32> + aiex.npu.dma_memcpy_nd (0, 0, %in[%c0,%c0,%c0,%c0][%c1,%c1,%c1,%c2048][%c0,%c0,%c0]) { metadata = @objFifo_in0, id = 0 : i64 } : memref<2048xi32> + aiex.npu.sync { column = 0 : i32, row = 0 : i32, direction = 0 : i32, channel = 0 : i32, column_num = 1 : i32, row_num = 1 : i32 } return } } diff --git a/test/ipu-xrt/two_col/run.lit b/test/npu-xrt/two_col/run.lit similarity index 73% rename from 
test/ipu-xrt/two_col/run.lit rename to test/npu-xrt/two_col/run.lit index 01ff6afed4..5b2b54b291 100644 --- a/test/ipu-xrt/two_col/run.lit +++ b/test/npu-xrt/two_col/run.lit @@ -4,7 +4,7 @@ // REQUIRES: ryzen_ai, chess // // RUN: xchesscc_wrapper aie2 -I %aietools/include -DBIT_WIDTH=8 -c %S/threshold.cc -o ./threshold.o -// RUN: %python aiecc.py --xchesscc --xbridge --aie-generate-cdo --aie-generate-ipu --no-compile-host --xclbin-name=aie.xclbin --ipu-insts-name=insts.txt %S/aie.mlir +// RUN: %python aiecc.py --xchesscc --xbridge --aie-generate-cdo --aie-generate-npu --no-compile-host --xclbin-name=aie.xclbin --npu-insts-name=insts.txt %S/aie.mlir // RUN: clang %S/test.cpp -o test.exe -std=c++11 -Wall %xrt_flags -lrt -lstdc++ -lboost_program_options -lboost_filesystem -// RUN: %run_on_ipu ./test.exe -x aie.xclbin -k MLIR_AIE -i insts.txt | FileCheck %s +// RUN: %run_on_npu ./test.exe -x aie.xclbin -k MLIR_AIE -i insts.txt | FileCheck %s // CHECK: PASS! diff --git a/test/ipu-xrt/two_col/run.sh b/test/npu-xrt/two_col/run.sh similarity index 100% rename from test/ipu-xrt/two_col/run.sh rename to test/npu-xrt/two_col/run.sh diff --git a/test/ipu-xrt/two_col/test.cpp b/test/npu-xrt/two_col/test.cpp similarity index 100% rename from test/ipu-xrt/two_col/test.cpp rename to test/npu-xrt/two_col/test.cpp diff --git a/test/ipu-xrt/two_col/threshold.cc b/test/npu-xrt/two_col/threshold.cc similarity index 100% rename from test/ipu-xrt/two_col/threshold.cc rename to test/npu-xrt/two_col/threshold.cc diff --git a/test/ipu-xrt/vector_scalar_using_dma/aie.mlir b/test/npu-xrt/vector_scalar_using_dma/aie.mlir similarity index 95% rename from test/ipu-xrt/vector_scalar_using_dma/aie.mlir rename to test/npu-xrt/vector_scalar_using_dma/aie.mlir index ebdd9aaefb..e2e9643370 100644 --- a/test/ipu-xrt/vector_scalar_using_dma/aie.mlir +++ b/test/npu-xrt/vector_scalar_using_dma/aie.mlir @@ -7,7 +7,7 @@ //===----------------------------------------------------------------------===// module 
{ - aie.device(ipu) { + aie.device(npu) { memref.global "public" @in : memref<1024xi32> memref.global "public" @in_cons : memref<1024xi32> memref.global "public" @out : memref<1024xi32> @@ -66,9 +66,9 @@ module { %c0_i64 = arith.constant 0 : i64 %c1_i64 = arith.constant 1 : i64 %c4096_i64 = arith.constant 4096 : i64 - aiex.ipu.dma_memcpy_nd(0, 0, %arg2[%c0_i64, %c0_i64, %c0_i64, %c0_i64] [%c1_i64, %c1_i64, %c1_i64, %c4096_i64] [%c0_i64, %c0_i64, %c0_i64]) {id = 0 : i64, metadata = @out} : memref<4096xi32> - aiex.ipu.dma_memcpy_nd(0, 0, %arg0[%c0_i64, %c0_i64, %c0_i64, %c0_i64] [%c1_i64, %c1_i64, %c1_i64, %c4096_i64] [%c0_i64, %c0_i64, %c0_i64]) {id = 1 : i64, metadata = @in} : memref<4096xi32> - aiex.ipu.sync {channel = 0 : i32, column = 0 : i32, column_num = 1 : i32, direction = 0 : i32, row = 0 : i32, row_num = 1 : i32} + aiex.npu.dma_memcpy_nd(0, 0, %arg2[%c0_i64, %c0_i64, %c0_i64, %c0_i64] [%c1_i64, %c1_i64, %c1_i64, %c4096_i64] [%c0_i64, %c0_i64, %c0_i64]) {id = 0 : i64, metadata = @out} : memref<4096xi32> + aiex.npu.dma_memcpy_nd(0, 0, %arg0[%c0_i64, %c0_i64, %c0_i64, %c0_i64] [%c1_i64, %c1_i64, %c1_i64, %c4096_i64] [%c0_i64, %c0_i64, %c0_i64]) {id = 1 : i64, metadata = @in} : memref<4096xi32> + aiex.npu.sync {channel = 0 : i32, column = 0 : i32, column_num = 1 : i32, direction = 0 : i32, row = 0 : i32, row_num = 1 : i32} return } diff --git a/test/ipu-xrt/vector_scalar_using_dma/run.lit b/test/npu-xrt/vector_scalar_using_dma/run.lit similarity index 78% rename from test/ipu-xrt/vector_scalar_using_dma/run.lit rename to test/npu-xrt/vector_scalar_using_dma/run.lit index 56b5153e7b..494056eba0 100644 --- a/test/ipu-xrt/vector_scalar_using_dma/run.lit +++ b/test/npu-xrt/vector_scalar_using_dma/run.lit @@ -4,7 +4,7 @@ // REQUIRES: ryzen_ai, chess // // RUN: xchesscc_wrapper aie2 -I %aietools/include -c %S/scale.cc -o ./scale.o -// RUN: %python aiecc.py --xbridge --aie-generate-cdo --aie-generate-ipu --no-compile-host --xclbin-name=aie.xclbin 
--ipu-insts-name=insts.txt %S/aie.mlir +// RUN: %python aiecc.py --xbridge --aie-generate-cdo --aie-generate-npu --no-compile-host --xclbin-name=aie.xclbin --npu-insts-name=insts.txt %S/aie.mlir // RUN: clang %S/test.cpp -o test.exe -std=c++11 -Wall %xrt_flags -lrt -lstdc++ -lboost_program_options -lboost_filesystem -// RUN: %run_on_ipu ./test.exe -x aie.xclbin -k MLIR_AIE -i insts.txt | FileCheck %s +// RUN: %run_on_npu ./test.exe -x aie.xclbin -k MLIR_AIE -i insts.txt | FileCheck %s // CHECK: PASS! diff --git a/test/ipu-xrt/vector_scalar_using_dma/scale.cc b/test/npu-xrt/vector_scalar_using_dma/scale.cc similarity index 100% rename from test/ipu-xrt/vector_scalar_using_dma/scale.cc rename to test/npu-xrt/vector_scalar_using_dma/scale.cc diff --git a/test/ipu-xrt/vector_scalar_using_dma/test.cpp b/test/npu-xrt/vector_scalar_using_dma/test.cpp similarity index 100% rename from test/ipu-xrt/vector_scalar_using_dma/test.cpp rename to test/npu-xrt/vector_scalar_using_dma/test.cpp diff --git a/test/objectFifo-stateful-transform/nested_loop_test.mlir b/test/objectFifo-stateful-transform/nested_loop_test.mlir index 12d35fce7e..c2ba81e1cb 100644 --- a/test/objectFifo-stateful-transform/nested_loop_test.mlir +++ b/test/objectFifo-stateful-transform/nested_loop_test.mlir @@ -9,7 +9,7 @@ // RUN: aie-opt --aie-objectFifo-stateful-transform %s | FileCheck %s -// CHECK-LABEL: aie.device(ipu) +// CHECK-LABEL: aie.device(npu) // CHECK: scf.for // CHECK: { // CHECK: aie.use_lock @@ -74,7 +74,7 @@ // CHECK: aie.use_lock // CHECK: } -aie.device(ipu) { +aie.device(npu) { %tile_0_1 = aie.tile(0, 1) %tile_1_2 = aie.tile(1, 2) %tile_0_2 = aie.tile(0, 2) diff --git a/test/python/ipu.py b/test/python/npu.py similarity index 95% rename from test/python/ipu.py rename to test/python/npu.py index e2ad6959e8..79b8c64bb6 100644 --- a/test/python/ipu.py +++ b/test/python/npu.py @@ -23,7 +23,7 @@ object_fifo_link, tile, ) -from aie.dialects.aiex import ipu_sync, ipu_dma_memcpy_nd +from 
aie.dialects.aiex import npu_sync, npu_dma_memcpy_nd from aie.dialects.func import FuncOp from aie.dialects.scf import for_ from aie.dialects.scf import yield_ @@ -49,7 +49,7 @@ def my_vector_scalar(module): buffer_depth = 2 - @device(AIEDevice.ipu) + @device(AIEDevice.npu) def device_body(): scale_int32 = external_func( "scale_int32", inputs=[T.memref(n, T.i32()), T.memref(n, T.i32())] @@ -79,9 +79,9 @@ def core_body(): T.memref(N, T.i32()), T.memref(N, T.i32()), T.memref(N, T.i32()) ) def sequence(A, B, C): - ipu_dma_memcpy_nd(metadata="out", bd_id=0, mem=C, sizes=[1, 1, 1, N]) - ipu_dma_memcpy_nd(metadata="in", bd_id=1, mem=A, sizes=[1, 1, 1, N]) - ipu_sync(column=0, row=0, direction=0, channel=0) + npu_dma_memcpy_nd(metadata="out", bd_id=0, mem=C, sizes=[1, 1, 1, N]) + npu_dma_memcpy_nd(metadata="in", bd_id=1, mem=A, sizes=[1, 1, 1, N]) + npu_sync(column=0, row=0, direction=0, channel=0) assert module.operation.verify() @@ -124,7 +124,7 @@ def my_matmul(module): vectorized = True - @device(AIEDevice.ipu) + @device(AIEDevice.npu) def device_body(): zero_scalar = external_func("zero_scalar_i16", inputs=[T.memref(m, n, T.i16())]) zero = external_func("zero_i16", inputs=[T.memref(m, n, T.i16())]) @@ -194,7 +194,7 @@ def sequence(A, B, C): num_tile_rows = min( [rows_per_block, M_div_m - tile_row_block * rows_per_block] ) - ipu_dma_memcpy_nd( + npu_dma_memcpy_nd( metadata="outC", bd_id=0, mem=C, @@ -210,7 +210,7 @@ def sequence(A, B, C): * word_size_in // 4 ) - ipu_dma_memcpy_nd( + npu_dma_memcpy_nd( metadata="inA", bd_id=2 * tile_row + 1, mem=A, @@ -218,7 +218,7 @@ def sequence(A, B, C): sizes=[N_div_n, K_div_k, m, k_in_i32s], strides=[0, k_in_i32s, K_in_i32s], ) - ipu_dma_memcpy_nd( + npu_dma_memcpy_nd( metadata="inB", bd_id=2 * tile_row + 2, mem=B, @@ -226,7 +226,7 @@ def sequence(A, B, C): strides=[n_in_i32s, k_x_N_in_i32s, N_in_i32s], ) - ipu_sync(column=0, row=0, direction=0, channel=0) + npu_sync(column=0, row=0, direction=0, channel=0) assert 
module.operation.verify() @@ -234,7 +234,7 @@ def sequence(A, B, C): # CHECK-LABEL: edge_detect @construct_and_print_module def edge_detect(module): - @device(AIEDevice.ipu) + @device(AIEDevice.npu) def device_body(): rgba2gray_line = external_func( "rgba2gray_line", @@ -441,21 +441,21 @@ def core_body(): T.memref(2304, T.i32()), T.memref(2304, T.i32()), T.memref(2304, T.i32()) ) def sequence(I, B, O): - ipu_dma_memcpy_nd( + npu_dma_memcpy_nd( metadata="outOF_L2L3", bd_id=0, mem=O, sizes=[1, 1, 36, 64], strides=[0, 0, 64], ) - ipu_dma_memcpy_nd( + npu_dma_memcpy_nd( metadata="inOF_L3L2", bd_id=1, mem=I, sizes=[1, 1, 36, 64], strides=[0, 0, 64], ) - ipu_sync(column=0, row=0, direction=0, channel=0) + npu_sync(column=0, row=0, direction=0, channel=0) assert module.operation.verify() @@ -463,7 +463,7 @@ def sequence(I, B, O): # CHECK-LABEL: my_add_one_objFifo @construct_and_print_module def my_add_one_objFifo(module): - @device(AIEDevice.ipu) + @device(AIEDevice.npu) def device_body(): shim_tile = tile(0, 0) mem_tile = tile(0, 1) @@ -496,12 +496,12 @@ def core_body(): T.memref(64, T.i32()), T.memref(32, T.i32()), T.memref(64, T.i32()) ) def sequence(inTensor, notUsed, outTensor): - ipu_dma_memcpy_nd( + npu_dma_memcpy_nd( metadata="out0", bd_id=0, mem=outTensor, sizes=[1, 1, 1, 64] ) - ipu_dma_memcpy_nd( + npu_dma_memcpy_nd( metadata="in0", bd_id=1, mem=inTensor, sizes=[1, 1, 1, 64] ) - ipu_sync(column=0, row=0, direction=0, channel=0) + npu_sync(column=0, row=0, direction=0, channel=0) assert module.operation.verify() diff --git a/test/python/tile_array.py b/test/python/tile_array.py index 272b48832e..e6735a7565 100644 --- a/test/python/tile_array.py +++ b/test/python/tile_array.py @@ -31,8 +31,8 @@ # CHECK-LABEL: broadcast @construct_and_print_module def broadcast(module): - @aie.device(AIEDevice.ipu) - def ipu(): + @aie.device(AIEDevice.npu) + def npu(): df = TileArray() assert df[[0, 1], 0].shape == (2, 1) assert df[[0, 1], 3:].shape == (2, 3) @@ -125,7 +125,7 @@ 
def ipu(): print(f) # CHECK: module { - # CHECK: aie.device(ipu) { + # CHECK: aie.device(npu) { # CHECK: %tile_0_0 = aie.tile(0, 0) # CHECK: %tile_0_1 = aie.tile(0, 1) # CHECK: %tile_0_2 = aie.tile(0, 2) @@ -194,8 +194,8 @@ def ipu(): # CHECK-LABEL: lshift @construct_and_print_module def lshift(module): - @aie.device(AIEDevice.ipu) - def ipu(): + @aie.device(AIEDevice.npu) + def npu(): tiles = TileArray() fls = tiles[2, 1] << tiles[0, [2, 3]] @@ -214,8 +214,8 @@ def ipu(): # CHECK-LABEL: locks @construct_and_print_module def locks(module): - @aie.device(AIEDevice.ipu) - def ipu(): + @aie.device(AIEDevice.npu) + def npu(): tiles = TileArray() aie.lock(tiles[0, 1].tile) @@ -249,8 +249,8 @@ def ipu(): # CHECK-LABEL: neighbors @construct_and_print_module def neighbors(module): - @aie.device(AIEDevice.ipu) - def ipu(): + @aie.device(AIEDevice.npu) + def npu(): tiles = TileArray() # CHECK: Neighbors(north=%tile_2_3 = aie.tile(2, 3), west=%tile_1_2 = aie.tile(1, 2), south=None) @@ -279,8 +279,8 @@ def channels_basic(module): # CHECK-LABEL: test-basic print("test-basic") - @aie.device(AIEDevice.ipu) - def ipu(): + @aie.device(AIEDevice.npu) + def npu(): tiles = TileArray() b = aie.buffer(tiles[2, 2].tile, (10, 10), T.i32(), name="bob") @@ -295,13 +295,13 @@ def ipu(): # CHECK: %alice = aie.buffer(%tile_2_2) {sym_name = "alice"} : memref<10x10xi32> # CHECK: %alice_producer_lock = aie.lock(%tile_2_2) {sym_name = "alice_producer_lock"} # CHECK: %alice_consumer_lock = aie.lock(%tile_2_2) {sym_name = "alice_consumer_lock"} - print(ipu) + print(npu) # CHECK-LABEL: test-context-manager print("test-context-manager") - @aie.device(AIEDevice.ipu) - def ipu(): + @aie.device(AIEDevice.npu) + def npu(): tiles = TileArray() c = Channel( @@ -334,14 +334,14 @@ def core(): # CHECK: aie.use_lock(%alice_producer_lock, Release) # CHECK: aie.end # CHECK: } - print(ipu) + print(npu) # CHECK-LABEL: nd_channels @construct_and_print_module def nd_channels(module): - @aie.device(AIEDevice.ipu) - 
def ipu(): + @aie.device(AIEDevice.npu) + def npu(): tiles = TileArray() shapes = np.array([(10, 10)], dtype="i,i").astype(object) @@ -377,8 +377,8 @@ def ipu(): def buffer_test_this_needs_to_distinct_from_all_other_mentions_of_buffer_in_this_file( module, ): - @aie.device(AIEDevice.ipu) - def ipu(): + @aie.device(AIEDevice.npu) + def npu(): tiles = TileArray() shapes = [(10, 10)] diff --git a/test/python/trace_utils.py b/test/python/trace_utils.py index 437df0f336..4837ea8748 100644 --- a/test/python/trace_utils.py +++ b/test/python/trace_utils.py @@ -7,14 +7,14 @@ # RUN: %python %s | FileCheck %s --check-prefix TRACE # -# TRACE: aiex.ipu.write32 {address = 213200 : ui32, column = 0 : i32, row = 2 : i32, value = 65536 : ui32} -# TRACE: aiex.ipu.write32 {address = 213204 : ui32, column = 0 : i32, row = 2 : i32, value = 0 : ui32} -# TRACE: aiex.ipu.write32 {address = 213216 : ui32, column = 0 : i32, row = 2 : i32, value = 1260527909 : ui32} -# TRACE: aiex.ipu.write32 {address = 213220 : ui32, column = 0 : i32, row = 2 : i32, value = 757865039 : ui32} -# TRACE: aiex.ipu.write32 {address = 261888 : ui32, column = 0 : i32, row = 2 : i32, value = 289 : ui32} -# TRACE: aiex.ipu.write32 {address = 261892 : ui32, column = 0 : i32, row = 2 : i32, value = 0 : ui32} -# TRACE: aiex.ipu.writebd_shimtile {bd_id = 3 : i32, buffer_length = 8192 : i32, buffer_offset = 1024 : i32, column = 0 : i32, column_num = 1 : i32, d0_size = 0 : i32, d0_stride = 0 : i32, d1_size = 0 : i32, d1_stride = 0 : i32, d2_stride = 0 : i32, ddr_id = 2 : i32, enable_packet = 0 : i32, iteration_current = 0 : i32, iteration_size = 0 : i32, iteration_stride = 0 : i32, lock_acq_enable = 0 : i32, lock_acq_id = 0 : i32, lock_acq_val = 0 : i32, lock_rel_id = 0 : i32, lock_rel_val = 0 : i32, next_bd = 0 : i32, out_of_order_id = 0 : i32, packet_id = 0 : i32, packet_type = 0 : i32, use_next_bd = 0 : i32, valid_bd = 1 : i32} -# TRACE: aiex.ipu.write32 {address = 119308 : ui32, column = 0 : i32, row = 0 : i32, value 
= 3 : ui32} +# TRACE: aiex.npu.write32 {address = 213200 : ui32, column = 0 : i32, row = 2 : i32, value = 65536 : ui32} +# TRACE: aiex.npu.write32 {address = 213204 : ui32, column = 0 : i32, row = 2 : i32, value = 0 : ui32} +# TRACE: aiex.npu.write32 {address = 213216 : ui32, column = 0 : i32, row = 2 : i32, value = 1260527909 : ui32} +# TRACE: aiex.npu.write32 {address = 213220 : ui32, column = 0 : i32, row = 2 : i32, value = 757865039 : ui32} +# TRACE: aiex.npu.write32 {address = 261888 : ui32, column = 0 : i32, row = 2 : i32, value = 289 : ui32} +# TRACE: aiex.npu.write32 {address = 261892 : ui32, column = 0 : i32, row = 2 : i32, value = 0 : ui32} +# TRACE: aiex.npu.writebd_shimtile {bd_id = 3 : i32, buffer_length = 8192 : i32, buffer_offset = 1024 : i32, column = 0 : i32, column_num = 1 : i32, d0_size = 0 : i32, d0_stride = 0 : i32, d1_size = 0 : i32, d1_stride = 0 : i32, d2_stride = 0 : i32, ddr_id = 2 : i32, enable_packet = 0 : i32, iteration_current = 0 : i32, iteration_size = 0 : i32, iteration_stride = 0 : i32, lock_acq_enable = 0 : i32, lock_acq_id = 0 : i32, lock_acq_val = 0 : i32, lock_rel_id = 0 : i32, lock_rel_val = 0 : i32, next_bd = 0 : i32, out_of_order_id = 0 : i32, packet_id = 0 : i32, packet_type = 0 : i32, use_next_bd = 0 : i32, valid_bd = 1 : i32} +# TRACE: aiex.npu.write32 {address = 119308 : ui32, column = 0 : i32, row = 0 : i32, value = 3 : ui32} import sys @@ -40,7 +40,7 @@ def passthroughKernel(): with mlir_mod_ctx() as ctx: - @device(AIEDevice.ipu) + @device(AIEDevice.npu) def device_body(): # define types memRef_ty = T.memref(lineWidthInBytes, T.ui8()) @@ -96,19 +96,19 @@ def sequence(inTensor, outTensor, notUsed): events=[0x4B, 0x22, 0x21, 0x25, 0x2D, 0x2C, 0x1A, 0x4F], ) - ipu_dma_memcpy_nd( + npu_dma_memcpy_nd( metadata="in", bd_id=0, mem=inTensor, sizes=[1, 1, 1, tensorSizeInInt32s], ) - ipu_dma_memcpy_nd( + npu_dma_memcpy_nd( metadata="out", bd_id=1, mem=outTensor, sizes=[1, 1, 1, tensorSizeInInt32s], ) - ipu_sync(column=0, row=0, 
direction=0, channel=0) + npu_sync(column=0, row=0, direction=0, channel=0) print(ctx.module) diff --git a/tools/aie2xclbin/XCLBinGen.cpp b/tools/aie2xclbin/XCLBinGen.cpp index 86c2966f6a..d8de0e06d6 100644 --- a/tools/aie2xclbin/XCLBinGen.cpp +++ b/tools/aie2xclbin/XCLBinGen.cpp @@ -815,7 +815,7 @@ static LogicalResult generateUnifiedObject(MLIRContext *context, } LogicalResult xilinx::aie2xclbin(MLIRContext *ctx, ModuleOp moduleOp, - XCLBinGenConfig &TK, StringRef OutputIPU, + XCLBinGenConfig &TK, StringRef OutputNPU, StringRef OutputXCLBin) { PassManager pm(ctx, moduleOp.getOperationName()); applyConfigToPassManager(TK, pm); @@ -842,25 +842,25 @@ LogicalResult xilinx::aie2xclbin(MLIRContext *ctx, ModuleOp moduleOp, return moduleOp.emitOpError() << "Unexpected target architecture: " << TK.TargetArch; - // generateIPUInstructions + // generateNPUInstructions { PassManager pm(ctx, moduleOp.getOperationName()); applyConfigToPassManager(TK, pm); - pm.addNestedPass(AIEX::createAIEDmaToIpuPass()); + pm.addNestedPass(AIEX::createAIEDmaToNpuPass()); ModuleOp copy = moduleOp.clone(); if (failed(pm.run(copy))) - return moduleOp.emitOpError("IPU Instruction pipeline failed"); + return moduleOp.emitOpError("NPU Instruction pipeline failed"); std::string errorMessage; - auto output = openOutputFile(OutputIPU, &errorMessage); + auto output = openOutputFile(OutputNPU, &errorMessage); if (!output) { llvm::errs() << errorMessage << "\n"; return moduleOp.emitOpError(""); } - if (failed(AIE::AIETranslateToIPU(copy, output->os()))) - return moduleOp.emitOpError("IPU Instruction translation failed"); + if (failed(AIE::AIETranslateToNPU(copy, output->os()))) + return moduleOp.emitOpError("NPU Instruction translation failed"); output->keep(); copy->erase(); diff --git a/tools/aie2xclbin/XCLBinGen.h b/tools/aie2xclbin/XCLBinGen.h index 25fcb07396..809daa101c 100644 --- a/tools/aie2xclbin/XCLBinGen.h +++ b/tools/aie2xclbin/XCLBinGen.h @@ -40,7 +40,7 @@ struct XCLBinGenConfig { void 
findVitis(XCLBinGenConfig &TK); mlir::LogicalResult aie2xclbin(mlir::MLIRContext *ctx, mlir::ModuleOp moduleOp, - XCLBinGenConfig &TK, llvm::StringRef OutputIPU, + XCLBinGenConfig &TK, llvm::StringRef OutputNPU, llvm::StringRef OutputXCLBin); } // namespace xilinx diff --git a/tools/aie2xclbin/aie2xclbin.cpp b/tools/aie2xclbin/aie2xclbin.cpp index bbd34c5528..7bfe6a2982 100644 --- a/tools/aie2xclbin/aie2xclbin.cpp +++ b/tools/aie2xclbin/aie2xclbin.cpp @@ -70,9 +70,9 @@ cl::opt cl::init(HOST_ARCHITECTURE), cl::cat(AIE2XCLBinCat)); cl::opt - IPUInstsName("ipu-insts-name", - cl::desc("Output instructions filename for IPU target"), - cl::init("ipu_insts.txt"), cl::cat(AIE2XCLBinCat)); + NPUInstsName("npu-insts-name", + cl::desc("Output instructions filename for NPU target"), + cl::init("npu_insts.txt"), cl::cat(AIE2XCLBinCat)); cl::opt PrintIRAfterAll("print-ir-after-all", @@ -207,7 +207,7 @@ int main(int argc, char *argv[]) { if (!owning) return 1; - if (failed(aie2xclbin(&ctx, *owning, TK, IPUInstsName.getValue(), + if (failed(aie2xclbin(&ctx, *owning, TK, NPUInstsName.getValue(), XCLBinName.getValue()))) return 1; diff --git a/utils/reset_ipu.sh b/utils/reset_npu.sh similarity index 93% rename from utils/reset_ipu.sh rename to utils/reset_npu.sh index 6a4f02e647..9a07adb48b 100755 --- a/utils/reset_ipu.sh +++ b/utils/reset_npu.sh @@ -12,6 +12,6 @@ if [ x"$NUMBER" != x"" ]; then # /opt/xilinx/xrt/test/example_noop_test /lib/firmware/amdipu/1502/validate.xclbin # fi else - echo "couldn't find ipu" + echo "couldn't find npu" fi diff --git a/utils/run_on_ipu.sh b/utils/run_on_npu.sh similarity index 100% rename from utils/run_on_ipu.sh rename to utils/run_on_npu.sh