From 138aae6cb1962b42ef5d76785f1c8b08014734ac Mon Sep 17 00:00:00 2001 From: Jeff Fifield Date: Thu, 18 Apr 2024 16:58:26 -0600 Subject: [PATCH] more rename ipu->npu --- include/aie/Dialect/AIE/IR/AIEAttrs.td | 2 +- include/aie/Dialect/AIEX/IR/AIEX.td | 22 +-- .../aie/Dialect/AIEX/Transforms/AIEXPasses.td | 2 +- lib/CAPI/Translation.cpp | 10 +- lib/Dialect/AIE/IR/AIEDialect.cpp | 2 +- lib/Targets/AIETargetCDODirect.cpp | 6 +- lib/Targets/AIETargets.cpp | 2 +- python/AIEMLIRModule.cpp | 6 +- python/XRTModule.cpp | 24 ++-- python/_mlir_libs/_aie.pyi | 4 +- python/_mlir_libs/_xrt.pyi | 4 +- python/compiler/aiecc/cl_arguments.py | 16 +-- python/compiler/aiecc/main.py | 12 +- python/dialects/aie.py | 4 +- python/dialects/aiex.py | 54 ++++---- python/utils/trace.py | 16 +-- test/Conversion/DmaToNpu/aiert_insts.mlir | 16 +-- test/Conversion/DmaToNpu/bad_rtp_write.mlir | 10 +- test/Conversion/DmaToNpu/dma_to_ipu.mlir | 38 ++--- .../DmaToNpu/dma_to_ipu_invalid.mlir | 10 +- .../DmaToNpu/dma_to_ipu_issue_token.mlir | 18 +-- test/Conversion/DmaToNpu/push_to_queue.mlir | 12 +- test/Conversion/DmaToNpu/rtp_write.mlir | 12 +- .../assign-bd-ids/bad_bd_assignments.mlir | 12 +- test/Passes/assign-bd-ids/basic.mlir | 4 +- test/Passes/assign-bd-ids/user_assigned.mlir | 8 +- .../AIETargetHSA/input_with_addresses.mlir | 6 +- test/Targets/NPU/npu_instgen.mlir | 12 +- test/aie2xclbin/simple_xclbin.mlir | 2 +- test/aiecc/simple_xclbin.mlir | 6 +- .../bad_alignment.mlir | 8 +- test/dialect/AIE/bad_cascade.mlir | 6 +- test/dialect/AIE/bad_dma_op.mlir | 2 +- test/dialect/AIE/badshimtiledma.mlir | 2 +- test/dialect/AIE/badtiledma4.mlir | 2 +- test/dialect/AIE/buffer.mlir | 2 +- test/dialect/AIEX/bad_ipu_nd.mlir | 26 ++-- test/dialect/AIEX/bad_ipu_push_queue.mlir | 10 +- test/dialect/AIEX/bad_ipu_write_bd.mlir | 18 +-- test/dialect/AIEX/invalid.mlir | 8 +- test/dialect/AIEX/roundtrip.mlir | 18 +-- .../aie.mlir | 8 +- .../run.lit | 4 +- test/ipu-xrt/add_314_using_dma_op/aie.mlir | 8 +- test/ipu-xrt/add_314_using_dma_op/run.lit | 4 +- test/ipu-xrt/add_one_objFifo/Makefile | 2 +- test/ipu-xrt/add_one_objFifo/aie.mlir | 8 +- test/ipu-xrt/add_one_objFifo/run.lit | 4 +- test/ipu-xrt/add_one_using_dma/aie.mlir | 8 +- test/ipu-xrt/add_one_using_dma/run.lit | 4 +- test/ipu-xrt/cascade_flows/CMakeLists.txt | 2 +- test/ipu-xrt/cascade_flows/Makefile | 2 +- test/ipu-xrt/cascade_flows/aie.mlir | 8 +- test/ipu-xrt/cascade_flows/run.lit | 4 +- test/ipu-xrt/e2e/conftest.py | 2 +- ...dd_256_using_dma_op_no_double_buffering.py | 22 +-- test/ipu-xrt/e2e/test_locks.py | 66 ++++----- test/ipu-xrt/e2e/test_manual_dpu_args.py | 82 +++++------ .../ipu-xrt/e2e/test_nonsquare_matrix_mult.py | 64 ++++----- .../test_nonsquare_matrix_mult_vectorized.py | 64 ++++----- .../ipu-xrt/e2e/test_offsets_sizes_strides.py | 38 ++--- test/ipu-xrt/e2e/test_repeat_count.py | 44 +++--- .../e2e/test_shared_buffers_init_value.py | 22 +-- test/ipu-xrt/e2e/test_square_matrix_mult.py | 64 ++++----- .../e2e/test_square_matrix_mult_vectorized.py | 64 ++++----- test/ipu-xrt/e2e/test_tiled_matrix_add.py | 76 +++++----- ...iled_nonsquare_spatial_tile_matrix_mult.py | 106 +++++++------- .../test_tiled_nonsquare_tile_matrix_mult.py | 92 ++++++------- ...d_nonsquare_tile_matrix_mult_vectorized.py | 130 +++++++++--------- test/ipu-xrt/e2e/test_tiled_vec_add.py | 76 +++++----- .../e2e/test_tiled_vec_add_vectorized.py | 76 +++++----- test/ipu-xrt/e2e/test_vec_dot.py | 76 +++++----- test/ipu-xrt/e2e/tiled_matrix_add.ipynb | 98 ++++++------- ...onsquare_tile_matrix_mult_vectorized.ipynb | 42 +++--- test/ipu-xrt/lit.local.cfg | 2 +- test/ipu-xrt/makefile-common | 2 +- .../matrix_multiplication_using_dma/aie.mlir | 14 +- .../run-a2x.lit | 4 +- .../matrix_multiplication_using_dma/run.lit | 4 +- test/ipu-xrt/two_col/Makefile | 6 +- test/ipu-xrt/two_col/aie.mlir | 24 ++-- test/ipu-xrt/two_col/run.lit | 4 +- test/ipu-xrt/vector_scalar_using_dma/aie.mlir | 8 +- test/ipu-xrt/vector_scalar_using_dma/run.lit | 4 +- test/lit.cfg.py | 8 +- .../aiex_standard_lowering.mlir | 10 +- .../nested_loop_test.mlir | 4 +- test/python/ipu.py | 36 ++--- test/python/tile_array.py | 38 ++--- tools/aie2xclbin/aie2xclbin.cpp | 4 +- utils/{run_on_ipu.sh => run_on_npu.sh} | 0 91 files changed, 998 insertions(+), 998 deletions(-) rename utils/{run_on_ipu.sh => run_on_npu.sh} (100%) diff --git a/include/aie/Dialect/AIE/IR/AIEAttrs.td b/include/aie/Dialect/AIE/IR/AIEAttrs.td index 4d2f17371f..167a1bf4d9 100644 --- a/include/aie/Dialect/AIE/IR/AIEAttrs.td +++ b/include/aie/Dialect/AIE/IR/AIEAttrs.td @@ -100,7 +100,7 @@ def AIEDevice: I32EnumAttr<"AIEDevice", "AIE Device", I32EnumAttrCase<"xcvc1902", 1>, I32EnumAttrCase<"xcve2302", 2>, I32EnumAttrCase<"xcve2802", 3>, - I32EnumAttrCase<"ipu", 4> + I32EnumAttrCase<"npu", 4> ]> { let cppNamespace = "xilinx::AIE"; diff --git a/include/aie/Dialect/AIEX/IR/AIEX.td b/include/aie/Dialect/AIEX/IR/AIEX.td index b929a936cb..39ce49ada8 100644 --- a/include/aie/Dialect/AIEX/IR/AIEX.td +++ b/include/aie/Dialect/AIEX/IR/AIEX.td @@ -463,7 +463,7 @@ def AIE_SelectOp: AIEX_Op<"select", []>, Results<(outs Index)> { ]; } -def AIE_NpuDmaMemcpyNdOp: AIEX_Op<"ipu.dma_memcpy_nd", [ +def AIE_NpuDmaMemcpyNdOp: AIEX_Op<"npu.dma_memcpy_nd", [ AttrSizedOperandSegments, MyOffsetSizeAndStrideOpInterface ]> { @@ -519,7 +519,7 @@ def AIE_NpuDmaMemcpyNdOp: AIEX_Op<"ipu.dma_memcpy_nd", [ let hasVerifier = 1; } -def AIE_NpuDmaWaitOp: AIEX_Op<"ipu.dma_wait", []> { +def AIE_NpuDmaWaitOp: AIEX_Op<"npu.dma_wait", []> { let summary = "Blocking operation to wait for a DMA to complete execution."; let description = [{ The NpuDmaWaitOp blocks until the DMA referenced through `symbol` completes execution @@ -530,13 +530,13 @@ def AIE_NpuDmaWaitOp: AIEX_Op<"ipu.dma_wait", []> { ... aie.objectfifo @out0(%tile_0_1, {%tile_0_0}, 4 : i32) : !aie.objectfifo> ... - aiex.ipu.dma_memcpy_nd(0, 0, %arg2[1, 1, 0, 0][1, 1, 32, 32][1, 1, 64]) {id = 0 : i64, issue_token = true, metadata = @out0} : memref<32x64xi32> + aiex.npu.dma_memcpy_nd(0, 0, %arg2[1, 1, 0, 0][1, 1, 32, 32][1, 1, 64]) {id = 0 : i64, issue_token = true, metadata = @out0} : memref<32x64xi32> ... - aiex.ipu.dma_wait { symbol = @out0 } + aiex.npu.dma_wait { symbol = @out0 } ``` Here, we have an objectfifo with symbol name `out0`, which is then referenced in the - `ipu.dma_memcpy_nd` operation as the target for the respective DMA operation. Afterwards, - an `ipu.dma_wait` operation references the same symbol to block until the respective DMA + `npu.dma_memcpy_nd` operation as the target for the respective DMA operation. Afterwards, + an `npu.dma_wait` operation references the same symbol to block until the respective DMA has executed all of its tasks. }]; let arguments = ( @@ -549,7 +549,7 @@ def AIE_NpuDmaWaitOp: AIEX_Op<"ipu.dma_wait", []> { } // Write RTP -def AIE_NpuWriteRTPOp: AIEX_Op<"ipu.rtp_write", []> { +def AIE_NpuWriteRTPOp: AIEX_Op<"npu.rtp_write", []> { let summary = "rtp write operator"; let arguments = ( ins StrAttr:$buffer_sym_name, @@ -567,7 +567,7 @@ def AIE_NpuWriteRTPOp: AIEX_Op<"ipu.rtp_write", []> { } // Push BD to Queue -def AIE_NpuShimTilePushQueueOp: AIEX_Op<"ipu.shimtile_push_queue", []> { +def AIE_NpuShimTilePushQueueOp: AIEX_Op<"npu.shimtile_push_queue", []> { let summary = "bd queue push operator"; let arguments = ( ins FlatSymbolRefAttr:$metadata, @@ -586,7 +586,7 @@ def AIE_NpuShimTilePushQueueOp: AIEX_Op<"ipu.shimtile_push_queue", []> { } // WRITE32 -def AIE_NpuWrite32Op: AIEX_Op<"ipu.write32", []> { +def AIE_NpuWrite32Op: AIEX_Op<"npu.write32", []> { let summary = "write32 operator"; let arguments = ( ins I32Attr:$column, @@ -604,7 +604,7 @@ def AIE_NpuWrite32Op: AIEX_Op<"ipu.write32", []> { } // OP_SYNC -def AIE_NpuSyncOp: AIEX_Op<"ipu.sync", []> { +def AIE_NpuSyncOp: AIEX_Op<"npu.sync", []> { let summary = "sync operator"; let arguments = ( ins I32Attr:$column, @@ -624,7 +624,7 @@ def AIE_NpuSyncOp: AIEX_Op<"ipu.sync", []> { } // WRITEBD_EXTEND_SHIMTILE -def AIE_NpuWriteBdExShimTileOp: AIEX_Op<"ipu.writebd_shimtile", []> { +def AIE_NpuWriteBdExShimTileOp: AIEX_Op<"npu.writebd_shimtile", []> { let summary = "dma operator"; let arguments = ( ins I32Attr:$column, diff --git a/include/aie/Dialect/AIEX/Transforms/AIEXPasses.td b/include/aie/Dialect/AIEX/Transforms/AIEXPasses.td index 1c8a93930e..3c4b34a877 100644 --- a/include/aie/Dialect/AIEX/Transforms/AIEXPasses.td +++ b/include/aie/Dialect/AIEX/Transforms/AIEXPasses.td @@ -133,7 +133,7 @@ def AIELowerMemcpy : Pass<"aie-lower-memcpy", "AIE::DeviceOp"> { ]; } -def AIEDmaToNpu : Pass<"aie-dma-to-ipu", "AIE::DeviceOp"> { +def AIEDmaToNpu : Pass<"aie-dma-to-npu", "AIE::DeviceOp"> { let summary = ""; let description = [{ diff --git a/lib/CAPI/Translation.cpp b/lib/CAPI/Translation.cpp index 653717dc7b..6488f44357 100644 --- a/lib/CAPI/Translation.cpp +++ b/lib/CAPI/Translation.cpp @@ -77,14 +77,14 @@ aieTranslateToCDODirect(MlirOperation moduleOp, MlirStringRef workDirPath, } MlirStringRef aieTranslateToNPU(MlirOperation moduleOp) { - std::string ipu; - llvm::raw_string_ostream os(ipu); + std::string npu; + llvm::raw_string_ostream os(npu); ModuleOp mod = llvm::cast(unwrap(moduleOp)); if (failed(AIETranslateToNPU(mod, os))) return mlirStringRefCreate(nullptr, 0); - char *cStr = static_cast(malloc(ipu.size())); - ipu.copy(cStr, ipu.size()); - return mlirStringRefCreate(cStr, ipu.size()); + char *cStr = static_cast(malloc(npu.size())); + npu.copy(cStr, npu.size()); + return mlirStringRefCreate(cStr, npu.size()); } MlirStringRef aieTranslateToXAIEV2(MlirOperation moduleOp) { diff --git a/lib/Dialect/AIE/IR/AIEDialect.cpp b/lib/Dialect/AIE/IR/AIEDialect.cpp index d4bff87902..a9e80b44b1 100644 --- a/lib/Dialect/AIE/IR/AIEDialect.cpp +++ b/lib/Dialect/AIE/IR/AIEDialect.cpp @@ -983,7 +983,7 @@ const AIETargetModel &DeviceOp::getTargetModel() { return VE2302model; case AIEDevice::xcve2802: return VE2802model; - case AIEDevice::ipu: + case AIEDevice::npu: return NPUmodel; } return VC1902model; diff --git a/lib/Targets/AIETargetCDODirect.cpp b/lib/Targets/AIETargetCDODirect.cpp index 8e399777c5..5f3b94af5f 100644 --- a/lib/Targets/AIETargetCDODirect.cpp +++ b/lib/Targets/AIETargetCDODirect.cpp @@ -265,7 +265,7 @@ LogicalResult configureBdInBlock(XAie_DevInst &devInst, XAie_DmaDesc &dmaTileBd, // write them out like this so they show up with names in debug prints size_t smid = 0; size_t burstLen = 16; // (10):BLEN=16 (256Byte) (corresponds to - // 0x800000000 from targetipu) + // 0x800000000 from target) size_t qOs = 0; size_t cache = 0; size_t secure = 0; @@ -559,7 +559,7 @@ struct AIEControl { int32_t col = switchboxOp.colIndex(); int32_t row = switchboxOp.rowIndex(); XAie_LocType tileLoc = XAie_TileLoc(col, row); - assert(targetOp.getDevice() == AIEDevice::ipu && + assert(targetOp.getDevice() == AIEDevice::npu && "Only NPU currently supported"); if (row == 0) { // FIXME hack for TCT routing @@ -781,7 +781,7 @@ LogicalResult AIETranslateToCDODirect(ModuleOp m, llvm::StringRef workDirPath, DeviceOp targetOp = *devOps.begin(); // things like XAIE_MEM_TILE_ROW_START and the missing // shim dma on tile (0,0) are hard-coded assumptions about NPU... - assert(targetOp.getDevice() == AIEDevice::ipu && + assert(targetOp.getDevice() == AIEDevice::npu && "Only NPU currently supported"); int maxCol = 0, minCol = 0; for (auto tileOp : targetOp.getOps()) { diff --git a/lib/Targets/AIETargets.cpp b/lib/Targets/AIETargets.cpp index b016f2ea75..0caa039ef0 100644 --- a/lib/Targets/AIETargets.cpp +++ b/lib/Targets/AIETargets.cpp @@ -321,7 +321,7 @@ void registerAIETranslations() { }, registerDialects); TranslateFromMLIRRegistration registrationNPU( - "aie-ipu-instgen", "Generate instructions for NPU", + "aie-npu-instgen", "Generate instructions for NPU", [](ModuleOp module, raw_ostream &output) { return AIETranslateToNPU(module, output); }, diff --git a/python/AIEMLIRModule.cpp b/python/AIEMLIRModule.cpp index 426b673e6b..588f96fae0 100644 --- a/python/AIEMLIRModule.cpp +++ b/python/AIEMLIRModule.cpp @@ -107,11 +107,11 @@ PYBIND11_MODULE(_aie, m) { "enable_cores"_a = true); m.def( - "ipu_instgen", + "npu_instgen", [&stealCStr](MlirOperation op) { - py::str ipuInstructions = stealCStr(aieTranslateToNPU(op)); + py::str npuInstructions = stealCStr(aieTranslateToNPU(op)); auto individualInstructions = - ipuInstructions.attr("split")().cast(); + npuInstructions.attr("split")().cast(); for (size_t i = 0; i < individualInstructions.size(); ++i) individualInstructions[i] = individualInstructions[i].attr("strip")(); return individualInstructions; diff --git a/python/XRTModule.cpp b/python/XRTModule.cpp index aa84135980..fb200f6650 100644 --- a/python/XRTModule.cpp +++ b/python/XRTModule.cpp @@ -25,8 +25,8 @@ namespace py = pybind11; using namespace py::literals; -// group_id 0 is for ipu instructions -// group_id 1 is for number of ipu instructions +// group_id 0 is for npu instructions +// group_id 1 is for number of npu instructions // host side buffers/args follow starting from position 2 // see aiecc.main.emit_design_kernel_json constexpr size_t HOST_BUFFERS_START_IDX = 2; @@ -45,13 +45,13 @@ class PyXCLBin { } void loadNPUInstructions(const std::vector &insts) { - ipuInstructions = + npuInstructions = std::make_unique(*device, insts.size() * sizeof(uint32_t), XCL_BO_FLAGS_CACHEABLE, kernel->group_id(0)); - uint32_t *bufInstr = ipuInstructions->map(); + uint32_t *bufInstr = npuInstructions->map(); for (size_t i = 0; i < insts.size(); ++i) bufInstr[i] = insts.at(i); - ipuInstructions->sync(XCL_BO_SYNC_BO_TO_DEVICE); + npuInstructions->sync(XCL_BO_SYNC_BO_TO_DEVICE); } template @@ -107,8 +107,8 @@ class PyXCLBin { void run() { run_ = std::make_unique(*kernel); - run_->set_arg(0, *ipuInstructions); - run_->set_arg(1, ipuInstructions->size()); + run_->set_arg(0, *npuInstructions); + run_->set_arg(1, npuInstructions->size()); for (size_t i = 0; i < buffers.size(); ++i) run_->set_arg(HOST_BUFFERS_START_IDX + i, *buffers[i]); run_->start(); @@ -116,8 +116,8 @@ class PyXCLBin { void _runOnlyNpuInstructions() { run_ = std::make_unique(*kernel); - run_->set_arg(0, *ipuInstructions); - run_->set_arg(1, ipuInstructions->size()); + run_->set_arg(0, *npuInstructions); + run_->set_arg(1, npuInstructions->size()); run_->start(); } @@ -133,7 +133,7 @@ class PyXCLBin { std::unique_ptr device; std::unique_ptr context; std::unique_ptr kernel; - std::unique_ptr ipuInstructions; + std::unique_ptr npuInstructions; std::vector> buffers; @@ -145,11 +145,11 @@ PYBIND11_MODULE(_xrt, m) { py::class_(m, "XCLBin", py::module_local()) .def(py::init(), "xclbin_path"_a, "kernel_name"_a, "device_index"_a = 0) - .def("load_ipu_instructions", &PyXCLBin::loadNPUInstructions, "insts"_a) + .def("load_npu_instructions", &PyXCLBin::loadNPUInstructions, "insts"_a) .def("sync_buffers_to_device", &PyXCLBin::syncBuffersToDevice) .def("sync_buffers_from_device", &PyXCLBin::syncBuffersFromDevice) .def("run", &PyXCLBin::run) - .def("_run_only_ipu_instructions", &PyXCLBin::_runOnlyNpuInstructions) + .def("_run_only_npu_instructions", &PyXCLBin::_runOnlyNpuInstructions) .def("wait", &PyXCLBin::wait, "timeout"_a = py::none()) .def( "mmap_buffers", diff --git a/python/_mlir_libs/_aie.pyi b/python/_mlir_libs/_aie.pyi index c37cf64e08..ad7497117c 100644 --- a/python/_mlir_libs/_aie.pyi +++ b/python/_mlir_libs/_aie.pyi @@ -11,7 +11,7 @@ __all__ = [ "generate_bcf", "generate_cdo", "generate_xaie", - "ipu_instgen", + "npu_instgen", "register_dialect", "translate_aie_vec_to_cpp", "translate_mlir_to_llvmir", @@ -31,7 +31,7 @@ def generate_cdo( enable_cores: bool = True, ) -> None: ... def generate_xaie(module: Operation) -> str: ... -def ipu_instgen(module: Operation) -> list: ... +def npu_instgen(module: Operation) -> list: ... def register_dialect(registry: DialectRegistry) -> None: ... def translate_aie_vec_to_cpp(module: Operation, aieml: bool = False) -> str: ... def translate_mlir_to_llvmir(module: Operation) -> str: ... diff --git a/python/_mlir_libs/_xrt.pyi b/python/_mlir_libs/_xrt.pyi index b912f76738..d08862a4a5 100644 --- a/python/_mlir_libs/_xrt.pyi +++ b/python/_mlir_libs/_xrt.pyi @@ -8,8 +8,8 @@ class XCLBin: self, xclbin_path: str, kernel_name: str, device_index: int = 0 ) -> None: ... def _get_buffer_host_address(self, arg0: int) -> int: ... - def _run_only_ipu_instructions(self) -> None: ... - def load_ipu_instructions(self, insts: list[int]) -> None: ... + def _run_only_npu_instructions(self) -> None: ... + def load_npu_instructions(self, insts: list[int]) -> None: ... def mmap_buffers( self, shapes: list[list[int]], np_format: typing.Any ) -> list[memoryview]: ... diff --git a/python/compiler/aiecc/cl_arguments.py b/python/compiler/aiecc/cl_arguments.py index 4cc74c4553..a407dda971 100644 --- a/python/compiler/aiecc/cl_arguments.py +++ b/python/compiler/aiecc/cl_arguments.py @@ -195,25 +195,25 @@ def parse_args(args=None): help="Show progress visualization", ) parser.add_argument( - "--aie-generate-ipu", - dest="ipu", + "--aie-generate-npu", + dest="npu", default=False, action="store_const", const=True, - help="Generate ipu instruction stream", + help="Generate npu instruction stream", ) parser.add_argument( - "--aie-only-generate-ipu", - dest="only_ipu", + "--aie-only-generate-npu", + dest="only_npu", default=False, action="store_const", const=True, - help="Generate ipu instruction stream only", + help="Generate npu instruction stream only", ) parser.add_argument( - "--ipu-insts-name", + "--npu-insts-name", dest="insts_name", - default="ipu_insts.txt", + default="npu_insts.txt", help="Output instructions filename for NPU target", ) parser.add_argument( diff --git a/python/compiler/aiecc/main.py b/python/compiler/aiecc/main.py index 51527d2328..35a268d245 100644 --- a/python/compiler/aiecc/main.py +++ b/python/compiler/aiecc/main.py @@ -89,7 +89,7 @@ CREATE_PATH_FINDER_FLOWS = Pipeline().Nested( "aie.device", Pipeline().add_pass("aie-create-pathfinder-flows") ) -DMA_TO_NPU = Pipeline().Nested("aie.device", Pipeline().add_pass("aie-dma-to-ipu")) +DMA_TO_NPU = Pipeline().Nested("aie.device", Pipeline().add_pass("aie-dma-to-npu")) async def read_file_async(file_path: str) -> str: @@ -1014,13 +1014,13 @@ async def run_flow(self): aie_peano_target = aie_target.lower() + "-none-elf" # Optionally generate insts.txt for NPU instruction stream - if opts.ipu or opts.only_ipu: - generated_insts_mlir = self.prepend_tmp("generated_ipu_insts.mlir") + if opts.npu or opts.only_npu: + generated_insts_mlir = self.prepend_tmp("generated_npu_insts.mlir") await self.do_call( progress_bar.task, [ "aie-opt", - "--aie-dma-to-ipu", + "--aie-dma-to-npu", file_with_addresses, "-o", generated_insts_mlir, @@ -1030,13 +1030,13 @@ async def run_flow(self): progress_bar.task, [ "aie-translate", - "--aie-ipu-instgen", + "--aie-npu-instgen", generated_insts_mlir, "-o", opts.insts_name, ], ) - if opts.only_ipu: + if opts.only_npu: return chess_intrinsic_wrapper_ll_path = await self.prepare_for_chesshack( diff --git a/python/dialects/aie.py b/python/dialects/aie.py index ffdb1b46ee..35702449fa 100644 --- a/python/dialects/aie.py +++ b/python/dialects/aie.py @@ -21,7 +21,7 @@ generate_bcf, generate_cdo, generate_xaie, - ipu_instgen, + npu_instgen, register_dialect, translate_aie_vec_to_cpp, translate_mlir_to_llvmir, @@ -617,7 +617,7 @@ def find_neighbors(tile, device=None, logical=True): if device is None: device = find_parent_of_type(lambda op: isinstance(op, DeviceOp)) - assert int(device.device) == int(AIEDevice.ipu), "only ipu supported" + assert int(device.device) == int(AIEDevice.npu), "only npu supported" neighbors = {} col, row = map(int, (tile.col, tile.row)) diff --git a/python/dialects/aiex.py b/python/dialects/aiex.py index 7e8ad999b9..1c8d59ac2d 100644 --- a/python/dialects/aiex.py +++ b/python/dialects/aiex.py @@ -32,7 +32,7 @@ # Comes from _aie register_dialect(get_dialect_registry()) -ipu_sync = partial(ipu_sync, column_num=1, row_num=1) +npu_sync = partial(npu_sync, column_num=1, row_num=1) class NpuDmaMemcpyNd(NpuDmaMemcpyNdOp): @@ -77,7 +77,7 @@ def __init__( ) -ipu_dma_memcpy_nd = NpuDmaMemcpyNd +npu_dma_memcpy_nd = NpuDmaMemcpyNd _PROLOG = [ @@ -120,7 +120,7 @@ def _get_prolog(): # based on https://github.com/Xilinx/mlir-aie/blob/cb232a43383ef3b8efd8b408545c9b74885578ad/lib/Targets/AIETargetNPU.cpp -def _ipu_sync(column, row=0, direction=0, channel=0, column_num=1, row_num=1): +def _npu_sync(column, row=0, direction=0, channel=0, column_num=1, row_num=1): if isinstance(channel, IntegerAttr): channel = int(channel) words = [None] * 2 @@ -137,7 +137,7 @@ def _ipu_sync(column, row=0, direction=0, channel=0, column_num=1, row_num=1): return words -def _ipu_write32(column, row, address, value): +def _npu_write32(column, row, address, value): words = [None] * 3 op_code = 2 words[0] = (op_code & 0xFF) << 24 @@ -149,7 +149,7 @@ def _ipu_write32(column, row, address, value): return words -def _ipu_shimtile_push_queue(channel_dir, channel_index, column, bd_id, repeats=0): +def _npu_shimtile_push_queue(channel_dir, channel_index, column, bd_id, repeats=0): if isinstance(channel_index, IntegerAttr): channel_index = int(channel_index) if channel_dir == DMAChannelDir.MM2S: @@ -165,7 +165,7 @@ def _ipu_shimtile_push_queue(channel_dir, channel_index, column, bd_id, repeats= value |= XAIEMLGBL_NOC_MODULE_DMA_S2MM_0_TASK_QUEUE_ENABLE_TOKEN_ISSUE_MASK row = 0 - return _ipu_write32(column, row, address, value) + return _npu_write32(column, row, address, value) # based on ExecWriteBdExtendShimTileOpt @ dpufw/src/include/RunInstOpt.h:666 @@ -181,14 +181,14 @@ def _exec_write_bd_extend_shim_tile_opt(iptr, tensor_addr): write_addr = SHIM_DMA_BD0_BASE_ADDR + (bd_id * SHIM_BD_OFFSET) row = 0 words = [ - *_ipu_write32(column, row, write_addr, iptr[2]), - *_ipu_write32(column, row, write_addr + 4, word3), - *_ipu_write32(column, row, write_addr + 8, word4), - *_ipu_write32(column, row, write_addr + 12, iptr[5]), - *_ipu_write32(column, row, write_addr + 16, iptr[6]), - *_ipu_write32(column, row, write_addr + 20, iptr[7]), - *_ipu_write32(column, row, write_addr + 24, iptr[8]), - *_ipu_write32(column, row, write_addr + 28, iptr[9]), + *_npu_write32(column, row, write_addr, iptr[2]), + *_npu_write32(column, row, write_addr + 4, word3), + *_npu_write32(column, row, write_addr + 8, word4), + *_npu_write32(column, row, write_addr + 12, iptr[5]), + *_npu_write32(column, row, write_addr + 16, iptr[6]), + *_npu_write32(column, row, write_addr + 20, iptr[7]), + *_npu_write32(column, row, write_addr + 24, iptr[8]), + *_npu_write32(column, row, write_addr + 28, iptr[9]), ] return words @@ -202,14 +202,14 @@ def _update_tensor_addr_shim_tile(column, bd_id, tensor_addr, buffer_offset=0): write_addr = SHIM_DMA_BD0_BASE_ADDR + (bd_id * SHIM_BD_OFFSET) row = 0 words = [ - *_ipu_write32(column, row, write_addr + 4, word3), - *_ipu_write32(column, row, write_addr + 8, word4), + *_npu_write32(column, row, write_addr + 4, word3), + *_npu_write32(column, row, write_addr + 8, word4), ] return words # corresponds to ExecWriteBdExtendShimTileOpt -def _ipu_writebd_shimtile( +def _npu_writebd_shimtile( column, bd_id, buffer_length, @@ -304,26 +304,26 @@ def _ipu_writebd_shimtile( return words -def _ipu_noop(): +def _npu_noop(): words = [None] * 1 op_code = 0 words[0] = (op_code & 0xFF) << 24 return words -def _ipu_core_enable(column, row): +def _npu_core_enable(column, row): # note this clears the reset bit - return _ipu_write32(column, row, XAIEMLGBL_CORE_MODULE_CORE_CONTROL, 1) + return _npu_write32(column, row, XAIEMLGBL_CORE_MODULE_CORE_CONTROL, 1) -class ipu: - noop = _ipu_noop - write32 = _ipu_write32 - shimtile_push_queue = _ipu_shimtile_push_queue - writebd_shimtile = _ipu_writebd_shimtile - sync = _ipu_sync +class npu: + noop = _npu_noop + write32 = _npu_write32 + shimtile_push_queue = _npu_shimtile_push_queue + writebd_shimtile = _npu_writebd_shimtile + sync = _npu_sync get_prolog = _get_prolog - enable_cores = _ipu_core_enable + enable_cores = _npu_core_enable _exec_write_bd_extend_shim_tile_opt = _exec_write_bd_extend_shim_tile_opt _update_tensor_addr_shim_tile = _update_tensor_addr_shim_tile diff --git a/python/utils/trace.py b/python/utils/trace.py index 8c3e97be87..e348d5b993 100644 --- a/python/utils/trace.py +++ b/python/utils/trace.py @@ -95,7 +95,7 @@ def configure_simple_tracing_aie2( # BB <- Event to start trace capture # C <- Trace mode, 00=event=time, 01=event-PC, 10=execution # Configure so that "Event 1" (always true) causes tracing to start - ipu_write32( + npu_write32( column=int(tile.col), row=int(tile.row), address=0x340D0, @@ -104,7 +104,7 @@ def configure_simple_tracing_aie2( # 0x340D4: Trace Control 1 # This is used to control packet routing. For the moment # only deal with the simple case of circuit routing. - ipu_write32( + npu_write32( column=int(tile.col), row=int(tile.row), address=0x340D4, @@ -112,7 +112,7 @@ def configure_simple_tracing_aie2( ) # 0x340E0: Trace Event Group 1 (Which events to trace) # 0xAABBCCDD AA, BB, CC, DD <- four event slots - ipu_write32( + npu_write32( column=int(tile.col), row=int(tile.row), address=0x340E0, @@ -120,7 +120,7 @@ def configure_simple_tracing_aie2( ) # 0x340E4: Trace Event Group 2 (Which events to trace) # 0xAABBCCDD AA, BB, CC, DD <- four event slots - ipu_write32( + npu_write32( column=int(tile.col), row=int(tile.row), address=0x340E4, @@ -134,13 +134,13 @@ def master(port): def slave(port): return port - ipu_write32( + npu_write32( column=int(tile.col), row=int(tile.row), address=0x3FF00, value=pack4bytes(0, 0, slave(1), master(1)), # port 1 is FIFO0? ) - ipu_write32( + npu_write32( column=int(tile.col), row=int(tile.row), address=0x3FF04, @@ -149,7 +149,7 @@ def slave(port): # Configure a buffer descriptor to write tracing information that has been routed into this shim tile # out to host DDR memory - ipu_writebd_shimtile( + npu_writebd_shimtile( bd_id=bd_id, buffer_length=size, buffer_offset=offset, @@ -178,7 +178,7 @@ def slave(port): valid_bd=1, ) # configure S2MM channel - ipu_write32( + npu_write32( column=int(shim.col), row=int(shim.row), address=0x1D204 if channel == 0 else 0x1D20C, diff --git a/test/Conversion/DmaToNpu/aiert_insts.mlir b/test/Conversion/DmaToNpu/aiert_insts.mlir index ce82a1443e..bfcbe334ee 100644 --- a/test/Conversion/DmaToNpu/aiert_insts.mlir +++ b/test/Conversion/DmaToNpu/aiert_insts.mlir @@ -6,14 +6,14 @@ // //===----------------------------------------------------------------------===// -// RUN: aie-opt --aie-dma-to-ipu %s | FileCheck %s -// CHECK: aiex.ipu.writebd_shimtile {bd_id = 1 : i32, buffer_length = 32 : i32, buffer_offset = 0 : i32, column = 0 : i32, column_num = 1 : i32, d0_size = 0 : i32, d0_stride = 0 : i32, d1_size = 0 : i32, d1_stride = 0 : i32, d2_stride = 0 : i32, ddr_id = 2 : i32, enable_packet = 0 : i32, iteration_current = 0 : i32, iteration_size = 0 : i32, iteration_stride = 0 : i32, lock_acq_enable = 0 : i32, lock_acq_id = 0 : i32, lock_acq_val = 0 : i32, lock_rel_id = 0 : i32, lock_rel_val = 0 : i32, next_bd = 0 : i32, out_of_order_id = 0 : i32, packet_id = 0 : i32, packet_type = 0 : i32, use_next_bd = 0 : i32, valid_bd = 1 : i32} -// CHECK: aiex.ipu.write32 {address = 119300 : ui32, column = 0 : i32, row = 0 : i32, value = 2147483649 : ui32} -// CHECK: aiex.ipu.writebd_shimtile {bd_id = 0 : i32, buffer_length = 32 : i32, buffer_offset = 128 : i32, column = 0 : i32, column_num = 1 : i32, d0_size = 8 : i32, d0_stride = 0 : i32, d1_size = 2 : i32, d1_stride = 7 : i32, d2_stride = 15 : i32, ddr_id = 0 : i32, enable_packet = 0 : i32, iteration_current = 0 : i32, iteration_size = 0 : i32, iteration_stride = 0 : i32, lock_acq_enable = 0 : i32, lock_acq_id = 0 : i32, lock_acq_val = 0 : i32, lock_rel_id = 0 : i32, lock_rel_val = 0 : i32, next_bd = 0 : i32, out_of_order_id = 0 : i32, packet_id = 0 : i32, packet_type = 0 : i32, use_next_bd = 0 : i32, valid_bd = 1 : i32} -// CHECK: aiex.ipu.write32 {address = 119316 : ui32, column = 0 : i32, row = 0 : i32, value = 0 : ui32} +// RUN: aie-opt --aie-dma-to-npu %s | FileCheck %s +// CHECK: aiex.npu.writebd_shimtile {bd_id = 1 : i32, buffer_length = 32 : i32, buffer_offset = 0 : i32, column = 0 : i32, column_num = 1 : i32, d0_size = 0 : i32, d0_stride = 0 : i32, d1_size = 0 : i32, d1_stride = 0 : i32, d2_stride = 0 : i32, ddr_id = 2 : i32, enable_packet = 0 : i32, iteration_current = 0 : i32, iteration_size = 0 : i32, iteration_stride = 0 : i32, lock_acq_enable = 0 : i32, lock_acq_id = 0 : i32, lock_acq_val = 0 : i32, lock_rel_id = 0 : i32, lock_rel_val = 0 : i32, next_bd = 0 : i32, out_of_order_id = 0 : i32, packet_id = 0 : i32, packet_type = 0 : i32, use_next_bd = 0 : i32, valid_bd = 1 : i32} +// CHECK: aiex.npu.write32 {address = 119300 : ui32, column = 0 : i32, row = 0 : i32, value = 2147483649 : ui32} +// CHECK: aiex.npu.writebd_shimtile {bd_id = 0 : i32, buffer_length = 32 : i32, buffer_offset = 128 : i32, column = 0 : i32, column_num = 1 : i32, d0_size = 8 : i32, d0_stride = 0 : i32, d1_size = 2 : i32, d1_stride = 7 : i32, d2_stride = 15 : i32, ddr_id = 0 : i32, enable_packet = 0 : i32, iteration_current = 0 : i32, iteration_size = 0 : i32, iteration_stride = 0 : i32, lock_acq_enable = 0 : i32, lock_acq_id = 0 : i32, lock_acq_val = 0 : i32, lock_rel_id = 0 : i32, lock_rel_val = 0 : i32, next_bd = 0 : i32, out_of_order_id = 0 : i32, packet_id = 0 : i32, packet_type = 0 : i32, use_next_bd = 0 : i32, valid_bd = 1 : i32} +// CHECK: aiex.npu.write32 {address = 119316 : ui32, column = 0 : i32, row = 0 : i32, value = 0 : ui32} module { - aie.device(ipu) { + aie.device(npu) { memref.global "public" @of_toMem : memref<32xi32> memref.global "public" @of_fromMem : memref<32xi32> func.func @sequence(%in : memref<4x2x8xi32>, %buf : memref<32xi32>, %out : memref<64xi32>) { @@ -24,8 +24,8 @@ module { %c8 = arith.constant 8 : i64 %c16 = arith.constant 16 : i64 %c32 = arith.constant 32 : i64 - aiex.ipu.dma_memcpy_nd(0, 0, %out[%c0,%c0,%c0,%c0][%c1,%c1,%c1,%c32][%c0,%c0,%c0]) { metadata = @of_toMem, id = 1 : i64 } : memref<64xi32> - aiex.ipu.dma_memcpy_nd(0, 0, %in[%c0,%c2,%c0,%c0][%c1,%c2,%c2,%c8][%c0,%c16,%c8]) { metadata = @of_fromMem, id = 0 : i64 } : memref<4x2x8xi32> + aiex.npu.dma_memcpy_nd(0, 0, %out[%c0,%c0,%c0,%c0][%c1,%c1,%c1,%c32][%c0,%c0,%c0]) { metadata = @of_toMem, id = 1 : i64 } : memref<64xi32> + aiex.npu.dma_memcpy_nd(0, 0, %in[%c0,%c2,%c0,%c0][%c1,%c2,%c2,%c8][%c0,%c16,%c8]) { metadata = @of_fromMem, id = 0 : i64 } : memref<4x2x8xi32> return } aie.shim_dma_allocation @of_fromMem (MM2S, 0, 0) diff --git a/test/Conversion/DmaToNpu/bad_rtp_write.mlir b/test/Conversion/DmaToNpu/bad_rtp_write.mlir index a28466af13..746df29273 100644 --- a/test/Conversion/DmaToNpu/bad_rtp_write.mlir +++ b/test/Conversion/DmaToNpu/bad_rtp_write.mlir @@ -6,13 +6,13 @@ // //===----------------------------------------------------------------------===// -// RUN: aie-opt --aie-dma-to-ipu -verify-diagnostics %s +// RUN: aie-opt --aie-dma-to-npu -verify-diagnostics %s -aie.device(ipu) { +aie.device(npu) { func.func @sequence() { - // expected-error@+2 {{'aiex.ipu.rtp_write' op RTP buffer address cannot be found. Has an RTP buffer been allocated?}} - // expected-error@+1 {{failed to legalize operation 'aiex.ipu.rtp_write' that was explicitly marked illegal}} - aiex.ipu.rtp_write(0, 2, 4, 99) { buffer_sym_name = "RTP" } + // expected-error@+2 {{'aiex.npu.rtp_write' op RTP buffer address cannot be found. Has an RTP buffer been allocated?}} + // expected-error@+1 {{failed to legalize operation 'aiex.npu.rtp_write' that was explicitly marked illegal}} + aiex.npu.rtp_write(0, 2, 4, 99) { buffer_sym_name = "RTP" } return } } diff --git a/test/Conversion/DmaToNpu/dma_to_ipu.mlir b/test/Conversion/DmaToNpu/dma_to_ipu.mlir index d86de2acce..059766fe7c 100644 --- a/test/Conversion/DmaToNpu/dma_to_ipu.mlir +++ b/test/Conversion/DmaToNpu/dma_to_ipu.mlir @@ -1,4 +1,4 @@ -//===- dma_to_ipu.mlir -----------------------------------------*- MLIR -*-===// +//===- dma_to_npu.mlir -----------------------------------------*- MLIR -*-===// // // This file is licensed under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. @@ -8,22 +8,22 @@ // //===----------------------------------------------------------------------===// -// RUN: aie-opt --split-input-file -aie-dma-to-ipu %s | FileCheck %s +// RUN: aie-opt --split-input-file -aie-dma-to-npu %s | FileCheck %s // TODO - more // CHECK-LABEL: dma_memcpy_nd_0 -// CHECK: aiex.ipu.writebd_shimtile +// CHECK: aiex.npu.writebd_shimtile // CHECK-SAME: ddr_id = 0 : i32 // CHECK-SAME: valid_bd = 1 : i32 -// CHECK: aiex.ipu.writebd_shimtile +// CHECK: aiex.npu.writebd_shimtile // CHECK-SAME: ddr_id = 1 : i32 module { - aie.device(ipu) { + aie.device(npu) { memref.global "public" @toMem : memref<16xi32> memref.global "public" @fromMem : memref<16xi32> func.func @dma_memcpy_nd_0(%arg0: memref<16xi32>, %arg1: memref<16xi32>) { - aiex.ipu.dma_memcpy_nd(0, 0, %arg0[0, 0, 0, 0][1, 1, 16, 16][0, 0, 64]) { metadata = @toMem, id = 1 : i64 } : memref<16xi32> - aiex.ipu.dma_memcpy_nd(0, 1, %arg1[0, 0, 0, 16][1, 1, 16, 16][0, 0, 64]) { metadata = @fromMem, id = 0 : i64 } : memref<16xi32> + aiex.npu.dma_memcpy_nd(0, 0, %arg0[0, 0, 0, 0][1, 1, 16, 16][0, 0, 64]) { metadata = @toMem, id = 1 : i64 } : memref<16xi32> + aiex.npu.dma_memcpy_nd(0, 1, %arg1[0, 0, 0, 16][1, 1, 16, 16][0, 0, 64]) { metadata = @fromMem, id = 0 : i64 } : memref<16xi32> return } aie.shim_dma_allocation @fromMem (MM2S, 0, 0) @@ -34,11 +34,11 @@ module { // ----- // CHECK-LABEL: dma_wait_s2mm -// CHECK: aiex.ipu.writebd_shimtile +// CHECK: aiex.npu.writebd_shimtile // CHECK-SAME: ddr_id = 0 : i32 // CHECK-SAME: valid_bd = 1 : i32 -// CHECK: aiex.ipu.write32 -// CHECK: aiex.ipu.sync +// CHECK: aiex.npu.write32 +// CHECK: aiex.npu.sync // CHECK-SAME: channel = 0 : i32 // CHECK-SAME: column = 0 : i32 // CHECK-SAME: column_num = 1 : i32 @@ -46,11 +46,11 @@ module { // CHECK-SAME: row = 0 : i32 // CHECK-SAME: row_num = 1 : i32 module { - aie.device(ipu) { + aie.device(npu) { memref.global "public" @toMem : memref<16xi32> func.func @dma_wait_s2mm(%arg0: memref<16xi32>, %arg1: memref<16xi32>) { - aiex.ipu.dma_memcpy_nd(0, 0, %arg0[0, 0, 0, 0][1, 1, 16, 16][0, 0, 64]) { metadata = @toMem, id = 1 : i64 } : memref<16xi32> - aiex.ipu.dma_wait {symbol = @toMem} + aiex.npu.dma_memcpy_nd(0, 0, %arg0[0, 0, 0, 0][1, 1, 16, 16][0, 0, 64]) { metadata = @toMem, id = 1 : i64 } : memref<16xi32> + aiex.npu.dma_wait {symbol = @toMem} return } aie.shim_dma_allocation @toMem (S2MM, 0, 0) @@ -60,11 +60,11 @@ module { // ----- // CHECK-LABEL: dma_wait_mm2s -// CHECK: aiex.ipu.writebd_shimtile +// CHECK: aiex.npu.writebd_shimtile // CHECK-SAME: ddr_id = 0 : i32 // CHECK-SAME: valid_bd = 1 : i32 -// CHECK: aiex.ipu.write32 -// CHECK: aiex.ipu.sync +// CHECK: aiex.npu.write32 +// CHECK: aiex.npu.sync // CHECK-SAME: channel = 1 : i32 // CHECK-SAME: column = 1 : i32 // CHECK-SAME: column_num = 1 : i32 @@ -72,11 +72,11 @@ module { // CHECK-SAME: row = 0 : i32 // CHECK-SAME: row_num = 1 : i32 module { - aie.device(ipu) { + aie.device(npu) { memref.global "public" @toMem : memref<16xi32> func.func @dma_wait_mm2s(%arg0: memref<16xi32>, %arg1: memref<16xi32>) { - aiex.ipu.dma_memcpy_nd(0, 0, %arg0[0, 0, 0, 0][1, 1, 16, 16][0, 0, 64]) { metadata = @toMem, id = 1 : i64 } : memref<16xi32> - aiex.ipu.dma_wait {symbol = @toMem} + aiex.npu.dma_memcpy_nd(0, 0, %arg0[0, 0, 0, 0][1, 1, 16, 16][0, 0, 64]) { metadata = @toMem, id = 1 : i64 } : memref<16xi32> + aiex.npu.dma_wait {symbol = @toMem} return } aie.shim_dma_allocation @toMem (MM2S, 1, 1) diff --git a/test/Conversion/DmaToNpu/dma_to_ipu_invalid.mlir b/test/Conversion/DmaToNpu/dma_to_ipu_invalid.mlir index 89eff26d44..31ed2ed019 100644 --- a/test/Conversion/DmaToNpu/dma_to_ipu_invalid.mlir +++ b/test/Conversion/DmaToNpu/dma_to_ipu_invalid.mlir @@ -1,4 +1,4 @@ -//===- dma_to_ipu_invalid.mlir ---------------------------------*- MLIR -*-===// +//===- dma_to_npu_invalid.mlir ---------------------------------*- MLIR -*-===// // // This file is licensed under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. @@ -8,15 +8,15 @@ // //===----------------------------------------------------------------------===// -// RUN: aie-opt --split-input-file --aie-dma-to-ipu --verify-diagnostics %s +// RUN: aie-opt --split-input-file --aie-dma-to-npu --verify-diagnostics %s module { - aie.device(ipu) { + aie.device(npu) { memref.global "public" @toMem : memref<16xi32> func.func @sequence() { - // expected-error@+2 {{failed to legalize operation 'aiex.ipu.dma_wait' that was explicitly marked illegal}} + // expected-error@+2 {{failed to legalize operation 'aiex.npu.dma_wait' that was explicitly marked illegal}} // expected-error@+1 {{couldn't find shim_dma_allocation op}} - aiex.ipu.dma_wait {symbol = @toMem} + aiex.npu.dma_wait {symbol = @toMem} return } } diff --git a/test/Conversion/DmaToNpu/dma_to_ipu_issue_token.mlir b/test/Conversion/DmaToNpu/dma_to_ipu_issue_token.mlir index 4eb5b02bdc..d73195973b 100644 --- a/test/Conversion/DmaToNpu/dma_to_ipu_issue_token.mlir +++ b/test/Conversion/DmaToNpu/dma_to_ipu_issue_token.mlir @@ -1,4 +1,4 @@ -//===- dma_to_ipu_issue_token.mlir -----------------------------*- MLIR -*-===// +//===- dma_to_npu_issue_token.mlir -----------------------------*- MLIR -*-===// // // This file is licensed under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. @@ -8,26 +8,26 @@ // //===----------------------------------------------------------------------===// -// RUN: aie-opt -aie-dma-to-ipu %s | FileCheck %s +// RUN: aie-opt -aie-dma-to-npu %s | FileCheck %s // TODO - more // CHECK-LABEL: test1 -// CHECK: aiex.ipu.writebd_shimtile +// CHECK: aiex.npu.writebd_shimtile // CHECK-SAME: ddr_id = 0 : i32 // CHECK-SAME: valid_bd = 1 : i32 -// CHECK: aiex.ipu.write32 +// CHECK: aiex.npu.write32 // CHECK-SAME: value = 2147483649 -// CHECK: aiex.ipu.writebd_shimtile +// CHECK: aiex.npu.writebd_shimtile // CHECK-SAME: ddr_id = 1 : i32 -// CHECK: aiex.ipu.write32 +// CHECK: aiex.npu.write32 // CHECK-SAME: value = 0 module { - aie.device(ipu) { + aie.device(npu) { memref.global "public" @toMem : memref<16xi32> memref.global "public" @fromMem : memref<16xi32> func.func @test1(%arg0: memref<16xi32>, %arg1: memref<16xi32>) { - aiex.ipu.dma_memcpy_nd(0, 0, %arg0[0, 0, 0, 0][1, 1, 16, 16][0, 0, 64]) { metadata = @toMem, id = 1 : i64, issue_token = true } : memref<16xi32> - aiex.ipu.dma_memcpy_nd(0, 1, %arg1[0, 0, 0, 16][1, 1, 16, 16][0, 0, 64]) { metadata = @fromMem, id = 0 : i64 } : memref<16xi32> + aiex.npu.dma_memcpy_nd(0, 0, %arg0[0, 0, 0, 0][1, 1, 16, 16][0, 0, 64]) { metadata = @toMem, id = 1 : i64, issue_token = true } : memref<16xi32> + aiex.npu.dma_memcpy_nd(0, 1, %arg1[0, 0, 0, 16][1, 1, 16, 16][0, 0, 64]) { metadata = @fromMem, id = 0 : i64 } : memref<16xi32> return } aie.shim_dma_allocation @fromMem (MM2S, 0, 0) diff --git a/test/Conversion/DmaToNpu/push_to_queue.mlir b/test/Conversion/DmaToNpu/push_to_queue.mlir index 841d9e7a0f..4c45c90e1d 100644 --- a/test/Conversion/DmaToNpu/push_to_queue.mlir +++ b/test/Conversion/DmaToNpu/push_to_queue.mlir @@ -6,17 +6,17 @@ // //===----------------------------------------------------------------------===// -// RUN: aie-opt --aie-dma-to-ipu %s | FileCheck %s -// CHECK: aiex.ipu.write32 {address = 119308 : ui32, column = 0 : i32, row = 0 : i32, value = 2147483651 : ui32} -// CHECK: aiex.ipu.write32 {address = 119316 : ui32, column = 2 : i32, row = 0 : i32, value = 196610 : ui32} +// RUN: aie-opt --aie-dma-to-npu %s | FileCheck %s +// CHECK: aiex.npu.write32 {address = 119308 : ui32, column = 0 : i32, row = 0 : i32, value = 2147483651 : ui32} +// CHECK: aiex.npu.write32 {address = 119316 : ui32, column = 2 : i32, row = 0 : i32, value = 196610 : ui32} module { - aie.device(ipu) { + aie.device(npu) { memref.global "public" @toMem : memref<32xi32> memref.global "public" @fromMem : memref<32xi32> func.func @sequence() { - aiex.ipu.shimtile_push_queue {metadata = @toMem, issue_token = true, repeat_count = 0 : i32, bd_id = 3 : i32 } - aiex.ipu.shimtile_push_queue {metadata = @fromMem, issue_token = false, repeat_count = 3 : i32, bd_id = 2 : i32 } + aiex.npu.shimtile_push_queue {metadata = @toMem, issue_token = true, repeat_count = 0 : i32, bd_id = 3 : i32 } + aiex.npu.shimtile_push_queue {metadata = @fromMem, issue_token = false, repeat_count = 3 : i32, bd_id = 2 : i32 } return } aie.shim_dma_allocation @fromMem (MM2S, 0, 2) diff --git a/test/Conversion/DmaToNpu/rtp_write.mlir b/test/Conversion/DmaToNpu/rtp_write.mlir index 9aba5ad4e7..26f2876b95 100644 --- a/test/Conversion/DmaToNpu/rtp_write.mlir +++ b/test/Conversion/DmaToNpu/rtp_write.mlir @@ -6,19 +6,19 @@ // //===----------------------------------------------------------------------===// -// RUN: aie-opt --aie-dma-to-ipu %s | FileCheck %s -// CHECK: aiex.ipu.write32 {address = 1536 : ui32, column = 2 : i32, row = 3 : i32, value = 50 : ui32} -// CHECK: aiex.ipu.write32 {address = 3216 : ui32, column = 0 : i32, row = 2 : i32, value = 99 : ui32} +// RUN: aie-opt --aie-dma-to-npu %s | FileCheck %s +// CHECK: aiex.npu.write32 {address = 1536 : ui32, column = 2 : i32, row = 3 : i32, value = 50 : ui32} +// CHECK: aiex.npu.write32 {address = 3216 : ui32, column = 0 : i32, row = 2 : i32, value = 99 : ui32} module { - aie.device(ipu) { + aie.device(npu) { %0 = aie.tile(2, 3) %1 = aie.buffer(%0) {address = 1536 : i32, sym_name = "rtp"} : memref<16xi32> %2 = aie.tile(0, 2) %3 = aie.buffer(%2) {address = 3200 : i32, sym_name = "RTP"} : memref<16xi32> func.func @sequence() { - aiex.ipu.rtp_write(2, 3, 0, 50) { buffer_sym_name = "rtp" } - aiex.ipu.rtp_write(0, 2, 4, 99) { buffer_sym_name = "RTP" } + aiex.npu.rtp_write(2, 3, 0, 50) { buffer_sym_name = "rtp" } + aiex.npu.rtp_write(0, 2, 4, 99) { buffer_sym_name = "RTP" } return } } diff --git a/test/Passes/assign-bd-ids/bad_bd_assignments.mlir b/test/Passes/assign-bd-ids/bad_bd_assignments.mlir index 71ab96951f..9ab8036f48 100644 --- a/test/Passes/assign-bd-ids/bad_bd_assignments.mlir +++ b/test/Passes/assign-bd-ids/bad_bd_assignments.mlir @@ -11,7 +11,7 @@ // RUN: aie-opt --verify-diagnostics --split-input-file %s module { - aie.device(ipu) { + aie.device(npu) { %tile_0_2 = aie.tile(0, 2) %double_buffer = aie.buffer(%tile_0_2) : memref<32xi32> %lock_Y = aie.lock(%tile_0_2) {init = 0 : i32} @@ -30,7 +30,7 @@ module { // ----- module { - aie.device(ipu) { + aie.device(npu) { %tile_0_2 = aie.tile(0, 2) %double_buffer = aie.buffer(%tile_0_2) : memref<32xi32> %lock_X = aie.lock(%tile_0_2) {init = 0 : i32} @@ -49,7 +49,7 @@ module { // ----- module { - aie.device(ipu) { + aie.device(npu) { %tile_0_1 = aie.tile(0, 1) %buffer_0_1 = aie.buffer(%tile_0_1) : memref<32xi32> %memtile_dma_0_1 = aie.memtile_dma(%tile_0_1) { @@ -69,7 +69,7 @@ module { // ----- module { - aie.device(ipu) { + aie.device(npu) { %tile_0_1 = aie.tile(0, 1) %memtile_dma_0_1 = aie.memtile_dma(%tile_0_1) { %lock_0_1 = aie.lock(%tile_0_1) {init = 1 : i32} @@ -90,7 +90,7 @@ module { // ----- module { - aie.device(ipu) { + aie.device(npu) { %tile_0_1 = aie.tile(0, 1) %memtile_dma_0_1 = aie.memtile_dma(%tile_0_1) { %lock_0_1 = aie.lock(%tile_0_1) {init = 1 : i32} @@ -110,7 +110,7 @@ module { // ----- module { - aie.device(ipu) { + aie.device(npu) { %tile_0_1 = aie.tile(0, 1) %memtile_dma_0_1 = aie.memtile_dma(%tile_0_1) { %lock_0_1 = aie.lock(%tile_0_1) {init = 1 : i32} diff --git a/test/Passes/assign-bd-ids/basic.mlir b/test/Passes/assign-bd-ids/basic.mlir index b306c3053e..8862c5c09d 100644 --- a/test/Passes/assign-bd-ids/basic.mlir +++ b/test/Passes/assign-bd-ids/basic.mlir @@ -10,7 +10,7 @@ // RUN: aie-opt --aie-assign-bd-ids --split-input-file %s | FileCheck %s -// CHECK-LABEL: aie.device(ipu) { +// CHECK-LABEL: aie.device(npu) { // CHECK: %[[VAL_0:.*]] = aie.tile(0, 0) // CHECK: %[[VAL_1:.*]] = aie.tile(0, 1) // CHECK: %[[VAL_2:.*]] = aie.tile(0, 2) @@ -30,7 +30,7 @@ // CHECK: aie.dma_bd(%[[VAL_4]] : memref<32xi32>) {bd_id = 25 : i32} module { - aie.device(ipu) { + aie.device(npu) { %tile_0_0 = aie.tile(0, 0) %tile_0_1 = aie.tile(0, 1) %tile_0_2 = aie.tile(0, 2) diff --git a/test/Passes/assign-bd-ids/user_assigned.mlir b/test/Passes/assign-bd-ids/user_assigned.mlir index 777c07f7b8..c41d3aa7d3 100644 --- a/test/Passes/assign-bd-ids/user_assigned.mlir +++ b/test/Passes/assign-bd-ids/user_assigned.mlir @@ -10,7 +10,7 @@ // RUN: aie-opt --aie-assign-bd-ids --split-input-file %s | FileCheck %s -// CHECK-LABEL: aie.device(ipu) { +// CHECK-LABEL: aie.device(npu) { // CHECK: %[[VAL_0:.*]] = aie.tile(0, 0) // CHECK: %[[VAL_1:.*]] = aie.tile(0, 1) // CHECK: %[[VAL_2:.*]] = aie.tile(0, 2) @@ -28,7 +28,7 @@ // CHECK: aie.dma_bd(%[[VAL_4]] : memref<32xi32>) {bd_id = 25 : i32} module { - aie.device(ipu) { + aie.device(npu) { %tile_0_0 = aie.tile(0, 0) %tile_0_1 = aie.tile(0, 1) %tile_0_2 = aie.tile(0, 2) @@ -153,7 +153,7 @@ module @aie_module { // ----- -// CHECK-LABEL: aie.device(ipu) { +// CHECK-LABEL: aie.device(npu) { // CHECK: %[[VAL_0:.*]] = aie.tile(0, 0) // CHECK: %[[VAL_1:.*]] = aie.tile(0, 1) // CHECK: %[[VAL_2:.*]] = aie.tile(0, 2) @@ -171,7 +171,7 @@ module @aie_module { // CHECK: aie.dma_bd(%[[VAL_4]] : memref<32xi32>) {bd_id = 25 : i32} module { - aie.device(ipu) { + aie.device(npu) { %tile_0_0 = aie.tile(0, 0) %tile_0_1 = aie.tile(0, 1) %tile_0_2 = aie.tile(0, 2) diff --git a/test/Targets/AIETargetHSA/input_with_addresses.mlir b/test/Targets/AIETargetHSA/input_with_addresses.mlir index 1cf762054b..1efd284c53 100644 --- a/test/Targets/AIETargetHSA/input_with_addresses.mlir +++ b/test/Targets/AIETargetHSA/input_with_addresses.mlir @@ -46,9 +46,9 @@ module { aie.shim_dma_allocation @out0(S2MM, 0, 6) func.func @sequence(%arg0: memref<64xi32>, %arg1: memref<32xi32>, %arg2: memref<64xi32>) { - aiex.ipu.dma_memcpy_nd(0, 0, %arg2[0, 0, 0, 0][1, 1, 1, 64][0, 0, 0]) {id = 0 : i64, metadata = @out0} : memref<64xi32> - aiex.ipu.dma_memcpy_nd(0, 0, %arg0[0, 0, 0, 0][1, 1, 1, 64][0, 0, 0]) {id = 1 : i64, metadata = @in0} : memref<64xi32> - aiex.ipu.sync {channel = 0 : i32, column = 0 : i32, column_num = 1 : i32, direction = 0 : i32, row = 0 : i32, row_num = 1 : i32} + aiex.npu.dma_memcpy_nd(0, 0, %arg2[0, 0, 0, 0][1, 1, 1, 64][0, 0, 0]) {id = 0 : i64, metadata = @out0} : memref<64xi32> + aiex.npu.dma_memcpy_nd(0, 0, %arg0[0, 0, 0, 0][1, 1, 1, 64][0, 0, 0]) {id = 1 : i64, metadata = @in0} : memref<64xi32> + aiex.npu.sync {channel = 0 : i32, column = 0 : i32, column_num = 1 : i32, direction = 0 : i32, row = 0 : i32, row_num = 1 : i32} return } } diff --git a/test/Targets/NPU/npu_instgen.mlir b/test/Targets/NPU/npu_instgen.mlir index d1c2ef3c6a..4fd9636197 100644 --- a/test/Targets/NPU/npu_instgen.mlir +++ b/test/Targets/NPU/npu_instgen.mlir @@ -1,4 +1,4 @@ -//===- ipu_instgen.mlir ----------------------------------------*- MLIR -*-===// +//===- npu_instgen.mlir ----------------------------------------*- MLIR -*-===// // // This file is licensed under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. @@ -8,9 +8,9 @@ // //===----------------------------------------------------------------------===// -// RUN: aie-translate --aie-ipu-instgen %s | FileCheck %s +// RUN: aie-translate --aie-npu-instgen %s | FileCheck %s module { - aie.device(ipu) { + aie.device(npu) { func.func @test0(%arg0: memref<16xf32>, %arg1: memref<16xf32>) { // look for the prolog. @@ -48,7 +48,7 @@ module { // CHECK: 00000009 // CHECK: 2CD0000C // CHECK: 2E107041 - aiex.ipu.writebd_shimtile { bd_id = 6 : i32, + aiex.npu.writebd_shimtile { bd_id = 6 : i32, buffer_length = 1 : i32, buffer_offset = 2 : i32, enable_packet = 0 : i32, @@ -77,10 +77,10 @@ module { // CHECK: 02030400 // CHECK: ABC00DEF // CHECK: 00000042 - aiex.ipu.write32 { column = 3 : i32, row = 4 : i32, address = 0xabc00def : ui32, value = 0x42 : ui32 } + aiex.npu.write32 { column = 3 : i32, row = 4 : i32, address = 0xabc00def : ui32, value = 0x42 : ui32 } // CHECK: 03030401 // CHECK: 05010200 - aiex.ipu.sync { column = 3 : i32, row = 4 : i32, direction = 1 : i32, channel = 5 : i32, column_num = 1 : i32, row_num = 2 : i32 } + aiex.npu.sync { column = 3 : i32, row = 4 : i32, direction = 1 : i32, channel = 5 : i32, column_num = 1 : i32, row_num = 2 : i32 } return } } diff --git a/test/aie2xclbin/simple_xclbin.mlir b/test/aie2xclbin/simple_xclbin.mlir index 09e9dcaa6b..55c6aa8ec9 100644 --- a/test/aie2xclbin/simple_xclbin.mlir +++ b/test/aie2xclbin/simple_xclbin.mlir @@ -19,7 +19,7 @@ // PEANO-NOT: xchesscc_wrapper module { - aie.device(ipu) { + aie.device(npu) { %12 = aie.tile(1, 2) %buf = aie.buffer(%12) : memref<256xi32> %4 = aie.core(%12) { diff --git a/test/aiecc/simple_xclbin.mlir b/test/aiecc/simple_xclbin.mlir index 880225b0da..bec65be208 100644 --- a/test/aiecc/simple_xclbin.mlir +++ b/test/aiecc/simple_xclbin.mlir @@ -11,8 +11,8 @@ // REQUIRES: chess // REQUIRES: peano -// RUN: %PYTHON aiecc.py --xchesscc --no-link -nv --aie-generate-cdo --aie-generate-ipu --no-compile-host --xclbin-name=aie.xclbin --ipu-insts-name=insts.txt %s | FileCheck %s --check-prefix=XCHESSCC -// RUN: %PYTHON aiecc.py --no-xchesscc --no-link -nv --aie-generate-cdo --aie-generate-ipu --no-compile-host --xclbin-name=aie.xclbin --ipu-insts-name=insts.txt %s | FileCheck %s --check-prefix=PEANO +// RUN: %PYTHON aiecc.py --xchesscc --no-link -nv --aie-generate-cdo --aie-generate-npu --no-compile-host --xclbin-name=aie.xclbin --npu-insts-name=insts.txt %s | FileCheck %s --check-prefix=XCHESSCC +// RUN: %PYTHON aiecc.py --no-xchesscc --no-link -nv --aie-generate-cdo --aie-generate-npu --no-compile-host --xclbin-name=aie.xclbin --npu-insts-name=insts.txt %s | FileCheck %s --check-prefix=PEANO // Note that llc determines the architecture from the llvm IR. // XCHESSCC-NOT: {{^[^ ]*llc}} @@ -27,7 +27,7 @@ // PEANO: xclbinutil module { - aie.device(ipu) { + aie.device(npu) { %12 = aie.tile(1, 2) %buf = aie.buffer(%12) : memref<256xi32> %4 = aie.core(%12) { diff --git a/test/assign-buffer-addresses/bad_alignment.mlir b/test/assign-buffer-addresses/bad_alignment.mlir index b9c2b83d7e..4b5ca8ffce 100644 --- a/test/assign-buffer-addresses/bad_alignment.mlir +++ b/test/assign-buffer-addresses/bad_alignment.mlir @@ -11,7 +11,7 @@ // RUN: aie-opt --verify-diagnostics --split-input-file %s module { - aie.device(ipu) { + aie.device(npu) { %tile_0_1 = aie.tile(0, 1) %memtile_dma_0_1 = aie.memtile_dma(%tile_0_1) { %lock_0_1 = aie.lock(%tile_0_1) {init = 1 : i32} @@ -31,7 +31,7 @@ module { // ----- module { - aie.device(ipu) { + aie.device(npu) { %tile_0_1 = aie.tile(0, 1) %memtile_dma_0_1 = aie.memtile_dma(%tile_0_1) { %lock_0_1 = aie.lock(%tile_0_1) {init = 1 : i32} @@ -56,7 +56,7 @@ module { // prevent false-positives/false-negatives (I think). module { - aie.device(ipu) { + aie.device(npu) { %tile_0_1 = aie.tile(0, 1) %memtile_dma_0_1 = aie.memtile_dma(%tile_0_1) { %lock_0_1 = aie.lock(%tile_0_1) {init = 1 : i32} @@ -77,7 +77,7 @@ module { // ----- module { - aie.device(ipu) { + aie.device(npu) { %tile_0_1 = aie.tile(0, 1) %memtile_dma_0_1 = aie.memtile_dma(%tile_0_1) { %lock_0_1 = aie.lock(%tile_0_1) {init = 1 : i32} diff --git a/test/dialect/AIE/bad_cascade.mlir b/test/dialect/AIE/bad_cascade.mlir index 21adc09a48..c204d79af1 100644 --- a/test/dialect/AIE/bad_cascade.mlir +++ b/test/dialect/AIE/bad_cascade.mlir @@ -31,7 +31,7 @@ aie.device(xcve2802) { // CHECK: error{{.*}}'aie.cascade_flow' op shimTile row has no cascade stream interface -aie.device(ipu) { +aie.device(npu) { %t10 = aie.tile(1, 0) %t20 = aie.tile(2, 0) aie.cascade_flow(%t10, %t20) @@ -41,7 +41,7 @@ aie.device(ipu) { // CHECK: error{{.*}}'aie.cascade_flow' op memTile row has no cascade stream interface -aie.device(ipu) { +aie.device(npu) { %t11 = aie.tile(1, 1) %t21 = aie.tile(2, 1) aie.cascade_flow(%t11, %t21) @@ -87,7 +87,7 @@ aie.device(xcve2802) { // CHECK: error{{.*}}'aie.configure_cascade' op memTile row has no cascade stream interface -aie.device(ipu) { +aie.device(npu) { %t11 = aie.tile(1, 1) aie.configure_cascade(%t11, North, West) } diff --git a/test/dialect/AIE/bad_dma_op.mlir b/test/dialect/AIE/bad_dma_op.mlir index c8338ae838..9ba149c65e 100644 --- a/test/dialect/AIE/bad_dma_op.mlir +++ b/test/dialect/AIE/bad_dma_op.mlir @@ -12,7 +12,7 @@ // CHECK: error: 'aie.dma' op DMAOp can only appear in single block region module { - aie.device(ipu) { + aie.device(npu) { %tile_0_1 = aie.tile(0, 1) %objFifo_in0_cons_buff_0 = aie.buffer(%tile_0_1) {address = 0 : i32} : memref<16xi32> %objFifo_in0_cons_prod_lock = aie.lock(%tile_0_1, 0) {init = 2 : i32} diff --git a/test/dialect/AIE/badshimtiledma.mlir b/test/dialect/AIE/badshimtiledma.mlir index 7edde144f0..da396798f0 100644 --- a/test/dialect/AIE/badshimtiledma.mlir +++ b/test/dialect/AIE/badshimtiledma.mlir @@ -12,7 +12,7 @@ // CHECK: error{{.*}}'aie.shim_dma' op uses more input channels than available on this tile module @test { - aie.device(ipu) { + aie.device(npu) { %t00 = aie.tile(0, 0) %buf_e = aie.external_buffer : memref<256xi32> diff --git a/test/dialect/AIE/badtiledma4.mlir b/test/dialect/AIE/badtiledma4.mlir index 7d2cf2b9ce..6c498c62f4 100644 --- a/test/dialect/AIE/badtiledma4.mlir +++ b/test/dialect/AIE/badtiledma4.mlir @@ -12,7 +12,7 @@ // CHECK: error{{.*}}'aie.mem' op uses more output channels than available on this tile module @test { - aie.device(ipu) { + aie.device(npu) { %t03 = aie.tile(0, 3) %buf_e = aie.buffer(%t03) : memref<256xi32> diff --git a/test/dialect/AIE/buffer.mlir b/test/dialect/AIE/buffer.mlir index d522f08dd4..a75392c5a3 100644 --- a/test/dialect/AIE/buffer.mlir +++ b/test/dialect/AIE/buffer.mlir @@ -11,7 +11,7 @@ // RUN: aie-opt --aie-standard-lowering %s | FileCheck %s module { - aie.device(ipu) { + aie.device(npu) { %t33 = aie.tile(3, 3) %t42 = aie.tile(4, 2) %t44 = aie.tile(4, 4) diff --git a/test/dialect/AIEX/bad_ipu_nd.mlir b/test/dialect/AIEX/bad_ipu_nd.mlir index ebd1715062..45ec8e0dd6 100644 --- a/test/dialect/AIEX/bad_ipu_nd.mlir +++ b/test/dialect/AIEX/bad_ipu_nd.mlir @@ -1,4 +1,4 @@ -//===- bad_ipu_nd.mlir -----------------------------------------*- MLIR -*-===// +//===- bad_npu_nd.mlir -----------------------------------------*- MLIR -*-===// // // This file is licensed under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. @@ -12,14 +12,14 @@ // RUN: aie-opt --split-input-file --verify-diagnostics %s module { - aie.device(ipu) { - func.func @bad_ipu_nd_length(%in : memref<1920x1080xi32>, %buf : memref<32xi32>, %out : memref<1920x1080xi32>) { + aie.device(npu) { + func.func @bad_npu_nd_length(%in : memref<1920x1080xi32>, %buf : memref<32xi32>, %out : memref<1920x1080xi32>) { %c0 = arith.constant 0 : i64 %c1 = arith.constant 1 : i64 %c1920 = arith.constant 1920 : i64 %c1080 = arith.constant 1080 : i64 // expected-error@+1 {{Size 0 exceeds the [0:1023] range}} - aiex.ipu.dma_memcpy_nd (0, 0, %in[%c0,%c0,%c0,%c0][%c1,%c1,%c1080,%c1920][%c0,%c0,%c1920]) { metadata = @of_fromMem, id = 0 : i64 } : memref<1920x1080xi32> + aiex.npu.dma_memcpy_nd (0, 0, %in[%c0,%c0,%c0,%c0][%c1,%c1,%c1080,%c1920][%c0,%c0,%c1920]) { metadata = @of_fromMem, id = 0 : i64 } : memref<1920x1080xi32> return } aie.shim_dma_allocation @of_fromMem (MM2S, 0, 0) @@ -29,8 +29,8 @@ module { // ----- module { - aie.device(ipu) { - func.func @bad_ipu_nd_repeat(%in : memref<128x4x2x8xi32>, %buf : memref<32xi32>, %out : memref<8192xi32>) { + aie.device(npu) { + func.func @bad_npu_nd_repeat(%in : memref<128x4x2x8xi32>, %buf : memref<32xi32>, %out : memref<8192xi32>) { %c0 = arith.constant 0 : i64 %c1 = arith.constant 1 : i64 %c2 = arith.constant 2 : i64 @@ -40,7 +40,7 @@ module { %c32 = arith.constant 32 : i64 %c128 = arith.constant 128 : i64 // expected-error@+1 {{Size 3 exceeds the [1:64] range}} - aiex.ipu.dma_memcpy_nd (0, 0, %in[%c0,%c0,%c0,%c0][%c128,%c2,%c2,%c8][%c0,%c16,%c8]) { metadata = @of_fromMem, id = 0 : i64 } : memref<128x4x2x8xi32> + aiex.npu.dma_memcpy_nd (0, 0, %in[%c0,%c0,%c0,%c0][%c128,%c2,%c2,%c8][%c0,%c16,%c8]) { metadata = @of_fromMem, id = 0 : i64 } : memref<128x4x2x8xi32> return } aie.shim_dma_allocation @of_fromMem (MM2S, 0, 0) @@ -50,14 +50,14 @@ module { // ----- module { - aie.device(ipu) { - func.func @bad_ipu_nd_stride(%in : memref<8388608xi32>, %buf : memref<32xi32>, %out : memref<8388608xi32>) { + aie.device(npu) { + func.func @bad_npu_nd_stride(%in : memref<8388608xi32>, %buf : memref<32xi32>, %out : memref<8388608xi32>) { %c0 = arith.constant 0 : i64 %c1 = arith.constant 1 : i64 %c2 = arith.constant 2 : i64 %c2097152 = arith.constant 2097152 : i64 // expected-error@+1 {{Stride 1 exceeds the [1:1M] range}} - aiex.ipu.dma_memcpy_nd (0, 0, %in[%c0,%c0,%c0,%c0][%c1,%c1,%c2,%c2][%c0,%c0,%c2097152]) { metadata = @of_fromMem, id = 0 : i64 } : memref<8388608xi32> + aiex.npu.dma_memcpy_nd (0, 0, %in[%c0,%c0,%c0,%c0][%c1,%c1,%c2,%c2][%c0,%c0,%c2097152]) { metadata = @of_fromMem, id = 0 : i64 } : memref<8388608xi32> return } aie.shim_dma_allocation @of_fromMem (MM2S, 0, 0) @@ -67,14 +67,14 @@ module { // ----- module { - aie.device(ipu) { - func.func @bad_ipu_nd_type(%in : memref<1920x1080xi8>, %buf : memref<32xi32>, %out : memref<1920x1080xi8>) { + aie.device(npu) { + func.func @bad_npu_nd_type(%in : memref<1920x1080xi8>, %buf : memref<32xi32>, %out : memref<1920x1080xi8>) { %c0 = arith.constant 0 : i64 %c1 = arith.constant 1 : i64 %c1920 = arith.constant 1920 : i64 %c1080 = arith.constant 1080 : i64 // expected-error@+1 {{must be used with memref type with element width 32.}} - aiex.ipu.dma_memcpy_nd (0, 0, %in[%c0,%c0,%c0,%c0][%c1,%c1,%c1080,%c1920][%c0,%c0,%c1920]) { metadata = @of_fromMem, id = 0 : i64 } : memref<1920x1080xi8> + aiex.npu.dma_memcpy_nd (0, 0, %in[%c0,%c0,%c0,%c0][%c1,%c1,%c1080,%c1920][%c0,%c0,%c1920]) { metadata = @of_fromMem, id = 0 : i64 } : memref<1920x1080xi8> return } aie.shim_dma_allocation @of_fromMem (MM2S, 0, 0) diff --git a/test/dialect/AIEX/bad_ipu_push_queue.mlir b/test/dialect/AIEX/bad_ipu_push_queue.mlir index 49feece90d..64a11960ea 100644 --- a/test/dialect/AIEX/bad_ipu_push_queue.mlir +++ b/test/dialect/AIEX/bad_ipu_push_queue.mlir @@ -1,4 +1,4 @@ -//===- bad_ipu_push_queue_bd.mlir ------------------------------*- MLIR -*-===// +//===- bad_npu_push_queue_bd.mlir ------------------------------*- MLIR -*-===// // // This file is licensed under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. @@ -12,10 +12,10 @@ // RUN: aie-opt --split-input-file --verify-diagnostics %s module { - aie.device(ipu) { + aie.device(npu) { func.func @bad_bd_id(%in : memref<128x4x2x8xi32>, %buf : memref<32xi32>, %out : memref<8192xi32>) { // expected-error@+1 {{BD ID exceeds the maximum ID.}} - aiex.ipu.shimtile_push_queue {metadata = @of_fromMem, issue_token = false, repeat_count = 3 : i32, bd_id = 28 : i32 } + aiex.npu.shimtile_push_queue {metadata = @of_fromMem, issue_token = false, repeat_count = 3 : i32, bd_id = 28 : i32 } return } aie.shim_dma_allocation @of_fromMem (MM2S, 0, 0) @@ -25,10 +25,10 @@ module { // ----- module { - aie.device(ipu) { + aie.device(npu) { func.func @bad_repeat_count(%in : memref<128x4x2x8xi32>, %buf : memref<32xi32>, %out : memref<8192xi32>) { // expected-error@+1 {{Repeat count exceeds the [0:255] range.}} - aiex.ipu.shimtile_push_queue {metadata = @of_fromMem, issue_token = false, repeat_count = 384 : i32, bd_id = 8 : i32 } + aiex.npu.shimtile_push_queue {metadata = @of_fromMem, issue_token = false, repeat_count = 384 : i32, bd_id = 8 : i32 } return } aie.shim_dma_allocation @of_fromMem (MM2S, 0, 0) diff --git a/test/dialect/AIEX/bad_ipu_write_bd.mlir b/test/dialect/AIEX/bad_ipu_write_bd.mlir index f653614c8d..fdc9b425cc 100644 --- a/test/dialect/AIEX/bad_ipu_write_bd.mlir +++ b/test/dialect/AIEX/bad_ipu_write_bd.mlir @@ -1,4 +1,4 @@ -//===- bad_ipu_write_bd_bd.mlir --------------------------------*- MLIR -*-===// +//===- bad_npu_write_bd_bd.mlir --------------------------------*- MLIR -*-===// // // This file is licensed under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. @@ -12,10 +12,10 @@ // RUN: aie-opt --split-input-file --verify-diagnostics %s module { - aie.device(ipu) { + aie.device(npu) { func.func @bad_bd_id(%in : memref<128x4x2x8xi32>, %buf : memref<32xi32>, %out : memref<8192xi32>) { // expected-error@+1 {{BD ID exceeds the maximum ID.}} - aiex.ipu.writebd_shimtile {bd_id = 17 : i32, buffer_length = 32 : i32, buffer_offset = 128 : i32, column = 0 : i32, column_num = 1 : i32, d0_stride = 0 : i32, d0_size = 8 : i32, d1_stride = 7 : i32, d1_size = 2 : i32, d2_stride = 15 : i32, ddr_id = 0 : i32, enable_packet = 0 : i32, iteration_current = 0 : i32, iteration_stride = 0 : i32, iteration_size = 0 : i32, lock_acq_enable = 0 : i32, lock_acq_id = 0 : i32, lock_acq_val = 0 : i32, lock_rel_id = 0 : i32, lock_rel_val = 0 : i32, next_bd = 0 : i32, out_of_order_id = 0 : i32, packet_id = 0 : i32, packet_type = 0 : i32, use_next_bd = 0 : i32, valid_bd = 1 : i32} + aiex.npu.writebd_shimtile {bd_id = 17 : i32, buffer_length = 32 : i32, buffer_offset = 128 : i32, column = 0 : i32, column_num = 1 : i32, d0_stride = 0 : i32, d0_size = 8 : i32, d1_stride = 7 : i32, d1_size = 2 : i32, d2_stride = 15 : i32, ddr_id = 0 : i32, enable_packet = 0 : i32, iteration_current = 0 : i32, iteration_stride = 0 : i32, iteration_size = 0 : i32, lock_acq_enable = 0 : i32, lock_acq_id = 0 : i32, lock_acq_val = 0 : i32, lock_rel_id = 0 : i32, lock_rel_val = 0 : i32, next_bd = 0 : i32, out_of_order_id = 0 : i32, packet_id = 0 : i32, packet_type = 0 : i32, use_next_bd = 0 : i32, valid_bd = 1 : i32} return } aie.shim_dma_allocation @of_fromMem (MM2S, 0, 0) @@ -25,10 +25,10 @@ module { // ----- module { - aie.device(ipu) { + aie.device(npu) { func.func @bad_iteration_size(%in : memref<128x4x2x8xi32>, %buf : memref<32xi32>, %out : memref<8192xi32>) { // expected-error@+1 {{Iteration Size exceeds the [0:63] range.}} - aiex.ipu.writebd_shimtile {bd_id = 7 : i32, buffer_length = 32 : i32, buffer_offset = 128 : i32, column = 0 : i32, column_num = 1 : i32, d0_stride = 0 : i32, d0_size = 8 : i32, d1_stride = 7 : i32, d1_size = 4 : i32, d2_stride = 15 : i32, ddr_id = 0 : i32, enable_packet = 0 : i32, iteration_current = 0 : i32, iteration_stride = 1024 : i32, iteration_size = 128 : i32, lock_acq_enable = 0 : i32, lock_acq_id = 0 : i32, lock_acq_val = 0 : i32, lock_rel_id = 0 : i32, lock_rel_val = 0 : i32, next_bd = 0 : i32, out_of_order_id = 0 : i32, packet_id = 0 : i32, packet_type = 0 : i32, use_next_bd = 0 : i32, valid_bd = 1 : i32} + aiex.npu.writebd_shimtile {bd_id = 7 : i32, buffer_length = 32 : i32, buffer_offset = 128 : i32, column = 0 : i32, column_num = 1 : i32, d0_stride = 0 : i32, d0_size = 8 : i32, d1_stride = 7 : i32, d1_size = 4 : i32, d2_stride = 15 : i32, ddr_id = 0 : i32, enable_packet = 0 : i32, iteration_current = 0 : i32, iteration_stride = 1024 : i32, iteration_size = 128 : i32, lock_acq_enable = 0 : i32, lock_acq_id = 0 : i32, lock_acq_val = 0 : i32, lock_rel_id = 0 : i32, lock_rel_val = 0 : i32, next_bd = 0 : i32, out_of_order_id = 0 : i32, packet_id = 0 : i32, packet_type = 0 : i32, use_next_bd = 0 : i32, valid_bd = 1 : i32} return } aie.shim_dma_allocation @of_fromMem (MM2S, 0, 0) @@ -38,10 +38,10 @@ module { // ----- module { - aie.device(ipu) { + aie.device(npu) { func.func @bad_stride(%in : memref<128x4x2x8xi32>, %buf : memref<32xi32>, %out : memref<8192xi32>) { // expected-error@+1 {{D0 Stride exceeds the [0:1M-1] range.}} - aiex.ipu.writebd_shimtile {bd_id = 2 : i32, buffer_length = 32 : i32, buffer_offset = 128 : i32, column = 0 : i32, column_num = 1 : i32, d0_stride = 2097356 : i32, d0_size = 8 : i32, d1_stride = 7 : i32, d1_size = 2 : i32, d2_stride = 15 : i32, ddr_id = 0 : i32, enable_packet = 0 : i32, iteration_current = 0 : i32, iteration_stride = 0 : i32, iteration_size = 0 : i32, lock_acq_enable = 0 : i32, lock_acq_id = 0 : i32, lock_acq_val = 0 : i32, lock_rel_id = 0 : i32, lock_rel_val = 0 : i32, next_bd = 0 : i32, out_of_order_id = 0 : i32, packet_id = 0 : i32, packet_type = 0 : i32, use_next_bd = 0 : i32, valid_bd = 1 : i32} + aiex.npu.writebd_shimtile {bd_id = 2 : i32, buffer_length = 32 : i32, buffer_offset = 128 : i32, column = 0 : i32, column_num = 1 : i32, d0_stride = 2097356 : i32, d0_size = 8 : i32, d1_stride = 7 : i32, d1_size = 2 : i32, d2_stride = 15 : i32, ddr_id = 0 : i32, enable_packet = 0 : i32, iteration_current = 0 : i32, iteration_stride = 0 : i32, iteration_size = 0 : i32, lock_acq_enable = 0 : i32, lock_acq_id = 0 : i32, lock_acq_val = 0 : i32, lock_rel_id = 0 : i32, lock_rel_val = 0 : i32, next_bd = 0 : i32, out_of_order_id = 0 : i32, packet_id = 0 : i32, packet_type = 0 : i32, use_next_bd = 0 : i32, valid_bd = 1 : i32} return } aie.shim_dma_allocation @of_fromMem (MM2S, 0, 0) @@ -51,10 +51,10 @@ module { // ----- module { - aie.device(ipu) { + aie.device(npu) { func.func @bad_size(%in : memref<128x4x2x8xi32>, %buf : memref<32xi32>, %out : memref<8192xi32>) { // expected-error@+1 {{D1 Size exceeds the [0:1023] range.}} - aiex.ipu.writebd_shimtile {bd_id = 7 : i32, buffer_length = 32 : i32, buffer_offset = 128 : i32, column = 0 : i32, column_num = 1 : i32, d0_stride = 0 : i32, d0_size = 8 : i32, d1_stride = 7 : i32, d1_size = 1024 : i32, d2_stride = 15 : i32, ddr_id = 0 : i32, enable_packet = 0 : i32, iteration_current = 0 : i32, iteration_stride = 0 : i32, iteration_size = 0 : i32, lock_acq_enable = 0 : i32, lock_acq_id = 0 : i32, lock_acq_val = 0 : i32, lock_rel_id = 0 : i32, lock_rel_val = 0 : i32, next_bd = 0 : i32, out_of_order_id = 0 : i32, packet_id = 0 : i32, packet_type = 0 : i32, use_next_bd = 0 : i32, valid_bd = 1 : i32} + aiex.npu.writebd_shimtile {bd_id = 7 : i32, buffer_length = 32 : i32, buffer_offset = 128 : i32, column = 0 : i32, column_num = 1 : i32, d0_stride = 0 : i32, d0_size = 8 : i32, d1_stride = 7 : i32, d1_size = 1024 : i32, d2_stride = 15 : i32, ddr_id = 0 : i32, enable_packet = 0 : i32, iteration_current = 0 : i32, iteration_stride = 0 : i32, iteration_size = 0 : i32, lock_acq_enable = 0 : i32, lock_acq_id = 0 : i32, lock_acq_val = 0 : i32, lock_rel_id = 0 : i32, lock_rel_val = 0 : i32, next_bd = 0 : i32, out_of_order_id = 0 : i32, packet_id = 0 : i32, packet_type = 0 : i32, use_next_bd = 0 : i32, valid_bd = 1 : i32} return } aie.shim_dma_allocation @of_fromMem (MM2S, 0, 0) diff --git a/test/dialect/AIEX/invalid.mlir b/test/dialect/AIEX/invalid.mlir index 9b57d84b70..7a1a1fcc3b 100644 --- a/test/dialect/AIEX/invalid.mlir +++ b/test/dialect/AIEX/invalid.mlir @@ -10,10 +10,10 @@ // RUN: aie-opt --split-input-file --verify-diagnostics %s -aie.device(ipu) { - func.func @ipu_dma_wait_no_symbol() { - // expected-error@+1 {{'aiex.ipu.dma_wait' op couldn't find symbol in parent device}} - aiex.ipu.dma_wait {symbol = @out0} +aie.device(npu) { + func.func @npu_dma_wait_no_symbol() { + // expected-error@+1 {{'aiex.npu.dma_wait' op couldn't find symbol in parent device}} + aiex.npu.dma_wait {symbol = @out0} return } } diff --git a/test/dialect/AIEX/roundtrip.mlir b/test/dialect/AIEX/roundtrip.mlir index 27611d5914..a7c698db09 100644 --- a/test/dialect/AIEX/roundtrip.mlir +++ b/test/dialect/AIEX/roundtrip.mlir @@ -10,21 +10,21 @@ // RUN: aie-opt --split-input-file %s | FileCheck %s -// CHECK-LABEL: func.func @ipu_dma_wait -// CHECK: aiex.ipu.dma_wait {symbol = @out0} -aie.device(ipu) { +// CHECK-LABEL: func.func @npu_dma_wait +// CHECK: aiex.npu.dma_wait {symbol = @out0} +aie.device(npu) { memref.global "public" @out0 : memref<16xi32> - func.func @ipu_dma_wait() { - aiex.ipu.dma_wait {symbol = @out0} + func.func @npu_dma_wait() { + aiex.npu.dma_wait {symbol = @out0} return } } // ----- -// CHECK-LABEL: func.func @ipu_dma_wait_no_device -// CHECK: aiex.ipu.dma_wait {symbol = @out0} -func.func @ipu_dma_wait_no_device() { - aiex.ipu.dma_wait {symbol = @out0} +// CHECK-LABEL: func.func @npu_dma_wait_no_device +// CHECK: aiex.npu.dma_wait {symbol = @out0} +func.func @npu_dma_wait_no_device() { + aiex.npu.dma_wait {symbol = @out0} return } diff --git a/test/ipu-xrt/add_256_using_dma_op_no_double_buffering/aie.mlir b/test/ipu-xrt/add_256_using_dma_op_no_double_buffering/aie.mlir index e44add4a05..89bda05890 100644 --- a/test/ipu-xrt/add_256_using_dma_op_no_double_buffering/aie.mlir +++ b/test/ipu-xrt/add_256_using_dma_op_no_double_buffering/aie.mlir @@ -7,7 +7,7 @@ //===----------------------------------------------------------------------===// module { - aie.device(ipu) { + aie.device(npu) { %tile_0_0 = aie.tile(0, 0) %tile_0_1 = aie.tile(0, 1) %tile_0_2 = aie.tile(0, 2) @@ -100,9 +100,9 @@ module { aie.shim_dma_allocation @this_just_creates_a_symbol_and_the_type_means_nothing_in(MM2S, 0, 0) aie.shim_dma_allocation @this_just_creates_a_symbol_and_the_type_means_nothing_out(S2MM, 0, 0) func.func @bobsyouruncle(%arg0: memref<64xi32>, %arg1: memref<32xi32>, %arg2: memref<64xi32>) { - aiex.ipu.dma_memcpy_nd(0, 0, %arg0[0, 0, 0, 0][1, 1, 1, 64][0, 0, 0]) {id = 0 : i64, metadata = @this_just_creates_a_symbol_and_the_type_means_nothing_in} : memref<64xi32> - aiex.ipu.dma_memcpy_nd(0, 0, %arg2[0, 0, 0, 0][1, 1, 1, 64][0, 0, 0]) {id = 1 : i64, metadata = @this_just_creates_a_symbol_and_the_type_means_nothing_out} : memref<64xi32> - aiex.ipu.sync {channel = 0 : i32, column = 0 : i32, column_num = 1 : i32, direction = 0 : i32, row = 0 : i32, row_num = 1 : i32} + aiex.npu.dma_memcpy_nd(0, 0, %arg0[0, 0, 0, 0][1, 1, 1, 64][0, 0, 0]) {id = 0 : i64, metadata = @this_just_creates_a_symbol_and_the_type_means_nothing_in} : memref<64xi32> + aiex.npu.dma_memcpy_nd(0, 0, %arg2[0, 0, 0, 0][1, 1, 1, 64][0, 0, 0]) {id = 1 : i64, metadata = @this_just_creates_a_symbol_and_the_type_means_nothing_out} : memref<64xi32> + aiex.npu.sync {channel = 0 : i32, column = 0 : i32, column_num = 1 : i32, direction = 0 : i32, row = 0 : i32, row_num = 1 : i32} return } } diff --git a/test/ipu-xrt/add_256_using_dma_op_no_double_buffering/run.lit b/test/ipu-xrt/add_256_using_dma_op_no_double_buffering/run.lit index 5d29ef1058..67cf187967 100644 --- a/test/ipu-xrt/add_256_using_dma_op_no_double_buffering/run.lit +++ b/test/ipu-xrt/add_256_using_dma_op_no_double_buffering/run.lit @@ -7,7 +7,7 @@ // RUN: aie-translate --aie-generate-cdo aie.mlir.prj/input_physical.mlir // RUN: cp *.elf aie.mlir.prj/ // RUN: cp *.bin aie.mlir.prj/ -// RUN: %python aiecc.py --no-aiesim --aie-generate-ipu --aie-generate-xclbin --no-compile-host --xclbin-name=aie.xclbin --ipu-insts-name=insts.txt %S/aie.mlir +// RUN: %python aiecc.py --no-aiesim --aie-generate-npu --aie-generate-xclbin --no-compile-host --xclbin-name=aie.xclbin --npu-insts-name=insts.txt %S/aie.mlir // RUN: clang %S/test.cpp -o test.exe -std=c++11 -Wall %xrt_flags -lrt -lstdc++ -// RUN: %run_on_ipu ./test.exe aie.xclbin | FileCheck %s +// RUN: %run_on_npu ./test.exe aie.xclbin | FileCheck %s // CHECK: PASS! diff --git a/test/ipu-xrt/add_314_using_dma_op/aie.mlir b/test/ipu-xrt/add_314_using_dma_op/aie.mlir index 646f263804..37ef98c47c 100644 --- a/test/ipu-xrt/add_314_using_dma_op/aie.mlir +++ b/test/ipu-xrt/add_314_using_dma_op/aie.mlir @@ -7,7 +7,7 @@ //===----------------------------------------------------------------------===// module { - aie.device(ipu) { + aie.device(npu) { memref.global "public" @objFifo_in0 : memref<16xi32> memref.global "public" @objFifo_out0 : memref<16xi32> @@ -65,9 +65,9 @@ module { %c0_i64 = arith.constant 0 : i64 %c1_i64 = arith.constant 1 : i64 %c64_i64 = arith.constant 64 : i64 - aiex.ipu.dma_memcpy_nd(0, 0, %arg0[%c0_i64, %c0_i64, %c0_i64, %c0_i64][%c1_i64, %c1_i64, %c1_i64, %c64_i64][%c0_i64, %c0_i64, %c0_i64]) {id = 0 : i64, metadata = @objFifo_in0} : memref<64xi32> - aiex.ipu.dma_memcpy_nd(0, 0, %arg2[%c0_i64, %c0_i64, %c0_i64, %c0_i64][%c1_i64, %c1_i64, %c1_i64, %c64_i64][%c0_i64, %c0_i64, %c0_i64]) {id = 1 : i64, metadata = @objFifo_out0} : memref<64xi32> - aiex.ipu.sync {channel = 0 : i32, column = 0 : i32, column_num = 1 : i32, direction = 0 : i32, row = 0 : i32, row_num = 1 : i32} + aiex.npu.dma_memcpy_nd(0, 0, %arg0[%c0_i64, %c0_i64, %c0_i64, %c0_i64][%c1_i64, %c1_i64, %c1_i64, %c64_i64][%c0_i64, %c0_i64, %c0_i64]) {id = 0 : i64, metadata = @objFifo_in0} : memref<64xi32> + aiex.npu.dma_memcpy_nd(0, 0, %arg2[%c0_i64, %c0_i64, %c0_i64, %c0_i64][%c1_i64, %c1_i64, %c1_i64, %c64_i64][%c0_i64, %c0_i64, %c0_i64]) {id = 1 : i64, metadata = @objFifo_out0} : memref<64xi32> + aiex.npu.sync {channel = 0 : i32, column = 0 : i32, column_num = 1 : i32, direction = 0 : i32, row = 0 : i32, row_num = 1 : i32} return } diff --git a/test/ipu-xrt/add_314_using_dma_op/run.lit b/test/ipu-xrt/add_314_using_dma_op/run.lit index 23c3e076c9..5329b2789e 100644 --- a/test/ipu-xrt/add_314_using_dma_op/run.lit +++ b/test/ipu-xrt/add_314_using_dma_op/run.lit @@ -7,8 +7,8 @@ // RUN: aie-translate --aie-generate-cdo aie.mlir.prj/input_physical.mlir // RUN: cp *.elf aie.mlir.prj/ // RUN: cp *.bin aie.mlir.prj/ -// RUN: %python aiecc.py --no-aiesim --aie-generate-ipu --aie-generate-xclbin --no-compile-host --xclbin-name=aie.xclbin --ipu-insts-name=insts.txt %S/aie.mlir +// RUN: %python aiecc.py --no-aiesim --aie-generate-npu --aie-generate-xclbin --no-compile-host --xclbin-name=aie.xclbin --npu-insts-name=insts.txt %S/aie.mlir // RUN: clang %S/test.cpp -o test.exe -std=c++11 -Wall %xrt_flags -lrt -lstdc++ -// RUN: %run_on_ipu ./test.exe aie.xclbin | FileCheck %s +// RUN: %run_on_npu ./test.exe aie.xclbin | FileCheck %s // CHECK: PASS! diff --git a/test/ipu-xrt/add_one_objFifo/Makefile b/test/ipu-xrt/add_one_objFifo/Makefile index 9fd67f862e..ce9d9338b3 100644 --- a/test/ipu-xrt/add_one_objFifo/Makefile +++ b/test/ipu-xrt/add_one_objFifo/Makefile @@ -7,7 +7,7 @@ all: build/final.xclbin build/insts.txt build/final.xclbin: aie.mlir mkdir -p ${@D} cd ${@D} && aiecc.py --aie-generate-cdo --no-compile-host --xclbin-name=${@F} \ - --aie-generate-ipu --ipu-insts-name=insts.txt $(<:%=../%) + --aie-generate-npu --npu-insts-name=insts.txt $(<:%=../%) ${targetname}.exe: test.cpp rm -rf _build diff --git a/test/ipu-xrt/add_one_objFifo/aie.mlir b/test/ipu-xrt/add_one_objFifo/aie.mlir index 3b55edb0d7..137f8b03d8 100644 --- a/test/ipu-xrt/add_one_objFifo/aie.mlir +++ b/test/ipu-xrt/add_one_objFifo/aie.mlir @@ -6,7 +6,7 @@ //===----------------------------------------------------------------------===// module { - aie.device(ipu) { + aie.device(npu) { %t00 = aie.tile(0, 0) %t01 = aie.tile(0, 1) %t02 = aie.tile(0, 2) @@ -44,9 +44,9 @@ module { %c0 = arith.constant 0 : i64 %c1 = arith.constant 1 : i64 %c64 = arith.constant 64 : i64 - aiex.ipu.dma_memcpy_nd (0, 0, %out[%c0,%c0,%c0,%c0][%c1,%c1,%c1,%c64][%c0,%c0,%c0]) { metadata = @objFifo_out0, id = 1 : i64 } : memref<64xi32> - aiex.ipu.dma_memcpy_nd (0, 0, %in[%c0,%c0,%c0,%c0][%c1,%c1,%c1,%c64][%c0,%c0,%c0]) { metadata = @objFifo_in0, id = 0 : i64 } : memref<64xi32> - aiex.ipu.sync { column = 0 : i32, row = 0 : i32, direction = 0 : i32, channel = 0 : i32, column_num = 1 : i32, row_num = 1 : i32 } + aiex.npu.dma_memcpy_nd (0, 0, %out[%c0,%c0,%c0,%c0][%c1,%c1,%c1,%c64][%c0,%c0,%c0]) { metadata = @objFifo_out0, id = 1 : i64 } : memref<64xi32> + aiex.npu.dma_memcpy_nd (0, 0, %in[%c0,%c0,%c0,%c0][%c1,%c1,%c1,%c64][%c0,%c0,%c0]) { metadata = @objFifo_in0, id = 0 : i64 } : memref<64xi32> + aiex.npu.sync { column = 0 : i32, row = 0 : i32, direction = 0 : i32, channel = 0 : i32, column_num = 1 : i32, row_num = 1 : i32 } return } } diff --git a/test/ipu-xrt/add_one_objFifo/run.lit b/test/ipu-xrt/add_one_objFifo/run.lit index 632a421a4d..a137e2e4ae 100644 --- a/test/ipu-xrt/add_one_objFifo/run.lit +++ b/test/ipu-xrt/add_one_objFifo/run.lit @@ -3,8 +3,8 @@ // // REQUIRES: ryzen_ai // -// RUN: %python aiecc.py --no-aiesim --aie-generate-cdo --aie-generate-ipu --no-compile-host --xclbin-name=aie.xclbin --ipu-insts-name=insts.txt %S/aie.mlir +// RUN: %python aiecc.py --no-aiesim --aie-generate-cdo --aie-generate-npu --no-compile-host --xclbin-name=aie.xclbin --npu-insts-name=insts.txt %S/aie.mlir // RUN: clang %S/test.cpp -o test.exe -std=c++11 -Wall %xrt_flags -lrt -lstdc++ -lboost_program_options -lboost_filesystem -// RUN: %run_on_ipu ./test.exe -x aie.xclbin -k MLIR_AIE -i insts.txt | FileCheck %s +// RUN: %run_on_npu ./test.exe -x aie.xclbin -k MLIR_AIE -i insts.txt | FileCheck %s // CHECK: PASS! diff --git a/test/ipu-xrt/add_one_using_dma/aie.mlir b/test/ipu-xrt/add_one_using_dma/aie.mlir index 058ae034bc..8647f6b710 100644 --- a/test/ipu-xrt/add_one_using_dma/aie.mlir +++ b/test/ipu-xrt/add_one_using_dma/aie.mlir @@ -7,7 +7,7 @@ //===----------------------------------------------------------------------===// module { - aie.device(ipu) { + aie.device(npu) { memref.global "public" @objFifo_in0 : memref<16xi32> memref.global "public" @objFifo_in0_cons : memref<16xi32> memref.global "public" @objFifo_in1 : memref<8xi32> @@ -76,9 +76,9 @@ module { %c0_i64 = arith.constant 0 : i64 %c1_i64 = arith.constant 1 : i64 %c64_i64 = arith.constant 64 : i64 - aiex.ipu.dma_memcpy_nd(0, 0, %arg0[%c0_i64, %c0_i64, %c0_i64, %c0_i64] [%c1_i64, %c1_i64, %c1_i64, %c64_i64] [%c0_i64, %c0_i64, %c0_i64]) {id = 0 : i64, metadata = @objFifo_in0} : memref<64xi32> - aiex.ipu.dma_memcpy_nd(0, 0, %arg2[%c0_i64, %c0_i64, %c0_i64, %c0_i64] [%c1_i64, %c1_i64, %c1_i64, %c64_i64] [%c0_i64, %c0_i64, %c0_i64]) {id = 1 : i64, metadata = @objFifo_out0} : memref<64xi32> - aiex.ipu.sync {channel = 0 : i32, column = 0 : i32, column_num = 1 : i32, direction = 0 : i32, row = 0 : i32, row_num = 1 : i32} + aiex.npu.dma_memcpy_nd(0, 0, %arg0[%c0_i64, %c0_i64, %c0_i64, %c0_i64] [%c1_i64, %c1_i64, %c1_i64, %c64_i64] [%c0_i64, %c0_i64, %c0_i64]) {id = 0 : i64, metadata = @objFifo_in0} : memref<64xi32> + aiex.npu.dma_memcpy_nd(0, 0, %arg2[%c0_i64, %c0_i64, %c0_i64, %c0_i64] [%c1_i64, %c1_i64, %c1_i64, %c64_i64] [%c0_i64, %c0_i64, %c0_i64]) {id = 1 : i64, metadata = @objFifo_out0} : memref<64xi32> + aiex.npu.sync {channel = 0 : i32, column = 0 : i32, column_num = 1 : i32, direction = 0 : i32, row = 0 : i32, row_num = 1 : i32} return } diff --git a/test/ipu-xrt/add_one_using_dma/run.lit b/test/ipu-xrt/add_one_using_dma/run.lit index 632a421a4d..a137e2e4ae 100644 --- a/test/ipu-xrt/add_one_using_dma/run.lit +++ b/test/ipu-xrt/add_one_using_dma/run.lit @@ -3,8 +3,8 @@ // // REQUIRES: ryzen_ai // -// RUN: %python aiecc.py --no-aiesim --aie-generate-cdo --aie-generate-ipu --no-compile-host --xclbin-name=aie.xclbin --ipu-insts-name=insts.txt %S/aie.mlir +// RUN: %python aiecc.py --no-aiesim --aie-generate-cdo --aie-generate-npu --no-compile-host --xclbin-name=aie.xclbin --npu-insts-name=insts.txt %S/aie.mlir // RUN: clang %S/test.cpp -o test.exe -std=c++11 -Wall %xrt_flags -lrt -lstdc++ -lboost_program_options -lboost_filesystem -// RUN: %run_on_ipu ./test.exe -x aie.xclbin -k MLIR_AIE -i insts.txt | FileCheck %s +// RUN: %run_on_npu ./test.exe -x aie.xclbin -k MLIR_AIE -i insts.txt | FileCheck %s // CHECK: PASS! diff --git a/test/ipu-xrt/cascade_flows/CMakeLists.txt b/test/ipu-xrt/cascade_flows/CMakeLists.txt index 257e7ca075..aafc542dde 100644 --- a/test/ipu-xrt/cascade_flows/CMakeLists.txt +++ b/test/ipu-xrt/cascade_flows/CMakeLists.txt @@ -22,7 +22,7 @@ if (NOT WSL) else() set(BOOST_ROOT C:/Technical/thirdParty/boost_1_83_0 CACHE STRING "Path to Boost install") set(XRT_INC_DIR C:/Technical/XRT/src/runtime_src/core/include CACHE STRING "Path to XRT cloned repo") - set(XRT_LIB_DIR C:/Technical/xrtIPUfromDLL CACHE STRING "Path to xrt_coreutil.lib") + set(XRT_LIB_DIR C:/Technical/xrtNPUfromDLL CACHE STRING "Path to xrt_coreutil.lib") endif() set(TARGET_NAME test CACHE STRING "Target to be built") diff --git a/test/ipu-xrt/cascade_flows/Makefile b/test/ipu-xrt/cascade_flows/Makefile index 6c88c72d19..ef6b2cf5a0 100644 --- a/test/ipu-xrt/cascade_flows/Makefile +++ b/test/ipu-xrt/cascade_flows/Makefile @@ -19,7 +19,7 @@ build/%.o: %.cc build/final.xclbin: aie.mlir build/kernel1.o build/kernel2.o build/kernel3.o mkdir -p ${@D} cd ${@D} && aiecc.py --aie-generate-cdo --no-compile-host --xclbin-name=${@F} \ - --aie-generate-ipu --ipu-insts-name=insts.txt $(<:%=../%) + --aie-generate-npu --npu-insts-name=insts.txt $(<:%=../%) ${targetname}.exe: test.cpp rm -rf _build diff --git a/test/ipu-xrt/cascade_flows/aie.mlir b/test/ipu-xrt/cascade_flows/aie.mlir index 967c3ecedc..e5b98481b5 100644 --- a/test/ipu-xrt/cascade_flows/aie.mlir +++ b/test/ipu-xrt/cascade_flows/aie.mlir @@ -6,7 +6,7 @@ //===----------------------------------------------------------------------===// module { - aie.device(ipu) { + aie.device(npu) { %t00 = aie.tile(0, 0) %t01 = aie.tile(0, 1) %t03 = aie.tile(0, 3) @@ -60,9 +60,9 @@ module { %c0 = arith.constant 0 : i64 %c1 = arith.constant 1 : i64 %c64 = arith.constant 64 : i64 - aiex.ipu.dma_memcpy_nd (0, 0, %out[%c0,%c0,%c0,%c0][%c1,%c1,%c1,%c64][%c0,%c0,%c0]) { metadata = @objFifo_out0, id = 1 : i64 } : memref<64xi32> - aiex.ipu.dma_memcpy_nd (0, 0, %in[%c0,%c0,%c0,%c0][%c1,%c1,%c1,%c64][%c0,%c0,%c0]) { metadata = @objFifo_in0, id = 0 : i64 } : memref<64xi32> - aiex.ipu.sync { column = 0 : i32, row = 0 : i32, direction = 0 : i32, channel = 0 : i32, column_num = 1 : i32, row_num = 1 : i32 } + aiex.npu.dma_memcpy_nd (0, 0, %out[%c0,%c0,%c0,%c0][%c1,%c1,%c1,%c64][%c0,%c0,%c0]) { metadata = @objFifo_out0, id = 1 : i64 } : memref<64xi32> + aiex.npu.dma_memcpy_nd (0, 0, %in[%c0,%c0,%c0,%c0][%c1,%c1,%c1,%c64][%c0,%c0,%c0]) { metadata = @objFifo_in0, id = 0 : i64 } : memref<64xi32> + aiex.npu.sync { column = 0 : i32, row = 0 : i32, direction = 0 : i32, channel = 0 : i32, column_num = 1 : i32, row_num = 1 : i32 } return } } diff --git a/test/ipu-xrt/cascade_flows/run.lit b/test/ipu-xrt/cascade_flows/run.lit index 6581a3a212..c3b2945605 100644 --- a/test/ipu-xrt/cascade_flows/run.lit +++ b/test/ipu-xrt/cascade_flows/run.lit @@ -6,7 +6,7 @@ // RUN: xchesscc_wrapper aie2 -I %aietools/include -c %S/kernel1.cc -o ./kernel1.o // RUN: xchesscc_wrapper aie2 -I %aietools/include -c %S/kernel2.cc -o ./kernel2.o // RUN: xchesscc_wrapper aie2 -I %aietools/include -c %S/kernel3.cc -o ./kernel3.o -// RUN: %python aiecc.py --no-aiesim --aie-generate-cdo --aie-generate-ipu --no-compile-host --xclbin-name=aie.xclbin --ipu-insts-name=insts.txt %S/aie.mlir +// RUN: %python aiecc.py --no-aiesim --aie-generate-cdo --aie-generate-npu --no-compile-host --xclbin-name=aie.xclbin --npu-insts-name=insts.txt %S/aie.mlir // RUN: clang %S/test.cpp -o test.exe -std=c++11 -Wall %xrt_flags -lrt -lstdc++ -lboost_program_options -lboost_filesystem -// RUN: %run_on_ipu ./test.exe -x aie.xclbin -k MLIR_AIE -i insts.txt | FileCheck %s +// RUN: %run_on_npu ./test.exe -x aie.xclbin -k MLIR_AIE -i insts.txt | FileCheck %s // CHECK: PASS! diff --git a/test/ipu-xrt/e2e/conftest.py b/test/ipu-xrt/e2e/conftest.py index 7e27c4421b..c2f981380f 100644 --- a/test/ipu-xrt/e2e/conftest.py +++ b/test/ipu-xrt/e2e/conftest.py @@ -8,7 +8,7 @@ @pytest.fixture(autouse=True) def run_around_tests(): subprocess.check_call( - [str(Path(__file__).parent.parent.parent.parent / "utils" / "reset_ipu.sh")] + [str(Path(__file__).parent.parent.parent.parent / "utils" / "reset_npu.sh")] ) yield diff --git a/test/ipu-xrt/e2e/test_add_256_using_dma_op_no_double_buffering.py b/test/ipu-xrt/e2e/test_add_256_using_dma_op_no_double_buffering.py index 8af3ee9cf5..fc57e8b0f1 100644 --- a/test/ipu-xrt/e2e/test_add_256_using_dma_op_no_double_buffering.py +++ b/test/ipu-xrt/e2e/test_add_256_using_dma_op_no_double_buffering.py @@ -8,7 +8,7 @@ from pathlib import Path import random -from aie.compiler.aiecc.main import DMA_TO_IPU +from aie.compiler.aiecc.main import DMA_TO_NPU from aie.compiler.util import compile_without_vectorization, make_xclbin from aie.dialects import aie, aiex from aie.dialects.aie import ( @@ -16,7 +16,7 @@ DMAChannelDir, LockAction, WireBundle, - ipu_instgen, + npu_instgen, ) from aie.dialects.scf import for_ as range_, yield_ from aie.extras.dialects.ext import arith, func, memref @@ -47,8 +47,8 @@ def test_add_256_using_dma_op_no_double_buffering(ctx: MLIRContext, workdir: Pat LEN = 128 LOCAL_MEM_SIZE = 32 - @aie.device(AIEDevice.ipu) - def ipu(): + @aie.device(AIEDevice.npu) + def npu(): tile_0_0 = aie.tile(0, 0) tile_0_1 = aie.tile(0, 1) tile_0_2 = aie.tile(0, 2) @@ -115,7 +115,7 @@ def bobsyouruncle( _arg1: T.memref(1, T.i32()), arg2: T.memref(LEN, T.i32()), ): - aiex.ipu_dma_memcpy_nd( + aiex.npu_dma_memcpy_nd( this_is_meaningless_1.sym_name.value, 0, arg0, @@ -123,7 +123,7 @@ def bobsyouruncle( [1, 1, 1, LEN], [0, 0, 0], ) - aiex.ipu_dma_memcpy_nd( + aiex.npu_dma_memcpy_nd( this_is_meaningless_2.sym_name.value, 1, arg2, @@ -132,7 +132,7 @@ def bobsyouruncle( [0, 0, 0], ) - aiex.ipu_sync( + aiex.npu_sync( channel=0, column=0, column_num=1, direction=0, row=0, row_num=1 ) @@ -188,12 +188,12 @@ def dma2(): aie.end() compile_without_vectorization(ctx.module, workdir) - generated_ipu_insts = run_pipeline(ctx.module, DMA_TO_IPU) - ipu_insts = [int(inst, 16) for inst in ipu_instgen(generated_ipu_insts.operation)] + generated_npu_insts = run_pipeline(ctx.module, DMA_TO_NPU) + npu_insts = [int(inst, 16) for inst in npu_instgen(generated_npu_insts.operation)] xclbin_path = make_xclbin(ctx.module, workdir) - with FileLock("/tmp/ipu.lock"): + with FileLock("/tmp/npu.lock"): xclbin = XCLBin(xclbin_path, "MLIR_AIE") - xclbin.load_ipu_instructions(ipu_insts) + xclbin.load_npu_instructions(npu_insts) views = xclbin.mmap_buffers([(LEN,), (LEN,), (LEN,)], np.int32) wrap_A = np.asarray(views[0]) diff --git a/test/ipu-xrt/e2e/test_locks.py b/test/ipu-xrt/e2e/test_locks.py index 3f50bf1da6..4ecc07a095 100644 --- a/test/ipu-xrt/e2e/test_locks.py +++ b/test/ipu-xrt/e2e/test_locks.py @@ -42,10 +42,10 @@ def test_one_global(ctx: MLIRContext, workdir: Path): iv = np.random.randint(0, 10, (K,), dtype=np.int32) column = 2 - ipu_insts = aiex.ipu.get_prolog() + npu_insts = aiex.npu.get_prolog() - @aie.device(AIEDevice.ipu) - def ipu(): + @aie.device(AIEDevice.npu) + def npu(): # TODO(max): figure this annoying thing out... if column != 0: _dummy_tile = aie.tile(0, 2) @@ -112,8 +112,8 @@ def memtile_dma(): ddr_id = 0 bd_id = 0 - ipu_insts.extend( - aiex.ipu.writebd_shimtile( + npu_insts.extend( + aiex.npu.writebd_shimtile( column=column, bd_id=bd_id, buffer_length=K, @@ -121,16 +121,16 @@ def memtile_dma(): ddr_id=ddr_id, ) ) - ipu_insts.extend( - aiex.ipu.shimtile_push_queue( + npu_insts.extend( + aiex.npu.shimtile_push_queue( channel_dir=S2MM, channel_index=flow_to_shim.dest_channel, column=column, bd_id=bd_id, ) ) - ipu_insts.extend( - aiex.ipu.sync( + npu_insts.extend( + aiex.npu.sync( channel=flow_to_shim.dest_channel, column=column, direction=0, @@ -140,9 +140,9 @@ def memtile_dma(): compile_without_vectorization(ctx.module, workdir) xclbin_path = make_xclbin(ctx.module, workdir) - with FileLock("/tmp/ipu.lock"): + with FileLock("/tmp/npu.lock"): xclbin = XCLBin(xclbin_path, "MLIR_AIE") - xclbin.load_ipu_instructions(ipu_insts) + xclbin.load_npu_instructions(npu_insts) [c] = xclbin.mmap_buffers([(K,)], np.int32) wrap_C = np.asarray(c) C = np.zeros((K,), dtype=np.int32) @@ -166,10 +166,10 @@ def test_threesome(ctx: MLIRContext, workdir: Path): iv1 = np.random.randint(0, 10, (K,), dtype=np.int32) iv2 = np.random.randint(0, 10, (K,), dtype=np.int32) - ipu_insts = aiex.ipu.get_prolog() + npu_insts = aiex.npu.get_prolog() - @aie.device(AIEDevice.ipu) - def ipu(): + @aie.device(AIEDevice.npu) + def npu(): _dummy_tile = aie.tile(0, 2) tile_1_2 = aie.tile(1, 2) global_weight_1_2 = memref.global_(initial_value=iv1) @@ -249,8 +249,8 @@ def memtile_dma(): ddr_id = 0 bd_id = 0 - ipu_insts.extend( - aiex.ipu.writebd_shimtile( + npu_insts.extend( + aiex.npu.writebd_shimtile( column=shim_tile_column, bd_id=bd_id, buffer_length=K, @@ -258,16 +258,16 @@ def memtile_dma(): ddr_id=ddr_id, ) ) - ipu_insts.extend( - aiex.ipu.shimtile_push_queue( + npu_insts.extend( + aiex.npu.shimtile_push_queue( channel_dir=S2MM, channel_index=flow_to_shim.dest_channel, column=shim_tile_column, bd_id=bd_id, ) ) - ipu_insts.extend( - aiex.ipu.sync( + npu_insts.extend( + aiex.npu.sync( channel=flow_to_shim.dest_channel, column=shim_tile_column, direction=0, @@ -277,9 +277,9 @@ def memtile_dma(): compile_without_vectorization(ctx.module, workdir) xclbin_path = make_xclbin(ctx.module, workdir) - with FileLock("/tmp/ipu.lock"): + with FileLock("/tmp/npu.lock"): xclbin = XCLBin(xclbin_path, "MLIR_AIE") - xclbin.load_ipu_instructions(ipu_insts) + xclbin.load_npu_instructions(npu_insts) [c] = xclbin.mmap_buffers([(K,)], np.int32) wrap_C = np.asarray(c) C = np.zeros((K,), dtype=np.int32) @@ -305,10 +305,10 @@ def test_foursome(ctx: MLIRContext, workdir: Path): iv2 = np.random.randint(0, 10, (K,), dtype=np.int32) iv3 = np.random.randint(0, 10, (K,), dtype=np.int32) - ipu_insts = aiex.ipu.get_prolog() + npu_insts = aiex.npu.get_prolog() - @aie.device(AIEDevice.ipu) - def ipu(): + @aie.device(AIEDevice.npu) + def npu(): _dummy_tile = aie.tile(0, 2) tile_1_3 = aie.tile(1, 3) @@ -407,8 +407,8 @@ def memtile_dma(): ddr_id = 0 bd_id = 0 - ipu_insts.extend( - aiex.ipu.writebd_shimtile( + npu_insts.extend( + aiex.npu.writebd_shimtile( column=shim_tile_column, bd_id=bd_id, buffer_length=K, @@ -416,16 +416,16 @@ def memtile_dma(): ddr_id=ddr_id, ) ) - ipu_insts.extend( - aiex.ipu.shimtile_push_queue( + npu_insts.extend( + aiex.npu.shimtile_push_queue( channel_dir=S2MM, channel_index=flow_to_shim.dest_channel, column=shim_tile_column, bd_id=bd_id, ) ) - ipu_insts.extend( - aiex.ipu.sync( + npu_insts.extend( + aiex.npu.sync( channel=flow_to_shim.dest_channel, column=shim_tile_column, direction=0, @@ -435,9 +435,9 @@ def memtile_dma(): compile_without_vectorization(ctx.module, workdir) xclbin_path = make_xclbin(ctx.module, workdir) - with FileLock("/tmp/ipu.lock"): + with FileLock("/tmp/npu.lock"): xclbin = XCLBin(xclbin_path, "MLIR_AIE") - xclbin.load_ipu_instructions(ipu_insts) + xclbin.load_npu_instructions(npu_insts) [c] = xclbin.mmap_buffers([(K,)], np.int32) wrap_C = np.asarray(c) C = np.zeros((K,), dtype=np.int32) diff --git a/test/ipu-xrt/e2e/test_manual_dpu_args.py b/test/ipu-xrt/e2e/test_manual_dpu_args.py index 3016384071..9b372e439e 100644 --- a/test/ipu-xrt/e2e/test_manual_dpu_args.py +++ b/test/ipu-xrt/e2e/test_manual_dpu_args.py @@ -54,8 +54,8 @@ def test_manual_args(ctx: MLIRContext, workdir: Path): iters = 10 loop = False - @aie.device(AIEDevice.ipu) - def ipu(): + @aie.device(AIEDevice.npu) + def npu(): tile_0_0 = aie.tile(0, 0) tile_0_1 = aie.tile(0, 1) tile_0_2 = aie.tile(0, 2) @@ -115,29 +115,29 @@ def dma6(): kernel_json = emit_design_kernel_json(buffer_args=buffer_args) xclbin_path = make_xclbin(ctx.module, workdir, kernel_json=kernel_json) - with FileLock("/tmp/ipu.lock"): + with FileLock("/tmp/npu.lock"): xclbin = XCLBin(xclbin_path, "MLIR_AIE") views = xclbin.mmap_buffers([(K,)] * iters, np.int32) col = 0 channel_index = 0 - ipu_insts = aiex.ipu.get_prolog() + npu_insts = aiex.npu.get_prolog() for bd_id in range(iters): - writebd_shimtile_insts = aiex.ipu.writebd_shimtile( + writebd_shimtile_insts = aiex.npu.writebd_shimtile( col, bd_id, buffer_length=K ) - ipu_insts.extend( - aiex.ipu._exec_write_bd_extend_shim_tile_opt( + npu_insts.extend( + aiex.npu._exec_write_bd_extend_shim_tile_opt( writebd_shimtile_insts, tensor_addr=xclbin._get_buffer_host_address(bd_id), ) ) - ipu_insts.extend( - aiex.ipu.shimtile_push_queue(S2MM, channel_index, col, bd_id) + npu_insts.extend( + aiex.npu.shimtile_push_queue(S2MM, channel_index, col, bd_id) ) - ipu_insts.extend(aiex.ipu.sync(column=col)) + npu_insts.extend(aiex.npu.sync(column=col)) - xclbin.load_ipu_instructions(ipu_insts) + xclbin.load_npu_instructions(npu_insts) wraps = list(map(np.asarray, views)) @@ -161,8 +161,8 @@ def test_manual_args_with_offset(ctx: MLIRContext, workdir: Path): iters = 10 loop = False - @aie.device(AIEDevice.ipu) - def ipu(): + @aie.device(AIEDevice.npu) + def npu(): tile_0_0 = aie.tile(0, 0) tile_0_1 = aie.tile(0, 1) tile_0_2 = aie.tile(0, 2) @@ -222,30 +222,30 @@ def dma6(): kernel_json = emit_design_kernel_json(buffer_args=buffer_args) xclbin_path = make_xclbin(ctx.module, workdir, kernel_json=kernel_json) - with FileLock("/tmp/ipu.lock"): + with FileLock("/tmp/npu.lock"): xclbin = XCLBin(xclbin_path, "MLIR_AIE") views = xclbin.mmap_buffers([(K * iters,)] * iters, np.int32) col = 0 channel_index = 0 - ipu_insts = aiex.ipu.get_prolog() + npu_insts = aiex.npu.get_prolog() for i in range(iters): bd_id = i - writebd_shimtile_insts = aiex.ipu.writebd_shimtile( + writebd_shimtile_insts = aiex.npu.writebd_shimtile( col, bd_id, buffer_length=K, buffer_offset=K * i ) - ipu_insts.extend( - aiex.ipu._exec_write_bd_extend_shim_tile_opt( + npu_insts.extend( + aiex.npu._exec_write_bd_extend_shim_tile_opt( writebd_shimtile_insts, tensor_addr=xclbin._get_buffer_host_address(i), ) ) - ipu_insts.extend( - aiex.ipu.shimtile_push_queue(S2MM, channel_index, col, bd_id) + npu_insts.extend( + aiex.npu.shimtile_push_queue(S2MM, channel_index, col, bd_id) ) - ipu_insts.extend(aiex.ipu.sync(column=col)) + npu_insts.extend(aiex.npu.sync(column=col)) - xclbin.load_ipu_instructions(ipu_insts) + xclbin.load_npu_instructions(npu_insts) wraps = list(map(np.asarray, views)) @@ -268,8 +268,8 @@ def test_manual_args_with_different_cols(ctx: MLIRContext, workdir: Path): RANDOM_WEIGHT = np.random.randint(0, 10, (K,), dtype=np.int32) cols = [0, 1, 2, 3] - @aie.device(AIEDevice.ipu) - def ipu(): + @aie.device(AIEDevice.npu) + def npu(): for c in cols: tile_c_0 = aie.tile(c, 0) tile_c_2 = aie.tile(c, 2) @@ -306,29 +306,29 @@ def dma3(): kernel_json = emit_design_kernel_json(buffer_args=buffer_args) xclbin_path = make_xclbin(ctx.module, workdir, kernel_json=kernel_json) - with FileLock("/tmp/ipu.lock"): + with FileLock("/tmp/npu.lock"): xclbin = XCLBin(xclbin_path, "MLIR_AIE") views = xclbin.mmap_buffers([(K,)] * len(cols), np.int32) bd_id = 0 channel_index = 0 - ipu_insts = aiex.ipu.get_prolog() + npu_insts = aiex.npu.get_prolog() for col in cols: - writebd_shimtile_insts = aiex.ipu.writebd_shimtile( + writebd_shimtile_insts = aiex.npu.writebd_shimtile( col, bd_id, buffer_length=K ) - ipu_insts.extend( - aiex.ipu._exec_write_bd_extend_shim_tile_opt( + npu_insts.extend( + aiex.npu._exec_write_bd_extend_shim_tile_opt( writebd_shimtile_insts, tensor_addr=xclbin._get_buffer_host_address(col), ) ) - ipu_insts.extend( - aiex.ipu.shimtile_push_queue(S2MM, channel_index, col, bd_id) + npu_insts.extend( + aiex.npu.shimtile_push_queue(S2MM, channel_index, col, bd_id) ) - ipu_insts.extend(aiex.ipu.sync(column=col)) + npu_insts.extend(aiex.npu.sync(column=col)) - xclbin.load_ipu_instructions(ipu_insts) + xclbin.load_npu_instructions(npu_insts) wraps = list(map(np.asarray, views)) @@ -353,8 +353,8 @@ def test_manual_args_with_shim_dma(ctx: MLIRContext, workdir: Path): iters = 21 - @aie.device(AIEDevice.ipu) - def ipu(): + @aie.device(AIEDevice.npu) + def npu(): if 0 not in cols: tile_dummy = aie.tile(0, 3) for c in cols: @@ -408,20 +408,20 @@ def dma(): kernel_json = emit_design_kernel_json(buffer_args=buffer_args) xclbin_path = make_xclbin(ctx.module, workdir, kernel_json=kernel_json) - with FileLock("/tmp/ipu.lock"): + with FileLock("/tmp/npu.lock"): xclbin = XCLBin(xclbin_path, "MLIR_AIE") views = xclbin.mmap_buffers([(K,)] * len(cols), np.int32) bd_id = 0 - ipu_insts = aiex.ipu.get_prolog() + npu_insts = aiex.npu.get_prolog() for i, col in enumerate(cols): - update_addrs = aiex.ipu._update_tensor_addr_shim_tile( + update_addrs = aiex.npu._update_tensor_addr_shim_tile( col, bd_id, tensor_addr=xclbin._get_buffer_host_address(i) ) - ipu_insts.extend(update_addrs) - ipu_insts.extend(aiex.ipu.enable_cores(col, compute_tile_row)) + npu_insts.extend(update_addrs) + npu_insts.extend(aiex.npu.enable_cores(col, compute_tile_row)) - xclbin.load_ipu_instructions(ipu_insts) + xclbin.load_npu_instructions(npu_insts) wraps = list(map(np.asarray, views)) diff --git a/test/ipu-xrt/e2e/test_nonsquare_matrix_mult.py b/test/ipu-xrt/e2e/test_nonsquare_matrix_mult.py index 20c5998709..0489b46381 100644 --- a/test/ipu-xrt/e2e/test_nonsquare_matrix_mult.py +++ b/test/ipu-xrt/e2e/test_nonsquare_matrix_mult.py @@ -45,10 +45,10 @@ def test_nonsquare_matrix_mult(ctx: MLIRContext, workdir: Path): M, K, N = 16, 32, 16 - ipu_insts = aiex.ipu.get_prolog() + npu_insts = aiex.npu.get_prolog() - @aie.device(AIEDevice.ipu) - def ipu(): + @aie.device(AIEDevice.npu) + def npu(): tile_0_0 = aie.tile(0, 0) tile_0_1 = aie.tile(0, 1) tile_0_2 = aie.tile(0, 2) @@ -91,8 +91,8 @@ def ipu(): channel_index = 0 ddr_id = 0 bd_id = 0 - ipu_insts.extend( - aiex.ipu.writebd_shimtile( + npu_insts.extend( + aiex.npu.writebd_shimtile( col, bd_id, buffer_length=M * K, @@ -100,14 +100,14 @@ def ipu(): ddr_id=ddr_id, ) ) - ipu_insts.extend(aiex.ipu.shimtile_push_queue(MM2S, channel_index, col, bd_id)) + npu_insts.extend(aiex.npu.shimtile_push_queue(MM2S, channel_index, col, bd_id)) # in B channel_index = 1 ddr_id = 1 bd_id += 1 - ipu_insts.extend( - aiex.ipu.writebd_shimtile( + npu_insts.extend( + aiex.npu.writebd_shimtile( col, bd_id, buffer_length=K * N, @@ -115,14 +115,14 @@ def ipu(): ddr_id=ddr_id, ) ) - ipu_insts.extend(aiex.ipu.shimtile_push_queue(MM2S, channel_index, col, bd_id)) + npu_insts.extend(aiex.npu.shimtile_push_queue(MM2S, channel_index, col, bd_id)) # out C channel_index = 0 ddr_id = 2 bd_id += 1 - ipu_insts.extend( - aiex.ipu.writebd_shimtile( + npu_insts.extend( + aiex.npu.writebd_shimtile( col, bd_id, buffer_length=M * N, @@ -130,9 +130,9 @@ def ipu(): ddr_id=ddr_id, ) ) - ipu_insts.extend(aiex.ipu.shimtile_push_queue(S2MM, channel_index, col, bd_id)) - ipu_insts.extend( - aiex.ipu.sync( + npu_insts.extend(aiex.npu.shimtile_push_queue(S2MM, channel_index, col, bd_id)) + npu_insts.extend( + aiex.npu.sync( channel=0, column=0, column_num=1, @@ -235,9 +235,9 @@ def core(): compile_without_vectorization(ctx.module, workdir) xclbin_path = make_xclbin(ctx.module, workdir) - with FileLock("/tmp/ipu.lock"): + with FileLock("/tmp/npu.lock"): xclbin = XCLBin(xclbin_path, "MLIR_AIE") - xclbin.load_ipu_instructions(ipu_insts) + xclbin.load_npu_instructions(npu_insts) views = xclbin.mmap_buffers([(M, K), (K, N), (M, N)], np.int32) wrap_A = np.asarray(views[0]) @@ -268,10 +268,10 @@ def core(): def test_nonsquare_matrix_mult_sugar(ctx: MLIRContext, workdir: Path): M, K, N = 16, 32, 16 - ipu_insts = aiex.ipu.get_prolog() + npu_insts = aiex.npu.get_prolog() - @aie.device(AIEDevice.ipu) - def ipu(): + @aie.device(AIEDevice.npu) + def npu(): tile_0_0 = aie.tile(0, 0) tile_0_1 = aie.tile(0, 1) tile_0_2 = aie.tile(0, 2) @@ -309,8 +309,8 @@ def ipu(): channel_index = 0 ddr_id = 0 bd_id = 0 - ipu_insts.extend( - aiex.ipu.writebd_shimtile( + npu_insts.extend( + aiex.npu.writebd_shimtile( col, bd_id, buffer_length=M * K, @@ -318,14 +318,14 @@ def ipu(): ddr_id=ddr_id, ) ) - ipu_insts.extend(aiex.ipu.shimtile_push_queue(MM2S, channel_index, col, bd_id)) + npu_insts.extend(aiex.npu.shimtile_push_queue(MM2S, channel_index, col, bd_id)) # in B channel_index = 1 ddr_id = 1 bd_id += 1 - ipu_insts.extend( - aiex.ipu.writebd_shimtile( + npu_insts.extend( + aiex.npu.writebd_shimtile( col, bd_id, buffer_length=K * N, @@ -333,14 +333,14 @@ def ipu(): ddr_id=ddr_id, ) ) - ipu_insts.extend(aiex.ipu.shimtile_push_queue(MM2S, channel_index, col, bd_id)) + npu_insts.extend(aiex.npu.shimtile_push_queue(MM2S, channel_index, col, bd_id)) # out C channel_index = 0 ddr_id = 2 bd_id += 1 - ipu_insts.extend( - aiex.ipu.writebd_shimtile( + npu_insts.extend( + aiex.npu.writebd_shimtile( col, bd_id, buffer_length=M * N, @@ -348,9 +348,9 @@ def ipu(): ddr_id=ddr_id, ) ) - ipu_insts.extend(aiex.ipu.shimtile_push_queue(S2MM, channel_index, col, bd_id)) - ipu_insts.extend( - aiex.ipu.sync( + npu_insts.extend(aiex.npu.shimtile_push_queue(S2MM, channel_index, col, bd_id)) + npu_insts.extend( + aiex.npu.sync( channel=0, column=0, column_num=1, @@ -417,9 +417,9 @@ def core(): compile_without_vectorization(ctx.module, workdir) xclbin_path = make_xclbin(ctx.module, workdir) - with FileLock("/tmp/ipu.lock"): + with FileLock("/tmp/npu.lock"): xclbin = XCLBin(xclbin_path, "MLIR_AIE") - xclbin.load_ipu_instructions(ipu_insts) + xclbin.load_npu_instructions(npu_insts) views = xclbin.mmap_buffers([(M, K), (K, N), (M, N)], np.int32) wrap_A = np.asarray(views[0]) diff --git a/test/ipu-xrt/e2e/test_nonsquare_matrix_mult_vectorized.py b/test/ipu-xrt/e2e/test_nonsquare_matrix_mult_vectorized.py index ae1079fd4e..816ddba151 100644 --- a/test/ipu-xrt/e2e/test_nonsquare_matrix_mult_vectorized.py +++ b/test/ipu-xrt/e2e/test_nonsquare_matrix_mult_vectorized.py @@ -64,11 +64,11 @@ def matmul_i32_i32( def test_nonsquare_matrix_mult_vectorized(ctx: MLIRContext, workdir: Path): - ipu_insts = aiex.ipu.get_prolog() + npu_insts = aiex.npu.get_prolog() mod_aie = ExplicitlyManagedModule() - @aie.device(AIEDevice.ipu) - def ipu(): + @aie.device(AIEDevice.npu) + def npu(): matmul_i32_i32.emit(decl=True) tile_0_0 = aie.tile(0, 0) tile_0_1 = aie.tile(0, 1) @@ -112,8 +112,8 @@ def ipu(): channel_index = 0 ddr_id = 0 bd_id = 0 - ipu_insts.extend( - aiex.ipu.writebd_shimtile( + npu_insts.extend( + aiex.npu.writebd_shimtile( col, bd_id, buffer_length=M * K, @@ -121,14 +121,14 @@ def ipu(): ddr_id=ddr_id, ) ) - ipu_insts.extend(aiex.ipu.shimtile_push_queue(MM2S, channel_index, col, bd_id)) + npu_insts.extend(aiex.npu.shimtile_push_queue(MM2S, channel_index, col, bd_id)) # in B channel_index = 1 ddr_id = 1 bd_id += 1 - ipu_insts.extend( - aiex.ipu.writebd_shimtile( + npu_insts.extend( + aiex.npu.writebd_shimtile( col, bd_id, buffer_length=K * N, @@ -136,14 +136,14 @@ def ipu(): ddr_id=ddr_id, ) ) - ipu_insts.extend(aiex.ipu.shimtile_push_queue(MM2S, channel_index, col, bd_id)) + npu_insts.extend(aiex.npu.shimtile_push_queue(MM2S, channel_index, col, bd_id)) # out C channel_index = 0 ddr_id = 2 bd_id += 1 - ipu_insts.extend( - aiex.ipu.writebd_shimtile( + npu_insts.extend( + aiex.npu.writebd_shimtile( col, bd_id, buffer_length=M * N, @@ -151,9 +151,9 @@ def ipu(): ddr_id=ddr_id, ) ) - ipu_insts.extend(aiex.ipu.shimtile_push_queue(S2MM, channel_index, col, bd_id)) - ipu_insts.extend( - aiex.ipu.sync( + npu_insts.extend(aiex.npu.shimtile_push_queue(S2MM, channel_index, col, bd_id)) + npu_insts.extend( + aiex.npu.sync( channel=0, column=0, column_num=1, @@ -327,9 +327,9 @@ def super_vectorize(target: any_op_t()): compile_with_vectorization(mod_aie, mod_aievec, workdir) xclbin_path = make_xclbin(mod_aie, workdir) - with FileLock("/tmp/ipu.lock"): + with FileLock("/tmp/npu.lock"): xclbin = XCLBin(xclbin_path, "MLIR_AIE") - xclbin.load_ipu_instructions(ipu_insts) + xclbin.load_npu_instructions(npu_insts) views = xclbin.mmap_buffers([(M, K), (K, N), (M, N)], np.int32) wrap_A = np.asarray(views[0]) @@ -359,11 +359,11 @@ def super_vectorize(target: any_op_t()): def test_nonsquare_matrix_mult_vectorized_sugar(ctx: MLIRContext, workdir: Path): - ipu_insts = aiex.ipu.get_prolog() + npu_insts = aiex.npu.get_prolog() mod_aie = ExplicitlyManagedModule() - @aie.device(AIEDevice.ipu) - def ipu(): + @aie.device(AIEDevice.npu) + def npu(): matmul_i32_i32.emit(decl=True) tile_0_0 = aie.tile(0, 0) tile_0_1 = aie.tile(0, 1) @@ -402,8 +402,8 @@ def ipu(): channel_index = 0 ddr_id = 0 bd_id = 0 - ipu_insts.extend( - aiex.ipu.writebd_shimtile( + npu_insts.extend( + aiex.npu.writebd_shimtile( col, bd_id, buffer_length=M * K, @@ -411,14 +411,14 @@ def ipu(): ddr_id=ddr_id, ) ) - ipu_insts.extend(aiex.ipu.shimtile_push_queue(MM2S, channel_index, col, bd_id)) + npu_insts.extend(aiex.npu.shimtile_push_queue(MM2S, channel_index, col, bd_id)) # in B channel_index = 1 ddr_id = 1 bd_id += 1 - ipu_insts.extend( - aiex.ipu.writebd_shimtile( + npu_insts.extend( + aiex.npu.writebd_shimtile( col, bd_id, buffer_length=K * N, @@ -426,14 +426,14 @@ def ipu(): ddr_id=ddr_id, ) ) - ipu_insts.extend(aiex.ipu.shimtile_push_queue(MM2S, channel_index, col, bd_id)) + npu_insts.extend(aiex.npu.shimtile_push_queue(MM2S, channel_index, col, bd_id)) # out C channel_index = 0 ddr_id = 2 bd_id += 1 - ipu_insts.extend( - aiex.ipu.writebd_shimtile( + npu_insts.extend( + aiex.npu.writebd_shimtile( col, bd_id, buffer_length=M * N, @@ -441,9 +441,9 @@ def ipu(): ddr_id=ddr_id, ) ) - ipu_insts.extend(aiex.ipu.shimtile_push_queue(S2MM, channel_index, col, bd_id)) - ipu_insts.extend( - aiex.ipu.sync( + npu_insts.extend(aiex.npu.shimtile_push_queue(S2MM, channel_index, col, bd_id)) + npu_insts.extend( + aiex.npu.sync( channel=0, column=0, column_num=1, @@ -579,9 +579,9 @@ def super_vectorize(target: any_op_t()): ) compile_with_vectorization(mod_aie, mod_aievec, workdir) xclbin_path = make_xclbin(mod_aie, workdir) - with FileLock("/tmp/ipu.lock"): + with FileLock("/tmp/npu.lock"): xclbin = XCLBin(xclbin_path, "MLIR_AIE") - xclbin.load_ipu_instructions(ipu_insts) + xclbin.load_npu_instructions(npu_insts) views = xclbin.mmap_buffers([(M, K), (K, N), (M, N)], np.int32) wrap_A = np.asarray(views[0]) diff --git a/test/ipu-xrt/e2e/test_offsets_sizes_strides.py b/test/ipu-xrt/e2e/test_offsets_sizes_strides.py index 1262d59bf3..ecae0293ba 100644 --- a/test/ipu-xrt/e2e/test_offsets_sizes_strides.py +++ b/test/ipu-xrt/e2e/test_offsets_sizes_strides.py @@ -54,10 +54,10 @@ def test_offsets_sizes_strides(ctx: MLIRContext, workdir: Path): tile_m_B, tile_n_B = M // tile_rows_B, N // tile_cols_B tile_m_C, tile_n_C = M // tile_rows_C, N // tile_cols_C - ipu_insts = aiex.ipu.get_prolog() + npu_insts = aiex.npu.get_prolog() - @aie.device(AIEDevice.ipu) - def ipu(): + @aie.device(AIEDevice.npu) + def npu(): tile_0_0 = aie.tile(0, 0) tile_0_1 = aie.tile(0, 1) tile_0_2 = aie.tile(0, 2) @@ -102,8 +102,8 @@ def ipu(): channel_index = 0 ddr_id = 0 for i, bd_id in enumerate(range(4)): - ipu_insts.extend( - aiex.ipu.writebd_shimtile( + npu_insts.extend( + aiex.npu.writebd_shimtile( col, bd_id, 64, @@ -115,16 +115,16 @@ def ipu(): d0_stride=1, ) ) - ipu_insts.extend( - aiex.ipu.shimtile_push_queue(MM2S, channel_index, col, bd_id) + npu_insts.extend( + aiex.npu.shimtile_push_queue(MM2S, channel_index, col, bd_id) ) # in B channel_index = 1 ddr_id = 1 for i, bd_id in enumerate(range(bd_id + 1, bd_id + 1 + 4)): - ipu_insts.extend( - aiex.ipu.writebd_shimtile( + npu_insts.extend( + aiex.npu.writebd_shimtile( col, bd_id, 64, @@ -136,16 +136,16 @@ def ipu(): d0_stride=1, ) ) - ipu_insts.extend( - aiex.ipu.shimtile_push_queue(MM2S, channel_index, col, bd_id) + npu_insts.extend( + aiex.npu.shimtile_push_queue(MM2S, channel_index, col, bd_id) ) # out C channel_index = 0 ddr_id = 2 for i, bd_id in enumerate(range(bd_id + 1, bd_id + 1 + 4)): - ipu_insts.extend( - aiex.ipu.writebd_shimtile( + npu_insts.extend( + aiex.npu.writebd_shimtile( col, bd_id, 64, @@ -157,11 +157,11 @@ def ipu(): d0_stride=1, ) ) - ipu_insts.extend( - aiex.ipu.shimtile_push_queue(S2MM, channel_index, col, bd_id) + npu_insts.extend( + aiex.npu.shimtile_push_queue(S2MM, channel_index, col, bd_id) ) - ipu_insts.extend( - aiex.ipu.sync( + npu_insts.extend( + aiex.npu.sync( channel=0, column=0, column_num=1, direction=0, row=0, row_num=1 ) ) @@ -257,9 +257,9 @@ def core(): compile_without_vectorization(ctx.module, workdir) xclbin_path = make_xclbin(ctx.module, workdir) - with FileLock("/tmp/ipu.lock"): + with FileLock("/tmp/npu.lock"): xclbin = XCLBin(xclbin_path, "MLIR_AIE") - xclbin.load_ipu_instructions(ipu_insts) + xclbin.load_npu_instructions(npu_insts) views = xclbin.mmap_buffers([(M, N), (M, N), (M, N)], np.int32) wrap_A = np.asarray(views[0]) diff --git a/test/ipu-xrt/e2e/test_repeat_count.py b/test/ipu-xrt/e2e/test_repeat_count.py index c769770283..e350931920 100644 --- a/test/ipu-xrt/e2e/test_repeat_count.py +++ b/test/ipu-xrt/e2e/test_repeat_count.py @@ -53,10 +53,10 @@ def test_repeat_count(ctx: MLIRContext, workdir: Path): iters = 4 loop = False RANDOM_WEIGHT = np.random.randint(0, 10, (K,), dtype=np.int32) - ipu_insts = aiex.ipu.get_prolog() + npu_insts = aiex.npu.get_prolog() - @aie.device(AIEDevice.ipu) - def ipu(): + @aie.device(AIEDevice.npu) + def npu(): tile_0_0 = aie.tile(0, 0) tile_0_1 = aie.tile(0, 1) tile_0_2 = aie.tile(0, 2) @@ -109,8 +109,8 @@ def dma6(): ddr_id = 0 col = 0 for i, bd_id in enumerate(range(iters)): - ipu_insts.extend( - aiex.ipu.writebd_shimtile( + npu_insts.extend( + aiex.npu.writebd_shimtile( col, bd_id, buffer_length=K, @@ -118,11 +118,11 @@ def dma6(): ddr_id=ddr_id, ) ) - ipu_insts.extend( - aiex.ipu.shimtile_push_queue(S2MM, channel_index, col, bd_id) + npu_insts.extend( + aiex.npu.shimtile_push_queue(S2MM, channel_index, col, bd_id) ) - ipu_insts.extend( - aiex.ipu.sync( + npu_insts.extend( + aiex.npu.sync( channel=0, column=col, column_num=1, @@ -136,9 +136,9 @@ def dma6(): compile_without_vectorization(ctx.module, workdir) xclbin_path = make_xclbin(ctx.module, workdir) - with FileLock("/tmp/ipu.lock"): + with FileLock("/tmp/npu.lock"): xclbin = XCLBin(xclbin_path, "MLIR_AIE") - xclbin.load_ipu_instructions(ipu_insts) + xclbin.load_npu_instructions(npu_insts) views = xclbin.mmap_buffers([(iters * K,)], np.int32) wrap_C = np.asarray(views[0]) @@ -165,10 +165,10 @@ def test_no_loop(ctx: MLIRContext, workdir: Path): RANDOM_WEIGHT = np.ones((K,), dtype=np.int32) * random.randint(1, 100) col = 2 iters = 10 - ipu_insts = aiex.ipu.get_prolog() + npu_insts = aiex.npu.get_prolog() - @aie.device(AIEDevice.ipu) - def ipu(): + @aie.device(AIEDevice.npu) + def npu(): nonlocal col if col != 0: @@ -206,28 +206,28 @@ def dma3(): compile_without_vectorization(ctx.module, workdir) xclbin_path = make_xclbin(ctx.module, workdir) - with FileLock("/tmp/ipu.lock"): + with FileLock("/tmp/npu.lock"): xclbin = XCLBin(xclbin_path, "MLIR_AIE") views = xclbin.mmap_buffers([(K,)], np.int32) channel_index = 0 ddr_id = 0 bd_id = 0 - ipu_insts.extend( - aiex.ipu.writebd_shimtile( + npu_insts.extend( + aiex.npu.writebd_shimtile( col, bd_id, buffer_length=K, ddr_id=ddr_id, ) ) - ipu_insts.extend( - aiex.ipu.shimtile_push_queue( + npu_insts.extend( + aiex.npu.shimtile_push_queue( S2MM, channel_index, col, bd_id, repeats=iters - 1 ) ) - ipu_insts.extend( - aiex.ipu.sync( + npu_insts.extend( + aiex.npu.sync( channel=0, column=col, column_num=1, @@ -237,7 +237,7 @@ def dma3(): ) ) - xclbin.load_ipu_instructions(ipu_insts) + xclbin.load_npu_instructions(npu_insts) wraps = list(map(np.asarray, views)) diff --git a/test/ipu-xrt/e2e/test_shared_buffers_init_value.py b/test/ipu-xrt/e2e/test_shared_buffers_init_value.py index 461031ebb3..e8fb6b4ebe 100644 --- a/test/ipu-xrt/e2e/test_shared_buffers_init_value.py +++ b/test/ipu-xrt/e2e/test_shared_buffers_init_value.py @@ -38,10 +38,10 @@ def test_foursome(ctx: MLIRContext, workdir: Path): init_weights = [np.random.randint(0, 10, (K,), dtype=np.int32) for _ in range(7)] random_numbers = [random.randint(0, 10) for _ in range(7, 7 + 3)] - ipu_insts = aiex.ipu.get_prolog() + npu_insts = aiex.npu.get_prolog() - @aie.device(AIEDevice.ipu) - def ipu(): + @aie.device(AIEDevice.npu) + def npu(): _dummy_tile = aie.tile(0, 2) # west @@ -170,8 +170,8 @@ def memtile_dma(): ddr_id = 0 bd_id = 0 - ipu_insts.extend( - aiex.ipu.writebd_shimtile( + npu_insts.extend( + aiex.npu.writebd_shimtile( column=shim_tile_column, bd_id=bd_id, buffer_length=K, @@ -179,16 +179,16 @@ def memtile_dma(): ddr_id=ddr_id, ) ) - ipu_insts.extend( - aiex.ipu.shimtile_push_queue( + npu_insts.extend( + aiex.npu.shimtile_push_queue( channel_dir=S2MM, channel_index=flow_to_shim.dest_channel, column=shim_tile_column, bd_id=bd_id, ) ) - ipu_insts.extend( - aiex.ipu.sync( + npu_insts.extend( + aiex.npu.sync( channel=flow_to_shim.dest_channel, column=shim_tile_column, direction=S2MM, @@ -198,9 +198,9 @@ def memtile_dma(): compile_without_vectorization(ctx.module, workdir) xclbin_path = make_xclbin(ctx.module, workdir) - with FileLock("/tmp/ipu.lock"): + with FileLock("/tmp/npu.lock"): xclbin = XCLBin(xclbin_path, "MLIR_AIE") - xclbin.load_ipu_instructions(ipu_insts) + xclbin.load_npu_instructions(npu_insts) [c] = xclbin.mmap_buffers([(K,)], np.int32) wrap_C = np.asarray(c) C = np.zeros((K,), dtype=np.int32) diff --git a/test/ipu-xrt/e2e/test_square_matrix_mult.py b/test/ipu-xrt/e2e/test_square_matrix_mult.py index 6f746fc490..b229c3a1a8 100644 --- a/test/ipu-xrt/e2e/test_square_matrix_mult.py +++ b/test/ipu-xrt/e2e/test_square_matrix_mult.py @@ -45,10 +45,10 @@ def test_square_matrix_mult(ctx: MLIRContext, workdir: Path): M = N = 16 - ipu_insts = aiex.ipu.get_prolog() + npu_insts = aiex.npu.get_prolog() - @aie.device(AIEDevice.ipu) - def ipu(): + @aie.device(AIEDevice.npu) + def npu(): tile_0_0 = aie.tile(0, 0) tile_0_1 = aie.tile(0, 1) tile_0_2 = aie.tile(0, 2) @@ -91,8 +91,8 @@ def ipu(): channel_index = 0 ddr_id = 0 bd_id = 0 - ipu_insts.extend( - aiex.ipu.writebd_shimtile( + npu_insts.extend( + aiex.npu.writebd_shimtile( col, bd_id, buffer_length=M * N, @@ -100,14 +100,14 @@ def ipu(): ddr_id=ddr_id, ) ) - ipu_insts.extend(aiex.ipu.shimtile_push_queue(MM2S, channel_index, col, bd_id)) + npu_insts.extend(aiex.npu.shimtile_push_queue(MM2S, channel_index, col, bd_id)) # in B channel_index = 1 ddr_id = 1 bd_id += 1 - ipu_insts.extend( - aiex.ipu.writebd_shimtile( + npu_insts.extend( + aiex.npu.writebd_shimtile( col, bd_id, buffer_length=M * N, @@ -115,14 +115,14 @@ def ipu(): ddr_id=ddr_id, ) ) - ipu_insts.extend(aiex.ipu.shimtile_push_queue(MM2S, channel_index, col, bd_id)) + npu_insts.extend(aiex.npu.shimtile_push_queue(MM2S, channel_index, col, bd_id)) # out C channel_index = 0 ddr_id = 2 bd_id += 1 - ipu_insts.extend( - aiex.ipu.writebd_shimtile( + npu_insts.extend( + aiex.npu.writebd_shimtile( col, bd_id, buffer_length=M * N, @@ -130,9 +130,9 @@ def ipu(): ddr_id=ddr_id, ) ) - ipu_insts.extend(aiex.ipu.shimtile_push_queue(S2MM, channel_index, col, bd_id)) - ipu_insts.extend( - aiex.ipu.sync( + npu_insts.extend(aiex.npu.shimtile_push_queue(S2MM, channel_index, col, bd_id)) + npu_insts.extend( + aiex.npu.sync( channel=0, column=0, column_num=1, @@ -229,9 +229,9 @@ def core(): compile_without_vectorization(ctx.module, workdir) xclbin_path = make_xclbin(ctx.module, workdir) - with FileLock("/tmp/ipu.lock"): + with FileLock("/tmp/npu.lock"): xclbin = XCLBin(xclbin_path, "MLIR_AIE") - xclbin.load_ipu_instructions(ipu_insts) + xclbin.load_npu_instructions(npu_insts) views = xclbin.mmap_buffers([(M, N), (M, N), (M, N)], np.int32) wrap_A = np.asarray(views[0]) @@ -262,10 +262,10 @@ def core(): def test_square_matrix_mult_sugar(ctx: MLIRContext, workdir: Path): M = N = 16 - ipu_insts = aiex.ipu.get_prolog() + npu_insts = aiex.npu.get_prolog() - @aie.device(AIEDevice.ipu) - def ipu(): + @aie.device(AIEDevice.npu) + def npu(): tile_0_0 = aie.tile(0, 0) tile_0_1 = aie.tile(0, 1) tile_0_2 = aie.tile(0, 2) @@ -299,8 +299,8 @@ def ipu(): channel_index = 0 ddr_id = 0 bd_id = 0 - ipu_insts.extend( - aiex.ipu.writebd_shimtile( + npu_insts.extend( + aiex.npu.writebd_shimtile( col, bd_id, buffer_length=M * N, @@ -308,14 +308,14 @@ def ipu(): ddr_id=ddr_id, ) ) - ipu_insts.extend(aiex.ipu.shimtile_push_queue(MM2S, channel_index, col, bd_id)) + npu_insts.extend(aiex.npu.shimtile_push_queue(MM2S, channel_index, col, bd_id)) # in B channel_index = 1 ddr_id = 1 bd_id += 1 - ipu_insts.extend( - aiex.ipu.writebd_shimtile( + npu_insts.extend( + aiex.npu.writebd_shimtile( col, bd_id, buffer_length=M * N, @@ -323,14 +323,14 @@ def ipu(): ddr_id=ddr_id, ) ) - ipu_insts.extend(aiex.ipu.shimtile_push_queue(MM2S, channel_index, col, bd_id)) + npu_insts.extend(aiex.npu.shimtile_push_queue(MM2S, channel_index, col, bd_id)) # out C channel_index = 0 ddr_id = 2 bd_id += 1 - ipu_insts.extend( - aiex.ipu.writebd_shimtile( + npu_insts.extend( + aiex.npu.writebd_shimtile( col, bd_id, buffer_length=M * N, @@ -338,9 +338,9 @@ def ipu(): ddr_id=ddr_id, ) ) - ipu_insts.extend(aiex.ipu.shimtile_push_queue(S2MM, channel_index, col, bd_id)) - ipu_insts.extend( - aiex.ipu.sync( + npu_insts.extend(aiex.npu.shimtile_push_queue(S2MM, channel_index, col, bd_id)) + npu_insts.extend( + aiex.npu.sync( channel=0, column=0, column_num=1, @@ -397,9 +397,9 @@ def core(): compile_without_vectorization(ctx.module, workdir) xclbin_path = make_xclbin(ctx.module, workdir) - with FileLock("/tmp/ipu.lock"): + with FileLock("/tmp/npu.lock"): xclbin = XCLBin(xclbin_path, "MLIR_AIE") - xclbin.load_ipu_instructions(ipu_insts) + xclbin.load_npu_instructions(npu_insts) views = xclbin.mmap_buffers([(M, N), (M, N), (M, N)], np.int32) wrap_A = np.asarray(views[0]) diff --git a/test/ipu-xrt/e2e/test_square_matrix_mult_vectorized.py b/test/ipu-xrt/e2e/test_square_matrix_mult_vectorized.py index b11e4463f8..55a8feeb3c 100644 --- a/test/ipu-xrt/e2e/test_square_matrix_mult_vectorized.py +++ b/test/ipu-xrt/e2e/test_square_matrix_mult_vectorized.py @@ -64,12 +64,12 @@ def matmul_i32_i32( def test_square_matrix_mult_vectorized(ctx: MLIRContext, workdir: Path): - ipu_insts = aiex.ipu.get_prolog() + npu_insts = aiex.npu.get_prolog() mod_aie = ExplicitlyManagedModule() - @aie.device(AIEDevice.ipu) - def ipu(): + @aie.device(AIEDevice.npu) + def npu(): matmul_i32_i32.emit(decl=True) tile_0_0 = aie.tile(0, 0) tile_0_1 = aie.tile(0, 1) @@ -113,8 +113,8 @@ def ipu(): channel_index = 0 ddr_id = 0 bd_id = 0 - ipu_insts.extend( - aiex.ipu.writebd_shimtile( + npu_insts.extend( + aiex.npu.writebd_shimtile( col, bd_id, buffer_length=M * N, @@ -122,14 +122,14 @@ def ipu(): ddr_id=ddr_id, ) ) - ipu_insts.extend(aiex.ipu.shimtile_push_queue(MM2S, channel_index, col, bd_id)) + npu_insts.extend(aiex.npu.shimtile_push_queue(MM2S, channel_index, col, bd_id)) # in B channel_index = 1 ddr_id = 1 bd_id += 1 - ipu_insts.extend( - aiex.ipu.writebd_shimtile( + npu_insts.extend( + aiex.npu.writebd_shimtile( col, bd_id, buffer_length=M * N, @@ -137,14 +137,14 @@ def ipu(): ddr_id=ddr_id, ) ) - ipu_insts.extend(aiex.ipu.shimtile_push_queue(MM2S, channel_index, col, bd_id)) + npu_insts.extend(aiex.npu.shimtile_push_queue(MM2S, channel_index, col, bd_id)) # out C channel_index = 0 ddr_id = 2 bd_id += 1 - ipu_insts.extend( - aiex.ipu.writebd_shimtile( + npu_insts.extend( + aiex.npu.writebd_shimtile( col, bd_id, buffer_length=M * N, @@ -152,9 +152,9 @@ def ipu(): ddr_id=ddr_id, ) ) - ipu_insts.extend(aiex.ipu.shimtile_push_queue(S2MM, channel_index, col, bd_id)) - ipu_insts.extend( - aiex.ipu.sync( + npu_insts.extend(aiex.npu.shimtile_push_queue(S2MM, channel_index, col, bd_id)) + npu_insts.extend( + aiex.npu.sync( channel=0, column=0, column_num=1, @@ -326,9 +326,9 @@ def super_vectorize(target: any_op_t()): compile_with_vectorization(mod_aie, mod_aievec, workdir) xclbin_path = make_xclbin(mod_aie, workdir) - with FileLock("/tmp/ipu.lock"): + with FileLock("/tmp/npu.lock"): xclbin = XCLBin(xclbin_path, "MLIR_AIE") - xclbin.load_ipu_instructions(ipu_insts) + xclbin.load_npu_instructions(npu_insts) views = xclbin.mmap_buffers([(M, N), (M, N), (M, N)], np.int32) wrap_A = np.asarray(views[0]) @@ -358,11 +358,11 @@ def super_vectorize(target: any_op_t()): def test_square_matrix_mult_vectorized_sugar(ctx: MLIRContext, workdir: Path): - ipu_insts = aiex.ipu.get_prolog() + npu_insts = aiex.npu.get_prolog() mod_aie = ExplicitlyManagedModule() - @aie.device(AIEDevice.ipu) - def ipu(): + @aie.device(AIEDevice.npu) + def npu(): matmul_i32_i32.emit(decl=True) tile_0_0 = aie.tile(0, 0) tile_0_1 = aie.tile(0, 1) @@ -401,8 +401,8 @@ def ipu(): channel_index = 0 ddr_id = 0 bd_id = 0 - ipu_insts.extend( - aiex.ipu.writebd_shimtile( + npu_insts.extend( + aiex.npu.writebd_shimtile( col, bd_id, buffer_length=M * N, @@ -410,14 +410,14 @@ def ipu(): ddr_id=ddr_id, ) ) - ipu_insts.extend(aiex.ipu.shimtile_push_queue(MM2S, channel_index, col, bd_id)) + npu_insts.extend(aiex.npu.shimtile_push_queue(MM2S, channel_index, col, bd_id)) # in B channel_index = 1 ddr_id = 1 bd_id += 1 - ipu_insts.extend( - aiex.ipu.writebd_shimtile( + npu_insts.extend( + aiex.npu.writebd_shimtile( col, bd_id, buffer_length=M * N, @@ -425,14 +425,14 @@ def ipu(): ddr_id=ddr_id, ) ) - ipu_insts.extend(aiex.ipu.shimtile_push_queue(MM2S, channel_index, col, bd_id)) + npu_insts.extend(aiex.npu.shimtile_push_queue(MM2S, channel_index, col, bd_id)) # out C channel_index = 0 ddr_id = 2 bd_id += 1 - ipu_insts.extend( - aiex.ipu.writebd_shimtile( + npu_insts.extend( + aiex.npu.writebd_shimtile( col, bd_id, buffer_length=M * N, @@ -440,9 +440,9 @@ def ipu(): ddr_id=ddr_id, ) ) - ipu_insts.extend(aiex.ipu.shimtile_push_queue(S2MM, channel_index, col, bd_id)) - ipu_insts.extend( - aiex.ipu.sync( + npu_insts.extend(aiex.npu.shimtile_push_queue(S2MM, channel_index, col, bd_id)) + npu_insts.extend( + aiex.npu.sync( channel=0, column=0, column_num=1, @@ -581,9 +581,9 @@ def super_vectorize(target: any_op_t()): compile_with_vectorization(mod_aie, mod_aievec, workdir) xclbin_path = make_xclbin(mod_aie, workdir) - with FileLock("/tmp/ipu.lock"): + with FileLock("/tmp/npu.lock"): xclbin = XCLBin(xclbin_path, "MLIR_AIE") - xclbin.load_ipu_instructions(ipu_insts) + xclbin.load_npu_instructions(npu_insts) views = xclbin.mmap_buffers([(M, N), (M, N), (M, N)], np.int32) wrap_A = np.asarray(views[0]) diff --git a/test/ipu-xrt/e2e/test_tiled_matrix_add.py b/test/ipu-xrt/e2e/test_tiled_matrix_add.py index 00755104ed..21d90f3231 100644 --- a/test/ipu-xrt/e2e/test_tiled_matrix_add.py +++ b/test/ipu-xrt/e2e/test_tiled_matrix_add.py @@ -47,10 +47,10 @@ def test_tiled_matrix_add(ctx: MLIRContext, workdir: Path): _, _, (d1_size, d1_stride), (d0_size, d0_stride) = tiling_calculator_n_tiles( M, N, n_tile_rows=n_tile_rows, n_tile_cols=n_tile_cols ) - ipu_insts = aiex.ipu.get_prolog() + npu_insts = aiex.npu.get_prolog() - @aie.device(AIEDevice.ipu) - def ipu(): + @aie.device(AIEDevice.npu) + def npu(): tile_0_0 = aie.tile(0, 0) tile_0_1 = aie.tile(0, 1) tile_0_2 = aie.tile(0, 2) @@ -100,8 +100,8 @@ def ipu(): channel_index = 0 ddr_id = 0 for i, bd_id in enumerate(range(4)): - ipu_insts.extend( - aiex.ipu.writebd_shimtile( + npu_insts.extend( + aiex.npu.writebd_shimtile( col, bd_id, tile_rows * tile_cols, @@ -113,16 +113,16 @@ def ipu(): d0_stride=d0_stride, ) ) - ipu_insts.extend( - aiex.ipu.shimtile_push_queue(MM2S, channel_index, col, bd_id) + npu_insts.extend( + aiex.npu.shimtile_push_queue(MM2S, channel_index, col, bd_id) ) # in B channel_index = 1 ddr_id = 1 for i, bd_id in enumerate(range(bd_id + 1, bd_id + 1 + 4)): - ipu_insts.extend( - aiex.ipu.writebd_shimtile( + npu_insts.extend( + aiex.npu.writebd_shimtile( col, bd_id, tile_rows * tile_cols, @@ -134,16 +134,16 @@ def ipu(): d0_stride=d0_stride, ) ) - ipu_insts.extend( - aiex.ipu.shimtile_push_queue(MM2S, channel_index, col, bd_id) + npu_insts.extend( + aiex.npu.shimtile_push_queue(MM2S, channel_index, col, bd_id) ) # out C channel_index = 0 ddr_id = 2 for i, bd_id in enumerate(range(bd_id + 1, bd_id + 1 + 4)): - ipu_insts.extend( - aiex.ipu.writebd_shimtile( + npu_insts.extend( + aiex.npu.writebd_shimtile( col, bd_id, tile_rows * tile_cols, @@ -155,11 +155,11 @@ def ipu(): d0_stride=d0_stride, ) ) - ipu_insts.extend( - aiex.ipu.shimtile_push_queue(S2MM, channel_index, col, bd_id) + npu_insts.extend( + aiex.npu.shimtile_push_queue(S2MM, channel_index, col, bd_id) ) - ipu_insts.extend( - aiex.ipu.sync( + npu_insts.extend( + aiex.npu.sync( channel=0, column=0, column_num=1, direction=0, row=0, row_num=1 ) ) @@ -258,9 +258,9 @@ def core(): compile_without_vectorization(ctx.module, workdir) xclbin_path = make_xclbin(ctx.module, workdir) - with FileLock("/tmp/ipu.lock"): + with FileLock("/tmp/npu.lock"): xclbin = XCLBin(xclbin_path, "MLIR_AIE") - xclbin.load_ipu_instructions(ipu_insts) + xclbin.load_npu_instructions(npu_insts) views = xclbin.mmap_buffers([(M, N), (M, N), (M, N)], np.int32) wrap_A = np.asarray(views[0]) @@ -291,10 +291,10 @@ def test_matrix_add_sugar(ctx: MLIRContext, workdir: Path): _, _, (d1_size, d1_stride), (d0_size, d0_stride) = tiling_calculator_n_tiles( M, N, n_tile_rows=n_tile_rows, n_tile_cols=n_tile_cols ) - ipu_insts = aiex.ipu.get_prolog() + npu_insts = aiex.npu.get_prolog() - @aie.device(AIEDevice.ipu) - def ipu(): + @aie.device(AIEDevice.npu) + def npu(): shim_tile_0_0 = aie.tile(0, 0) mem_tile_0_1 = aie.tile(0, 1) compute_tile_0_2 = aie.tile(0, 2) @@ -359,8 +359,8 @@ def ipu(): # in A ddr_id = 0 for i, bd_id in enumerate(range(4)): - ipu_insts.extend( - aiex.ipu.writebd_shimtile( + npu_insts.extend( + aiex.npu.writebd_shimtile( col, bd_id, tile_rows * tile_cols, @@ -372,8 +372,8 @@ def ipu(): d0_stride=d0_stride, ) ) - ipu_insts.extend( - aiex.ipu.shimtile_push_queue( + npu_insts.extend( + aiex.npu.shimtile_push_queue( MM2S, input_a_tile_0_0_to_tile_0_1.source_channel, col, bd_id ) ) @@ -381,8 +381,8 @@ def ipu(): # in B ddr_id = 1 for i, bd_id in enumerate(range(bd_id + 1, bd_id + 1 + 4)): - ipu_insts.extend( - aiex.ipu.writebd_shimtile( + npu_insts.extend( + aiex.npu.writebd_shimtile( col, bd_id, tile_rows * tile_cols, @@ -394,8 +394,8 @@ def ipu(): d0_stride=d0_stride, ) ) - ipu_insts.extend( - aiex.ipu.shimtile_push_queue( + npu_insts.extend( + aiex.npu.shimtile_push_queue( MM2S, input_b_tile_0_0_to_tile_0_1.source_channel, col, bd_id ) ) @@ -403,8 +403,8 @@ def ipu(): # out C ddr_id = 2 for i, bd_id in enumerate(range(bd_id + 1, bd_id + 1 + 4)): - ipu_insts.extend( - aiex.ipu.writebd_shimtile( + npu_insts.extend( + aiex.npu.writebd_shimtile( col, bd_id, tile_rows * tile_cols, @@ -416,13 +416,13 @@ def ipu(): d0_stride=d0_stride, ) ) - ipu_insts.extend( - aiex.ipu.shimtile_push_queue( + npu_insts.extend( + aiex.npu.shimtile_push_queue( S2MM, output_c_tile_0_1_to_tile_0_0.dest_channel, col, bd_id ) ) - ipu_insts.extend( - aiex.ipu.sync( + npu_insts.extend( + aiex.npu.sync( channel=0, column=0, column_num=1, direction=0, row=0, row_num=1 ) ) @@ -496,9 +496,9 @@ def core(): compile_without_vectorization(ctx.module, workdir) xclbin_path = make_xclbin(ctx.module, workdir) - with FileLock("/tmp/ipu.lock"): + with FileLock("/tmp/npu.lock"): xclbin = XCLBin(xclbin_path, "MLIR_AIE") - xclbin.load_ipu_instructions(ipu_insts) + xclbin.load_npu_instructions(npu_insts) views = xclbin.mmap_buffers([(M, N), (M, N), (M, N)], np.int32) wrap_A = np.asarray(views[0]) diff --git a/test/ipu-xrt/e2e/test_tiled_nonsquare_spatial_tile_matrix_mult.py b/test/ipu-xrt/e2e/test_tiled_nonsquare_spatial_tile_matrix_mult.py index 1b19015d33..4e5e41615b 100644 --- a/test/ipu-xrt/e2e/test_tiled_nonsquare_spatial_tile_matrix_mult.py +++ b/test/ipu-xrt/e2e/test_tiled_nonsquare_spatial_tile_matrix_mult.py @@ -70,7 +70,7 @@ def shim_tensor_slice( M, N, n_tile_rows=n_tile_rows, n_tile_cols=n_tile_cols ) - ipu_insts = aiex.ipu.writebd_shimtile( + npu_insts = aiex.npu.writebd_shimtile( column=column, bd_id=bd_id, ddr_id=ddr_id, @@ -81,23 +81,23 @@ def shim_tensor_slice( d0_size=d0_size, d0_stride=d0_stride, ) - ipu_insts.extend( - aiex.ipu.shimtile_push_queue(channel_dir, channel_index, column, bd_id=bd_id) + npu_insts.extend( + aiex.npu.shimtile_push_queue(channel_dir, channel_index, column, bd_id=bd_id) ) - return ipu_insts + return npu_insts def shim_bd(direction, channel, buffer_length, column=0, bd_id=0, ddr_id=0): - ipu_insts = [] - ipu_insts.extend( - aiex.ipu.writebd_shimtile( + npu_insts = [] + npu_insts.extend( + aiex.npu.writebd_shimtile( column=column, bd_id=bd_id, ddr_id=ddr_id, buffer_length=buffer_length ) ) - ipu_insts.extend( - aiex.ipu.shimtile_push_queue(direction, channel, column, bd_id=bd_id) + npu_insts.extend( + aiex.npu.shimtile_push_queue(direction, channel, column, bd_id=bd_id) ) - return ipu_insts + return npu_insts def test_tiled_nonsquare_tile_spatial_2x2(ctx: MLIRContext, workdir: Path): @@ -136,10 +136,10 @@ def test_tiled_nonsquare_tile_spatial_2x2(ctx: MLIRContext, workdir: Path): M, N, n_tile_rows=tile_rows_C, n_tile_cols=tile_cols_C ) - ipu_insts = aiex.ipu.get_prolog() + npu_insts = aiex.npu.get_prolog() - @aie.device(AIEDevice.ipu) - def ipu(): + @aie.device(AIEDevice.npu) + def npu(): # col a0 (top row of matrix products) tiles = np.empty((5, 6), dtype=object) for col in [0, 1]: @@ -167,17 +167,17 @@ def ipu(): # fmt: off column = 0 # broadcast a0 - ipu_insts.extend(shim_tensor_slice(M, N, tile_rows_A, tile_cols_A, 0, column, MM2S, broadcast_a0_flow_ep.source_channel, 0, 0)) + npu_insts.extend(shim_tensor_slice(M, N, tile_rows_A, tile_cols_A, 0, column, MM2S, broadcast_a0_flow_ep.source_channel, 0, 0)) # broadcast b0 - ipu_insts.extend(shim_tensor_slice(M, N, tile_rows_B, tile_cols_B, 0, column, MM2S, broadcast_b0_flow_ep.source_channel, 1, 1)) + npu_insts.extend(shim_tensor_slice(M, N, tile_rows_B, tile_cols_B, 0, column, MM2S, broadcast_b0_flow_ep.source_channel, 1, 1)) column = 1 # broadcast a1 - ipu_insts.extend( + npu_insts.extend( shim_tensor_slice(M, N, tile_rows_A, tile_cols_A, d1_size_A * d1_stride_A, column, MM2S, broadcast_a1_flow_ep.source_channel, 0, 0) ) # broadcast b1 - ipu_insts.extend( + npu_insts.extend( shim_tensor_slice(M, N, tile_rows_B, tile_cols_B, d0_size_B * d0_stride_B, column, MM2S, broadcast_b1_flow_ep.source_channel, 1, 1) ) # fmt: on @@ -339,15 +339,15 @@ def memtile_dma_c_1(): # fmt: off for i, (column, channel, bd_id) in enumerate(channels): - ipu_insts.extend(shim_tensor_slice(M, N, tile_rows_C, tile_cols_C, offsets[i], column, S2MM, channel, bd_id, 2)) - ipu_insts.extend(aiex.ipu.sync(channel=channel, column=column)) + npu_insts.extend(shim_tensor_slice(M, N, tile_rows_C, tile_cols_C, offsets[i], column, S2MM, channel, bd_id, 2)) + npu_insts.extend(aiex.npu.sync(channel=channel, column=column)) # fmt: on compile_without_vectorization(ctx.module, workdir) xclbin_path = make_xclbin(ctx.module, workdir) - with FileLock("/tmp/ipu.lock"): + with FileLock("/tmp/npu.lock"): xclbin = XCLBin(xclbin_path, "MLIR_AIE") - xclbin.load_ipu_instructions(ipu_insts) + xclbin.load_npu_instructions(npu_insts) views = xclbin.mmap_buffers([(M, N), (M, N), (M, N)], np.int32) wrap_A = np.asarray(views[0]) @@ -453,7 +453,7 @@ def test_tiled_nonsquare_tile_spatial_2x2_vectorized(ctx: MLIRContext, workdir: M, N, n_tile_rows=tile_rows_C, n_tile_cols=tile_cols_C ) - ipu_insts = aiex.ipu.get_prolog() + npu_insts = aiex.npu.get_prolog() mod_aievec = ExplicitlyManagedModule() kernel = matmul_i32_i32_already_vectorized.emit(force=True) @@ -462,8 +462,8 @@ def test_tiled_nonsquare_tile_spatial_2x2_vectorized(ctx: MLIRContext, workdir: mod_aie = ExplicitlyManagedModule() - @aie.device(AIEDevice.ipu) - def ipu(): + @aie.device(AIEDevice.npu) + def npu(): matmul_i32_i32_already_vectorized.emit(decl=True) # col a0 (top row of matrix products) tiles = np.empty((5, 6), dtype=object) @@ -492,17 +492,17 @@ def ipu(): # fmt: off column = 0 # broadcast a0 - ipu_insts.extend(shim_tensor_slice(M, N, tile_rows_A, tile_cols_A, 0, column, MM2S, broadcast_a0_flow_ep.source_channel, 0, 0)) + npu_insts.extend(shim_tensor_slice(M, N, tile_rows_A, tile_cols_A, 0, column, MM2S, broadcast_a0_flow_ep.source_channel, 0, 0)) # broadcast b0 - ipu_insts.extend(shim_tensor_slice(M, N, tile_rows_B, tile_cols_B, 0, column, MM2S, broadcast_b0_flow_ep.source_channel, 1, 1)) + npu_insts.extend(shim_tensor_slice(M, N, tile_rows_B, tile_cols_B, 0, column, MM2S, broadcast_b0_flow_ep.source_channel, 1, 1)) column = 1 # broadcast a1 - ipu_insts.extend( + npu_insts.extend( shim_tensor_slice(M, N, tile_rows_A, tile_cols_A, d1_size_A * d1_stride_A, column, MM2S, broadcast_a1_flow_ep.source_channel, 0, 0) ) # broadcast b1 - ipu_insts.extend( + npu_insts.extend( shim_tensor_slice(M, N, tile_rows_B, tile_cols_B, d0_size_B * d0_stride_B, column, MM2S, broadcast_b1_flow_ep.source_channel, 1, 1) ) # fmt: on @@ -664,8 +664,8 @@ def memtile_dma_c_1(): # fmt: off for i, (column, channel, bd_id) in enumerate(channels): - ipu_insts.extend(shim_tensor_slice(M, N, tile_rows_C, tile_cols_C, offsets[i], column, S2MM, channel, bd_id, 2)) - ipu_insts.extend(aiex.ipu.sync(channel=channel, column=column)) + npu_insts.extend(shim_tensor_slice(M, N, tile_rows_C, tile_cols_C, offsets[i], column, S2MM, channel, bd_id, 2)) + npu_insts.extend(aiex.npu.sync(channel=channel, column=column)) # fmt: on mod_aie = mod_aie.finish() @@ -673,9 +673,9 @@ def memtile_dma_c_1(): compile_with_vectorization(mod_aie, mod_aievec, workdir) xclbin_path = make_xclbin(mod_aie, workdir) - with FileLock("/tmp/ipu.lock"): + with FileLock("/tmp/npu.lock"): xclbin = XCLBin(xclbin_path, "MLIR_AIE") - xclbin.load_ipu_instructions(ipu_insts) + xclbin.load_npu_instructions(npu_insts) views = xclbin.mmap_buffers([(M, N), (M, N), (M, N)], np.int32) wrap_A = np.asarray(views[0]) @@ -712,8 +712,8 @@ def test_tiled_nonsquare_tile_spatial_4x4_weight_stationary_v1( dest_channels = {} - @aie.device(AIEDevice.ipu) - def ipu(): + @aie.device(AIEDevice.npu) + def npu(): tiles = TileArray(cols, rows) for i, ((col, row), t) in enumerate(tiles[:, 2:]): b = aie.buffer( @@ -784,28 +784,28 @@ def memtile_dma(): kernel_json = emit_design_kernel_json(buffer_args=buffer_args) xclbin_path = make_xclbin(ctx.module, workdir, kernel_json=kernel_json) - with FileLock("/tmp/ipu.lock"): + with FileLock("/tmp/npu.lock"): xclbin = XCLBin(xclbin_path, "MLIR_AIE") views = xclbin.mmap_buffers([(K,)] * len(cols), np.int32) - ipu_insts = aiex.ipu.get_prolog() + npu_insts = aiex.npu.get_prolog() bd_id = 0 for col in cols: dest_channel = dest_channels[col] - writebd_shimtile_insts = aiex.ipu.writebd_shimtile( + writebd_shimtile_insts = aiex.npu.writebd_shimtile( col, bd_id, buffer_length=K ) - ipu_insts.extend( - aiex.ipu._exec_write_bd_extend_shim_tile_opt( + npu_insts.extend( + aiex.npu._exec_write_bd_extend_shim_tile_opt( writebd_shimtile_insts, tensor_addr=xclbin._get_buffer_host_address(col), ) ) - ipu_insts.extend( - aiex.ipu.shimtile_push_queue(S2MM, dest_channel, col, bd_id) + npu_insts.extend( + aiex.npu.shimtile_push_queue(S2MM, dest_channel, col, bd_id) ) - ipu_insts.extend(aiex.ipu.sync(column=col)) - xclbin.load_ipu_instructions(ipu_insts) + npu_insts.extend(aiex.npu.sync(column=col)) + xclbin.load_npu_instructions(npu_insts) wraps = list(map(np.asarray, views)) @@ -826,8 +826,8 @@ def test_double_pump_single_buffer(ctx: MLIRContext, workdir: Path): source_channels = {} # dest_channels = {} - @aie.device(AIEDevice.ipu) - def ipu(): + @aie.device(AIEDevice.npu) + def npu(): tiles = TileArray(cols=[0], rows=[0, 1, 2]) buffer = tiles[0, 2].buffer([(K,)], [T.i32()], "double_buffer") @@ -969,27 +969,27 @@ def memtile_dma(): kernel_json = emit_design_kernel_json(buffer_args=buffer_args) xclbin_path = make_xclbin(ctx.module, workdir, kernel_json=kernel_json) - with FileLock("/tmp/ipu.lock"): + with FileLock("/tmp/npu.lock"): xclbin = XCLBin(xclbin_path, "MLIR_AIE") views = xclbin.mmap_buffers([(K,)] * 2, np.int32) - ipu_insts = aiex.ipu.get_prolog() + npu_insts = aiex.npu.get_prolog() col = 0 for bd_id, player in enumerate(["player_a", "player_b"]): source_channel = source_channels[player] - writebd_shimtile_insts = aiex.ipu.writebd_shimtile( + writebd_shimtile_insts = aiex.npu.writebd_shimtile( col, bd_id, buffer_length=K ) - ipu_insts.extend( - aiex.ipu._exec_write_bd_extend_shim_tile_opt( + npu_insts.extend( + aiex.npu._exec_write_bd_extend_shim_tile_opt( writebd_shimtile_insts, tensor_addr=xclbin._get_buffer_host_address(col), ) ) - ipu_insts.extend( - aiex.ipu.shimtile_push_queue(MM2S, source_channel, col, bd_id) + npu_insts.extend( + aiex.npu.shimtile_push_queue(MM2S, source_channel, col, bd_id) ) - xclbin.load_ipu_instructions(ipu_insts) + xclbin.load_npu_instructions(npu_insts) wraps = list(map(np.asarray, views)) diff --git a/test/ipu-xrt/e2e/test_tiled_nonsquare_tile_matrix_mult.py b/test/ipu-xrt/e2e/test_tiled_nonsquare_tile_matrix_mult.py index fdce41d8ae..cc460b6122 100644 --- a/test/ipu-xrt/e2e/test_tiled_nonsquare_tile_matrix_mult.py +++ b/test/ipu-xrt/e2e/test_tiled_nonsquare_tile_matrix_mult.py @@ -79,10 +79,10 @@ def test_tiled_nonsquare_tile_matrix_mult(ctx: MLIRContext, workdir: Path): M, N, n_tile_rows=tile_rows_C, n_tile_cols=tile_cols_C ) - ipu_insts = aiex.ipu.get_prolog() + npu_insts = aiex.npu.get_prolog() - @aie.device(AIEDevice.ipu) - def ipu(): + @aie.device(AIEDevice.npu) + def npu(): tile_0_0 = aie.tile(0, 0) tile_0_1 = aie.tile(0, 1) tile_0_2 = aie.tile(0, 2) @@ -130,8 +130,8 @@ def ipu(): 0 + d1_size_A * d1_stride_A, ] for i, bd_id in enumerate(range(2)): - ipu_insts.extend( - aiex.ipu.writebd_shimtile( + npu_insts.extend( + aiex.npu.writebd_shimtile( col, bd_id, buffer_length=tile_m_A * tile_n_A, @@ -139,16 +139,16 @@ def ipu(): ddr_id=ddr_id, ) ) - ipu_insts.extend( - aiex.ipu.shimtile_push_queue(MM2S, channel_index, col, bd_id) + npu_insts.extend( + aiex.npu.shimtile_push_queue(MM2S, channel_index, col, bd_id) ) # in B channel_index = 1 ddr_id = 1 for bd_id in range(bd_id + 1, bd_id + 1 + 4, 2): - ipu_insts.extend( - aiex.ipu.writebd_shimtile( + npu_insts.extend( + aiex.npu.writebd_shimtile( col, bd_id, buffer_length=tile_m_B * tile_n_B, @@ -160,13 +160,13 @@ def ipu(): d0_stride=d0_stride_B, ) ) - ipu_insts.extend( - aiex.ipu.shimtile_push_queue(MM2S, channel_index, col, bd_id) + npu_insts.extend( + aiex.npu.shimtile_push_queue(MM2S, channel_index, col, bd_id) ) bd_id += 1 # B tiles are "tall" so need to offset by cols (i.e. d0 dim) - ipu_insts.extend( - aiex.ipu.writebd_shimtile( + npu_insts.extend( + aiex.npu.writebd_shimtile( col, bd_id, buffer_length=tile_m_B * tile_n_B, @@ -178,8 +178,8 @@ def ipu(): d0_stride=d0_stride_B, ) ) - ipu_insts.extend( - aiex.ipu.shimtile_push_queue(MM2S, channel_index, col, bd_id) + npu_insts.extend( + aiex.npu.shimtile_push_queue(MM2S, channel_index, col, bd_id) ) # out C @@ -193,8 +193,8 @@ def ipu(): ] for i, bd_id in enumerate(range(bd_id + 1, bd_id + 1 + 4)): - ipu_insts.extend( - aiex.ipu.writebd_shimtile( + npu_insts.extend( + aiex.npu.writebd_shimtile( col, bd_id, buffer_length=tile_m_C * tile_n_C, @@ -206,11 +206,11 @@ def ipu(): d0_stride=d0_stride_C, ) ) - ipu_insts.extend( - aiex.ipu.shimtile_push_queue(S2MM, channel_index, col, bd_id) + npu_insts.extend( + aiex.npu.shimtile_push_queue(S2MM, channel_index, col, bd_id) ) - ipu_insts.extend( - aiex.ipu.sync( + npu_insts.extend( + aiex.npu.sync( channel=0, column=0, column_num=1, direction=0, row=0, row_num=1 ) ) @@ -312,9 +312,9 @@ def core(): compile_without_vectorization(ctx.module, workdir) xclbin_path = make_xclbin(ctx.module, workdir) - with FileLock("/tmp/ipu.lock"): + with FileLock("/tmp/npu.lock"): xclbin = XCLBin(xclbin_path, "MLIR_AIE") - xclbin.load_ipu_instructions(ipu_insts) + xclbin.load_npu_instructions(npu_insts) views = xclbin.mmap_buffers([(M, N), (M, N), (M, N)], np.int32) wrap_A = np.asarray(views[0]) @@ -378,10 +378,10 @@ def test_tiled_nonsquare_tile_matrix_mult_sugar(ctx: MLIRContext, workdir: Path) M, N, n_tile_rows=tile_rows_C, n_tile_cols=tile_cols_C ) - ipu_insts = aiex.ipu.get_prolog() + npu_insts = aiex.npu.get_prolog() - @aie.device(AIEDevice.ipu) - def ipu(): + @aie.device(AIEDevice.npu) + def npu(): tile_0_0 = aie.tile(0, 0) tile_0_1 = aie.tile(0, 1) tile_0_2 = aie.tile(0, 2) @@ -424,8 +424,8 @@ def ipu(): 0 + d1_size_A * d1_stride_A, ] for i, bd_id in enumerate(range(2)): - ipu_insts.extend( - aiex.ipu.writebd_shimtile( + npu_insts.extend( + aiex.npu.writebd_shimtile( col, bd_id, buffer_length=tile_m_A * tile_n_A, @@ -433,16 +433,16 @@ def ipu(): ddr_id=ddr_id, ) ) - ipu_insts.extend( - aiex.ipu.shimtile_push_queue(MM2S, channel_index, col, bd_id) + npu_insts.extend( + aiex.npu.shimtile_push_queue(MM2S, channel_index, col, bd_id) ) # in B channel_index = 1 ddr_id = 1 for bd_id in range(bd_id + 1, bd_id + 1 + 4, 2): - ipu_insts.extend( - aiex.ipu.writebd_shimtile( + npu_insts.extend( + aiex.npu.writebd_shimtile( col, bd_id, buffer_length=tile_m_B * tile_n_B, @@ -454,13 +454,13 @@ def ipu(): d0_stride=d0_stride_B, ) ) - ipu_insts.extend( - aiex.ipu.shimtile_push_queue(MM2S, channel_index, col, bd_id) + npu_insts.extend( + aiex.npu.shimtile_push_queue(MM2S, channel_index, col, bd_id) ) bd_id += 1 # B tiles are "tall" so need to offset by cols (i.e. d0 dim) - ipu_insts.extend( - aiex.ipu.writebd_shimtile( + npu_insts.extend( + aiex.npu.writebd_shimtile( col, bd_id, buffer_length=tile_m_B * tile_n_B, @@ -472,8 +472,8 @@ def ipu(): d0_stride=d0_stride_B, ) ) - ipu_insts.extend( - aiex.ipu.shimtile_push_queue(MM2S, channel_index, col, bd_id) + npu_insts.extend( + aiex.npu.shimtile_push_queue(MM2S, channel_index, col, bd_id) ) # out C @@ -487,8 +487,8 @@ def ipu(): ] for i, bd_id in enumerate(range(bd_id + 1, bd_id + 1 + 4)): - ipu_insts.extend( - aiex.ipu.writebd_shimtile( + npu_insts.extend( + aiex.npu.writebd_shimtile( col, bd_id, buffer_length=tile_m_C * tile_n_C, @@ -500,11 +500,11 @@ def ipu(): d0_stride=d0_stride_C, ) ) - ipu_insts.extend( - aiex.ipu.shimtile_push_queue(S2MM, channel_index, col, bd_id) + npu_insts.extend( + aiex.npu.shimtile_push_queue(S2MM, channel_index, col, bd_id) ) - ipu_insts.extend( - aiex.ipu.sync( + npu_insts.extend( + aiex.npu.sync( channel=0, column=0, column_num=1, direction=0, row=0, row_num=1 ) ) @@ -570,9 +570,9 @@ def core(): compile_without_vectorization(ctx.module, workdir) xclbin_path = make_xclbin(ctx.module, workdir) - with FileLock("/tmp/ipu.lock"): + with FileLock("/tmp/npu.lock"): xclbin = XCLBin(xclbin_path, "MLIR_AIE") - xclbin.load_ipu_instructions(ipu_insts) + xclbin.load_npu_instructions(npu_insts) views = xclbin.mmap_buffers([(M, N), (M, N), (M, N)], np.int32) wrap_A = np.asarray(views[0]) diff --git a/test/ipu-xrt/e2e/test_tiled_nonsquare_tile_matrix_mult_vectorized.py b/test/ipu-xrt/e2e/test_tiled_nonsquare_tile_matrix_mult_vectorized.py index 036400fb1a..3d5b85c45b 100644 --- a/test/ipu-xrt/e2e/test_tiled_nonsquare_tile_matrix_mult_vectorized.py +++ b/test/ipu-xrt/e2e/test_tiled_nonsquare_tile_matrix_mult_vectorized.py @@ -101,12 +101,12 @@ def test_tiled_nonsquare_tile_matrix_mult_vectorized(ctx: MLIRContext, workdir: M, N, n_tile_rows=tile_rows_C, n_tile_cols=tile_cols_C ) - ipu_insts = aiex.ipu.get_prolog() + npu_insts = aiex.npu.get_prolog() mod_aie = ExplicitlyManagedModule() - @aie.device(AIEDevice.ipu) - def ipu(): + @aie.device(AIEDevice.npu) + def npu(): matmul_i32_i32.emit(decl=True) tile_0_0 = aie.tile(0, 0) tile_0_1 = aie.tile(0, 1) @@ -155,8 +155,8 @@ def ipu(): 0 + d1_size_A * d1_stride_A, ] for i, bd_id in enumerate(range(2)): - ipu_insts.extend( - aiex.ipu.writebd_shimtile( + npu_insts.extend( + aiex.npu.writebd_shimtile( col, bd_id, buffer_length=tile_m_A * tile_n_A, @@ -164,16 +164,16 @@ def ipu(): ddr_id=ddr_id, ) ) - ipu_insts.extend( - aiex.ipu.shimtile_push_queue(MM2S, channel_index, col, bd_id) + npu_insts.extend( + aiex.npu.shimtile_push_queue(MM2S, channel_index, col, bd_id) ) # in B channel_index = 1 ddr_id = 1 for bd_id in range(bd_id + 1, bd_id + 1 + 4, 2): - ipu_insts.extend( - aiex.ipu.writebd_shimtile( + npu_insts.extend( + aiex.npu.writebd_shimtile( col, bd_id, buffer_length=tile_m_B * tile_n_B, @@ -185,13 +185,13 @@ def ipu(): d0_stride=d0_stride_B, ) ) - ipu_insts.extend( - aiex.ipu.shimtile_push_queue(MM2S, channel_index, col, bd_id) + npu_insts.extend( + aiex.npu.shimtile_push_queue(MM2S, channel_index, col, bd_id) ) bd_id += 1 # B tiles are "tall" so need to offset by cols (i.e. d0 dim) - ipu_insts.extend( - aiex.ipu.writebd_shimtile( + npu_insts.extend( + aiex.npu.writebd_shimtile( col, bd_id, buffer_length=tile_m_B * tile_n_B, @@ -203,8 +203,8 @@ def ipu(): d0_stride=d0_stride_B, ) ) - ipu_insts.extend( - aiex.ipu.shimtile_push_queue(MM2S, channel_index, col, bd_id) + npu_insts.extend( + aiex.npu.shimtile_push_queue(MM2S, channel_index, col, bd_id) ) # out C @@ -218,8 +218,8 @@ def ipu(): ] for i, bd_id in enumerate(range(bd_id + 1, bd_id + 1 + 4)): - ipu_insts.extend( - aiex.ipu.writebd_shimtile( + npu_insts.extend( + aiex.npu.writebd_shimtile( col, bd_id, buffer_length=tile_m_C * tile_n_C, @@ -231,11 +231,11 @@ def ipu(): d0_stride=d0_stride_C, ) ) - ipu_insts.extend( - aiex.ipu.shimtile_push_queue(S2MM, channel_index, col, bd_id) + npu_insts.extend( + aiex.npu.shimtile_push_queue(S2MM, channel_index, col, bd_id) ) - ipu_insts.extend( - aiex.ipu.sync( + npu_insts.extend( + aiex.npu.sync( channel=0, column=0, column_num=1, @@ -408,9 +408,9 @@ def super_vectorize(target: any_op_t()): compile_with_vectorization(mod_aie, mod_aievec, workdir) xclbin_path = make_xclbin(mod_aie, workdir) - with FileLock("/tmp/ipu.lock"): + with FileLock("/tmp/npu.lock"): xclbin = XCLBin(xclbin_path, "MLIR_AIE") - xclbin.load_ipu_instructions(ipu_insts) + xclbin.load_npu_instructions(npu_insts) views = xclbin.mmap_buffers([(M, N), (M, N), (M, N)], np.int32) wrap_A = np.asarray(views[0]) @@ -466,12 +466,12 @@ def test_tiled_nonsquare_tile_matrix_mult_vectorized_sugar( M, N, n_tile_rows=tile_rows_C, n_tile_cols=tile_cols_C ) - ipu_insts = aiex.ipu.get_prolog() + npu_insts = aiex.npu.get_prolog() mod_aie = ExplicitlyManagedModule() - @aie.device(AIEDevice.ipu) - def ipu(): + @aie.device(AIEDevice.npu) + def npu(): matmul_i32_i32.emit(decl=True) tile_0_0 = aie.tile(0, 0) tile_0_1 = aie.tile(0, 1) @@ -515,8 +515,8 @@ def ipu(): 0 + d1_size_A * d1_stride_A, ] for i, bd_id in enumerate(range(2)): - ipu_insts.extend( - aiex.ipu.writebd_shimtile( + npu_insts.extend( + aiex.npu.writebd_shimtile( col, bd_id, buffer_length=tile_m_A * tile_n_A, @@ -524,8 +524,8 @@ def ipu(): ddr_id=ddr_id, ) ) - ipu_insts.extend( - aiex.ipu.shimtile_push_queue(MM2S, channel_index, col, bd_id) + npu_insts.extend( + aiex.npu.shimtile_push_queue(MM2S, channel_index, col, bd_id) ) # in B @@ -533,8 +533,8 @@ def ipu(): col = 0 ddr_id = 1 for bd_id in range(bd_id + 1, bd_id + 1 + 4, 2): - ipu_insts.extend( - aiex.ipu.writebd_shimtile( + npu_insts.extend( + aiex.npu.writebd_shimtile( col, bd_id, buffer_length=tile_m_B * tile_n_B, @@ -546,13 +546,13 @@ def ipu(): d0_stride=d0_stride_B, ) ) - ipu_insts.extend( - aiex.ipu.shimtile_push_queue(MM2S, channel_index, col, bd_id) + npu_insts.extend( + aiex.npu.shimtile_push_queue(MM2S, channel_index, col, bd_id) ) bd_id += 1 # B tiles are "tall" so need to offset by cols (i.e. d0 dim) - ipu_insts.extend( - aiex.ipu.writebd_shimtile( + npu_insts.extend( + aiex.npu.writebd_shimtile( col, bd_id, buffer_length=tile_m_B * tile_n_B, @@ -564,8 +564,8 @@ def ipu(): d0_stride=d0_stride_B, ) ) - ipu_insts.extend( - aiex.ipu.shimtile_push_queue(MM2S, channel_index, col, bd_id) + npu_insts.extend( + aiex.npu.shimtile_push_queue(MM2S, channel_index, col, bd_id) ) # out C @@ -580,8 +580,8 @@ def ipu(): ] for i, bd_id in enumerate(range(bd_id + 1, bd_id + 1 + 4)): - ipu_insts.extend( - aiex.ipu.writebd_shimtile( + npu_insts.extend( + aiex.npu.writebd_shimtile( col, bd_id, buffer_length=tile_m_C * tile_n_C, @@ -593,11 +593,11 @@ def ipu(): d0_stride=d0_stride_C, ) ) - ipu_insts.extend( - aiex.ipu.shimtile_push_queue(S2MM, channel_index, col, bd_id) + npu_insts.extend( + aiex.npu.shimtile_push_queue(S2MM, channel_index, col, bd_id) ) - ipu_insts.extend( - aiex.ipu.sync( + npu_insts.extend( + aiex.npu.sync( channel=0, column=0, column_num=1, @@ -736,9 +736,9 @@ def super_vectorize(target: any_op_t()): compile_with_vectorization(mod_aie, mod_aievec, workdir) xclbin_path = make_xclbin(mod_aie, workdir) - with FileLock("/tmp/ipu.lock"): + with FileLock("/tmp/npu.lock"): xclbin = XCLBin(xclbin_path, "MLIR_AIE") - xclbin.load_ipu_instructions(ipu_insts) + xclbin.load_npu_instructions(npu_insts) views = xclbin.mmap_buffers([(M, N), (M, N), (M, N)], np.int32) wrap_A = np.asarray(views[0]) @@ -828,12 +828,12 @@ def test_tiled_nonsquare_tile_matrix_mult_vectorized_sugar_already_vectorized( M, N, n_tile_rows=tile_rows_C, n_tile_cols=tile_cols_C ) - ipu_insts = aiex.ipu.get_prolog() + npu_insts = aiex.npu.get_prolog() mod_aie = ExplicitlyManagedModule() - @aie.device(AIEDevice.ipu) - def ipu(): + @aie.device(AIEDevice.npu) + def npu(): matmul_i32_i32_already_vectorized.emit(decl=True) tile_0_0 = aie.tile(0, 0) tile_0_1 = aie.tile(0, 1) @@ -877,8 +877,8 @@ def ipu(): 0 + d1_size_A * d1_stride_A, ] for i, bd_id in enumerate(range(2)): - ipu_insts.extend( - aiex.ipu.writebd_shimtile( + npu_insts.extend( + aiex.npu.writebd_shimtile( col, bd_id, buffer_length=tile_m_A * tile_n_A, @@ -886,15 +886,15 @@ def ipu(): ddr_id=ddr_id, ) ) - ipu_insts.extend(aiex.ipu.write32(MM2S, channel_index, col, bd_id)) + npu_insts.extend(aiex.npu.write32(MM2S, channel_index, col, bd_id)) # in B channel_index = 1 col = 0 ddr_id = 1 for bd_id in range(bd_id + 1, bd_id + 1 + 4, 2): - ipu_insts.extend( - aiex.ipu.writebd_shimtile( + npu_insts.extend( + aiex.npu.writebd_shimtile( col, bd_id, buffer_length=tile_m_B * tile_n_B, @@ -906,11 +906,11 @@ def ipu(): d0_stride=d0_stride_B, ) ) - ipu_insts.extend(aiex.ipu.write32(MM2S, channel_index, col, bd_id)) + npu_insts.extend(aiex.npu.write32(MM2S, channel_index, col, bd_id)) bd_id += 1 # B tiles are "tall" so need to offset by cols (i.e. d0 dim) - ipu_insts.extend( - aiex.ipu.writebd_shimtile( + npu_insts.extend( + aiex.npu.writebd_shimtile( col, bd_id, buffer_length=tile_m_B * tile_n_B, @@ -922,7 +922,7 @@ def ipu(): d0_stride=d0_stride_B, ) ) - ipu_insts.extend(aiex.ipu.write32(MM2S, channel_index, col, bd_id)) + npu_insts.extend(aiex.npu.write32(MM2S, channel_index, col, bd_id)) # out C channel_index = 0 @@ -936,8 +936,8 @@ def ipu(): ] for i, bd_id in enumerate(range(bd_id + 1, bd_id + 1 + 4)): - ipu_insts.extend( - aiex.ipu.writebd_shimtile( + npu_insts.extend( + aiex.npu.writebd_shimtile( col, bd_id, buffer_length=tile_m_C * tile_n_C, @@ -949,9 +949,9 @@ def ipu(): d0_stride=d0_stride_C, ) ) - ipu_insts.extend(aiex.ipu.write32(S2MM, channel_index, col, bd_id)) - ipu_insts.extend( - aiex.ipu.sync( + npu_insts.extend(aiex.npu.write32(S2MM, channel_index, col, bd_id)) + npu_insts.extend( + aiex.npu.sync( channel=0, column=0, column_num=1, @@ -1030,9 +1030,9 @@ def core(): compile_with_vectorization(mod_aie, mod_aievec, workdir) xclbin_path = make_xclbin(mod_aie, workdir) - with FileLock("/tmp/ipu.lock"): + with FileLock("/tmp/npu.lock"): xclbin = XCLBin(xclbin_path, "MLIR_AIE") - xclbin.load_ipu_instructions(ipu_insts) + xclbin.load_npu_instructions(npu_insts) wrap_A, wrap_B, wrap_C = map( np.asarray, xclbin.mmap_buffers([(M, N), (M, N), (M, N)], np.int32) ) diff --git a/test/ipu-xrt/e2e/test_tiled_vec_add.py b/test/ipu-xrt/e2e/test_tiled_vec_add.py index ab0cd13769..ff8c1e77d1 100644 --- a/test/ipu-xrt/e2e/test_tiled_vec_add.py +++ b/test/ipu-xrt/e2e/test_tiled_vec_add.py @@ -48,10 +48,10 @@ def test_vec_add(ctx: MLIRContext, workdir: Path): tiles = 4 k = K // tiles - ipu_insts = aiex.ipu.get_prolog() + npu_insts = aiex.npu.get_prolog() - @aie.device(AIEDevice.ipu) - def ipu(): + @aie.device(AIEDevice.npu) + def npu(): tile_0_0 = aie.tile(0, 0) tile_0_1 = aie.tile(0, 1) tile_0_2 = aie.tile(0, 2) @@ -95,8 +95,8 @@ def ipu(): ddr_id = 0 offsets = list(range(0, K, k)) for i, bd_id in enumerate(range(tiles)): - ipu_insts.extend( - aiex.ipu.writebd_shimtile( + npu_insts.extend( + aiex.npu.writebd_shimtile( col, bd_id, buffer_length=k, @@ -104,16 +104,16 @@ def ipu(): ddr_id=ddr_id, ) ) - ipu_insts.extend( - aiex.ipu.shimtile_push_queue(MM2S, channel_index, col, bd_id) + npu_insts.extend( + aiex.npu.shimtile_push_queue(MM2S, channel_index, col, bd_id) ) # in B channel_index = 1 ddr_id = 1 for i, bd_id in enumerate(range(bd_id + 1, bd_id + 1 + tiles)): - ipu_insts.extend( - aiex.ipu.writebd_shimtile( + npu_insts.extend( + aiex.npu.writebd_shimtile( col, bd_id, buffer_length=k, @@ -121,16 +121,16 @@ def ipu(): ddr_id=ddr_id, ) ) - ipu_insts.extend( - aiex.ipu.shimtile_push_queue(MM2S, channel_index, col, bd_id) + npu_insts.extend( + aiex.npu.shimtile_push_queue(MM2S, channel_index, col, bd_id) ) # out C channel_index = 0 ddr_id = 2 for i, bd_id in enumerate(range(bd_id + 1, bd_id + 1 + tiles)): - ipu_insts.extend( - aiex.ipu.writebd_shimtile( + npu_insts.extend( + aiex.npu.writebd_shimtile( col, bd_id, buffer_length=k, @@ -138,11 +138,11 @@ def ipu(): ddr_id=ddr_id, ) ) - ipu_insts.extend( - aiex.ipu.shimtile_push_queue(S2MM, channel_index, col, bd_id) + npu_insts.extend( + aiex.npu.shimtile_push_queue(S2MM, channel_index, col, bd_id) ) - ipu_insts.extend( - aiex.ipu.sync( + npu_insts.extend( + aiex.npu.sync( channel=0, column=0, column_num=1, @@ -241,9 +241,9 @@ def core(): compile_without_vectorization(ctx.module, workdir) xclbin_path = make_xclbin(ctx.module, workdir) - with FileLock("/tmp/ipu.lock"): + with FileLock("/tmp/npu.lock"): xclbin = XCLBin(xclbin_path, "MLIR_AIE") - xclbin.load_ipu_instructions(ipu_insts) + xclbin.load_npu_instructions(npu_insts) views = xclbin.mmap_buffers([(K,), (K,), (K,)], np.int32) wrap_A = np.asarray(views[0]) @@ -278,10 +278,10 @@ def test_vec_add_sugar(ctx: MLIRContext, workdir: Path): tiles = 4 k = K // tiles - ipu_insts = aiex.ipu.get_prolog() + npu_insts = aiex.npu.get_prolog() - @aie.device(AIEDevice.ipu) - def ipu(): + @aie.device(AIEDevice.npu) + def npu(): tile_0_0 = aie.tile(0, 0) tile_0_1 = aie.tile(0, 1) tile_0_2 = aie.tile(0, 2) @@ -316,8 +316,8 @@ def ipu(): ddr_id = 0 offsets = list(range(0, K, k)) for i, bd_id in enumerate(range(tiles)): - ipu_insts.extend( - aiex.ipu.writebd_shimtile( + npu_insts.extend( + aiex.npu.writebd_shimtile( col, bd_id, buffer_length=k, @@ -325,8 +325,8 @@ def ipu(): ddr_id=ddr_id, ) ) - ipu_insts.extend( - aiex.ipu.shimtile_push_queue(MM2S, channel_index, col, bd_id) + npu_insts.extend( + aiex.npu.shimtile_push_queue(MM2S, channel_index, col, bd_id) ) # in B @@ -334,8 +334,8 @@ def ipu(): col = 0 ddr_id = 1 for i, bd_id in enumerate(range(bd_id + 1, bd_id + 1 + tiles)): - ipu_insts.extend( - aiex.ipu.writebd_shimtile( + npu_insts.extend( + aiex.npu.writebd_shimtile( col, bd_id, buffer_length=k, @@ -343,8 +343,8 @@ def ipu(): ddr_id=ddr_id, ) ) - ipu_insts.extend( - aiex.ipu.shimtile_push_queue(MM2S, channel_index, col, bd_id) + npu_insts.extend( + aiex.npu.shimtile_push_queue(MM2S, channel_index, col, bd_id) ) # out C @@ -352,8 +352,8 @@ def ipu(): col = 0 ddr_id = 2 for i, bd_id in enumerate(range(bd_id + 1, bd_id + 1 + tiles)): - ipu_insts.extend( - aiex.ipu.writebd_shimtile( + npu_insts.extend( + aiex.npu.writebd_shimtile( col, bd_id, buffer_length=k, @@ -361,11 +361,11 @@ def ipu(): ddr_id=ddr_id, ) ) - ipu_insts.extend( - aiex.ipu.shimtile_push_queue(S2MM, channel_index, col, bd_id) + npu_insts.extend( + aiex.npu.shimtile_push_queue(S2MM, channel_index, col, bd_id) ) - ipu_insts.extend( - aiex.ipu.sync( + npu_insts.extend( + aiex.npu.sync( channel=0, column=0, column_num=1, @@ -422,9 +422,9 @@ def core(): compile_without_vectorization(ctx.module, workdir) xclbin_path = make_xclbin(ctx.module, workdir) - with FileLock("/tmp/ipu.lock"): + with FileLock("/tmp/npu.lock"): xclbin = XCLBin(xclbin_path, "MLIR_AIE") - xclbin.load_ipu_instructions(ipu_insts) + xclbin.load_npu_instructions(npu_insts) views = xclbin.mmap_buffers([(K,), (K,), (K,)], np.int32) wrap_A = np.asarray(views[0]) diff --git a/test/ipu-xrt/e2e/test_tiled_vec_add_vectorized.py b/test/ipu-xrt/e2e/test_tiled_vec_add_vectorized.py index d0990390ae..bcc8beb2be 100644 --- a/test/ipu-xrt/e2e/test_tiled_vec_add_vectorized.py +++ b/test/ipu-xrt/e2e/test_tiled_vec_add_vectorized.py @@ -66,11 +66,11 @@ def vec_add_i32_i32( def test_vec_add_vectorized(ctx: MLIRContext, workdir: Path): - ipu_insts = aiex.ipu.get_prolog() + npu_insts = aiex.npu.get_prolog() mod_aie = ExplicitlyManagedModule() - @aie.device(AIEDevice.ipu) - def ipu(): + @aie.device(AIEDevice.npu) + def npu(): vec_add_i32_i32.emit(decl=True) tile_0_0 = aie.tile(0, 0) tile_0_1 = aie.tile(0, 1) @@ -115,8 +115,8 @@ def ipu(): ddr_id = 0 offsets = list(range(0, K, k)) for i, bd_id in enumerate(range(tiles)): - ipu_insts.extend( - aiex.ipu.writebd_shimtile( + npu_insts.extend( + aiex.npu.writebd_shimtile( col, bd_id, buffer_length=k, @@ -124,8 +124,8 @@ def ipu(): ddr_id=ddr_id, ) ) - ipu_insts.extend( - aiex.ipu.shimtile_push_queue(MM2S, channel_index, col, bd_id) + npu_insts.extend( + aiex.npu.shimtile_push_queue(MM2S, channel_index, col, bd_id) ) # in B @@ -133,8 +133,8 @@ def ipu(): col = 0 ddr_id = 1 for i, bd_id in enumerate(range(bd_id + 1, bd_id + 1 + tiles)): - ipu_insts.extend( - aiex.ipu.writebd_shimtile( + npu_insts.extend( + aiex.npu.writebd_shimtile( col, bd_id, buffer_length=k, @@ -142,8 +142,8 @@ def ipu(): ddr_id=ddr_id, ) ) - ipu_insts.extend( - aiex.ipu.shimtile_push_queue(MM2S, channel_index, col, bd_id) + npu_insts.extend( + aiex.npu.shimtile_push_queue(MM2S, channel_index, col, bd_id) ) # out C @@ -151,8 +151,8 @@ def ipu(): col = 0 ddr_id = 2 for i, bd_id in enumerate(range(bd_id + 1, bd_id + 1 + tiles)): - ipu_insts.extend( - aiex.ipu.writebd_shimtile( + npu_insts.extend( + aiex.npu.writebd_shimtile( col, bd_id, buffer_length=k, @@ -160,11 +160,11 @@ def ipu(): ddr_id=ddr_id, ) ) - ipu_insts.extend( - aiex.ipu.shimtile_push_queue(S2MM, channel_index, col, bd_id) + npu_insts.extend( + aiex.npu.shimtile_push_queue(S2MM, channel_index, col, bd_id) ) - ipu_insts.extend( - aiex.ipu.sync( + npu_insts.extend( + aiex.npu.sync( channel=0, column=0, column_num=1, @@ -328,9 +328,9 @@ def super_vectorize(target: any_op_t()): ) compile_with_vectorization(mod_aie, mod_aievec, workdir) xclbin_path = make_xclbin(mod_aie, workdir) - with FileLock("/tmp/ipu.lock"): + with FileLock("/tmp/npu.lock"): xclbin = XCLBin(xclbin_path, "MLIR_AIE") - xclbin.load_ipu_instructions(ipu_insts) + xclbin.load_npu_instructions(npu_insts) views = xclbin.mmap_buffers([(K,), (K,), (K,)], np.int32) wrap_A = np.asarray(views[0]) @@ -359,11 +359,11 @@ def super_vectorize(target: any_op_t()): def test_vec_add_vectorized_sugar(ctx: MLIRContext, workdir: Path): - ipu_insts = aiex.ipu.get_prolog() + npu_insts = aiex.npu.get_prolog() mod_aie = ExplicitlyManagedModule() - @aie.device(AIEDevice.ipu) - def ipu(): + @aie.device(AIEDevice.npu) + def npu(): vec_add_i32_i32.emit(decl=True) tile_0_0 = aie.tile(0, 0) tile_0_1 = aie.tile(0, 1) @@ -399,8 +399,8 @@ def ipu(): ddr_id = 0 offsets = list(range(0, K, k)) for i, bd_id in enumerate(range(tiles)): - ipu_insts.extend( - aiex.ipu.writebd_shimtile( + npu_insts.extend( + aiex.npu.writebd_shimtile( col, bd_id, buffer_length=k, @@ -408,8 +408,8 @@ def ipu(): ddr_id=ddr_id, ) ) - ipu_insts.extend( - aiex.ipu.shimtile_push_queue(MM2S, channel_index, col, bd_id) + npu_insts.extend( + aiex.npu.shimtile_push_queue(MM2S, channel_index, col, bd_id) ) # in B @@ -417,8 +417,8 @@ def ipu(): col = 0 ddr_id = 1 for i, bd_id in enumerate(range(bd_id + 1, bd_id + 1 + tiles)): - ipu_insts.extend( - aiex.ipu.writebd_shimtile( + npu_insts.extend( + aiex.npu.writebd_shimtile( col, bd_id, buffer_length=k, @@ -426,8 +426,8 @@ def ipu(): ddr_id=ddr_id, ) ) - ipu_insts.extend( - aiex.ipu.shimtile_push_queue(MM2S, channel_index, col, bd_id) + npu_insts.extend( + aiex.npu.shimtile_push_queue(MM2S, channel_index, col, bd_id) ) # out C @@ -435,8 +435,8 @@ def ipu(): col = 0 ddr_id = 2 for i, bd_id in enumerate(range(bd_id + 1, bd_id + 1 + tiles)): - ipu_insts.extend( - aiex.ipu.writebd_shimtile( + npu_insts.extend( + aiex.npu.writebd_shimtile( col, bd_id, buffer_length=k, @@ -444,11 +444,11 @@ def ipu(): ddr_id=ddr_id, ) ) - ipu_insts.extend( - aiex.ipu.shimtile_push_queue(S2MM, channel_index, col, bd_id) + npu_insts.extend( + aiex.npu.shimtile_push_queue(S2MM, channel_index, col, bd_id) ) - ipu_insts.extend( - aiex.ipu.sync( + npu_insts.extend( + aiex.npu.sync( channel=0, column=0, column_num=1, @@ -574,9 +574,9 @@ def super_vectorize(target: any_op_t()): compile_with_vectorization(mod_aie, mod_aievec, workdir) xclbin_path = make_xclbin(mod_aie, workdir) - with FileLock("/tmp/ipu.lock"): + with FileLock("/tmp/npu.lock"): xclbin = XCLBin(xclbin_path, "MLIR_AIE") - xclbin.load_ipu_instructions(ipu_insts) + xclbin.load_npu_instructions(npu_insts) views = xclbin.mmap_buffers([(K,), (K,), (K,)], np.int32) wrap_A = np.asarray(views[0]) diff --git a/test/ipu-xrt/e2e/test_vec_dot.py b/test/ipu-xrt/e2e/test_vec_dot.py index 7a2012a1d0..f111316692 100644 --- a/test/ipu-xrt/e2e/test_vec_dot.py +++ b/test/ipu-xrt/e2e/test_vec_dot.py @@ -52,10 +52,10 @@ def test_vec_dot(ctx: MLIRContext, workdir: Path): tiles = 4 k = K // tiles - ipu_insts = aiex.ipu.get_prolog() + npu_insts = aiex.npu.get_prolog() - @aie.device(AIEDevice.ipu) - def ipu(): + @aie.device(AIEDevice.npu) + def npu(): tile_0_0 = aie.tile(0, 0) tile_0_1 = aie.tile(0, 1) tile_0_2 = aie.tile(0, 2) @@ -99,8 +99,8 @@ def ipu(): ddr_id = 0 offsets = list(range(0, K, k)) for i, bd_id in enumerate(range(tiles)): - ipu_insts.extend( - aiex.ipu.writebd_shimtile( + npu_insts.extend( + aiex.npu.writebd_shimtile( col, bd_id, buffer_length=k, @@ -108,8 +108,8 @@ def ipu(): ddr_id=ddr_id, ) ) - ipu_insts.extend( - aiex.ipu.shimtile_push_queue(MM2S, channel_index, col, bd_id) + npu_insts.extend( + aiex.npu.shimtile_push_queue(MM2S, channel_index, col, bd_id) ) # in B @@ -117,8 +117,8 @@ def ipu(): col = 0 ddr_id = 1 for i, bd_id in enumerate(range(bd_id + 1, bd_id + 1 + tiles)): - ipu_insts.extend( - aiex.ipu.writebd_shimtile( + npu_insts.extend( + aiex.npu.writebd_shimtile( col, bd_id, buffer_length=k, @@ -126,8 +126,8 @@ def ipu(): ddr_id=ddr_id, ) ) - ipu_insts.extend( - aiex.ipu.shimtile_push_queue(MM2S, channel_index, col, bd_id) + npu_insts.extend( + aiex.npu.shimtile_push_queue(MM2S, channel_index, col, bd_id) ) # out C @@ -135,8 +135,8 @@ def ipu(): col = 0 ddr_id = 2 for i, bd_id in enumerate(range(bd_id + 1, bd_id + 1 + tiles)): - ipu_insts.extend( - aiex.ipu.writebd_shimtile( + npu_insts.extend( + aiex.npu.writebd_shimtile( col, bd_id, buffer_length=1, @@ -144,11 +144,11 @@ def ipu(): ddr_id=ddr_id, ) ) - ipu_insts.extend( - aiex.ipu.shimtile_push_queue(S2MM, channel_index, col, bd_id) + npu_insts.extend( + aiex.npu.shimtile_push_queue(S2MM, channel_index, col, bd_id) ) - ipu_insts.extend( - aiex.ipu.sync( + npu_insts.extend( + aiex.npu.sync( channel=0, column=0, column_num=1, @@ -252,9 +252,9 @@ def core(): compile_without_vectorization(ctx.module, workdir) xclbin_path = make_xclbin(ctx.module, workdir) - with FileLock("/tmp/ipu.lock"): + with FileLock("/tmp/npu.lock"): xclbin = XCLBin(xclbin_path, "MLIR_AIE") - xclbin.load_ipu_instructions(ipu_insts) + xclbin.load_npu_instructions(npu_insts) views = xclbin.mmap_buffers([(K,), (K,), (tiles,)], np.int32) wrap_A = np.asarray(views[0]) @@ -289,10 +289,10 @@ def test_vec_dot_sugar(ctx: MLIRContext, workdir: Path): tiles = 4 k = K // tiles - ipu_insts = aiex.ipu.get_prolog() + npu_insts = aiex.npu.get_prolog() - @aie.device(AIEDevice.ipu) - def ipu(): + @aie.device(AIEDevice.npu) + def npu(): tile_0_0 = aie.tile(0, 0) tile_0_1 = aie.tile(0, 1) tile_0_2 = aie.tile(0, 2) @@ -327,8 +327,8 @@ def ipu(): ddr_id = 0 offsets = list(range(0, K, k)) for i, bd_id in enumerate(range(tiles)): - ipu_insts.extend( - aiex.ipu.writebd_shimtile( + npu_insts.extend( + aiex.npu.writebd_shimtile( col, bd_id, buffer_length=k, @@ -336,8 +336,8 @@ def ipu(): ddr_id=ddr_id, ) ) - ipu_insts.extend( - aiex.ipu.shimtile_push_queue(MM2S, channel_index, col, bd_id) + npu_insts.extend( + aiex.npu.shimtile_push_queue(MM2S, channel_index, col, bd_id) ) # in B @@ -345,8 +345,8 @@ def ipu(): col = 0 ddr_id = 1 for i, bd_id in enumerate(range(bd_id + 1, bd_id + 1 + tiles)): - ipu_insts.extend( - aiex.ipu.writebd_shimtile( + npu_insts.extend( + aiex.npu.writebd_shimtile( col, bd_id, buffer_length=k, @@ -354,8 +354,8 @@ def ipu(): ddr_id=ddr_id, ) ) - ipu_insts.extend( - aiex.ipu.shimtile_push_queue(MM2S, channel_index, col, bd_id) + npu_insts.extend( + aiex.npu.shimtile_push_queue(MM2S, channel_index, col, bd_id) ) # out C @@ -363,8 +363,8 @@ def ipu(): col = 0 ddr_id = 2 for i, bd_id in enumerate(range(bd_id + 1, bd_id + 1 + tiles)): - ipu_insts.extend( - aiex.ipu.writebd_shimtile( + npu_insts.extend( + aiex.npu.writebd_shimtile( col, bd_id, buffer_length=1, @@ -372,11 +372,11 @@ def ipu(): ddr_id=ddr_id, ) ) - ipu_insts.extend( - aiex.ipu.shimtile_push_queue(S2MM, channel_index, col, bd_id) + npu_insts.extend( + aiex.npu.shimtile_push_queue(S2MM, channel_index, col, bd_id) ) - ipu_insts.extend( - aiex.ipu.sync( + npu_insts.extend( + aiex.npu.sync( channel=0, column=0, column_num=1, @@ -440,9 +440,9 @@ def core(): compile_without_vectorization(ctx.module, workdir) xclbin_path = make_xclbin(ctx.module, workdir) - with FileLock("/tmp/ipu.lock"): + with FileLock("/tmp/npu.lock"): xclbin = XCLBin(xclbin_path, "MLIR_AIE") - xclbin.load_ipu_instructions(ipu_insts) + xclbin.load_npu_instructions(npu_insts) views = xclbin.mmap_buffers([(K,), (K,), (tiles,)], np.int32) wrap_A = np.asarray(views[0]) diff --git a/test/ipu-xrt/e2e/tiled_matrix_add.ipynb b/test/ipu-xrt/e2e/tiled_matrix_add.ipynb index 1d9a6f1c59..0c9a2247ac 100644 --- a/test/ipu-xrt/e2e/tiled_matrix_add.ipynb +++ b/test/ipu-xrt/e2e/tiled_matrix_add.ipynb @@ -94,7 +94,7 @@ "_, _, (d1_size, d1_stride), (d0_size, d0_stride) = tiling_calculator_n_tiles(\n", " M, N, n_tile_rows=n_tile_rows, n_tile_cols=n_tile_cols\n", ")\n", - "ipu_insts = aiex.ipu.get_prolog()" + "npu_insts = aiex.npu.get_prolog()" ] }, { @@ -112,8 +112,8 @@ "metadata": {}, "outputs": [], "source": [ - "@aie.device(AIEDevice.ipu)\n", - "def ipu(): # function name isn't load-bearing\n", + "@aie.device(AIEDevice.npu)\n", + "def npu(): # function name isn't load-bearing\n", "\n", " # tiles that will participate\n", " shim_tile_0_0 = aie.tile(0, 0)\n", @@ -193,8 +193,8 @@ " # (yes this is a weird naming/assignment but it's due to a hack in implementation...)\n", " ddr_id = 0\n", " for i, bd_id in enumerate(range(4)):\n", - " ipu_insts.extend(\n", - " aiex.ipu.writebd_shimtile(\n", + " npu_insts.extend(\n", + " aiex.npu.writebd_shimtile(\n", " col,\n", " bd_id,\n", " tile_rows * tile_cols,\n", @@ -206,8 +206,8 @@ " d0_stride=d0_stride,\n", " )\n", " )\n", - " ipu_insts.extend(\n", - " aiex.ipu.shimtile_push_queue(\n", + " npu_insts.extend(\n", + " aiex.npu.shimtile_push_queue(\n", " MM2S, input_a_tile_0_0_to_tile_0_1.source_channel, col, bd_id\n", " )\n", " )\n", @@ -215,8 +215,8 @@ " # in B\n", " ddr_id = 1\n", " for i, bd_id in enumerate(range(bd_id + 1, bd_id + 1 + 4)):\n", - " ipu_insts.extend(\n", - " aiex.ipu.writebd_shimtile(\n", + " npu_insts.extend(\n", + " aiex.npu.writebd_shimtile(\n", " col,\n", " bd_id,\n", " tile_rows * tile_cols,\n", @@ -228,8 +228,8 @@ " d0_stride=d0_stride,\n", " )\n", " )\n", - " ipu_insts.extend(\n", - " aiex.ipu.shimtile_push_queue(\n", + " npu_insts.extend(\n", + " aiex.npu.shimtile_push_queue(\n", " MM2S, input_b_tile_0_0_to_tile_0_1.source_channel, col, bd_id\n", " )\n", " )\n", @@ -237,8 +237,8 @@ " # out C\n", " ddr_id = 2\n", " for i, bd_id in enumerate(range(bd_id + 1, bd_id + 1 + 4)):\n", - " ipu_insts.extend(\n", - " aiex.ipu.writebd_shimtile(\n", + " npu_insts.extend(\n", + " aiex.npu.writebd_shimtile(\n", " bd_id,\n", " tile_rows * tile_cols,\n", " offsets[i],\n", @@ -249,13 +249,13 @@ " d0_stride=d0_stride,\n", " )\n", " )\n", - " ipu_insts.extend(\n", - " aiex.ipu.shimtile_push_queue(\n", + " npu_insts.extend(\n", + " aiex.npu.shimtile_push_queue(\n", " S2MM, output_c_tile_0_1_to_tile_0_0.dest_channel, col, bd_id\n", " )\n", " )\n", - " ipu_insts.extend(\n", - " aiex.ipu.sync(\n", + " npu_insts.extend(\n", + " aiex.npu.sync(\n", " channel=0, column=0, column_num=1, direction=0, row=0, row_num=1\n", " )\n", " )\n", @@ -365,7 +365,7 @@ "output_type": "stream", "text": [ "module {\n", - " aie.device(ipu) {\n", + " aie.device(npu) {\n", " %tile_0_0 = aie.tile(0, 0)\n", " %tile_0_1 = aie.tile(0, 1)\n", " %tile_0_2 = aie.tile(0, 2)\n", @@ -376,34 +376,34 @@ " aie.flow(%tile_0_2, DMA : 0, %tile_0_1, DMA : 2)\n", " aie.flow(%tile_0_1, DMA : 2, %tile_0_0, DMA : 0)\n", " func.func @bobsyouruncle() {\n", - " aiex.ipu.writebd_shimtile {bd_id = 0 : i32, buffer_length = 64 : i32, buffer_offset = 0 : i32, column = 0 : i32, column_num = 1 : i32, d0_size = 8 : i32, d0_stride = 0 : i32, d1_size = 8 : i32, d1_stride = 15 : i32, d2_stride = 0 : i32, ddr_id = 0 : i32, enable_packet = 0 : i32, iteration_current = 0 : i32, iteration_size = 0 : i32, iteration_stride = 0 : i32, lock_acq_enable = 0 : i32, lock_acq_id = 0 : i32, lock_acq_val = 0 : i32, lock_rel_id = 0 : i32, lock_rel_val = 0 : i32, next_bd = 0 : i32, out_of_order_id = 0 : i32, packet_id = 0 : i32, packet_type = 0 : i32, use_next_bd = 0 : i32, valid_bd = 1 : i32}\n", - " aiex.ipu.write32 {address = 119316 : ui32, column = 0 : i32, row = 0 : i32, value = 0 : ui32}\n", - " aiex.ipu.writebd_shimtile {bd_id = 1 : i32, buffer_length = 64 : i32, buffer_offset = 32 : i32, column = 0 : i32, column_num = 1 : i32, d0_size = 8 : i32, d0_stride = 0 : i32, d1_size = 8 : i32, d1_stride = 15 : i32, d2_stride = 0 : i32, ddr_id = 0 : i32, enable_packet = 0 : i32, iteration_current = 0 : i32, iteration_size = 0 : i32, iteration_stride = 0 : i32, lock_acq_enable = 0 : i32, lock_acq_id = 0 : i32, lock_acq_val = 0 : i32, lock_rel_id = 0 : i32, lock_rel_val = 0 : i32, next_bd = 0 : i32, out_of_order_id = 0 : i32, packet_id = 0 : i32, packet_type = 0 : i32, use_next_bd = 0 : i32, valid_bd = 1 : i32}\n", - " aiex.ipu.write32 {address = 119316 : ui32, column = 0 : i32, row = 0 : i32, value = 1 : ui32}\n", - " aiex.ipu.writebd_shimtile {bd_id = 2 : i32, buffer_length = 64 : i32, buffer_offset = 512 : i32, column = 0 : i32, column_num = 1 : i32, d0_size = 8 : i32, d0_stride = 0 : i32, d1_size = 8 : i32, d1_stride = 15 : i32, d2_stride = 0 : i32, ddr_id = 0 : i32, enable_packet = 0 : i32, iteration_current = 0 : i32, iteration_size = 0 : i32, iteration_stride = 0 : i32, lock_acq_enable = 0 : i32, lock_acq_id = 0 : i32, lock_acq_val = 0 : i32, lock_rel_id = 0 : i32, lock_rel_val = 0 : i32, next_bd = 0 : i32, out_of_order_id = 0 : i32, packet_id = 0 : i32, packet_type = 0 : i32, use_next_bd = 0 : i32, valid_bd = 1 : i32}\n", - " aiex.ipu.write32 {address = 119316 : ui32, column = 0 : i32, row = 0 : i32, value = 2 : ui32}\n", - " aiex.ipu.writebd_shimtile {bd_id = 3 : i32, buffer_length = 64 : i32, buffer_offset = 544 : i32, column = 0 : i32, column_num = 1 : i32, d0_size = 8 : i32, d0_stride = 0 : i32, d1_size = 8 : i32, d1_stride = 15 : i32, d2_stride = 0 : i32, ddr_id = 0 : i32, enable_packet = 0 : i32, iteration_current = 0 : i32, iteration_size = 0 : i32, iteration_stride = 0 : i32, lock_acq_enable = 0 : i32, lock_acq_id = 0 : i32, lock_acq_val = 0 : i32, lock_rel_id = 0 : i32, lock_rel_val = 0 : i32, next_bd = 0 : i32, out_of_order_id = 0 : i32, packet_id = 0 : i32, packet_type = 0 : i32, use_next_bd = 0 : i32, valid_bd = 1 : i32}\n", - " aiex.ipu.write32 {address = 119316 : ui32, column = 0 : i32, row = 0 : i32, value = 3 : ui32}\n", - " aiex.ipu.writebd_shimtile {bd_id = 4 : i32, buffer_length = 64 : i32, buffer_offset = 0 : i32, column = 0 : i32, column_num = 1 : i32, d0_size = 8 : i32, d0_stride = 0 : i32, d1_size = 8 : i32, d1_stride = 15 : i32, d2_stride = 0 : i32, ddr_id = 1 : i32, enable_packet = 0 : i32, iteration_current = 0 : i32, iteration_size = 0 : i32, iteration_stride = 0 : i32, lock_acq_enable = 0 : i32, lock_acq_id = 0 : i32, lock_acq_val = 0 : i32, lock_rel_id = 0 : i32, lock_rel_val = 0 : i32, next_bd = 0 : i32, out_of_order_id = 0 : i32, packet_id = 0 : i32, packet_type = 0 : i32, use_next_bd = 0 : i32, valid_bd = 1 : i32}\n", - " aiex.ipu.write32 {address = 119324 : ui32, column = 0 : i32, row = 0 : i32, value = 4 : ui32}\n", - " aiex.ipu.writebd_shimtile {bd_id = 5 : i32, buffer_length = 64 : i32, buffer_offset = 32 : i32, column = 0 : i32, column_num = 1 : i32, d0_size = 8 : i32, d0_stride = 0 : i32, d1_size = 8 : i32, d1_stride = 15 : i32, d2_stride = 0 : i32, ddr_id = 1 : i32, enable_packet = 0 : i32, iteration_current = 0 : i32, iteration_size = 0 : i32, iteration_stride = 0 : i32, lock_acq_enable = 0 : i32, lock_acq_id = 0 : i32, lock_acq_val = 0 : i32, lock_rel_id = 0 : i32, lock_rel_val = 0 : i32, next_bd = 0 : i32, out_of_order_id = 0 : i32, packet_id = 0 : i32, packet_type = 0 : i32, use_next_bd = 0 : i32, valid_bd = 1 : i32}\n", - " aiex.ipu.write32 {address = 119324 : ui32, column = 0 : i32, row = 0 : i32, value = 5 : ui32}\n", - " aiex.ipu.writebd_shimtile {bd_id = 6 : i32, buffer_length = 64 : i32, buffer_offset = 512 : i32, column = 0 : i32, column_num = 1 : i32, d0_size = 8 : i32, d0_stride = 0 : i32, d1_size = 8 : i32, d1_stride = 15 : i32, d2_stride = 0 : i32, ddr_id = 1 : i32, enable_packet = 0 : i32, iteration_current = 0 : i32, iteration_size = 0 : i32, iteration_stride = 0 : i32, lock_acq_enable = 0 : i32, lock_acq_id = 0 : i32, lock_acq_val = 0 : i32, lock_rel_id = 0 : i32, lock_rel_val = 0 : i32, next_bd = 0 : i32, out_of_order_id = 0 : i32, packet_id = 0 : i32, packet_type = 0 : i32, use_next_bd = 0 : i32, valid_bd = 1 : i32}\n", - " aiex.ipu.write32 {address = 119324 : ui32, column = 0 : i32, row = 0 : i32, value = 6 : ui32}\n", - " aiex.ipu.writebd_shimtile {bd_id = 7 : i32, buffer_length = 64 : i32, buffer_offset = 544 : i32, column = 0 : i32, column_num = 1 : i32, d0_size = 8 : i32, d0_stride = 0 : i32, d1_size = 8 : i32, d1_stride = 15 : i32, d2_stride = 0 : i32, ddr_id = 1 : i32, enable_packet = 0 : i32, iteration_current = 0 : i32, iteration_size = 0 : i32, iteration_stride = 0 : i32, lock_acq_enable = 0 : i32, lock_acq_id = 0 : i32, lock_acq_val = 0 : i32, lock_rel_id = 0 : i32, lock_rel_val = 0 : i32, next_bd = 0 : i32, out_of_order_id = 0 : i32, packet_id = 0 : i32, packet_type = 0 : i32, use_next_bd = 0 : i32, valid_bd = 1 : i32}\n", - " aiex.ipu.write32 {address = 119324 : ui32, column = 0 : i32, row = 0 : i32, value = 7 : ui32}\n", - " aiex.ipu.writebd_shimtile {bd_id = 8 : i32, buffer_length = 64 : i32, buffer_offset = 0 : i32, column = 0 : i32, column_num = 1 : i32, d0_size = 8 : i32, d0_stride = 0 : i32, d1_size = 8 : i32, d1_stride = 15 : i32, d2_stride = 0 : i32, ddr_id = 2 : i32, enable_packet = 0 : i32, iteration_current = 0 : i32, iteration_size = 0 : i32, iteration_stride = 0 : i32, lock_acq_enable = 0 : i32, lock_acq_id = 0 : i32, lock_acq_val = 0 : i32, lock_rel_id = 0 : i32, lock_rel_val = 0 : i32, next_bd = 0 : i32, out_of_order_id = 0 : i32, packet_id = 0 : i32, packet_type = 0 : i32, use_next_bd = 0 : i32, valid_bd = 1 : i32}\n", - " aiex.ipu.write32 {address = 119300 : ui32, column = 0 : i32, row = 0 : i32, value = 2147483656 : ui32}\n", - " aiex.ipu.sync {channel = 0 : i32, column = 0 : i32, column_num = 1 : i32, direction = 0 : i32, row = 0 : i32, row_num = 1 : i32}\n", - " aiex.ipu.writebd_shimtile {bd_id = 9 : i32, buffer_length = 64 : i32, buffer_offset = 32 : i32, column = 0 : i32, column_num = 1 : i32, d0_size = 8 : i32, d0_stride = 0 : i32, d1_size = 8 : i32, d1_stride = 15 : i32, d2_stride = 0 : i32, ddr_id = 2 : i32, enable_packet = 0 : i32, iteration_current = 0 : i32, iteration_size = 0 : i32, iteration_stride = 0 : i32, lock_acq_enable = 0 : i32, lock_acq_id = 0 : i32, lock_acq_val = 0 : i32, lock_rel_id = 0 : i32, lock_rel_val = 0 : i32, next_bd = 0 : i32, out_of_order_id = 0 : i32, packet_id = 0 : i32, packet_type = 0 : i32, use_next_bd = 0 : i32, valid_bd = 1 : i32}\n", - " aiex.ipu.write32 {address = 119300 : ui32, column = 0 : i32, row = 0 : i32, value = 2147483657 : ui32}\n", - " aiex.ipu.sync {channel = 0 : i32, column = 0 : i32, column_num = 1 : i32, direction = 0 : i32, row = 0 : i32, row_num = 1 : i32}\n", - " aiex.ipu.writebd_shimtile {bd_id = 10 : i32, buffer_length = 64 : i32, buffer_offset = 512 : i32, column = 0 : i32, column_num = 1 : i32, d0_size = 8 : i32, d0_stride = 0 : i32, d1_size = 8 : i32, d1_stride = 15 : i32, d2_stride = 0 : i32, ddr_id = 2 : i32, enable_packet = 0 : i32, iteration_current = 0 : i32, iteration_size = 0 : i32, iteration_stride = 0 : i32, lock_acq_enable = 0 : i32, lock_acq_id = 0 : i32, lock_acq_val = 0 : i32, lock_rel_id = 0 : i32, lock_rel_val = 0 : i32, next_bd = 0 : i32, out_of_order_id = 0 : i32, packet_id = 0 : i32, packet_type = 0 : i32, use_next_bd = 0 : i32, valid_bd = 1 : i32}\n", - " aiex.ipu.write32 {address = 119300 : ui32, column = 0 : i32, row = 0 : i32, value = 2147483658 : ui32}\n", - " aiex.ipu.sync {channel = 0 : i32, column = 0 : i32, column_num = 1 : i32, direction = 0 : i32, row = 0 : i32, row_num = 1 : i32}\n", - " aiex.ipu.writebd_shimtile {bd_id = 11 : i32, buffer_length = 64 : i32, buffer_offset = 544 : i32, column = 0 : i32, column_num = 1 : i32, d0_size = 8 : i32, d0_stride = 0 : i32, d1_size = 8 : i32, d1_stride = 15 : i32, d2_stride = 0 : i32, ddr_id = 2 : i32, enable_packet = 0 : i32, iteration_current = 0 : i32, iteration_size = 0 : i32, iteration_stride = 0 : i32, lock_acq_enable = 0 : i32, lock_acq_id = 0 : i32, lock_acq_val = 0 : i32, lock_rel_id = 0 : i32, lock_rel_val = 0 : i32, next_bd = 0 : i32, out_of_order_id = 0 : i32, packet_id = 0 : i32, packet_type = 0 : i32, use_next_bd = 0 : i32, valid_bd = 1 : i32}\n", - " aiex.ipu.write32 {address = 119300 : ui32, column = 0 : i32, row = 0 : i32, value = 2147483659 : ui32}\n", - " aiex.ipu.sync {channel = 0 : i32, column = 0 : i32, column_num = 1 : i32, direction = 0 : i32, row = 0 : i32, row_num = 1 : i32}\n", + " aiex.npu.writebd_shimtile {bd_id = 0 : i32, buffer_length = 64 : i32, buffer_offset = 0 : i32, column = 0 : i32, column_num = 1 : i32, d0_size = 8 : i32, d0_stride = 0 : i32, d1_size = 8 : i32, d1_stride = 15 : i32, d2_stride = 0 : i32, ddr_id = 0 : i32, enable_packet = 0 : i32, iteration_current = 0 : i32, iteration_size = 0 : i32, iteration_stride = 0 : i32, lock_acq_enable = 0 : i32, lock_acq_id = 0 : i32, lock_acq_val = 0 : i32, lock_rel_id = 0 : i32, lock_rel_val = 0 : i32, next_bd = 0 : i32, out_of_order_id = 0 : i32, packet_id = 0 : i32, packet_type = 0 : i32, use_next_bd = 0 : i32, valid_bd = 1 : i32}\n", + " aiex.npu.write32 {address = 119316 : ui32, column = 0 : i32, row = 0 : i32, value = 0 : ui32}\n", + " aiex.npu.writebd_shimtile {bd_id = 1 : i32, buffer_length = 64 : i32, buffer_offset = 32 : i32, column = 0 : i32, column_num = 1 : i32, d0_size = 8 : i32, d0_stride = 0 : i32, d1_size = 8 : i32, d1_stride = 15 : i32, d2_stride = 0 : i32, ddr_id = 0 : i32, enable_packet = 0 : i32, iteration_current = 0 : i32, iteration_size = 0 : i32, iteration_stride = 0 : i32, lock_acq_enable = 0 : i32, lock_acq_id = 0 : i32, lock_acq_val = 0 : i32, lock_rel_id = 0 : i32, lock_rel_val = 0 : i32, next_bd = 0 : i32, out_of_order_id = 0 : i32, packet_id = 0 : i32, packet_type = 0 : i32, use_next_bd = 0 : i32, valid_bd = 1 : i32}\n", + " aiex.npu.write32 {address = 119316 : ui32, column = 0 : i32, row = 0 : i32, value = 1 : ui32}\n", + " aiex.npu.writebd_shimtile {bd_id = 2 : i32, buffer_length = 64 : i32, buffer_offset = 512 : i32, column = 0 : i32, column_num = 1 : i32, d0_size = 8 : i32, d0_stride = 0 : i32, d1_size = 8 : i32, d1_stride = 15 : i32, d2_stride = 0 : i32, ddr_id = 0 : i32, enable_packet = 0 : i32, iteration_current = 0 : i32, iteration_size = 0 : i32, iteration_stride = 0 : i32, lock_acq_enable = 0 : i32, lock_acq_id = 0 : i32, lock_acq_val = 0 : i32, lock_rel_id = 0 : i32, lock_rel_val = 0 : i32, next_bd = 0 : i32, out_of_order_id = 0 : i32, packet_id = 0 : i32, packet_type = 0 : i32, use_next_bd = 0 : i32, valid_bd = 1 : i32}\n", + " aiex.npu.write32 {address = 119316 : ui32, column = 0 : i32, row = 0 : i32, value = 2 : ui32}\n", + " aiex.npu.writebd_shimtile {bd_id = 3 : i32, buffer_length = 64 : i32, buffer_offset = 544 : i32, column = 0 : i32, column_num = 1 : i32, d0_size = 8 : i32, d0_stride = 0 : i32, d1_size = 8 : i32, d1_stride = 15 : i32, d2_stride = 0 : i32, ddr_id = 0 : i32, enable_packet = 0 : i32, iteration_current = 0 : i32, iteration_size = 0 : i32, iteration_stride = 0 : i32, lock_acq_enable = 0 : i32, lock_acq_id = 0 : i32, lock_acq_val = 0 : i32, lock_rel_id = 0 : i32, lock_rel_val = 0 : i32, next_bd = 0 : i32, out_of_order_id = 0 : i32, packet_id = 0 : i32, packet_type = 0 : i32, use_next_bd = 0 : i32, valid_bd = 1 : i32}\n", + " aiex.npu.write32 {address = 119316 : ui32, column = 0 : i32, row = 0 : i32, value = 3 : ui32}\n", + " aiex.npu.writebd_shimtile {bd_id = 4 : i32, buffer_length = 64 : i32, buffer_offset = 0 : i32, column = 0 : i32, column_num = 1 : i32, d0_size = 8 : i32, d0_stride = 0 : i32, d1_size = 8 : i32, d1_stride = 15 : i32, d2_stride = 0 : i32, ddr_id = 1 : i32, enable_packet = 0 : i32, iteration_current = 0 : i32, iteration_size = 0 : i32, iteration_stride = 0 : i32, lock_acq_enable = 0 : i32, lock_acq_id = 0 : i32, lock_acq_val = 0 : i32, lock_rel_id = 0 : i32, lock_rel_val = 0 : i32, next_bd = 0 : i32, out_of_order_id = 0 : i32, packet_id = 0 : i32, packet_type = 0 : i32, use_next_bd = 0 : i32, valid_bd = 1 : i32}\n", + " aiex.npu.write32 {address = 119324 : ui32, column = 0 : i32, row = 0 : i32, value = 4 : ui32}\n", + " aiex.npu.writebd_shimtile {bd_id = 5 : i32, buffer_length = 64 : i32, buffer_offset = 32 : i32, column = 0 : i32, column_num = 1 : i32, d0_size = 8 : i32, d0_stride = 0 : i32, d1_size = 8 : i32, d1_stride = 15 : i32, d2_stride = 0 : i32, ddr_id = 1 : i32, enable_packet = 0 : i32, iteration_current = 0 : i32, iteration_size = 0 : i32, iteration_stride = 0 : i32, lock_acq_enable = 0 : i32, lock_acq_id = 0 : i32, lock_acq_val = 0 : i32, lock_rel_id = 0 : i32, lock_rel_val = 0 : i32, next_bd = 0 : i32, out_of_order_id = 0 : i32, packet_id = 0 : i32, packet_type = 0 : i32, use_next_bd = 0 : i32, valid_bd = 1 : i32}\n", + " aiex.npu.write32 {address = 119324 : ui32, column = 0 : i32, row = 0 : i32, value = 5 : ui32}\n", + " aiex.npu.writebd_shimtile {bd_id = 6 : i32, buffer_length = 64 : i32, buffer_offset = 512 : i32, column = 0 : i32, column_num = 1 : i32, d0_size = 8 : i32, d0_stride = 0 : i32, d1_size = 8 : i32, d1_stride = 15 : i32, d2_stride = 0 : i32, ddr_id = 1 : i32, enable_packet = 0 : i32, iteration_current = 0 : i32, iteration_size = 0 : i32, iteration_stride = 0 : i32, lock_acq_enable = 0 : i32, lock_acq_id = 0 : i32, lock_acq_val = 0 : i32, lock_rel_id = 0 : i32, lock_rel_val = 0 : i32, next_bd = 0 : i32, out_of_order_id = 0 : i32, packet_id = 0 : i32, packet_type = 0 : i32, use_next_bd = 0 : i32, valid_bd = 1 : i32}\n", + " aiex.npu.write32 {address = 119324 : ui32, column = 0 : i32, row = 0 : i32, value = 6 : ui32}\n", + " aiex.npu.writebd_shimtile {bd_id = 7 : i32, buffer_length = 64 : i32, buffer_offset = 544 : i32, column = 0 : i32, column_num = 1 : i32, d0_size = 8 : i32, d0_stride = 0 : i32, d1_size = 8 : i32, d1_stride = 15 : i32, d2_stride = 0 : i32, ddr_id = 1 : i32, enable_packet = 0 : i32, iteration_current = 0 : i32, iteration_size = 0 : i32, iteration_stride = 0 : i32, lock_acq_enable = 0 : i32, lock_acq_id = 0 : i32, lock_acq_val = 0 : i32, lock_rel_id = 0 : i32, lock_rel_val = 0 : i32, next_bd = 0 : i32, out_of_order_id = 0 : i32, packet_id = 0 : i32, packet_type = 0 : i32, use_next_bd = 0 : i32, valid_bd = 1 : i32}\n", + " aiex.npu.write32 {address = 119324 : ui32, column = 0 : i32, row = 0 : i32, value = 7 : ui32}\n", + " aiex.npu.writebd_shimtile {bd_id = 8 : i32, buffer_length = 64 : i32, buffer_offset = 0 : i32, column = 0 : i32, column_num = 1 : i32, d0_size = 8 : i32, d0_stride = 0 : i32, d1_size = 8 : i32, d1_stride = 15 : i32, d2_stride = 0 : i32, ddr_id = 2 : i32, enable_packet = 0 : i32, iteration_current = 0 : i32, iteration_size = 0 : i32, iteration_stride = 0 : i32, lock_acq_enable = 0 : i32, lock_acq_id = 0 : i32, lock_acq_val = 0 : i32, lock_rel_id = 0 : i32, lock_rel_val = 0 : i32, next_bd = 0 : i32, out_of_order_id = 0 : i32, packet_id = 0 : i32, packet_type = 0 : i32, use_next_bd = 0 : i32, valid_bd = 1 : i32}\n", + " aiex.npu.write32 {address = 119300 : ui32, column = 0 : i32, row = 0 : i32, value = 2147483656 : ui32}\n", + " aiex.npu.sync {channel = 0 : i32, column = 0 : i32, column_num = 1 : i32, direction = 0 : i32, row = 0 : i32, row_num = 1 : i32}\n", + " aiex.npu.writebd_shimtile {bd_id = 9 : i32, buffer_length = 64 : i32, buffer_offset = 32 : i32, column = 0 : i32, column_num = 1 : i32, d0_size = 8 : i32, d0_stride = 0 : i32, d1_size = 8 : i32, d1_stride = 15 : i32, d2_stride = 0 : i32, ddr_id = 2 : i32, enable_packet = 0 : i32, iteration_current = 0 : i32, iteration_size = 0 : i32, iteration_stride = 0 : i32, lock_acq_enable = 0 : i32, lock_acq_id = 0 : i32, lock_acq_val = 0 : i32, lock_rel_id = 0 : i32, lock_rel_val = 0 : i32, next_bd = 0 : i32, out_of_order_id = 0 : i32, packet_id = 0 : i32, packet_type = 0 : i32, use_next_bd = 0 : i32, valid_bd = 1 : i32}\n", + " aiex.npu.write32 {address = 119300 : ui32, column = 0 : i32, row = 0 : i32, value = 2147483657 : ui32}\n", + " aiex.npu.sync {channel = 0 : i32, column = 0 : i32, column_num = 1 : i32, direction = 0 : i32, row = 0 : i32, row_num = 1 : i32}\n", + " aiex.npu.writebd_shimtile {bd_id = 10 : i32, buffer_length = 64 : i32, buffer_offset = 512 : i32, column = 0 : i32, column_num = 1 : i32, d0_size = 8 : i32, d0_stride = 0 : i32, d1_size = 8 : i32, d1_stride = 15 : i32, d2_stride = 0 : i32, ddr_id = 2 : i32, enable_packet = 0 : i32, iteration_current = 0 : i32, iteration_size = 0 : i32, iteration_stride = 0 : i32, lock_acq_enable = 0 : i32, lock_acq_id = 0 : i32, lock_acq_val = 0 : i32, lock_rel_id = 0 : i32, lock_rel_val = 0 : i32, next_bd = 0 : i32, out_of_order_id = 0 : i32, packet_id = 0 : i32, packet_type = 0 : i32, use_next_bd = 0 : i32, valid_bd = 1 : i32}\n", + " aiex.npu.write32 {address = 119300 : ui32, column = 0 : i32, row = 0 : i32, value = 2147483658 : ui32}\n", + " aiex.npu.sync {channel = 0 : i32, column = 0 : i32, column_num = 1 : i32, direction = 0 : i32, row = 0 : i32, row_num = 1 : i32}\n", + " aiex.npu.writebd_shimtile {bd_id = 11 : i32, buffer_length = 64 : i32, buffer_offset = 544 : i32, column = 0 : i32, column_num = 1 : i32, d0_size = 8 : i32, d0_stride = 0 : i32, d1_size = 8 : i32, d1_stride = 15 : i32, d2_stride = 0 : i32, ddr_id = 2 : i32, enable_packet = 0 : i32, iteration_current = 0 : i32, iteration_size = 0 : i32, iteration_stride = 0 : i32, lock_acq_enable = 0 : i32, lock_acq_id = 0 : i32, lock_acq_val = 0 : i32, lock_rel_id = 0 : i32, lock_rel_val = 0 : i32, next_bd = 0 : i32, out_of_order_id = 0 : i32, packet_id = 0 : i32, packet_type = 0 : i32, use_next_bd = 0 : i32, valid_bd = 1 : i32}\n", + " aiex.npu.write32 {address = 119300 : ui32, column = 0 : i32, row = 0 : i32, value = 2147483659 : ui32}\n", + " aiex.npu.sync {channel = 0 : i32, column = 0 : i32, column_num = 1 : i32, direction = 0 : i32, row = 0 : i32, row_num = 1 : i32}\n", " return\n", " }\n", " %memtile_dma_0_1 = aie.memtile_dma(%tile_0_1) {\n", @@ -525,7 +525,7 @@ "metadata": {}, "outputs": [], "source": [ - "# shim DMA as ipu instructions\n", + "# shim DMA as npu instructions\n", "compile_without_vectorization(ctx.module, workdir)\n", "xclbin_path = make_xclbin(ctx.module, workdir)" ] @@ -546,11 +546,11 @@ "outputs": [], "source": [ "# FileLock because this runs in CI where multiple jobs might be attempting to run (and the device isn't multi-tenant yet)\n", - "with FileLock(\"/tmp/ipu.lock\"):\n", + "with FileLock(\"/tmp/npu.lock\"):\n", " # XRT manager\n", " xclbin = XCLBin(xclbin_path, \"MLIR_AIE\")\n", " # configure shim dmas\n", - " xclbin.load_ipu_instructions(ipu_insts)\n", + " xclbin.load_npu_instructions(npu_insts)\n", "\n", " # initialize input operands and zero out output\n", " views = xclbin.mmap_buffers([(M, N), (M, N), (M, N)], np.int32)\n", diff --git a/test/ipu-xrt/e2e/tiled_nonsquare_tile_matrix_mult_vectorized.ipynb b/test/ipu-xrt/e2e/tiled_nonsquare_tile_matrix_mult_vectorized.ipynb index d6b4248284..2b377f3fba 100644 --- a/test/ipu-xrt/e2e/tiled_nonsquare_tile_matrix_mult_vectorized.ipynb +++ b/test/ipu-xrt/e2e/tiled_nonsquare_tile_matrix_mult_vectorized.ipynb @@ -442,10 +442,10 @@ " # A tiles are \"fat\" so need to offset by rows (i.e. d1 dim)\n", " 0 + d1_size_A * d1_stride_A,\n", " ]\n", - " ipu_insts = aiex.ipu.get_prolog()\n", + " npu_insts = aiex.npu.get_prolog()\n", " for i, bd_id in enumerate(range(2)):\n", - " ipu_insts.extend(\n", - " aiex.ipu.writebd_shimtile(\n", + " npu_insts.extend(\n", + " aiex.npu.writebd_shimtile(\n", " col,\n", " bd_id,\n", " buffer_length=tile_m_A * tile_n_A,\n", @@ -453,14 +453,14 @@ " ddr_id=ddr_id,\n", " )\n", " )\n", - " ipu_insts.extend(aiex.ipu.write32(MM2S, channel_index, col, bd_id))\n", + " npu_insts.extend(aiex.npu.write32(MM2S, channel_index, col, bd_id))\n", "\n", " # in B\n", " channel_index = 1\n", " ddr_id = 1\n", " for bd_id in range(bd_id + 1, bd_id + 1 + 4, 2):\n", - " ipu_insts.extend(\n", - " aiex.ipu.writebd_shimtile(\n", + " npu_insts.extend(\n", + " aiex.npu.writebd_shimtile(\n", " col,\n", " bd_id,\n", " buffer_length=tile_m_B * tile_n_B,\n", @@ -472,11 +472,11 @@ " d0_stride=d0_stride_B,\n", " )\n", " )\n", - " ipu_insts.extend(aiex.ipu.write32(MM2S, channel_index, col, bd_id))\n", + " npu_insts.extend(aiex.npu.write32(MM2S, channel_index, col, bd_id))\n", " bd_id += 1\n", " # B tiles are \"tall\" so need to offset by cols (i.e. d0 dim)\n", - " ipu_insts.extend(\n", - " aiex.ipu.writebd_shimtile(\n", + " npu_insts.extend(\n", + " aiex.npu.writebd_shimtile(\n", " col,\n", " bd_id,\n", " buffer_length=tile_m_B * tile_n_B,\n", @@ -488,7 +488,7 @@ " d0_stride=d0_stride_B,\n", " )\n", " )\n", - " ipu_insts.extend(aiex.ipu.write32(MM2S, channel_index, col, bd_id))\n", + " npu_insts.extend(aiex.npu.write32(MM2S, channel_index, col, bd_id))\n", "\n", " # out C\n", " channel_index = 0\n", @@ -501,8 +501,8 @@ " ]\n", "\n", " for i, bd_id in enumerate(range(bd_id + 1, bd_id + 1 + 4)):\n", - " ipu_insts.extend(\n", - " aiex.ipu.writebd_shimtile(\n", + " npu_insts.extend(\n", + " aiex.npu.writebd_shimtile(\n", " col,\n", " bd_id,\n", " buffer_length=tile_m_C * tile_n_C,\n", @@ -514,9 +514,9 @@ " d0_stride=d0_stride_C,\n", " )\n", " )\n", - " ipu_insts.extend(aiex.ipu.write32(S2MM, channel_index, col, bd_id))\n", - " ipu_insts.extend(\n", - " aiex.ipu.sync(\n", + " npu_insts.extend(aiex.npu.write32(S2MM, channel_index, col, bd_id))\n", + " npu_insts.extend(\n", + " aiex.npu.sync(\n", " channel=0,\n", " column=0,\n", " column_num=1,\n", @@ -526,7 +526,7 @@ " )\n", " )\n", "\n", - " return ipu_insts" + " return npu_insts" ] }, { @@ -559,8 +559,8 @@ }, "outputs": [], "source": [ - "@aie.device(AIEDevice.ipu)\n", - "def ipu():\n", + "@aie.device(AIEDevice.npu)\n", + "def npu():\n", " matmul_i32_i32.emit(decl=True)\n", " tile_0_0 = aie.tile(0, 0)\n", " tile_0_1 = aie.tile(0, 1)\n", @@ -675,10 +675,10 @@ "outputs": [], "source": [ "xclbin_path = make_xclbin(mod_aie, workdir)\n", - "with FileLock(\"/tmp/ipu.lock\"):\n", + "with FileLock(\"/tmp/npu.lock\"):\n", " xclbin = XCLBin(xclbin_path, \"MLIR_AIE\")\n", - " ipu_insts = command_control()\n", - " xclbin.load_ipu_instructions(ipu_insts)\n", + " npu_insts = command_control()\n", + " xclbin.load_npu_instructions(npu_insts)\n", "\n", " wrap_A, wrap_B, wrap_C = map(\n", " np.asarray, xclbin.mmap_buffers([(M, N), (M, N), (M, N)], np.int32)\n", diff --git a/test/ipu-xrt/lit.local.cfg b/test/ipu-xrt/lit.local.cfg index 04b92ba609..2d7aa71633 100644 --- a/test/ipu-xrt/lit.local.cfg +++ b/test/ipu-xrt/lit.local.cfg @@ -7,7 +7,7 @@ config.suffixes = [".lit", ".py"] if "ryzen_ai" not in config.available_features: - config.unsupported = ["ipu-xrt"] + config.unsupported = ["npu-xrt"] else: config.unsupported = [] diff --git a/test/ipu-xrt/makefile-common b/test/ipu-xrt/makefile-common index d9a0a69015..51e9a19245 100644 --- a/test/ipu-xrt/makefile-common +++ b/test/ipu-xrt/makefile-common @@ -1,4 +1,4 @@ -# Contains common definitions used across the Makefiles of ipu-xrt tests. +# Contains common definitions used across the Makefiles of npu-xrt tests. # VITIS related variables VITIS_ROOT ?= $(shell realpath $(dir $(shell which vitis))/../) diff --git a/test/ipu-xrt/matrix_multiplication_using_dma/aie.mlir b/test/ipu-xrt/matrix_multiplication_using_dma/aie.mlir index 01594e64cf..541b44ecea 100644 --- a/test/ipu-xrt/matrix_multiplication_using_dma/aie.mlir +++ b/test/ipu-xrt/matrix_multiplication_using_dma/aie.mlir @@ -7,7 +7,7 @@ //===----------------------------------------------------------------------===// module { - aie.device(ipu) { + aie.device(npu) { memref.global "public" @inA : memref<64x32xi16> memref.global "public" @inA_cons : memref<64x32xi16> memref.global "public" @inB : memref<32x64xi16> @@ -111,12 +111,12 @@ module { %c64_i64 = arith.constant 64 : i64 %c32_i64 = arith.constant 32 : i64 %c4096_i64 = arith.constant 4096 : i64 - aiex.ipu.dma_memcpy_nd(0, 0, %arg2[%c0_i64, %c0_i64, %c0_i64, %c0_i64] [%c2_i64, %c2_i64, %c64_i64, %c32_i64] [%c4096_i64, %c32_i64, %c64_i64]) {id = 0 : i64, metadata = @outC} : memref<8192xi32> - aiex.ipu.dma_memcpy_nd(0, 0, %arg0[%c0_i64, %c0_i64, %c0_i64, %c0_i64] [%c2_i64, %c4_i64, %c64_i64, %c16_i64] [%c0_i64, %c16_i64, %c64_i64]) {id = 1 : i64, metadata = @inA} : memref<8192xi32> - aiex.ipu.dma_memcpy_nd(0, 0, %arg1[%c0_i64, %c0_i64, %c0_i64, %c0_i64] [%c2_i64, %c4_i64, %c32_i64, %c32_i64] [%c32_i64, %c2048_i64, %c64_i64]) {id = 2 : i64, metadata = @inB} : memref<8192xi32> - aiex.ipu.dma_memcpy_nd(0, 0, %arg0[%c0_i64, %c0_i64, %c0_i64, %c4096_i64] [%c2_i64, %c4_i64, %c64_i64, %c16_i64] [%c0_i64, %c16_i64, %c64_i64]) {id = 3 : i64, metadata = @inA} : memref<8192xi32> - aiex.ipu.dma_memcpy_nd(0, 0, %arg1[%c0_i64, %c0_i64, %c0_i64, %c0_i64] [%c2_i64, %c4_i64, %c32_i64, %c32_i64] [%c32_i64, %c2048_i64, %c64_i64]) {id = 4 : i64, metadata = @inB} : memref<8192xi32> - aiex.ipu.sync {channel = 0 : i32, column = 0 : i32, column_num = 1 : i32, direction = 0 : i32, row = 0 : i32, row_num = 1 : i32} + aiex.npu.dma_memcpy_nd(0, 0, %arg2[%c0_i64, %c0_i64, %c0_i64, %c0_i64] [%c2_i64, %c2_i64, %c64_i64, %c32_i64] [%c4096_i64, %c32_i64, %c64_i64]) {id = 0 : i64, metadata = @outC} : memref<8192xi32> + aiex.npu.dma_memcpy_nd(0, 0, %arg0[%c0_i64, %c0_i64, %c0_i64, %c0_i64] [%c2_i64, %c4_i64, %c64_i64, %c16_i64] [%c0_i64, %c16_i64, %c64_i64]) {id = 1 : i64, metadata = @inA} : memref<8192xi32> + aiex.npu.dma_memcpy_nd(0, 0, %arg1[%c0_i64, %c0_i64, %c0_i64, %c0_i64] [%c2_i64, %c4_i64, %c32_i64, %c32_i64] [%c32_i64, %c2048_i64, %c64_i64]) {id = 2 : i64, metadata = @inB} : memref<8192xi32> + aiex.npu.dma_memcpy_nd(0, 0, %arg0[%c0_i64, %c0_i64, %c0_i64, %c4096_i64] [%c2_i64, %c4_i64, %c64_i64, %c16_i64] [%c0_i64, %c16_i64, %c64_i64]) {id = 3 : i64, metadata = @inA} : memref<8192xi32> + aiex.npu.dma_memcpy_nd(0, 0, %arg1[%c0_i64, %c0_i64, %c0_i64, %c0_i64] [%c2_i64, %c4_i64, %c32_i64, %c32_i64] [%c32_i64, %c2048_i64, %c64_i64]) {id = 4 : i64, metadata = @inB} : memref<8192xi32> + aiex.npu.sync {channel = 0 : i32, column = 0 : i32, column_num = 1 : i32, direction = 0 : i32, row = 0 : i32, row_num = 1 : i32} return } diff --git a/test/ipu-xrt/matrix_multiplication_using_dma/run-a2x.lit b/test/ipu-xrt/matrix_multiplication_using_dma/run-a2x.lit index 483c7967b7..dd8a83ef1a 100644 --- a/test/ipu-xrt/matrix_multiplication_using_dma/run-a2x.lit +++ b/test/ipu-xrt/matrix_multiplication_using_dma/run-a2x.lit @@ -4,8 +4,8 @@ // REQUIRES: ryzen_ai // // RUN: xchesscc_wrapper aie2 -I %aietools/include -c %S/mm.cc -o ./mm.o -// RUN: aie2xclbin --use-chess --xclbin-name=aie2.xclbin --ipu-insts-name=insts2.txt --tmpdir=aie2xclbin.prj -v %S/aie.mlir +// RUN: aie2xclbin --use-chess --xclbin-name=aie2.xclbin --npu-insts-name=insts2.txt --tmpdir=aie2xclbin.prj -v %S/aie.mlir // RUN: clang %S/test.cpp -o test.exe -std=c++11 -Wall %xrt_flags -lrt -lstdc++ -lboost_program_options -lboost_filesystem -// RUN: %run_on_ipu ./test.exe -x aie2.xclbin -k MLIR_AIE -i insts2.txt | FileCheck %s +// RUN: %run_on_npu ./test.exe -x aie2.xclbin -k MLIR_AIE -i insts2.txt | FileCheck %s // CHECK: PASS! diff --git a/test/ipu-xrt/matrix_multiplication_using_dma/run.lit b/test/ipu-xrt/matrix_multiplication_using_dma/run.lit index ac347dcce6..850baf0a7d 100644 --- a/test/ipu-xrt/matrix_multiplication_using_dma/run.lit +++ b/test/ipu-xrt/matrix_multiplication_using_dma/run.lit @@ -4,7 +4,7 @@ // REQUIRES: ryzen_ai, chess // // RUN: xchesscc_wrapper aie2 -I %aietools/include -c %S/mm.cc -o ./mm.o -// RUN: %python aiecc.py --xbridge --aie-generate-cdo --aie-generate-ipu --no-compile-host --xclbin-name=aie.xclbin --ipu-insts-name=insts.txt %S/aie.mlir +// RUN: %python aiecc.py --xbridge --aie-generate-cdo --aie-generate-npu --no-compile-host --xclbin-name=aie.xclbin --npu-insts-name=insts.txt %S/aie.mlir // RUN: clang %S/test.cpp -o test.exe -std=c++11 -Wall %xrt_flags -lrt -lstdc++ -lboost_program_options -lboost_filesystem -// RUN: %run_on_ipu ./test.exe -x aie.xclbin -k MLIR_AIE -i insts.txt | FileCheck %s +// RUN: %run_on_npu ./test.exe -x aie.xclbin -k MLIR_AIE -i insts.txt | FileCheck %s // CHECK: PASS! diff --git a/test/ipu-xrt/two_col/Makefile b/test/ipu-xrt/two_col/Makefile index 9fe6d4d097..08c3e61293 100644 --- a/test/ipu-xrt/two_col/Makefile +++ b/test/ipu-xrt/two_col/Makefile @@ -5,14 +5,14 @@ VPATH := $(VISION_KERNELS_VPATH_BASE)/threshold all: final.xclbin insts.txt: aie.mlir - aiecc.py -v --aie-only-generate-ipu --ipu-insts-name=$@ $< + aiecc.py -v --aie-only-generate-npu --npu-insts-name=$@ $< threshold.o: threshold.cc xchesscc -d ${CHESSCC2_FLAGS} -DBIT_WIDTH=8 -c $< -o $@ final.xclbin: aie.mlir threshold.o - aiecc.py -v --aie-generate-cdo --aie-generate-ipu --no-compile-host \ - --xclbin-name=$@ --ipu-insts-name=insts.txt $< + aiecc.py -v --aie-generate-cdo --aie-generate-npu --no-compile-host \ + --xclbin-name=$@ --npu-insts-name=insts.txt $< clean: rm -rf *.elf* *.bif aie.mlir.prj *.xclbin sim \ diff --git a/test/ipu-xrt/two_col/aie.mlir b/test/ipu-xrt/two_col/aie.mlir index 10975fd06b..692ef5db0a 100644 --- a/test/ipu-xrt/two_col/aie.mlir +++ b/test/ipu-xrt/two_col/aie.mlir @@ -1,5 +1,5 @@ module { - aie.device(ipu) { + aie.device(npu) { %0 = aie.tile(0, 0) %1 = aie.tile(0, 1) %2 = aie.tile(0, 2) @@ -123,17 +123,17 @@ module { %c0 = arith.constant 0 : i64 %c1 = arith.constant 1 : i64 %c2048 = arith.constant 2048 : i64 - aiex.ipu.rtp_write(0, 2, 0, 50) { buffer_sym_name = "rtp0" } - aiex.ipu.rtp_write(0, 3, 0, 50) { buffer_sym_name = "rtp1" } - aiex.ipu.rtp_write(1, 4, 0, 50) { buffer_sym_name = "rtp2" } - aiex.ipu.rtp_write(1, 5, 0, 50) { buffer_sym_name = "rtp3" } - aiex.ipu.rtp_write(0, 2, 1, 0) { buffer_sym_name = "rtp0" } - aiex.ipu.rtp_write(0, 3, 1, 0) { buffer_sym_name = "rtp1" } - aiex.ipu.rtp_write(1, 4, 1, 0) { buffer_sym_name = "rtp2" } - aiex.ipu.rtp_write(1, 5, 1, 0) { buffer_sym_name = "rtp3" } - aiex.ipu.dma_memcpy_nd (0, 0, %out[%c0,%c0,%c0,%c0][%c1,%c1,%c1,%c2048][%c0,%c0,%c0]) { metadata = @objFifo_out0, id = 1 : i64 } : memref<2048xi32> - aiex.ipu.dma_memcpy_nd (0, 0, %in[%c0,%c0,%c0,%c0][%c1,%c1,%c1,%c2048][%c0,%c0,%c0]) { metadata = @objFifo_in0, id = 0 : i64 } : memref<2048xi32> - aiex.ipu.sync { column = 0 : i32, row = 0 : i32, direction = 0 : i32, channel = 0 : i32, column_num = 1 : i32, row_num = 1 : i32 } + aiex.npu.rtp_write(0, 2, 0, 50) { buffer_sym_name = "rtp0" } + aiex.npu.rtp_write(0, 3, 0, 50) { buffer_sym_name = "rtp1" } + aiex.npu.rtp_write(1, 4, 0, 50) { buffer_sym_name = "rtp2" } + aiex.npu.rtp_write(1, 5, 0, 50) { buffer_sym_name = "rtp3" } + aiex.npu.rtp_write(0, 2, 1, 0) { buffer_sym_name = "rtp0" } + aiex.npu.rtp_write(0, 3, 1, 0) { buffer_sym_name = "rtp1" } + aiex.npu.rtp_write(1, 4, 1, 0) { buffer_sym_name = "rtp2" } + aiex.npu.rtp_write(1, 5, 1, 0) { buffer_sym_name = "rtp3" } + aiex.npu.dma_memcpy_nd (0, 0, %out[%c0,%c0,%c0,%c0][%c1,%c1,%c1,%c2048][%c0,%c0,%c0]) { metadata = @objFifo_out0, id = 1 : i64 } : memref<2048xi32> + aiex.npu.dma_memcpy_nd (0, 0, %in[%c0,%c0,%c0,%c0][%c1,%c1,%c1,%c2048][%c0,%c0,%c0]) { metadata = @objFifo_in0, id = 0 : i64 } : memref<2048xi32> + aiex.npu.sync { column = 0 : i32, row = 0 : i32, direction = 0 : i32, channel = 0 : i32, column_num = 1 : i32, row_num = 1 : i32 } return } } diff --git a/test/ipu-xrt/two_col/run.lit b/test/ipu-xrt/two_col/run.lit index 01ff6afed4..5b2b54b291 100644 --- a/test/ipu-xrt/two_col/run.lit +++ b/test/ipu-xrt/two_col/run.lit @@ -4,7 +4,7 @@ // REQUIRES: ryzen_ai, chess // // RUN: xchesscc_wrapper aie2 -I %aietools/include -DBIT_WIDTH=8 -c %S/threshold.cc -o ./threshold.o -// RUN: %python aiecc.py --xchesscc --xbridge --aie-generate-cdo --aie-generate-ipu --no-compile-host --xclbin-name=aie.xclbin --ipu-insts-name=insts.txt %S/aie.mlir +// RUN: %python aiecc.py --xchesscc --xbridge --aie-generate-cdo --aie-generate-npu --no-compile-host --xclbin-name=aie.xclbin --npu-insts-name=insts.txt %S/aie.mlir // RUN: clang %S/test.cpp -o test.exe -std=c++11 -Wall %xrt_flags -lrt -lstdc++ -lboost_program_options -lboost_filesystem -// RUN: %run_on_ipu ./test.exe -x aie.xclbin -k MLIR_AIE -i insts.txt | FileCheck %s +// RUN: %run_on_npu ./test.exe -x aie.xclbin -k MLIR_AIE -i insts.txt | FileCheck %s // CHECK: PASS! diff --git a/test/ipu-xrt/vector_scalar_using_dma/aie.mlir b/test/ipu-xrt/vector_scalar_using_dma/aie.mlir index ebdd9aaefb..e2e9643370 100644 --- a/test/ipu-xrt/vector_scalar_using_dma/aie.mlir +++ b/test/ipu-xrt/vector_scalar_using_dma/aie.mlir @@ -7,7 +7,7 @@ //===----------------------------------------------------------------------===// module { - aie.device(ipu) { + aie.device(npu) { memref.global "public" @in : memref<1024xi32> memref.global "public" @in_cons : memref<1024xi32> memref.global "public" @out : memref<1024xi32> @@ -66,9 +66,9 @@ module { %c0_i64 = arith.constant 0 : i64 %c1_i64 = arith.constant 1 : i64 %c4096_i64 = arith.constant 4096 : i64 - aiex.ipu.dma_memcpy_nd(0, 0, %arg2[%c0_i64, %c0_i64, %c0_i64, %c0_i64] [%c1_i64, %c1_i64, %c1_i64, %c4096_i64] [%c0_i64, %c0_i64, %c0_i64]) {id = 0 : i64, metadata = @out} : memref<4096xi32> - aiex.ipu.dma_memcpy_nd(0, 0, %arg0[%c0_i64, %c0_i64, %c0_i64, %c0_i64] [%c1_i64, %c1_i64, %c1_i64, %c4096_i64] [%c0_i64, %c0_i64, %c0_i64]) {id = 1 : i64, metadata = @in} : memref<4096xi32> - aiex.ipu.sync {channel = 0 : i32, column = 0 : i32, column_num = 1 : i32, direction = 0 : i32, row = 0 : i32, row_num = 1 : i32} + aiex.npu.dma_memcpy_nd(0, 0, %arg2[%c0_i64, %c0_i64, %c0_i64, %c0_i64] [%c1_i64, %c1_i64, %c1_i64, %c4096_i64] [%c0_i64, %c0_i64, %c0_i64]) {id = 0 : i64, metadata = @out} : memref<4096xi32> + aiex.npu.dma_memcpy_nd(0, 0, %arg0[%c0_i64, %c0_i64, %c0_i64, %c0_i64] [%c1_i64, %c1_i64, %c1_i64, %c4096_i64] [%c0_i64, %c0_i64, %c0_i64]) {id = 1 : i64, metadata = @in} : memref<4096xi32> + aiex.npu.sync {channel = 0 : i32, column = 0 : i32, column_num = 1 : i32, direction = 0 : i32, row = 0 : i32, row_num = 1 : i32} return } diff --git a/test/ipu-xrt/vector_scalar_using_dma/run.lit b/test/ipu-xrt/vector_scalar_using_dma/run.lit index 56b5153e7b..494056eba0 100644 --- a/test/ipu-xrt/vector_scalar_using_dma/run.lit +++ b/test/ipu-xrt/vector_scalar_using_dma/run.lit @@ -4,7 +4,7 @@ // REQUIRES: ryzen_ai, chess // // RUN: xchesscc_wrapper aie2 -I %aietools/include -c %S/scale.cc -o ./scale.o -// RUN: %python aiecc.py --xbridge --aie-generate-cdo --aie-generate-ipu --no-compile-host --xclbin-name=aie.xclbin --ipu-insts-name=insts.txt %S/aie.mlir +// RUN: %python aiecc.py --xbridge --aie-generate-cdo --aie-generate-npu --no-compile-host --xclbin-name=aie.xclbin --npu-insts-name=insts.txt %S/aie.mlir // RUN: clang %S/test.cpp -o test.exe -std=c++11 -Wall %xrt_flags -lrt -lstdc++ -lboost_program_options -lboost_filesystem -// RUN: %run_on_ipu ./test.exe -x aie.xclbin -k MLIR_AIE -i insts.txt | FileCheck %s +// RUN: %run_on_npu ./test.exe -x aie.xclbin -k MLIR_AIE -i insts.txt | FileCheck %s // CHECK: PASS! diff --git a/test/lit.cfg.py b/test/lit.cfg.py index 474009e5c6..4eef6329da 100644 --- a/test/lit.cfg.py +++ b/test/lit.cfg.py @@ -54,7 +54,7 @@ # for xchesscc_wrapper llvm_config.with_environment("AIETOOLS", config.vitis_aietools_dir) -run_on_ipu = "echo" +run_on_npu = "echo" xrt_flags = "" # Not using run_on_board anymore, need more specific per-platform commands @@ -144,8 +144,8 @@ aie_model = m.group(2) print("\tmodel:", aie_model) config.available_features.add("ryzen_ai") - run_on_ipu = ( - f"flock /tmp/ipu.lock {config.aie_src_root}/utils/run_on_ipu.sh" + run_on_npu = ( + f"flock /tmp/npu.lock {config.aie_src_root}/utils/run_on_npu.sh" ) except: print("Failed to run xbutil") @@ -153,7 +153,7 @@ else: print("xrt not found") -config.substitutions.append(("%run_on_ipu", run_on_ipu)) +config.substitutions.append(("%run_on_npu", run_on_npu)) config.substitutions.append(("%xrt_flags", xrt_flags)) config.substitutions.append(("%XRT_DIR", config.xrt_dir)) diff --git a/test/lower-to-standard/aiex_standard_lowering.mlir b/test/lower-to-standard/aiex_standard_lowering.mlir index 639dbc1e83..6a0cd0b545 100644 --- a/test/lower-to-standard/aiex_standard_lowering.mlir +++ b/test/lower-to-standard/aiex_standard_lowering.mlir @@ -11,14 +11,14 @@ // RUN: aie-opt --split-input-file --aiex-standard-lowering %s | FileCheck %s // CHECK-LABEL: dma_and_wait -// CHECK-NOT: aiex.ipu.dma_memcpy_nd -// CHECK-NOT: aiex.ipu.dma_wait +// CHECK-NOT: aiex.npu.dma_memcpy_nd +// CHECK-NOT: aiex.npu.dma_wait module { - aie.device(ipu) { + aie.device(npu) { memref.global "public" @toMem : memref<16xi32> func.func @dma_and_wait(%arg0: memref<16xi32>, %arg1: memref<16xi32>) { - aiex.ipu.dma_memcpy_nd(0, 0, %arg0[0, 0, 0, 0][1, 1, 16, 16][0, 0, 64]) { metadata = @toMem, id = 1 : i64 } : memref<16xi32> - aiex.ipu.dma_wait {symbol = @toMem} + aiex.npu.dma_memcpy_nd(0, 0, %arg0[0, 0, 0, 0][1, 1, 16, 16][0, 0, 64]) { metadata = @toMem, id = 1 : i64 } : memref<16xi32> + aiex.npu.dma_wait {symbol = @toMem} return } aie.shim_dma_allocation @toMem (MM2S, 1, 1) diff --git a/test/objectFifo-stateful-transform/nested_loop_test.mlir b/test/objectFifo-stateful-transform/nested_loop_test.mlir index 12d35fce7e..c2ba81e1cb 100644 --- a/test/objectFifo-stateful-transform/nested_loop_test.mlir +++ b/test/objectFifo-stateful-transform/nested_loop_test.mlir @@ -9,7 +9,7 @@ // RUN: aie-opt --aie-objectFifo-stateful-transform %s | FileCheck %s -// CHECK-LABEL: aie.device(ipu) +// CHECK-LABEL: aie.device(npu) // CHECK: scf.for // CHECK: { // CHECK: aie.use_lock @@ -74,7 +74,7 @@ // CHECK: aie.use_lock // CHECK: } -aie.device(ipu) { +aie.device(npu) { %tile_0_1 = aie.tile(0, 1) %tile_1_2 = aie.tile(1, 2) %tile_0_2 = aie.tile(0, 2) diff --git a/test/python/ipu.py b/test/python/ipu.py index e2ad6959e8..79b8c64bb6 100644 --- a/test/python/ipu.py +++ b/test/python/ipu.py @@ -23,7 +23,7 @@ object_fifo_link, tile, ) -from aie.dialects.aiex import ipu_sync, ipu_dma_memcpy_nd +from aie.dialects.aiex import npu_sync, npu_dma_memcpy_nd from aie.dialects.func import FuncOp from aie.dialects.scf import for_ from aie.dialects.scf import yield_ @@ -49,7 +49,7 @@ def my_vector_scalar(module): buffer_depth = 2 - @device(AIEDevice.ipu) + @device(AIEDevice.npu) def device_body(): scale_int32 = external_func( "scale_int32", inputs=[T.memref(n, T.i32()), T.memref(n, T.i32())] @@ -79,9 +79,9 @@ def core_body(): T.memref(N, T.i32()), T.memref(N, T.i32()), T.memref(N, T.i32()) ) def sequence(A, B, C): - ipu_dma_memcpy_nd(metadata="out", bd_id=0, mem=C, sizes=[1, 1, 1, N]) - ipu_dma_memcpy_nd(metadata="in", bd_id=1, mem=A, sizes=[1, 1, 1, N]) - ipu_sync(column=0, row=0, direction=0, channel=0) + npu_dma_memcpy_nd(metadata="out", bd_id=0, mem=C, sizes=[1, 1, 1, N]) + npu_dma_memcpy_nd(metadata="in", bd_id=1, mem=A, sizes=[1, 1, 1, N]) + npu_sync(column=0, row=0, direction=0, channel=0) assert module.operation.verify() @@ -124,7 +124,7 @@ def my_matmul(module): vectorized = True - @device(AIEDevice.ipu) + @device(AIEDevice.npu) def device_body(): zero_scalar = external_func("zero_scalar_i16", inputs=[T.memref(m, n, T.i16())]) zero = external_func("zero_i16", inputs=[T.memref(m, n, T.i16())]) @@ -194,7 +194,7 @@ def sequence(A, B, C): num_tile_rows = min( [rows_per_block, M_div_m - tile_row_block * rows_per_block] ) - ipu_dma_memcpy_nd( + npu_dma_memcpy_nd( metadata="outC", bd_id=0, mem=C, @@ -210,7 +210,7 @@ def sequence(A, B, C): * word_size_in // 4 ) - ipu_dma_memcpy_nd( + npu_dma_memcpy_nd( metadata="inA", bd_id=2 * tile_row + 1, mem=A, @@ -218,7 +218,7 @@ def sequence(A, B, C): sizes=[N_div_n, K_div_k, m, k_in_i32s], strides=[0, k_in_i32s, K_in_i32s], ) - ipu_dma_memcpy_nd( + npu_dma_memcpy_nd( metadata="inB", bd_id=2 * tile_row + 2, mem=B, @@ -226,7 +226,7 @@ def sequence(A, B, C): strides=[n_in_i32s, k_x_N_in_i32s, N_in_i32s], ) - ipu_sync(column=0, row=0, direction=0, channel=0) + npu_sync(column=0, row=0, direction=0, channel=0) assert module.operation.verify() @@ -234,7 +234,7 @@ def sequence(A, B, C): # CHECK-LABEL: edge_detect @construct_and_print_module def edge_detect(module): - @device(AIEDevice.ipu) + @device(AIEDevice.npu) def device_body(): rgba2gray_line = external_func( "rgba2gray_line", @@ -441,21 +441,21 @@ def core_body(): T.memref(2304, T.i32()), T.memref(2304, T.i32()), T.memref(2304, T.i32()) ) def sequence(I, B, O): - ipu_dma_memcpy_nd( + npu_dma_memcpy_nd( metadata="outOF_L2L3", bd_id=0, mem=O, sizes=[1, 1, 36, 64], strides=[0, 0, 64], ) - ipu_dma_memcpy_nd( + npu_dma_memcpy_nd( metadata="inOF_L3L2", bd_id=1, mem=I, sizes=[1, 1, 36, 64], strides=[0, 0, 64], ) - ipu_sync(column=0, row=0, direction=0, channel=0) + npu_sync(column=0, row=0, direction=0, channel=0) assert module.operation.verify() @@ -463,7 +463,7 @@ def sequence(I, B, O): # CHECK-LABEL: my_add_one_objFifo @construct_and_print_module def my_add_one_objFifo(module): - @device(AIEDevice.ipu) + @device(AIEDevice.npu) def device_body(): shim_tile = tile(0, 0) mem_tile = tile(0, 1) @@ -496,12 +496,12 @@ def core_body(): T.memref(64, T.i32()), T.memref(32, T.i32()), T.memref(64, T.i32()) ) def sequence(inTensor, notUsed, outTensor): - ipu_dma_memcpy_nd( + npu_dma_memcpy_nd( metadata="out0", bd_id=0, mem=outTensor, sizes=[1, 1, 1, 64] ) - ipu_dma_memcpy_nd( + npu_dma_memcpy_nd( metadata="in0", bd_id=1, mem=inTensor, sizes=[1, 1, 1, 64] ) - ipu_sync(column=0, row=0, direction=0, channel=0) + npu_sync(column=0, row=0, direction=0, channel=0) assert module.operation.verify() diff --git a/test/python/tile_array.py b/test/python/tile_array.py index 272b48832e..e6735a7565 100644 --- a/test/python/tile_array.py +++ b/test/python/tile_array.py @@ -31,8 +31,8 @@ # CHECK-LABEL: broadcast @construct_and_print_module def broadcast(module): - @aie.device(AIEDevice.ipu) - def ipu(): + @aie.device(AIEDevice.npu) + def npu(): df = TileArray() assert df[[0, 1], 0].shape == (2, 1) assert df[[0, 1], 3:].shape == (2, 3) @@ -125,7 +125,7 @@ def ipu(): print(f) # CHECK: module { - # CHECK: aie.device(ipu) { + # CHECK: aie.device(npu) { # CHECK: %tile_0_0 = aie.tile(0, 0) # CHECK: %tile_0_1 = aie.tile(0, 1) # CHECK: %tile_0_2 = aie.tile(0, 2) @@ -194,8 +194,8 @@ def ipu(): # CHECK-LABEL: lshift @construct_and_print_module def lshift(module): - @aie.device(AIEDevice.ipu) - def ipu(): + @aie.device(AIEDevice.npu) + def npu(): tiles = TileArray() fls = tiles[2, 1] << tiles[0, [2, 3]] @@ -214,8 +214,8 @@ def ipu(): # CHECK-LABEL: locks @construct_and_print_module def locks(module): - @aie.device(AIEDevice.ipu) - def ipu(): + @aie.device(AIEDevice.npu) + def npu(): tiles = TileArray() aie.lock(tiles[0, 1].tile) @@ -249,8 +249,8 @@ def ipu(): # CHECK-LABEL: neighbors @construct_and_print_module def neighbors(module): - @aie.device(AIEDevice.ipu) - def ipu(): + @aie.device(AIEDevice.npu) + def npu(): tiles = TileArray() # CHECK: Neighbors(north=%tile_2_3 = aie.tile(2, 3), west=%tile_1_2 = aie.tile(1, 2), south=None) @@ -279,8 +279,8 @@ def channels_basic(module): # CHECK-LABEL: test-basic print("test-basic") - @aie.device(AIEDevice.ipu) - def ipu(): + @aie.device(AIEDevice.npu) + def npu(): tiles = TileArray() b = aie.buffer(tiles[2, 2].tile, (10, 10), T.i32(), name="bob") @@ -295,13 +295,13 @@ def ipu(): # CHECK: %alice = aie.buffer(%tile_2_2) {sym_name = "alice"} : memref<10x10xi32> # CHECK: %alice_producer_lock = aie.lock(%tile_2_2) {sym_name = "alice_producer_lock"} # CHECK: %alice_consumer_lock = aie.lock(%tile_2_2) {sym_name = "alice_consumer_lock"} - print(ipu) + print(npu) # CHECK-LABEL: test-context-manager print("test-context-manager") - @aie.device(AIEDevice.ipu) - def ipu(): + @aie.device(AIEDevice.npu) + def npu(): tiles = TileArray() c = Channel( @@ -334,14 +334,14 @@ def core(): # CHECK: aie.use_lock(%alice_producer_lock, Release) # CHECK: aie.end # CHECK: } - print(ipu) + print(npu) # CHECK-LABEL: nd_channels @construct_and_print_module def nd_channels(module): - @aie.device(AIEDevice.ipu) - def ipu(): + @aie.device(AIEDevice.npu) + def npu(): tiles = TileArray() shapes = np.array([(10, 10)], dtype="i,i").astype(object) @@ -377,8 +377,8 @@ def ipu(): def buffer_test_this_needs_to_distinct_from_all_other_mentions_of_buffer_in_this_file( module, ): - @aie.device(AIEDevice.ipu) - def ipu(): + @aie.device(AIEDevice.npu) + def npu(): tiles = TileArray() shapes = [(10, 10)] diff --git a/tools/aie2xclbin/aie2xclbin.cpp b/tools/aie2xclbin/aie2xclbin.cpp index 1bb82c90df..7bfe6a2982 100644 --- a/tools/aie2xclbin/aie2xclbin.cpp +++ b/tools/aie2xclbin/aie2xclbin.cpp @@ -70,9 +70,9 @@ cl::opt cl::init(HOST_ARCHITECTURE), cl::cat(AIE2XCLBinCat)); cl::opt - NPUInstsName("ipu-insts-name", + NPUInstsName("npu-insts-name", cl::desc("Output instructions filename for NPU target"), - cl::init("ipu_insts.txt"), cl::cat(AIE2XCLBinCat)); + cl::init("npu_insts.txt"), cl::cat(AIE2XCLBinCat)); cl::opt PrintIRAfterAll("print-ir-after-all", diff --git a/utils/run_on_ipu.sh b/utils/run_on_npu.sh similarity index 100% rename from utils/run_on_ipu.sh rename to utils/run_on_npu.sh