From 626182746a720c85748e232366b8649386f24b3a Mon Sep 17 00:00:00 2001
From: Jeff Fifield <jeff.fifield@amd.com>
Date: Tue, 23 Apr 2024 13:32:34 -0600
Subject: [PATCH] replace "ipu" with "npu" (#1305)

---
 docs/buildHostLin.md                          |   2 +-
 docs/buildHostWin.md                          |   8 +-
 include/aie-c/Translation.h                   |   2 +-
 include/aie/Dialect/AIE/IR/AIEAttrs.td        |   2 +-
 include/aie/Dialect/AIE/IR/AIETargetModel.h   |   4 +-
 include/aie/Dialect/AIEX/IR/AIEX.td           |  24 ++--
 .../aie/Dialect/AIEX/Transforms/AIEXPasses.h  |   2 +-
 .../aie/Dialect/AIEX/Transforms/AIEXPasses.td |   6 +-
 include/aie/Targets/AIETargets.h              |   4 +-
 lib/CAPI/Translation.cpp                      |  14 +-
 lib/Dialect/AIE/IR/AIEDialect.cpp             |   6 +-
 lib/Dialect/AIEX/IR/AIEXDialect.cpp           |   8 +-
 .../{AIEDmaToIpu.cpp => AIEDmaToNpu.cpp}      |  60 ++++----
 .../AIEX/Transforms/AIEXToStandard.cpp        |  14 +-
 lib/Dialect/AIEX/Transforms/CMakeLists.txt    |   2 +-
 lib/Targets/AIETargetCDODirect.cpp            |  12 +-
 lib/Targets/AIETargetHSA.cpp                  |   6 +-
 .../{AIETargetIPU.cpp => AIETargetNPU.cpp}    |  20 +--
 lib/Targets/AIETargets.cpp                    |   6 +-
 lib/Targets/CMakeLists.txt                    |   2 +-
 .../basic/matrix_add_one/Makefile             |   6 +-
 .../basic/matrix_add_one/README.md            |   6 +-
 .../basic/matrix_add_one/aie2.py              |  10 +-
 .../basic/matrix_add_one/run.lit              |   6 +-
 .../matrix_multiplication/CMakeLists.txt      |   2 +-
 .../matrix_multiplication/makefile-common     |   2 +-
 .../matrix_vector/aie2.py                     |  10 +-
 .../matrix_vector/run.lit                     |   4 +-
 .../matrix_multiplication/single_core/aie2.py |  10 +-
 .../matrix_multiplication/single_core/run.lit |   4 +-
 .../matrix_multiplication/whole_array/aie2.py |  10 +-
 .../matrix_multiplication/whole_array/run.lit |   4 +-
 .../basic/passthrough_dmas/CMakeLists.txt     |   2 +-
 .../basic/passthrough_dmas/Makefile           |   4 +-
 .../basic/passthrough_dmas/aie2.py            |  10 +-
 .../basic/passthrough_dmas/run.lit            |   6 +-
 .../basic/passthrough_kernel/CMakeLists.txt   |   2 +-
 .../basic/passthrough_kernel/Makefile         |   4 +-
 .../basic/passthrough_kernel/aie2.py          |   8 +-
 .../basic/passthrough_kernel/run.lit          |   6 +-
 .../basic/vector_add/Makefile                 |   6 +-
 .../basic/vector_add/README.md                |   6 +-
 programming_examples/basic/vector_add/aie2.py |  12 +-
 programming_examples/basic/vector_add/run.lit |   6 +-
 .../basic/vector_exp/CMakeLists.txt           |   2 +-
 .../basic/vector_exp/Makefile                 |   2 +-
 programming_examples/basic/vector_exp/aie2.py |   8 +-
 programming_examples/basic/vector_exp/run.lit |   6 +-
 .../basic/vector_mult/CMakeLists.txt          |   2 +-
 .../basic/vector_mult/Makefile                |   6 +-
 .../basic/vector_mult/README.md               |   6 +-
 .../basic/vector_mult/aie2.py                 |  12 +-
 .../basic/vector_mult/run.lit                 |   6 +-
 .../basic/vector_reduce_add/CMakeLists.txt    |   2 +-
 .../basic/vector_reduce_add/Makefile          |   4 +-
 .../basic/vector_reduce_add/aie2.py           |  10 +-
 .../basic/vector_reduce_add/run.lit           |   6 +-
 .../basic/vector_reduce_max/CMakeLists.txt    |   2 +-
 .../basic/vector_reduce_max/Makefile          |   4 +-
 .../basic/vector_reduce_max/aie2.py           |  10 +-
 .../basic/vector_reduce_max/run.lit           |   6 +-
 .../basic/vector_reduce_min/CMakeLists.txt    |   2 +-
 .../basic/vector_reduce_min/Makefile          |   4 +-
 .../basic/vector_reduce_min/aie2.py           |  10 +-
 .../basic/vector_reduce_min/run.lit           |   6 +-
 .../basic/vector_scalar_add/CMakeLists.txt    |   2 +-
 .../basic/vector_scalar_add/Makefile          |   4 +-
 .../basic/vector_scalar_add/aie2.py           |   8 +-
 .../basic/vector_scalar_add/run.lit           |   4 +-
 .../basic/vector_scalar_mul/CMakeLists.txt    |   2 +-
 .../basic/vector_scalar_mul/Makefile          |   4 +-
 .../basic/vector_scalar_mul/aie2.py           |  10 +-
 .../basic/vector_scalar_mul/run.lit           |   6 +-
 .../basic/vector_sum/CMakeLists.txt           |   2 +-
 .../basic/vector_sum/Makefile                 |   6 +-
 .../basic/vector_sum/README.md                |   4 +-
 programming_examples/basic/vector_sum/aie2.py |  10 +-
 programming_examples/basic/vector_sum/run.lit |   6 +-
 programming_examples/lit.cfg.py               |   8 +-
 programming_examples/makefile-common          |   2 +-
 .../ml/bottleneck/CMakeLists.txt              |   2 +-
 programming_examples/ml/bottleneck/Makefile   |   6 +-
 programming_examples/ml/bottleneck/aie2.py    |  32 ++---
 programming_examples/ml/bottleneck/run.lit    |   4 +-
 programming_examples/ml/conv2d/CMakeLists.txt |   2 +-
 programming_examples/ml/conv2d/Makefile       |   6 +-
 programming_examples/ml/conv2d/aie2.py        |  26 ++--
 programming_examples/ml/conv2d/run.lit        |   4 +-
 .../ml/conv2d_fused_relu/CMakeLists.txt       |   2 +-
 .../ml/conv2d_fused_relu/Makefile             |   6 +-
 .../ml/conv2d_fused_relu/aie2.py              |  26 ++--
 .../ml/conv2d_fused_relu/run.lit              |   4 +-
 .../ml/eltwise_add/CMakeLists.txt             |   2 +-
 programming_examples/ml/eltwise_add/Makefile  |   8 +-
 programming_examples/ml/eltwise_add/aie2.py   |  10 +-
 programming_examples/ml/eltwise_add/run.lit   |   4 +-
 .../ml/eltwise_mul/CMakeLists.txt             |   2 +-
 programming_examples/ml/eltwise_mul/Makefile  |   8 +-
 programming_examples/ml/eltwise_mul/aie2.py   |  10 +-
 programming_examples/ml/eltwise_mul/run.lit   |   4 +-
 programming_examples/ml/relu/CMakeLists.txt   |   2 +-
 programming_examples/ml/relu/Makefile         |   8 +-
 programming_examples/ml/relu/aie2.py          |   8 +-
 programming_examples/ml/relu/run.lit          |   4 +-
 .../ml/resnet/layers_conv2_x/CMakeLists.txt   |   2 +-
 .../ml/resnet/layers_conv2_x/Makefile         |   6 +-
 .../ml/resnet/layers_conv2_x/aie.mlir         |  58 ++++----
 .../ml/resnet/layers_conv2_x/aie2.py          |  34 ++---
 .../ml/resnet/layers_conv2_x/run.lit          |   4 +-
 .../ml/softmax/CMakeLists.txt                 |   2 +-
 programming_examples/ml/softmax/Makefile      |   4 +-
 programming_examples/ml/softmax/aie2.py       |   8 +-
 programming_examples/ml/softmax/run.lit       |   4 +-
 .../ml/weight_expand/CMakeLists.txt           |   2 +-
 .../ml/weight_expand/Makefile                 |   2 +-
 programming_examples/ml/weight_expand/aie2.py |   8 +-
 programming_examples/utils/README.md          |   2 +-
 programming_examples/utils/parse_eventIR.py   |   6 +-
 programming_examples/utils/parse_trace.py     |   6 +-
 .../vision/color_detect/CMakeLists.txt        |   2 +-
 .../vision/color_detect/Makefile              |   4 +-
 .../vision/color_detect/README.md             |   2 +-
 .../vision/color_detect/aie2_colorDetect.py   |   8 +-
 .../vision/color_detect/run.lit               |   4 +-
 .../vision/color_threshold/CMakeLists.txt     |   2 +-
 .../vision/color_threshold/Makefile           |   4 +-
 .../vision/color_threshold/README.md          |   2 +-
 .../color_threshold/aie2_colorThreshold.py    |  32 ++---
 .../vision/color_threshold/run.lit            |   4 +-
 .../vision/edge_detect/CMakeLists.txt         |   2 +-
 .../vision/edge_detect/Makefile               |   4 +-
 .../vision/edge_detect/README.md              |   2 +-
 .../vision/edge_detect/aie2_edgeDetect.py     |   8 +-
 .../vision/edge_detect/run.lit                |   4 +-
 .../vision/vision_passthrough/CMakeLists.txt  |   2 +-
 .../vision/vision_passthrough/Makefile        |   4 +-
 .../vision/vision_passthrough/aie2.py         |  20 +--
 .../aie2_lineBased_8b_1080.mlir               |   8 +-
 .../aie2_lineBased_8b_8k.mlir                 |   8 +-
 .../aie2_lineBased_8b_tiny.mlir               |   8 +-
 .../vision/vision_passthrough/run.lit         |   4 +-
 python/AIEMLIRModule.cpp                      |   6 +-
 python/XRTModule.cpp                          |  28 ++--
 python/_mlir_libs/_aie.pyi                    |   4 +-
 python/_mlir_libs/_xrt.pyi                    |   4 +-
 python/compiler/aiecc/cl_arguments.py         |  18 +--
 python/compiler/aiecc/main.py                 |  14 +-
 python/dialects/aie.py                        |   4 +-
 python/dialects/aiex.py                       |  60 ++++----
 python/utils/README.md                        |  30 ++--
 python/utils/trace.py                         |  16 +--
 .../{DmaToIpu => DmaToNpu}/aiert_insts.mlir   |  16 +--
 .../{DmaToIpu => DmaToNpu}/bad_rtp_write.mlir |  10 +-
 .../dma_to_npu.mlir}                          |  38 ++---
 .../dma_to_npu_invalid.mlir}                  |  10 +-
 .../dma_to_npu_issue_token.mlir}              |  18 +--
 .../{DmaToIpu => DmaToNpu}/push_to_queue.mlir |  12 +-
 .../{DmaToIpu => DmaToNpu}/rtp_write.mlir     |  12 +-
 .../assign-bd-ids/bad_bd_assignments.mlir     |  12 +-
 test/Passes/assign-bd-ids/basic.mlir          |   4 +-
 test/Passes/assign-bd-ids/user_assigned.mlir  |   8 +-
 .../AIETargetHSA/input_with_addresses.mlir    |   6 +-
 .../ipu_instgen.mlir => NPU/npu_instgen.mlir} |  12 +-
 test/aie2xclbin/simple_xclbin.mlir            |   2 +-
 test/aiecc/simple_xclbin.mlir                 |   6 +-
 .../bad_alignment.mlir                        |   8 +-
 test/dialect/AIE/bad_cascade.mlir             |   6 +-
 test/dialect/AIE/bad_dma_op.mlir              |   2 +-
 test/dialect/AIE/badshimtiledma.mlir          |   2 +-
 test/dialect/AIE/badtiledma4.mlir             |   2 +-
 test/dialect/AIE/buffer.mlir                  |   2 +-
 .../AIEX/{bad_ipu_nd.mlir => bad_npu_nd.mlir} |  26 ++--
 ...ush_queue.mlir => bad_npu_push_queue.mlir} |  10 +-
 ...pu_write_bd.mlir => bad_npu_write_bd.mlir} |  18 +--
 test/dialect/AIEX/invalid.mlir                |   8 +-
 test/dialect/AIEX/roundtrip.mlir              |  18 +--
 test/lit.cfg.py                               |   8 +-
 .../aiex_standard_lowering.mlir               |  10 +-
 .../aie.mlir                                  |   8 +-
 .../run.lit                                   |   4 +-
 .../test.cpp                                  |   0
 .../add_314_using_dma_op/aie.mlir             |   8 +-
 .../add_314_using_dma_op/run.lit              |   4 +-
 .../add_314_using_dma_op/test.cpp             |   0
 .../add_one_objFifo/CMakeLists.txt            |   2 +-
 .../add_one_objFifo/Makefile                  |   2 +-
 .../add_one_objFifo/aie.mlir                  |   8 +-
 .../add_one_objFifo/run.lit                   |   4 +-
 .../add_one_objFifo/run.sh                    |   0
 .../add_one_objFifo/test.cpp                  |   0
 .../add_one_using_dma/aie.mlir                |   8 +-
 .../add_one_using_dma/run.lit                 |   4 +-
 .../add_one_using_dma/test.cpp                |   0
 .../cascade_flows/CMakeLists.txt              |   2 +-
 .../cascade_flows/Makefile                    |   2 +-
 .../cascade_flows/aie.mlir                    |   8 +-
 .../cascade_flows/kernel1.cc                  |   0
 .../cascade_flows/kernel2.cc                  |   0
 .../cascade_flows/kernel3.cc                  |   0
 .../cascade_flows/run.lit                     |   4 +-
 .../cascade_flows/test.cpp                    |   0
 test/{ipu-xrt => npu-xrt}/e2e/conftest.py     |   2 +-
 test/{ipu-xrt => npu-xrt}/e2e/lit.local.cfg   |   0
 test/{ipu-xrt => npu-xrt}/e2e/pytest.ini      |   0
 .../e2e/run_all_tests_one_by_one.sh           |   0
 ...dd_256_using_dma_op_no_double_buffering.py |  22 +--
 test/{ipu-xrt => npu-xrt}/e2e/test_locks.py   |  66 ++++-----
 .../e2e/test_manual_dpu_args.py               |  82 +++++------
 .../e2e/test_nonsquare_matrix_mult.py         |  64 ++++-----
 .../test_nonsquare_matrix_mult_vectorized.py  |  64 ++++-----
 .../e2e/test_offsets_sizes_strides.py         |  38 ++---
 .../e2e/test_repeat_count.py                  |  44 +++---
 .../e2e/test_shared_buffers_init_value.py     |  22 +--
 .../e2e/test_square_matrix_mult.py            |  64 ++++-----
 .../e2e/test_square_matrix_mult_vectorized.py |  64 ++++-----
 .../e2e/test_tiled_matrix_add.py              |  76 +++++-----
 ...iled_nonsquare_spatial_tile_matrix_mult.py | 106 +++++++-------
 .../test_tiled_nonsquare_tile_matrix_mult.py  |  92 ++++++-------
 ...d_nonsquare_tile_matrix_mult_vectorized.py | 130 +++++++++---------
 .../e2e/test_tiled_vec_add.py                 |  76 +++++-----
 .../e2e/test_tiled_vec_add_vectorized.py      |  76 +++++-----
 test/{ipu-xrt => npu-xrt}/e2e/test_vec_dot.py |  76 +++++-----
 .../e2e/tiled_matrix_add.ipynb                |  98 ++++++-------
 ...onsquare_tile_matrix_mult_vectorized.ipynb |  42 +++---
 test/{ipu-xrt => npu-xrt}/e2e/util.py         |   0
 test/{ipu-xrt => npu-xrt}/lit.local.cfg       |   2 +-
 test/{ipu-xrt => npu-xrt}/makefile-common     |   2 +-
 .../matrix_multiplication_using_dma/aie.mlir  |  14 +-
 .../matrix_multiplication_using_dma/mm.cc     |   0
 .../run-a2x.lit                               |   4 +-
 .../matrix_multiplication_using_dma/run.lit   |   4 +-
 .../matrix_multiplication_using_dma/test.cpp  |   0
 .../matrix_multiplication_using_dma/zero.cc   |   0
 test/{ipu-xrt => npu-xrt}/two_col/Makefile    |   6 +-
 test/{ipu-xrt => npu-xrt}/two_col/aie.mlir    |  24 ++--
 test/{ipu-xrt => npu-xrt}/two_col/run.lit     |   4 +-
 test/{ipu-xrt => npu-xrt}/two_col/run.sh      |   0
 test/{ipu-xrt => npu-xrt}/two_col/test.cpp    |   0
 .../{ipu-xrt => npu-xrt}/two_col/threshold.cc |   0
 .../vector_scalar_using_dma/aie.mlir          |   8 +-
 .../vector_scalar_using_dma/run.lit           |   4 +-
 .../vector_scalar_using_dma/scale.cc          |   0
 .../vector_scalar_using_dma/test.cpp          |   0
 .../nested_loop_test.mlir                     |   4 +-
 test/python/{ipu.py => npu.py}                |  36 ++---
 test/python/tile_array.py                     |  38 ++---
 test/python/trace_utils.py                    |  24 ++--
 tools/aie2xclbin/XCLBinGen.cpp                |  14 +-
 tools/aie2xclbin/XCLBinGen.h                  |   2 +-
 tools/aie2xclbin/aie2xclbin.cpp               |   8 +-
 utils/{reset_ipu.sh => reset_npu.sh}          |   2 +-
 utils/{run_on_ipu.sh => run_on_npu.sh}        |   0
 252 files changed, 1531 insertions(+), 1531 deletions(-)
 rename lib/Dialect/AIEX/Transforms/{AIEDmaToIpu.cpp => AIEDmaToNpu.cpp} (87%)
 rename lib/Targets/{AIETargetIPU.cpp => AIETargetNPU.cpp} (88%)
 rename test/Conversion/{DmaToIpu => DmaToNpu}/aiert_insts.mlir (83%)
 rename test/Conversion/{DmaToIpu => DmaToNpu}/bad_rtp_write.mlir (62%)
 rename test/Conversion/{DmaToIpu/dma_to_ipu.mlir => DmaToNpu/dma_to_npu.mlir} (73%)
 rename test/Conversion/{DmaToIpu/dma_to_ipu_invalid.mlir => DmaToNpu/dma_to_npu_invalid.mlir} (71%)
 rename test/Conversion/{DmaToIpu/dma_to_ipu_issue_token.mlir => DmaToNpu/dma_to_npu_issue_token.mlir} (72%)
 rename test/Conversion/{DmaToIpu => DmaToNpu}/push_to_queue.mlir (69%)
 rename test/Conversion/{DmaToIpu => DmaToNpu}/rtp_write.mlir (66%)
 rename test/Targets/{IPU/ipu_instgen.mlir => NPU/npu_instgen.mlir} (90%)
 rename test/dialect/AIEX/{bad_ipu_nd.mlir => bad_npu_nd.mlir} (78%)
 rename test/dialect/AIEX/{bad_ipu_push_queue.mlir => bad_npu_push_queue.mlir} (82%)
 rename test/dialect/AIEX/{bad_ipu_write_bd.mlir => bad_npu_write_bd.mlir} (90%)
 rename test/{ipu-xrt => npu-xrt}/add_256_using_dma_op_no_double_buffering/aie.mlir (95%)
 rename test/{ipu-xrt => npu-xrt}/add_256_using_dma_op_no_double_buffering/run.lit (70%)
 rename test/{ipu-xrt => npu-xrt}/add_256_using_dma_op_no_double_buffering/test.cpp (100%)
 rename test/{ipu-xrt => npu-xrt}/add_314_using_dma_op/aie.mlir (97%)
 rename test/{ipu-xrt => npu-xrt}/add_314_using_dma_op/run.lit (70%)
 rename test/{ipu-xrt => npu-xrt}/add_314_using_dma_op/test.cpp (100%)
 rename test/{ipu-xrt => npu-xrt}/add_one_objFifo/CMakeLists.txt (96%)
 rename test/{ipu-xrt => npu-xrt}/add_one_objFifo/Makefile (91%)
 rename test/{ipu-xrt => npu-xrt}/add_one_objFifo/aie.mlir (91%)
 rename test/{ipu-xrt => npu-xrt}/add_one_objFifo/run.lit (75%)
 rename test/{ipu-xrt => npu-xrt}/add_one_objFifo/run.sh (100%)
 rename test/{ipu-xrt => npu-xrt}/add_one_objFifo/test.cpp (100%)
 rename test/{ipu-xrt => npu-xrt}/add_one_using_dma/aie.mlir (97%)
 rename test/{ipu-xrt => npu-xrt}/add_one_using_dma/run.lit (75%)
 rename test/{ipu-xrt => npu-xrt}/add_one_using_dma/test.cpp (100%)
 rename test/{ipu-xrt => npu-xrt}/cascade_flows/CMakeLists.txt (96%)
 rename test/{ipu-xrt => npu-xrt}/cascade_flows/Makefile (95%)
 rename test/{ipu-xrt => npu-xrt}/cascade_flows/aie.mlir (92%)
 rename test/{ipu-xrt => npu-xrt}/cascade_flows/kernel1.cc (100%)
 rename test/{ipu-xrt => npu-xrt}/cascade_flows/kernel2.cc (100%)
 rename test/{ipu-xrt => npu-xrt}/cascade_flows/kernel3.cc (100%)
 rename test/{ipu-xrt => npu-xrt}/cascade_flows/run.lit (83%)
 rename test/{ipu-xrt => npu-xrt}/cascade_flows/test.cpp (100%)
 rename test/{ipu-xrt => npu-xrt}/e2e/conftest.py (96%)
 rename test/{ipu-xrt => npu-xrt}/e2e/lit.local.cfg (100%)
 rename test/{ipu-xrt => npu-xrt}/e2e/pytest.ini (100%)
 rename test/{ipu-xrt => npu-xrt}/e2e/run_all_tests_one_by_one.sh (100%)
 rename test/{ipu-xrt => npu-xrt}/e2e/test_add_256_using_dma_op_no_double_buffering.py (93%)
 rename test/{ipu-xrt => npu-xrt}/e2e/test_locks.py (93%)
 rename test/{ipu-xrt => npu-xrt}/e2e/test_manual_dpu_args.py (88%)
 rename test/{ipu-xrt => npu-xrt}/e2e/test_nonsquare_matrix_mult.py (91%)
 rename test/{ipu-xrt => npu-xrt}/e2e/test_nonsquare_matrix_mult_vectorized.py (94%)
 rename test/{ipu-xrt => npu-xrt}/e2e/test_offsets_sizes_strides.py (92%)
 rename test/{ipu-xrt => npu-xrt}/e2e/test_repeat_count.py (90%)
 rename test/{ipu-xrt => npu-xrt}/e2e/test_shared_buffers_init_value.py (95%)
 rename test/{ipu-xrt => npu-xrt}/e2e/test_square_matrix_mult.py (91%)
 rename test/{ipu-xrt => npu-xrt}/e2e/test_square_matrix_mult_vectorized.py (94%)
 rename test/{ipu-xrt => npu-xrt}/e2e/test_tiled_matrix_add.py (92%)
 rename test/{ipu-xrt => npu-xrt}/e2e/test_tiled_nonsquare_spatial_tile_matrix_mult.py (93%)
 rename test/{ipu-xrt => npu-xrt}/e2e/test_tiled_nonsquare_tile_matrix_mult.py (91%)
 rename test/{ipu-xrt => npu-xrt}/e2e/test_tiled_nonsquare_tile_matrix_mult_vectorized.py (92%)
 rename test/{ipu-xrt => npu-xrt}/e2e/test_tiled_vec_add.py (90%)
 rename test/{ipu-xrt => npu-xrt}/e2e/test_tiled_vec_add_vectorized.py (92%)
 rename test/{ipu-xrt => npu-xrt}/e2e/test_vec_dot.py (90%)
 rename test/{ipu-xrt => npu-xrt}/e2e/tiled_matrix_add.ipynb (91%)
 rename test/{ipu-xrt => npu-xrt}/e2e/tiled_nonsquare_tile_matrix_mult_vectorized.ipynb (95%)
 rename test/{ipu-xrt => npu-xrt}/e2e/util.py (100%)
 rename test/{ipu-xrt => npu-xrt}/lit.local.cfg (91%)
 rename test/{ipu-xrt => npu-xrt}/makefile-common (92%)
 rename test/{ipu-xrt => npu-xrt}/matrix_multiplication_using_dma/aie.mlir (97%)
 rename test/{ipu-xrt => npu-xrt}/matrix_multiplication_using_dma/mm.cc (100%)
 rename test/{ipu-xrt => npu-xrt}/matrix_multiplication_using_dma/run-a2x.lit (77%)
 rename test/{ipu-xrt => npu-xrt}/matrix_multiplication_using_dma/run.lit (78%)
 rename test/{ipu-xrt => npu-xrt}/matrix_multiplication_using_dma/test.cpp (100%)
 rename test/{ipu-xrt => npu-xrt}/matrix_multiplication_using_dma/zero.cc (100%)
 rename test/{ipu-xrt => npu-xrt}/two_col/Makefile (72%)
 rename test/{ipu-xrt => npu-xrt}/two_col/aie.mlir (91%)
 rename test/{ipu-xrt => npu-xrt}/two_col/run.lit (73%)
 rename test/{ipu-xrt => npu-xrt}/two_col/run.sh (100%)
 rename test/{ipu-xrt => npu-xrt}/two_col/test.cpp (100%)
 rename test/{ipu-xrt => npu-xrt}/two_col/threshold.cc (100%)
 rename test/{ipu-xrt => npu-xrt}/vector_scalar_using_dma/aie.mlir (95%)
 rename test/{ipu-xrt => npu-xrt}/vector_scalar_using_dma/run.lit (78%)
 rename test/{ipu-xrt => npu-xrt}/vector_scalar_using_dma/scale.cc (100%)
 rename test/{ipu-xrt => npu-xrt}/vector_scalar_using_dma/test.cpp (100%)
 rename test/python/{ipu.py => npu.py} (95%)
 rename utils/{reset_ipu.sh => reset_npu.sh} (93%)
 rename utils/{run_on_ipu.sh => run_on_npu.sh} (100%)

diff --git a/docs/buildHostLin.md b/docs/buildHostLin.md
index 017bc053cd..422b48b64e 100644
--- a/docs/buildHostLin.md
+++ b/docs/buildHostLin.md
@@ -293,7 +293,7 @@ source ${MLIR_AIE_BUILD_DIR}/utils/env_setup.sh ${MLIR_AIE_BUILD_DIR}/install ${
 
 ## Build a Design
 
-For your design of interest, for instance [add_one_objFifo](../reference_designs/ipu-xrt/add_one_objFifo/), 2 steps are needed: (i) build the AIE desgin and then (ii) build the host code.
+For your design of interest, for instance [vector_add](../programming_examples/basic/vector_add/), 2 steps are needed: (i) build the AIE design and then (ii) build the host code.
 
 ### Build Device AIE Part
 
diff --git a/docs/buildHostWin.md b/docs/buildHostWin.md
index 04373d2892..d6daf03704 100644
--- a/docs/buildHostWin.md
+++ b/docs/buildHostWin.md
@@ -58,7 +58,7 @@ All steps in WSL Ubuntu terminal.
 
 1. After installing the updated RyzenAI driver (see next subsection), use the gendef tool (from the mingw-w64-tools package) to create a .def file with the symbols:
     ```
-    mkdir /mnt/c/Technical/xrtIPUfromDLL; cd /mnt/c/Technical/xrtIPUfromDLL
+    mkdir /mnt/c/Technical/xrtNPUfromDLL; cd /mnt/c/Technical/xrtNPUfromDLL
     cp /mnt/c/Windows/System32/AMD/xrt_coreutil.dll .
     gendef xrt_coreutil.dll
     ```
@@ -67,7 +67,7 @@ All steps in WSL Ubuntu terminal.
 
 All steps in Win11 (powershell where needed).
 
-1. Upgrade the IPU driver IPU driver to version 10.106.8.62 [download here](https://account.amd.com/en/forms/downloads/ryzen-ai-software-platform-xef.html?filename=ipu_stack_rel_silicon_2308.zip), following the [instructions](href="https://ryzenai.docs.amd.com/en/latest/inst.html) on setting up the driver.
+1. Upgrade the NPU driver to version 10.106.8.62 [download here](https://account.amd.com/en/forms/downloads/ryzen-ai-software-platform-xef.html?filename=ipu_stack_rel_silicon_2308.zip), following the [instructions](https://ryzenai.docs.amd.com/en/latest/inst.html) on setting up the driver.
 1. Install [Microsoft Visual Studio 17 2022 Community Edition](https://visualstudio.microsoft.com/vs/community/) with package for C++ development.
 
 1. Install CMake on windows ([https://cmake.org/download/](https://cmake.org/download/))
@@ -79,7 +79,7 @@ All steps in Win11 (powershell where needed).
 1. Clone [https://github.com/Xilinx/XRT](https://github.com/Xilinx/XRT) for instance under `C:\Technical` and `git checkout 2023.2`
 1. Create a .lib file from the .dll shipping with the driver
     - In wsl, generate a .def file (see above)
-    - Start a x86 Native Tools Command Prompt (installed as part of VS17), go to the folder `C:\Technical\xrtIPUfromDLL` and run command: 
+    - Start a x86 Native Tools Command Prompt (installed as part of VS17), go to the folder `C:\Technical\xrtNPUfromDLL` and run command: 
       ```
       lib /def:xrt_coreutil.def /machine:x64 /out:xrt_coreutil.lib
       ```
@@ -113,7 +113,7 @@ source <yourPathToBuildMLIR-AIE>/utils/env_setup.sh <yourPathToBuildMLIR-AIE>/in
 
 ## Build a Design
 
-For your design of interest, for instance [add_one_objFifo](../reference_designs/ipu-xrt/add_one_objFifo/), 2 steps are needed: (i) build the AIE desgin in WSL and then (ii) build the host code in powershell.
+For your design of interest, for instance [vector_add](../programming_examples/basic/vector_add/), 2 steps are needed: (i) build the AIE design in WSL and then (ii) build the host code in powershell.
 
 ### Build device AIE part: WSL Ubuntu terminal
 1. Prepare your enviroment with the mlir-aie tools (built during Prerequisites part of this guide). See [Set up your environment](#set-up-your-environment) above.
diff --git a/include/aie-c/Translation.h b/include/aie-c/Translation.h
index 93fa89b1f3..762c5ce041 100644
--- a/include/aie-c/Translation.h
+++ b/include/aie-c/Translation.h
@@ -18,7 +18,7 @@ extern "C" {
 MLIR_CAPI_EXPORTED MlirStringRef aieTranslateAIEVecToCpp(MlirOperation op,
                                                          bool aieml);
 MLIR_CAPI_EXPORTED MlirStringRef aieTranslateModuleToLLVMIR(MlirOperation op);
-MLIR_CAPI_EXPORTED MlirStringRef aieTranslateToIPU(MlirOperation op);
+MLIR_CAPI_EXPORTED MlirStringRef aieTranslateToNPU(MlirOperation op);
 MLIR_CAPI_EXPORTED MlirStringRef aieTranslateToXAIEV2(MlirOperation op);
 MLIR_CAPI_EXPORTED MlirStringRef aieTranslateToHSA(MlirOperation op);
 MLIR_CAPI_EXPORTED MlirStringRef aieTranslateToBCF(MlirOperation op, int col,
diff --git a/include/aie/Dialect/AIE/IR/AIEAttrs.td b/include/aie/Dialect/AIE/IR/AIEAttrs.td
index 4d2f17371f..167a1bf4d9 100644
--- a/include/aie/Dialect/AIE/IR/AIEAttrs.td
+++ b/include/aie/Dialect/AIE/IR/AIEAttrs.td
@@ -100,7 +100,7 @@ def AIEDevice: I32EnumAttr<"AIEDevice", "AIE Device",
     I32EnumAttrCase<"xcvc1902", 1>,
     I32EnumAttrCase<"xcve2302", 2>,
     I32EnumAttrCase<"xcve2802", 3>,
-    I32EnumAttrCase<"ipu", 4>
+    I32EnumAttrCase<"npu", 4>
   ]> {
 
   let cppNamespace = "xilinx::AIE";
diff --git a/include/aie/Dialect/AIE/IR/AIETargetModel.h b/include/aie/Dialect/AIE/IR/AIETargetModel.h
index b524e97578..42ac68dc74 100644
--- a/include/aie/Dialect/AIE/IR/AIETargetModel.h
+++ b/include/aie/Dialect/AIE/IR/AIETargetModel.h
@@ -450,11 +450,11 @@ class VE2802TargetModel : public AIE2TargetModel {
   }
 };
 
-class IPUTargetModel : public AIE2TargetModel {
+class NPUTargetModel : public AIE2TargetModel {
   llvm::SmallDenseSet<unsigned, 16> nocColumns = {0, 1, 2, 3};
 
 public:
-  IPUTargetModel() = default;
+  NPUTargetModel() = default;
 
   int columns() const override { return 5; }
 
diff --git a/include/aie/Dialect/AIEX/IR/AIEX.td b/include/aie/Dialect/AIEX/IR/AIEX.td
index e38cb59d65..39ce49ada8 100644
--- a/include/aie/Dialect/AIEX/IR/AIEX.td
+++ b/include/aie/Dialect/AIEX/IR/AIEX.td
@@ -463,7 +463,7 @@ def AIE_SelectOp: AIEX_Op<"select", []>, Results<(outs Index)> {
   ];
 }
 
-def AIE_IpuDmaMemcpyNdOp: AIEX_Op<"ipu.dma_memcpy_nd", [
+def AIE_NpuDmaMemcpyNdOp: AIEX_Op<"npu.dma_memcpy_nd", [
     AttrSizedOperandSegments,
     MyOffsetSizeAndStrideOpInterface
   ]> {
@@ -519,10 +519,10 @@ def AIE_IpuDmaMemcpyNdOp: AIEX_Op<"ipu.dma_memcpy_nd", [
   let hasVerifier = 1;
 }
 
-def AIE_IpuDmaWaitOp: AIEX_Op<"ipu.dma_wait", []> {
+def AIE_NpuDmaWaitOp: AIEX_Op<"npu.dma_wait", []> {
   let summary = "Blocking operation to wait for a DMA to complete execution.";
   let description = [{
-    The IpuDmaWaitOp blocks until the DMA referenced through `symbol` completes execution
+    The NpuDmaWaitOp blocks until the DMA referenced through `symbol` completes execution
     and issues a task-complete-token.
 
     Example:
@@ -530,13 +530,13 @@ def AIE_IpuDmaWaitOp: AIEX_Op<"ipu.dma_wait", []> {
       ...
       aie.objectfifo @out0(%tile_0_1, {%tile_0_0}, 4 : i32) : !aie.objectfifo<memref<32x32xi32>>
       ...
-      aiex.ipu.dma_memcpy_nd(0, 0, %arg2[1, 1, 0, 0][1, 1, 32, 32][1, 1, 64]) {id = 0 : i64, issue_token = true, metadata = @out0} : memref<32x64xi32>
+      aiex.npu.dma_memcpy_nd(0, 0, %arg2[1, 1, 0, 0][1, 1, 32, 32][1, 1, 64]) {id = 0 : i64, issue_token = true, metadata = @out0} : memref<32x64xi32>
       ...
-      aiex.ipu.dma_wait { symbol = @out0 }
+      aiex.npu.dma_wait { symbol = @out0 }
     ```
     Here, we have an objectfifo with symbol name `out0`, which is then referenced in the
-    `ipu.dma_memcpy_nd` operation as the target for the respective DMA operation. Afterwards,
-    an `ipu.dma_wait` operation references the same symbol to block until the respective DMA
+    `npu.dma_memcpy_nd` operation as the target for the respective DMA operation. Afterwards,
+    an `npu.dma_wait` operation references the same symbol to block until the respective DMA
     has executed all of its tasks.
   }];
   let arguments = (
@@ -549,7 +549,7 @@ def AIE_IpuDmaWaitOp: AIEX_Op<"ipu.dma_wait", []> {
 }
 
 // Write RTP
-def AIE_IpuWriteRTPOp: AIEX_Op<"ipu.rtp_write", []> {
+def AIE_NpuWriteRTPOp: AIEX_Op<"npu.rtp_write", []> {
   let summary = "rtp write operator";
   let arguments = (
     ins StrAttr:$buffer_sym_name,
@@ -567,7 +567,7 @@ def AIE_IpuWriteRTPOp: AIEX_Op<"ipu.rtp_write", []> {
 }
 
 // Push BD to Queue
-def AIE_IpuShimTilePushQueueOp: AIEX_Op<"ipu.shimtile_push_queue", []> {
+def AIE_NpuShimTilePushQueueOp: AIEX_Op<"npu.shimtile_push_queue", []> {
   let summary = "bd queue push operator";
   let arguments = (
     ins FlatSymbolRefAttr:$metadata,
@@ -586,7 +586,7 @@ def AIE_IpuShimTilePushQueueOp: AIEX_Op<"ipu.shimtile_push_queue", []> {
 }
 
 // WRITE32
-def AIE_IpuWrite32Op: AIEX_Op<"ipu.write32", []> {
+def AIE_NpuWrite32Op: AIEX_Op<"npu.write32", []> {
   let summary = "write32 operator";
   let arguments = (
     ins I32Attr:$column,
@@ -604,7 +604,7 @@ def AIE_IpuWrite32Op: AIEX_Op<"ipu.write32", []> {
 }
 
 // OP_SYNC
-def AIE_IpuSyncOp: AIEX_Op<"ipu.sync", []> {
+def AIE_NpuSyncOp: AIEX_Op<"npu.sync", []> {
   let summary = "sync operator";
   let arguments = (
     ins I32Attr:$column,
@@ -624,7 +624,7 @@ def AIE_IpuSyncOp: AIEX_Op<"ipu.sync", []> {
 }
 
 // WRITEBD_EXTEND_SHIMTILE
-def AIE_IpuWriteBdExShimTileOp: AIEX_Op<"ipu.writebd_shimtile", []> {
+def AIE_NpuWriteBdExShimTileOp: AIEX_Op<"npu.writebd_shimtile", []> {
   let summary = "dma operator";
   let arguments = (
     ins I32Attr:$column,
diff --git a/include/aie/Dialect/AIEX/Transforms/AIEXPasses.h b/include/aie/Dialect/AIEX/Transforms/AIEXPasses.h
index f3e4f48ef2..b22b707712 100644
--- a/include/aie/Dialect/AIEX/Transforms/AIEXPasses.h
+++ b/include/aie/Dialect/AIEX/Transforms/AIEXPasses.h
@@ -28,7 +28,7 @@ std::unique_ptr<mlir::OperationPass<AIE::DeviceOp>>
 createAIELowerMulticastPass();
 std::unique_ptr<mlir::OperationPass<AIE::DeviceOp>>
 createAIEBroadcastPacketPass();
-std::unique_ptr<mlir::OperationPass<AIE::DeviceOp>> createAIEDmaToIpuPass();
+std::unique_ptr<mlir::OperationPass<AIE::DeviceOp>> createAIEDmaToNpuPass();
 std::unique_ptr<mlir::OperationPass<mlir::ModuleOp>> createAIEXToStandardPass();
 
 /// Generate the code for registering passes.
diff --git a/include/aie/Dialect/AIEX/Transforms/AIEXPasses.td b/include/aie/Dialect/AIEX/Transforms/AIEXPasses.td
index 911ca71df1..3c4b34a877 100644
--- a/include/aie/Dialect/AIEX/Transforms/AIEXPasses.td
+++ b/include/aie/Dialect/AIEX/Transforms/AIEXPasses.td
@@ -17,7 +17,7 @@ def AIEXToStandard : Pass<"aiex-standard-lowering", "mlir::ModuleOp"> {
   let summary = "Lower AIEX operations";
   let description = [{
 
-    AIEX Ipu Ops are removed.
+    AIEX Npu Ops are removed.
 
   }];
 
@@ -133,13 +133,13 @@ def AIELowerMemcpy : Pass<"aie-lower-memcpy", "AIE::DeviceOp"> {
   ];
 }
 
-def AIEDmaToIpu : Pass<"aie-dma-to-ipu", "AIE::DeviceOp"> {
+def AIEDmaToNpu : Pass<"aie-dma-to-npu", "AIE::DeviceOp"> {
   let summary = "";
   let description = [{
 
   }];
 
-  let constructor = "xilinx::AIEX::createAIEDmaToIpuPass()";
+  let constructor = "xilinx::AIEX::createAIEDmaToNpuPass()";
   let dependentDialects = [
     "mlir::func::FuncDialect",
     "xilinx::AIE::AIEDialect",
diff --git a/include/aie/Targets/AIETargets.h b/include/aie/Targets/AIETargets.h
index b9b960d798..114f9c5335 100644
--- a/include/aie/Targets/AIETargets.h
+++ b/include/aie/Targets/AIETargets.h
@@ -31,9 +31,9 @@ mlir::LogicalResult AIETranslateShimSolution(mlir::ModuleOp module,
                                              llvm::raw_ostream &);
 mlir::LogicalResult AIETranslateGraphXPE(mlir::ModuleOp module,
                                          llvm::raw_ostream &);
-mlir::LogicalResult AIETranslateToIPU(mlir::ModuleOp module,
+mlir::LogicalResult AIETranslateToNPU(mlir::ModuleOp module,
                                       llvm::raw_ostream &output);
-std::vector<uint32_t> AIETranslateToIPU(mlir::ModuleOp);
+std::vector<uint32_t> AIETranslateToNPU(mlir::ModuleOp);
 mlir::LogicalResult AIETranslateToLdScript(mlir::ModuleOp module,
                                            llvm::raw_ostream &output,
                                            int tileCol, int tileRow);
diff --git a/lib/CAPI/Translation.cpp b/lib/CAPI/Translation.cpp
index 67f9cb8947..6488f44357 100644
--- a/lib/CAPI/Translation.cpp
+++ b/lib/CAPI/Translation.cpp
@@ -76,15 +76,15 @@ aieTranslateToCDODirect(MlirOperation moduleOp, MlirStringRef workDirPath,
   return wrap(status);
 }
 
-MlirStringRef aieTranslateToIPU(MlirOperation moduleOp) {
-  std::string ipu;
-  llvm::raw_string_ostream os(ipu);
+MlirStringRef aieTranslateToNPU(MlirOperation moduleOp) {
+  std::string npu;
+  llvm::raw_string_ostream os(npu);
   ModuleOp mod = llvm::cast<ModuleOp>(unwrap(moduleOp));
-  if (failed(AIETranslateToIPU(mod, os)))
+  if (failed(AIETranslateToNPU(mod, os)))
     return mlirStringRefCreate(nullptr, 0);
-  char *cStr = static_cast<char *>(malloc(ipu.size()));
-  ipu.copy(cStr, ipu.size());
-  return mlirStringRefCreate(cStr, ipu.size());
+  char *cStr = static_cast<char *>(malloc(npu.size()));
+  npu.copy(cStr, npu.size());
+  return mlirStringRefCreate(cStr, npu.size());
 }
 
 MlirStringRef aieTranslateToXAIEV2(MlirOperation moduleOp) {
diff --git a/lib/Dialect/AIE/IR/AIEDialect.cpp b/lib/Dialect/AIE/IR/AIEDialect.cpp
index 2f9ab57f92..a9e80b44b1 100644
--- a/lib/Dialect/AIE/IR/AIEDialect.cpp
+++ b/lib/Dialect/AIE/IR/AIEDialect.cpp
@@ -104,7 +104,7 @@ LogicalResult myVerifyOffsetSizeAndStrideOp(OffsetSizeAndStrideOpInterface op) {
 static VC1902TargetModel VC1902model;
 static VE2302TargetModel VE2302model;
 static VE2802TargetModel VE2802model;
-static IPUTargetModel IPUmodel;
+static NPUTargetModel NPUmodel;
 
 const AIETargetModel &getTargetModel(Operation *op) {
   if (auto t = dyn_cast<AIETarget>(op))
@@ -983,8 +983,8 @@ const AIETargetModel &DeviceOp::getTargetModel() {
     return VE2302model;
   case AIEDevice::xcve2802:
     return VE2802model;
-  case AIEDevice::ipu:
-    return IPUmodel;
+  case AIEDevice::npu:
+    return NPUmodel;
   }
   return VC1902model;
 }
diff --git a/lib/Dialect/AIEX/IR/AIEXDialect.cpp b/lib/Dialect/AIEX/IR/AIEXDialect.cpp
index 6363626aa2..f2c9ebc433 100644
--- a/lib/Dialect/AIEX/IR/AIEXDialect.cpp
+++ b/lib/Dialect/AIEX/IR/AIEXDialect.cpp
@@ -64,7 +64,7 @@ LogicalResult AIEX::BroadcastPacketOp::verify() {
   return success();
 }
 
-LogicalResult AIEX::IpuDmaMemcpyNdOp::verify() {
+LogicalResult AIEX::NpuDmaMemcpyNdOp::verify() {
   MemRefType buffer = getMemref().getType();
   if (buffer.getElementTypeBitWidth() != 32)
     return emitOpError("must be used with memref type with element width 32.");
@@ -105,7 +105,7 @@ LogicalResult AIEX::IpuDmaMemcpyNdOp::verify() {
   return success();
 }
 
-LogicalResult AIEX::IpuDmaWaitOp::verify() {
+LogicalResult AIEX::NpuDmaWaitOp::verify() {
   AIE::DeviceOp dev = (*this)->getParentOfType<AIE::DeviceOp>();
   // Some passes (e.g. aie-standard-lowering) use aiex ops outside a DeviceOp,
   // so we can't expect the device to always exist.
@@ -114,7 +114,7 @@ LogicalResult AIEX::IpuDmaWaitOp::verify() {
   return success();
 }
 
-LogicalResult AIEX::IpuShimTilePushQueueOp::verify() {
+LogicalResult AIEX::NpuShimTilePushQueueOp::verify() {
   const auto &targetModel = AIE::getTargetModel(*this);
   auto numBds = targetModel.getNumBDs(0, 0); // assume shim
   if (getBdId() > numBds)
@@ -124,7 +124,7 @@ LogicalResult AIEX::IpuShimTilePushQueueOp::verify() {
   return success();
 }
 
-LogicalResult AIEX::IpuWriteBdExShimTileOp::verify() {
+LogicalResult AIEX::NpuWriteBdExShimTileOp::verify() {
   const auto &targetModel = AIE::getTargetModel(*this);
   auto numBds = targetModel.getNumBDs(0, 0); // assume shim
   if (getBdId() > numBds)
diff --git a/lib/Dialect/AIEX/Transforms/AIEDmaToIpu.cpp b/lib/Dialect/AIEX/Transforms/AIEDmaToNpu.cpp
similarity index 87%
rename from lib/Dialect/AIEX/Transforms/AIEDmaToIpu.cpp
rename to lib/Dialect/AIEX/Transforms/AIEDmaToNpu.cpp
index 3841f73bf5..7239fbf5a0 100644
--- a/lib/Dialect/AIEX/Transforms/AIEDmaToIpu.cpp
+++ b/lib/Dialect/AIEX/Transforms/AIEDmaToNpu.cpp
@@ -1,4 +1,4 @@
-//===- AIEDmaToIpu.cpp ------------------------------------------*- C++ -*-===//
+//===- AIEDmaToNpu.cpp ------------------------------------------*- C++ -*-===//
 //
 // This file is licensed under the Apache License v2.0 with LLVM Exceptions.
 // See https://llvm.org/LICENSE.txt for license information.
@@ -20,14 +20,14 @@ using namespace mlir;
 using namespace xilinx;
 using namespace xilinx::AIEX;
 
-struct RtpToIpuPattern : OpConversionPattern<IpuWriteRTPOp> {
+struct RtpToNpuPattern : OpConversionPattern<NpuWriteRTPOp> {
   using OpConversionPattern::OpConversionPattern;
 
-  RtpToIpuPattern(MLIRContext *context, PatternBenefit benefit = 1)
+  RtpToNpuPattern(MLIRContext *context, PatternBenefit benefit = 1)
       : OpConversionPattern(context, benefit) {}
 
   LogicalResult
-  matchAndRewrite(IpuWriteRTPOp op, OpAdaptor adaptor,
+  matchAndRewrite(NpuWriteRTPOp op, OpAdaptor adaptor,
                   ConversionPatternRewriter &rewriter) const override {
     auto ctx = op->getContext();
     auto i32ty = IntegerType::get(ctx, 32);
@@ -59,7 +59,7 @@ struct RtpToIpuPattern : OpConversionPattern<IpuWriteRTPOp> {
     IntegerAttr row = IntegerAttr::get(i32ty, r);
     IntegerAttr address = IntegerAttr::get(ui32ty, rtp_buffer_addr);
     IntegerAttr value = IntegerAttr::get(i32ty, v);
-    rewriter.create<IpuWrite32Op>(op->getLoc(), column.getInt(), row.getInt(),
+    rewriter.create<NpuWrite32Op>(op->getLoc(), column.getInt(), row.getInt(),
                                   address.getUInt(), value.getInt());
 
     rewriter.eraseOp(op);
@@ -81,14 +81,14 @@ getAllocOpForSymbol(AIE::DeviceOp dev, StringRef sym_name) {
   return std::nullopt;
 }
 
-struct PushToIpuPattern : OpConversionPattern<IpuShimTilePushQueueOp> {
+struct PushToNpuPattern : OpConversionPattern<NpuShimTilePushQueueOp> {
   using OpConversionPattern::OpConversionPattern;
 
-  PushToIpuPattern(MLIRContext *context, PatternBenefit benefit = 1)
+  PushToNpuPattern(MLIRContext *context, PatternBenefit benefit = 1)
       : OpConversionPattern(context, benefit) {}
 
   LogicalResult
-  matchAndRewrite(IpuShimTilePushQueueOp op, OpAdaptor adaptor,
+  matchAndRewrite(NpuShimTilePushQueueOp op, OpAdaptor adaptor,
                   ConversionPatternRewriter &rewriter) const override {
     auto ctx = op->getContext();
     auto i32ty = IntegerType::get(ctx, 32);
@@ -134,7 +134,7 @@ struct PushToIpuPattern : OpConversionPattern<IpuShimTilePushQueueOp> {
       cmd |= 0x80000000;
     IntegerAttr value = IntegerAttr::get(ui32ty, cmd);
 
-    rewriter.create<IpuWrite32Op>(op->getLoc(), column.getInt(), zero.getInt(),
+    rewriter.create<NpuWrite32Op>(op->getLoc(), column.getInt(), zero.getInt(),
                                   address.getUInt(), value.getUInt());
 
     rewriter.eraseOp(op);
@@ -142,14 +142,14 @@ struct PushToIpuPattern : OpConversionPattern<IpuShimTilePushQueueOp> {
   }
 };
 
-struct DmaToIpuPattern : OpConversionPattern<IpuDmaMemcpyNdOp> {
+struct DmaToNpuPattern : OpConversionPattern<NpuDmaMemcpyNdOp> {
   using OpConversionPattern::OpConversionPattern;
 
-  DmaToIpuPattern(MLIRContext *context, PatternBenefit benefit = 1)
+  DmaToNpuPattern(MLIRContext *context, PatternBenefit benefit = 1)
       : OpConversionPattern(context, benefit) {}
 
   LogicalResult
-  matchAndRewrite(IpuDmaMemcpyNdOp op, OpAdaptor adaptor,
+  matchAndRewrite(NpuDmaMemcpyNdOp op, OpAdaptor adaptor,
                   ConversionPatternRewriter &rewriter) const override {
     auto ctx = op->getContext();
     auto i32ty = IntegerType::get(ctx, 32);
@@ -320,14 +320,14 @@ struct DmaToIpuPattern : OpConversionPattern<IpuDmaMemcpyNdOp> {
     if (!isMM2S)
       issue_token = BoolAttr::get(ctx, true);
 
-    (void)rewriter.create<IpuWriteBdExShimTileOp>(
+    (void)rewriter.create<NpuWriteBdExShimTileOp>(
         op->getLoc(), column, column_num, ddr_id, bd_id, buffer_length,
         buffer_offset, enable_packet, out_of_order_id, packet_id, packet_type,
         d0_size, d0_stride, d1_size, d1_stride, d2_stride, iteration_current,
         iteration_size, iteration_stride, next_bd, use_next_bd, valid_bd,
         lock_rel_val, lock_rel_id, lock_acq_enable, lock_acq_val, lock_acq_id);
 
-    rewriter.create<IpuShimTilePushQueueOp>(op->getLoc(), op.getMetadataAttr(),
+    rewriter.create<NpuShimTilePushQueueOp>(op->getLoc(), op.getMetadataAttr(),
                                             issue_token, repeat_count, bd_id);
 
     rewriter.eraseOp(op);
@@ -335,17 +335,17 @@ struct DmaToIpuPattern : OpConversionPattern<IpuDmaMemcpyNdOp> {
   }
 };
 
-/// Convert IpuDmaWaitOp into IpuSyncOp by retrieving the necessary
+/// Convert NpuDmaWaitOp into NpuSyncOp by retrieving the necessary
 /// information from the ShimDMAAllocationOp referenced through the
 /// symbol argument of this op.
-struct DmaWaitToIpuPattern : OpConversionPattern<IpuDmaWaitOp> {
+struct DmaWaitToNpuPattern : OpConversionPattern<NpuDmaWaitOp> {
   using OpConversionPattern::OpConversionPattern;
 
-  DmaWaitToIpuPattern(MLIRContext *context, PatternBenefit benefit = 1)
+  DmaWaitToNpuPattern(MLIRContext *context, PatternBenefit benefit = 1)
       : OpConversionPattern(context, benefit) {}
 
   LogicalResult
-  matchAndRewrite(IpuDmaWaitOp op, OpAdaptor adaptor,
+  matchAndRewrite(NpuDmaWaitOp op, OpAdaptor adaptor,
                   ConversionPatternRewriter &rewriter) const override {
     AIE::DeviceOp dev = op->getParentOfType<AIE::DeviceOp>();
     if (!dev)
@@ -364,13 +364,13 @@ struct DmaWaitToIpuPattern : OpConversionPattern<IpuDmaWaitOp> {
 
     // Create with `column_num == 1` and `row_num == 1` to check for a single
     // column and row. Row is always 0 for shim tiles.
-    (void)rewriter.replaceOpWithNewOp<IpuSyncOp>(op, column, 0, direction,
+    (void)rewriter.replaceOpWithNewOp<NpuSyncOp>(op, column, 0, direction,
                                                  channel, 1, 1);
     return success();
   }
 };
 
-struct AIEDmaToIpuPass : AIEDmaToIpuBase<AIEDmaToIpuPass> {
+struct AIEDmaToNpuPass : AIEDmaToNpuBase<AIEDmaToNpuPass> {
   void runOnOperation() override {
 
     AIE::DeviceOp device = getOperation();
@@ -379,22 +379,22 @@ struct AIEDmaToIpuPass : AIEDmaToIpuBase<AIEDmaToIpuPass> {
     target.addLegalDialect<AIEXDialect>();
     target.addLegalOp<AIE::BufferOp>();
     target.addLegalOp<AIE::ShimDMAAllocationOp>();
-    target.addIllegalOp<IpuWriteRTPOp>();
-    target.addIllegalOp<IpuDmaMemcpyNdOp>();
-    target.addIllegalOp<IpuDmaWaitOp>();
-    target.addIllegalOp<IpuShimTilePushQueueOp>();
+    target.addIllegalOp<NpuWriteRTPOp>();
+    target.addIllegalOp<NpuDmaMemcpyNdOp>();
+    target.addIllegalOp<NpuDmaWaitOp>();
+    target.addIllegalOp<NpuShimTilePushQueueOp>();
 
     RewritePatternSet patterns(&getContext());
-    patterns.insert<DmaToIpuPattern>(&getContext());
-    patterns.insert<DmaWaitToIpuPattern>(&getContext());
-    patterns.insert<PushToIpuPattern>(&getContext());
-    patterns.insert<RtpToIpuPattern>(&getContext());
+    patterns.insert<DmaToNpuPattern>(&getContext());
+    patterns.insert<DmaWaitToNpuPattern>(&getContext());
+    patterns.insert<PushToNpuPattern>(&getContext());
+    patterns.insert<RtpToNpuPattern>(&getContext());
 
     if (failed(applyPartialConversion(device, target, std::move(patterns))))
       signalPassFailure();
   }
 };
 
-std::unique_ptr<OperationPass<AIE::DeviceOp>> AIEX::createAIEDmaToIpuPass() {
-  return std::make_unique<AIEDmaToIpuPass>();
+std::unique_ptr<OperationPass<AIE::DeviceOp>> AIEX::createAIEDmaToNpuPass() {
+  return std::make_unique<AIEDmaToNpuPass>();
 }
diff --git a/lib/Dialect/AIEX/Transforms/AIEXToStandard.cpp b/lib/Dialect/AIEX/Transforms/AIEXToStandard.cpp
index 40609c18e5..b7aa242134 100644
--- a/lib/Dialect/AIEX/Transforms/AIEXToStandard.cpp
+++ b/lib/Dialect/AIEX/Transforms/AIEXToStandard.cpp
@@ -47,14 +47,14 @@ struct AIEXToStandardPass : AIEXToStandardBase<AIEXToStandardPass> {
     ModuleOp m = getOperation();
     ConversionTarget target(getContext());
     RewritePatternSet removepatterns(&getContext());
-    removepatterns.add<AIEXOpRemoval<IpuDmaMemcpyNdOp>>(m.getContext(), m);
-    removepatterns.add<AIEXOpRemoval<IpuDmaWaitOp>>(m.getContext(), m);
-    removepatterns.add<AIEXOpRemoval<IpuShimTilePushQueueOp>>(m.getContext(),
+    removepatterns.add<AIEXOpRemoval<NpuDmaMemcpyNdOp>>(m.getContext(), m);
+    removepatterns.add<AIEXOpRemoval<NpuDmaWaitOp>>(m.getContext(), m);
+    removepatterns.add<AIEXOpRemoval<NpuShimTilePushQueueOp>>(m.getContext(),
                                                               m);
-    removepatterns.add<AIEXOpRemoval<IpuWriteRTPOp>>(m.getContext(), m);
-    removepatterns.add<AIEXOpRemoval<IpuWrite32Op>>(m.getContext(), m);
-    removepatterns.add<AIEXOpRemoval<IpuSyncOp>>(m.getContext(), m);
-    removepatterns.add<AIEXOpRemoval<IpuWriteBdExShimTileOp>>(m.getContext(),
+    removepatterns.add<AIEXOpRemoval<NpuWriteRTPOp>>(m.getContext(), m);
+    removepatterns.add<AIEXOpRemoval<NpuWrite32Op>>(m.getContext(), m);
+    removepatterns.add<AIEXOpRemoval<NpuSyncOp>>(m.getContext(), m);
+    removepatterns.add<AIEXOpRemoval<NpuWriteBdExShimTileOp>>(m.getContext(),
                                                               m);
 
     if (failed(applyPartialConversion(m, target, std::move(removepatterns))))
diff --git a/lib/Dialect/AIEX/Transforms/CMakeLists.txt b/lib/Dialect/AIEX/Transforms/CMakeLists.txt
index 72cec940f7..3cfcac793d 100644
--- a/lib/Dialect/AIEX/Transforms/CMakeLists.txt
+++ b/lib/Dialect/AIEX/Transforms/CMakeLists.txt
@@ -13,7 +13,7 @@ add_mlir_dialect_library(AIEXTransforms
   AIECreateBroadcastPacket.cpp
   AIELowerMulticast.cpp
   AIELowerMemcpy.cpp
-  AIEDmaToIpu.cpp
+  AIEDmaToNpu.cpp
   ADDITIONAL_HEADER_DIRS
   ${AIE_BINARY_DIR}/include
 
diff --git a/lib/Targets/AIETargetCDODirect.cpp b/lib/Targets/AIETargetCDODirect.cpp
index b4778e22c4..5f3b94af5f 100644
--- a/lib/Targets/AIETargetCDODirect.cpp
+++ b/lib/Targets/AIETargetCDODirect.cpp
@@ -265,7 +265,7 @@ LogicalResult configureBdInBlock(XAie_DevInst &devInst, XAie_DmaDesc &dmaTileBd,
     // write them out like this so they show up with names in debug prints
     size_t smid = 0;
     size_t burstLen = 16; // (10):BLEN=16 (256Byte) (corresponds to
-                          // 0x800000000 from targetipu)
+                          // 0x800000000 from targetnpu)
     size_t qOs = 0;
     size_t cache = 0;
     size_t secure = 0;
@@ -559,8 +559,8 @@ struct AIEControl {
       int32_t col = switchboxOp.colIndex();
       int32_t row = switchboxOp.rowIndex();
       XAie_LocType tileLoc = XAie_TileLoc(col, row);
-      assert(targetOp.getDevice() == AIEDevice::ipu &&
-             "Only IPU currently supported");
+      assert(targetOp.getDevice() == AIEDevice::npu &&
+             "Only NPU currently supported");
       if (row == 0) {
         // FIXME hack for TCT routing
         // TODO Support both channels
@@ -780,9 +780,9 @@ LogicalResult AIETranslateToCDODirect(ModuleOp m, llvm::StringRef workDirPath,
          "only exactly 1 device op supported.");
   DeviceOp targetOp = *devOps.begin();
   // things like XAIE_MEM_TILE_ROW_START and the missing
-  // shim dma on tile (0,0) are hard-coded assumptions about IPU...
-  assert(targetOp.getDevice() == AIEDevice::ipu &&
-         "Only IPU currently supported");
+  // shim dma on tile (0,0) are hard-coded assumptions about NPU...
+  assert(targetOp.getDevice() == AIEDevice::npu &&
+         "Only NPU currently supported");
   int maxCol = 0, minCol = 0;
   for (auto tileOp : targetOp.getOps<TileOp>()) {
     minCol = std::min(tileOp.getCol(), minCol);
diff --git a/lib/Targets/AIETargetHSA.cpp b/lib/Targets/AIETargetHSA.cpp
index 36701fc96d..098a1b5dac 100644
--- a/lib/Targets/AIETargetHSA.cpp
+++ b/lib/Targets/AIETargetHSA.cpp
@@ -14,7 +14,7 @@
 #include "aie/Dialect/AIEX/IR/AIEXDialect.h"
 #include "aie/Targets/AIETargets.h"
 
-#include "mlir/Dialect/Func/IR/FuncOps.h" // Eddie added to get the IPU func ops
+#include "mlir/Dialect/Func/IR/FuncOps.h" // Eddie added to get the NPU func ops
 #include "mlir/IR/Attributes.h"
 #include "mlir/IR/IRMapping.h"
 #include "mlir/Pass/Pass.h"
@@ -95,7 +95,7 @@ mlir::LogicalResult AIETranslateToHSA(ModuleOp module, raw_ostream &output) {
   // Looping over every Memcpy operation so we take the correct number of
   // buffers
   int num_ops = 0;
-  for (auto op : funcOp.getOps<IpuDmaMemcpyNdOp>()) {
+  for (auto op : funcOp.getOps<NpuDmaMemcpyNdOp>()) {
     // Getting the IDs of the buffers
     auto memref = op.getMemref();
     Block &entryBB = op->getParentOfType<func::FuncOp>().getBody().front();
@@ -117,7 +117,7 @@ mlir::LogicalResult AIETranslateToHSA(ModuleOp module, raw_ostream &output) {
   output << "\tuint64_t packet_id = 0;\n";
 
   int op_count = 0;
-  for (auto op : funcOp.getOps<IpuDmaMemcpyNdOp>()) {
+  for (auto op : funcOp.getOps<NpuDmaMemcpyNdOp>()) {
     auto dev = funcOp->getParentOfType<AIE::DeviceOp>();
     if (!dev) {
       op.emitOpError("couldn't get DeviceOp");
diff --git a/lib/Targets/AIETargetIPU.cpp b/lib/Targets/AIETargetNPU.cpp
similarity index 88%
rename from lib/Targets/AIETargetIPU.cpp
rename to lib/Targets/AIETargetNPU.cpp
index 6117bfdb40..7f17c3ad83 100644
--- a/lib/Targets/AIETargetIPU.cpp
+++ b/lib/Targets/AIETargetNPU.cpp
@@ -1,4 +1,4 @@
-//===- AIETargetIPU.cpp -----------------------------------------*- C++ -*-===//
+//===- AIETargetNPU.cpp -----------------------------------------*- C++ -*-===//
 //
 // This file is licensed under the Apache License v2.0 with LLVM Exceptions.
 // See https://llvm.org/LICENSE.txt for license information.
@@ -50,7 +50,7 @@ reserveAndGetTail(std::vector<uint32_t> &instructions, uint64_t tailSize) {
                                          tailSize);
 }
 
-void appendSync(std::vector<uint32_t> &instructions, IpuSyncOp op) {
+void appendSync(std::vector<uint32_t> &instructions, NpuSyncOp op) {
 
   auto words = reserveAndGetTail(instructions, 2);
 
@@ -65,7 +65,7 @@ void appendSync(std::vector<uint32_t> &instructions, IpuSyncOp op) {
   words[1] |= (op.getRowNum() & 0xff) << 8;
 }
 
-void appendWrite32(std::vector<uint32_t> &instructions, IpuWrite32Op op) {
+void appendWrite32(std::vector<uint32_t> &instructions, NpuWrite32Op op) {
 
   auto words = reserveAndGetTail(instructions, 3);
 
@@ -80,7 +80,7 @@ void appendWrite32(std::vector<uint32_t> &instructions, IpuWrite32Op op) {
 }
 
 void appendWriteBdShimTile(std::vector<uint32_t> &instructions,
-                           IpuWriteBdExShimTileOp op) {
+                           NpuWriteBdExShimTileOp op) {
 
   auto words = reserveAndGetTail(instructions, 10);
 
@@ -131,7 +131,7 @@ void appendWriteBdShimTile(std::vector<uint32_t> &instructions,
 
 } // namespace
 
-std::vector<uint32_t> xilinx::AIE::AIETranslateToIPU(ModuleOp module) {
+std::vector<uint32_t> xilinx::AIE::AIETranslateToNPU(ModuleOp module) {
 
   std::vector<uint32_t> instructions = getProlog();
 
@@ -143,9 +143,9 @@ std::vector<uint32_t> xilinx::AIE::AIETranslateToIPU(ModuleOp module) {
     Block &entry = f.getRegion().front();
     for (auto &o : entry) {
       llvm::TypeSwitch<Operation *>(&o)
-          .Case<IpuSyncOp>([&](auto op) { appendSync(instructions, op); })
-          .Case<IpuWrite32Op>([&](auto op) { appendWrite32(instructions, op); })
-          .Case<IpuWriteBdExShimTileOp>(
+          .Case<NpuSyncOp>([&](auto op) { appendSync(instructions, op); })
+          .Case<NpuWrite32Op>([&](auto op) { appendWrite32(instructions, op); })
+          .Case<NpuWriteBdExShimTileOp>(
               [&](auto op) { appendWriteBdShimTile(instructions, op); });
     }
   }
@@ -153,9 +153,9 @@ std::vector<uint32_t> xilinx::AIE::AIETranslateToIPU(ModuleOp module) {
   return instructions;
 }
 
-LogicalResult xilinx::AIE::AIETranslateToIPU(ModuleOp module,
+LogicalResult xilinx::AIE::AIETranslateToNPU(ModuleOp module,
                                              raw_ostream &output) {
-  auto instructions = AIETranslateToIPU(module);
+  auto instructions = AIETranslateToNPU(module);
   for (auto w : instructions)
     output << llvm::format("%08X\n", w);
   return success();
diff --git a/lib/Targets/AIETargets.cpp b/lib/Targets/AIETargets.cpp
index 05d79e3eaa..0caa039ef0 100644
--- a/lib/Targets/AIETargets.cpp
+++ b/lib/Targets/AIETargets.cpp
@@ -320,10 +320,10 @@ void registerAIETranslations() {
             cdoAieSim, cdoXaieDebug, cdoPartitionStartCol, cdoEnableCores);
       },
       registerDialects);
-  TranslateFromMLIRRegistration registrationIPU(
-      "aie-ipu-instgen", "Generate instructions for IPU",
+  TranslateFromMLIRRegistration registrationNPU(
+      "aie-npu-instgen", "Generate instructions for NPU",
       [](ModuleOp module, raw_ostream &output) {
-        return AIETranslateToIPU(module, output);
+        return AIETranslateToNPU(module, output);
       },
       registerDialects);
 }
diff --git a/lib/Targets/CMakeLists.txt b/lib/Targets/CMakeLists.txt
index 85b3405ede..e6e4307c2c 100644
--- a/lib/Targets/CMakeLists.txt
+++ b/lib/Targets/CMakeLists.txt
@@ -11,7 +11,7 @@ add_mlir_library(AIETargets
   AIETargets.cpp
   AIETargetBCF.cpp
   AIETargetCDODirect.cpp
-  AIETargetIPU.cpp
+  AIETargetNPU.cpp
   AIETargetLdScript.cpp
   AIETargetXAIEV2.cpp
   AIETargetHSA.cpp
diff --git a/programming_examples/basic/matrix_add_one/Makefile b/programming_examples/basic/matrix_add_one/Makefile
index 435b7b8c9e..83014fbeaf 100644
--- a/programming_examples/basic/matrix_add_one/Makefile
+++ b/programming_examples/basic/matrix_add_one/Makefile
@@ -13,15 +13,15 @@ ACDC_AIE = $(dir $(shell which aie-opt))/..
 SHELL := /bin/bash
 
 targetname = matrixAddOne
-devicename = ipu
+devicename = npu
 col = 0
 
 all: build/final.xclbin
 
 build/final.xclbin: build/aie.mlir
 	mkdir -p ${@D}
-	cd ${@D} && aiecc.py --aie-generate-cdo --aie-generate-ipu --no-compile-host \
-		--xclbin-name=${@F} --ipu-insts-name=insts.txt ${<F}
+	cd ${@D} && aiecc.py --aie-generate-cdo --aie-generate-npu --no-compile-host \
+		--xclbin-name=${@F} --npu-insts-name=insts.txt ${<F}
 
 ${targetname}.exe: test.cpp
 	rm -rf _build
diff --git a/programming_examples/basic/matrix_add_one/README.md b/programming_examples/basic/matrix_add_one/README.md
index 22afedbfc1..8516bdfe47 100644
--- a/programming_examples/basic/matrix_add_one/README.md
+++ b/programming_examples/basic/matrix_add_one/README.md
@@ -10,11 +10,11 @@
 
 # <ins>Matrix Addition</ins>
 
-Single tile performs a very simple `+` operation where the kernel loads data from local memory, increments the value by `1` and stores it back. The DMA in the Shim tile is programmed to bring the bottom left `8x16` portion of a larger `16x128` matrix into the tile to perform the operation. This reference design can be run on either a RyzenAI IPU or a VCK5000.
+Single tile performs a very simple `+` operation where the kernel loads data from local memory, increments the value by `1` and stores it back. The DMA in the Shim tile is programmed to bring the bottom left `8x16` portion of a larger `16x128` matrix into the tile to perform the operation. This reference design can be run on either a RyzenAI NPU or a VCK5000.
 
-The kernel executes on AIE tile (`col`, 2). Input data is brought to the local memory of the tile from Shim tile (`col`, 0). The value of `col` is dependent on whether the application is targetting IPU or VCK5000. The Shim tile is programmed with a 2D DMA to only bring a 2D submatrix into the AIE tile for processing. 
+The kernel executes on AIE tile (`col`, 2). Input data is brought to the local memory of the tile from Shim tile (`col`, 0). The value of `col` is dependent on whether the application is targeting NPU or VCK5000. The Shim tile is programmed with a 2D DMA to only bring a 2D submatrix into the AIE tile for processing. 
 
-To compile and run the design for IPU:
+To compile and run the design for NPU:
 ```
 make
 make run
diff --git a/programming_examples/basic/matrix_add_one/aie2.py b/programming_examples/basic/matrix_add_one/aie2.py
index 36eb3a3d38..a80ba794e6 100644
--- a/programming_examples/basic/matrix_add_one/aie2.py
+++ b/programming_examples/basic/matrix_add_one/aie2.py
@@ -35,8 +35,8 @@ def my_matrix_add_one():
         if len(sys.argv) != 3:
             raise ValueError("[ERROR] Need 2 command line arguments (Device name, Col)")
 
-        if sys.argv[1] == "ipu":
-            dev = AIEDevice.ipu
+        if sys.argv[1] == "npu":
+            dev = AIEDevice.npu
         elif sys.argv[1] == "xcvc1902":
             dev = AIEDevice.xcvc1902
         else:
@@ -85,21 +85,21 @@ def core_body():
 
             @FuncOp.from_py_func(tensor_ty, tensor_ty, tensor_ty)
             def sequence(inTensor, notUsed, outTensor):
-                ipu_dma_memcpy_nd(
+                npu_dma_memcpy_nd(
                     metadata="out0",
                     bd_id=0,
                     mem=outTensor,
                     sizes=[1, 1, TILE_HEIGHT, TILE_WIDTH],
                     strides=[1, 1, IMAGE_WIDTH],
                 )
-                ipu_dma_memcpy_nd(
+                npu_dma_memcpy_nd(
                     metadata="in0",
                     bd_id=1,
                     mem=inTensor,
                     sizes=[1, 1, TILE_HEIGHT, TILE_WIDTH],
                     strides=[1, 1, IMAGE_WIDTH],
                 )
-                ipu_sync(column=0, row=0, direction=0, channel=0)
+                npu_sync(column=0, row=0, direction=0, channel=0)
 
     print(ctx.module)
 
diff --git a/programming_examples/basic/matrix_add_one/run.lit b/programming_examples/basic/matrix_add_one/run.lit
index a429e99221..1922c01828 100644
--- a/programming_examples/basic/matrix_add_one/run.lit
+++ b/programming_examples/basic/matrix_add_one/run.lit
@@ -3,9 +3,9 @@
 //
 // REQUIRES: ryzen_ai
 //
-// RUN: %python %S/aie2.py ipu 0 > ./aie.mlir
-// RUN: %python aiecc.py --aie-generate-cdo --aie-generate-ipu --no-compile-host --xclbin-name=aie.xclbin --ipu-insts-name=insts.txt ./aie.mlir
+// RUN: %python %S/aie2.py npu 0 > ./aie.mlir
+// RUN: %python aiecc.py --aie-generate-cdo --aie-generate-npu --no-compile-host --xclbin-name=aie.xclbin --npu-insts-name=insts.txt ./aie.mlir
 // RUN: clang %S/test.cpp -o test.exe -std=c++11 -Wall %xrt_flags -lrt -lstdc++ -lboost_program_options -lboost_filesystem
-// RUN: %run_on_ipu ./test.exe -x aie.xclbin -k MLIR_AIE -i insts.txt | FileCheck %s
+// RUN: %run_on_npu ./test.exe -x aie.xclbin -k MLIR_AIE -i insts.txt | FileCheck %s
 // CHECK: PASS!
 
diff --git a/programming_examples/basic/matrix_multiplication/CMakeLists.txt b/programming_examples/basic/matrix_multiplication/CMakeLists.txt
index dfe345e188..0f062b0322 100644
--- a/programming_examples/basic/matrix_multiplication/CMakeLists.txt
+++ b/programming_examples/basic/matrix_multiplication/CMakeLists.txt
@@ -27,7 +27,7 @@ if (NOT WSL)
 else()
     set(BOOST_ROOT C:/Technical/thirdParty/boost_1_83_0 CACHE STRING "Path to Boost install")
     set(XRT_INC_DIR C:/Technical/XRT/src/runtime_src/core/include CACHE STRING "Path to XRT cloned repo")
-    set(XRT_LIB_DIR C:/Technical/xrtIPUfromDLL CACHE STRING "Path to xrt_coreutil.lib")
+    set(XRT_LIB_DIR C:/Technical/xrtNPUfromDLL CACHE STRING "Path to xrt_coreutil.lib")
 endif()
 
 set(TARGET_NAME test CACHE STRING "Target to be built")
diff --git a/programming_examples/basic/matrix_multiplication/makefile-common b/programming_examples/basic/matrix_multiplication/makefile-common
index fd6a438ea0..6149657e1b 100644
--- a/programming_examples/basic/matrix_multiplication/makefile-common
+++ b/programming_examples/basic/matrix_multiplication/makefile-common
@@ -60,7 +60,7 @@ ${mlir_target}: aie2.py
 ${xclbin_target}: ${mlir_target} ${kernels:%=build/%.o}
 	mkdir -p ${@D}
 	cd ${@D} && aiecc.py --aie-generate-cdo --no-compile-host --xclbin-name=${@F} \
-				--aie-generate-ipu --ipu-insts-name=${insts_target:build/%=%} $(<:%=../%)
+				--aie-generate-npu --npu-insts-name=${insts_target:build/%=%} $(<:%=../%)
 
 ${targetname}.exe: test.cpp ../test.cpp ../common.h
 	rm -rf _build
diff --git a/programming_examples/basic/matrix_multiplication/matrix_vector/aie2.py b/programming_examples/basic/matrix_multiplication/matrix_vector/aie2.py
index 80b5c89613..4ac31574fd 100644
--- a/programming_examples/basic/matrix_multiplication/matrix_vector/aie2.py
+++ b/programming_examples/basic/matrix_multiplication/matrix_vector/aie2.py
@@ -42,7 +42,7 @@ def my_matmul():
 
     with mlir_mod_ctx() as ctx:
 
-        @device(AIEDevice.ipu)
+        @device(AIEDevice.npu)
         def device_body():
             memRef_inA_ty = T.memref(m * k, T.bf16())
             memRef_inB_ty = T.memref(k, T.bf16())
@@ -176,7 +176,7 @@ def core_body():
                 T.memref(C_sz_in_i32s, T.i32()),
             )
             def sequence(A, B, C):
-                ipu_dma_memcpy_nd(
+                npu_dma_memcpy_nd(
                     metadata=inB_fifo_names[0],
                     bd_id=2,
                     mem=B,
@@ -186,7 +186,7 @@ def sequence(A, B, C):
                 for i in range(n_cores):
                     A_offset = i * M_div_m_div_n_cores * m * K * word_size_in // 4
                     C_offset = i * M_div_m_div_n_cores * m * word_size_out // 4
-                    ipu_dma_memcpy_nd(
+                    npu_dma_memcpy_nd(
                         metadata=memA_fifo_names[i],
                         bd_id=1,
                         mem=A,
@@ -194,7 +194,7 @@ def sequence(A, B, C):
                         sizes=[M_div_m_div_n_cores, K_div_k, m, k_in_i32s],
                         strides=[m_x_K_in_i32s, k_in_i32s, K_in_i32s],
                     )
-                    ipu_dma_memcpy_nd(
+                    npu_dma_memcpy_nd(
                         metadata=outC_fifo_names[i],
                         bd_id=0,
                         mem=C,
@@ -204,7 +204,7 @@ def sequence(A, B, C):
                     )
 
                 for i in range(n_cores):
-                    ipu_sync(column=i, row=0, direction=0, channel=0)
+                    npu_sync(column=i, row=0, direction=0, channel=0)
 
     print(ctx.module)
 
diff --git a/programming_examples/basic/matrix_multiplication/matrix_vector/run.lit b/programming_examples/basic/matrix_multiplication/matrix_vector/run.lit
index d446e4f966..eeaa69352a 100644
--- a/programming_examples/basic/matrix_multiplication/matrix_vector/run.lit
+++ b/programming_examples/basic/matrix_multiplication/matrix_vector/run.lit
@@ -5,8 +5,8 @@
 //
 // RUN: xchesscc_wrapper aie2 -I %aietools/include -c %S/../../../../aie_kernels/aie2/mv.cc -o ./mv.o
 // RUN: %python %S/aie2.py -M 288 -K 288 -N 1 > ./aie.mlir
-// RUN: %python aiecc.py --aie-generate-cdo --aie-generate-ipu --no-compile-host --xclbin-name=aie.xclbin --ipu-insts-name=insts.txt ./aie.mlir
+// RUN: %python aiecc.py --aie-generate-cdo --aie-generate-npu --no-compile-host --xclbin-name=aie.xclbin --npu-insts-name=insts.txt ./aie.mlir
 // RUN: g++-13 %S/test.cpp -o test.exe -std=c++23 -Wall %xrt_flags -lrt -lstdc++ -lboost_program_options -lboost_filesystem
-// RUN: %run_on_ipu ./test.exe -x aie.xclbin -k MLIR_AIE -i insts.txt -M 288 -K 288 -N 1 -v 1 | FileCheck %s
+// RUN: %run_on_npu ./test.exe -x aie.xclbin -k MLIR_AIE -i insts.txt -M 288 -K 288 -N 1 -v 1 | FileCheck %s
 // CHECK: PASS!
 
diff --git a/programming_examples/basic/matrix_multiplication/single_core/aie2.py b/programming_examples/basic/matrix_multiplication/single_core/aie2.py
index ef94adc74a..e00534e708 100644
--- a/programming_examples/basic/matrix_multiplication/single_core/aie2.py
+++ b/programming_examples/basic/matrix_multiplication/single_core/aie2.py
@@ -55,7 +55,7 @@ def my_matmul():
 
     with mlir_mod_ctx() as ctx:
 
-        @device(AIEDevice.ipu)
+        @device(AIEDevice.npu)
         def device_body():
             memref_a_ty = T.memref(m, k, T.bf16())
             memref_b_ty = T.memref(k, n, T.bf16())
@@ -195,7 +195,7 @@ def sequence(A, B, C):
                     num_tile_rows = min(
                         [rows_per_block, M_div_m - tile_row_block * rows_per_block]
                     )
-                    ipu_dma_memcpy_nd(
+                    npu_dma_memcpy_nd(
                         metadata="outC",
                         bd_id=0,
                         mem=C,
@@ -211,7 +211,7 @@ def sequence(A, B, C):
                             * word_size_in
                             // 4
                         )
-                        ipu_dma_memcpy_nd(
+                        npu_dma_memcpy_nd(
                             metadata="inA",
                             bd_id=2 * tile_row + 1,
                             mem=A,
@@ -219,7 +219,7 @@ def sequence(A, B, C):
                             sizes=[N_div_n, K_div_k, m, k_in_i32s],
                             strides=[0, k_in_i32s, K_in_i32s],
                         )
-                        ipu_dma_memcpy_nd(
+                        npu_dma_memcpy_nd(
                             metadata="inB",
                             bd_id=2 * tile_row + 2,
                             mem=B,
@@ -227,7 +227,7 @@ def sequence(A, B, C):
                             strides=[n_in_i32s, k_x_N_in_i32s, N_in_i32s],
                         )
 
-                    ipu_sync(column=0, row=0, direction=0, channel=0)
+                    npu_sync(column=0, row=0, direction=0, channel=0)
 
     print(ctx.module)
 
diff --git a/programming_examples/basic/matrix_multiplication/single_core/run.lit b/programming_examples/basic/matrix_multiplication/single_core/run.lit
index 0209415093..6f6a32320a 100644
--- a/programming_examples/basic/matrix_multiplication/single_core/run.lit
+++ b/programming_examples/basic/matrix_multiplication/single_core/run.lit
@@ -5,7 +5,7 @@
 //
 // RUN: xchesscc_wrapper aie2 -I %aietools/include -c %S/../../../../aie_kernels/aie2/mm.cc -o ./mm.o
 // RUN: %python %S/aie2.py -M 256 -K 256 -N 256 > ./aie.mlir
-// RUN: %python aiecc.py --xbridge --aie-generate-cdo --aie-generate-ipu --no-compile-host --xclbin-name=aie.xclbin --ipu-insts-name=insts.txt ./aie.mlir
+// RUN: %python aiecc.py --xbridge --aie-generate-cdo --aie-generate-npu --no-compile-host --xclbin-name=aie.xclbin --npu-insts-name=insts.txt ./aie.mlir
 // RUN: g++-13 %S/test.cpp -o test.exe -std=c++23 -Wall %xrt_flags -lrt -lstdc++ -lboost_program_options -lboost_filesystem
-// RUN: %run_on_ipu ./test.exe -x aie.xclbin -k MLIR_AIE -i insts.txt -M 256 -K 256 -N 256 -v 1 | FileCheck %s
+// RUN: %run_on_npu ./test.exe -x aie.xclbin -k MLIR_AIE -i insts.txt -M 256 -K 256 -N 256 -v 1 | FileCheck %s
 // CHECK: PASS!
diff --git a/programming_examples/basic/matrix_multiplication/whole_array/aie2.py b/programming_examples/basic/matrix_multiplication/whole_array/aie2.py
index 69a3c52394..d94a7e8eba 100644
--- a/programming_examples/basic/matrix_multiplication/whole_array/aie2.py
+++ b/programming_examples/basic/matrix_multiplication/whole_array/aie2.py
@@ -74,7 +74,7 @@ def my_matmul(M=512, K=512, N=512):
 
     with mlir_mod_ctx() as ctx:
 
-        @device(AIEDevice.ipu)
+        @device(AIEDevice.npu)
         def device_body():
             memRef_inA_ty = T.memref(m * k, T.bf16())
             memRef_inB_ty = T.memref(k * n, T.bf16())
@@ -317,7 +317,7 @@ def sequence(A, B, C):
                     for i in range(n_cols):
                         C_col_offset = i * n * word_size_out
                         C_offset_in_i32s = (C_col_offset + C_row_offset) // 4
-                        ipu_dma_memcpy_nd(
+                        npu_dma_memcpy_nd(
                             metadata=outC_fifo_names[i],
                             bd_id=0,
                             mem=C,
@@ -345,7 +345,7 @@ def sequence(A, B, C):
                             )
                             A_col_offset_in_i32s = i * m * K * word_size_in // 4
                             B_col_offset_in_i32s = i * n * word_size_in // 4
-                            ipu_dma_memcpy_nd(
+                            npu_dma_memcpy_nd(
                                 metadata=inA_fifo_names[i],
                                 bd_id=2 * tile_row + 1,
                                 mem=A,
@@ -358,7 +358,7 @@ def sequence(A, B, C):
                                 sizes=[N_div_n_div_n_cols, K_div_k, m, k_in_i32s],
                                 strides=[0, k_in_i32s, K_in_i32s],
                             )
-                            ipu_dma_memcpy_nd(
+                            npu_dma_memcpy_nd(
                                 metadata=inB_fifo_names[i],
                                 bd_id=2 * tile_row + 2,
                                 mem=B,
@@ -367,7 +367,7 @@ def sequence(A, B, C):
                                 strides=[n_x_n_cols_in_i32s, k_x_N_in_i32s, N_in_i32s],
                             )
                     for i in range(n_cols):
-                        ipu_sync(column=i, row=0, direction=0, channel=0)
+                        npu_sync(column=i, row=0, direction=0, channel=0)
 
     # print(ctx.module.operation.verify())
     print(ctx.module)
diff --git a/programming_examples/basic/matrix_multiplication/whole_array/run.lit b/programming_examples/basic/matrix_multiplication/whole_array/run.lit
index 202e66b71e..fc23355630 100644
--- a/programming_examples/basic/matrix_multiplication/whole_array/run.lit
+++ b/programming_examples/basic/matrix_multiplication/whole_array/run.lit
@@ -5,8 +5,8 @@
 //
 // RUN: xchesscc_wrapper aie2 -I %aietools/include -c %S/../../../../aie_kernels/aie2/mm.cc -o ./mm.o
 // RUN: %python %S/aie2.py -M 512 -K 512 -N 512 > ./aie.mlir
-// RUN: %python aiecc.py --xbridge --aie-generate-cdo --aie-generate-ipu --no-compile-host --xclbin-name=aie.xclbin --ipu-insts-name=insts.txt ./aie.mlir
+// RUN: %python aiecc.py --xbridge --aie-generate-cdo --aie-generate-npu --no-compile-host --xclbin-name=aie.xclbin --npu-insts-name=insts.txt ./aie.mlir
 // RUN: g++-13 %S/test.cpp -o test.exe -std=c++23 -Wall %xrt_flags -lrt -lstdc++ -lboost_program_options -lboost_filesystem
-// RUN: %run_on_ipu ./test.exe -x aie.xclbin -k MLIR_AIE -i insts.txt -v 1 -M 512 -K 512 -N 512 | FileCheck %s
+// RUN: %run_on_npu ./test.exe -x aie.xclbin -k MLIR_AIE -i insts.txt -v 1 -M 512 -K 512 -N 512 | FileCheck %s
 // CHECK: PASS!
 
diff --git a/programming_examples/basic/passthrough_dmas/CMakeLists.txt b/programming_examples/basic/passthrough_dmas/CMakeLists.txt
index 3986c4a075..c17d3d365b 100644
--- a/programming_examples/basic/passthrough_dmas/CMakeLists.txt
+++ b/programming_examples/basic/passthrough_dmas/CMakeLists.txt
@@ -27,7 +27,7 @@ if (NOT WSL)
 else()
     set(BOOST_ROOT C:/Technical/thirdParty/boost_1_83_0 CACHE STRING "Path to Boost install")
     set(XRT_INC_DIR C:/Technical/XRT/src/runtime_src/core/include CACHE STRING "Path to XRT cloned repo")
-    set(XRT_LIB_DIR C:/Technical/xrtIPUfromDLL CACHE STRING "Path to xrt_coreutil.lib")
+    set(XRT_LIB_DIR C:/Technical/xrtNPUfromDLL CACHE STRING "Path to xrt_coreutil.lib")
 endif()
 
 set(TARGET_NAME test CACHE STRING "Target to be built")
diff --git a/programming_examples/basic/passthrough_dmas/Makefile b/programming_examples/basic/passthrough_dmas/Makefile
index 13a8d42aae..e09c8a91aa 100644
--- a/programming_examples/basic/passthrough_dmas/Makefile
+++ b/programming_examples/basic/passthrough_dmas/Makefile
@@ -26,13 +26,13 @@ inst/insts.txt: aie2.py
 	rm -rf inst
 	mkdir -p inst 
 	python3 $< ${devicename} ${col} ${LENGTH} > inst/aie.mlir
-	pushd inst && aiecc.py --aie-only-generate-ipu --ipu-insts-name=insts.txt aie.mlir && popd
+	pushd inst && aiecc.py --aie-only-generate-npu --npu-insts-name=insts.txt aie.mlir && popd
 	${powershell} ./build/${targetname}.exe -x build/final.xclbin -i inst/insts.txt -k MLIR_AIE -l ${LENGTH}
 
 build/final.xclbin: build/aie.mlir
 	mkdir -p ${@D}
 	cd ${@D} && aiecc.py --aie-generate-cdo --no-compile-host --xclbin-name=${@F} \
-				--aie-generate-ipu --ipu-insts-name=insts.txt $(<:%=../%)
+				--aie-generate-npu --npu-insts-name=insts.txt $(<:%=../%)
 
 ${targetname}.exe: test.cpp
 	rm -rf _build
diff --git a/programming_examples/basic/passthrough_dmas/aie2.py b/programming_examples/basic/passthrough_dmas/aie2.py
index b59d9332ac..f8dc35a6d9 100755
--- a/programming_examples/basic/passthrough_dmas/aie2.py
+++ b/programming_examples/basic/passthrough_dmas/aie2.py
@@ -23,8 +23,8 @@
 if len(sys.argv) == 4:
     N = int(sys.argv[1])
 
-if sys.argv[1] == "ipu":
-    dev = AIEDevice.ipu
+if sys.argv[1] == "npu":
+    dev = AIEDevice.npu
 elif sys.argv[1] == "xcvc1902":
     dev = AIEDevice.xcvc1902
 else:
@@ -62,9 +62,9 @@ def core_body():
 
             @FuncOp.from_py_func(tensor_ty, tensor_ty, tensor_ty)
             def sequence(A, B, C):
-                ipu_dma_memcpy_nd(metadata="out", bd_id=0, mem=C, sizes=[1, 1, 1, N])
-                ipu_dma_memcpy_nd(metadata="in", bd_id=1, mem=A, sizes=[1, 1, 1, N])
-                ipu_sync(column=0, row=0, direction=0, channel=0)
+                npu_dma_memcpy_nd(metadata="out", bd_id=0, mem=C, sizes=[1, 1, 1, N])
+                npu_dma_memcpy_nd(metadata="in", bd_id=1, mem=A, sizes=[1, 1, 1, N])
+                npu_sync(column=0, row=0, direction=0, channel=0)
 
     print(ctx.module)
 
diff --git a/programming_examples/basic/passthrough_dmas/run.lit b/programming_examples/basic/passthrough_dmas/run.lit
index a4f5d568b6..a466533551 100644
--- a/programming_examples/basic/passthrough_dmas/run.lit
+++ b/programming_examples/basic/passthrough_dmas/run.lit
@@ -3,8 +3,8 @@
 //
 // REQUIRES: ryzen_ai
 //
-// RUN: %python %S/aie2.py ipu 0 > ./aie.mlir
-// RUN: %python aiecc.py --aie-generate-cdo --aie-generate-ipu --no-compile-host --xclbin-name=aie.xclbin --ipu-insts-name=insts.txt ./aie.mlir
+// RUN: %python %S/aie2.py npu 0 > ./aie.mlir
+// RUN: %python aiecc.py --aie-generate-cdo --aie-generate-npu --no-compile-host --xclbin-name=aie.xclbin --npu-insts-name=insts.txt ./aie.mlir
 // RUN: clang %S/test.cpp -o test.exe -std=c++11 -Wall %xrt_flags -lrt -lstdc++ -lboost_program_options -lboost_filesystem
-// RUN: %run_on_ipu ./test.exe -x aie.xclbin -k MLIR_AIE -i insts.txt -l 4096 | FileCheck %s
+// RUN: %run_on_npu ./test.exe -x aie.xclbin -k MLIR_AIE -i insts.txt -l 4096 | FileCheck %s
 // CHECK: PASS!
diff --git a/programming_examples/basic/passthrough_kernel/CMakeLists.txt b/programming_examples/basic/passthrough_kernel/CMakeLists.txt
index 47375adc84..fddc513396 100644
--- a/programming_examples/basic/passthrough_kernel/CMakeLists.txt
+++ b/programming_examples/basic/passthrough_kernel/CMakeLists.txt
@@ -22,7 +22,7 @@ if (NOT WSL)
 else()
     set(BOOST_ROOT C:/Technical/thirdParty/boost_1_83_0 CACHE STRING "Path to Boost install")
     set(XRT_INC_DIR C:/Technical/XRT/src/runtime_src/core/include CACHE STRING "Path to XRT cloned repo")
-    set(XRT_LIB_DIR C:/Technical/xrtIPUfromDLL CACHE STRING "Path to xrt_coreutil.lib")
+    set(XRT_LIB_DIR C:/Technical/xrtNPUfromDLL CACHE STRING "Path to xrt_coreutil.lib")
 endif ()
 
 set(PASSTHROUGH_SIZE 4096 CACHE STRING "size")
diff --git a/programming_examples/basic/passthrough_kernel/Makefile b/programming_examples/basic/passthrough_kernel/Makefile
index fbfc7580c4..458b992521 100644
--- a/programming_examples/basic/passthrough_kernel/Makefile
+++ b/programming_examples/basic/passthrough_kernel/Makefile
@@ -28,8 +28,8 @@ build/passThrough.cc.o: passThrough.cc
 	
 build/final_${PASSTHROUGH_SIZE}.xclbin: build/aie2_lineBased_8b_${PASSTHROUGH_SIZE}.mlir build/passThrough.cc.o
 	mkdir -p ${@D}
-	cd ${@D} && aiecc.py --aie-generate-cdo --aie-generate-ipu --no-compile-host \
-		--xclbin-name=${@F} --ipu-insts-name=insts.txt $(<:%=../%)
+	cd ${@D} && aiecc.py --aie-generate-cdo --aie-generate-npu --no-compile-host \
+		--xclbin-name=${@F} --npu-insts-name=insts.txt $(<:%=../%)
 
 ${targetname}.exe: test.cpp
 	rm -rf _build
diff --git a/programming_examples/basic/passthrough_kernel/aie2.py b/programming_examples/basic/passthrough_kernel/aie2.py
index baec4415fa..5b187a7d94 100644
--- a/programming_examples/basic/passthrough_kernel/aie2.py
+++ b/programming_examples/basic/passthrough_kernel/aie2.py
@@ -29,7 +29,7 @@
 
 def passthroughKernel():
 
-    @device(AIEDevice.ipu)
+    @device(AIEDevice.npu)
     def device_body():
         # define types
         memRef_ty = T.memref(lineWidthInBytes, T.ui8())
@@ -87,19 +87,19 @@ def sequence(inTensor, outTensor, notUsed):
                     events=[0x4B, 0x22, 0x21, 0x25, 0x2D, 0x2C, 0x1A, 0x4F],
                 )
 
-            ipu_dma_memcpy_nd(
+            npu_dma_memcpy_nd(
                 metadata="in",
                 bd_id=0,
                 mem=inTensor,
                 sizes=[1, 1, 1, tensorSizeInInt32s],
             )
-            ipu_dma_memcpy_nd(
+            npu_dma_memcpy_nd(
                 metadata="out",
                 bd_id=1,
                 mem=outTensor,
                 sizes=[1, 1, 1, tensorSizeInInt32s],
             )
-            ipu_sync(column=0, row=0, direction=0, channel=0)
+            npu_sync(column=0, row=0, direction=0, channel=0)
 
 
 with mlir_mod_ctx() as ctx:
diff --git a/programming_examples/basic/passthrough_kernel/run.lit b/programming_examples/basic/passthrough_kernel/run.lit
index 30abe48152..7f1c2318b2 100644
--- a/programming_examples/basic/passthrough_kernel/run.lit
+++ b/programming_examples/basic/passthrough_kernel/run.lit
@@ -5,8 +5,8 @@
 //
 // RUN: xchesscc_wrapper aie2 -I %aietools/include -DBIT_WIDTH=8 -c %S/../../../aie_kernels/generic/passThrough.cc -o passThrough.cc.o
 // RUN: %python %S/aie2.py 4096 | aie-opt -cse -canonicalize -o ./aie.mlir
-// RUN: %python aiecc.py --xbridge --aie-generate-cdo --aie-generate-ipu --no-compile-host --xclbin-name=aie.xclbin --ipu-insts-name=insts.txt ./aie.mlir
+// RUN: %python aiecc.py --xbridge --aie-generate-cdo --aie-generate-npu --no-compile-host --xclbin-name=aie.xclbin --npu-insts-name=insts.txt ./aie.mlir
 // RUN: g++ %S/test.cpp -o test.exe -std=c++23 -Wall -DPASSTHROUGH_SIZE=4096 -I%S/../../../runtime_lib/test_lib %S/../../../runtime_lib/test_lib/test_utils.cpp %xrt_flags -lrt -lstdc++ -lboost_program_options -lboost_filesystem
-// RUN: %run_on_ipu ./test.exe -x aie.xclbin -k MLIR_AIE -i insts.txt | FileCheck %s
-// RUN: %run_on_ipu %python %S/test.py -x aie.xclbin -i insts.txt -k MLIR_AIE -s 4096 | FileCheck %s
+// RUN: %run_on_npu ./test.exe -x aie.xclbin -k MLIR_AIE -i insts.txt | FileCheck %s
+// RUN: %run_on_npu %python %S/test.py -x aie.xclbin -i insts.txt -k MLIR_AIE -s 4096 | FileCheck %s
 // CHECK: PASS!
diff --git a/programming_examples/basic/vector_add/Makefile b/programming_examples/basic/vector_add/Makefile
index 9a1a7a2a56..61133a555b 100755
--- a/programming_examples/basic/vector_add/Makefile
+++ b/programming_examples/basic/vector_add/Makefile
@@ -13,15 +13,15 @@ ACDC_AIE = $(dir $(shell which aie-opt))/..
 SHELL := /bin/bash
 
 targetname = vectorAdd
-devicename = ipu
+devicename = npu
 col = 0
 
 all: build/final.xclbin
 
 build/final.xclbin: build/aie.mlir
 	mkdir -p ${@D}
-	cd ${@D} && aiecc.py --aie-generate-cdo --aie-generate-ipu --no-compile-host \
-		--xclbin-name=${@F} --ipu-insts-name=insts.txt ${<F}
+	cd ${@D} && aiecc.py --aie-generate-cdo --aie-generate-npu --no-compile-host \
+		--xclbin-name=${@F} --npu-insts-name=insts.txt ${<F}
 
 ${targetname}.exe: test.cpp
 	rm -rf _build
diff --git a/programming_examples/basic/vector_add/README.md b/programming_examples/basic/vector_add/README.md
index 34e8d222bc..65cdafefca 100644
--- a/programming_examples/basic/vector_add/README.md
+++ b/programming_examples/basic/vector_add/README.md
@@ -10,11 +10,11 @@
 
 # <ins>Vector Add</ins>
 
-Single tile performs a very simple `+` operations from two vectors loaded into memory. The tile then stores the sum of those two vectors back to external memory. This reference design can be run on either a RyzenAI IPU or a VCK5000. 
+A single tile performs a very simple `+` operation on two vectors loaded into memory. The tile then stores the sum of those two vectors back to external memory. This reference design can be run on either a RyzenAI NPU or a VCK5000. 
 
-The kernel executes on AIE tile (`col`, 2). Both input vectors are brought into the tile from Shim tile (`col`, 0). The value of `col` is dependent on whether the application is targetting IPU or VCK5000. The AIE tile performs the summation operations and the Shim tile brings the data back out to external memory.
+The kernel executes on AIE tile (`col`, 2). Both input vectors are brought into the tile from Shim tile (`col`, 0). The value of `col` is dependent on whether the application is targeting NPU or VCK5000. The AIE tile performs the summation operations and the Shim tile brings the data back out to external memory.
 
-To compile and run the design for IPU:
+To compile and run the design for NPU:
 ```
 make
 make run
diff --git a/programming_examples/basic/vector_add/aie2.py b/programming_examples/basic/vector_add/aie2.py
index 6f8ad2d5b6..581729e6ec 100755
--- a/programming_examples/basic/vector_add/aie2.py
+++ b/programming_examples/basic/vector_add/aie2.py
@@ -28,8 +28,8 @@ def my_vector_add():
         if len(sys.argv) != 3:
             raise ValueError("[ERROR] Need 2 command line arguments (Device name, Col)")
 
-        if sys.argv[1] == "ipu":
-            dev = AIEDevice.ipu
+        if sys.argv[1] == "npu":
+            dev = AIEDevice.npu
         elif sys.argv[1] == "xcvc1902":
             dev = AIEDevice.xcvc1902
         else:
@@ -79,10 +79,10 @@ def core_body():
 
             @FuncOp.from_py_func(tensor_ty, tensor_ty, tensor_ty)
             def sequence(A, B, C):
-                ipu_dma_memcpy_nd(metadata="out", bd_id=0, mem=C, sizes=[1, 1, 1, N])
-                ipu_dma_memcpy_nd(metadata="in1", bd_id=1, mem=A, sizes=[1, 1, 1, N])
-                ipu_dma_memcpy_nd(metadata="in2", bd_id=2, mem=B, sizes=[1, 1, 1, N])
-                ipu_sync(column=0, row=0, direction=0, channel=0)
+                npu_dma_memcpy_nd(metadata="out", bd_id=0, mem=C, sizes=[1, 1, 1, N])
+                npu_dma_memcpy_nd(metadata="in1", bd_id=1, mem=A, sizes=[1, 1, 1, N])
+                npu_dma_memcpy_nd(metadata="in2", bd_id=2, mem=B, sizes=[1, 1, 1, N])
+                npu_sync(column=0, row=0, direction=0, channel=0)
 
     print(ctx.module)
 
diff --git a/programming_examples/basic/vector_add/run.lit b/programming_examples/basic/vector_add/run.lit
index a429e99221..1922c01828 100644
--- a/programming_examples/basic/vector_add/run.lit
+++ b/programming_examples/basic/vector_add/run.lit
@@ -3,9 +3,9 @@
 //
 // REQUIRES: ryzen_ai
 //
-// RUN: %python %S/aie2.py ipu 0 > ./aie.mlir
-// RUN: %python aiecc.py --aie-generate-cdo --aie-generate-ipu --no-compile-host --xclbin-name=aie.xclbin --ipu-insts-name=insts.txt ./aie.mlir
+// RUN: %python %S/aie2.py npu 0 > ./aie.mlir
+// RUN: %python aiecc.py --aie-generate-cdo --aie-generate-npu --no-compile-host --xclbin-name=aie.xclbin --npu-insts-name=insts.txt ./aie.mlir
 // RUN: clang %S/test.cpp -o test.exe -std=c++11 -Wall %xrt_flags -lrt -lstdc++ -lboost_program_options -lboost_filesystem
-// RUN: %run_on_ipu ./test.exe -x aie.xclbin -k MLIR_AIE -i insts.txt | FileCheck %s
+// RUN: %run_on_npu ./test.exe -x aie.xclbin -k MLIR_AIE -i insts.txt | FileCheck %s
 // CHECK: PASS!
 
diff --git a/programming_examples/basic/vector_exp/CMakeLists.txt b/programming_examples/basic/vector_exp/CMakeLists.txt
index 20452d080e..ee2050a94e 100644
--- a/programming_examples/basic/vector_exp/CMakeLists.txt
+++ b/programming_examples/basic/vector_exp/CMakeLists.txt
@@ -27,7 +27,7 @@ if (NOT WSL)
 else()
     set(BOOST_ROOT C:/Technical/thirdParty/boost_1_83_0 CACHE STRING "Path to Boost install")
     set(XRT_INC_DIR C:/Technical/XRT/src/runtime_src/core/include CACHE STRING "Path to XRT cloned repo")
-    set(XRT_LIB_DIR C:/Technical/xrtIPUfromDLL CACHE STRING "Path to xrt_coreutil.lib")
+    set(XRT_LIB_DIR C:/Technical/xrtNPUfromDLL CACHE STRING "Path to xrt_coreutil.lib")
 endif()
 
 set(TARGET_NAME test CACHE STRING "Target to be built")
diff --git a/programming_examples/basic/vector_exp/Makefile b/programming_examples/basic/vector_exp/Makefile
index 68205484e0..5b471771ba 100644
--- a/programming_examples/basic/vector_exp/Makefile
+++ b/programming_examples/basic/vector_exp/Makefile
@@ -32,7 +32,7 @@ build/aie.mlir: aie2.py
 build/final.xclbin: build/aie.mlir build/kernels.a
 	mkdir -p ${@D}
 	cd ${@D} && aiecc.py --aie-generate-cdo --no-compile-host --xclbin-name=${@F} \
-				--aie-generate-ipu --ipu-insts-name=insts.txt $(<:%=../%)
+				--aie-generate-npu --npu-insts-name=insts.txt $(<:%=../%)
 
 ${targetname}.exe: test.cpp
 	rm -rf _build
diff --git a/programming_examples/basic/vector_exp/aie2.py b/programming_examples/basic/vector_exp/aie2.py
index dd6e5e9773..dbf2a3ce2b 100644
--- a/programming_examples/basic/vector_exp/aie2.py
+++ b/programming_examples/basic/vector_exp/aie2.py
@@ -32,7 +32,7 @@ def my_eltwise_exp():
     buffer_depth = 2
 
     # Device declaration - aie2 device NPU (aka Ryzen AI)
-    @device(AIEDevice.ipu)
+    @device(AIEDevice.npu)
     def device_body():
 
         memRef_ty = T.memref(n, T.bf16())
@@ -106,13 +106,13 @@ def core_body():
 
         @FuncOp.from_py_func(tensor_ty, tensor_ty)
         def sequence(A, C):
-            ipu_dma_memcpy_nd(
+            npu_dma_memcpy_nd(
                 metadata="outC", bd_id=0, mem=C, sizes=[1, 1, 1, C_sz_in_i32s]
             )
-            ipu_dma_memcpy_nd(
+            npu_dma_memcpy_nd(
                 metadata="inA", bd_id=1, mem=A, sizes=[1, 1, 1, A_sz_in_i32s]
             )
-            ipu_sync(column=0, row=0, direction=0, channel=0)
+            npu_sync(column=0, row=0, direction=0, channel=0)
 
 
 with mlir_mod_ctx() as ctx:
diff --git a/programming_examples/basic/vector_exp/run.lit b/programming_examples/basic/vector_exp/run.lit
index f2db79ab6a..247ca37a33 100644
--- a/programming_examples/basic/vector_exp/run.lit
+++ b/programming_examples/basic/vector_exp/run.lit
@@ -6,8 +6,8 @@
 // RUN: xchesscc_wrapper aie2 -I %aietools/include -I %S/../../../aie_runtime_lib/AIE2 -c %S/../../../aie_kernels/aie2/bf16_exp.cc -o exp.o
 // RUN: xchesscc_wrapper aie2 -I %aietools/include -I. -c %S/../../../aie_runtime_lib/AIE2/lut_based_ops.cpp -o lut_based_ops.o
 // RUN: ar rvs kernels.a exp.o lut_based_ops.o
-// RUN: %python %S/aie2.py ipu 0 | aie-opt -cse -canonicalize -o ./aie.mlir
-// RUN: %python aiecc.py --xbridge --aie-generate-cdo --aie-generate-ipu --no-compile-host --xclbin-name=aie.xclbin --ipu-insts-name=insts.txt ./aie.mlir
+// RUN: %python %S/aie2.py npu 0 | aie-opt -cse -canonicalize -o ./aie.mlir
+// RUN: %python aiecc.py --xbridge --aie-generate-cdo --aie-generate-npu --no-compile-host --xclbin-name=aie.xclbin --npu-insts-name=insts.txt ./aie.mlir
 // RUN: g++-13 %S/test.cpp -o test.exe -std=c++23 -Wall -I%S/../../../runtime_lib/test_lib %S/../../../runtime_lib/test_lib/test_utils.cpp %xrt_flags -lrt -lstdc++ -lboost_program_options -lboost_filesystem
-// RUN: %run_on_ipu ./test.exe -x aie.xclbin -k MLIR_AIE -i insts.txt | FileCheck %s
+// RUN: %run_on_npu ./test.exe -x aie.xclbin -k MLIR_AIE -i insts.txt | FileCheck %s
 // CHECK: PASS!
diff --git a/programming_examples/basic/vector_mult/CMakeLists.txt b/programming_examples/basic/vector_mult/CMakeLists.txt
index 20452d080e..ee2050a94e 100644
--- a/programming_examples/basic/vector_mult/CMakeLists.txt
+++ b/programming_examples/basic/vector_mult/CMakeLists.txt
@@ -27,7 +27,7 @@ if (NOT WSL)
 else()
     set(BOOST_ROOT C:/Technical/thirdParty/boost_1_83_0 CACHE STRING "Path to Boost install")
     set(XRT_INC_DIR C:/Technical/XRT/src/runtime_src/core/include CACHE STRING "Path to XRT cloned repo")
-    set(XRT_LIB_DIR C:/Technical/xrtIPUfromDLL CACHE STRING "Path to xrt_coreutil.lib")
+    set(XRT_LIB_DIR C:/Technical/xrtNPUfromDLL CACHE STRING "Path to xrt_coreutil.lib")
 endif()
 
 set(TARGET_NAME test CACHE STRING "Target to be built")
diff --git a/programming_examples/basic/vector_mult/Makefile b/programming_examples/basic/vector_mult/Makefile
index 330692f4fb..bc07e3d05b 100755
--- a/programming_examples/basic/vector_mult/Makefile
+++ b/programming_examples/basic/vector_mult/Makefile
@@ -13,15 +13,15 @@ ACDC_AIE = $(dir $(shell which aie-opt))/..
 SHELL := /bin/bash
 
 targetname = vectorMult
-devicename = ipu
+devicename = npu
 col = 0
 
 all: build/final.xclbin
 
 build/final.xclbin: build/aie.mlir
 	mkdir -p ${@D}
-	cd ${@D} && aiecc.py --aie-generate-cdo --aie-generate-ipu --no-compile-host \
-		--xclbin-name=${@F} --ipu-insts-name=insts.txt ${<F}
+	cd ${@D} && aiecc.py --aie-generate-cdo --aie-generate-npu --no-compile-host \
+		--xclbin-name=${@F} --npu-insts-name=insts.txt ${<F}
 
 ${targetname}.exe: test.cpp
 	rm -rf _build
diff --git a/programming_examples/basic/vector_mult/README.md b/programming_examples/basic/vector_mult/README.md
index 825b33d3cd..3abe2b9999 100644
--- a/programming_examples/basic/vector_mult/README.md
+++ b/programming_examples/basic/vector_mult/README.md
@@ -10,11 +10,11 @@
 
 # <ins>Vector Multiplication</ins>
 
-Single tile performs a very simple `*` operations from two vectors loaded into memory. The tile then stores the element wise multiplication of those two vectors back to external memory. This reference design can be run on either a RyzenAI IPU or a VCK5000.
+A single tile performs a very simple `*` operation on two vectors loaded into memory. The tile then stores the element-wise multiplication of those two vectors back to external memory. This reference design can be run on either a RyzenAI NPU or a VCK5000.
 
-The kernel executes on AIE tile (`col`, 2). Both input vectors are brought into the tile from Shim tile (`col`, 0). The value of `col` is dependent on whether the application is targetting IPU or VCK5000. The AIE tile performs the multiplication operations and the Shim tile brings the data back out to external memory.
+The kernel executes on AIE tile (`col`, 2). Both input vectors are brought into the tile from Shim tile (`col`, 0). The value of `col` is dependent on whether the application is targeting NPU or VCK5000. The AIE tile performs the multiplication operations and the Shim tile brings the data back out to external memory.
 
-To compile and run the design for IPU:
+To compile and run the design for NPU:
 ```
 make
 make run
diff --git a/programming_examples/basic/vector_mult/aie2.py b/programming_examples/basic/vector_mult/aie2.py
index 5a36f85a33..209f5243bb 100755
--- a/programming_examples/basic/vector_mult/aie2.py
+++ b/programming_examples/basic/vector_mult/aie2.py
@@ -28,8 +28,8 @@ def my_vector_add():
         if len(sys.argv) != 3:
             raise ValueError("[ERROR] Need 2 command line arguments (Device name, Col)")
 
-        if sys.argv[1] == "ipu":
-            dev = AIEDevice.ipu
+        if sys.argv[1] == "npu":
+            dev = AIEDevice.npu
         elif sys.argv[1] == "xcvc1902":
             dev = AIEDevice.xcvc1902
         else:
@@ -79,10 +79,10 @@ def core_body():
 
             @FuncOp.from_py_func(tensor_ty, tensor_ty, tensor_ty)
             def sequence(A, B, C):
-                ipu_dma_memcpy_nd(metadata="out", bd_id=0, mem=C, sizes=[1, 1, 1, N])
-                ipu_dma_memcpy_nd(metadata="in1", bd_id=1, mem=A, sizes=[1, 1, 1, N])
-                ipu_dma_memcpy_nd(metadata="in2", bd_id=2, mem=B, sizes=[1, 1, 1, N])
-                ipu_sync(column=0, row=0, direction=0, channel=0)
+                npu_dma_memcpy_nd(metadata="out", bd_id=0, mem=C, sizes=[1, 1, 1, N])
+                npu_dma_memcpy_nd(metadata="in1", bd_id=1, mem=A, sizes=[1, 1, 1, N])
+                npu_dma_memcpy_nd(metadata="in2", bd_id=2, mem=B, sizes=[1, 1, 1, N])
+                npu_sync(column=0, row=0, direction=0, channel=0)
 
     print(ctx.module)
 
diff --git a/programming_examples/basic/vector_mult/run.lit b/programming_examples/basic/vector_mult/run.lit
index a429e99221..1922c01828 100644
--- a/programming_examples/basic/vector_mult/run.lit
+++ b/programming_examples/basic/vector_mult/run.lit
@@ -3,9 +3,9 @@
 //
 // REQUIRES: ryzen_ai
 //
-// RUN: %python %S/aie2.py ipu 0 > ./aie.mlir
-// RUN: %python aiecc.py --aie-generate-cdo --aie-generate-ipu --no-compile-host --xclbin-name=aie.xclbin --ipu-insts-name=insts.txt ./aie.mlir
+// RUN: %python %S/aie2.py npu 0 > ./aie.mlir
+// RUN: %python aiecc.py --aie-generate-cdo --aie-generate-npu --no-compile-host --xclbin-name=aie.xclbin --npu-insts-name=insts.txt ./aie.mlir
 // RUN: clang %S/test.cpp -o test.exe -std=c++11 -Wall %xrt_flags -lrt -lstdc++ -lboost_program_options -lboost_filesystem
-// RUN: %run_on_ipu ./test.exe -x aie.xclbin -k MLIR_AIE -i insts.txt | FileCheck %s
+// RUN: %run_on_npu ./test.exe -x aie.xclbin -k MLIR_AIE -i insts.txt | FileCheck %s
 // CHECK: PASS!
 
diff --git a/programming_examples/basic/vector_reduce_add/CMakeLists.txt b/programming_examples/basic/vector_reduce_add/CMakeLists.txt
index 9ae325a430..024b4cfd54 100644
--- a/programming_examples/basic/vector_reduce_add/CMakeLists.txt
+++ b/programming_examples/basic/vector_reduce_add/CMakeLists.txt
@@ -22,7 +22,7 @@ if (NOT WSL)
 else()
     set(BOOST_ROOT C:/Technical/thirdParty/boost_1_83_0 CACHE STRING "Path to Boost install")
     set(XRT_INC_DIR C:/Technical/XRT/src/runtime_src/core/include CACHE STRING "Path to XRT cloned repo")
-    set(XRT_LIB_DIR C:/Technical/xrtIPUfromDLL CACHE STRING "Path to xrt_coreutil.lib")
+    set(XRT_LIB_DIR C:/Technical/xrtNPUfromDLL CACHE STRING "Path to xrt_coreutil.lib")
 endif()
 
 set(TARGET_NAME test CACHE STRING "Target to be built")
diff --git a/programming_examples/basic/vector_reduce_add/Makefile b/programming_examples/basic/vector_reduce_add/Makefile
index 37ca25abec..b0f8eebe0c 100644
--- a/programming_examples/basic/vector_reduce_add/Makefile
+++ b/programming_examples/basic/vector_reduce_add/Makefile
@@ -11,7 +11,7 @@ include ../../makefile-common
 ACDC_AIE = $(dir $(shell which aie-opt))/..
 
 targetname = vector_max
-devicename = ipu
+devicename = npu
 col = 0
 CHESS_FLAGS=${CHESSCCWRAP2_FLAGS}
 KERNEL_LIB=${ACDC_AIE}/../../aie_kernels/aie2/
@@ -29,7 +29,7 @@ build/aie.mlir: aie2.py
 build/final.xclbin: build/aie.mlir build/i32_add_reduce.o
 	mkdir -p ${@D}
 	cd ${@D} && aiecc.py --aie-generate-cdo --no-compile-host --xclbin-name=${@F} \
-				--aie-generate-ipu --ipu-insts-name=insts.txt $(<:%=../%)
+				--aie-generate-npu --npu-insts-name=insts.txt $(<:%=../%)
 
 ${targetname}.exe: test.cpp
 	rm -rf _build
diff --git a/programming_examples/basic/vector_reduce_add/aie2.py b/programming_examples/basic/vector_reduce_add/aie2.py
index fe035bfc96..cf5d5691e9 100644
--- a/programming_examples/basic/vector_reduce_add/aie2.py
+++ b/programming_examples/basic/vector_reduce_add/aie2.py
@@ -24,8 +24,8 @@ def my_reduce_add():
     if len(sys.argv) != 3:
         raise ValueError("[ERROR] Need 2 command line arguments (Device name, Col)")
 
-    if sys.argv[1] == "ipu":
-        dev = AIEDevice.ipu
+    if sys.argv[1] == "npu":
+        dev = AIEDevice.npu
     elif sys.argv[1] == "xcvc1902":
         dev = AIEDevice.xcvc1902
     else:
@@ -67,9 +67,9 @@ def core_body():
 
         @FuncOp.from_py_func(tensor_ty, tensor_ty)
         def sequence(A, C):
-            ipu_dma_memcpy_nd(metadata="out", bd_id=0, mem=C, sizes=[1, 1, 1, 1])
-            ipu_dma_memcpy_nd(metadata="in", bd_id=1, mem=A, sizes=[1, 1, 1, N])
-            ipu_sync(column=0, row=0, direction=0, channel=0)
+            npu_dma_memcpy_nd(metadata="out", bd_id=0, mem=C, sizes=[1, 1, 1, 1])
+            npu_dma_memcpy_nd(metadata="in", bd_id=1, mem=A, sizes=[1, 1, 1, N])
+            npu_sync(column=0, row=0, direction=0, channel=0)
 
 
 with mlir_mod_ctx() as ctx:
diff --git a/programming_examples/basic/vector_reduce_add/run.lit b/programming_examples/basic/vector_reduce_add/run.lit
index 1ebe2c8741..f35b24884f 100644
--- a/programming_examples/basic/vector_reduce_add/run.lit
+++ b/programming_examples/basic/vector_reduce_add/run.lit
@@ -4,8 +4,8 @@
 // REQUIRES: ryzen_ai, chess
 //
 // RUN: xchesscc_wrapper aie2 -I %aietools/include -c %S/../../../aie_kernels/aie2/reduce_add.cc -o reduce_add.cc.o
-// RUN: %python %S/aie2.py ipu 0 | aie-opt -cse -canonicalize -o ./aie.mlir
-// RUN: %python aiecc.py --aie-generate-cdo --aie-generate-ipu --no-compile-host --xclbin-name=aie.xclbin --ipu-insts-name=insts.txt ./aie.mlir
+// RUN: %python %S/aie2.py npu 0 | aie-opt -cse -canonicalize -o ./aie.mlir
+// RUN: %python aiecc.py --aie-generate-cdo --aie-generate-npu --no-compile-host --xclbin-name=aie.xclbin --npu-insts-name=insts.txt ./aie.mlir
 // RUN: clang %S/test.cpp -o test.exe -std=c++11 -Wall -I%S/../../../runtime_lib/test_lib %S/../../../runtime_lib/test_lib/test_utils.cpp %xrt_flags -lrt -lstdc++ -lboost_program_options -lboost_filesystem
-// RUN: %run_on_ipu ./test.exe -x aie.xclbin -k MLIR_AIE -i insts.txt | FileCheck %s
+// RUN: %run_on_npu ./test.exe -x aie.xclbin -k MLIR_AIE -i insts.txt | FileCheck %s
 // CHECK: PASS!
diff --git a/programming_examples/basic/vector_reduce_max/CMakeLists.txt b/programming_examples/basic/vector_reduce_max/CMakeLists.txt
index 9ae325a430..024b4cfd54 100644
--- a/programming_examples/basic/vector_reduce_max/CMakeLists.txt
+++ b/programming_examples/basic/vector_reduce_max/CMakeLists.txt
@@ -22,7 +22,7 @@ if (NOT WSL)
 else()
     set(BOOST_ROOT C:/Technical/thirdParty/boost_1_83_0 CACHE STRING "Path to Boost install")
     set(XRT_INC_DIR C:/Technical/XRT/src/runtime_src/core/include CACHE STRING "Path to XRT cloned repo")
-    set(XRT_LIB_DIR C:/Technical/xrtIPUfromDLL CACHE STRING "Path to xrt_coreutil.lib")
+    set(XRT_LIB_DIR C:/Technical/xrtNPUfromDLL CACHE STRING "Path to xrt_coreutil.lib")
 endif()
 
 set(TARGET_NAME test CACHE STRING "Target to be built")
diff --git a/programming_examples/basic/vector_reduce_max/Makefile b/programming_examples/basic/vector_reduce_max/Makefile
index 55a013704d..5e47d478b2 100755
--- a/programming_examples/basic/vector_reduce_max/Makefile
+++ b/programming_examples/basic/vector_reduce_max/Makefile
@@ -11,7 +11,7 @@ include ../../makefile-common
 ACDC_AIE = $(dir $(shell which aie-opt))/..
 
 targetname = reduce_max
-devicename = ipu
+devicename = npu
 col = 0
 CHESS_FLAGS=${CHESSCCWRAP2_FLAGS}
 KERNEL_LIB=../../../aie_kernels/aie2
@@ -29,7 +29,7 @@ build/aie.mlir: aie2.py
 build/final.xclbin: build/aie.mlir build/reduce_max.cc.o
 	mkdir -p ${@D}
 	cd ${@D} && aiecc.py --aie-generate-cdo --no-compile-host --xclbin-name=${@F} \
-				--aie-generate-ipu --ipu-insts-name=insts.txt $(<:%=../%)
+				--aie-generate-npu --npu-insts-name=insts.txt $(<:%=../%)
 
 ${targetname}.exe: test.cpp
 	rm -rf _build
diff --git a/programming_examples/basic/vector_reduce_max/aie2.py b/programming_examples/basic/vector_reduce_max/aie2.py
index c081cf7659..bf9013e1b9 100755
--- a/programming_examples/basic/vector_reduce_max/aie2.py
+++ b/programming_examples/basic/vector_reduce_max/aie2.py
@@ -24,8 +24,8 @@ def my_reduce_max():
     if len(sys.argv) != 3:
         raise ValueError("[ERROR] Need 2 command line arguments (Device name, Col)")
 
-    if sys.argv[1] == "ipu":
-        dev = AIEDevice.ipu
+    if sys.argv[1] == "npu":
+        dev = AIEDevice.npu
     elif sys.argv[1] == "xcvc1902":
         dev = AIEDevice.xcvc1902
     else:
@@ -67,9 +67,9 @@ def core_body():
 
         @FuncOp.from_py_func(tensor_ty, tensor_ty)
         def sequence(A, C):
-            ipu_dma_memcpy_nd(metadata="out", bd_id=0, mem=C, sizes=[1, 1, 1, 1])
-            ipu_dma_memcpy_nd(metadata="in", bd_id=1, mem=A, sizes=[1, 1, 1, N])
-            ipu_sync(column=0, row=0, direction=0, channel=0)
+            npu_dma_memcpy_nd(metadata="out", bd_id=0, mem=C, sizes=[1, 1, 1, 1])
+            npu_dma_memcpy_nd(metadata="in", bd_id=1, mem=A, sizes=[1, 1, 1, N])
+            npu_sync(column=0, row=0, direction=0, channel=0)
 
 
 with mlir_mod_ctx() as ctx:
diff --git a/programming_examples/basic/vector_reduce_max/run.lit b/programming_examples/basic/vector_reduce_max/run.lit
index 6c3233183c..584d7c1628 100644
--- a/programming_examples/basic/vector_reduce_max/run.lit
+++ b/programming_examples/basic/vector_reduce_max/run.lit
@@ -4,8 +4,8 @@
 // REQUIRES: ryzen_ai, chess
 //
 // RUN: xchesscc_wrapper aie2 -I %aietools/include -c %S/../../../aie_kernels/aie2/reduce_max.cc -o reduce_max.cc.o
-// RUN: %python %S/aie2.py ipu 0 | aie-opt -cse -canonicalize -o ./aie.mlir
-// RUN: %python aiecc.py --aie-generate-cdo --aie-generate-ipu --no-compile-host --xclbin-name=aie.xclbin --ipu-insts-name=insts.txt ./aie.mlir
+// RUN: %python %S/aie2.py npu 0 | aie-opt -cse -canonicalize -o ./aie.mlir
+// RUN: %python aiecc.py --aie-generate-cdo --aie-generate-npu --no-compile-host --xclbin-name=aie.xclbin --npu-insts-name=insts.txt ./aie.mlir
 // RUN: g++ %S/test.cpp -o test.exe -std=c++23 -Wall -I%S/../../../runtime_lib/test_lib %S/../../../runtime_lib/test_lib/test_utils.cpp %xrt_flags -lrt -lstdc++ -lboost_program_options -lboost_filesystem
-// RUN: %run_on_ipu ./test.exe -x aie.xclbin -k MLIR_AIE -i insts.txt | FileCheck %s
+// RUN: %run_on_npu ./test.exe -x aie.xclbin -k MLIR_AIE -i insts.txt | FileCheck %s
 // CHECK: PASS!
diff --git a/programming_examples/basic/vector_reduce_min/CMakeLists.txt b/programming_examples/basic/vector_reduce_min/CMakeLists.txt
index 76d48dfe36..820bc8059d 100644
--- a/programming_examples/basic/vector_reduce_min/CMakeLists.txt
+++ b/programming_examples/basic/vector_reduce_min/CMakeLists.txt
@@ -22,7 +22,7 @@ if (NOT WSL)
 else()
     set(BOOST_ROOT C:/Technical/thirdParty/boost_1_83_0 CACHE STRING "Path to Boost install")
     set(XRT_INC_DIR C:/Technical/XRT/src/runtime_src/core/include CACHE STRING "Path to XRT cloned repo")
-    set(XRT_LIB_DIR C:/Technical/xrtIPUfromDLL CACHE STRING "Path to xrt_coreutil.lib")
+    set(XRT_LIB_DIR C:/Technical/xrtNPUfromDLL CACHE STRING "Path to xrt_coreutil.lib")
 endif()
 
 set(TARGET_NAME test CACHE STRING "Target to be built")
diff --git a/programming_examples/basic/vector_reduce_min/Makefile b/programming_examples/basic/vector_reduce_min/Makefile
index 177213e22a..b4321855e1 100755
--- a/programming_examples/basic/vector_reduce_min/Makefile
+++ b/programming_examples/basic/vector_reduce_min/Makefile
@@ -11,7 +11,7 @@ include ../../makefile-common
 ACDC_AIE = $(dir $(shell which aie-opt))/..
 
 targetname = reduce_min
-devicename = ipu
+devicename = npu
 col = 0
 CHESS_FLAGS=${CHESSCCWRAP2_FLAGS}
 KERNEL_LIB=../../../aie_kernels/aie2
@@ -29,7 +29,7 @@ build/aie.mlir: aie2.py
 build/final.xclbin: build/aie.mlir build/reduce_min.cc.o
 	mkdir -p ${@D}
 	cd ${@D} && aiecc.py --aie-generate-cdo --no-compile-host --xclbin-name=${@F} \
-				--aie-generate-ipu --ipu-insts-name=insts.txt $(<:%=../%)
+				--aie-generate-npu --npu-insts-name=insts.txt $(<:%=../%)
 
 ${targetname}.exe: test.cpp
 	rm -rf _build
diff --git a/programming_examples/basic/vector_reduce_min/aie2.py b/programming_examples/basic/vector_reduce_min/aie2.py
index a8ef279a13..0b391f17a0 100755
--- a/programming_examples/basic/vector_reduce_min/aie2.py
+++ b/programming_examples/basic/vector_reduce_min/aie2.py
@@ -24,8 +24,8 @@ def my_reduce_min():
     if len(sys.argv) != 3:
         raise ValueError("[ERROR] Need 2 command line arguments (Device name, Col)")
 
-    if sys.argv[1] == "ipu":
-        dev = AIEDevice.ipu
+    if sys.argv[1] == "npu":
+        dev = AIEDevice.npu
     elif sys.argv[1] == "xcvc1902":
         dev = AIEDevice.xcvc1902
     else:
@@ -67,9 +67,9 @@ def core_body():
 
         @FuncOp.from_py_func(tensor_ty, tensor_ty)
         def sequence(A, C):
-            ipu_dma_memcpy_nd(metadata="out", bd_id=0, mem=C, sizes=[1, 1, 1, 1])
-            ipu_dma_memcpy_nd(metadata="in", bd_id=1, mem=A, sizes=[1, 1, 1, N])
-            ipu_sync(column=0, row=0, direction=0, channel=0)
+            npu_dma_memcpy_nd(metadata="out", bd_id=0, mem=C, sizes=[1, 1, 1, 1])
+            npu_dma_memcpy_nd(metadata="in", bd_id=1, mem=A, sizes=[1, 1, 1, N])
+            npu_sync(column=0, row=0, direction=0, channel=0)
 
 
 with mlir_mod_ctx() as ctx:
diff --git a/programming_examples/basic/vector_reduce_min/run.lit b/programming_examples/basic/vector_reduce_min/run.lit
index 95ecbd533a..710a9a02cd 100644
--- a/programming_examples/basic/vector_reduce_min/run.lit
+++ b/programming_examples/basic/vector_reduce_min/run.lit
@@ -4,8 +4,8 @@
 // REQUIRES: ryzen_ai, chess
 //
 // RUN: xchesscc_wrapper aie2 -I %aietools/include -c %S/../../../aie_kernels/aie2/reduce_min.cc -o reduce_min.cc.o
-// RUN: %python %S/aie2.py ipu 0 | aie-opt -cse -canonicalize -o ./aie.mlir
-// RUN: %python aiecc.py --aie-generate-cdo --aie-generate-ipu --no-compile-host --xclbin-name=aie.xclbin --ipu-insts-name=insts.txt ./aie.mlir
+// RUN: %python %S/aie2.py npu 0 | aie-opt -cse -canonicalize -o ./aie.mlir
+// RUN: %python aiecc.py --aie-generate-cdo --aie-generate-npu --no-compile-host --xclbin-name=aie.xclbin --npu-insts-name=insts.txt ./aie.mlir
 // RUN: g++ %S/test.cpp -o test.exe -std=c++23 -Wall -I%S/../../../runtime_lib/test_lib %S/../../../runtime_lib/test_lib/test_utils.cpp %xrt_flags -lrt -lstdc++ -lboost_program_options -lboost_filesystem
-// RUN: %run_on_ipu ./test.exe -x aie.xclbin -k MLIR_AIE -i insts.txt | FileCheck %s
+// RUN: %run_on_npu ./test.exe -x aie.xclbin -k MLIR_AIE -i insts.txt | FileCheck %s
 // CHECK: PASS!
diff --git a/programming_examples/basic/vector_scalar_add/CMakeLists.txt b/programming_examples/basic/vector_scalar_add/CMakeLists.txt
index c4ca0825d4..20f5d8a4a3 100644
--- a/programming_examples/basic/vector_scalar_add/CMakeLists.txt
+++ b/programming_examples/basic/vector_scalar_add/CMakeLists.txt
@@ -27,7 +27,7 @@ if (NOT WSL)
 else()
     set(BOOST_ROOT C:/Technical/thirdParty/boost_1_83_0 CACHE STRING "Path to Boost install")
     set(XRT_INC_DIR C:/Technical/XRT/src/runtime_src/core/include CACHE STRING "Path to XRT cloned repo")
-    set(XRT_LIB_DIR C:/Technical/xrtIPUfromDLL CACHE STRING "Path to xrt_coreutil.lib")
+    set(XRT_LIB_DIR C:/Technical/xrtNPUfromDLL CACHE STRING "Path to xrt_coreutil.lib")
 endif()
 
 set(TARGET_NAME test CACHE STRING "Target to be built")
diff --git a/programming_examples/basic/vector_scalar_add/Makefile b/programming_examples/basic/vector_scalar_add/Makefile
index 4ad8553675..463b63532b 100644
--- a/programming_examples/basic/vector_scalar_add/Makefile
+++ b/programming_examples/basic/vector_scalar_add/Makefile
@@ -18,8 +18,8 @@ build/aie.mlir: aie2.py
 
 build/final.xclbin: build/aie.mlir
 	mkdir -p ${@D}
-	cd ${@D} && aiecc.py --aie-generate-cdo --aie-generate-ipu --no-compile-host \
-		--xclbin-name=${@F} --ipu-insts-name=insts.txt ${<F}
+	cd ${@D} && aiecc.py --aie-generate-cdo --aie-generate-npu --no-compile-host \
+		--xclbin-name=${@F} --npu-insts-name=insts.txt ${<F}
 
 ${targetname}.exe: test.cpp
 	rm -rf _build
diff --git a/programming_examples/basic/vector_scalar_add/aie2.py b/programming_examples/basic/vector_scalar_add/aie2.py
index 7c99acd401..ef36a8a891 100644
--- a/programming_examples/basic/vector_scalar_add/aie2.py
+++ b/programming_examples/basic/vector_scalar_add/aie2.py
@@ -15,7 +15,7 @@
 def my_vector_bias_add():
     with mlir_mod_ctx() as ctx:
 
-        @device(AIEDevice.ipu)
+        @device(AIEDevice.npu)
         def device_body():
             memRef_16_ty = T.memref(16, T.i32())
             memRef_8_ty = T.memref(8, T.i32())
@@ -61,13 +61,13 @@ def core_body():
 
             @FuncOp.from_py_func(memRef_64_ty, memRef_32_ty, memRef_64_ty)
             def sequence(inTensor, notUsed, outTensor):
-                ipu_dma_memcpy_nd(
+                npu_dma_memcpy_nd(
                     metadata="out0", bd_id=0, mem=outTensor, sizes=[1, 1, 1, 64]
                 )
-                ipu_dma_memcpy_nd(
+                npu_dma_memcpy_nd(
                     metadata="in0", bd_id=1, mem=inTensor, sizes=[1, 1, 1, 64]
                 )
-                ipu_sync(column=0, row=0, direction=0, channel=0)
+                npu_sync(column=0, row=0, direction=0, channel=0)
 
     print(ctx.module)
 
diff --git a/programming_examples/basic/vector_scalar_add/run.lit b/programming_examples/basic/vector_scalar_add/run.lit
index 49cd75e360..82fc93e501 100644
--- a/programming_examples/basic/vector_scalar_add/run.lit
+++ b/programming_examples/basic/vector_scalar_add/run.lit
@@ -4,8 +4,8 @@
 // REQUIRES: ryzen_ai
 //
 // RUN: %python %S/aie2.py > ./aie.mlir
-// RUN: %python aiecc.py --aie-generate-cdo --aie-generate-ipu --no-compile-host --xclbin-name=aie.xclbin --ipu-insts-name=insts.txt ./aie.mlir
+// RUN: %python aiecc.py --aie-generate-cdo --aie-generate-npu --no-compile-host --xclbin-name=aie.xclbin --npu-insts-name=insts.txt ./aie.mlir
 // RUN: clang %S/test.cpp -o test.exe -std=c++11 -Wall %xrt_flags -lrt -lstdc++ -lboost_program_options -lboost_filesystem
-// RUN: %run_on_ipu ./test.exe -x aie.xclbin -k MLIR_AIE -i insts.txt | FileCheck %s
+// RUN: %run_on_npu ./test.exe -x aie.xclbin -k MLIR_AIE -i insts.txt | FileCheck %s
 // CHECK: PASS!
 
diff --git a/programming_examples/basic/vector_scalar_mul/CMakeLists.txt b/programming_examples/basic/vector_scalar_mul/CMakeLists.txt
index 4d1000b813..e7b0f3d539 100644
--- a/programming_examples/basic/vector_scalar_mul/CMakeLists.txt
+++ b/programming_examples/basic/vector_scalar_mul/CMakeLists.txt
@@ -27,7 +27,7 @@ if (NOT WSL)
 else()
     set(BOOST_ROOT C:/Technical/thirdParty/boost_1_83_0 CACHE STRING "Path to Boost install")
     set(XRT_INC_DIR C:/Technical/XRT/src/runtime_src/core/include CACHE STRING "Path to XRT cloned repo")
-    set(XRT_LIB_DIR C:/Technical/xrtIPUfromDLL CACHE STRING "Path to xrt_coreutil.lib")
+    set(XRT_LIB_DIR C:/Technical/xrtNPUfromDLL CACHE STRING "Path to xrt_coreutil.lib")
 endif()
 
 set(VECTORSCALARMUL_SIZE 4096 CACHE STRING "vector size")
diff --git a/programming_examples/basic/vector_scalar_mul/Makefile b/programming_examples/basic/vector_scalar_mul/Makefile
index 0d686b3068..8af81834f7 100755
--- a/programming_examples/basic/vector_scalar_mul/Makefile
+++ b/programming_examples/basic/vector_scalar_mul/Makefile
@@ -36,12 +36,12 @@ build/aie_trace_${data_size}.mlir: aie2.py
 build/final_${data_size}.xclbin: build/aie_${data_size}.mlir build/scale.o
 	mkdir -p ${@D}
 	cd ${@D} && aiecc.py --aie-generate-cdo --no-compile-host --xclbin-name=${@F} \
-				--aie-generate-ipu --ipu-insts-name=insts_${data_size}.txt $(<:%=../%)
+				--aie-generate-npu --npu-insts-name=insts_${data_size}.txt $(<:%=../%)
 
 build/final_trace_${data_size}.xclbin: build/aie_trace.mlir build/scale.o
 	mkdir -p ${@D}
 	cd ${@D} && aiecc.py --aie-generate-cdo --no-compile-host --xclbin-name=${@F} \
-				--aie-generate-ipu --ipu-insts-name=insts_${data_size}.txt $(<:%=../%)
+				--aie-generate-npu --npu-insts-name=insts_${data_size}.txt $(<:%=../%)
 
 ${targetname}_${data_size}.exe: test.cpp
 	rm -rf _build
diff --git a/programming_examples/basic/vector_scalar_mul/aie2.py b/programming_examples/basic/vector_scalar_mul/aie2.py
index d6ca3d0813..1f49b8d5a2 100755
--- a/programming_examples/basic/vector_scalar_mul/aie2.py
+++ b/programming_examples/basic/vector_scalar_mul/aie2.py
@@ -25,7 +25,7 @@ def my_vector_scalar(vector_size, trace_size):
 
     vectorized = True
 
-    @device(AIEDevice.ipu)
+    @device(AIEDevice.npu)
     def device_body():
         memRef_ty = T.memref(n, T.i32())
         memRef_ty2 = T.memref(1, T.i32())
@@ -92,10 +92,10 @@ def sequence(A, F, C):
                     size=trace_size,
                     offset=N_in_bytes,
                 )
-            ipu_dma_memcpy_nd(metadata="out", bd_id=0, mem=C, sizes=[1, 1, 1, N])
-            ipu_dma_memcpy_nd(metadata="in", bd_id=1, mem=A, sizes=[1, 1, 1, N])
-            ipu_dma_memcpy_nd(metadata="infactor", bd_id=2, mem=F, sizes=[1, 1, 1, 1])
-            ipu_sync(column=0, row=0, direction=0, channel=0)
+            npu_dma_memcpy_nd(metadata="out", bd_id=0, mem=C, sizes=[1, 1, 1, N])
+            npu_dma_memcpy_nd(metadata="in", bd_id=1, mem=A, sizes=[1, 1, 1, N])
+            npu_dma_memcpy_nd(metadata="infactor", bd_id=2, mem=F, sizes=[1, 1, 1, 1])
+            npu_sync(column=0, row=0, direction=0, channel=0)
 
 
 try:
diff --git a/programming_examples/basic/vector_scalar_mul/run.lit b/programming_examples/basic/vector_scalar_mul/run.lit
index e599a22c04..fd55fdb97d 100644
--- a/programming_examples/basic/vector_scalar_mul/run.lit
+++ b/programming_examples/basic/vector_scalar_mul/run.lit
@@ -5,8 +5,8 @@
 //
 // RUN: xchesscc_wrapper aie2 -I %aietools/include -c %S/../../../aie_kernels/aie2/scale.cc -o ./scale.o
 // RUN: %python %S/aie2.py 4096 0 > ./aie.mlir
-// RUN: %python aiecc.py --xbridge --aie-generate-cdo --aie-generate-ipu --no-compile-host --xclbin-name=aie.xclbin --ipu-insts-name=insts.txt ./aie.mlir
+// RUN: %python aiecc.py --xbridge --aie-generate-cdo --aie-generate-npu --no-compile-host --xclbin-name=aie.xclbin --npu-insts-name=insts.txt ./aie.mlir
 // RUN: g++ %S/test.cpp -o test.exe -std=c++23 -Wall -DVECTORSCALARMUL_SIZE=4096 -I%S/../../../runtime_lib/test_lib %S/../../../runtime_lib/test_lib/test_utils.cpp %xrt_flags -lrt -lstdc++ -lboost_program_options -lboost_filesystem
-// RUN: %run_on_ipu ./test.exe -x aie.xclbin -k MLIR_AIE -i insts.txt | FileCheck %s
-// RUN: %run_on_ipu %python %S/test.py -x aie.xclbin -i insts.txt -k MLIR_AIE -s 4096 | FileCheck %s
+// RUN: %run_on_npu ./test.exe -x aie.xclbin -k MLIR_AIE -i insts.txt | FileCheck %s
+// RUN: %run_on_npu %python %S/test.py -x aie.xclbin -i insts.txt -k MLIR_AIE -s 4096 | FileCheck %s
 // CHECK: PASS!
diff --git a/programming_examples/basic/vector_sum/CMakeLists.txt b/programming_examples/basic/vector_sum/CMakeLists.txt
index f253b14fb0..5e637b4d7d 100644
--- a/programming_examples/basic/vector_sum/CMakeLists.txt
+++ b/programming_examples/basic/vector_sum/CMakeLists.txt
@@ -27,7 +27,7 @@ if (NOT WSL)
 else()
     set(BOOST_ROOT C:/Technical/thirdParty/boost_1_83_0 CACHE STRING "Path to Boost install")
     set(XRT_INC_DIR C:/Technical/XRT/src/runtime_src/core/include CACHE STRING "Path to XRT cloned repo")
-    set(XRT_LIB_DIR C:/Technical/xrtIPUfromDLL CACHE STRING "Path to xrt_coreutil.lib")
+    set(XRT_LIB_DIR C:/Technical/xrtNPUfromDLL CACHE STRING "Path to xrt_coreutil.lib")
 endif()
 
 set(TARGET_NAME test CACHE STRING "Target to be built")
diff --git a/programming_examples/basic/vector_sum/Makefile b/programming_examples/basic/vector_sum/Makefile
index 8c0372f191..e9c2016543 100755
--- a/programming_examples/basic/vector_sum/Makefile
+++ b/programming_examples/basic/vector_sum/Makefile
@@ -13,15 +13,15 @@ ACDC_AIE = $(dir $(shell which aie-opt))/..
 SHELL := /bin/bash
 
 targetname = vectorSum
-devicename = ipu
+devicename = npu
 col = 0
 
 all: build/final.xclbin
 
 build/final.xclbin: build/aie.mlir
 	mkdir -p ${@D}
-	cd ${@D} && aiecc.py --aie-generate-cdo --aie-generate-ipu --no-compile-host \
-		--xclbin-name=${@F} --ipu-insts-name=insts.txt ${<F}
+	cd ${@D} && aiecc.py --aie-generate-cdo --aie-generate-npu --no-compile-host \
+		--xclbin-name=${@F} --npu-insts-name=insts.txt ${<F}
 
 ${targetname}.exe: test.cpp
 	rm -rf _build
diff --git a/programming_examples/basic/vector_sum/README.md b/programming_examples/basic/vector_sum/README.md
index bb591ea622..60d9715528 100644
--- a/programming_examples/basic/vector_sum/README.md
+++ b/programming_examples/basic/vector_sum/README.md
@@ -10,9 +10,9 @@
 
 # <ins>Vector sum</ins>
 
-Single tile traverses through a vector in memory and returns the sum of each value in the vector. The tile that performs the operation is tile (`col`, 2) and the data is read from and written to external memory through Shim tile (`col`, 0). A buffer in tile (`col`, 2) is used to store the temporary maximum value during processing, which is then pushed through an object FIFO to the Shim tile when processing is complete. This reference design can be run on either a RyzenAI IPU or a VCK5000. The value of `col` is dependent on whether the application is targetting IPU or VCK5000.
+Single tile traverses through a vector in memory and returns the sum of all values in the vector. The tile that performs the operation is tile (`col`, 2) and the data is read from and written to external memory through Shim tile (`col`, 0). A buffer in tile (`col`, 2) is used to store the temporary maximum value during processing, which is then pushed through an object FIFO to the Shim tile when processing is complete. This reference design can be run on either a RyzenAI NPU or a VCK5000. The value of `col` is dependent on whether the application is targeting NPU or VCK5000.
 
-To compile and run the design for IPU:
+To compile and run the design for NPU:
 ```
 make
 make run
diff --git a/programming_examples/basic/vector_sum/aie2.py b/programming_examples/basic/vector_sum/aie2.py
index 4e40b8009c..8073833962 100755
--- a/programming_examples/basic/vector_sum/aie2.py
+++ b/programming_examples/basic/vector_sum/aie2.py
@@ -26,8 +26,8 @@ def my_vector_sum():
         if len(sys.argv) != 3:
             raise ValueError("[ERROR] Need 2 command line arguments (Device name, Col)")
 
-        if sys.argv[1] == "ipu":
-            dev = AIEDevice.ipu
+        if sys.argv[1] == "npu":
+            dev = AIEDevice.npu
         elif sys.argv[1] == "xcvc1902":
             dev = AIEDevice.xcvc1902
         else:
@@ -77,9 +77,9 @@ def core_body():
 
             @FuncOp.from_py_func(tensor_ty, tensor_ty, tensor_ty)
             def sequence(A, B, C):
-                ipu_dma_memcpy_nd(metadata="out", bd_id=0, mem=C, sizes=[1, 1, 1, 1])
-                ipu_dma_memcpy_nd(metadata="in", bd_id=1, mem=A, sizes=[1, 1, 1, N])
-                ipu_sync(column=0, row=0, direction=0, channel=0)
+                npu_dma_memcpy_nd(metadata="out", bd_id=0, mem=C, sizes=[1, 1, 1, 1])
+                npu_dma_memcpy_nd(metadata="in", bd_id=1, mem=A, sizes=[1, 1, 1, N])
+                npu_sync(column=0, row=0, direction=0, channel=0)
 
     print(ctx.module)
 
diff --git a/programming_examples/basic/vector_sum/run.lit b/programming_examples/basic/vector_sum/run.lit
index a429e99221..1922c01828 100644
--- a/programming_examples/basic/vector_sum/run.lit
+++ b/programming_examples/basic/vector_sum/run.lit
@@ -3,9 +3,9 @@
 //
 // REQUIRES: ryzen_ai
 //
-// RUN: %python %S/aie2.py ipu 0 > ./aie.mlir
-// RUN: %python aiecc.py --aie-generate-cdo --aie-generate-ipu --no-compile-host --xclbin-name=aie.xclbin --ipu-insts-name=insts.txt ./aie.mlir
+// RUN: %python %S/aie2.py npu 0 > ./aie.mlir
+// RUN: %python aiecc.py --aie-generate-cdo --aie-generate-npu --no-compile-host --xclbin-name=aie.xclbin --npu-insts-name=insts.txt ./aie.mlir
 // RUN: clang %S/test.cpp -o test.exe -std=c++11 -Wall %xrt_flags -lrt -lstdc++ -lboost_program_options -lboost_filesystem
-// RUN: %run_on_ipu ./test.exe -x aie.xclbin -k MLIR_AIE -i insts.txt | FileCheck %s
+// RUN: %run_on_npu ./test.exe -x aie.xclbin -k MLIR_AIE -i insts.txt | FileCheck %s
 // CHECK: PASS!
 
diff --git a/programming_examples/lit.cfg.py b/programming_examples/lit.cfg.py
index a03d2c7338..06db63f291 100755
--- a/programming_examples/lit.cfg.py
+++ b/programming_examples/lit.cfg.py
@@ -45,7 +45,7 @@
 # for python
 llvm_config.with_environment("PYTHONPATH", os.path.join(config.aie_obj_root, "python"))
 
-run_on_ipu = "echo"
+run_on_npu = "echo"
 xrt_flags = ""
 
 # Not using run_on_board anymore, need more specific per-platform commands
@@ -137,8 +137,8 @@
                     aie_model = m.group(2)
                     print("\tmodel:", aie_model)
                 config.available_features.add("ryzen_ai")
-                run_on_ipu = (
-                    f"flock /tmp/ipu.lock {config.aie_src_root}/utils/run_on_ipu.sh"
+                run_on_npu = (
+                    f"flock /tmp/npu.lock {config.aie_src_root}/utils/run_on_npu.sh"
                 )
     except:
         print("Failed to run xbutil")
@@ -146,7 +146,7 @@
 else:
     print("xrt not found")
 
-config.substitutions.append(("%run_on_ipu", run_on_ipu))
+config.substitutions.append(("%run_on_npu", run_on_npu))
 config.substitutions.append(("%xrt_flags", xrt_flags))
 config.substitutions.append(("%XRT_DIR", config.xrt_dir))
 config.environment["XRT_HACK_UNSECURE_LOADING_XCLBIN"] = "1"
diff --git a/programming_examples/makefile-common b/programming_examples/makefile-common
index 5ab55c2e08..b5007535b8 100644
--- a/programming_examples/makefile-common
+++ b/programming_examples/makefile-common
@@ -1,4 +1,4 @@
-# Contains common definitions used across the Makefiles of ipu-xrt tests.
+# Contains common definitions used across the Makefiles of npu-xrt tests.
 REPO_ROOT ?= $(shell realpath $(dir $(shell which aie-opt))/../../..)
 INSTALL_ROOT ?= $(shell realpath $(dir $(shell which aie-opt))/..)
 
diff --git a/programming_examples/ml/bottleneck/CMakeLists.txt b/programming_examples/ml/bottleneck/CMakeLists.txt
index 4b897cb29c..c7db0e9c5c 100644
--- a/programming_examples/ml/bottleneck/CMakeLists.txt
+++ b/programming_examples/ml/bottleneck/CMakeLists.txt
@@ -25,7 +25,7 @@ else()
     set(BOOST_ROOT C:/Technical/thirdParty/boost_1_83_0 CACHE STRING "Path to Boost install")
     set(OpenCV_DIR C:/Technical/thirdParty/opencv/build CACHE STRING "Path to OpenCV install")
     set(XRT_INC_DIR C:/Technical/XRT/src/runtime_src/core/include CACHE STRING "Path to XRT cloned repo")
-    set(XRT_LIB_DIR C:/Technical/xrtIPUfromDLL CACHE STRING "Path to xrt_coreutil.lib")
+    set(XRT_LIB_DIR C:/Technical/xrtNPUfromDLL CACHE STRING "Path to xrt_coreutil.lib")
 endif ()
 
 set(EDGEDETECT_WIDTH 1920 CACHE STRING "image width")
diff --git a/programming_examples/ml/bottleneck/Makefile b/programming_examples/ml/bottleneck/Makefile
index 47ca6a78f7..0133e02c7a 100755
--- a/programming_examples/ml/bottleneck/Makefile
+++ b/programming_examples/ml/bottleneck/Makefile
@@ -16,7 +16,7 @@ build/${mlirFileName}.mlir: aie2.py
 	python3 $< > $@
 
 insts.txt: build/${mlirFileName}.mlir
-	aiecc.py -v --aie-only-generate-ipu --ipu-insts-name=$@ $<
+	aiecc.py -v --aie-only-generate-npu --npu-insts-name=$@ $<
 
 build/conv2dk1.o: ../../../aie_kernels/aie2/conv2dk1.cc
 	xchesscc -d ${CHESSCC2_FLAGS} -DINT8_ACT -c $< -o $@
@@ -28,8 +28,8 @@ build/conv2dk1_skip.o: ../../../aie_kernels/aie2/conv2dk1_skip.cc
 	xchesscc -d ${CHESSCC2_FLAGS} -DINT8_ACT -c $< -o $@
 
 build/final.xclbin: build/${mlirFileName}.mlir 
-	cd build && aiecc.py -v --aie-generate-cdo --aie-generate-ipu --no-compile-host \
-		--xclbin-name=${@F} --ipu-insts-name=insts.txt $(<:%=../%)
+	cd build && aiecc.py -v --aie-generate-cdo --aie-generate-npu --no-compile-host \
+		--xclbin-name=${@F} --npu-insts-name=insts.txt $(<:%=../%)
 
 clean:
 	rm -rf build *.elf* *.lst *.bif ${mlirFileName}.mlir.prj log .xclbin sim \
diff --git a/programming_examples/ml/bottleneck/aie2.py b/programming_examples/ml/bottleneck/aie2.py
index a488ae8ded..ac349259f4 100644
--- a/programming_examples/ml/bottleneck/aie2.py
+++ b/programming_examples/ml/bottleneck/aie2.py
@@ -38,7 +38,7 @@
 def bottleneck4AIEs():
     with mlir_mod_ctx() as ctx:
 
-        @device(AIEDevice.ipu)
+        @device(AIEDevice.npu)
         def deviceBody():
 
             # define types
@@ -543,9 +543,9 @@ def sequence(inputFromL3, weightsFromL3, outputToL3):
                     # EVENTS_CORE_PORT_RUNNING_0 (0x4B)
 
                     # Trace_Event0  (4 slots)
-                    ipu_write32(0, 4, 0x340E0, 0x4B222125)
+                    npu_write32(0, 4, 0x340E0, 0x4B222125)
                     # Trace_Event1  (4 slots)
-                    ipu_write32(0, 4, 0x340E4, 0x2D2C1A4F)
+                    npu_write32(0, 4, 0x340E4, 0x2D2C1A4F)
 
                     # Event slots as configured above:
                     # 0: Kernel executes vector instruction
@@ -559,13 +559,13 @@ def sequence(inputFromL3, weightsFromL3, outputToL3):
 
                     # Stream_Switch_Event_Port_Selection_0
                     # This is necessary to capture the Port_Running_0 and Port_Running_1 events
-                    ipu_write32(0, 4, 0x3FF00, 0x121)
+                    npu_write32(0, 4, 0x3FF00, 0x121)
 
                     # Trace_Control0: Define trace start and stop triggers. Set start event TRUE.
-                    ipu_write32(0, 4, 0x340D0, 0x10000)
+                    npu_write32(0, 4, 0x340D0, 0x10000)
 
                     # Start trace copy out.
-                    ipu_writebd_shimtile(
+                    npu_writebd_shimtile(
                         bd_id=3,
                         buffer_length=trace_sz_in_i32s,
                         buffer_offset=acitivationsOutSize32b,
@@ -593,45 +593,45 @@ def sequence(inputFromL3, weightsFromL3, outputToL3):
                         use_next_bd=0,
                         valid_bd=1,
                     )
-                    ipu_write32(0, 2, 0x1D20C, 0x3)
+                    npu_write32(0, 2, 0x1D20C, 0x3)
 
                 # write RTP parameters
-                IpuWriteRTPOp(
+                NpuWriteRTPOp(
                     "rtpComputeTile2", col=0, row=2, index=0, value=1
                 )  # scale
-                IpuWriteRTPOp(
+                NpuWriteRTPOp(
                     "rtpComputeTile3", col=0, row=3, index=0, value=1
                 )  # scale
-                IpuWriteRTPOp(
+                NpuWriteRTPOp(
                     "rtpComputeTile5", col=0, row=5, index=0, value=1
                 )  # scale
-                IpuWriteRTPOp(
+                NpuWriteRTPOp(
                     "rtpComputeTile4", col=0, row=4, index=0, value=1
                 )  # scale: conv1x1 with the same scale as the input so we match the scaling factor of output after conv1x1 and the initial input
-                IpuWriteRTPOp(
+                NpuWriteRTPOp(
                     "rtpComputeTile4", col=0, row=4, index=1, value=0
                 )  # skip_scale
 
-                ipu_dma_memcpy_nd(
+                npu_dma_memcpy_nd(
                     metadata="inOF_act_L3L2",
                     bd_id=0,
                     mem=inputFromL3,
                     sizes=[1, 1, 1, activationsInSize32b],
                 )
-                ipu_dma_memcpy_nd(
+                npu_dma_memcpy_nd(
                     metadata="outOFL2L3",
                     bd_id=2,
                     mem=outputToL3,
                     sizes=[1, 1, 1, acitivationsOutSize32b],
                 )
-                ipu_dma_memcpy_nd(
+                npu_dma_memcpy_nd(
                     metadata="inOF_wts_0_L3L2",
                     bd_id=1,
                     mem=weightsFromL3,
                     sizes=[1, 1, 1, totalWeightsSize32b],
                 )
 
-                ipu_sync(column=0, row=0, direction=0, channel=0)
+                npu_sync(column=0, row=0, direction=0, channel=0)
 
     print(ctx.module)
 
diff --git a/programming_examples/ml/bottleneck/run.lit b/programming_examples/ml/bottleneck/run.lit
index 8a6024d66e..7d6f2abcc4 100644
--- a/programming_examples/ml/bottleneck/run.lit
+++ b/programming_examples/ml/bottleneck/run.lit
@@ -7,6 +7,6 @@
 // RUN: xchesscc_wrapper aie2 -I %aietools/include -DBIT_WIDTH=8 -DUINT8_ACT -c %S/../../../aie_kernels/aie2/conv2dk3.cc -o conv2dk3.o
 // RUN: xchesscc_wrapper aie2 -I %aietools/include -DBIT_WIDTH=8 -DINT8_ACT -c %S/../../../aie_kernels/aie2/conv2dk1_skip.cc -o conv2dk1_skip.o
 // RUN: %python %S/aie2.py | aie-opt -cse -canonicalize -o ./aie.mlir
-// RUN: %python aiecc.py --xbridge --aie-generate-cdo --aie-generate-ipu --no-compile-host --xclbin-name=aie.xclbin --ipu-insts-name=insts.txt ./aie.mlir
-// RUN: %run_on_ipu %python %S/test.py -x aie.xclbin -i insts.txt -k MLIR_AIE | FileCheck %s
+// RUN: %python aiecc.py --xbridge --aie-generate-cdo --aie-generate-npu --no-compile-host --xclbin-name=aie.xclbin --npu-insts-name=insts.txt ./aie.mlir
+// RUN: %run_on_npu %python %S/test.py -x aie.xclbin -i insts.txt -k MLIR_AIE | FileCheck %s
 // CHECK: PASS!
\ No newline at end of file
diff --git a/programming_examples/ml/conv2d/CMakeLists.txt b/programming_examples/ml/conv2d/CMakeLists.txt
index 4b897cb29c..c7db0e9c5c 100644
--- a/programming_examples/ml/conv2d/CMakeLists.txt
+++ b/programming_examples/ml/conv2d/CMakeLists.txt
@@ -25,7 +25,7 @@ else()
     set(BOOST_ROOT C:/Technical/thirdParty/boost_1_83_0 CACHE STRING "Path to Boost install")
     set(OpenCV_DIR C:/Technical/thirdParty/opencv/build CACHE STRING "Path to OpenCV install")
     set(XRT_INC_DIR C:/Technical/XRT/src/runtime_src/core/include CACHE STRING "Path to XRT cloned repo")
-    set(XRT_LIB_DIR C:/Technical/xrtIPUfromDLL CACHE STRING "Path to xrt_coreutil.lib")
+    set(XRT_LIB_DIR C:/Technical/xrtNPUfromDLL CACHE STRING "Path to xrt_coreutil.lib")
 endif ()
 
 set(EDGEDETECT_WIDTH 1920 CACHE STRING "image width")
diff --git a/programming_examples/ml/conv2d/Makefile b/programming_examples/ml/conv2d/Makefile
index 0f4c925ed3..c5f3576a1b 100755
--- a/programming_examples/ml/conv2d/Makefile
+++ b/programming_examples/ml/conv2d/Makefile
@@ -18,14 +18,14 @@ build/${mlirFileName}.mlir: aie2.py
 
 
 insts.txt: build/${mlirFileName}.mlir
-	aiecc.py -v --aie-only-generate-ipu --ipu-insts-name=$@ $<
+	aiecc.py -v --aie-only-generate-npu --npu-insts-name=$@ $<
 
 build/conv2dk1_i8.o: ../../../aie_kernels/aie2/conv2dk1_i8.cc
 	xchesscc -d ${CHESSCC2_FLAGS} -DINT8_ACT -c $< -o $@
 
 build/final.xclbin: build/${mlirFileName}.mlir 
-	cd build && aiecc.py -v --aie-generate-cdo --aie-generate-ipu --no-compile-host \
-		--xclbin-name=${@F} --ipu-insts-name=insts.txt $(<:%=../%)
+	cd build && aiecc.py -v --aie-generate-cdo --aie-generate-npu --no-compile-host \
+		--xclbin-name=${@F} --npu-insts-name=insts.txt $(<:%=../%)
 
 clean:
 	rm -rf build *.elf* *.lst *.bif ${mlirFileName}.mlir.prj log* *.xclbin sim \
diff --git a/programming_examples/ml/conv2d/aie2.py b/programming_examples/ml/conv2d/aie2.py
index 74a2c38838..82584170cf 100644
--- a/programming_examples/ml/conv2d/aie2.py
+++ b/programming_examples/ml/conv2d/aie2.py
@@ -42,7 +42,7 @@
 def conv2dk1():
     with mlir_mod_ctx() as ctx:
 
-        @device(AIEDevice.ipu)
+        @device(AIEDevice.npu)
         def device_body():
 
             actIn_ty = T.memref(actIn, T.i8())
@@ -162,14 +162,14 @@ def sequence(I, W, O):
                     #              BB      <- Event to start trace capture
                     #                   C  <- Trace mode, 00=event=time, 01=event-PC, 10=execution
                     # Configure so that "Event 1" (always true) causes tracing to start
-                    ipu_write32(
+                    npu_write32(
                         column=compute_tile2_col,
                         row=compute_tile2_row,
                         address=0x340D0,
                         value=0x00010000,
                     )
                     # 0x340D4: Trace Control 1
-                    ipu_write32(
+                    npu_write32(
                         column=compute_tile2_col,
                         row=compute_tile2_row,
                         address=0x340D4,
@@ -177,7 +177,7 @@ def sequence(I, W, O):
                     )
                     # 0x340E0: Trace Event Group 1  (Which events to trace)
                     #          0xAABBCCDD    AA, BB, CC, DD <- four event slots
-                    ipu_write32(
+                    npu_write32(
                         column=compute_tile2_col,
                         row=compute_tile2_row,
                         address=0x340E0,
@@ -185,14 +185,14 @@ def sequence(I, W, O):
                     )
                     # 0x340E4: Trace Event Group 2  (Which events to trace)
                     #          0xAABBCCDD    AA, BB, CC, DD <- four event slots
-                    ipu_write32(
+                    npu_write32(
                         column=compute_tile2_col,
                         row=compute_tile2_row,
                         address=0x340E4,
                         value=0x2D2C1A4F,
                     )
 
-                    ipu_write32(
+                    npu_write32(
                         column=compute_tile2_col,
                         row=compute_tile2_row,
                         address=0x3FF00,
@@ -203,7 +203,7 @@ def sequence(I, W, O):
                     # out to host DDR memory
                     trace_bd_id = 13  # use BD 13 for writing trace output from compute tile to DDR host memory
                     output_size = bufOut
-                    ipu_writebd_shimtile(
+                    npu_writebd_shimtile(
                         bd_id=trace_bd_id,
                         buffer_length=trace_size,
                         buffer_offset=output_size,
@@ -232,29 +232,29 @@ def sequence(I, W, O):
                         valid_bd=1,
                     )
                     # Set start BD to our shim bd_Id (3)
-                    ipu_write32(column=0, row=0, address=0x1D20C, value=trace_bd_id)
+                    npu_write32(column=0, row=0, address=0x1D20C, value=trace_bd_id)
 
-                IpuWriteRTPOp("rtp2", col=0, row=2, index=0, value=10)
+                NpuWriteRTPOp("rtp2", col=0, row=2, index=0, value=10)
 
-                ipu_dma_memcpy_nd(
+                npu_dma_memcpy_nd(
                     metadata="inOF_act_L3L2",
                     bd_id=0,
                     mem=I,
                     sizes=[1, 1, 1, tensorSizeInInt32s],
                 )
-                ipu_dma_memcpy_nd(
+                npu_dma_memcpy_nd(
                     metadata="outOFL2L3",
                     bd_id=2,
                     mem=O,
                     sizes=[1, 1, 1, tensorSizeInInt32s],
                 )
-                ipu_dma_memcpy_nd(
+                npu_dma_memcpy_nd(
                     metadata="inOF_wts_0_L3L2",
                     bd_id=2,
                     mem=W,
                     sizes=[1, 1, 1, weightsInInt32s],
                 )
-                ipu_sync(column=0, row=0, direction=0, channel=0)
+                npu_sync(column=0, row=0, direction=0, channel=0)
 
     #    print(ctx.module.operation.verify())
     print(ctx.module)
diff --git a/programming_examples/ml/conv2d/run.lit b/programming_examples/ml/conv2d/run.lit
index 349e45f9bc..59c3c8b031 100644
--- a/programming_examples/ml/conv2d/run.lit
+++ b/programming_examples/ml/conv2d/run.lit
@@ -5,6 +5,6 @@
 //
 // RUN: xchesscc_wrapper aie2 -I %aietools/include -DBIT_WIDTH=8  -DINT8_ACT  -c %S/../../../aie_kernels/aie2/conv2dk1_i8.cc -o conv2dk1_i8.o
 // RUN: %python %S/aie2.py | aie-opt -cse -canonicalize -o ./aie.mlir
-// RUN: %python aiecc.py --xbridge --aie-generate-cdo --aie-generate-ipu --no-compile-host --xclbin-name=aie.xclbin --ipu-insts-name=insts.txt ./aie.mlir
-// RUN: %run_on_ipu %python %S/test.py -x aie.xclbin -i insts.txt -k MLIR_AIE | FileCheck %s
+// RUN: %python aiecc.py --xbridge --aie-generate-cdo --aie-generate-npu --no-compile-host --xclbin-name=aie.xclbin --npu-insts-name=insts.txt ./aie.mlir
+// RUN: %run_on_npu %python %S/test.py -x aie.xclbin -i insts.txt -k MLIR_AIE | FileCheck %s
 // CHECK: PASS!
\ No newline at end of file
diff --git a/programming_examples/ml/conv2d_fused_relu/CMakeLists.txt b/programming_examples/ml/conv2d_fused_relu/CMakeLists.txt
index 4b897cb29c..c7db0e9c5c 100644
--- a/programming_examples/ml/conv2d_fused_relu/CMakeLists.txt
+++ b/programming_examples/ml/conv2d_fused_relu/CMakeLists.txt
@@ -25,7 +25,7 @@ else()
     set(BOOST_ROOT C:/Technical/thirdParty/boost_1_83_0 CACHE STRING "Path to Boost install")
     set(OpenCV_DIR C:/Technical/thirdParty/opencv/build CACHE STRING "Path to OpenCV install")
     set(XRT_INC_DIR C:/Technical/XRT/src/runtime_src/core/include CACHE STRING "Path to XRT cloned repo")
-    set(XRT_LIB_DIR C:/Technical/xrtIPUfromDLL CACHE STRING "Path to xrt_coreutil.lib")
+    set(XRT_LIB_DIR C:/Technical/xrtNPUfromDLL CACHE STRING "Path to xrt_coreutil.lib")
 endif ()
 
 set(EDGEDETECT_WIDTH 1920 CACHE STRING "image width")
diff --git a/programming_examples/ml/conv2d_fused_relu/Makefile b/programming_examples/ml/conv2d_fused_relu/Makefile
index 5911238a7a..0ee3a81d07 100755
--- a/programming_examples/ml/conv2d_fused_relu/Makefile
+++ b/programming_examples/ml/conv2d_fused_relu/Makefile
@@ -17,14 +17,14 @@ build/${mlirFileName}.mlir: aie2.py
 
 
 insts.txt: build/${mlirFileName}.mlir
-	aiecc.py -v --aie-only-generate-ipu --ipu-insts-name=$@ $<
+	aiecc.py -v --aie-only-generate-npu --npu-insts-name=$@ $<
 
 build/conv2dk1.o: ../../../aie_kernels/aie2/conv2dk1.cc
 	xchesscc -d ${CHESSCC2_FLAGS} -DINT8_ACT -c $< -o $@
 
 build/final.xclbin: build/${mlirFileName}.mlir 
-	cd build && aiecc.py -v --aie-generate-cdo --aie-generate-ipu --no-compile-host \
-		--xclbin-name=${@F} --ipu-insts-name=insts.txt $(<:%=../%)
+	cd build && aiecc.py -v --aie-generate-cdo --aie-generate-npu --no-compile-host \
+		--xclbin-name=${@F} --npu-insts-name=insts.txt $(<:%=../%)
 
 clean:
 	rm -rf build *.elf* *.lst *.bif ${mlirFileName}.mlir.prj log* *.xclbin sim \
diff --git a/programming_examples/ml/conv2d_fused_relu/aie2.py b/programming_examples/ml/conv2d_fused_relu/aie2.py
index be0167e3b4..13a59f0934 100644
--- a/programming_examples/ml/conv2d_fused_relu/aie2.py
+++ b/programming_examples/ml/conv2d_fused_relu/aie2.py
@@ -42,7 +42,7 @@
 def conv2dk1():
     with mlir_mod_ctx() as ctx:
 
-        @device(AIEDevice.ipu)
+        @device(AIEDevice.npu)
         def device_body():
 
             actIn_ty = T.memref(actIn, T.i8())
@@ -162,14 +162,14 @@ def sequence(I, W, O):
                     #              BB      <- Event to start trace capture
                     #                   C  <- Trace mode, 00=event=time, 01=event-PC, 10=execution
                     # Configure so that "Event 1" (always true) causes tracing to start
-                    ipu_write32(
+                    npu_write32(
                         column=compute_tile2_col,
                         row=compute_tile2_row,
                         address=0x340D0,
                         value=0x00010000,
                     )
                     # 0x340D4: Trace Control 1
-                    ipu_write32(
+                    npu_write32(
                         column=compute_tile2_col,
                         row=compute_tile2_row,
                         address=0x340D4,
@@ -177,7 +177,7 @@ def sequence(I, W, O):
                     )
                     # 0x340E0: Trace Event Group 1  (Which events to trace)
                     #          0xAABBCCDD    AA, BB, CC, DD <- four event slots
-                    ipu_write32(
+                    npu_write32(
                         column=compute_tile2_col,
                         row=compute_tile2_row,
                         address=0x340E0,
@@ -185,14 +185,14 @@ def sequence(I, W, O):
                     )
                     # 0x340E4: Trace Event Group 2  (Which events to trace)
                     #          0xAABBCCDD    AA, BB, CC, DD <- four event slots
-                    ipu_write32(
+                    npu_write32(
                         column=compute_tile2_col,
                         row=compute_tile2_row,
                         address=0x340E4,
                         value=0x2D2C1A4F,
                     )
 
-                    ipu_write32(
+                    npu_write32(
                         column=compute_tile2_col,
                         row=compute_tile2_row,
                         address=0x3FF00,
@@ -203,7 +203,7 @@ def sequence(I, W, O):
                     # out to host DDR memory
                     trace_bd_id = 13  # use BD 13 for writing trace output from compute tile to DDR host memory
                     output_size = bufOut
-                    ipu_writebd_shimtile(
+                    npu_writebd_shimtile(
                         bd_id=trace_bd_id,
                         buffer_length=trace_size,
                         buffer_offset=output_size,
@@ -232,29 +232,29 @@ def sequence(I, W, O):
                         valid_bd=1,
                     )
                     # Set start BD to our shim bd_Id (3)
-                    ipu_write32(column=0, row=0, address=0x1D20C, value=trace_bd_id)
+                    npu_write32(column=0, row=0, address=0x1D20C, value=trace_bd_id)
 
-                IpuWriteRTPOp("rtp2", col=0, row=2, index=0, value=1)
+                NpuWriteRTPOp("rtp2", col=0, row=2, index=0, value=1)
 
-                ipu_dma_memcpy_nd(
+                npu_dma_memcpy_nd(
                     metadata="inOF_act_L3L2",
                     bd_id=0,
                     mem=I,
                     sizes=[1, 1, 1, tensorSizeInInt32s],
                 )
-                ipu_dma_memcpy_nd(
+                npu_dma_memcpy_nd(
                     metadata="outOFL2L3",
                     bd_id=2,
                     mem=O,
                     sizes=[1, 1, 1, tensorSizeInInt32s],
                 )
-                ipu_dma_memcpy_nd(
+                npu_dma_memcpy_nd(
                     metadata="inOF_wts_0_L3L2",
                     bd_id=2,
                     mem=W,
                     sizes=[1, 1, 1, weightsInInt32s],
                 )
-                ipu_sync(column=0, row=0, direction=0, channel=0)
+                npu_sync(column=0, row=0, direction=0, channel=0)
 
     #    print(ctx.module.operation.verify())
     print(ctx.module)
diff --git a/programming_examples/ml/conv2d_fused_relu/run.lit b/programming_examples/ml/conv2d_fused_relu/run.lit
index cfddde9013..07d9b0b5b7 100644
--- a/programming_examples/ml/conv2d_fused_relu/run.lit
+++ b/programming_examples/ml/conv2d_fused_relu/run.lit
@@ -5,6 +5,6 @@
 //
 // RUN: xchesscc_wrapper aie2 -I %aietools/include -DINT8_ACT -DBIT_WIDTH=8 -c %S/../../../aie_kernels/aie2/conv2dk1.cc -o conv2dk1.o
 // RUN: %python %S/aie2.py | aie-opt -cse -canonicalize -o ./aie.mlir
-// RUN: %python aiecc.py --xbridge --aie-generate-cdo --aie-generate-ipu --no-compile-host --xclbin-name=aie.xclbin --ipu-insts-name=insts.txt ./aie.mlir
-// RUN: %run_on_ipu %python %S/test.py -x aie.xclbin -i insts.txt -k MLIR_AIE | FileCheck %s
+// RUN: %python aiecc.py --xbridge --aie-generate-cdo --aie-generate-npu --no-compile-host --xclbin-name=aie.xclbin --npu-insts-name=insts.txt ./aie.mlir
+// RUN: %run_on_npu %python %S/test.py -x aie.xclbin -i insts.txt -k MLIR_AIE | FileCheck %s
 // CHECK: PASS!
\ No newline at end of file
diff --git a/programming_examples/ml/eltwise_add/CMakeLists.txt b/programming_examples/ml/eltwise_add/CMakeLists.txt
index c4ca0825d4..20f5d8a4a3 100644
--- a/programming_examples/ml/eltwise_add/CMakeLists.txt
+++ b/programming_examples/ml/eltwise_add/CMakeLists.txt
@@ -27,7 +27,7 @@ if (NOT WSL)
 else()
     set(BOOST_ROOT C:/Technical/thirdParty/boost_1_83_0 CACHE STRING "Path to Boost install")
     set(XRT_INC_DIR C:/Technical/XRT/src/runtime_src/core/include CACHE STRING "Path to XRT cloned repo")
-    set(XRT_LIB_DIR C:/Technical/xrtIPUfromDLL CACHE STRING "Path to xrt_coreutil.lib")
+    set(XRT_LIB_DIR C:/Technical/xrtNPUfromDLL CACHE STRING "Path to xrt_coreutil.lib")
 endif()
 
 set(TARGET_NAME test CACHE STRING "Target to be built")
diff --git a/programming_examples/ml/eltwise_add/Makefile b/programming_examples/ml/eltwise_add/Makefile
index e0bb8ecc2c..702bd770ff 100644
--- a/programming_examples/ml/eltwise_add/Makefile
+++ b/programming_examples/ml/eltwise_add/Makefile
@@ -34,13 +34,13 @@ build/aie_trace.mlir: aie2.py
 
 build/final.xclbin: build/aie.mlir build/add.o
 	mkdir -p ${@D}
-	cd ${@D} && aiecc.py --aie-generate-cdo --aie-generate-ipu --no-compile-host \
-		--xclbin-name=${@F} --ipu-insts-name=insts.txt ${<F}
+	cd ${@D} && aiecc.py --aie-generate-cdo --aie-generate-npu --no-compile-host \
+		--xclbin-name=${@F} --npu-insts-name=insts.txt ${<F}
 
 build/final_trace.xclbin: build/aie_trace.mlir build/add.o
 	mkdir -p ${@D}
-	cd ${@D} && aiecc.py --aie-generate-cdo --aie-generate-ipu --no-compile-host \
-		--xclbin-name=${@F} --ipu-insts-name=insts.txt ${<F}
+	cd ${@D} && aiecc.py --aie-generate-cdo --aie-generate-npu --no-compile-host \
+		--xclbin-name=${@F} --npu-insts-name=insts.txt ${<F}
 
 ${targetname}.exe: test.cpp
 	rm -rf _build
diff --git a/programming_examples/ml/eltwise_add/aie2.py b/programming_examples/ml/eltwise_add/aie2.py
index a95d5e37ef..3127409485 100644
--- a/programming_examples/ml/eltwise_add/aie2.py
+++ b/programming_examples/ml/eltwise_add/aie2.py
@@ -33,7 +33,7 @@ def my_eltwise_add(trace_size):
     tiles = N_div_n // n_cores
     buffer_depth = 2
 
-    @device(AIEDevice.ipu)
+    @device(AIEDevice.npu)
     def device_body():
         memRef_ty = T.memref(n, T.bf16())
 
@@ -143,16 +143,16 @@ def sequence(A, B, C):
                     offset=N_in_bytes,
                 )
 
-            ipu_dma_memcpy_nd(
+            npu_dma_memcpy_nd(
                 metadata="outC", bd_id=0, mem=C, sizes=[1, 1, 1, C_sz_in_i32s]
             )
-            ipu_dma_memcpy_nd(
+            npu_dma_memcpy_nd(
                 metadata="inA", bd_id=1, mem=A, sizes=[1, 1, 1, A_sz_in_i32s]
             )
-            ipu_dma_memcpy_nd(
+            npu_dma_memcpy_nd(
                 metadata="inB", bd_id=2, mem=B, sizes=[1, 1, 1, B_sz_in_i32s]
             )
-            ipu_sync(column=0, row=0, direction=0, channel=0)
+            npu_sync(column=0, row=0, direction=0, channel=0)
 
 
 try:
diff --git a/programming_examples/ml/eltwise_add/run.lit b/programming_examples/ml/eltwise_add/run.lit
index 8e6562b9e3..863e0d23c4 100644
--- a/programming_examples/ml/eltwise_add/run.lit
+++ b/programming_examples/ml/eltwise_add/run.lit
@@ -5,7 +5,7 @@
 //
 // RUN: xchesscc_wrapper aie2 -I %aietools/include -c %S/../../../aie_kernels/aie2/add.cc -o add.o
 // RUN: %python %S/aie2.py 4096 | aie-opt -cse -canonicalize -o ./aie.mlir
-// RUN: %python aiecc.py --xbridge --aie-generate-cdo --aie-generate-ipu --no-compile-host --xclbin-name=aie.xclbin --ipu-insts-name=insts.txt ./aie.mlir
+// RUN: %python aiecc.py --xbridge --aie-generate-cdo --aie-generate-npu --no-compile-host --xclbin-name=aie.xclbin --npu-insts-name=insts.txt ./aie.mlir
 // RUN: g++-13 %S/test.cpp -o test.exe -std=c++23 -Wall  -I%S/../../../runtime_lib/test_lib %S/../../../runtime_lib/test_lib/test_utils.cpp %xrt_flags -lrt -lstdc++ -lboost_program_options -lboost_filesystem
-// RUN: %run_on_ipu ./test.exe -x aie.xclbin -k MLIR_AIE -i insts.txt | FileCheck %s
+// RUN: %run_on_npu ./test.exe -x aie.xclbin -k MLIR_AIE -i insts.txt | FileCheck %s
 // CHECK: PASS!
diff --git a/programming_examples/ml/eltwise_mul/CMakeLists.txt b/programming_examples/ml/eltwise_mul/CMakeLists.txt
index c4ca0825d4..20f5d8a4a3 100644
--- a/programming_examples/ml/eltwise_mul/CMakeLists.txt
+++ b/programming_examples/ml/eltwise_mul/CMakeLists.txt
@@ -27,7 +27,7 @@ if (NOT WSL)
 else()
     set(BOOST_ROOT C:/Technical/thirdParty/boost_1_83_0 CACHE STRING "Path to Boost install")
     set(XRT_INC_DIR C:/Technical/XRT/src/runtime_src/core/include CACHE STRING "Path to XRT cloned repo")
-    set(XRT_LIB_DIR C:/Technical/xrtIPUfromDLL CACHE STRING "Path to xrt_coreutil.lib")
+    set(XRT_LIB_DIR C:/Technical/xrtNPUfromDLL CACHE STRING "Path to xrt_coreutil.lib")
 endif()
 
 set(TARGET_NAME test CACHE STRING "Target to be built")
diff --git a/programming_examples/ml/eltwise_mul/Makefile b/programming_examples/ml/eltwise_mul/Makefile
index 6ac247fbcc..60a39bb29b 100644
--- a/programming_examples/ml/eltwise_mul/Makefile
+++ b/programming_examples/ml/eltwise_mul/Makefile
@@ -27,13 +27,13 @@ build/aie_trace.mlir: aie2.py
 
 build/final.xclbin: build/aie.mlir build/mul.o
 	mkdir -p ${@D}
-	cd ${@D} && aiecc.py --aie-generate-cdo --aie-generate-ipu --no-compile-host \
-		--xclbin-name=${@F} --ipu-insts-name=insts.txt ${<F}
+	cd ${@D} && aiecc.py --aie-generate-cdo --aie-generate-npu --no-compile-host \
+		--xclbin-name=${@F} --npu-insts-name=insts.txt ${<F}
 
 build/final_trace.xclbin: build/aie_trace.mlir build/mul.o
 	mkdir -p ${@D}
-	cd ${@D} && aiecc.py --aie-generate-cdo --aie-generate-ipu --no-compile-host \
-		--xclbin-name=${@F} --ipu-insts-name=insts.txt ${<F}
+	cd ${@D} && aiecc.py --aie-generate-cdo --aie-generate-npu --no-compile-host \
+		--xclbin-name=${@F} --npu-insts-name=insts.txt ${<F}
 
 
 ${targetname}.exe: test.cpp
diff --git a/programming_examples/ml/eltwise_mul/aie2.py b/programming_examples/ml/eltwise_mul/aie2.py
index 7a0a0670a9..7f60f0ce43 100644
--- a/programming_examples/ml/eltwise_mul/aie2.py
+++ b/programming_examples/ml/eltwise_mul/aie2.py
@@ -33,7 +33,7 @@ def my_eltwise_mul(trace_size):
     tiles = N_div_n // n_cores
     buffer_depth = 2
 
-    @device(AIEDevice.ipu)
+    @device(AIEDevice.npu)
     def device_body():
         memRef_ty = T.memref(n, T.bf16())
 
@@ -144,16 +144,16 @@ def sequence(A, B, C):
                     offset=N_in_bytes,
                 )
 
-            ipu_dma_memcpy_nd(
+            npu_dma_memcpy_nd(
                 metadata="outC", bd_id=0, mem=C, sizes=[1, 1, 1, C_sz_in_i32s]
             )
-            ipu_dma_memcpy_nd(
+            npu_dma_memcpy_nd(
                 metadata="inA", bd_id=1, mem=A, sizes=[1, 1, 1, A_sz_in_i32s]
             )
-            ipu_dma_memcpy_nd(
+            npu_dma_memcpy_nd(
                 metadata="inB", bd_id=2, mem=B, sizes=[1, 1, 1, B_sz_in_i32s]
             )
-            ipu_sync(column=0, row=0, direction=0, channel=0)
+            npu_sync(column=0, row=0, direction=0, channel=0)
 
 
 try:
diff --git a/programming_examples/ml/eltwise_mul/run.lit b/programming_examples/ml/eltwise_mul/run.lit
index b9a8bd9f2d..c6e794acae 100644
--- a/programming_examples/ml/eltwise_mul/run.lit
+++ b/programming_examples/ml/eltwise_mul/run.lit
@@ -5,7 +5,7 @@
 //
 // RUN: xchesscc_wrapper aie2 -I %aietools/include -c %S/../../../aie_kernels/aie2/mul.cc -o mul.o
 // RUN: %python %S/aie2.py 4096 | aie-opt -cse -canonicalize -o ./aie.mlir
-// RUN: %python aiecc.py --xbridge --aie-generate-cdo --aie-generate-ipu --no-compile-host --xclbin-name=aie.xclbin --ipu-insts-name=insts.txt ./aie.mlir
+// RUN: %python aiecc.py --xbridge --aie-generate-cdo --aie-generate-npu --no-compile-host --xclbin-name=aie.xclbin --npu-insts-name=insts.txt ./aie.mlir
 // RUN: g++-13 %S/test.cpp -o test.exe -std=c++23 -Wall  -I%S/../../../runtime_lib/test_lib %S/../../../runtime_lib/test_lib/test_utils.cpp %xrt_flags -lrt -lstdc++ -lboost_program_options -lboost_filesystem
-// RUN: %run_on_ipu ./test.exe -x aie.xclbin -k MLIR_AIE -i insts.txt | FileCheck %s
+// RUN: %run_on_npu ./test.exe -x aie.xclbin -k MLIR_AIE -i insts.txt | FileCheck %s
 // CHECK: PASS!
diff --git a/programming_examples/ml/relu/CMakeLists.txt b/programming_examples/ml/relu/CMakeLists.txt
index c4ca0825d4..20f5d8a4a3 100644
--- a/programming_examples/ml/relu/CMakeLists.txt
+++ b/programming_examples/ml/relu/CMakeLists.txt
@@ -27,7 +27,7 @@ if (NOT WSL)
 else()
     set(BOOST_ROOT C:/Technical/thirdParty/boost_1_83_0 CACHE STRING "Path to Boost install")
     set(XRT_INC_DIR C:/Technical/XRT/src/runtime_src/core/include CACHE STRING "Path to XRT cloned repo")
-    set(XRT_LIB_DIR C:/Technical/xrtIPUfromDLL CACHE STRING "Path to xrt_coreutil.lib")
+    set(XRT_LIB_DIR C:/Technical/xrtNPUfromDLL CACHE STRING "Path to xrt_coreutil.lib")
 endif()
 
 set(TARGET_NAME test CACHE STRING "Target to be built")
diff --git a/programming_examples/ml/relu/Makefile b/programming_examples/ml/relu/Makefile
index 6d18aa93db..4d52e165e1 100644
--- a/programming_examples/ml/relu/Makefile
+++ b/programming_examples/ml/relu/Makefile
@@ -27,13 +27,13 @@ build/aie_trace.mlir: aie2.py
 
 build/final.xclbin: build/aie.mlir build/relu.o
 	mkdir -p ${@D}
-	cd ${@D} && aiecc.py --aie-generate-cdo --aie-generate-ipu --no-compile-host \
-		--xclbin-name=${@F} --ipu-insts-name=insts.txt ${<F}
+	cd ${@D} && aiecc.py --aie-generate-cdo --aie-generate-npu --no-compile-host \
+		--xclbin-name=${@F} --npu-insts-name=insts.txt ${<F}
 
 build/final_trace.xclbin: build/aie_trace.mlir build/relu.o
 	mkdir -p ${@D}
-	cd ${@D} && aiecc.py --aie-generate-cdo --aie-generate-ipu --no-compile-host \
-		--xclbin-name=${@F} --ipu-insts-name=insts.txt ${<F}
+	cd ${@D} && aiecc.py --aie-generate-cdo --aie-generate-npu --no-compile-host \
+		--xclbin-name=${@F} --npu-insts-name=insts.txt ${<F}
 
 ${targetname}.exe: test.cpp
 	rm -rf _build
diff --git a/programming_examples/ml/relu/aie2.py b/programming_examples/ml/relu/aie2.py
index 0580a0113d..02e201eb7d 100644
--- a/programming_examples/ml/relu/aie2.py
+++ b/programming_examples/ml/relu/aie2.py
@@ -32,7 +32,7 @@ def my_relu(trace_size):
     tiles = N_div_n // n_cores
     buffer_depth = 2
 
-    @device(AIEDevice.ipu)
+    @device(AIEDevice.npu)
     def device_body():
         memRef_ty = T.memref(n, T.bf16())
 
@@ -118,13 +118,13 @@ def sequence(A, C):
                     size=trace_size,
                     offset=N_in_bytes,
                 )
-            ipu_dma_memcpy_nd(
+            npu_dma_memcpy_nd(
                 metadata="outC", bd_id=0, mem=C, sizes=[1, 1, 1, C_sz_in_i32s]
             )
-            ipu_dma_memcpy_nd(
+            npu_dma_memcpy_nd(
                 metadata="inA", bd_id=1, mem=A, sizes=[1, 1, 1, A_sz_in_i32s]
             )
-            ipu_sync(column=0, row=0, direction=0, channel=0)
+            npu_sync(column=0, row=0, direction=0, channel=0)
 
 
 try:
diff --git a/programming_examples/ml/relu/run.lit b/programming_examples/ml/relu/run.lit
index 16c48f2aeb..7a13ec7850 100644
--- a/programming_examples/ml/relu/run.lit
+++ b/programming_examples/ml/relu/run.lit
@@ -5,7 +5,7 @@
 //
 // RUN: xchesscc_wrapper aie2 -I %aietools/include -c %S/../../../aie_kernels/aie2/relu.cc -o relu.o
 // RUN: %python %S/aie2.py | aie-opt -cse -canonicalize -o ./aie.mlir
-// RUN: %python aiecc.py --xbridge --aie-generate-cdo --aie-generate-ipu --no-compile-host --xclbin-name=aie.xclbin --ipu-insts-name=insts.txt ./aie.mlir
+// RUN: %python aiecc.py --xbridge --aie-generate-cdo --aie-generate-npu --no-compile-host --xclbin-name=aie.xclbin --npu-insts-name=insts.txt ./aie.mlir
 // RUN: g++-13 %S/test.cpp -o test.exe -std=c++23 -Wall -I%S/../../../runtime_lib/test_lib %S/../../../runtime_lib/test_lib/test_utils.cpp %xrt_flags -lrt -lstdc++ -lboost_program_options -lboost_filesystem
-// RUN: %run_on_ipu ./test.exe -x aie.xclbin -k MLIR_AIE -i insts.txt | FileCheck %s
+// RUN: %run_on_npu ./test.exe -x aie.xclbin -k MLIR_AIE -i insts.txt | FileCheck %s
 // CHECK: PASS!
diff --git a/programming_examples/ml/resnet/layers_conv2_x/CMakeLists.txt b/programming_examples/ml/resnet/layers_conv2_x/CMakeLists.txt
index 4b897cb29c..c7db0e9c5c 100755
--- a/programming_examples/ml/resnet/layers_conv2_x/CMakeLists.txt
+++ b/programming_examples/ml/resnet/layers_conv2_x/CMakeLists.txt
@@ -25,7 +25,7 @@ else()
     set(BOOST_ROOT C:/Technical/thirdParty/boost_1_83_0 CACHE STRING "Path to Boost install")
     set(OpenCV_DIR C:/Technical/thirdParty/opencv/build CACHE STRING "Path to OpenCV install")
     set(XRT_INC_DIR C:/Technical/XRT/src/runtime_src/core/include CACHE STRING "Path to XRT cloned repo")
-    set(XRT_LIB_DIR C:/Technical/xrtIPUfromDLL CACHE STRING "Path to xrt_coreutil.lib")
+    set(XRT_LIB_DIR C:/Technical/xrtNPUfromDLL CACHE STRING "Path to xrt_coreutil.lib")
 endif ()
 
 set(EDGEDETECT_WIDTH 1920 CACHE STRING "image width")
diff --git a/programming_examples/ml/resnet/layers_conv2_x/Makefile b/programming_examples/ml/resnet/layers_conv2_x/Makefile
index 6218e61fb5..c1551eb962 100755
--- a/programming_examples/ml/resnet/layers_conv2_x/Makefile
+++ b/programming_examples/ml/resnet/layers_conv2_x/Makefile
@@ -17,7 +17,7 @@ build/${mlirFileName}.mlir: aie2.py
 	python3 $< > $@
 
 insts.txt: build/${mlirFileName}.mlir
-	aiecc.py -v --aie-only-generate-ipu --ipu-insts-name=$@ $<
+	aiecc.py -v --aie-only-generate-npu --npu-insts-name=$@ $<
 
 build/conv2dk1_i8.o: ../../../../aie_kernels/aie2/conv2dk1.cc
 	xchesscc -d ${CHESSCC2_FLAGS} -DINT8_ACT -c $< -o $@
@@ -35,8 +35,8 @@ build/conv2dk1_skip.o: ../../../../aie_kernels/aie2/conv2dk1_skip.cc
 	xchesscc -d ${CHESSCC2_FLAGS} -DUINT8_ACT -c $< -o $@
 
 build/final.xclbin: build/${mlirFileName}.mlir 
-	cd build && aiecc.py -v --aie-generate-cdo --aie-generate-ipu --no-compile-host \
-		--xclbin-name=${@F} --ipu-insts-name=insts.txt $(<:%=../%)
+	cd build && aiecc.py -v --aie-generate-cdo --aie-generate-npu --no-compile-host \
+		--xclbin-name=${@F} --npu-insts-name=insts.txt $(<:%=../%)
 
 clean:
 	rm -rf build *.elf* *.lst *.bif log* ${mlirFileName}.mlir.prj *.xclbin sim \
diff --git a/programming_examples/ml/resnet/layers_conv2_x/aie.mlir b/programming_examples/ml/resnet/layers_conv2_x/aie.mlir
index ccc04efb9a..103cbbbcbe 100755
--- a/programming_examples/ml/resnet/layers_conv2_x/aie.mlir
+++ b/programming_examples/ml/resnet/layers_conv2_x/aie.mlir
@@ -6,7 +6,7 @@
 //===----------------------------------------------------------------------===//
 
 module {
-aie.device(ipu) {
+aie.device(npu) {
 
   //shim
   %tile00 = aie.tile(0, 0)
@@ -909,9 +909,9 @@ aie.device(ipu) {
 
 
       // Trace_Event0  (4 slots)
-      aiex.ipu.write32 { column = 2 : i32, row = 4 : i32, address = 0x340E0 : ui32, value = 0x4B222125 : ui32 }
+      aiex.npu.write32 { column = 2 : i32, row = 4 : i32, address = 0x340E0 : ui32, value = 0x4B222125 : ui32 }
       // Trace_Event1  (4 slots)
-      aiex.ipu.write32 { column = 2 : i32, row = 4 : i32, address = 0x340E4 : ui32, value = 0x2D2C1A4F : ui32 }
+      aiex.npu.write32 { column = 2 : i32, row = 4 : i32, address = 0x340E4 : ui32, value = 0x2D2C1A4F : ui32 }
 
       // Event slots as configured above:
       // 0: Kernel executes vector instruction
@@ -925,13 +925,13 @@ aie.device(ipu) {
 
       // Stream_Switch_Event_Port_Selection_0
       // This is necessary to capture the Port_Running_0 and Port_Running_1 events
-      aiex.ipu.write32 { column = 2 : i32, row = 4 : i32, address = 0x3FF00 : ui32, value = 0x121 : ui32 }
+      aiex.npu.write32 { column = 2 : i32, row = 4 : i32, address = 0x3FF00 : ui32, value = 0x121 : ui32 }
 
       // Trace_Control0: Define trace start and stop triggers. Set start event TRUE.
-      aiex.ipu.write32 { column = 2 : i32, row = 4 : i32, address = 0x340D0 : ui32, value = 0x10000 : ui32 }
+      aiex.npu.write32 { column = 2 : i32, row = 4 : i32, address = 0x340D0 : ui32, value = 0x10000 : ui32 }
 
       // Start trace copy out.
-      aiex.ipu.writebd_shimtile { bd_id = 3 : i32,
+      aiex.npu.writebd_shimtile { bd_id = 3 : i32,
                                   buffer_length = 16384 : i32,
                                   buffer_offset = 262144 : i32,
                                   enable_packet = 0 : i32,
@@ -965,30 +965,30 @@ aie.device(ipu) {
                                   next_bd = 0 : i32,
                                   use_next_bd = 0 : i32,
                                   valid_bd = 1 : i32}
-      aiex.ipu.write32 { column = 0 : i32, row = 0 : i32, address = 0x1D20C : ui32, value = 0x3 : ui32 }
+      aiex.npu.write32 { column = 0 : i32, row = 0 : i32, address = 0x1D20C : ui32, value = 0x3 : ui32 }
 
     //End trace dump
 
       
 
-      aiex.ipu.rtp_write(0, 2, 0,  1) { buffer_sym_name = "rtp2" }  
-      aiex.ipu.rtp_write(0, 3, 0,  1) { buffer_sym_name = "rtp3" } 
-      aiex.ipu.rtp_write(0, 5, 0,  1) { buffer_sym_name = "rtp4" }  
-      aiex.ipu.rtp_write(0, 4, 0,  1)  { buffer_sym_name = "rtp5" }  
-      aiex.ipu.rtp_write(0, 4, 1,  0)  { buffer_sym_name = "rtp5" }  
-      aiex.ipu.rtp_write(0, 4, 2,  1)  { buffer_sym_name = "rtp5" }  
+      aiex.npu.rtp_write(0, 2, 0,  1) { buffer_sym_name = "rtp2" }  
+      aiex.npu.rtp_write(0, 3, 0,  1) { buffer_sym_name = "rtp3" } 
+      aiex.npu.rtp_write(0, 5, 0,  1) { buffer_sym_name = "rtp4" }  
+      aiex.npu.rtp_write(0, 4, 0,  1)  { buffer_sym_name = "rtp5" }  
+      aiex.npu.rtp_write(0, 4, 1,  0)  { buffer_sym_name = "rtp5" }  
+      aiex.npu.rtp_write(0, 4, 2,  1)  { buffer_sym_name = "rtp5" }  
 
-      aiex.ipu.rtp_write(1, 5, 0,  1) { buffer_sym_name = "rtp15" }  
-      aiex.ipu.rtp_write(1, 4, 0,  1) { buffer_sym_name = "rtp14" }  
-      aiex.ipu.rtp_write(1, 2, 0,  1) { buffer_sym_name = "rtp12" }  
-      aiex.ipu.rtp_write(1, 3, 0,  1)  { buffer_sym_name = "rtp13" }  
-      aiex.ipu.rtp_write(1, 3, 1,  0)  { buffer_sym_name = "rtp13" }  
+      aiex.npu.rtp_write(1, 5, 0,  1) { buffer_sym_name = "rtp15" }  
+      aiex.npu.rtp_write(1, 4, 0,  1) { buffer_sym_name = "rtp14" }  
+      aiex.npu.rtp_write(1, 2, 0,  1) { buffer_sym_name = "rtp12" }  
+      aiex.npu.rtp_write(1, 3, 0,  1)  { buffer_sym_name = "rtp13" }  
+      aiex.npu.rtp_write(1, 3, 1,  0)  { buffer_sym_name = "rtp13" }  
 
-      aiex.ipu.rtp_write(2, 2, 0,  1) { buffer_sym_name = "rtp22" }  
-      aiex.ipu.rtp_write(2, 3, 0,  1) { buffer_sym_name = "rtp23" }  
-      aiex.ipu.rtp_write(2, 5, 0,  1) { buffer_sym_name = "rtp25" }  
-      aiex.ipu.rtp_write(2, 4, 0,  1)  { buffer_sym_name = "rtp24" }  
-      aiex.ipu.rtp_write(2, 4, 1,  0)  { buffer_sym_name = "rtp24" } 
+      aiex.npu.rtp_write(2, 2, 0,  1) { buffer_sym_name = "rtp22" }  
+      aiex.npu.rtp_write(2, 3, 0,  1) { buffer_sym_name = "rtp23" }  
+      aiex.npu.rtp_write(2, 5, 0,  1) { buffer_sym_name = "rtp25" }  
+      aiex.npu.rtp_write(2, 4, 0,  1)  { buffer_sym_name = "rtp24" }  
+      aiex.npu.rtp_write(2, 4, 1,  0)  { buffer_sym_name = "rtp24" } 
 
       %c0 = arith.constant 0 : i32
       %c1 = arith.constant 1 : i32
@@ -1000,13 +1000,13 @@ aie.device(ipu) {
       %total_wts_3_off = arith.constant  35840 : i64 
 
       //dma_memcpy_nd ([offset in 32b words][length in 32b words][stride in 32b words])
-      aiex.ipu.dma_memcpy_nd(0, 0, %in0[0, 0, 0, 0][1, 1, 1, %act_in][0, 0, 0]) {id = 0 : i64, metadata = @inOF_act_L3L2} : memref<16384xi32>
-      aiex.ipu.dma_memcpy_nd(0, 0, %out[0, 0, 0, 0][1, 1, 1, %act_out][0, 0, 0]) {id = 2 : i64, metadata = @outOFL2L3} : memref<65536xi32>
-      aiex.ipu.dma_memcpy_nd(0, 0, %wts0[0, 0, 0, 0][1, 1, 1, %total_wts][0, 0, 0]) {id = 1 : i64, metadata = @inOF_wts_0_L3L2} : memref<53248xi32>
-      aiex.ipu.dma_memcpy_nd(0, 0, %wts0[0, 0, 0, %total_wts][1, 1, 1, %total_wts_2][0, 0, 0]) {id = 1 : i64, metadata = @inOF_wts_1_L3L2} : memref<53248xi32>
-      aiex.ipu.dma_memcpy_nd(0, 0, %wts0[0, 0, 0, %total_wts_3_off][1, 1, 1, %total_wts_3][0, 0, 0]) {id = 1 : i64, metadata = @inOF_wts_2_L3L2} : memref<53248xi32>
+      aiex.npu.dma_memcpy_nd(0, 0, %in0[0, 0, 0, 0][1, 1, 1, %act_in][0, 0, 0]) {id = 0 : i64, metadata = @inOF_act_L3L2} : memref<16384xi32>
+      aiex.npu.dma_memcpy_nd(0, 0, %out[0, 0, 0, 0][1, 1, 1, %act_out][0, 0, 0]) {id = 2 : i64, metadata = @outOFL2L3} : memref<65536xi32>
+      aiex.npu.dma_memcpy_nd(0, 0, %wts0[0, 0, 0, 0][1, 1, 1, %total_wts][0, 0, 0]) {id = 1 : i64, metadata = @inOF_wts_0_L3L2} : memref<53248xi32>
+      aiex.npu.dma_memcpy_nd(0, 0, %wts0[0, 0, 0, %total_wts][1, 1, 1, %total_wts_2][0, 0, 0]) {id = 1 : i64, metadata = @inOF_wts_1_L3L2} : memref<53248xi32>
+      aiex.npu.dma_memcpy_nd(0, 0, %wts0[0, 0, 0, %total_wts_3_off][1, 1, 1, %total_wts_3][0, 0, 0]) {id = 1 : i64, metadata = @inOF_wts_2_L3L2} : memref<53248xi32>
 
-      aiex.ipu.sync {channel = 0 : i32, column = 1 : i32, column_num = 1 : i32, direction = 0 : i32, row = 0 : i32, row_num = 1 : i32}
+      aiex.npu.sync {channel = 0 : i32, column = 1 : i32, column_num = 1 : i32, direction = 0 : i32, row = 0 : i32, row_num = 1 : i32}
       return
     }
 
diff --git a/programming_examples/ml/resnet/layers_conv2_x/aie2.py b/programming_examples/ml/resnet/layers_conv2_x/aie2.py
index 46fa10030f..ea14ca60e2 100755
--- a/programming_examples/ml/resnet/layers_conv2_x/aie2.py
+++ b/programming_examples/ml/resnet/layers_conv2_x/aie2.py
@@ -34,7 +34,7 @@ def resnet_conv_x():
 
     with mlir_mod_ctx() as ctx:
 
-        @device(AIEDevice.ipu)
+        @device(AIEDevice.npu)
         def deviceBody():
 
             # define types
@@ -836,52 +836,52 @@ def sequence(inputFromL3, weightsFromL3, outputToL3):
 
                 for c, col in enumerate(rtp_name):
                     for r, row in enumerate(col):
-                        IpuWriteRTPOp(row, col=c, row=r + 2, index=0, value=1)  # scale
+                        NpuWriteRTPOp(row, col=c, row=r + 2, index=0, value=1)  # scale
 
-                IpuWriteRTPOp("rtpComputeTile04", col=0, row=4, index=0, value=0)
-                IpuWriteRTPOp("rtpComputeTile04", col=0, row=4, index=0, value=1)
+                NpuWriteRTPOp("rtpComputeTile04", col=0, row=4, index=0, value=0)
+                NpuWriteRTPOp("rtpComputeTile04", col=0, row=4, index=0, value=1)
 
-                IpuWriteRTPOp("rtpComputeTile13", col=1, row=3, index=0, value=0)
+                NpuWriteRTPOp("rtpComputeTile13", col=1, row=3, index=0, value=0)
 
-                IpuWriteRTPOp("rtpComputeTile24", col=2, row=4, index=0, value=0)
+                NpuWriteRTPOp("rtpComputeTile24", col=2, row=4, index=0, value=0)
 
                 # #     # write RTP parameters
-                # IpuWriteRTPOp(
+                # NpuWriteRTPOp(
                 #     "rtpComputeTile02", col=0, row=2, index=0, value=1
                 # )  # scale
-                # IpuWriteRTPOp(
+                # NpuWriteRTPOp(
                 #     "rtpComputeTile03", col=0, row=3, index=0, value=1
                 # )  # scale
-                # IpuWriteRTPOp(
+                # NpuWriteRTPOp(
                 #     "rtpComputeTile05", col=0, row=5, index=0, value=1
                 # )  # scale
-                # IpuWriteRTPOp(
+                # NpuWriteRTPOp(
                 #     "rtpComputeTile04", col=0, row=4, index=0, value=1
                 # )  # scale: conv1x1 with the same scale as the input so we match the scaling factor of output after conv1x1 and the initial input
-                # IpuWriteRTPOp(
+                # NpuWriteRTPOp(
                 #     "rtpComputeTile04", col=0, row=4, index=1, value=0
                 # )  # skip_scale
 
-                ipu_dma_memcpy_nd(
+                npu_dma_memcpy_nd(
                     metadata="act1_00_02_01",
                     bd_id=0,
                     mem=inputFromL3,
                     sizes=[1, 1, 1, activationsInSize32b],
                 )
-                ipu_dma_memcpy_nd(
+                npu_dma_memcpy_nd(
                     metadata="outOFL2L3",
                     bd_id=2,
                     mem=outputToL3,
                     sizes=[1, 1, 1, acitivationsOutSize32b],
                 )
-                ipu_dma_memcpy_nd(
+                npu_dma_memcpy_nd(
                     metadata="wts_0_L3L2",
                     bd_id=1,
                     mem=weightsFromL3,
                     sizes=[1, 1, 1, totalWeightsSize32b_init],
                 )
 
-                ipu_dma_memcpy_nd(
+                npu_dma_memcpy_nd(
                     metadata="wts_1_L3L2",
                     bd_id=1,
                     mem=weightsFromL3,
@@ -889,7 +889,7 @@ def sequence(inputFromL3, weightsFromL3, outputToL3):
                     sizes=[1, 1, 1, totalWeightsSize32b_rest],
                 )
 
-                ipu_dma_memcpy_nd(
+                npu_dma_memcpy_nd(
                     metadata="wts_2_L3L2",
                     bd_id=1,
                     mem=weightsFromL3,
@@ -902,7 +902,7 @@ def sequence(inputFromL3, weightsFromL3, outputToL3):
                     sizes=[1, 1, 1, totalWeightsSize32b_rest],
                 )
 
-                ipu_sync(column=1, row=0, direction=0, channel=0)
+                npu_sync(column=1, row=0, direction=0, channel=0)
 
     res = ctx.module.operation.verify()
     if res == True:
diff --git a/programming_examples/ml/resnet/layers_conv2_x/run.lit b/programming_examples/ml/resnet/layers_conv2_x/run.lit
index 6496daafe7..394c68f9ca 100755
--- a/programming_examples/ml/resnet/layers_conv2_x/run.lit
+++ b/programming_examples/ml/resnet/layers_conv2_x/run.lit
@@ -9,6 +9,6 @@
 // RUN: xchesscc_wrapper aie2 -I %aietools/include -DBIT_WIDTH=8 -DSCALAR -DUINT8_ACT -c %S/../../../../aie_kernels/aie2/conv2dk1.cc -o conv2dk1_ui8.o
 // RUN: xchesscc_wrapper aie2 -I %aietools/include -DBIT_WIDTH=8 -DSCALAR -DUINT8_ACT -c %S/../../../../aie_kernels/aie2/conv2dk1_skip.cc -o conv2dk1_skip.o
 // RUN: %python %S/aie2.py | aie-opt -cse -canonicalize -o ./aie.mlir
-// RUN: %python aiecc.py --xbridge --aie-generate-cdo --aie-generate-ipu --no-compile-host --xclbin-name=aie.xclbin --ipu-insts-name=insts.txt ./aie.mlir
-// RUN: %run_on_ipu %python %S/test.py -x aie.xclbin -i insts.txt -k MLIR_AIE | FileCheck %s
+// RUN: %python aiecc.py --xbridge --aie-generate-cdo --aie-generate-npu --no-compile-host --xclbin-name=aie.xclbin --npu-insts-name=insts.txt ./aie.mlir
+// RUN: %run_on_npu %python %S/test.py -x aie.xclbin -i insts.txt -k MLIR_AIE | FileCheck %s
 // CHECK: PASS!
diff --git a/programming_examples/ml/softmax/CMakeLists.txt b/programming_examples/ml/softmax/CMakeLists.txt
index c4ca0825d4..20f5d8a4a3 100644
--- a/programming_examples/ml/softmax/CMakeLists.txt
+++ b/programming_examples/ml/softmax/CMakeLists.txt
@@ -27,7 +27,7 @@ if (NOT WSL)
 else()
     set(BOOST_ROOT C:/Technical/thirdParty/boost_1_83_0 CACHE STRING "Path to Boost install")
     set(XRT_INC_DIR C:/Technical/XRT/src/runtime_src/core/include CACHE STRING "Path to XRT cloned repo")
-    set(XRT_LIB_DIR C:/Technical/xrtIPUfromDLL CACHE STRING "Path to xrt_coreutil.lib")
+    set(XRT_LIB_DIR C:/Technical/xrtNPUfromDLL CACHE STRING "Path to xrt_coreutil.lib")
 endif()
 
 set(TARGET_NAME test CACHE STRING "Target to be built")
diff --git a/programming_examples/ml/softmax/Makefile b/programming_examples/ml/softmax/Makefile
index 3a0a4dcc44..87760b77d1 100755
--- a/programming_examples/ml/softmax/Makefile
+++ b/programming_examples/ml/softmax/Makefile
@@ -43,12 +43,12 @@ build/aie_trace.mlir: aie2.py
 build/final.xclbin: build/aie.mlir build/kernels.a
 	mkdir -p ${@D}
 	cd ${@D} && aiecc.py --aie-generate-cdo --no-compile-host --xclbin-name=${@F} \
-				--aie-generate-ipu --ipu-insts-name=insts.txt $(<:%=../%)
+				--aie-generate-npu --npu-insts-name=insts.txt $(<:%=../%)
 
 build/final_trace.xclbin: build/aie_trace.mlir build/kernels.a
 	mkdir -p ${@D}
 	cd ${@D} && aiecc.py --aie-generate-cdo --no-compile-host --xclbin-name=${@F} \
-				--aie-generate-ipu --ipu-insts-name=insts.txt $(<:%=../%)
+				--aie-generate-npu --npu-insts-name=insts.txt $(<:%=../%)
 
 
 ${targetname}.exe: test.cpp
diff --git a/programming_examples/ml/softmax/aie2.py b/programming_examples/ml/softmax/aie2.py
index 70873eebb6..3457191b24 100755
--- a/programming_examples/ml/softmax/aie2.py
+++ b/programming_examples/ml/softmax/aie2.py
@@ -33,7 +33,7 @@ def vector_softmax(trace_size):
     tiles = N_div_n // n_cores
     buffer_depth = 2
 
-    @device(AIEDevice.ipu)
+    @device(AIEDevice.npu)
     def device_body():
         memRef_ty = T.memref(n, T.bf16())
 
@@ -122,13 +122,13 @@ def sequence(A, C):
                     offset=N_in_bytes,
                 )
 
-            ipu_dma_memcpy_nd(
+            npu_dma_memcpy_nd(
                 metadata="outC", bd_id=0, mem=C, sizes=[1, 1, 1, C_sz_in_i32s]
             )
-            ipu_dma_memcpy_nd(
+            npu_dma_memcpy_nd(
                 metadata="inA", bd_id=1, mem=A, sizes=[1, 1, 1, A_sz_in_i32s]
             )
-            ipu_sync(column=0, row=0, direction=0, channel=0)
+            npu_sync(column=0, row=0, direction=0, channel=0)
 
 
 try:
diff --git a/programming_examples/ml/softmax/run.lit b/programming_examples/ml/softmax/run.lit
index 54c7ccff98..42441e898a 100644
--- a/programming_examples/ml/softmax/run.lit
+++ b/programming_examples/ml/softmax/run.lit
@@ -9,7 +9,7 @@
 // RUN: xchesscc_wrapper aie2 -I %aietools/include -c %S/softmax.cc -o softmax.o
 // RUN: ar rvs kernels.a dut.o lut_based_ops.o softmax.o
 // RUN: %python %S/aie2.py | aie-opt -cse -canonicalize -o ./aie.mlir
-// RUN: %python aiecc.py --xbridge --aie-generate-cdo --aie-generate-ipu --no-compile-host --xclbin-name=aie.xclbin --ipu-insts-name=insts.txt ./aie.mlir
+// RUN: %python aiecc.py --xbridge --aie-generate-cdo --aie-generate-npu --no-compile-host --xclbin-name=aie.xclbin --npu-insts-name=insts.txt ./aie.mlir
 // RUN: g++-13 %S/test.cpp -o test.exe -std=c++23 -Wall -I%S/../../../runtime_lib/test_lib %S/../../../runtime_lib/test_lib/test_utils.cpp %xrt_flags -lrt -lstdc++ -lboost_program_options -lboost_filesystem
-// RUN: %run_on_ipu ./test.exe -x aie.xclbin -k MLIR_AIE -i insts.txt | FileCheck %s
+// RUN: %run_on_npu ./test.exe -x aie.xclbin -k MLIR_AIE -i insts.txt | FileCheck %s
 // CHECK: PASS!
diff --git a/programming_examples/ml/weight_expand/CMakeLists.txt b/programming_examples/ml/weight_expand/CMakeLists.txt
index c4ca0825d4..20f5d8a4a3 100644
--- a/programming_examples/ml/weight_expand/CMakeLists.txt
+++ b/programming_examples/ml/weight_expand/CMakeLists.txt
@@ -27,7 +27,7 @@ if (NOT WSL)
 else()
     set(BOOST_ROOT C:/Technical/thirdParty/boost_1_83_0 CACHE STRING "Path to Boost install")
     set(XRT_INC_DIR C:/Technical/XRT/src/runtime_src/core/include CACHE STRING "Path to XRT cloned repo")
-    set(XRT_LIB_DIR C:/Technical/xrtIPUfromDLL CACHE STRING "Path to xrt_coreutil.lib")
+    set(XRT_LIB_DIR C:/Technical/xrtNPUfromDLL CACHE STRING "Path to xrt_coreutil.lib")
 endif()
 
 set(TARGET_NAME test CACHE STRING "Target to be built")
diff --git a/programming_examples/ml/weight_expand/Makefile b/programming_examples/ml/weight_expand/Makefile
index 641b4902b3..b4967596fb 100755
--- a/programming_examples/ml/weight_expand/Makefile
+++ b/programming_examples/ml/weight_expand/Makefile
@@ -23,7 +23,7 @@ build/aie.mlir: aie2.py
 build/final.xclbin: build/aie.mlir build/expand.o
 	mkdir -p ${@D}
 	cd ${@D} && aiecc.py --aie-generate-cdo --no-compile-host --xclbin-name=${@F} \
-				--aie-generate-ipu --ipu-insts-name=insts.txt $(<:%=../%)
+				--aie-generate-npu --npu-insts-name=insts.txt $(<:%=../%)
 
 ${targetname}.exe: test.cpp
 	rm -rf _build
diff --git a/programming_examples/ml/weight_expand/aie2.py b/programming_examples/ml/weight_expand/aie2.py
index 3ca1f7aee3..32fe95429f 100755
--- a/programming_examples/ml/weight_expand/aie2.py
+++ b/programming_examples/ml/weight_expand/aie2.py
@@ -45,7 +45,7 @@ def my_expand():
 
     with mlir_mod_ctx() as ctx:
 
-        @device(AIEDevice.ipu)
+        @device(AIEDevice.npu)
         def device_body():
             memRef_i_ty = T.memref(
                 input_buffer_size_bytes, T.i8()
@@ -91,13 +91,13 @@ def core_body():
             @FuncOp.from_py_func(tensor_ty, tensor_ty)
             def sequence(A, C):
 
-                ipu_dma_memcpy_nd(
+                npu_dma_memcpy_nd(
                     metadata="outB", bd_id=0, mem=C, sizes=[1, 1, 1, B_sz_in_i32s]
                 )
-                ipu_dma_memcpy_nd(
+                npu_dma_memcpy_nd(
                     metadata="inA", bd_id=1, mem=A, sizes=[1, 1, 1, A_sz_in_i32s]
                 )
-                ipu_sync(column=0, row=0, direction=0, channel=0)
+                npu_sync(column=0, row=0, direction=0, channel=0)
 
     print(ctx.module)
 
diff --git a/programming_examples/utils/README.md b/programming_examples/utils/README.md
index 9dc2731012..1d59d46e08 100644
--- a/programming_examples/utils/README.md
+++ b/programming_examples/utils/README.md
@@ -54,7 +54,7 @@ The parse script create a temporary directory `tmpTrace` performs the following
 We prepend `0x` before each hex line and save it `prep.<trace file>` since the `hwfrontend` utility expects it.
 
 ### <u>2. Parse MLIR to build event table</u>
-The MLIR parser is pretty rudimentary as it scans the source mlir file looking for `aiex.ipu.write32` calls and does a pattern match for trace unit config address and then grab the hex events, which it looks up from an internal table to provide waveform labels. It would be better to use an MLIR pass that already has the config information and cross reference it with a more official event-to-label lookup table instead. 
+The MLIR parser is pretty rudimentary as it scans the source mlir file looking for `aiex.npu.write32` calls and does a pattern match for the trace unit config address and then grabs the hex events, which it looks up from an internal table to provide waveform labels. It would be better to use an MLIR pass that already has the config information and cross reference it with a more official event-to-label lookup table instead. 
 
 ### <u>3. Create .target file</u>
 Create a dummy file (`.target`) in the `tmpTrace` with the file content 'hw' since `hwfrontend` utility expects it.
diff --git a/programming_examples/utils/parse_eventIR.py b/programming_examples/utils/parse_eventIR.py
index b7c989ca3c..b41ff9c74a 100755
--- a/programming_examples/utils/parse_eventIR.py
+++ b/programming_examples/utils/parse_eventIR.py
@@ -594,9 +594,9 @@ def parse_mlir_trace_events(lines):
 
     # TODO Need to check if this line is commented out, check for // ? (harder to check of /* */)
     # TODO Need to support value in hex with 0x or decimal
-    # pattern = r"AIEX.ipu.write32\s*\{\s*(\w+)\s*=\s*(\d+)\s*:\s*\w+\s*,\s*(\w+)\s*=\s*(\d+)\s*:\s*\w+\s*,\s*(\w+)\s*=\s*(\w+)\s*:\s*\w+\s*,\s*(\w+)\s*=\s*(\w+)\s*:\s*\w+\s*\}"
-    # pattern = r"AIEX.ipu.write32\s*\{\s*(\w+)\s*=\s*(0x)?(\w+)\s*:\s*\w+\s*,\s*(\w+)\s*=\s*(0x)?(\w+)\s*:\s*\w+\s*,\s*(\w+)\s*=\s*(0x)?(\w+)\s*:\s*\w+\s*,\s*(\w+)\s*=\s*(0x)?(\w+)\s*:\s*\w+\s*\}"
-    pattern = r"aiex.ipu.write32\s*\{\s*(\w+)\s*=\s*(0x)?(\w+)\s*:\s*\w+\s*,\s*(\w+)\s*=\s*(0x)?(\w+)\s*:\s*\w+\s*,\s*(\w+)\s*=\s*(0x)?(\w+)\s*:\s*\w+\s*,\s*(\w+)\s*=\s*(0x)?(\w+)\s*:\s*\w+\s*\}"
+    # pattern = r"AIEX.npu.write32\s*\{\s*(\w+)\s*=\s*(\d+)\s*:\s*\w+\s*,\s*(\w+)\s*=\s*(\d+)\s*:\s*\w+\s*,\s*(\w+)\s*=\s*(\w+)\s*:\s*\w+\s*,\s*(\w+)\s*=\s*(\w+)\s*:\s*\w+\s*\}"
+    # pattern = r"AIEX.npu.write32\s*\{\s*(\w+)\s*=\s*(0x)?(\w+)\s*:\s*\w+\s*,\s*(\w+)\s*=\s*(0x)?(\w+)\s*:\s*\w+\s*,\s*(\w+)\s*=\s*(0x)?(\w+)\s*:\s*\w+\s*,\s*(\w+)\s*=\s*(0x)?(\w+)\s*:\s*\w+\s*\}"
+    pattern = r"aiex.npu.write32\s*\{\s*(\w+)\s*=\s*(0x)?(\w+)\s*:\s*\w+\s*,\s*(\w+)\s*=\s*(0x)?(\w+)\s*:\s*\w+\s*,\s*(\w+)\s*=\s*(0x)?(\w+)\s*:\s*\w+\s*,\s*(\w+)\s*=\s*(0x)?(\w+)\s*:\s*\w+\s*\}"
 
     pid_events = list()
     for t in range(NumTraceTypes):
diff --git a/programming_examples/utils/parse_trace.py b/programming_examples/utils/parse_trace.py
index ed45353f31..9d2cd144a6 100755
--- a/programming_examples/utils/parse_trace.py
+++ b/programming_examples/utils/parse_trace.py
@@ -582,9 +582,9 @@ def parse_mlir_trace_events(lines):
 
     # TODO Need to check if this line is commented out, check for // ? (harder to check of /* */)
     # TODO Need to support value in hex with 0x or decimal
-    # pattern = r"AIEX.ipu.write32\s*\{\s*(\w+)\s*=\s*(\d+)\s*:\s*\w+\s*,\s*(\w+)\s*=\s*(\d+)\s*:\s*\w+\s*,\s*(\w+)\s*=\s*(\w+)\s*:\s*\w+\s*,\s*(\w+)\s*=\s*(\w+)\s*:\s*\w+\s*\}"
-    # pattern = r"AIEX.ipu.write32\s*\{\s*(\w+)\s*=\s*(0x)?(\w+)\s*:\s*\w+\s*,\s*(\w+)\s*=\s*(0x)?(\w+)\s*:\s*\w+\s*,\s*(\w+)\s*=\s*(0x)?(\w+)\s*:\s*\w+\s*,\s*(\w+)\s*=\s*(0x)?(\w+)\s*:\s*\w+\s*\}"
-    pattern = r"aiex.ipu.write32\s*\{\s*(\w+)\s*=\s*(0x)?(\w+)\s*:\s*\w+\s*,\s*(\w+)\s*=\s*(0x)?(\w+)\s*:\s*\w+\s*,\s*(\w+)\s*=\s*(0x)?(\w+)\s*:\s*\w+\s*,\s*(\w+)\s*=\s*(0x)?(\w+)\s*:\s*\w+\s*\}"
+    # pattern = r"AIEX.npu.write32\s*\{\s*(\w+)\s*=\s*(\d+)\s*:\s*\w+\s*,\s*(\w+)\s*=\s*(\d+)\s*:\s*\w+\s*,\s*(\w+)\s*=\s*(\w+)\s*:\s*\w+\s*,\s*(\w+)\s*=\s*(\w+)\s*:\s*\w+\s*\}"
+    # pattern = r"AIEX.npu.write32\s*\{\s*(\w+)\s*=\s*(0x)?(\w+)\s*:\s*\w+\s*,\s*(\w+)\s*=\s*(0x)?(\w+)\s*:\s*\w+\s*,\s*(\w+)\s*=\s*(0x)?(\w+)\s*:\s*\w+\s*,\s*(\w+)\s*=\s*(0x)?(\w+)\s*:\s*\w+\s*\}"
+    pattern = r"aiex.npu.write32\s*\{\s*(\w+)\s*=\s*(0x)?(\w+)\s*:\s*\w+\s*,\s*(\w+)\s*=\s*(0x)?(\w+)\s*:\s*\w+\s*,\s*(\w+)\s*=\s*(0x)?(\w+)\s*:\s*\w+\s*,\s*(\w+)\s*=\s*(0x)?(\w+)\s*:\s*\w+\s*\}"
 
     pid_events = list()
     for t in range(NumTraceTypes):
diff --git a/programming_examples/vision/color_detect/CMakeLists.txt b/programming_examples/vision/color_detect/CMakeLists.txt
index d850efcad5..f743789b61 100644
--- a/programming_examples/vision/color_detect/CMakeLists.txt
+++ b/programming_examples/vision/color_detect/CMakeLists.txt
@@ -25,7 +25,7 @@ else()
     set(BOOST_ROOT C:/Technical/thirdParty/boost_1_83_0 CACHE STRING "Path to Boost install")
     set(OpenCV_DIR C:/Technical/thirdParty/opencv/build CACHE STRING "Path to OpenCV install")
     set(XRT_INC_DIR C:/Technical/XRT/src/runtime_src/core/include CACHE STRING "Path to XRT cloned repo")
-    set(XRT_LIB_DIR C:/Technical/xrtIPUfromDLL CACHE STRING "Path to xrt_coreutil.lib")
+    set(XRT_LIB_DIR C:/Technical/xrtNPUfromDLL CACHE STRING "Path to xrt_coreutil.lib")
 endif ()
 
 set(COLORDETECT_WIDTH 1920 CACHE STRING "image width")
diff --git a/programming_examples/vision/color_detect/Makefile b/programming_examples/vision/color_detect/Makefile
index c8feea4cb6..ffb8ca55d1 100755
--- a/programming_examples/vision/color_detect/Makefile
+++ b/programming_examples/vision/color_detect/Makefile
@@ -39,8 +39,8 @@ build/aie2_lineBased_8b_${COLORDETECT_WIDTH}.mlir: aie2_colorDetect.py
 
 build/final_${COLORDETECT_WIDTH}.xclbin: build/aie2_lineBased_8b_${COLORDETECT_WIDTH}.mlir build/rgba2hue.cc.o build/threshold.cc.o build/combined_bitwiseOR_gray2rgba_bitwiseAND.a
 	mkdir -p ${@D}
-	cd ${@D} && aiecc.py --aie-generate-cdo --aie-generate-ipu --no-compile-host \
-		--xclbin-name=${@F} --ipu-insts-name=insts.txt $(<:%=../%)
+	cd ${@D} && aiecc.py --aie-generate-cdo --aie-generate-npu --no-compile-host \
+		--xclbin-name=${@F} --npu-insts-name=insts.txt $(<:%=../%)
 
 build/${targetname}.exe: test.cpp
 	mkdir -p ${@D}
diff --git a/programming_examples/vision/color_detect/README.md b/programming_examples/vision/color_detect/README.md
index 33d41a2339..f2f24dbea6 100644
--- a/programming_examples/vision/color_detect/README.md
+++ b/programming_examples/vision/color_detect/README.md
@@ -12,7 +12,7 @@
 
 The Color Detect pipeline design consists of the following blocks arranged in a pipeline fashion for the detecting of 2 colors in a sequence of images : `rgba2hue`, `threshold`, `threshold`, `bitwiseOR`, `gray2rgba`, `bitwiseAND`.
 
-The pipeline is mapped onto a single column of the ipu device, with one Shim tile (0, 0), one Mem tile (0, 1) and four AIE compute tiles (0, 2) through (0, 5). As shown in the image below, the `rgba2hue`, and the two `threshold` kernels are each mapped onto one compute tile, while `bitwiseOR`, `gray2rgba` and `bitwiseAND` are mapped together on AIE tile (0, 5). 
+The pipeline is mapped onto a single column of the npu device, with one Shim tile (0, 0), one Mem tile (0, 1) and four AIE compute tiles (0, 2) through (0, 5). As shown in the image below, the `rgba2hue`, and the two `threshold` kernels are each mapped onto one compute tile, while `bitwiseOR`, `gray2rgba` and `bitwiseAND` are mapped together on AIE tile (0, 5). 
 
 <p align="center">
   <img
diff --git a/programming_examples/vision/color_detect/aie2_colorDetect.py b/programming_examples/vision/color_detect/aie2_colorDetect.py
index f7410a4d12..6675b6fda4 100644
--- a/programming_examples/vision/color_detect/aie2_colorDetect.py
+++ b/programming_examples/vision/color_detect/aie2_colorDetect.py
@@ -32,7 +32,7 @@
 def color_detect():
     with mlir_mod_ctx() as ctx:
 
-        @device(AIEDevice.ipu)
+        @device(AIEDevice.npu)
         def deviceBody():
             line_bytes_ty = MemRefType.get((lineWidthInBytes,), T.ui8())
             line_ty = MemRefType.get((lineWidth,), T.ui8())
@@ -254,19 +254,19 @@ def coreBody():
 
             @FuncOp.from_py_func(tensor_ty, memRef_16x16_ty, tensor_ty)
             def sequence(I, B, O):
-                ipu_dma_memcpy_nd(
+                npu_dma_memcpy_nd(
                     metadata="inOF_L3L2",
                     bd_id=1,
                     mem=I,
                     sizes=[1, 1, 1, height * lineWidthInInt32s],
                 )
-                ipu_dma_memcpy_nd(
+                npu_dma_memcpy_nd(
                     metadata="outOF_L2L3",
                     bd_id=0,
                     mem=O,
                     sizes=[1, 1, 1, height * lineWidthInInt32s],
                 )
-                ipu_sync(column=0, row=0, direction=0, channel=0)
+                npu_sync(column=0, row=0, direction=0, channel=0)
 
     print(ctx.module)
 
diff --git a/programming_examples/vision/color_detect/run.lit b/programming_examples/vision/color_detect/run.lit
index 766ddab92e..20b80f50fc 100644
--- a/programming_examples/vision/color_detect/run.lit
+++ b/programming_examples/vision/color_detect/run.lit
@@ -10,7 +10,7 @@
 // RUN: xchesscc_wrapper aie2 -I %aietools/include -DBIT_WIDTH=8 -c %S/../../../aie_kernels/aie2/bitwiseAND.cc -o ./bitwiseAND.cc.o
 // RUN: ar rvs combined_bitwiseOR_gray2rgba_bitwiseAND.a bitwiseOR.cc.o gray2rgba.cc.o bitwiseAND.cc.o
 // RUN: %python %S/aie2_colorDetect.py 1920 1080 > ./aie.mlir
-// RUN: %python aiecc.py --xbridge --aie-generate-cdo --aie-generate-ipu --no-compile-host --xclbin-name=aie.xclbin --ipu-insts-name=insts.txt ./aie.mlir
+// RUN: %python aiecc.py --xbridge --aie-generate-cdo --aie-generate-npu --no-compile-host --xclbin-name=aie.xclbin --npu-insts-name=insts.txt ./aie.mlir
 // RUN: g++ %S/test.cpp -o test.exe -std=c++23 -Wall -DCOLORDETECT_WIDTH=1920 -DCOLORDETECT_HEIGHT=1080 -I%S/../../../runtime_lib/test_lib %S/../../../runtime_lib/test_lib/test_utils.cpp -I%S/../../utils %S/../../utils/OpenCVUtils.cpp %xrt_flags %opencv_flags  -lrt -lstdc++ -lboost_program_options -lboost_filesystem
-// RUN: %run_on_ipu ./test.exe -x aie.xclbin -k MLIR_AIE -i insts.txt | FileCheck %s
+// RUN: %run_on_npu ./test.exe -x aie.xclbin -k MLIR_AIE -i insts.txt | FileCheck %s
 // CHECK: PASS!
diff --git a/programming_examples/vision/color_threshold/CMakeLists.txt b/programming_examples/vision/color_threshold/CMakeLists.txt
index 040bc74533..f630f55106 100644
--- a/programming_examples/vision/color_threshold/CMakeLists.txt
+++ b/programming_examples/vision/color_threshold/CMakeLists.txt
@@ -25,7 +25,7 @@ else()
     set(BOOST_ROOT C:/Technical/thirdParty/boost_1_83_0 CACHE STRING "Path to Boost install")
     set(OpenCV_DIR C:/Technical/thirdParty/opencv/build CACHE STRING "Path to OpenCV install")
     set(XRT_INC_DIR C:/Technical/XRT/src/runtime_src/core/include CACHE STRING "Path to XRT cloned repo")
-    set(XRT_LIB_DIR C:/Technical/xrtIPUfromDLL CACHE STRING "Path to xrt_coreutil.lib")
+    set(XRT_LIB_DIR C:/Technical/xrtNPUfromDLL CACHE STRING "Path to xrt_coreutil.lib")
 endif ()
 
 set(COLORTHRESHOLD_WIDTH 128 CACHE STRING "image width")
diff --git a/programming_examples/vision/color_threshold/Makefile b/programming_examples/vision/color_threshold/Makefile
index 286f342b08..69958f4c2e 100644
--- a/programming_examples/vision/color_threshold/Makefile
+++ b/programming_examples/vision/color_threshold/Makefile
@@ -36,8 +36,8 @@ build/aie2_${COLORTHRESHOLD_WIDTH}.mlir: aie2_colorThreshold.py
 
 build/final_${COLORTHRESHOLD_WIDTH}.xclbin: build/aie2_${COLORTHRESHOLD_WIDTH}.mlir build/threshold.cc.o
 	mkdir -p ${@D}
-	cd ${@D} && aiecc.py --aie-generate-cdo --aie-generate-ipu --no-compile-host \
-		--xclbin-name=${@F} --ipu-insts-name=insts.txt $(<:%=../%)
+	cd ${@D} && aiecc.py --aie-generate-cdo --aie-generate-npu --no-compile-host \
+		--xclbin-name=${@F} --npu-insts-name=insts.txt $(<:%=../%)
 
 ${targetname}.exe: test.cpp
 	rm -rf _build
diff --git a/programming_examples/vision/color_threshold/README.md b/programming_examples/vision/color_threshold/README.md
index fbab6235cc..ad8613544a 100644
--- a/programming_examples/vision/color_threshold/README.md
+++ b/programming_examples/vision/color_threshold/README.md
@@ -12,7 +12,7 @@
 
 The Color Threshold pipeline design consists of 4 threshold blocks in separate AIE tiles that process a different region of an input image, as shown in the image below.
 
-The pipeline is mapped onto a single column of the ipu device, with one Shim tile (0, 0), one Mem tile (0, 1) and four AIE compute tiles (0, 2) through (0, 5). 
+The pipeline is mapped onto a single column of the npu device, with one Shim tile (0, 0), one Mem tile (0, 1) and four AIE compute tiles (0, 2) through (0, 5). 
 
 <p align="center">
   <img
diff --git a/programming_examples/vision/color_threshold/aie2_colorThreshold.py b/programming_examples/vision/color_threshold/aie2_colorThreshold.py
index 6a49466b64..b8528b26a9 100644
--- a/programming_examples/vision/color_threshold/aie2_colorThreshold.py
+++ b/programming_examples/vision/color_threshold/aie2_colorThreshold.py
@@ -30,7 +30,7 @@
 def color_threshold():
     with mlir_mod_ctx() as ctx:
 
-        @device(AIEDevice.ipu)
+        @device(AIEDevice.npu)
         def device_body():
             line_channels_ty = T.memref(lineWidthChannels, T.ui8())
             line_ty = T.memref(lineWidth, T.ui8())
@@ -256,35 +256,35 @@ def core_body():
             )
             def sequence(inTensor, notUsed, outTensor):
                 # thresholdValue, maxValue, thresholdType
-                IpuWriteRTPOp("rtpComputeTile2", col=0, row=2, index=0, value=50)
-                IpuWriteRTPOp("rtpComputeTile2", col=0, row=2, index=1, value=255)
-                IpuWriteRTPOp("rtpComputeTile2", col=0, row=2, index=2, value=0)
+                NpuWriteRTPOp("rtpComputeTile2", col=0, row=2, index=0, value=50)
+                NpuWriteRTPOp("rtpComputeTile2", col=0, row=2, index=1, value=255)
+                NpuWriteRTPOp("rtpComputeTile2", col=0, row=2, index=2, value=0)
 
-                IpuWriteRTPOp("rtpComputeTile3", col=0, row=3, index=0, value=50)
-                IpuWriteRTPOp("rtpComputeTile3", col=0, row=3, index=1, value=255)
-                IpuWriteRTPOp("rtpComputeTile3", col=0, row=3, index=2, value=0)
+                NpuWriteRTPOp("rtpComputeTile3", col=0, row=3, index=0, value=50)
+                NpuWriteRTPOp("rtpComputeTile3", col=0, row=3, index=1, value=255)
+                NpuWriteRTPOp("rtpComputeTile3", col=0, row=3, index=2, value=0)
 
-                IpuWriteRTPOp("rtpComputeTile4", col=0, row=4, index=0, value=50)
-                IpuWriteRTPOp("rtpComputeTile4", col=0, row=4, index=1, value=255)
-                IpuWriteRTPOp("rtpComputeTile4", col=0, row=4, index=2, value=0)
+                NpuWriteRTPOp("rtpComputeTile4", col=0, row=4, index=0, value=50)
+                NpuWriteRTPOp("rtpComputeTile4", col=0, row=4, index=1, value=255)
+                NpuWriteRTPOp("rtpComputeTile4", col=0, row=4, index=2, value=0)
 
-                IpuWriteRTPOp("rtpComputeTile5", col=0, row=5, index=0, value=50)
-                IpuWriteRTPOp("rtpComputeTile5", col=0, row=5, index=1, value=255)
-                IpuWriteRTPOp("rtpComputeTile5", col=0, row=5, index=2, value=0)
+                NpuWriteRTPOp("rtpComputeTile5", col=0, row=5, index=0, value=50)
+                NpuWriteRTPOp("rtpComputeTile5", col=0, row=5, index=1, value=255)
+                NpuWriteRTPOp("rtpComputeTile5", col=0, row=5, index=2, value=0)
 
-                ipu_dma_memcpy_nd(
+                npu_dma_memcpy_nd(
                     metadata="inOOB_L3L2",
                     bd_id=1,
                     mem=inTensor,
                     sizes=[1, 1, 1, tensorSizeInInt32s],
                 )
-                ipu_dma_memcpy_nd(
+                npu_dma_memcpy_nd(
                     metadata="outOOB_L2L3",
                     bd_id=0,
                     mem=outTensor,
                     sizes=[1, 1, 1, tensorSizeInInt32s],
                 )
-                ipu_sync(column=0, row=0, direction=0, channel=0)
+                npu_sync(column=0, row=0, direction=0, channel=0)
 
     # print(ctx.module.operation.verify())
     print(ctx.module)
diff --git a/programming_examples/vision/color_threshold/run.lit b/programming_examples/vision/color_threshold/run.lit
index 3033daed44..f452502155 100644
--- a/programming_examples/vision/color_threshold/run.lit
+++ b/programming_examples/vision/color_threshold/run.lit
@@ -5,7 +5,7 @@
 //
 // RUN: xchesscc_wrapper aie2 -I %aietools/include -DBIT_WIDTH=8 -c %S/../../../aie_kernels/aie2/threshold.cc -o ./threshold.cc.o
 // RUN: %python %S/aie2_colorThreshold.py 1920 1080 > ./aie.mlir
-// RUN: %python aiecc.py --xbridge --aie-generate-cdo --aie-generate-ipu --no-compile-host --xclbin-name=aie.xclbin --ipu-insts-name=insts.txt ./aie.mlir
+// RUN: %python aiecc.py --xbridge --aie-generate-cdo --aie-generate-npu --no-compile-host --xclbin-name=aie.xclbin --npu-insts-name=insts.txt ./aie.mlir
 // RUN: g++ %S/test.cpp -o test.exe -std=c++23 -Wall -DCOLORTHRESHOLD_WIDTH=1920 -DCOLORTHRESHOLD_HEIGHT=1080 -I%S/../../../runtime_lib/test_lib %S/../../../runtime_lib/test_lib/test_utils.cpp -I%S/../../utils %S/../../utils/OpenCVUtils.cpp %xrt_flags %opencv_flags  -lrt -lstdc++ -lboost_program_options -lboost_filesystem
-// RUN: %run_on_ipu ./test.exe -x aie.xclbin -k MLIR_AIE -i insts.txt | FileCheck %s
+// RUN: %run_on_npu ./test.exe -x aie.xclbin -k MLIR_AIE -i insts.txt | FileCheck %s
 // CHECK: PASS!
diff --git a/programming_examples/vision/edge_detect/CMakeLists.txt b/programming_examples/vision/edge_detect/CMakeLists.txt
index 59fe331831..c0ceb81739 100644
--- a/programming_examples/vision/edge_detect/CMakeLists.txt
+++ b/programming_examples/vision/edge_detect/CMakeLists.txt
@@ -25,7 +25,7 @@ else()
     set(BOOST_ROOT C:/Technical/thirdParty/boost_1_83_0 CACHE STRING "Path to Boost install")
     set(OpenCV_DIR C:/Technical/thirdParty/opencv/build CACHE STRING "Path to OpenCV install")
     set(XRT_INC_DIR C:/Technical/XRT/src/runtime_src/core/include CACHE STRING "Path to XRT cloned repo")
-    set(XRT_LIB_DIR C:/Technical/xrtIPUfromDLL CACHE STRING "Path to xrt_coreutil.lib")
+    set(XRT_LIB_DIR C:/Technical/xrtNPUfromDLL CACHE STRING "Path to xrt_coreutil.lib")
 endif ()
 
 set(EDGEDETECT_WIDTH 1920 CACHE STRING "image width")
diff --git a/programming_examples/vision/edge_detect/Makefile b/programming_examples/vision/edge_detect/Makefile
index d40e606e63..71c2012432 100755
--- a/programming_examples/vision/edge_detect/Makefile
+++ b/programming_examples/vision/edge_detect/Makefile
@@ -39,8 +39,8 @@ build/aie2_lineBased_8b_${EDGEDETECT_WIDTH}.mlir: aie2_edgeDetect.py
 
 build/final_${EDGEDETECT_WIDTH}.xclbin: build/aie2_lineBased_8b_${EDGEDETECT_WIDTH}.mlir build/rgba2gray.cc.o build/gray2rgba.cc.o build/filter2d.cc.o build/threshold.cc.o build/addWeighted.cc.o build/combined_gray2rgba_addWeighted.a
 	mkdir -p ${@D}
-	cd ${@D} && aiecc.py --aie-generate-cdo --aie-generate-ipu --no-compile-host \
-		--xclbin-name=${@F} --ipu-insts-name=insts.txt $(<:%=../%)
+	cd ${@D} && aiecc.py --aie-generate-cdo --aie-generate-npu --no-compile-host \
+		--xclbin-name=${@F} --npu-insts-name=insts.txt $(<:%=../%)
 
 ${targetname}.exe: test.cpp
 	rm -rf _build
diff --git a/programming_examples/vision/edge_detect/README.md b/programming_examples/vision/edge_detect/README.md
index 2450f019ec..26f2d4aff8 100644
--- a/programming_examples/vision/edge_detect/README.md
+++ b/programming_examples/vision/edge_detect/README.md
@@ -12,7 +12,7 @@
 
 The Edge Detect pipeline design consists of the following blocks arranged in a pipeline fashion for the detection of edges in a sequence of images : `rgba2gray`, `filter2D`, `threshold`, `gray2rgba`, `addWeighted`.
 
-The pipeline is mapped onto a single column of the ipu device, with one Shim tile (0, 0), one Mem tile (0, 1) and four AIE compute tiles (0, 2) through (0, 5). As shown in the image below, the `rgba2gray`, `filter2D` and `threshold` kernels are each mapped onto one compute tile, while `gray2rgba` and `addWeighted` are mapped together on AIE tile (0, 5). 
+The pipeline is mapped onto a single column of the npu device, with one Shim tile (0, 0), one Mem tile (0, 1) and four AIE compute tiles (0, 2) through (0, 5). As shown in the image below, the `rgba2gray`, `filter2D` and `threshold` kernels are each mapped onto one compute tile, while `gray2rgba` and `addWeighted` are mapped together on AIE tile (0, 5). 
 
 <p align="center">
   <img
diff --git a/programming_examples/vision/edge_detect/aie2_edgeDetect.py b/programming_examples/vision/edge_detect/aie2_edgeDetect.py
index dd481cfc65..91ed5f9ed4 100644
--- a/programming_examples/vision/edge_detect/aie2_edgeDetect.py
+++ b/programming_examples/vision/edge_detect/aie2_edgeDetect.py
@@ -32,7 +32,7 @@
 def edge_detect():
     with mlir_mod_ctx() as ctx:
 
-        @device(AIEDevice.ipu)
+        @device(AIEDevice.npu)
         def device_body():
             line_bytes_ty = T.memref(lineWidthInBytes, T.ui8())
             line_ty = T.memref(lineWidth, T.ui8())
@@ -300,19 +300,19 @@ def core_body():
 
             @FuncOp.from_py_func(tensor_ty, memRef_16x16_ty, tensor_ty)
             def sequence(I, B, O):
-                ipu_dma_memcpy_nd(
+                npu_dma_memcpy_nd(
                     metadata="outOF_L2L3",
                     bd_id=0,
                     mem=O,
                     sizes=[1, 1, 1, tensorSizeInInt32s],
                 )
-                ipu_dma_memcpy_nd(
+                npu_dma_memcpy_nd(
                     metadata="inOF_L3L2",
                     bd_id=1,
                     mem=I,
                     sizes=[1, 1, 1, tensorSizeInInt32s],
                 )
-                ipu_sync(column=0, row=0, direction=0, channel=0)
+                npu_sync(column=0, row=0, direction=0, channel=0)
 
     #    print(ctx.module.operation.verify())
     print(ctx.module)
diff --git a/programming_examples/vision/edge_detect/run.lit b/programming_examples/vision/edge_detect/run.lit
index bf5b6eff92..5f7ab37e0e 100644
--- a/programming_examples/vision/edge_detect/run.lit
+++ b/programming_examples/vision/edge_detect/run.lit
@@ -10,8 +10,8 @@
 // RUN: xchesscc_wrapper aie2 -I %aietools/include -DBIT_WIDTH=8 -c %S/../../../aie_kernels/aie2/filter2d.cc -o ./filter2d.cc.o
 // RUN: ar rvs combined_gray2rgba_addWeighted.a gray2rgba.cc.o addWeighted.cc.o
 // RUN: %python %S/aie2_edgeDetect.py 1920 1080 > ./aie.mlir
-// RUN: %python aiecc.py --xbridge --aie-generate-cdo --aie-generate-ipu --no-compile-host --xclbin-name=aie.xclbin --ipu-insts-name=insts.txt ./aie.mlir
+// RUN: %python aiecc.py --xbridge --aie-generate-cdo --aie-generate-npu --no-compile-host --xclbin-name=aie.xclbin --npu-insts-name=insts.txt ./aie.mlir
 // RUN: g++ %S/test.cpp -o test.exe -std=c++23 -Wall -DEDGEDETECT_WIDTH=1920 -DEDGEDETECT_HEIGHT=1080 -I%S/../../../runtime_lib/test_lib %S/../../../runtime_lib/test_lib/test_utils.cpp -I%S/../../utils %S/../../utils/OpenCVUtils.cpp %xrt_flags %opencv_flags  -lrt -lstdc++ -lboost_program_options -lboost_filesystem
-// RUN: %run_on_ipu ./test.exe -x aie.xclbin -k MLIR_AIE -i insts.txt | FileCheck %s
+// RUN: %run_on_npu ./test.exe -x aie.xclbin -k MLIR_AIE -i insts.txt | FileCheck %s
 // CHECK: PASS!
 
diff --git a/programming_examples/vision/vision_passthrough/CMakeLists.txt b/programming_examples/vision/vision_passthrough/CMakeLists.txt
index 7ba68b268b..a2bb8ac761 100644
--- a/programming_examples/vision/vision_passthrough/CMakeLists.txt
+++ b/programming_examples/vision/vision_passthrough/CMakeLists.txt
@@ -28,7 +28,7 @@ else()
     set(BOOST_ROOT C:/Technical/thirdParty/boost_1_83_0 CACHE STRING "Path to Boost install")
     set(OpenCV_DIR C:/Technical/thirdParty/opencv/build CACHE STRING "Path to OpenCV install")
     set(XRT_INC_DIR C:/Technical/XRT/src/runtime_src/core/include CACHE STRING "Path to XRT cloned repo")
-    set(XRT_LIB_DIR C:/Technical/xrtIPUfromDLL CACHE STRING "Path to xrt_coreutil.lib")
+    set(XRT_LIB_DIR C:/Technical/xrtNPUfromDLL CACHE STRING "Path to xrt_coreutil.lib")
 endif ()
 
 set(PASSTHROUGH_WIDTH 1920 CACHE STRING "image width")
diff --git a/programming_examples/vision/vision_passthrough/Makefile b/programming_examples/vision/vision_passthrough/Makefile
index f07d90fda2..1ae853d942 100644
--- a/programming_examples/vision/vision_passthrough/Makefile
+++ b/programming_examples/vision/vision_passthrough/Makefile
@@ -32,8 +32,8 @@ build/passThrough.cc.o: passThrough.cc
 	
 build/final_${PASSTHROUGH_WIDTH}.xclbin: build/aie2_lineBased_8b_${PASSTHROUGH_WIDTH}.mlir build/passThrough.cc.o
 	mkdir -p ${@D}
-	cd ${@D} && aiecc.py --aie-generate-cdo --aie-generate-ipu --no-compile-host \
-		--xclbin-name=${@F} --ipu-insts-name=insts.txt $(<:%=../%)
+	cd ${@D} && aiecc.py --aie-generate-cdo --aie-generate-npu --no-compile-host \
+		--xclbin-name=${@F} --npu-insts-name=insts.txt $(<:%=../%)
 
 ${targetname}.exe: test.cpp
 	rm -rf _build
diff --git a/programming_examples/vision/vision_passthrough/aie2.py b/programming_examples/vision/vision_passthrough/aie2.py
index 5422f803d1..920d109cfa 100644
--- a/programming_examples/vision/vision_passthrough/aie2.py
+++ b/programming_examples/vision/vision_passthrough/aie2.py
@@ -29,7 +29,7 @@
 def passThroughAIE2():
     with mlir_mod_ctx() as ctx:
 
-        @device(AIEDevice.ipu)
+        @device(AIEDevice.npu)
         def device_body():
             # define types
             line_ty = T.memref(lineWidthInBytes, T.ui8())
@@ -101,9 +101,9 @@ def sequence(inTensor, notUsed, outTensor):
                     # EVENTS_CORE_PORT_RUNNING_0 (0x4B)
 
                     # Trace_Event0  (4 slots)
-                    IpuWrite32(0, 2, 0x340E0, 0x4B222125)
+                    NpuWrite32(0, 2, 0x340E0, 0x4B222125)
                     # Trace_Event1  (4 slots)
-                    IpuWrite32(0, 2, 0x340E4, 0x2D2C1A4F)
+                    NpuWrite32(0, 2, 0x340E4, 0x2D2C1A4F)
 
                     # Event slots as configured above:
                     # 0: Kernel executes vector instruction
@@ -117,13 +117,13 @@ def sequence(inTensor, notUsed, outTensor):
 
                     # Stream_Switch_Event_Port_Selection_0
                     # This is necessary to capture the Port_Running_0 and Port_Running_1 events
-                    IpuWrite32(0, 2, 0x3FF00, 0x121)
+                    NpuWrite32(0, 2, 0x3FF00, 0x121)
 
                     # Trace_Control0: Define trace start and stop triggers. Set start event TRUE.
-                    IpuWrite32(0, 2, 0x340D0, 0x10000)
+                    NpuWrite32(0, 2, 0x340D0, 0x10000)
 
                     # Start trace copy out.
-                    IpuWriteBdShimTile(
+                    NpuWriteBdShimTile(
                         bd_id=3,
                         buffer_length=traceSizeInBytes,
                         buffer_offset=tensorSize,
@@ -151,21 +151,21 @@ def sequence(inTensor, notUsed, outTensor):
                         use_next_bd=0,
                         valid_bd=1,
                     )
-                    IpuWrite32(0, 0, 0x1D20C, 0x3)
+                    NpuWrite32(0, 0, 0x1D20C, 0x3)
 
-                ipu_dma_memcpy_nd(
+                npu_dma_memcpy_nd(
                     metadata="in",
                     bd_id=1,
                     mem=inTensor,
                     sizes=[1, 1, 1, tensorSizeInInt32s],
                 )
-                ipu_dma_memcpy_nd(
+                npu_dma_memcpy_nd(
                     metadata="out",
                     bd_id=0,
                     mem=outTensor,
                     sizes=[1, 1, 1, tensorSizeInInt32s],
                 )
-                ipu_sync(column=0, row=0, direction=0, channel=0)
+                npu_sync(column=0, row=0, direction=0, channel=0)
 
     print(ctx.module)
 
diff --git a/programming_examples/vision/vision_passthrough/aie2_lineBased_8b_1080.mlir b/programming_examples/vision/vision_passthrough/aie2_lineBased_8b_1080.mlir
index 3c547e4016..0621e0b622 100644
--- a/programming_examples/vision/vision_passthrough/aie2_lineBased_8b_1080.mlir
+++ b/programming_examples/vision/vision_passthrough/aie2_lineBased_8b_1080.mlir
@@ -12,7 +12,7 @@
 // AIE tiles, buffers, and communication in an AI Engine design
 module @passThroughLine_aie2 {
 
- 	aie.device(ipu) {
+ 	aie.device(npu) {
         // declare kernel external kernel function 
         func.func private @passThroughLine(%in: memref<1920xui8>, %out: memref<1920xui8>, %tilewidth: i32) -> ()
         
@@ -53,9 +53,9 @@ module @passThroughLine_aie2 {
             %tilewidth  = arith.constant 480 : i64  // in 32b words so tileWidth/4
 
             //dma_memcpy_nd ([offset in 32b words][length in 32b words][stride in 32b words])
-            aiex.ipu.dma_memcpy_nd (0, 0, %in[%c0, %c0, %c0, %c0][%c1, %c1, %tileheight, %tilewidth][%c0, %c0, %tilewidth]) { metadata = @inOF, id = 1 : i64 } : memref<518400xi32>
-            aiex.ipu.dma_memcpy_nd (0, 0, %out[%c0, %c0, %c0, %c0][%c1, %c1, %tileheight, %tilewidth][%c0, %c0, %tilewidth]) { metadata = @outOF, id = 0 : i64 } : memref<518400xi32>
-            aiex.ipu.sync {channel = 0 : i32, column = 0 : i32, column_num = 1 : i32, direction = 0 : i32, row = 0 : i32, row_num = 1 : i32}
+            aiex.npu.dma_memcpy_nd (0, 0, %in[%c0, %c0, %c0, %c0][%c1, %c1, %tileheight, %tilewidth][%c0, %c0, %tilewidth]) { metadata = @inOF, id = 1 : i64 } : memref<518400xi32>
+            aiex.npu.dma_memcpy_nd (0, 0, %out[%c0, %c0, %c0, %c0][%c1, %c1, %tileheight, %tilewidth][%c0, %c0, %tilewidth]) { metadata = @outOF, id = 0 : i64 } : memref<518400xi32>
+            aiex.npu.sync {channel = 0 : i32, column = 0 : i32, column_num = 1 : i32, direction = 0 : i32, row = 0 : i32, row_num = 1 : i32}
             return
         }
     }
diff --git a/programming_examples/vision/vision_passthrough/aie2_lineBased_8b_8k.mlir b/programming_examples/vision/vision_passthrough/aie2_lineBased_8b_8k.mlir
index d07ba213c4..c2c31b0d9b 100644
--- a/programming_examples/vision/vision_passthrough/aie2_lineBased_8b_8k.mlir
+++ b/programming_examples/vision/vision_passthrough/aie2_lineBased_8b_8k.mlir
@@ -12,7 +12,7 @@
 // AIE tiles, buffers, and communication in an AI Engine design
 module @passThroughLine_aie2 {
 
- 	aie.device(ipu) {
+ 	aie.device(npu) {
         // declare kernel external kernel function 
         func.func private @passThroughLine(%in: memref<7680xui8>, %out: memref<7680xui8>, %tilewidth: i32) -> ()
         
@@ -54,9 +54,9 @@ module @passThroughLine_aie2 {
             %totalLenRGBA = arith.constant 2073600 : i64
 
             //dma_memcpy_nd ([offset in 32b words][length in 32b words][stride in 32b words])
-            aiex.ipu.dma_memcpy_nd (0, 0, %in[%c0, %c0, %c0, %c0][%c1, %c1, %c1, %totalLenRGBA][%c0, %c0, %c0]) { metadata = @inOF, id = 1 : i64 } : memref<2073600xi32>
-            aiex.ipu.dma_memcpy_nd (0, 0, %out[%c0, %c0, %c0, %c0][%c1, %c1, %c1, %totalLenRGBA][%c0, %c0, %c0]) { metadata = @outOF, id = 0 : i64 } : memref<2073600xi32>
-            aiex.ipu.sync {channel = 0 : i32, column = 0 : i32, column_num = 1 : i32, direction = 0 : i32, row = 0 : i32, row_num = 1 : i32}
+            aiex.npu.dma_memcpy_nd (0, 0, %in[%c0, %c0, %c0, %c0][%c1, %c1, %c1, %totalLenRGBA][%c0, %c0, %c0]) { metadata = @inOF, id = 1 : i64 } : memref<2073600xi32>
+            aiex.npu.dma_memcpy_nd (0, 0, %out[%c0, %c0, %c0, %c0][%c1, %c1, %c1, %totalLenRGBA][%c0, %c0, %c0]) { metadata = @outOF, id = 0 : i64 } : memref<2073600xi32>
+            aiex.npu.sync {channel = 0 : i32, column = 0 : i32, column_num = 1 : i32, direction = 0 : i32, row = 0 : i32, row_num = 1 : i32}
             return
         }
     }
diff --git a/programming_examples/vision/vision_passthrough/aie2_lineBased_8b_tiny.mlir b/programming_examples/vision/vision_passthrough/aie2_lineBased_8b_tiny.mlir
index 13f7417166..dd66475ca5 100644
--- a/programming_examples/vision/vision_passthrough/aie2_lineBased_8b_tiny.mlir
+++ b/programming_examples/vision/vision_passthrough/aie2_lineBased_8b_tiny.mlir
@@ -12,7 +12,7 @@
 // AIE tiles, buffers, and communication in an AI Engine design
 module @passThroughLine_aie2 {
 
- 	aie.device(ipu) {
+ 	aie.device(npu) {
         // declare kernel external kernel function 
         func.func private @passThroughLine(%in: memref<512xui8>, %out: memref<512xui8>, %tilewidth: i32) -> ()
         
@@ -53,9 +53,9 @@ module @passThroughLine_aie2 {
             %tilewidth  = arith.constant 128 : i64  // in 32b words so tileWidth/4
 
             //dma_memcpy_nd ([offset in 32b words][length in 32b words][stride in 32b words])
-            aiex.ipu.dma_memcpy_nd (0, 0, %in[%c0, %c0, %c0, %c0][%c1, %c1, %tileheight, %tilewidth][%c0, %c0, %tilewidth]) { metadata = @inOF, id = 1 : i64 } : memref<1152xi32>
-            aiex.ipu.dma_memcpy_nd (0, 0, %out[%c0, %c0, %c0, %c0][%c1, %c1, %tileheight, %tilewidth][%c0, %c0, %tilewidth]) { metadata = @outOF, id = 0 : i64 } : memref<1152xi32>
-            aiex.ipu.sync {channel = 0 : i32, column = 0 : i32, column_num = 1 : i32, direction = 0 : i32, row = 0 : i32, row_num = 1 : i32}
+            aiex.npu.dma_memcpy_nd (0, 0, %in[%c0, %c0, %c0, %c0][%c1, %c1, %tileheight, %tilewidth][%c0, %c0, %tilewidth]) { metadata = @inOF, id = 1 : i64 } : memref<1152xi32>
+            aiex.npu.dma_memcpy_nd (0, 0, %out[%c0, %c0, %c0, %c0][%c1, %c1, %tileheight, %tilewidth][%c0, %c0, %tilewidth]) { metadata = @outOF, id = 0 : i64 } : memref<1152xi32>
+            aiex.npu.sync {channel = 0 : i32, column = 0 : i32, column_num = 1 : i32, direction = 0 : i32, row = 0 : i32, row_num = 1 : i32}
             return
         }
     }
diff --git a/programming_examples/vision/vision_passthrough/run.lit b/programming_examples/vision/vision_passthrough/run.lit
index 5093e3c80c..58f914861c 100644
--- a/programming_examples/vision/vision_passthrough/run.lit
+++ b/programming_examples/vision/vision_passthrough/run.lit
@@ -5,7 +5,7 @@
 //
 // RUN: xchesscc_wrapper aie2 -I %aietools/include -DBIT_WIDTH=8 -c %S/../../../aie_kernels/generic/passThrough.cc -o passThrough.cc.o
 // RUN: %python %S/aie2.py 1920 1080 | aie-opt -cse -canonicalize -o ./aie.mlir
-// RUN: %python aiecc.py --xbridge --aie-generate-cdo --aie-generate-ipu --no-compile-host --xclbin-name=aie.xclbin --ipu-insts-name=insts.txt ./aie.mlir
+// RUN: %python aiecc.py --xbridge --aie-generate-cdo --aie-generate-npu --no-compile-host --xclbin-name=aie.xclbin --npu-insts-name=insts.txt ./aie.mlir
 // RUN: g++ %S/test.cpp -o test.exe -std=c++23 -Wall -DPASSTHROUGH_WIDTH=1920 -DPASSTHROUGH_HEIGHT=1080 -I%S/../../../runtime_lib/test_lib %S/../../../runtime_lib/test_lib/test_utils.cpp -I%S/../../utils %S/../../utils/OpenCVUtils.cpp %xrt_flags %opencv_flags  -lrt -lstdc++ -lboost_program_options -lboost_filesystem
-// RUN: %run_on_ipu ./test.exe -x aie.xclbin -k MLIR_AIE -i insts.txt | FileCheck %s
+// RUN: %run_on_npu ./test.exe -x aie.xclbin -k MLIR_AIE -i insts.txt | FileCheck %s
 // CHECK: PASS!
diff --git a/python/AIEMLIRModule.cpp b/python/AIEMLIRModule.cpp
index da4dd754c6..588f96fae0 100644
--- a/python/AIEMLIRModule.cpp
+++ b/python/AIEMLIRModule.cpp
@@ -107,11 +107,11 @@ PYBIND11_MODULE(_aie, m) {
       "enable_cores"_a = true);
 
   m.def(
-      "ipu_instgen",
+      "npu_instgen",
       [&stealCStr](MlirOperation op) {
-        py::str ipuInstructions = stealCStr(aieTranslateToIPU(op));
+        py::str npuInstructions = stealCStr(aieTranslateToNPU(op));
         auto individualInstructions =
-            ipuInstructions.attr("split")().cast<py::list>();
+            npuInstructions.attr("split")().cast<py::list>();
         for (size_t i = 0; i < individualInstructions.size(); ++i)
           individualInstructions[i] = individualInstructions[i].attr("strip")();
         return individualInstructions;
diff --git a/python/XRTModule.cpp b/python/XRTModule.cpp
index 9396f2073e..fb200f6650 100644
--- a/python/XRTModule.cpp
+++ b/python/XRTModule.cpp
@@ -25,8 +25,8 @@
 namespace py = pybind11;
 using namespace py::literals;
 
-// group_id 0 is for ipu instructions
-// group_id 1 is for number of ipu instructions
+// group_id 0 is for NPU instructions
+// group_id 1 is for number of NPU instructions
 // host side buffers/args follow starting from position 2
 // see aiecc.main.emit_design_kernel_json
 constexpr size_t HOST_BUFFERS_START_IDX = 2;
@@ -44,14 +44,14 @@ class PyXCLBin {
     kernel = std::make_unique<xrt::kernel>(*context, kernelName);
   }
 
-  void loadIPUInstructions(const std::vector<uint32_t> &insts) {
-    ipuInstructions =
+  void loadNPUInstructions(const std::vector<uint32_t> &insts) {
+    npuInstructions =
         std::make_unique<xrt::bo>(*device, insts.size() * sizeof(uint32_t),
                                   XCL_BO_FLAGS_CACHEABLE, kernel->group_id(0));
-    uint32_t *bufInstr = ipuInstructions->map<uint32_t *>();
+    uint32_t *bufInstr = npuInstructions->map<uint32_t *>();
     for (size_t i = 0; i < insts.size(); ++i)
       bufInstr[i] = insts.at(i);
-    ipuInstructions->sync(XCL_BO_SYNC_BO_TO_DEVICE);
+    npuInstructions->sync(XCL_BO_SYNC_BO_TO_DEVICE);
   }
 
   template <typename ElementT>
@@ -107,17 +107,17 @@ class PyXCLBin {
 
   void run() {
     run_ = std::make_unique<xrt::run>(*kernel);
-    run_->set_arg(0, *ipuInstructions);
-    run_->set_arg(1, ipuInstructions->size());
+    run_->set_arg(0, *npuInstructions);
+    run_->set_arg(1, npuInstructions->size());
     for (size_t i = 0; i < buffers.size(); ++i)
       run_->set_arg(HOST_BUFFERS_START_IDX + i, *buffers[i]);
     run_->start();
   }
 
-  void _runOnlyIpuInstructions() {
+  void _runOnlyNpuInstructions() {
     run_ = std::make_unique<xrt::run>(*kernel);
-    run_->set_arg(0, *ipuInstructions);
-    run_->set_arg(1, ipuInstructions->size());
+    run_->set_arg(0, *npuInstructions);
+    run_->set_arg(1, npuInstructions->size());
     run_->start();
   }
 
@@ -133,7 +133,7 @@ class PyXCLBin {
   std::unique_ptr<xrt::device> device;
   std::unique_ptr<xrt::hw_context> context;
   std::unique_ptr<xrt::kernel> kernel;
-  std::unique_ptr<xrt::bo> ipuInstructions;
+  std::unique_ptr<xrt::bo> npuInstructions;
 
   std::vector<std::unique_ptr<xrt::bo>> buffers;
 
@@ -145,11 +145,11 @@ PYBIND11_MODULE(_xrt, m) {
   py::class_<PyXCLBin>(m, "XCLBin", py::module_local())
       .def(py::init<const std::string &, const std::string &, int>(),
            "xclbin_path"_a, "kernel_name"_a, "device_index"_a = 0)
-      .def("load_ipu_instructions", &PyXCLBin::loadIPUInstructions, "insts"_a)
+      .def("load_npu_instructions", &PyXCLBin::loadNPUInstructions, "insts"_a)
       .def("sync_buffers_to_device", &PyXCLBin::syncBuffersToDevice)
       .def("sync_buffers_from_device", &PyXCLBin::syncBuffersFromDevice)
       .def("run", &PyXCLBin::run)
-      .def("_run_only_ipu_instructions", &PyXCLBin::_runOnlyIpuInstructions)
+      .def("_run_only_npu_instructions", &PyXCLBin::_runOnlyNpuInstructions)
       .def("wait", &PyXCLBin::wait, "timeout"_a = py::none())
       .def(
           "mmap_buffers",
diff --git a/python/_mlir_libs/_aie.pyi b/python/_mlir_libs/_aie.pyi
index c37cf64e08..ad7497117c 100644
--- a/python/_mlir_libs/_aie.pyi
+++ b/python/_mlir_libs/_aie.pyi
@@ -11,7 +11,7 @@ __all__ = [
     "generate_bcf",
     "generate_cdo",
     "generate_xaie",
-    "ipu_instgen",
+    "npu_instgen",
     "register_dialect",
     "translate_aie_vec_to_cpp",
     "translate_mlir_to_llvmir",
@@ -31,7 +31,7 @@ def generate_cdo(
     enable_cores: bool = True,
 ) -> None: ...
 def generate_xaie(module: Operation) -> str: ...
-def ipu_instgen(module: Operation) -> list: ...
+def npu_instgen(module: Operation) -> list: ...
 def register_dialect(registry: DialectRegistry) -> None: ...
 def translate_aie_vec_to_cpp(module: Operation, aieml: bool = False) -> str: ...
 def translate_mlir_to_llvmir(module: Operation) -> str: ...
diff --git a/python/_mlir_libs/_xrt.pyi b/python/_mlir_libs/_xrt.pyi
index b912f76738..d08862a4a5 100644
--- a/python/_mlir_libs/_xrt.pyi
+++ b/python/_mlir_libs/_xrt.pyi
@@ -8,8 +8,8 @@ class XCLBin:
         self, xclbin_path: str, kernel_name: str, device_index: int = 0
     ) -> None: ...
     def _get_buffer_host_address(self, arg0: int) -> int: ...
-    def _run_only_ipu_instructions(self) -> None: ...
-    def load_ipu_instructions(self, insts: list[int]) -> None: ...
+    def _run_only_npu_instructions(self) -> None: ...
+    def load_npu_instructions(self, insts: list[int]) -> None: ...
     def mmap_buffers(
         self, shapes: list[list[int]], np_format: typing.Any
     ) -> list[memoryview]: ...
diff --git a/python/compiler/aiecc/cl_arguments.py b/python/compiler/aiecc/cl_arguments.py
index 4979674955..a407dda971 100644
--- a/python/compiler/aiecc/cl_arguments.py
+++ b/python/compiler/aiecc/cl_arguments.py
@@ -195,26 +195,26 @@ def parse_args(args=None):
         help="Show progress visualization",
     )
     parser.add_argument(
-        "--aie-generate-ipu",
-        dest="ipu",
+        "--aie-generate-npu",
+        dest="npu",
         default=False,
         action="store_const",
         const=True,
-        help="Generate ipu instruction stream",
+        help="Generate npu instruction stream",
     )
     parser.add_argument(
-        "--aie-only-generate-ipu",
-        dest="only_ipu",
+        "--aie-only-generate-npu",
+        dest="only_npu",
         default=False,
         action="store_const",
         const=True,
-        help="Generate ipu instruction stream only",
+        help="Generate npu instruction stream only",
     )
     parser.add_argument(
-        "--ipu-insts-name",
+        "--npu-insts-name",
         dest="insts_name",
-        default="ipu_insts.txt",
-        help="Output instructions filename for IPU target",
+        default="npu_insts.txt",
+        help="Output instructions filename for NPU target",
     )
     parser.add_argument(
         "--aie-generate-cdo",
diff --git a/python/compiler/aiecc/main.py b/python/compiler/aiecc/main.py
index b687c978dc..35a268d245 100644
--- a/python/compiler/aiecc/main.py
+++ b/python/compiler/aiecc/main.py
@@ -89,7 +89,7 @@
 CREATE_PATH_FINDER_FLOWS = Pipeline().Nested(
     "aie.device", Pipeline().add_pass("aie-create-pathfinder-flows")
 )
-DMA_TO_IPU = Pipeline().Nested("aie.device", Pipeline().add_pass("aie-dma-to-ipu"))
+DMA_TO_NPU = Pipeline().Nested("aie.device", Pipeline().add_pass("aie-dma-to-npu"))
 
 
 async def read_file_async(file_path: str) -> str:
@@ -1013,14 +1013,14 @@ async def run_flow(self):
                 exit(-3)
             aie_peano_target = aie_target.lower() + "-none-elf"
 
-            # Optionally generate insts.txt for IPU instruction stream
-            if opts.ipu or opts.only_ipu:
-                generated_insts_mlir = self.prepend_tmp("generated_ipu_insts.mlir")
+            # Optionally generate insts.txt for NPU instruction stream
+            if opts.npu or opts.only_npu:
+                generated_insts_mlir = self.prepend_tmp("generated_npu_insts.mlir")
                 await self.do_call(
                     progress_bar.task,
                     [
                         "aie-opt",
-                        "--aie-dma-to-ipu",
+                        "--aie-dma-to-npu",
                         file_with_addresses,
                         "-o",
                         generated_insts_mlir,
@@ -1030,13 +1030,13 @@ async def run_flow(self):
                     progress_bar.task,
                     [
                         "aie-translate",
-                        "--aie-ipu-instgen",
+                        "--aie-npu-instgen",
                         generated_insts_mlir,
                         "-o",
                         opts.insts_name,
                     ],
                 )
-                if opts.only_ipu:
+                if opts.only_npu:
                     return
 
             chess_intrinsic_wrapper_ll_path = await self.prepare_for_chesshack(
diff --git a/python/dialects/aie.py b/python/dialects/aie.py
index ffdb1b46ee..35702449fa 100644
--- a/python/dialects/aie.py
+++ b/python/dialects/aie.py
@@ -21,7 +21,7 @@
     generate_bcf,
     generate_cdo,
     generate_xaie,
-    ipu_instgen,
+    npu_instgen,
     register_dialect,
     translate_aie_vec_to_cpp,
     translate_mlir_to_llvmir,
@@ -617,7 +617,7 @@ def find_neighbors(tile, device=None, logical=True):
     if device is None:
         device = find_parent_of_type(lambda op: isinstance(op, DeviceOp))
 
-    assert int(device.device) == int(AIEDevice.ipu), "only ipu supported"
+    assert int(device.device) == int(AIEDevice.npu), "only npu supported"
 
     neighbors = {}
     col, row = map(int, (tile.col, tile.row))
diff --git a/python/dialects/aiex.py b/python/dialects/aiex.py
index 685c91ccfd..1c8d59ac2d 100644
--- a/python/dialects/aiex.py
+++ b/python/dialects/aiex.py
@@ -32,11 +32,11 @@
 # Comes from _aie
 register_dialect(get_dialect_registry())
 
-ipu_sync = partial(ipu_sync, column_num=1, row_num=1)
+npu_sync = partial(npu_sync, column_num=1, row_num=1)
 
 
-class IpuDmaMemcpyNd(IpuDmaMemcpyNdOp):
-    """Specialize IpuDmaMemcpyNdOp class constructor to take python integers"""
+class NpuDmaMemcpyNd(NpuDmaMemcpyNdOp):
+    """Specialize NpuDmaMemcpyNdOp class constructor to take python integers"""
 
     def __init__(
         self,
@@ -77,7 +77,7 @@ def __init__(
         )
 
 
-ipu_dma_memcpy_nd = IpuDmaMemcpyNd
+npu_dma_memcpy_nd = NpuDmaMemcpyNd
 
 
 _PROLOG = [
@@ -119,8 +119,8 @@ def _get_prolog():
     return _PROLOG[:]
 
 
-# based on https://github.com/Xilinx/mlir-aie/blob/cb232a43383ef3b8efd8b408545c9b74885578ad/lib/Targets/AIETargetIPU.cpp
-def _ipu_sync(column, row=0, direction=0, channel=0, column_num=1, row_num=1):
+# based on https://github.com/Xilinx/mlir-aie/blob/cb232a43383ef3b8efd8b408545c9b74885578ad/lib/Targets/AIETargetIPU.cpp (file since renamed to AIETargetNPU.cpp)
+def _npu_sync(column, row=0, direction=0, channel=0, column_num=1, row_num=1):
     if isinstance(channel, IntegerAttr):
         channel = int(channel)
     words = [None] * 2
@@ -137,7 +137,7 @@ def _ipu_sync(column, row=0, direction=0, channel=0, column_num=1, row_num=1):
     return words
 
 
-def _ipu_write32(column, row, address, value):
+def _npu_write32(column, row, address, value):
     words = [None] * 3
     op_code = 2
     words[0] = (op_code & 0xFF) << 24
@@ -149,7 +149,7 @@ def _ipu_write32(column, row, address, value):
     return words
 
 
-def _ipu_shimtile_push_queue(channel_dir, channel_index, column, bd_id, repeats=0):
+def _npu_shimtile_push_queue(channel_dir, channel_index, column, bd_id, repeats=0):
     if isinstance(channel_index, IntegerAttr):
         channel_index = int(channel_index)
     if channel_dir == DMAChannelDir.MM2S:
@@ -165,7 +165,7 @@ def _ipu_shimtile_push_queue(channel_dir, channel_index, column, bd_id, repeats=
         value |= XAIEMLGBL_NOC_MODULE_DMA_S2MM_0_TASK_QUEUE_ENABLE_TOKEN_ISSUE_MASK
 
     row = 0
-    return _ipu_write32(column, row, address, value)
+    return _npu_write32(column, row, address, value)
 
 
 # based on ExecWriteBdExtendShimTileOpt @ dpufw/src/include/RunInstOpt.h:666
@@ -181,14 +181,14 @@ def _exec_write_bd_extend_shim_tile_opt(iptr, tensor_addr):
     write_addr = SHIM_DMA_BD0_BASE_ADDR + (bd_id * SHIM_BD_OFFSET)
     row = 0
     words = [
-        *_ipu_write32(column, row, write_addr, iptr[2]),
-        *_ipu_write32(column, row, write_addr + 4, word3),
-        *_ipu_write32(column, row, write_addr + 8, word4),
-        *_ipu_write32(column, row, write_addr + 12, iptr[5]),
-        *_ipu_write32(column, row, write_addr + 16, iptr[6]),
-        *_ipu_write32(column, row, write_addr + 20, iptr[7]),
-        *_ipu_write32(column, row, write_addr + 24, iptr[8]),
-        *_ipu_write32(column, row, write_addr + 28, iptr[9]),
+        *_npu_write32(column, row, write_addr, iptr[2]),
+        *_npu_write32(column, row, write_addr + 4, word3),
+        *_npu_write32(column, row, write_addr + 8, word4),
+        *_npu_write32(column, row, write_addr + 12, iptr[5]),
+        *_npu_write32(column, row, write_addr + 16, iptr[6]),
+        *_npu_write32(column, row, write_addr + 20, iptr[7]),
+        *_npu_write32(column, row, write_addr + 24, iptr[8]),
+        *_npu_write32(column, row, write_addr + 28, iptr[9]),
     ]
     return words
 
@@ -202,14 +202,14 @@ def _update_tensor_addr_shim_tile(column, bd_id, tensor_addr, buffer_offset=0):
     write_addr = SHIM_DMA_BD0_BASE_ADDR + (bd_id * SHIM_BD_OFFSET)
     row = 0
     words = [
-        *_ipu_write32(column, row, write_addr + 4, word3),
-        *_ipu_write32(column, row, write_addr + 8, word4),
+        *_npu_write32(column, row, write_addr + 4, word3),
+        *_npu_write32(column, row, write_addr + 8, word4),
     ]
     return words
 
 
 # corresponds to ExecWriteBdExtendShimTileOpt
-def _ipu_writebd_shimtile(
+def _npu_writebd_shimtile(
     column,
     bd_id,
     buffer_length,
@@ -304,26 +304,26 @@ def _ipu_writebd_shimtile(
     return words
 
 
-def _ipu_noop():
+def _npu_noop():
     words = [None] * 1
     op_code = 0
     words[0] = (op_code & 0xFF) << 24
     return words
 
 
-def _ipu_core_enable(column, row):
+def _npu_core_enable(column, row):
     # note this clears the reset bit
-    return _ipu_write32(column, row, XAIEMLGBL_CORE_MODULE_CORE_CONTROL, 1)
+    return _npu_write32(column, row, XAIEMLGBL_CORE_MODULE_CORE_CONTROL, 1)
 
 
-class ipu:
-    noop = _ipu_noop
-    write32 = _ipu_write32
-    shimtile_push_queue = _ipu_shimtile_push_queue
-    writebd_shimtile = _ipu_writebd_shimtile
-    sync = _ipu_sync
+class npu:
+    noop = _npu_noop
+    write32 = _npu_write32
+    shimtile_push_queue = _npu_shimtile_push_queue
+    writebd_shimtile = _npu_writebd_shimtile
+    sync = _npu_sync
     get_prolog = _get_prolog
-    enable_cores = _ipu_core_enable
+    enable_cores = _npu_core_enable
     _exec_write_bd_extend_shim_tile_opt = _exec_write_bd_extend_shim_tile_opt
     _update_tensor_addr_shim_tile = _update_tensor_addr_shim_tile
 
diff --git a/python/utils/README.md b/python/utils/README.md
index b3223ed697..9e54561aa1 100644
--- a/python/utils/README.md
+++ b/python/utils/README.md
@@ -55,7 +55,7 @@ Test/ Host code utilities.
     * This function abstracts a number of python functions for configuring a core tile and an associated shim tile. It does not define the trace packet routing betweent he two however. To better appreciate what this wrapper function does, we need to delve more deeply into the details on how trace units are configured.
 
 
-Within the `func.func @sequence` block, we add a set of configuration register writes (`aiex.ipu.write32`) to configure the tile trace units and the shimDMA. 
+Within the `func.func @sequence` block, we add a set of configuration register writes (`aiex.npu.write32`) to configure the tile trace units and the shimDMA.
 ### <u>How to configure wrapper and default values</u>
 The minimum function call we need is:
 ```python
@@ -105,14 +105,14 @@ The table below describes the general trace control registers.
 in C/C++
 ```c++
 // Start event = 1, Stop event = 0, Mode = event-time
-aiex.ipu.write32 { column = 0 : i32, row = 4 : i32, address = 0x340D0 : ui32, value = 0x10000 : ui32 }
-aiex.ipu.write32 { column = 0 : i32, row = 4 : i32, address = 0x340D4 : ui32, value = 0x0 : ui32 }
+aiex.npu.write32 { column = 0 : i32, row = 4 : i32, address = 0x340D0 : ui32, value = 0x10000 : ui32 }
+aiex.npu.write32 { column = 0 : i32, row = 4 : i32, address = 0x340D4 : ui32, value = 0x0 : ui32 }
 ```
 in Python
 ```python
 # Start event = 1, Stop event = 0, Mode = event-time
-ipu_write32(column=0, row=4, address=0x340D0, value=pack4bytes(stop, start, 0, 0),)
-ipu_write32(column=0, row=4, address=0x340D4, value=0,)
+npu_write32(column=0, row=4, address=0x340D0, value=pack4bytes(stop, start, 0, 0),)
+npu_write32(column=0, row=4, address=0x340D4, value=0,)
 ```
 
 The table below describes which events the trace hardware monitors.
@@ -160,7 +160,7 @@ in C/C++
 // Core Instruction - Event 0 (0x21)
 // Core Instruction - Event 1 (0x22)
 // Core Port Running 0 (0x4B) 
-aiex.ipu.write32 { column = 0 : i32, row = 4 : i32, address = 0x340E0 : ui32, value = 0x4B222125 : ui32 }
+aiex.npu.write32 { column = 0 : i32, row = 4 : i32, address = 0x340E0 : ui32, value = 0x4B222125 : ui32 }
 
 // Events 4-7 monitored
 // ------------------------
@@ -168,13 +168,13 @@ aiex.ipu.write32 { column = 0 : i32, row = 4 : i32, address = 0x340E0 : ui32, va
 // Lock stalls (0x1A)
 // Lock acquire requests (0x2C)
 // Lock release requests (0x2D)
-aiex.ipu.write32 { column = 0 : i32, row = 4 : i32, address = 0x340E4 : ui32, value = 0x2D2C1A4F : ui32 }
+aiex.npu.write32 { column = 0 : i32, row = 4 : i32, address = 0x340E4 : ui32, value = 0x2D2C1A4F : ui32 }
 ```
 in Python
 ```python
 # events=[0x4B, 0x22, 0x21, 0x25, 0x2D, 0x2C, 0x1A, 0x4F]
-ipu_write32(column=0, row=4, address=0x340E0, value=*events[0:4],)
-ipu_write32(column=0, row=4, address=0x340E4, value=*events[4:8],)
+npu_write32(column=0, row=4, address=0x340E0, value=pack4bytes(*events[0:4]),)
+npu_write32(column=0, row=4, address=0x340E4, value=pack4bytes(*events[4:8]),)
 ```
 
 Some configurations like the Port Running 0/1 events are further configured by a secondary configuration register. In this case, we route the port activity from the stream switch to Port running 0 or 1. 
@@ -204,7 +204,7 @@ in C/C++
 // Stream_Switch_Event_Port_Selection_0
 // This is necessary to capture the Port_Running_0 and Port_Running_1 events
 // Port 0 - Master/ID=1, Port 1 - Slave/ID=1
-aiex.ipu.write32 { column = 0 : i32, row = 4 : i32, address = 0x3FF00 : ui32, value = 0x121 : ui32 }
+aiex.npu.write32 { column = 0 : i32, row = 4 : i32, address = 0x3FF00 : ui32, value = 0x121 : ui32 }
 ```
 in Python
 ```python
@@ -214,8 +214,8 @@ in Python
 # def slave(port):
 #     return port
 
-ipu_write32(column=0, row=4, address=0x3FF00, value=pack4bytes(0, 0, slave(1), master(1)),)  # port 1 is FIFO0?
-ipu_write32(column=0, row=4, address=0x3FF04, value=pack4bytes(0, 0, 0, 0),)
+npu_write32(column=0, row=4, address=0x3FF00, value=pack4bytes(0, 0, slave(1), master(1)),)  # port 1 is FIFO0?
+npu_write32(column=0, row=4, address=0x3FF04, value=pack4bytes(0, 0, 0, 0),)
 ```
 
 ### <u>Configure shimDMA</u>
@@ -239,7 +239,7 @@ An example ddr_id to inout buffer mapping is below:
 
 in C/C++
 ```c++
-aiex.ipu.writebd_shimtile { bd_id = 3 : i32,
+aiex.npu.writebd_shimtile { bd_id = 3 : i32,
                             buffer_length = 16384 : i32,
                             buffer_offset = 262144 : i32,
                             enable_packet = 0 : i32,
@@ -274,11 +274,11 @@ aiex.ipu.writebd_shimtile { bd_id = 3 : i32,
                             use_next_bd = 0 : i32,
                             valid_bd = 1 : i32}
 // Set start BD to our shim bd_Id (3)
-aiex.ipu.write32 { column = 0 : i32, row = 0 : i32, address = 0x1D20C : ui32, value = 0x3 : ui32 }
+aiex.npu.write32 { column = 0 : i32, row = 0 : i32, address = 0x1D20C : ui32, value = 0x3 : ui32 }
 ```
 in Python
 ```python
-ipu_writebd_shimtile(
+npu_writebd_shimtile(
     bd_id=3,
     buffer_length=16384,
     buffer_offset=262144,
diff --git a/python/utils/trace.py b/python/utils/trace.py
index 2f60b587da..510d7feae7 100644
--- a/python/utils/trace.py
+++ b/python/utils/trace.py
@@ -93,7 +93,7 @@ def configure_simple_tracing_aie2(
     #              BB      <- Event to start trace capture
     #                   C  <- Trace mode, 00=event=time, 01=event-PC, 10=execution
     # Configure so that "Event 1" (always true) causes tracing to start
-    ipu_write32(
+    npu_write32(
         column=int(tile.col),
         row=int(tile.row),
         address=0x340D0,
@@ -102,7 +102,7 @@ def configure_simple_tracing_aie2(
     # 0x340D4: Trace Control 1
     # This is used to control packet routing.  For the moment
     # only deal with the simple case of circuit routing.
-    ipu_write32(
+    npu_write32(
         column=int(tile.col),
         row=int(tile.row),
         address=0x340D4,
@@ -110,7 +110,7 @@ def configure_simple_tracing_aie2(
     )
     # 0x340E0: Trace Event Group 1  (Which events to trace)
     #          0xAABBCCDD    AA, BB, CC, DD <- four event slots
-    ipu_write32(
+    npu_write32(
         column=int(tile.col),
         row=int(tile.row),
         address=0x340E0,
@@ -118,7 +118,7 @@ def configure_simple_tracing_aie2(
     )
     # 0x340E4: Trace Event Group 2  (Which events to trace)
     #          0xAABBCCDD    AA, BB, CC, DD <- four event slots
-    ipu_write32(
+    npu_write32(
         column=int(tile.col),
         row=int(tile.row),
         address=0x340E4,
@@ -132,13 +132,13 @@ def master(port):
     def slave(port):
         return port
 
-    ipu_write32(
+    npu_write32(
         column=int(tile.col),
         row=int(tile.row),
         address=0x3FF00,
         value=pack4bytes(0, 0, slave(1), master(1)),  # port 1 is FIFO0?
     )
-    ipu_write32(
+    npu_write32(
         column=int(tile.col),
         row=int(tile.row),
         address=0x3FF04,
@@ -147,7 +147,7 @@ def slave(port):
 
     # Configure a buffer descriptor to write tracing information that has been routed into this shim tile
     # out to host DDR memory
-    ipu_writebd_shimtile(
+    npu_writebd_shimtile(
         bd_id=bd_id,
         buffer_length=size,
         buffer_offset=offset,
@@ -176,7 +176,7 @@ def slave(port):
         valid_bd=1,
     )
     # configure S2MM channel
-    ipu_write32(
+    npu_write32(
         column=int(shim.col),
         row=int(shim.row),
         address=0x1D204 if channel == 0 else 0x1D20C,
diff --git a/test/Conversion/DmaToIpu/aiert_insts.mlir b/test/Conversion/DmaToNpu/aiert_insts.mlir
similarity index 83%
rename from test/Conversion/DmaToIpu/aiert_insts.mlir
rename to test/Conversion/DmaToNpu/aiert_insts.mlir
index ce82a1443e..bfcbe334ee 100644
--- a/test/Conversion/DmaToIpu/aiert_insts.mlir
+++ b/test/Conversion/DmaToNpu/aiert_insts.mlir
@@ -6,14 +6,14 @@
 //
 //===----------------------------------------------------------------------===//
 
-// RUN: aie-opt --aie-dma-to-ipu %s | FileCheck %s
-// CHECK: aiex.ipu.writebd_shimtile {bd_id = 1 : i32, buffer_length = 32 : i32, buffer_offset = 0 : i32, column = 0 : i32, column_num = 1 : i32, d0_size = 0 : i32, d0_stride = 0 : i32, d1_size = 0 : i32, d1_stride = 0 : i32, d2_stride = 0 : i32, ddr_id = 2 : i32, enable_packet = 0 : i32, iteration_current = 0 : i32, iteration_size = 0 : i32, iteration_stride = 0 : i32, lock_acq_enable = 0 : i32, lock_acq_id = 0 : i32, lock_acq_val = 0 : i32, lock_rel_id = 0 : i32, lock_rel_val = 0 : i32, next_bd = 0 : i32, out_of_order_id = 0 : i32, packet_id = 0 : i32, packet_type = 0 : i32, use_next_bd = 0 : i32, valid_bd = 1 : i32}
-// CHECK: aiex.ipu.write32 {address = 119300 : ui32, column = 0 : i32, row = 0 : i32, value = 2147483649 : ui32}
-// CHECK: aiex.ipu.writebd_shimtile {bd_id = 0 : i32, buffer_length = 32 : i32, buffer_offset = 128 : i32, column = 0 : i32, column_num = 1 : i32, d0_size = 8 : i32, d0_stride = 0 : i32, d1_size = 2 : i32, d1_stride = 7 : i32, d2_stride = 15 : i32, ddr_id = 0 : i32, enable_packet = 0 : i32, iteration_current = 0 : i32, iteration_size = 0 : i32, iteration_stride = 0 : i32, lock_acq_enable = 0 : i32, lock_acq_id = 0 : i32, lock_acq_val = 0 : i32, lock_rel_id = 0 : i32, lock_rel_val = 0 : i32, next_bd = 0 : i32, out_of_order_id = 0 : i32, packet_id = 0 : i32, packet_type = 0 : i32, use_next_bd = 0 : i32, valid_bd = 1 : i32}
-// CHECK: aiex.ipu.write32 {address = 119316 : ui32, column = 0 : i32, row = 0 : i32, value = 0 : ui32}
+// RUN: aie-opt --aie-dma-to-npu %s | FileCheck %s
+// CHECK: aiex.npu.writebd_shimtile {bd_id = 1 : i32, buffer_length = 32 : i32, buffer_offset = 0 : i32, column = 0 : i32, column_num = 1 : i32, d0_size = 0 : i32, d0_stride = 0 : i32, d1_size = 0 : i32, d1_stride = 0 : i32, d2_stride = 0 : i32, ddr_id = 2 : i32, enable_packet = 0 : i32, iteration_current = 0 : i32, iteration_size = 0 : i32, iteration_stride = 0 : i32, lock_acq_enable = 0 : i32, lock_acq_id = 0 : i32, lock_acq_val = 0 : i32, lock_rel_id = 0 : i32, lock_rel_val = 0 : i32, next_bd = 0 : i32, out_of_order_id = 0 : i32, packet_id = 0 : i32, packet_type = 0 : i32, use_next_bd = 0 : i32, valid_bd = 1 : i32}
+// CHECK: aiex.npu.write32 {address = 119300 : ui32, column = 0 : i32, row = 0 : i32, value = 2147483649 : ui32}
+// CHECK: aiex.npu.writebd_shimtile {bd_id = 0 : i32, buffer_length = 32 : i32, buffer_offset = 128 : i32, column = 0 : i32, column_num = 1 : i32, d0_size = 8 : i32, d0_stride = 0 : i32, d1_size = 2 : i32, d1_stride = 7 : i32, d2_stride = 15 : i32, ddr_id = 0 : i32, enable_packet = 0 : i32, iteration_current = 0 : i32, iteration_size = 0 : i32, iteration_stride = 0 : i32, lock_acq_enable = 0 : i32, lock_acq_id = 0 : i32, lock_acq_val = 0 : i32, lock_rel_id = 0 : i32, lock_rel_val = 0 : i32, next_bd = 0 : i32, out_of_order_id = 0 : i32, packet_id = 0 : i32, packet_type = 0 : i32, use_next_bd = 0 : i32, valid_bd = 1 : i32}
+// CHECK: aiex.npu.write32 {address = 119316 : ui32, column = 0 : i32, row = 0 : i32, value = 0 : ui32}
 
 module {
-  aie.device(ipu) {
+  aie.device(npu) {
     memref.global "public" @of_toMem : memref<32xi32>
     memref.global "public" @of_fromMem : memref<32xi32>
     func.func @sequence(%in : memref<4x2x8xi32>, %buf : memref<32xi32>, %out : memref<64xi32>) {
@@ -24,8 +24,8 @@ module {
       %c8 = arith.constant 8 : i64
       %c16 = arith.constant 16 : i64
       %c32 = arith.constant 32 : i64
-      aiex.ipu.dma_memcpy_nd(0, 0, %out[%c0,%c0,%c0,%c0][%c1,%c1,%c1,%c32][%c0,%c0,%c0]) { metadata = @of_toMem, id = 1 : i64 } : memref<64xi32>
-      aiex.ipu.dma_memcpy_nd(0, 0, %in[%c0,%c2,%c0,%c0][%c1,%c2,%c2,%c8][%c0,%c16,%c8]) { metadata = @of_fromMem, id = 0 : i64 } : memref<4x2x8xi32>
+      aiex.npu.dma_memcpy_nd(0, 0, %out[%c0,%c0,%c0,%c0][%c1,%c1,%c1,%c32][%c0,%c0,%c0]) { metadata = @of_toMem, id = 1 : i64 } : memref<64xi32>
+      aiex.npu.dma_memcpy_nd(0, 0, %in[%c0,%c2,%c0,%c0][%c1,%c2,%c2,%c8][%c0,%c16,%c8]) { metadata = @of_fromMem, id = 0 : i64 } : memref<4x2x8xi32>
       return
     }
     aie.shim_dma_allocation @of_fromMem (MM2S, 0, 0)
diff --git a/test/Conversion/DmaToIpu/bad_rtp_write.mlir b/test/Conversion/DmaToNpu/bad_rtp_write.mlir
similarity index 62%
rename from test/Conversion/DmaToIpu/bad_rtp_write.mlir
rename to test/Conversion/DmaToNpu/bad_rtp_write.mlir
index a28466af13..746df29273 100644
--- a/test/Conversion/DmaToIpu/bad_rtp_write.mlir
+++ b/test/Conversion/DmaToNpu/bad_rtp_write.mlir
@@ -6,13 +6,13 @@
 //
 //===----------------------------------------------------------------------===//
 
-// RUN: aie-opt --aie-dma-to-ipu -verify-diagnostics %s
+// RUN: aie-opt --aie-dma-to-npu -verify-diagnostics %s
 
-aie.device(ipu) {
+aie.device(npu) {
   func.func @sequence() {
-    // expected-error@+2 {{'aiex.ipu.rtp_write' op RTP buffer address cannot be found. Has an RTP buffer been allocated?}}
-    // expected-error@+1 {{failed to legalize operation 'aiex.ipu.rtp_write' that was explicitly marked illegal}}
-    aiex.ipu.rtp_write(0, 2, 4, 99) { buffer_sym_name = "RTP" }
+    // expected-error@+2 {{'aiex.npu.rtp_write' op RTP buffer address cannot be found. Has an RTP buffer been allocated?}}
+    // expected-error@+1 {{failed to legalize operation 'aiex.npu.rtp_write' that was explicitly marked illegal}}
+    aiex.npu.rtp_write(0, 2, 4, 99) { buffer_sym_name = "RTP" }
     return
   }
 }
diff --git a/test/Conversion/DmaToIpu/dma_to_ipu.mlir b/test/Conversion/DmaToNpu/dma_to_npu.mlir
similarity index 73%
rename from test/Conversion/DmaToIpu/dma_to_ipu.mlir
rename to test/Conversion/DmaToNpu/dma_to_npu.mlir
index d86de2acce..059766fe7c 100644
--- a/test/Conversion/DmaToIpu/dma_to_ipu.mlir
+++ b/test/Conversion/DmaToNpu/dma_to_npu.mlir
@@ -1,4 +1,4 @@
-//===- dma_to_ipu.mlir -----------------------------------------*- MLIR -*-===//
+//===- dma_to_npu.mlir -----------------------------------------*- MLIR -*-===//
 //
 // This file is licensed under the Apache License v2.0 with LLVM Exceptions.
 // See https://llvm.org/LICENSE.txt for license information.
@@ -8,22 +8,22 @@
 //
 //===----------------------------------------------------------------------===//
 
-// RUN: aie-opt --split-input-file -aie-dma-to-ipu %s | FileCheck %s
+// RUN: aie-opt --split-input-file -aie-dma-to-npu %s | FileCheck %s
 
 // TODO - more
 // CHECK-LABEL: dma_memcpy_nd_0
-// CHECK: aiex.ipu.writebd_shimtile
+// CHECK: aiex.npu.writebd_shimtile
 // CHECK-SAME: ddr_id = 0 : i32
 // CHECK-SAME: valid_bd = 1 : i32
-// CHECK: aiex.ipu.writebd_shimtile
+// CHECK: aiex.npu.writebd_shimtile
 // CHECK-SAME: ddr_id = 1 : i32
 module  {
-  aie.device(ipu) {
+  aie.device(npu) {
     memref.global "public" @toMem : memref<16xi32>
     memref.global "public" @fromMem : memref<16xi32>
     func.func @dma_memcpy_nd_0(%arg0: memref<16xi32>, %arg1: memref<16xi32>) {
-      aiex.ipu.dma_memcpy_nd(0, 0, %arg0[0, 0, 0, 0][1, 1, 16, 16][0, 0, 64]) { metadata = @toMem, id = 1 : i64 } : memref<16xi32>
-      aiex.ipu.dma_memcpy_nd(0, 1, %arg1[0, 0, 0, 16][1, 1, 16, 16][0, 0, 64]) { metadata = @fromMem, id = 0 : i64 } : memref<16xi32>
+      aiex.npu.dma_memcpy_nd(0, 0, %arg0[0, 0, 0, 0][1, 1, 16, 16][0, 0, 64]) { metadata = @toMem, id = 1 : i64 } : memref<16xi32>
+      aiex.npu.dma_memcpy_nd(0, 1, %arg1[0, 0, 0, 16][1, 1, 16, 16][0, 0, 64]) { metadata = @fromMem, id = 0 : i64 } : memref<16xi32>
       return
     }
     aie.shim_dma_allocation @fromMem (MM2S, 0, 0)
@@ -34,11 +34,11 @@ module  {
 // -----
 
 // CHECK-LABEL: dma_wait_s2mm
-// CHECK: aiex.ipu.writebd_shimtile
+// CHECK: aiex.npu.writebd_shimtile
 // CHECK-SAME: ddr_id = 0 : i32
 // CHECK-SAME: valid_bd = 1 : i32
-// CHECK: aiex.ipu.write32
-// CHECK: aiex.ipu.sync 
+// CHECK: aiex.npu.write32
+// CHECK: aiex.npu.sync 
 // CHECK-SAME: channel = 0 : i32
 // CHECK-SAME: column = 0 : i32
 // CHECK-SAME: column_num = 1 : i32
@@ -46,11 +46,11 @@ module  {
 // CHECK-SAME: row = 0 : i32
 // CHECK-SAME: row_num = 1 : i32
 module  {
-  aie.device(ipu) {
+  aie.device(npu) {
     memref.global "public" @toMem : memref<16xi32>
     func.func @dma_wait_s2mm(%arg0: memref<16xi32>, %arg1: memref<16xi32>) {
-      aiex.ipu.dma_memcpy_nd(0, 0, %arg0[0, 0, 0, 0][1, 1, 16, 16][0, 0, 64]) { metadata = @toMem, id = 1 : i64 } : memref<16xi32>
-      aiex.ipu.dma_wait {symbol = @toMem}
+      aiex.npu.dma_memcpy_nd(0, 0, %arg0[0, 0, 0, 0][1, 1, 16, 16][0, 0, 64]) { metadata = @toMem, id = 1 : i64 } : memref<16xi32>
+      aiex.npu.dma_wait {symbol = @toMem}
       return
     }
     aie.shim_dma_allocation @toMem (S2MM, 0, 0)
@@ -60,11 +60,11 @@ module  {
 // -----
 
 // CHECK-LABEL: dma_wait_mm2s
-// CHECK: aiex.ipu.writebd_shimtile
+// CHECK: aiex.npu.writebd_shimtile
 // CHECK-SAME: ddr_id = 0 : i32
 // CHECK-SAME: valid_bd = 1 : i32
-// CHECK: aiex.ipu.write32
-// CHECK: aiex.ipu.sync 
+// CHECK: aiex.npu.write32
+// CHECK: aiex.npu.sync 
 // CHECK-SAME: channel = 1 : i32
 // CHECK-SAME: column = 1 : i32
 // CHECK-SAME: column_num = 1 : i32
@@ -72,11 +72,11 @@ module  {
 // CHECK-SAME: row = 0 : i32
 // CHECK-SAME: row_num = 1 : i32
 module  {
-  aie.device(ipu) {
+  aie.device(npu) {
     memref.global "public" @toMem : memref<16xi32>
     func.func @dma_wait_mm2s(%arg0: memref<16xi32>, %arg1: memref<16xi32>) {
-      aiex.ipu.dma_memcpy_nd(0, 0, %arg0[0, 0, 0, 0][1, 1, 16, 16][0, 0, 64]) { metadata = @toMem, id = 1 : i64 } : memref<16xi32>
-      aiex.ipu.dma_wait {symbol = @toMem}
+      aiex.npu.dma_memcpy_nd(0, 0, %arg0[0, 0, 0, 0][1, 1, 16, 16][0, 0, 64]) { metadata = @toMem, id = 1 : i64 } : memref<16xi32>
+      aiex.npu.dma_wait {symbol = @toMem}
       return
     }
     aie.shim_dma_allocation @toMem (MM2S, 1, 1)
diff --git a/test/Conversion/DmaToIpu/dma_to_ipu_invalid.mlir b/test/Conversion/DmaToNpu/dma_to_npu_invalid.mlir
similarity index 71%
rename from test/Conversion/DmaToIpu/dma_to_ipu_invalid.mlir
rename to test/Conversion/DmaToNpu/dma_to_npu_invalid.mlir
index 89eff26d44..31ed2ed019 100644
--- a/test/Conversion/DmaToIpu/dma_to_ipu_invalid.mlir
+++ b/test/Conversion/DmaToNpu/dma_to_npu_invalid.mlir
@@ -1,4 +1,4 @@
-//===- dma_to_ipu_invalid.mlir ---------------------------------*- MLIR -*-===//
+//===- dma_to_npu_invalid.mlir ---------------------------------*- MLIR -*-===//
 //
 // This file is licensed under the Apache License v2.0 with LLVM Exceptions.
 // See https://llvm.org/LICENSE.txt for license information.
@@ -8,15 +8,15 @@
 //
 //===----------------------------------------------------------------------===//
 
-// RUN: aie-opt --split-input-file --aie-dma-to-ipu --verify-diagnostics %s
+// RUN: aie-opt --split-input-file --aie-dma-to-npu --verify-diagnostics %s
 
 module  {
-  aie.device(ipu) {
+  aie.device(npu) {
     memref.global "public" @toMem : memref<16xi32>
     func.func @sequence() {
-      // expected-error@+2 {{failed to legalize operation 'aiex.ipu.dma_wait' that was explicitly marked illegal}}
+      // expected-error@+2 {{failed to legalize operation 'aiex.npu.dma_wait' that was explicitly marked illegal}}
       // expected-error@+1 {{couldn't find shim_dma_allocation op}}
-      aiex.ipu.dma_wait {symbol = @toMem}
+      aiex.npu.dma_wait {symbol = @toMem}
       return
     }
   }
diff --git a/test/Conversion/DmaToIpu/dma_to_ipu_issue_token.mlir b/test/Conversion/DmaToNpu/dma_to_npu_issue_token.mlir
similarity index 72%
rename from test/Conversion/DmaToIpu/dma_to_ipu_issue_token.mlir
rename to test/Conversion/DmaToNpu/dma_to_npu_issue_token.mlir
index 4eb5b02bdc..d73195973b 100644
--- a/test/Conversion/DmaToIpu/dma_to_ipu_issue_token.mlir
+++ b/test/Conversion/DmaToNpu/dma_to_npu_issue_token.mlir
@@ -1,4 +1,4 @@
-//===- dma_to_ipu_issue_token.mlir -----------------------------*- MLIR -*-===//
+//===- dma_to_npu_issue_token.mlir -----------------------------*- MLIR -*-===//
 //
 // This file is licensed under the Apache License v2.0 with LLVM Exceptions.
 // See https://llvm.org/LICENSE.txt for license information.
@@ -8,26 +8,26 @@
 //
 //===----------------------------------------------------------------------===//
 
-// RUN: aie-opt -aie-dma-to-ipu %s | FileCheck %s
+// RUN: aie-opt -aie-dma-to-npu %s | FileCheck %s
 
 // TODO - more
 // CHECK-LABEL: test1
-// CHECK: aiex.ipu.writebd_shimtile
+// CHECK: aiex.npu.writebd_shimtile
 // CHECK-SAME: ddr_id = 0 : i32
 // CHECK-SAME: valid_bd = 1 : i32
-// CHECK: aiex.ipu.write32
+// CHECK: aiex.npu.write32
 // CHECK-SAME: value = 2147483649
-// CHECK: aiex.ipu.writebd_shimtile
+// CHECK: aiex.npu.writebd_shimtile
 // CHECK-SAME: ddr_id = 1 : i32
-// CHECK: aiex.ipu.write32
+// CHECK: aiex.npu.write32
 // CHECK-SAME: value = 0
 module  {
-  aie.device(ipu) {
+  aie.device(npu) {
     memref.global "public" @toMem : memref<16xi32>
     memref.global "public" @fromMem : memref<16xi32>
     func.func @test1(%arg0: memref<16xi32>, %arg1: memref<16xi32>) {
-        aiex.ipu.dma_memcpy_nd(0, 0, %arg0[0, 0, 0, 0][1, 1, 16, 16][0, 0, 64]) { metadata = @toMem, id = 1 : i64, issue_token = true } : memref<16xi32>
-        aiex.ipu.dma_memcpy_nd(0, 1, %arg1[0, 0, 0, 16][1, 1, 16, 16][0, 0, 64]) { metadata = @fromMem, id = 0 : i64 } : memref<16xi32>
+        aiex.npu.dma_memcpy_nd(0, 0, %arg0[0, 0, 0, 0][1, 1, 16, 16][0, 0, 64]) { metadata = @toMem, id = 1 : i64, issue_token = true } : memref<16xi32>
+        aiex.npu.dma_memcpy_nd(0, 1, %arg1[0, 0, 0, 16][1, 1, 16, 16][0, 0, 64]) { metadata = @fromMem, id = 0 : i64 } : memref<16xi32>
         return
     }
     aie.shim_dma_allocation @fromMem (MM2S, 0, 0)
diff --git a/test/Conversion/DmaToIpu/push_to_queue.mlir b/test/Conversion/DmaToNpu/push_to_queue.mlir
similarity index 69%
rename from test/Conversion/DmaToIpu/push_to_queue.mlir
rename to test/Conversion/DmaToNpu/push_to_queue.mlir
index 841d9e7a0f..4c45c90e1d 100644
--- a/test/Conversion/DmaToIpu/push_to_queue.mlir
+++ b/test/Conversion/DmaToNpu/push_to_queue.mlir
@@ -6,17 +6,17 @@
 //
 //===----------------------------------------------------------------------===//
 
-// RUN: aie-opt --aie-dma-to-ipu %s | FileCheck %s
-// CHECK: aiex.ipu.write32 {address = 119308 : ui32, column = 0 : i32, row = 0 : i32, value = 2147483651 : ui32}
-// CHECK: aiex.ipu.write32 {address = 119316 : ui32, column = 2 : i32, row = 0 : i32, value = 196610 : ui32}
+// RUN: aie-opt --aie-dma-to-npu %s | FileCheck %s
+// CHECK: aiex.npu.write32 {address = 119308 : ui32, column = 0 : i32, row = 0 : i32, value = 2147483651 : ui32}
+// CHECK: aiex.npu.write32 {address = 119316 : ui32, column = 2 : i32, row = 0 : i32, value = 196610 : ui32}
 
 module {
-  aie.device(ipu) {
+  aie.device(npu) {
     memref.global "public" @toMem : memref<32xi32>
     memref.global "public" @fromMem : memref<32xi32>
     func.func @sequence() {
-      aiex.ipu.shimtile_push_queue {metadata = @toMem, issue_token = true, repeat_count = 0 : i32, bd_id = 3 : i32 }
-      aiex.ipu.shimtile_push_queue {metadata = @fromMem, issue_token = false, repeat_count = 3 : i32, bd_id = 2 : i32 }
+      aiex.npu.shimtile_push_queue {metadata = @toMem, issue_token = true, repeat_count = 0 : i32, bd_id = 3 : i32 }
+      aiex.npu.shimtile_push_queue {metadata = @fromMem, issue_token = false, repeat_count = 3 : i32, bd_id = 2 : i32 }
       return
     }
     aie.shim_dma_allocation @fromMem (MM2S, 0, 2)
diff --git a/test/Conversion/DmaToIpu/rtp_write.mlir b/test/Conversion/DmaToNpu/rtp_write.mlir
similarity index 66%
rename from test/Conversion/DmaToIpu/rtp_write.mlir
rename to test/Conversion/DmaToNpu/rtp_write.mlir
index 9aba5ad4e7..26f2876b95 100644
--- a/test/Conversion/DmaToIpu/rtp_write.mlir
+++ b/test/Conversion/DmaToNpu/rtp_write.mlir
@@ -6,19 +6,19 @@
 //
 //===----------------------------------------------------------------------===//
 
-// RUN: aie-opt --aie-dma-to-ipu %s | FileCheck %s
-// CHECK: aiex.ipu.write32 {address = 1536 : ui32, column = 2 : i32, row = 3 : i32, value = 50 : ui32}
-// CHECK: aiex.ipu.write32 {address = 3216 : ui32, column = 0 : i32, row = 2 : i32, value = 99 : ui32}
+// RUN: aie-opt --aie-dma-to-npu %s | FileCheck %s
+// CHECK: aiex.npu.write32 {address = 1536 : ui32, column = 2 : i32, row = 3 : i32, value = 50 : ui32}
+// CHECK: aiex.npu.write32 {address = 3216 : ui32, column = 0 : i32, row = 2 : i32, value = 99 : ui32}
 
 module {
-  aie.device(ipu) {
+  aie.device(npu) {
     %0 = aie.tile(2, 3)
     %1 = aie.buffer(%0) {address = 1536 : i32, sym_name = "rtp"} : memref<16xi32>
     %2 = aie.tile(0, 2)
     %3 = aie.buffer(%2) {address = 3200 : i32, sym_name = "RTP"} : memref<16xi32>
     func.func @sequence() {
-      aiex.ipu.rtp_write(2, 3, 0, 50) { buffer_sym_name = "rtp" }
-      aiex.ipu.rtp_write(0, 2, 4, 99) { buffer_sym_name = "RTP" }
+      aiex.npu.rtp_write(2, 3, 0, 50) { buffer_sym_name = "rtp" }
+      aiex.npu.rtp_write(0, 2, 4, 99) { buffer_sym_name = "RTP" }
       return
     }
   }
diff --git a/test/Passes/assign-bd-ids/bad_bd_assignments.mlir b/test/Passes/assign-bd-ids/bad_bd_assignments.mlir
index 71ab96951f..9ab8036f48 100644
--- a/test/Passes/assign-bd-ids/bad_bd_assignments.mlir
+++ b/test/Passes/assign-bd-ids/bad_bd_assignments.mlir
@@ -11,7 +11,7 @@
 // RUN: aie-opt --verify-diagnostics --split-input-file %s
 
 module {
-  aie.device(ipu) {
+  aie.device(npu) {
     %tile_0_2 = aie.tile(0, 2)
     %double_buffer = aie.buffer(%tile_0_2) : memref<32xi32>
     %lock_Y = aie.lock(%tile_0_2) {init = 0 : i32}
@@ -30,7 +30,7 @@ module {
 // -----
 
 module {
-  aie.device(ipu) {
+  aie.device(npu) {
     %tile_0_2 = aie.tile(0, 2)
     %double_buffer = aie.buffer(%tile_0_2) : memref<32xi32>
     %lock_X = aie.lock(%tile_0_2) {init = 0 : i32}
@@ -49,7 +49,7 @@ module {
 // -----
 
 module {
-  aie.device(ipu) {
+  aie.device(npu) {
     %tile_0_1 = aie.tile(0, 1)
     %buffer_0_1 = aie.buffer(%tile_0_1) : memref<32xi32>
     %memtile_dma_0_1 = aie.memtile_dma(%tile_0_1) {
@@ -69,7 +69,7 @@ module {
 // -----
 
 module {
-  aie.device(ipu) {
+  aie.device(npu) {
     %tile_0_1 = aie.tile(0, 1)
     %memtile_dma_0_1 = aie.memtile_dma(%tile_0_1) {
       %lock_0_1 = aie.lock(%tile_0_1) {init = 1 : i32}
@@ -90,7 +90,7 @@ module {
 // -----
 
 module {
-  aie.device(ipu) {
+  aie.device(npu) {
     %tile_0_1 = aie.tile(0, 1)
     %memtile_dma_0_1 = aie.memtile_dma(%tile_0_1) {
       %lock_0_1 = aie.lock(%tile_0_1) {init = 1 : i32}
@@ -110,7 +110,7 @@ module {
 // -----
 
 module {
-  aie.device(ipu) {
+  aie.device(npu) {
     %tile_0_1 = aie.tile(0, 1)
     %memtile_dma_0_1 = aie.memtile_dma(%tile_0_1) {
       %lock_0_1 = aie.lock(%tile_0_1) {init = 1 : i32}
diff --git a/test/Passes/assign-bd-ids/basic.mlir b/test/Passes/assign-bd-ids/basic.mlir
index b306c3053e..8862c5c09d 100644
--- a/test/Passes/assign-bd-ids/basic.mlir
+++ b/test/Passes/assign-bd-ids/basic.mlir
@@ -10,7 +10,7 @@
 
 // RUN: aie-opt --aie-assign-bd-ids --split-input-file %s | FileCheck %s
 
-// CHECK-LABEL:   aie.device(ipu) {
+// CHECK-LABEL:   aie.device(npu) {
 // CHECK:  %[[VAL_0:.*]] = aie.tile(0, 0)
 // CHECK:  %[[VAL_1:.*]] = aie.tile(0, 1)
 // CHECK:  %[[VAL_2:.*]] = aie.tile(0, 2)
@@ -30,7 +30,7 @@
 // CHECK:  aie.dma_bd(%[[VAL_4]] : memref<32xi32>) {bd_id = 25 : i32}
 
 module {
-  aie.device(ipu) {
+  aie.device(npu) {
     %tile_0_0 = aie.tile(0, 0)
     %tile_0_1 = aie.tile(0, 1)
     %tile_0_2 = aie.tile(0, 2)
diff --git a/test/Passes/assign-bd-ids/user_assigned.mlir b/test/Passes/assign-bd-ids/user_assigned.mlir
index 777c07f7b8..c41d3aa7d3 100644
--- a/test/Passes/assign-bd-ids/user_assigned.mlir
+++ b/test/Passes/assign-bd-ids/user_assigned.mlir
@@ -10,7 +10,7 @@
 
 // RUN: aie-opt --aie-assign-bd-ids --split-input-file %s | FileCheck %s
 
-// CHECK-LABEL:   aie.device(ipu) {
+// CHECK-LABEL:   aie.device(npu) {
 // CHECK:  %[[VAL_0:.*]] = aie.tile(0, 0)
 // CHECK:  %[[VAL_1:.*]] = aie.tile(0, 1)
 // CHECK:  %[[VAL_2:.*]] = aie.tile(0, 2)
@@ -28,7 +28,7 @@
 // CHECK:  aie.dma_bd(%[[VAL_4]] : memref<32xi32>) {bd_id = 25 : i32}
 
 module {
-  aie.device(ipu) {
+  aie.device(npu) {
     %tile_0_0 = aie.tile(0, 0)
     %tile_0_1 = aie.tile(0, 1)
     %tile_0_2 = aie.tile(0, 2)
@@ -153,7 +153,7 @@ module @aie_module  {
 
 // -----
 
-// CHECK-LABEL:   aie.device(ipu) {
+// CHECK-LABEL:   aie.device(npu) {
 // CHECK:  %[[VAL_0:.*]] = aie.tile(0, 0)
 // CHECK:  %[[VAL_1:.*]] = aie.tile(0, 1)
 // CHECK:  %[[VAL_2:.*]] = aie.tile(0, 2)
@@ -171,7 +171,7 @@ module @aie_module  {
 // CHECK:  aie.dma_bd(%[[VAL_4]] : memref<32xi32>) {bd_id = 25 : i32}
 
 module {
-  aie.device(ipu) {
+  aie.device(npu) {
     %tile_0_0 = aie.tile(0, 0)
     %tile_0_1 = aie.tile(0, 1)
     %tile_0_2 = aie.tile(0, 2)
diff --git a/test/Targets/AIETargetHSA/input_with_addresses.mlir b/test/Targets/AIETargetHSA/input_with_addresses.mlir
index 1cf762054b..1efd284c53 100644
--- a/test/Targets/AIETargetHSA/input_with_addresses.mlir
+++ b/test/Targets/AIETargetHSA/input_with_addresses.mlir
@@ -46,9 +46,9 @@ module {
     aie.shim_dma_allocation @out0(S2MM, 0, 6)
 
     func.func @sequence(%arg0: memref<64xi32>, %arg1: memref<32xi32>, %arg2: memref<64xi32>) {
-      aiex.ipu.dma_memcpy_nd(0, 0, %arg2[0, 0, 0, 0][1, 1, 1, 64][0, 0, 0]) {id = 0 : i64, metadata = @out0} : memref<64xi32>
-      aiex.ipu.dma_memcpy_nd(0, 0, %arg0[0, 0, 0, 0][1, 1, 1, 64][0, 0, 0]) {id = 1 : i64, metadata = @in0} : memref<64xi32>
-      aiex.ipu.sync {channel = 0 : i32, column = 0 : i32, column_num = 1 : i32, direction = 0 : i32, row = 0 : i32, row_num = 1 : i32}
+      aiex.npu.dma_memcpy_nd(0, 0, %arg2[0, 0, 0, 0][1, 1, 1, 64][0, 0, 0]) {id = 0 : i64, metadata = @out0} : memref<64xi32>
+      aiex.npu.dma_memcpy_nd(0, 0, %arg0[0, 0, 0, 0][1, 1, 1, 64][0, 0, 0]) {id = 1 : i64, metadata = @in0} : memref<64xi32>
+      aiex.npu.sync {channel = 0 : i32, column = 0 : i32, column_num = 1 : i32, direction = 0 : i32, row = 0 : i32, row_num = 1 : i32}
       return
     }
   }
diff --git a/test/Targets/IPU/ipu_instgen.mlir b/test/Targets/NPU/npu_instgen.mlir
similarity index 90%
rename from test/Targets/IPU/ipu_instgen.mlir
rename to test/Targets/NPU/npu_instgen.mlir
index d1c2ef3c6a..4fd9636197 100644
--- a/test/Targets/IPU/ipu_instgen.mlir
+++ b/test/Targets/NPU/npu_instgen.mlir
@@ -1,4 +1,4 @@
-//===- ipu_instgen.mlir ----------------------------------------*- MLIR -*-===//
+//===- npu_instgen.mlir ----------------------------------------*- MLIR -*-===//
 //
 // This file is licensed under the Apache License v2.0 with LLVM Exceptions.
 // See https://llvm.org/LICENSE.txt for license information.
@@ -8,9 +8,9 @@
 //
 //===----------------------------------------------------------------------===//
 
-// RUN: aie-translate --aie-ipu-instgen %s | FileCheck %s
+// RUN: aie-translate --aie-npu-instgen %s | FileCheck %s
 module {
-  aie.device(ipu) {
+  aie.device(npu) {
     func.func @test0(%arg0: memref<16xf32>, %arg1: memref<16xf32>) {
 
       // look for the prolog.
@@ -48,7 +48,7 @@ module {
       // CHECK: 00000009
       // CHECK: 2CD0000C
       // CHECK: 2E107041
-      aiex.ipu.writebd_shimtile { bd_id = 6 : i32,
+      aiex.npu.writebd_shimtile { bd_id = 6 : i32,
                                   buffer_length = 1 : i32,
                                   buffer_offset = 2 : i32,
                                   enable_packet = 0 : i32,
@@ -77,10 +77,10 @@ module {
       // CHECK: 02030400
       // CHECK: ABC00DEF
       // CHECK: 00000042
-      aiex.ipu.write32 { column = 3 : i32, row = 4 : i32, address = 0xabc00def : ui32, value = 0x42 : ui32 }
+      aiex.npu.write32 { column = 3 : i32, row = 4 : i32, address = 0xabc00def : ui32, value = 0x42 : ui32 }
       // CHECK: 03030401
       // CHECK: 05010200
-      aiex.ipu.sync { column = 3 : i32, row = 4 : i32, direction = 1 : i32, channel = 5 : i32, column_num = 1 : i32, row_num = 2 : i32 }
+      aiex.npu.sync { column = 3 : i32, row = 4 : i32, direction = 1 : i32, channel = 5 : i32, column_num = 1 : i32, row_num = 2 : i32 }
       return
     }
   }
diff --git a/test/aie2xclbin/simple_xclbin.mlir b/test/aie2xclbin/simple_xclbin.mlir
index 09e9dcaa6b..55c6aa8ec9 100644
--- a/test/aie2xclbin/simple_xclbin.mlir
+++ b/test/aie2xclbin/simple_xclbin.mlir
@@ -19,7 +19,7 @@
 // PEANO-NOT: xchesscc_wrapper
 
 module {
-  aie.device(ipu) {
+  aie.device(npu) {
     %12 = aie.tile(1, 2)
     %buf = aie.buffer(%12) : memref<256xi32>
     %4 = aie.core(%12)  {
diff --git a/test/aiecc/simple_xclbin.mlir b/test/aiecc/simple_xclbin.mlir
index 880225b0da..bec65be208 100644
--- a/test/aiecc/simple_xclbin.mlir
+++ b/test/aiecc/simple_xclbin.mlir
@@ -11,8 +11,8 @@
 // REQUIRES: chess
 // REQUIRES: peano
 
-// RUN: %PYTHON aiecc.py --xchesscc --no-link -nv --aie-generate-cdo --aie-generate-ipu --no-compile-host --xclbin-name=aie.xclbin --ipu-insts-name=insts.txt %s | FileCheck %s --check-prefix=XCHESSCC
-// RUN: %PYTHON aiecc.py --no-xchesscc --no-link -nv --aie-generate-cdo --aie-generate-ipu --no-compile-host --xclbin-name=aie.xclbin --ipu-insts-name=insts.txt %s | FileCheck %s --check-prefix=PEANO
+// RUN: %PYTHON aiecc.py --xchesscc --no-link -nv --aie-generate-cdo --aie-generate-npu --no-compile-host --xclbin-name=aie.xclbin --npu-insts-name=insts.txt %s | FileCheck %s --check-prefix=XCHESSCC
+// RUN: %PYTHON aiecc.py --no-xchesscc --no-link -nv --aie-generate-cdo --aie-generate-npu --no-compile-host --xclbin-name=aie.xclbin --npu-insts-name=insts.txt %s | FileCheck %s --check-prefix=PEANO
 
 // Note that llc determines the architecture from the llvm IR.
 // XCHESSCC-NOT: {{^[^ ]*llc}}
@@ -27,7 +27,7 @@
 // PEANO: xclbinutil
 
 module {
-  aie.device(ipu) {
+  aie.device(npu) {
   %12 = aie.tile(1, 2)
   %buf = aie.buffer(%12) : memref<256xi32>
   %4 = aie.core(%12)  {
diff --git a/test/assign-buffer-addresses/bad_alignment.mlir b/test/assign-buffer-addresses/bad_alignment.mlir
index b9c2b83d7e..4b5ca8ffce 100644
--- a/test/assign-buffer-addresses/bad_alignment.mlir
+++ b/test/assign-buffer-addresses/bad_alignment.mlir
@@ -11,7 +11,7 @@
 // RUN: aie-opt --verify-diagnostics --split-input-file %s
 
 module {
-  aie.device(ipu) {
+  aie.device(npu) {
     %tile_0_1 = aie.tile(0, 1)
     %memtile_dma_0_1 = aie.memtile_dma(%tile_0_1) {
       %lock_0_1 = aie.lock(%tile_0_1) {init = 1 : i32}
@@ -31,7 +31,7 @@ module {
 // -----
 
 module {
-  aie.device(ipu) {
+  aie.device(npu) {
     %tile_0_1 = aie.tile(0, 1)
     %memtile_dma_0_1 = aie.memtile_dma(%tile_0_1) {
       %lock_0_1 = aie.lock(%tile_0_1) {init = 1 : i32}
@@ -56,7 +56,7 @@ module {
 // prevent false-positives/false-negatives (I think).
 
 module {
-  aie.device(ipu) {
+  aie.device(npu) {
     %tile_0_1 = aie.tile(0, 1)
     %memtile_dma_0_1 = aie.memtile_dma(%tile_0_1) {
       %lock_0_1 = aie.lock(%tile_0_1) {init = 1 : i32}
@@ -77,7 +77,7 @@ module {
 // -----
 
 module {
-  aie.device(ipu) {
+  aie.device(npu) {
     %tile_0_1 = aie.tile(0, 1)
     %memtile_dma_0_1 = aie.memtile_dma(%tile_0_1) {
       %lock_0_1 = aie.lock(%tile_0_1) {init = 1 : i32}
diff --git a/test/dialect/AIE/bad_cascade.mlir b/test/dialect/AIE/bad_cascade.mlir
index 21adc09a48..c204d79af1 100644
--- a/test/dialect/AIE/bad_cascade.mlir
+++ b/test/dialect/AIE/bad_cascade.mlir
@@ -31,7 +31,7 @@ aie.device(xcve2802) {
 
 // CHECK: error{{.*}}'aie.cascade_flow' op shimTile row has no cascade stream interface
 
-aie.device(ipu) {
+aie.device(npu) {
   %t10 = aie.tile(1, 0)
   %t20 = aie.tile(2, 0)
   aie.cascade_flow(%t10, %t20)
@@ -41,7 +41,7 @@ aie.device(ipu) {
 
 // CHECK: error{{.*}}'aie.cascade_flow' op memTile row has no cascade stream interface
 
-aie.device(ipu) {
+aie.device(npu) {
   %t11 = aie.tile(1, 1)
   %t21 = aie.tile(2, 1)
   aie.cascade_flow(%t11, %t21)
@@ -87,7 +87,7 @@ aie.device(xcve2802) {
 
 // CHECK: error{{.*}}'aie.configure_cascade' op memTile row has no cascade stream interface
 
-aie.device(ipu) {
+aie.device(npu) {
   %t11 = aie.tile(1, 1)
   aie.configure_cascade(%t11, North, West)
 }
diff --git a/test/dialect/AIE/bad_dma_op.mlir b/test/dialect/AIE/bad_dma_op.mlir
index c8338ae838..9ba149c65e 100644
--- a/test/dialect/AIE/bad_dma_op.mlir
+++ b/test/dialect/AIE/bad_dma_op.mlir
@@ -12,7 +12,7 @@
 
 // CHECK: error: 'aie.dma' op DMAOp can only appear in single block region
 module {
-  aie.device(ipu) {
+  aie.device(npu) {
     %tile_0_1 = aie.tile(0, 1)
     %objFifo_in0_cons_buff_0 = aie.buffer(%tile_0_1) {address = 0 : i32} : memref<16xi32>
     %objFifo_in0_cons_prod_lock = aie.lock(%tile_0_1, 0) {init = 2 : i32}
diff --git a/test/dialect/AIE/badshimtiledma.mlir b/test/dialect/AIE/badshimtiledma.mlir
index 7edde144f0..da396798f0 100644
--- a/test/dialect/AIE/badshimtiledma.mlir
+++ b/test/dialect/AIE/badshimtiledma.mlir
@@ -12,7 +12,7 @@
 // CHECK: error{{.*}}'aie.shim_dma' op uses more input channels than available on this tile
 
 module @test {
-  aie.device(ipu) {
+  aie.device(npu) {
     %t00 = aie.tile(0, 0)
 
     %buf_e = aie.external_buffer : memref<256xi32>
diff --git a/test/dialect/AIE/badtiledma4.mlir b/test/dialect/AIE/badtiledma4.mlir
index 7d2cf2b9ce..6c498c62f4 100644
--- a/test/dialect/AIE/badtiledma4.mlir
+++ b/test/dialect/AIE/badtiledma4.mlir
@@ -12,7 +12,7 @@
 // CHECK: error{{.*}}'aie.mem' op uses more output channels than available on this tile
 
 module @test {
-  aie.device(ipu) {
+  aie.device(npu) {
     %t03 = aie.tile(0, 3)
 
     %buf_e = aie.buffer(%t03) : memref<256xi32>
diff --git a/test/dialect/AIE/buffer.mlir b/test/dialect/AIE/buffer.mlir
index d522f08dd4..a75392c5a3 100644
--- a/test/dialect/AIE/buffer.mlir
+++ b/test/dialect/AIE/buffer.mlir
@@ -11,7 +11,7 @@
 // RUN: aie-opt --aie-standard-lowering %s | FileCheck %s
 
 module {
-  aie.device(ipu) {
+  aie.device(npu) {
     %t33 = aie.tile(3, 3)
     %t42 = aie.tile(4, 2)
     %t44 = aie.tile(4, 4)
diff --git a/test/dialect/AIEX/bad_ipu_nd.mlir b/test/dialect/AIEX/bad_npu_nd.mlir
similarity index 78%
rename from test/dialect/AIEX/bad_ipu_nd.mlir
rename to test/dialect/AIEX/bad_npu_nd.mlir
index ebd1715062..45ec8e0dd6 100644
--- a/test/dialect/AIEX/bad_ipu_nd.mlir
+++ b/test/dialect/AIEX/bad_npu_nd.mlir
@@ -1,4 +1,4 @@
-//===- bad_ipu_nd.mlir -----------------------------------------*- MLIR -*-===//
+//===- bad_npu_nd.mlir -----------------------------------------*- MLIR -*-===//
 //
 // This file is licensed under the Apache License v2.0 with LLVM Exceptions.
 // See https://llvm.org/LICENSE.txt for license information.
@@ -12,14 +12,14 @@
 // RUN: aie-opt --split-input-file --verify-diagnostics %s
 
 module {
-  aie.device(ipu) {
-    func.func @bad_ipu_nd_length(%in : memref<1920x1080xi32>, %buf : memref<32xi32>, %out : memref<1920x1080xi32>) {
+  aie.device(npu) {
+    func.func @bad_npu_nd_length(%in : memref<1920x1080xi32>, %buf : memref<32xi32>, %out : memref<1920x1080xi32>) {
       %c0 = arith.constant 0 : i64
       %c1 = arith.constant 1 : i64
       %c1920 = arith.constant 1920 : i64
       %c1080 = arith.constant 1080 : i64
       // expected-error@+1 {{Size 0 exceeds the [0:1023] range}}
-      aiex.ipu.dma_memcpy_nd (0, 0, %in[%c0,%c0,%c0,%c0][%c1,%c1,%c1080,%c1920][%c0,%c0,%c1920]) { metadata = @of_fromMem, id = 0 : i64 } : memref<1920x1080xi32>
+      aiex.npu.dma_memcpy_nd (0, 0, %in[%c0,%c0,%c0,%c0][%c1,%c1,%c1080,%c1920][%c0,%c0,%c1920]) { metadata = @of_fromMem, id = 0 : i64 } : memref<1920x1080xi32>
       return
     }
     aie.shim_dma_allocation @of_fromMem (MM2S, 0, 0)
@@ -29,8 +29,8 @@ module {
 // -----
 
 module {
-  aie.device(ipu) {
-    func.func @bad_ipu_nd_repeat(%in : memref<128x4x2x8xi32>, %buf : memref<32xi32>, %out : memref<8192xi32>) {
+  aie.device(npu) {
+    func.func @bad_npu_nd_repeat(%in : memref<128x4x2x8xi32>, %buf : memref<32xi32>, %out : memref<8192xi32>) {
       %c0 = arith.constant 0 : i64
       %c1 = arith.constant 1 : i64
       %c2 = arith.constant 2 : i64
@@ -40,7 +40,7 @@ module {
       %c32 = arith.constant 32 : i64
       %c128 = arith.constant 128 : i64
       // expected-error@+1 {{Size 3 exceeds the [1:64] range}}
-      aiex.ipu.dma_memcpy_nd (0, 0, %in[%c0,%c0,%c0,%c0][%c128,%c2,%c2,%c8][%c0,%c16,%c8]) { metadata = @of_fromMem, id = 0 : i64 } : memref<128x4x2x8xi32>
+      aiex.npu.dma_memcpy_nd (0, 0, %in[%c0,%c0,%c0,%c0][%c128,%c2,%c2,%c8][%c0,%c16,%c8]) { metadata = @of_fromMem, id = 0 : i64 } : memref<128x4x2x8xi32>
       return
     }
     aie.shim_dma_allocation @of_fromMem (MM2S, 0, 0)
@@ -50,14 +50,14 @@ module {
 // -----
 
 module {
-  aie.device(ipu) {
-    func.func @bad_ipu_nd_stride(%in : memref<8388608xi32>, %buf : memref<32xi32>, %out : memref<8388608xi32>) {
+  aie.device(npu) {
+    func.func @bad_npu_nd_stride(%in : memref<8388608xi32>, %buf : memref<32xi32>, %out : memref<8388608xi32>) {
       %c0 = arith.constant 0 : i64
       %c1 = arith.constant 1 : i64
       %c2 = arith.constant 2 : i64
       %c2097152 = arith.constant 2097152 : i64
       // expected-error@+1 {{Stride 1 exceeds the [1:1M] range}}
-      aiex.ipu.dma_memcpy_nd (0, 0, %in[%c0,%c0,%c0,%c0][%c1,%c1,%c2,%c2][%c0,%c0,%c2097152]) { metadata = @of_fromMem, id = 0 : i64 } : memref<8388608xi32>
+      aiex.npu.dma_memcpy_nd (0, 0, %in[%c0,%c0,%c0,%c0][%c1,%c1,%c2,%c2][%c0,%c0,%c2097152]) { metadata = @of_fromMem, id = 0 : i64 } : memref<8388608xi32>
       return
     }
     aie.shim_dma_allocation @of_fromMem (MM2S, 0, 0)
@@ -67,14 +67,14 @@ module {
 // -----
 
 module {
-  aie.device(ipu) {
-    func.func @bad_ipu_nd_type(%in : memref<1920x1080xi8>, %buf : memref<32xi32>, %out : memref<1920x1080xi8>) {
+  aie.device(npu) {
+    func.func @bad_npu_nd_type(%in : memref<1920x1080xi8>, %buf : memref<32xi32>, %out : memref<1920x1080xi8>) {
       %c0 = arith.constant 0 : i64
       %c1 = arith.constant 1 : i64
       %c1920 = arith.constant 1920 : i64
       %c1080 = arith.constant 1080 : i64
       // expected-error@+1 {{must be used with memref type with element width 32.}}
-      aiex.ipu.dma_memcpy_nd (0, 0, %in[%c0,%c0,%c0,%c0][%c1,%c1,%c1080,%c1920][%c0,%c0,%c1920]) { metadata = @of_fromMem, id = 0 : i64 } : memref<1920x1080xi8>
+      aiex.npu.dma_memcpy_nd (0, 0, %in[%c0,%c0,%c0,%c0][%c1,%c1,%c1080,%c1920][%c0,%c0,%c1920]) { metadata = @of_fromMem, id = 0 : i64 } : memref<1920x1080xi8>
       return
     }
     aie.shim_dma_allocation @of_fromMem (MM2S, 0, 0)
diff --git a/test/dialect/AIEX/bad_ipu_push_queue.mlir b/test/dialect/AIEX/bad_npu_push_queue.mlir
similarity index 82%
rename from test/dialect/AIEX/bad_ipu_push_queue.mlir
rename to test/dialect/AIEX/bad_npu_push_queue.mlir
index 49feece90d..64a11960ea 100644
--- a/test/dialect/AIEX/bad_ipu_push_queue.mlir
+++ b/test/dialect/AIEX/bad_npu_push_queue.mlir
@@ -1,4 +1,4 @@
-//===- bad_ipu_push_queue_bd.mlir ------------------------------*- MLIR -*-===//
+//===- bad_npu_push_queue_bd.mlir ------------------------------*- MLIR -*-===//
 //
 // This file is licensed under the Apache License v2.0 with LLVM Exceptions.
 // See https://llvm.org/LICENSE.txt for license information.
@@ -12,10 +12,10 @@
 // RUN: aie-opt --split-input-file --verify-diagnostics %s
 
 module {
-  aie.device(ipu) {
+  aie.device(npu) {
     func.func @bad_bd_id(%in : memref<128x4x2x8xi32>, %buf : memref<32xi32>, %out : memref<8192xi32>) {
       // expected-error@+1 {{BD ID exceeds the maximum ID.}}
-      aiex.ipu.shimtile_push_queue {metadata = @of_fromMem, issue_token = false, repeat_count = 3 : i32, bd_id = 28 : i32 }
+      aiex.npu.shimtile_push_queue {metadata = @of_fromMem, issue_token = false, repeat_count = 3 : i32, bd_id = 28 : i32 }
       return
     }
     aie.shim_dma_allocation @of_fromMem (MM2S, 0, 0)
@@ -25,10 +25,10 @@ module {
 // -----
 
 module {
-  aie.device(ipu) {
+  aie.device(npu) {
     func.func @bad_repeat_count(%in : memref<128x4x2x8xi32>, %buf : memref<32xi32>, %out : memref<8192xi32>) {
       // expected-error@+1 {{Repeat count exceeds the [0:255] range.}}
-      aiex.ipu.shimtile_push_queue {metadata = @of_fromMem, issue_token = false, repeat_count = 384 : i32, bd_id = 8 : i32 }
+      aiex.npu.shimtile_push_queue {metadata = @of_fromMem, issue_token = false, repeat_count = 384 : i32, bd_id = 8 : i32 }
       return
     }
     aie.shim_dma_allocation @of_fromMem (MM2S, 0, 0)
diff --git a/test/dialect/AIEX/bad_ipu_write_bd.mlir b/test/dialect/AIEX/bad_npu_write_bd.mlir
similarity index 90%
rename from test/dialect/AIEX/bad_ipu_write_bd.mlir
rename to test/dialect/AIEX/bad_npu_write_bd.mlir
index f653614c8d..fdc9b425cc 100644
--- a/test/dialect/AIEX/bad_ipu_write_bd.mlir
+++ b/test/dialect/AIEX/bad_npu_write_bd.mlir
@@ -1,4 +1,4 @@
-//===- bad_ipu_write_bd_bd.mlir --------------------------------*- MLIR -*-===//
+//===- bad_npu_write_bd_bd.mlir --------------------------------*- MLIR -*-===//
 //
 // This file is licensed under the Apache License v2.0 with LLVM Exceptions.
 // See https://llvm.org/LICENSE.txt for license information.
@@ -12,10 +12,10 @@
 // RUN: aie-opt --split-input-file --verify-diagnostics %s
 
 module {
-  aie.device(ipu) {
+  aie.device(npu) {
     func.func @bad_bd_id(%in : memref<128x4x2x8xi32>, %buf : memref<32xi32>, %out : memref<8192xi32>) {
       // expected-error@+1 {{BD ID exceeds the maximum ID.}}
-      aiex.ipu.writebd_shimtile {bd_id = 17 : i32, buffer_length = 32 : i32, buffer_offset = 128 : i32, column = 0 : i32, column_num = 1 : i32, d0_stride = 0 : i32, d0_size = 8 : i32, d1_stride = 7 : i32, d1_size = 2 : i32, d2_stride = 15 : i32, ddr_id = 0 : i32, enable_packet = 0 : i32, iteration_current = 0 : i32, iteration_stride = 0 : i32, iteration_size = 0 : i32, lock_acq_enable = 0 : i32, lock_acq_id = 0 : i32, lock_acq_val = 0 : i32, lock_rel_id = 0 : i32, lock_rel_val = 0 : i32, next_bd = 0 : i32, out_of_order_id = 0 : i32, packet_id = 0 : i32, packet_type = 0 : i32, use_next_bd = 0 : i32, valid_bd = 1 : i32}
+      aiex.npu.writebd_shimtile {bd_id = 17 : i32, buffer_length = 32 : i32, buffer_offset = 128 : i32, column = 0 : i32, column_num = 1 : i32, d0_stride = 0 : i32, d0_size = 8 : i32, d1_stride = 7 : i32, d1_size = 2 : i32, d2_stride = 15 : i32, ddr_id = 0 : i32, enable_packet = 0 : i32, iteration_current = 0 : i32, iteration_stride = 0 : i32, iteration_size = 0 : i32, lock_acq_enable = 0 : i32, lock_acq_id = 0 : i32, lock_acq_val = 0 : i32, lock_rel_id = 0 : i32, lock_rel_val = 0 : i32, next_bd = 0 : i32, out_of_order_id = 0 : i32, packet_id = 0 : i32, packet_type = 0 : i32, use_next_bd = 0 : i32, valid_bd = 1 : i32}
       return
     }
     aie.shim_dma_allocation @of_fromMem (MM2S, 0, 0)
@@ -25,10 +25,10 @@ module {
 // -----
 
 module {
-  aie.device(ipu) {
+  aie.device(npu) {
     func.func @bad_iteration_size(%in : memref<128x4x2x8xi32>, %buf : memref<32xi32>, %out : memref<8192xi32>) {
       // expected-error@+1 {{Iteration Size exceeds the [0:63] range.}}
-      aiex.ipu.writebd_shimtile {bd_id = 7 : i32, buffer_length = 32 : i32, buffer_offset = 128 : i32, column = 0 : i32, column_num = 1 : i32, d0_stride = 0 : i32, d0_size = 8 : i32, d1_stride = 7 : i32, d1_size = 4 : i32, d2_stride = 15 : i32, ddr_id = 0 : i32, enable_packet = 0 : i32, iteration_current = 0 : i32, iteration_stride = 1024 : i32, iteration_size = 128 : i32, lock_acq_enable = 0 : i32, lock_acq_id = 0 : i32, lock_acq_val = 0 : i32, lock_rel_id = 0 : i32, lock_rel_val = 0 : i32, next_bd = 0 : i32, out_of_order_id = 0 : i32, packet_id = 0 : i32, packet_type = 0 : i32, use_next_bd = 0 : i32, valid_bd = 1 : i32}
+      aiex.npu.writebd_shimtile {bd_id = 7 : i32, buffer_length = 32 : i32, buffer_offset = 128 : i32, column = 0 : i32, column_num = 1 : i32, d0_stride = 0 : i32, d0_size = 8 : i32, d1_stride = 7 : i32, d1_size = 4 : i32, d2_stride = 15 : i32, ddr_id = 0 : i32, enable_packet = 0 : i32, iteration_current = 0 : i32, iteration_stride = 1024 : i32, iteration_size = 128 : i32, lock_acq_enable = 0 : i32, lock_acq_id = 0 : i32, lock_acq_val = 0 : i32, lock_rel_id = 0 : i32, lock_rel_val = 0 : i32, next_bd = 0 : i32, out_of_order_id = 0 : i32, packet_id = 0 : i32, packet_type = 0 : i32, use_next_bd = 0 : i32, valid_bd = 1 : i32}
       return
     }
     aie.shim_dma_allocation @of_fromMem (MM2S, 0, 0)
@@ -38,10 +38,10 @@ module {
 // -----
 
 module {
-  aie.device(ipu) {
+  aie.device(npu) {
     func.func @bad_stride(%in : memref<128x4x2x8xi32>, %buf : memref<32xi32>, %out : memref<8192xi32>) {
       // expected-error@+1 {{D0 Stride exceeds the [0:1M-1] range.}}
-      aiex.ipu.writebd_shimtile {bd_id = 2 : i32, buffer_length = 32 : i32, buffer_offset = 128 : i32, column = 0 : i32, column_num = 1 : i32, d0_stride = 2097356 : i32, d0_size = 8 : i32, d1_stride = 7 : i32, d1_size = 2 : i32, d2_stride = 15 : i32, ddr_id = 0 : i32, enable_packet = 0 : i32, iteration_current = 0 : i32, iteration_stride = 0 : i32, iteration_size = 0 : i32, lock_acq_enable = 0 : i32, lock_acq_id = 0 : i32, lock_acq_val = 0 : i32, lock_rel_id = 0 : i32, lock_rel_val = 0 : i32, next_bd = 0 : i32, out_of_order_id = 0 : i32, packet_id = 0 : i32, packet_type = 0 : i32, use_next_bd = 0 : i32, valid_bd = 1 : i32}
+      aiex.npu.writebd_shimtile {bd_id = 2 : i32, buffer_length = 32 : i32, buffer_offset = 128 : i32, column = 0 : i32, column_num = 1 : i32, d0_stride = 2097356 : i32, d0_size = 8 : i32, d1_stride = 7 : i32, d1_size = 2 : i32, d2_stride = 15 : i32, ddr_id = 0 : i32, enable_packet = 0 : i32, iteration_current = 0 : i32, iteration_stride = 0 : i32, iteration_size = 0 : i32, lock_acq_enable = 0 : i32, lock_acq_id = 0 : i32, lock_acq_val = 0 : i32, lock_rel_id = 0 : i32, lock_rel_val = 0 : i32, next_bd = 0 : i32, out_of_order_id = 0 : i32, packet_id = 0 : i32, packet_type = 0 : i32, use_next_bd = 0 : i32, valid_bd = 1 : i32}
       return
     }
     aie.shim_dma_allocation @of_fromMem (MM2S, 0, 0)
@@ -51,10 +51,10 @@ module {
 // -----
 
 module {
-  aie.device(ipu) {
+  aie.device(npu) {
     func.func @bad_size(%in : memref<128x4x2x8xi32>, %buf : memref<32xi32>, %out : memref<8192xi32>) {
       // expected-error@+1 {{D1 Size exceeds the [0:1023] range.}}
-      aiex.ipu.writebd_shimtile {bd_id = 7 : i32, buffer_length = 32 : i32, buffer_offset = 128 : i32, column = 0 : i32, column_num = 1 : i32, d0_stride = 0 : i32, d0_size = 8 : i32, d1_stride = 7 : i32, d1_size = 1024 : i32, d2_stride = 15 : i32, ddr_id = 0 : i32, enable_packet = 0 : i32, iteration_current = 0 : i32, iteration_stride = 0 : i32, iteration_size = 0 : i32, lock_acq_enable = 0 : i32, lock_acq_id = 0 : i32, lock_acq_val = 0 : i32, lock_rel_id = 0 : i32, lock_rel_val = 0 : i32, next_bd = 0 : i32, out_of_order_id = 0 : i32, packet_id = 0 : i32, packet_type = 0 : i32, use_next_bd = 0 : i32, valid_bd = 1 : i32}
+      aiex.npu.writebd_shimtile {bd_id = 7 : i32, buffer_length = 32 : i32, buffer_offset = 128 : i32, column = 0 : i32, column_num = 1 : i32, d0_stride = 0 : i32, d0_size = 8 : i32, d1_stride = 7 : i32, d1_size = 1024 : i32, d2_stride = 15 : i32, ddr_id = 0 : i32, enable_packet = 0 : i32, iteration_current = 0 : i32, iteration_stride = 0 : i32, iteration_size = 0 : i32, lock_acq_enable = 0 : i32, lock_acq_id = 0 : i32, lock_acq_val = 0 : i32, lock_rel_id = 0 : i32, lock_rel_val = 0 : i32, next_bd = 0 : i32, out_of_order_id = 0 : i32, packet_id = 0 : i32, packet_type = 0 : i32, use_next_bd = 0 : i32, valid_bd = 1 : i32}
       return
     }
     aie.shim_dma_allocation @of_fromMem (MM2S, 0, 0)
diff --git a/test/dialect/AIEX/invalid.mlir b/test/dialect/AIEX/invalid.mlir
index 9b57d84b70..7a1a1fcc3b 100644
--- a/test/dialect/AIEX/invalid.mlir
+++ b/test/dialect/AIEX/invalid.mlir
@@ -10,10 +10,10 @@
 
 // RUN: aie-opt --split-input-file --verify-diagnostics %s
 
-aie.device(ipu) {
-  func.func @ipu_dma_wait_no_symbol() {
-    // expected-error@+1 {{'aiex.ipu.dma_wait' op couldn't find symbol in parent device}}
-    aiex.ipu.dma_wait {symbol = @out0}
+aie.device(npu) {
+  func.func @npu_dma_wait_no_symbol() {
+    // expected-error@+1 {{'aiex.npu.dma_wait' op couldn't find symbol in parent device}}
+    aiex.npu.dma_wait {symbol = @out0}
     return
   }
 }
diff --git a/test/dialect/AIEX/roundtrip.mlir b/test/dialect/AIEX/roundtrip.mlir
index 27611d5914..a7c698db09 100644
--- a/test/dialect/AIEX/roundtrip.mlir
+++ b/test/dialect/AIEX/roundtrip.mlir
@@ -10,21 +10,21 @@
 
 // RUN: aie-opt --split-input-file %s | FileCheck %s
 
-// CHECK-LABEL: func.func @ipu_dma_wait
-// CHECK: aiex.ipu.dma_wait {symbol = @out0}
-aie.device(ipu) {
+// CHECK-LABEL: func.func @npu_dma_wait
+// CHECK: aiex.npu.dma_wait {symbol = @out0}
+aie.device(npu) {
   memref.global "public" @out0 : memref<16xi32>
-  func.func @ipu_dma_wait() {
-    aiex.ipu.dma_wait {symbol = @out0}
+  func.func @npu_dma_wait() {
+    aiex.npu.dma_wait {symbol = @out0}
     return
   }
 }
 
 // -----
 
-// CHECK-LABEL: func.func @ipu_dma_wait_no_device
-// CHECK: aiex.ipu.dma_wait {symbol = @out0}
-func.func @ipu_dma_wait_no_device() {
-  aiex.ipu.dma_wait {symbol = @out0}
+// CHECK-LABEL: func.func @npu_dma_wait_no_device
+// CHECK: aiex.npu.dma_wait {symbol = @out0}
+func.func @npu_dma_wait_no_device() {
+  aiex.npu.dma_wait {symbol = @out0}
   return
 }
diff --git a/test/lit.cfg.py b/test/lit.cfg.py
index 474009e5c6..4eef6329da 100644
--- a/test/lit.cfg.py
+++ b/test/lit.cfg.py
@@ -54,7 +54,7 @@
 # for xchesscc_wrapper
 llvm_config.with_environment("AIETOOLS", config.vitis_aietools_dir)
 
-run_on_ipu = "echo"
+run_on_npu = "echo"
 xrt_flags = ""
 
 # Not using run_on_board anymore, need more specific per-platform commands
@@ -144,8 +144,8 @@
                     aie_model = m.group(2)
                     print("\tmodel:", aie_model)
                 config.available_features.add("ryzen_ai")
-                run_on_ipu = (
-                    f"flock /tmp/ipu.lock {config.aie_src_root}/utils/run_on_ipu.sh"
+                run_on_npu = (
+                    f"flock /tmp/npu.lock {config.aie_src_root}/utils/run_on_npu.sh"
                 )
     except:
         print("Failed to run xbutil")
@@ -153,7 +153,7 @@
 else:
     print("xrt not found")
 
-config.substitutions.append(("%run_on_ipu", run_on_ipu))
+config.substitutions.append(("%run_on_npu", run_on_npu))
 config.substitutions.append(("%xrt_flags", xrt_flags))
 config.substitutions.append(("%XRT_DIR", config.xrt_dir))
 
diff --git a/test/lower-to-standard/aiex_standard_lowering.mlir b/test/lower-to-standard/aiex_standard_lowering.mlir
index 639dbc1e83..6a0cd0b545 100644
--- a/test/lower-to-standard/aiex_standard_lowering.mlir
+++ b/test/lower-to-standard/aiex_standard_lowering.mlir
@@ -11,14 +11,14 @@
 // RUN: aie-opt --split-input-file --aiex-standard-lowering %s | FileCheck %s
 
 // CHECK-LABEL: dma_and_wait
-// CHECK-NOT: aiex.ipu.dma_memcpy_nd
-// CHECK-NOT: aiex.ipu.dma_wait
+// CHECK-NOT: aiex.npu.dma_memcpy_nd
+// CHECK-NOT: aiex.npu.dma_wait
 module  {
-  aie.device(ipu) {
+  aie.device(npu) {
     memref.global "public" @toMem : memref<16xi32>
     func.func @dma_and_wait(%arg0: memref<16xi32>, %arg1: memref<16xi32>) {
-      aiex.ipu.dma_memcpy_nd(0, 0, %arg0[0, 0, 0, 0][1, 1, 16, 16][0, 0, 64]) { metadata = @toMem, id = 1 : i64 } : memref<16xi32>
-      aiex.ipu.dma_wait {symbol = @toMem}
+      aiex.npu.dma_memcpy_nd(0, 0, %arg0[0, 0, 0, 0][1, 1, 16, 16][0, 0, 64]) { metadata = @toMem, id = 1 : i64 } : memref<16xi32>
+      aiex.npu.dma_wait {symbol = @toMem}
       return
     }
     aie.shim_dma_allocation @toMem (MM2S, 1, 1)
diff --git a/test/ipu-xrt/add_256_using_dma_op_no_double_buffering/aie.mlir b/test/npu-xrt/add_256_using_dma_op_no_double_buffering/aie.mlir
similarity index 95%
rename from test/ipu-xrt/add_256_using_dma_op_no_double_buffering/aie.mlir
rename to test/npu-xrt/add_256_using_dma_op_no_double_buffering/aie.mlir
index e44add4a05..89bda05890 100644
--- a/test/ipu-xrt/add_256_using_dma_op_no_double_buffering/aie.mlir
+++ b/test/npu-xrt/add_256_using_dma_op_no_double_buffering/aie.mlir
@@ -7,7 +7,7 @@
 //===----------------------------------------------------------------------===//
 
 module {
-  aie.device(ipu) {
+  aie.device(npu) {
     %tile_0_0 = aie.tile(0, 0)
     %tile_0_1 = aie.tile(0, 1)
     %tile_0_2 = aie.tile(0, 2)
@@ -100,9 +100,9 @@ module {
     aie.shim_dma_allocation @this_just_creates_a_symbol_and_the_type_means_nothing_in(MM2S, 0, 0)
     aie.shim_dma_allocation @this_just_creates_a_symbol_and_the_type_means_nothing_out(S2MM, 0, 0)
     func.func @bobsyouruncle(%arg0: memref<64xi32>, %arg1: memref<32xi32>, %arg2: memref<64xi32>) {
-      aiex.ipu.dma_memcpy_nd(0, 0, %arg0[0, 0, 0, 0][1, 1, 1, 64][0, 0, 0]) {id = 0 : i64, metadata = @this_just_creates_a_symbol_and_the_type_means_nothing_in} : memref<64xi32>
-      aiex.ipu.dma_memcpy_nd(0, 0, %arg2[0, 0, 0, 0][1, 1, 1, 64][0, 0, 0]) {id = 1 : i64, metadata = @this_just_creates_a_symbol_and_the_type_means_nothing_out} : memref<64xi32>
-      aiex.ipu.sync {channel = 0 : i32, column = 0 : i32, column_num = 1 : i32, direction = 0 : i32, row = 0 : i32, row_num = 1 : i32}
+      aiex.npu.dma_memcpy_nd(0, 0, %arg0[0, 0, 0, 0][1, 1, 1, 64][0, 0, 0]) {id = 0 : i64, metadata = @this_just_creates_a_symbol_and_the_type_means_nothing_in} : memref<64xi32>
+      aiex.npu.dma_memcpy_nd(0, 0, %arg2[0, 0, 0, 0][1, 1, 1, 64][0, 0, 0]) {id = 1 : i64, metadata = @this_just_creates_a_symbol_and_the_type_means_nothing_out} : memref<64xi32>
+      aiex.npu.sync {channel = 0 : i32, column = 0 : i32, column_num = 1 : i32, direction = 0 : i32, row = 0 : i32, row_num = 1 : i32}
       return
     }
   }
diff --git a/test/ipu-xrt/add_256_using_dma_op_no_double_buffering/run.lit b/test/npu-xrt/add_256_using_dma_op_no_double_buffering/run.lit
similarity index 70%
rename from test/ipu-xrt/add_256_using_dma_op_no_double_buffering/run.lit
rename to test/npu-xrt/add_256_using_dma_op_no_double_buffering/run.lit
index 5d29ef1058..67cf187967 100644
--- a/test/ipu-xrt/add_256_using_dma_op_no_double_buffering/run.lit
+++ b/test/npu-xrt/add_256_using_dma_op_no_double_buffering/run.lit
@@ -7,7 +7,7 @@
 // RUN: aie-translate --aie-generate-cdo aie.mlir.prj/input_physical.mlir
 // RUN: cp *.elf aie.mlir.prj/
 // RUN: cp *.bin aie.mlir.prj/
-// RUN: %python aiecc.py --no-aiesim --aie-generate-ipu --aie-generate-xclbin --no-compile-host --xclbin-name=aie.xclbin --ipu-insts-name=insts.txt %S/aie.mlir
+// RUN: %python aiecc.py --no-aiesim --aie-generate-npu --aie-generate-xclbin --no-compile-host --xclbin-name=aie.xclbin --npu-insts-name=insts.txt %S/aie.mlir
 // RUN: clang %S/test.cpp -o test.exe -std=c++11 -Wall %xrt_flags -lrt -lstdc++
-// RUN: %run_on_ipu ./test.exe aie.xclbin | FileCheck %s
+// RUN: %run_on_npu ./test.exe aie.xclbin | FileCheck %s
 // CHECK: PASS!
diff --git a/test/ipu-xrt/add_256_using_dma_op_no_double_buffering/test.cpp b/test/npu-xrt/add_256_using_dma_op_no_double_buffering/test.cpp
similarity index 100%
rename from test/ipu-xrt/add_256_using_dma_op_no_double_buffering/test.cpp
rename to test/npu-xrt/add_256_using_dma_op_no_double_buffering/test.cpp
diff --git a/test/ipu-xrt/add_314_using_dma_op/aie.mlir b/test/npu-xrt/add_314_using_dma_op/aie.mlir
similarity index 97%
rename from test/ipu-xrt/add_314_using_dma_op/aie.mlir
rename to test/npu-xrt/add_314_using_dma_op/aie.mlir
index 646f263804..37ef98c47c 100644
--- a/test/ipu-xrt/add_314_using_dma_op/aie.mlir
+++ b/test/npu-xrt/add_314_using_dma_op/aie.mlir
@@ -7,7 +7,7 @@
 //===----------------------------------------------------------------------===//
 
 module {
-  aie.device(ipu) {
+  aie.device(npu) {
     memref.global "public" @objFifo_in0 : memref<16xi32>
     memref.global "public" @objFifo_out0 : memref<16xi32>
 
@@ -65,9 +65,9 @@ module {
       %c0_i64 = arith.constant 0 : i64
       %c1_i64 = arith.constant 1 : i64
       %c64_i64 = arith.constant 64 : i64
-      aiex.ipu.dma_memcpy_nd(0, 0, %arg0[%c0_i64, %c0_i64, %c0_i64, %c0_i64][%c1_i64, %c1_i64, %c1_i64, %c64_i64][%c0_i64, %c0_i64, %c0_i64]) {id = 0 : i64, metadata = @objFifo_in0} : memref<64xi32>
-      aiex.ipu.dma_memcpy_nd(0, 0, %arg2[%c0_i64, %c0_i64, %c0_i64, %c0_i64][%c1_i64, %c1_i64, %c1_i64, %c64_i64][%c0_i64, %c0_i64, %c0_i64]) {id = 1 : i64, metadata = @objFifo_out0} : memref<64xi32>
-      aiex.ipu.sync {channel = 0 : i32, column = 0 : i32, column_num = 1 : i32, direction = 0 : i32, row = 0 : i32, row_num = 1 : i32}
+      aiex.npu.dma_memcpy_nd(0, 0, %arg0[%c0_i64, %c0_i64, %c0_i64, %c0_i64][%c1_i64, %c1_i64, %c1_i64, %c64_i64][%c0_i64, %c0_i64, %c0_i64]) {id = 0 : i64, metadata = @objFifo_in0} : memref<64xi32>
+      aiex.npu.dma_memcpy_nd(0, 0, %arg2[%c0_i64, %c0_i64, %c0_i64, %c0_i64][%c1_i64, %c1_i64, %c1_i64, %c64_i64][%c0_i64, %c0_i64, %c0_i64]) {id = 1 : i64, metadata = @objFifo_out0} : memref<64xi32>
+      aiex.npu.sync {channel = 0 : i32, column = 0 : i32, column_num = 1 : i32, direction = 0 : i32, row = 0 : i32, row_num = 1 : i32}
       return
     }
 
diff --git a/test/ipu-xrt/add_314_using_dma_op/run.lit b/test/npu-xrt/add_314_using_dma_op/run.lit
similarity index 70%
rename from test/ipu-xrt/add_314_using_dma_op/run.lit
rename to test/npu-xrt/add_314_using_dma_op/run.lit
index 23c3e076c9..5329b2789e 100644
--- a/test/ipu-xrt/add_314_using_dma_op/run.lit
+++ b/test/npu-xrt/add_314_using_dma_op/run.lit
@@ -7,8 +7,8 @@
 // RUN: aie-translate --aie-generate-cdo aie.mlir.prj/input_physical.mlir
 // RUN: cp *.elf aie.mlir.prj/
 // RUN: cp *.bin aie.mlir.prj/
-// RUN: %python aiecc.py --no-aiesim --aie-generate-ipu --aie-generate-xclbin --no-compile-host --xclbin-name=aie.xclbin --ipu-insts-name=insts.txt %S/aie.mlir
+// RUN: %python aiecc.py --no-aiesim --aie-generate-npu --aie-generate-xclbin --no-compile-host --xclbin-name=aie.xclbin --npu-insts-name=insts.txt %S/aie.mlir
 // RUN: clang %S/test.cpp -o test.exe -std=c++11 -Wall %xrt_flags -lrt -lstdc++
-// RUN: %run_on_ipu ./test.exe aie.xclbin | FileCheck %s
+// RUN: %run_on_npu ./test.exe aie.xclbin | FileCheck %s
 // CHECK: PASS!
 
diff --git a/test/ipu-xrt/add_314_using_dma_op/test.cpp b/test/npu-xrt/add_314_using_dma_op/test.cpp
similarity index 100%
rename from test/ipu-xrt/add_314_using_dma_op/test.cpp
rename to test/npu-xrt/add_314_using_dma_op/test.cpp
diff --git a/test/ipu-xrt/add_one_objFifo/CMakeLists.txt b/test/npu-xrt/add_one_objFifo/CMakeLists.txt
similarity index 96%
rename from test/ipu-xrt/add_one_objFifo/CMakeLists.txt
rename to test/npu-xrt/add_one_objFifo/CMakeLists.txt
index ad13460e2b..c400599ea1 100644
--- a/test/ipu-xrt/add_one_objFifo/CMakeLists.txt
+++ b/test/npu-xrt/add_one_objFifo/CMakeLists.txt
@@ -22,7 +22,7 @@ if (NOT WSL)
 else()
     set(BOOST_ROOT C:/Technical/thirdParty/boost_1_83_0 CACHE STRING "Path to Boost install")
     set(XRT_INC_DIR C:/Technical/XRT/src/runtime_src/core/include CACHE STRING "Path to XRT cloned repo")
-    set(XRT_LIB_DIR C:/Technical/xrtIPUfromDLL CACHE STRING "Path to xrt_coreutil.lib")
+    set(XRT_LIB_DIR C:/Technical/xrtNPUfromDLL CACHE STRING "Path to xrt_coreutil.lib")
 endif()
 
 set(TARGET_NAME test CACHE STRING "Target to be built")
diff --git a/test/ipu-xrt/add_one_objFifo/Makefile b/test/npu-xrt/add_one_objFifo/Makefile
similarity index 91%
rename from test/ipu-xrt/add_one_objFifo/Makefile
rename to test/npu-xrt/add_one_objFifo/Makefile
index 9fd67f862e..ce9d9338b3 100644
--- a/test/ipu-xrt/add_one_objFifo/Makefile
+++ b/test/npu-xrt/add_one_objFifo/Makefile
@@ -7,7 +7,7 @@ all: build/final.xclbin build/insts.txt
 build/final.xclbin: aie.mlir
 	mkdir -p ${@D}
 	cd ${@D} && aiecc.py --aie-generate-cdo --no-compile-host --xclbin-name=${@F} \
-				--aie-generate-ipu --ipu-insts-name=insts.txt $(<:%=../%)
+				--aie-generate-npu --npu-insts-name=insts.txt $(<:%=../%)
 
 ${targetname}.exe: test.cpp
 	rm -rf _build
diff --git a/test/ipu-xrt/add_one_objFifo/aie.mlir b/test/npu-xrt/add_one_objFifo/aie.mlir
similarity index 91%
rename from test/ipu-xrt/add_one_objFifo/aie.mlir
rename to test/npu-xrt/add_one_objFifo/aie.mlir
index 3b55edb0d7..137f8b03d8 100644
--- a/test/ipu-xrt/add_one_objFifo/aie.mlir
+++ b/test/npu-xrt/add_one_objFifo/aie.mlir
@@ -6,7 +6,7 @@
 //===----------------------------------------------------------------------===//
 
 module {
-  aie.device(ipu) {
+  aie.device(npu) {
     %t00 = aie.tile(0, 0)
     %t01 = aie.tile(0, 1)
     %t02 = aie.tile(0, 2)
@@ -44,9 +44,9 @@ module {
       %c0 = arith.constant 0 : i64
       %c1 = arith.constant 1 : i64
       %c64 = arith.constant 64 : i64
-      aiex.ipu.dma_memcpy_nd (0, 0, %out[%c0,%c0,%c0,%c0][%c1,%c1,%c1,%c64][%c0,%c0,%c0]) { metadata = @objFifo_out0, id = 1 : i64 } : memref<64xi32>
-      aiex.ipu.dma_memcpy_nd (0, 0, %in[%c0,%c0,%c0,%c0][%c1,%c1,%c1,%c64][%c0,%c0,%c0]) { metadata = @objFifo_in0, id = 0 : i64 } : memref<64xi32>
-      aiex.ipu.sync { column = 0 : i32, row = 0 : i32, direction = 0 : i32, channel = 0 : i32, column_num = 1 : i32, row_num = 1 : i32 }
+      aiex.npu.dma_memcpy_nd (0, 0, %out[%c0,%c0,%c0,%c0][%c1,%c1,%c1,%c64][%c0,%c0,%c0]) { metadata = @objFifo_out0, id = 1 : i64 } : memref<64xi32>
+      aiex.npu.dma_memcpy_nd (0, 0, %in[%c0,%c0,%c0,%c0][%c1,%c1,%c1,%c64][%c0,%c0,%c0]) { metadata = @objFifo_in0, id = 0 : i64 } : memref<64xi32>
+      aiex.npu.sync { column = 0 : i32, row = 0 : i32, direction = 0 : i32, channel = 0 : i32, column_num = 1 : i32, row_num = 1 : i32 }
       return
     }
   }
diff --git a/test/ipu-xrt/add_one_objFifo/run.lit b/test/npu-xrt/add_one_objFifo/run.lit
similarity index 75%
rename from test/ipu-xrt/add_one_objFifo/run.lit
rename to test/npu-xrt/add_one_objFifo/run.lit
index 632a421a4d..a137e2e4ae 100644
--- a/test/ipu-xrt/add_one_objFifo/run.lit
+++ b/test/npu-xrt/add_one_objFifo/run.lit
@@ -3,8 +3,8 @@
 //
 // REQUIRES: ryzen_ai
 //
-// RUN: %python aiecc.py --no-aiesim --aie-generate-cdo --aie-generate-ipu --no-compile-host --xclbin-name=aie.xclbin --ipu-insts-name=insts.txt %S/aie.mlir
+// RUN: %python aiecc.py --no-aiesim --aie-generate-cdo --aie-generate-npu --no-compile-host --xclbin-name=aie.xclbin --npu-insts-name=insts.txt %S/aie.mlir
 // RUN: clang %S/test.cpp -o test.exe -std=c++11 -Wall %xrt_flags -lrt -lstdc++ -lboost_program_options -lboost_filesystem
-// RUN: %run_on_ipu ./test.exe -x aie.xclbin -k MLIR_AIE -i insts.txt | FileCheck %s
+// RUN: %run_on_npu ./test.exe -x aie.xclbin -k MLIR_AIE -i insts.txt | FileCheck %s
 // CHECK: PASS!
 
diff --git a/test/ipu-xrt/add_one_objFifo/run.sh b/test/npu-xrt/add_one_objFifo/run.sh
similarity index 100%
rename from test/ipu-xrt/add_one_objFifo/run.sh
rename to test/npu-xrt/add_one_objFifo/run.sh
diff --git a/test/ipu-xrt/add_one_objFifo/test.cpp b/test/npu-xrt/add_one_objFifo/test.cpp
similarity index 100%
rename from test/ipu-xrt/add_one_objFifo/test.cpp
rename to test/npu-xrt/add_one_objFifo/test.cpp
diff --git a/test/ipu-xrt/add_one_using_dma/aie.mlir b/test/npu-xrt/add_one_using_dma/aie.mlir
similarity index 97%
rename from test/ipu-xrt/add_one_using_dma/aie.mlir
rename to test/npu-xrt/add_one_using_dma/aie.mlir
index 058ae034bc..8647f6b710 100644
--- a/test/ipu-xrt/add_one_using_dma/aie.mlir
+++ b/test/npu-xrt/add_one_using_dma/aie.mlir
@@ -7,7 +7,7 @@
 //===----------------------------------------------------------------------===//
 
 module {
-  aie.device(ipu) {
+  aie.device(npu) {
     memref.global "public" @objFifo_in0 : memref<16xi32>
     memref.global "public" @objFifo_in0_cons : memref<16xi32>
     memref.global "public" @objFifo_in1 : memref<8xi32>
@@ -76,9 +76,9 @@ module {
       %c0_i64 = arith.constant 0 : i64
       %c1_i64 = arith.constant 1 : i64
       %c64_i64 = arith.constant 64 : i64
-      aiex.ipu.dma_memcpy_nd(0, 0, %arg0[%c0_i64, %c0_i64, %c0_i64, %c0_i64] [%c1_i64, %c1_i64, %c1_i64, %c64_i64] [%c0_i64, %c0_i64, %c0_i64]) {id = 0 : i64, metadata = @objFifo_in0} : memref<64xi32>
-      aiex.ipu.dma_memcpy_nd(0, 0, %arg2[%c0_i64, %c0_i64, %c0_i64, %c0_i64] [%c1_i64, %c1_i64, %c1_i64, %c64_i64] [%c0_i64, %c0_i64, %c0_i64]) {id = 1 : i64, metadata = @objFifo_out0} : memref<64xi32>
-      aiex.ipu.sync {channel = 0 : i32, column = 0 : i32, column_num = 1 : i32, direction = 0 : i32, row = 0 : i32, row_num = 1 : i32}
+      aiex.npu.dma_memcpy_nd(0, 0, %arg0[%c0_i64, %c0_i64, %c0_i64, %c0_i64] [%c1_i64, %c1_i64, %c1_i64, %c64_i64] [%c0_i64, %c0_i64, %c0_i64]) {id = 0 : i64, metadata = @objFifo_in0} : memref<64xi32>
+      aiex.npu.dma_memcpy_nd(0, 0, %arg2[%c0_i64, %c0_i64, %c0_i64, %c0_i64] [%c1_i64, %c1_i64, %c1_i64, %c64_i64] [%c0_i64, %c0_i64, %c0_i64]) {id = 1 : i64, metadata = @objFifo_out0} : memref<64xi32>
+      aiex.npu.sync {channel = 0 : i32, column = 0 : i32, column_num = 1 : i32, direction = 0 : i32, row = 0 : i32, row_num = 1 : i32}
       return
     }
 
diff --git a/test/ipu-xrt/add_one_using_dma/run.lit b/test/npu-xrt/add_one_using_dma/run.lit
similarity index 75%
rename from test/ipu-xrt/add_one_using_dma/run.lit
rename to test/npu-xrt/add_one_using_dma/run.lit
index 632a421a4d..a137e2e4ae 100644
--- a/test/ipu-xrt/add_one_using_dma/run.lit
+++ b/test/npu-xrt/add_one_using_dma/run.lit
@@ -3,8 +3,8 @@
 //
 // REQUIRES: ryzen_ai
 //
-// RUN: %python aiecc.py --no-aiesim --aie-generate-cdo --aie-generate-ipu --no-compile-host --xclbin-name=aie.xclbin --ipu-insts-name=insts.txt %S/aie.mlir
+// RUN: %python aiecc.py --no-aiesim --aie-generate-cdo --aie-generate-npu --no-compile-host --xclbin-name=aie.xclbin --npu-insts-name=insts.txt %S/aie.mlir
 // RUN: clang %S/test.cpp -o test.exe -std=c++11 -Wall %xrt_flags -lrt -lstdc++ -lboost_program_options -lboost_filesystem
-// RUN: %run_on_ipu ./test.exe -x aie.xclbin -k MLIR_AIE -i insts.txt | FileCheck %s
+// RUN: %run_on_npu ./test.exe -x aie.xclbin -k MLIR_AIE -i insts.txt | FileCheck %s
 // CHECK: PASS!
 
diff --git a/test/ipu-xrt/add_one_using_dma/test.cpp b/test/npu-xrt/add_one_using_dma/test.cpp
similarity index 100%
rename from test/ipu-xrt/add_one_using_dma/test.cpp
rename to test/npu-xrt/add_one_using_dma/test.cpp
diff --git a/test/ipu-xrt/cascade_flows/CMakeLists.txt b/test/npu-xrt/cascade_flows/CMakeLists.txt
similarity index 96%
rename from test/ipu-xrt/cascade_flows/CMakeLists.txt
rename to test/npu-xrt/cascade_flows/CMakeLists.txt
index 257e7ca075..aafc542dde 100644
--- a/test/ipu-xrt/cascade_flows/CMakeLists.txt
+++ b/test/npu-xrt/cascade_flows/CMakeLists.txt
@@ -22,7 +22,7 @@ if (NOT WSL)
 else()
     set(BOOST_ROOT C:/Technical/thirdParty/boost_1_83_0 CACHE STRING "Path to Boost install")
     set(XRT_INC_DIR C:/Technical/XRT/src/runtime_src/core/include CACHE STRING "Path to XRT cloned repo")
-    set(XRT_LIB_DIR C:/Technical/xrtIPUfromDLL CACHE STRING "Path to xrt_coreutil.lib")
+    set(XRT_LIB_DIR C:/Technical/xrtNPUfromDLL CACHE STRING "Path to xrt_coreutil.lib")
 endif()
 
 set(TARGET_NAME test CACHE STRING "Target to be built")
diff --git a/test/ipu-xrt/cascade_flows/Makefile b/test/npu-xrt/cascade_flows/Makefile
similarity index 95%
rename from test/ipu-xrt/cascade_flows/Makefile
rename to test/npu-xrt/cascade_flows/Makefile
index 6c88c72d19..ef6b2cf5a0 100644
--- a/test/ipu-xrt/cascade_flows/Makefile
+++ b/test/npu-xrt/cascade_flows/Makefile
@@ -19,7 +19,7 @@ build/%.o: %.cc
 build/final.xclbin: aie.mlir build/kernel1.o build/kernel2.o build/kernel3.o
 	mkdir -p ${@D}
 	cd ${@D} && aiecc.py --aie-generate-cdo --no-compile-host --xclbin-name=${@F} \
-				--aie-generate-ipu --ipu-insts-name=insts.txt $(<:%=../%)
+				--aie-generate-npu --npu-insts-name=insts.txt $(<:%=../%)
 
 ${targetname}.exe: test.cpp
 	rm -rf _build
diff --git a/test/ipu-xrt/cascade_flows/aie.mlir b/test/npu-xrt/cascade_flows/aie.mlir
similarity index 92%
rename from test/ipu-xrt/cascade_flows/aie.mlir
rename to test/npu-xrt/cascade_flows/aie.mlir
index 967c3ecedc..e5b98481b5 100644
--- a/test/ipu-xrt/cascade_flows/aie.mlir
+++ b/test/npu-xrt/cascade_flows/aie.mlir
@@ -6,7 +6,7 @@
 //===----------------------------------------------------------------------===//
 
 module {
-  aie.device(ipu) {
+  aie.device(npu) {
     %t00 = aie.tile(0, 0)
     %t01 = aie.tile(0, 1)
     %t03 = aie.tile(0, 3)
@@ -60,9 +60,9 @@ module {
       %c0 = arith.constant 0 : i64
       %c1 = arith.constant 1 : i64
       %c64 = arith.constant 64 : i64
-      aiex.ipu.dma_memcpy_nd (0, 0, %out[%c0,%c0,%c0,%c0][%c1,%c1,%c1,%c64][%c0,%c0,%c0]) { metadata = @objFifo_out0, id = 1 : i64 } : memref<64xi32>
-      aiex.ipu.dma_memcpy_nd (0, 0, %in[%c0,%c0,%c0,%c0][%c1,%c1,%c1,%c64][%c0,%c0,%c0]) { metadata = @objFifo_in0, id = 0 : i64 } : memref<64xi32>
-      aiex.ipu.sync { column = 0 : i32, row = 0 : i32, direction = 0 : i32, channel = 0 : i32, column_num = 1 : i32, row_num = 1 : i32 }
+      aiex.npu.dma_memcpy_nd (0, 0, %out[%c0,%c0,%c0,%c0][%c1,%c1,%c1,%c64][%c0,%c0,%c0]) { metadata = @objFifo_out0, id = 1 : i64 } : memref<64xi32>
+      aiex.npu.dma_memcpy_nd (0, 0, %in[%c0,%c0,%c0,%c0][%c1,%c1,%c1,%c64][%c0,%c0,%c0]) { metadata = @objFifo_in0, id = 0 : i64 } : memref<64xi32>
+      aiex.npu.sync { column = 0 : i32, row = 0 : i32, direction = 0 : i32, channel = 0 : i32, column_num = 1 : i32, row_num = 1 : i32 }
       return
     }
   }
diff --git a/test/ipu-xrt/cascade_flows/kernel1.cc b/test/npu-xrt/cascade_flows/kernel1.cc
similarity index 100%
rename from test/ipu-xrt/cascade_flows/kernel1.cc
rename to test/npu-xrt/cascade_flows/kernel1.cc
diff --git a/test/ipu-xrt/cascade_flows/kernel2.cc b/test/npu-xrt/cascade_flows/kernel2.cc
similarity index 100%
rename from test/ipu-xrt/cascade_flows/kernel2.cc
rename to test/npu-xrt/cascade_flows/kernel2.cc
diff --git a/test/ipu-xrt/cascade_flows/kernel3.cc b/test/npu-xrt/cascade_flows/kernel3.cc
similarity index 100%
rename from test/ipu-xrt/cascade_flows/kernel3.cc
rename to test/npu-xrt/cascade_flows/kernel3.cc
diff --git a/test/ipu-xrt/cascade_flows/run.lit b/test/npu-xrt/cascade_flows/run.lit
similarity index 83%
rename from test/ipu-xrt/cascade_flows/run.lit
rename to test/npu-xrt/cascade_flows/run.lit
index 6581a3a212..c3b2945605 100644
--- a/test/ipu-xrt/cascade_flows/run.lit
+++ b/test/npu-xrt/cascade_flows/run.lit
@@ -6,7 +6,7 @@
 // RUN: xchesscc_wrapper aie2 -I %aietools/include -c %S/kernel1.cc -o ./kernel1.o
 // RUN: xchesscc_wrapper aie2 -I %aietools/include -c %S/kernel2.cc -o ./kernel2.o
 // RUN: xchesscc_wrapper aie2 -I %aietools/include -c %S/kernel3.cc -o ./kernel3.o
-// RUN: %python aiecc.py --no-aiesim --aie-generate-cdo --aie-generate-ipu --no-compile-host --xclbin-name=aie.xclbin --ipu-insts-name=insts.txt %S/aie.mlir
+// RUN: %python aiecc.py --no-aiesim --aie-generate-cdo --aie-generate-npu --no-compile-host --xclbin-name=aie.xclbin --npu-insts-name=insts.txt %S/aie.mlir
 // RUN: clang %S/test.cpp -o test.exe -std=c++11 -Wall %xrt_flags -lrt -lstdc++ -lboost_program_options -lboost_filesystem
-// RUN: %run_on_ipu ./test.exe -x aie.xclbin -k MLIR_AIE -i insts.txt | FileCheck %s
+// RUN: %run_on_npu ./test.exe -x aie.xclbin -k MLIR_AIE -i insts.txt | FileCheck %s
 // CHECK: PASS!
diff --git a/test/ipu-xrt/cascade_flows/test.cpp b/test/npu-xrt/cascade_flows/test.cpp
similarity index 100%
rename from test/ipu-xrt/cascade_flows/test.cpp
rename to test/npu-xrt/cascade_flows/test.cpp
diff --git a/test/ipu-xrt/e2e/conftest.py b/test/npu-xrt/e2e/conftest.py
similarity index 96%
rename from test/ipu-xrt/e2e/conftest.py
rename to test/npu-xrt/e2e/conftest.py
index 7e27c4421b..c2f981380f 100644
--- a/test/ipu-xrt/e2e/conftest.py
+++ b/test/npu-xrt/e2e/conftest.py
@@ -8,7 +8,7 @@
 @pytest.fixture(autouse=True)
 def run_around_tests():
     subprocess.check_call(
-        [str(Path(__file__).parent.parent.parent.parent / "utils" / "reset_ipu.sh")]
+        [str(Path(__file__).parent.parent.parent.parent / "utils" / "reset_npu.sh")]
     )
     yield
 
diff --git a/test/ipu-xrt/e2e/lit.local.cfg b/test/npu-xrt/e2e/lit.local.cfg
similarity index 100%
rename from test/ipu-xrt/e2e/lit.local.cfg
rename to test/npu-xrt/e2e/lit.local.cfg
diff --git a/test/ipu-xrt/e2e/pytest.ini b/test/npu-xrt/e2e/pytest.ini
similarity index 100%
rename from test/ipu-xrt/e2e/pytest.ini
rename to test/npu-xrt/e2e/pytest.ini
diff --git a/test/ipu-xrt/e2e/run_all_tests_one_by_one.sh b/test/npu-xrt/e2e/run_all_tests_one_by_one.sh
similarity index 100%
rename from test/ipu-xrt/e2e/run_all_tests_one_by_one.sh
rename to test/npu-xrt/e2e/run_all_tests_one_by_one.sh
diff --git a/test/ipu-xrt/e2e/test_add_256_using_dma_op_no_double_buffering.py b/test/npu-xrt/e2e/test_add_256_using_dma_op_no_double_buffering.py
similarity index 93%
rename from test/ipu-xrt/e2e/test_add_256_using_dma_op_no_double_buffering.py
rename to test/npu-xrt/e2e/test_add_256_using_dma_op_no_double_buffering.py
index 8af3ee9cf5..fc57e8b0f1 100644
--- a/test/ipu-xrt/e2e/test_add_256_using_dma_op_no_double_buffering.py
+++ b/test/npu-xrt/e2e/test_add_256_using_dma_op_no_double_buffering.py
@@ -8,7 +8,7 @@
 from pathlib import Path
 import random
 
-from aie.compiler.aiecc.main import DMA_TO_IPU
+from aie.compiler.aiecc.main import DMA_TO_NPU
 from aie.compiler.util import compile_without_vectorization, make_xclbin
 from aie.dialects import aie, aiex
 from aie.dialects.aie import (
@@ -16,7 +16,7 @@
     DMAChannelDir,
     LockAction,
     WireBundle,
-    ipu_instgen,
+    npu_instgen,
 )
 from aie.dialects.scf import for_ as range_, yield_
 from aie.extras.dialects.ext import arith, func, memref
@@ -47,8 +47,8 @@ def test_add_256_using_dma_op_no_double_buffering(ctx: MLIRContext, workdir: Pat
     LEN = 128
     LOCAL_MEM_SIZE = 32
 
-    @aie.device(AIEDevice.ipu)
-    def ipu():
+    @aie.device(AIEDevice.npu)
+    def npu():
         tile_0_0 = aie.tile(0, 0)
         tile_0_1 = aie.tile(0, 1)
         tile_0_2 = aie.tile(0, 2)
@@ -115,7 +115,7 @@ def bobsyouruncle(
             _arg1: T.memref(1, T.i32()),
             arg2: T.memref(LEN, T.i32()),
         ):
-            aiex.ipu_dma_memcpy_nd(
+            aiex.npu_dma_memcpy_nd(
                 this_is_meaningless_1.sym_name.value,
                 0,
                 arg0,
@@ -123,7 +123,7 @@ def bobsyouruncle(
                 [1, 1, 1, LEN],
                 [0, 0, 0],
             )
-            aiex.ipu_dma_memcpy_nd(
+            aiex.npu_dma_memcpy_nd(
                 this_is_meaningless_2.sym_name.value,
                 1,
                 arg2,
@@ -132,7 +132,7 @@ def bobsyouruncle(
                 [0, 0, 0],
             )
 
-            aiex.ipu_sync(
+            aiex.npu_sync(
                 channel=0, column=0, column_num=1, direction=0, row=0, row_num=1
             )
 
@@ -188,12 +188,12 @@ def dma2():
             aie.end()
 
     compile_without_vectorization(ctx.module, workdir)
-    generated_ipu_insts = run_pipeline(ctx.module, DMA_TO_IPU)
-    ipu_insts = [int(inst, 16) for inst in ipu_instgen(generated_ipu_insts.operation)]
+    generated_npu_insts = run_pipeline(ctx.module, DMA_TO_NPU)
+    npu_insts = [int(inst, 16) for inst in npu_instgen(generated_npu_insts.operation)]
     xclbin_path = make_xclbin(ctx.module, workdir)
-    with FileLock("/tmp/ipu.lock"):
+    with FileLock("/tmp/npu.lock"):
         xclbin = XCLBin(xclbin_path, "MLIR_AIE")
-        xclbin.load_ipu_instructions(ipu_insts)
+        xclbin.load_npu_instructions(npu_insts)
         views = xclbin.mmap_buffers([(LEN,), (LEN,), (LEN,)], np.int32)
 
         wrap_A = np.asarray(views[0])
diff --git a/test/ipu-xrt/e2e/test_locks.py b/test/npu-xrt/e2e/test_locks.py
similarity index 93%
rename from test/ipu-xrt/e2e/test_locks.py
rename to test/npu-xrt/e2e/test_locks.py
index 3f50bf1da6..4ecc07a095 100644
--- a/test/ipu-xrt/e2e/test_locks.py
+++ b/test/npu-xrt/e2e/test_locks.py
@@ -42,10 +42,10 @@ def test_one_global(ctx: MLIRContext, workdir: Path):
     iv = np.random.randint(0, 10, (K,), dtype=np.int32)
     column = 2
 
-    ipu_insts = aiex.ipu.get_prolog()
+    npu_insts = aiex.npu.get_prolog()
 
-    @aie.device(AIEDevice.ipu)
-    def ipu():
+    @aie.device(AIEDevice.npu)
+    def npu():
         # TODO(max): figure this annoying thing out...
         if column != 0:
             _dummy_tile = aie.tile(0, 2)
@@ -112,8 +112,8 @@ def memtile_dma():
 
         ddr_id = 0
         bd_id = 0
-        ipu_insts.extend(
-            aiex.ipu.writebd_shimtile(
+        npu_insts.extend(
+            aiex.npu.writebd_shimtile(
                 column=column,
                 bd_id=bd_id,
                 buffer_length=K,
@@ -121,16 +121,16 @@ def memtile_dma():
                 ddr_id=ddr_id,
             )
         )
-        ipu_insts.extend(
-            aiex.ipu.shimtile_push_queue(
+        npu_insts.extend(
+            aiex.npu.shimtile_push_queue(
                 channel_dir=S2MM,
                 channel_index=flow_to_shim.dest_channel,
                 column=column,
                 bd_id=bd_id,
             )
         )
-        ipu_insts.extend(
-            aiex.ipu.sync(
+        npu_insts.extend(
+            aiex.npu.sync(
                 channel=flow_to_shim.dest_channel,
                 column=column,
                 direction=0,
@@ -140,9 +140,9 @@ def memtile_dma():
 
     compile_without_vectorization(ctx.module, workdir)
     xclbin_path = make_xclbin(ctx.module, workdir)
-    with FileLock("/tmp/ipu.lock"):
+    with FileLock("/tmp/npu.lock"):
         xclbin = XCLBin(xclbin_path, "MLIR_AIE")
-        xclbin.load_ipu_instructions(ipu_insts)
+        xclbin.load_npu_instructions(npu_insts)
         [c] = xclbin.mmap_buffers([(K,)], np.int32)
         wrap_C = np.asarray(c)
         C = np.zeros((K,), dtype=np.int32)
@@ -166,10 +166,10 @@ def test_threesome(ctx: MLIRContext, workdir: Path):
     iv1 = np.random.randint(0, 10, (K,), dtype=np.int32)
     iv2 = np.random.randint(0, 10, (K,), dtype=np.int32)
 
-    ipu_insts = aiex.ipu.get_prolog()
+    npu_insts = aiex.npu.get_prolog()
 
-    @aie.device(AIEDevice.ipu)
-    def ipu():
+    @aie.device(AIEDevice.npu)
+    def npu():
         _dummy_tile = aie.tile(0, 2)
         tile_1_2 = aie.tile(1, 2)
         global_weight_1_2 = memref.global_(initial_value=iv1)
@@ -249,8 +249,8 @@ def memtile_dma():
 
         ddr_id = 0
         bd_id = 0
-        ipu_insts.extend(
-            aiex.ipu.writebd_shimtile(
+        npu_insts.extend(
+            aiex.npu.writebd_shimtile(
                 column=shim_tile_column,
                 bd_id=bd_id,
                 buffer_length=K,
@@ -258,16 +258,16 @@ def memtile_dma():
                 ddr_id=ddr_id,
             )
         )
-        ipu_insts.extend(
-            aiex.ipu.shimtile_push_queue(
+        npu_insts.extend(
+            aiex.npu.shimtile_push_queue(
                 channel_dir=S2MM,
                 channel_index=flow_to_shim.dest_channel,
                 column=shim_tile_column,
                 bd_id=bd_id,
             )
         )
-        ipu_insts.extend(
-            aiex.ipu.sync(
+        npu_insts.extend(
+            aiex.npu.sync(
                 channel=flow_to_shim.dest_channel,
                 column=shim_tile_column,
                 direction=0,
@@ -277,9 +277,9 @@ def memtile_dma():
 
     compile_without_vectorization(ctx.module, workdir)
     xclbin_path = make_xclbin(ctx.module, workdir)
-    with FileLock("/tmp/ipu.lock"):
+    with FileLock("/tmp/npu.lock"):
         xclbin = XCLBin(xclbin_path, "MLIR_AIE")
-        xclbin.load_ipu_instructions(ipu_insts)
+        xclbin.load_npu_instructions(npu_insts)
         [c] = xclbin.mmap_buffers([(K,)], np.int32)
         wrap_C = np.asarray(c)
         C = np.zeros((K,), dtype=np.int32)
@@ -305,10 +305,10 @@ def test_foursome(ctx: MLIRContext, workdir: Path):
     iv2 = np.random.randint(0, 10, (K,), dtype=np.int32)
     iv3 = np.random.randint(0, 10, (K,), dtype=np.int32)
 
-    ipu_insts = aiex.ipu.get_prolog()
+    npu_insts = aiex.npu.get_prolog()
 
-    @aie.device(AIEDevice.ipu)
-    def ipu():
+    @aie.device(AIEDevice.npu)
+    def npu():
         _dummy_tile = aie.tile(0, 2)
 
         tile_1_3 = aie.tile(1, 3)
@@ -407,8 +407,8 @@ def memtile_dma():
 
         ddr_id = 0
         bd_id = 0
-        ipu_insts.extend(
-            aiex.ipu.writebd_shimtile(
+        npu_insts.extend(
+            aiex.npu.writebd_shimtile(
                 column=shim_tile_column,
                 bd_id=bd_id,
                 buffer_length=K,
@@ -416,16 +416,16 @@ def memtile_dma():
                 ddr_id=ddr_id,
             )
         )
-        ipu_insts.extend(
-            aiex.ipu.shimtile_push_queue(
+        npu_insts.extend(
+            aiex.npu.shimtile_push_queue(
                 channel_dir=S2MM,
                 channel_index=flow_to_shim.dest_channel,
                 column=shim_tile_column,
                 bd_id=bd_id,
             )
         )
-        ipu_insts.extend(
-            aiex.ipu.sync(
+        npu_insts.extend(
+            aiex.npu.sync(
                 channel=flow_to_shim.dest_channel,
                 column=shim_tile_column,
                 direction=0,
@@ -435,9 +435,9 @@ def memtile_dma():
 
     compile_without_vectorization(ctx.module, workdir)
     xclbin_path = make_xclbin(ctx.module, workdir)
-    with FileLock("/tmp/ipu.lock"):
+    with FileLock("/tmp/npu.lock"):
         xclbin = XCLBin(xclbin_path, "MLIR_AIE")
-        xclbin.load_ipu_instructions(ipu_insts)
+        xclbin.load_npu_instructions(npu_insts)
         [c] = xclbin.mmap_buffers([(K,)], np.int32)
         wrap_C = np.asarray(c)
         C = np.zeros((K,), dtype=np.int32)
diff --git a/test/ipu-xrt/e2e/test_manual_dpu_args.py b/test/npu-xrt/e2e/test_manual_dpu_args.py
similarity index 88%
rename from test/ipu-xrt/e2e/test_manual_dpu_args.py
rename to test/npu-xrt/e2e/test_manual_dpu_args.py
index 3016384071..9b372e439e 100644
--- a/test/ipu-xrt/e2e/test_manual_dpu_args.py
+++ b/test/npu-xrt/e2e/test_manual_dpu_args.py
@@ -54,8 +54,8 @@ def test_manual_args(ctx: MLIRContext, workdir: Path):
     iters = 10
     loop = False
 
-    @aie.device(AIEDevice.ipu)
-    def ipu():
+    @aie.device(AIEDevice.npu)
+    def npu():
         tile_0_0 = aie.tile(0, 0)
         tile_0_1 = aie.tile(0, 1)
         tile_0_2 = aie.tile(0, 2)
@@ -115,29 +115,29 @@ def dma6():
     kernel_json = emit_design_kernel_json(buffer_args=buffer_args)
     xclbin_path = make_xclbin(ctx.module, workdir, kernel_json=kernel_json)
 
-    with FileLock("/tmp/ipu.lock"):
+    with FileLock("/tmp/npu.lock"):
         xclbin = XCLBin(xclbin_path, "MLIR_AIE")
         views = xclbin.mmap_buffers([(K,)] * iters, np.int32)
 
         col = 0
         channel_index = 0
-        ipu_insts = aiex.ipu.get_prolog()
+        npu_insts = aiex.npu.get_prolog()
         for bd_id in range(iters):
-            writebd_shimtile_insts = aiex.ipu.writebd_shimtile(
+            writebd_shimtile_insts = aiex.npu.writebd_shimtile(
                 col, bd_id, buffer_length=K
             )
-            ipu_insts.extend(
-                aiex.ipu._exec_write_bd_extend_shim_tile_opt(
+            npu_insts.extend(
+                aiex.npu._exec_write_bd_extend_shim_tile_opt(
                     writebd_shimtile_insts,
                     tensor_addr=xclbin._get_buffer_host_address(bd_id),
                 )
             )
-            ipu_insts.extend(
-                aiex.ipu.shimtile_push_queue(S2MM, channel_index, col, bd_id)
+            npu_insts.extend(
+                aiex.npu.shimtile_push_queue(S2MM, channel_index, col, bd_id)
             )
-            ipu_insts.extend(aiex.ipu.sync(column=col))
+            npu_insts.extend(aiex.npu.sync(column=col))
 
-        xclbin.load_ipu_instructions(ipu_insts)
+        xclbin.load_npu_instructions(npu_insts)
 
         wraps = list(map(np.asarray, views))
 
@@ -161,8 +161,8 @@ def test_manual_args_with_offset(ctx: MLIRContext, workdir: Path):
     iters = 10
     loop = False
 
-    @aie.device(AIEDevice.ipu)
-    def ipu():
+    @aie.device(AIEDevice.npu)
+    def npu():
         tile_0_0 = aie.tile(0, 0)
         tile_0_1 = aie.tile(0, 1)
         tile_0_2 = aie.tile(0, 2)
@@ -222,30 +222,30 @@ def dma6():
     kernel_json = emit_design_kernel_json(buffer_args=buffer_args)
     xclbin_path = make_xclbin(ctx.module, workdir, kernel_json=kernel_json)
 
-    with FileLock("/tmp/ipu.lock"):
+    with FileLock("/tmp/npu.lock"):
         xclbin = XCLBin(xclbin_path, "MLIR_AIE")
         views = xclbin.mmap_buffers([(K * iters,)] * iters, np.int32)
 
         col = 0
         channel_index = 0
-        ipu_insts = aiex.ipu.get_prolog()
+        npu_insts = aiex.npu.get_prolog()
         for i in range(iters):
             bd_id = i
-            writebd_shimtile_insts = aiex.ipu.writebd_shimtile(
+            writebd_shimtile_insts = aiex.npu.writebd_shimtile(
                 col, bd_id, buffer_length=K, buffer_offset=K * i
             )
-            ipu_insts.extend(
-                aiex.ipu._exec_write_bd_extend_shim_tile_opt(
+            npu_insts.extend(
+                aiex.npu._exec_write_bd_extend_shim_tile_opt(
                     writebd_shimtile_insts,
                     tensor_addr=xclbin._get_buffer_host_address(i),
                 )
             )
-            ipu_insts.extend(
-                aiex.ipu.shimtile_push_queue(S2MM, channel_index, col, bd_id)
+            npu_insts.extend(
+                aiex.npu.shimtile_push_queue(S2MM, channel_index, col, bd_id)
             )
-            ipu_insts.extend(aiex.ipu.sync(column=col))
+            npu_insts.extend(aiex.npu.sync(column=col))
 
-        xclbin.load_ipu_instructions(ipu_insts)
+        xclbin.load_npu_instructions(npu_insts)
 
         wraps = list(map(np.asarray, views))
 
@@ -268,8 +268,8 @@ def test_manual_args_with_different_cols(ctx: MLIRContext, workdir: Path):
     RANDOM_WEIGHT = np.random.randint(0, 10, (K,), dtype=np.int32)
     cols = [0, 1, 2, 3]
 
-    @aie.device(AIEDevice.ipu)
-    def ipu():
+    @aie.device(AIEDevice.npu)
+    def npu():
         for c in cols:
             tile_c_0 = aie.tile(c, 0)
             tile_c_2 = aie.tile(c, 2)
@@ -306,29 +306,29 @@ def dma3():
     kernel_json = emit_design_kernel_json(buffer_args=buffer_args)
     xclbin_path = make_xclbin(ctx.module, workdir, kernel_json=kernel_json)
 
-    with FileLock("/tmp/ipu.lock"):
+    with FileLock("/tmp/npu.lock"):
         xclbin = XCLBin(xclbin_path, "MLIR_AIE")
         views = xclbin.mmap_buffers([(K,)] * len(cols), np.int32)
 
         bd_id = 0
         channel_index = 0
-        ipu_insts = aiex.ipu.get_prolog()
+        npu_insts = aiex.npu.get_prolog()
         for col in cols:
-            writebd_shimtile_insts = aiex.ipu.writebd_shimtile(
+            writebd_shimtile_insts = aiex.npu.writebd_shimtile(
                 col, bd_id, buffer_length=K
             )
-            ipu_insts.extend(
-                aiex.ipu._exec_write_bd_extend_shim_tile_opt(
+            npu_insts.extend(
+                aiex.npu._exec_write_bd_extend_shim_tile_opt(
                     writebd_shimtile_insts,
                     tensor_addr=xclbin._get_buffer_host_address(col),
                 )
             )
-            ipu_insts.extend(
-                aiex.ipu.shimtile_push_queue(S2MM, channel_index, col, bd_id)
+            npu_insts.extend(
+                aiex.npu.shimtile_push_queue(S2MM, channel_index, col, bd_id)
             )
-            ipu_insts.extend(aiex.ipu.sync(column=col))
+            npu_insts.extend(aiex.npu.sync(column=col))
 
-        xclbin.load_ipu_instructions(ipu_insts)
+        xclbin.load_npu_instructions(npu_insts)
 
         wraps = list(map(np.asarray, views))
 
@@ -353,8 +353,8 @@ def test_manual_args_with_shim_dma(ctx: MLIRContext, workdir: Path):
 
     iters = 21
 
-    @aie.device(AIEDevice.ipu)
-    def ipu():
+    @aie.device(AIEDevice.npu)
+    def npu():
         if 0 not in cols:
             tile_dummy = aie.tile(0, 3)
         for c in cols:
@@ -408,20 +408,20 @@ def dma():
     kernel_json = emit_design_kernel_json(buffer_args=buffer_args)
     xclbin_path = make_xclbin(ctx.module, workdir, kernel_json=kernel_json)
 
-    with FileLock("/tmp/ipu.lock"):
+    with FileLock("/tmp/npu.lock"):
         xclbin = XCLBin(xclbin_path, "MLIR_AIE")
         views = xclbin.mmap_buffers([(K,)] * len(cols), np.int32)
 
         bd_id = 0
-        ipu_insts = aiex.ipu.get_prolog()
+        npu_insts = aiex.npu.get_prolog()
         for i, col in enumerate(cols):
-            update_addrs = aiex.ipu._update_tensor_addr_shim_tile(
+            update_addrs = aiex.npu._update_tensor_addr_shim_tile(
                 col, bd_id, tensor_addr=xclbin._get_buffer_host_address(i)
             )
-            ipu_insts.extend(update_addrs)
-            ipu_insts.extend(aiex.ipu.enable_cores(col, compute_tile_row))
+            npu_insts.extend(update_addrs)
+            npu_insts.extend(aiex.npu.enable_cores(col, compute_tile_row))
 
-        xclbin.load_ipu_instructions(ipu_insts)
+        xclbin.load_npu_instructions(npu_insts)
 
         wraps = list(map(np.asarray, views))
 
diff --git a/test/ipu-xrt/e2e/test_nonsquare_matrix_mult.py b/test/npu-xrt/e2e/test_nonsquare_matrix_mult.py
similarity index 91%
rename from test/ipu-xrt/e2e/test_nonsquare_matrix_mult.py
rename to test/npu-xrt/e2e/test_nonsquare_matrix_mult.py
index 20c5998709..0489b46381 100644
--- a/test/ipu-xrt/e2e/test_nonsquare_matrix_mult.py
+++ b/test/npu-xrt/e2e/test_nonsquare_matrix_mult.py
@@ -45,10 +45,10 @@
 def test_nonsquare_matrix_mult(ctx: MLIRContext, workdir: Path):
     M, K, N = 16, 32, 16
 
-    ipu_insts = aiex.ipu.get_prolog()
+    npu_insts = aiex.npu.get_prolog()
 
-    @aie.device(AIEDevice.ipu)
-    def ipu():
+    @aie.device(AIEDevice.npu)
+    def npu():
         tile_0_0 = aie.tile(0, 0)
         tile_0_1 = aie.tile(0, 1)
         tile_0_2 = aie.tile(0, 2)
@@ -91,8 +91,8 @@ def ipu():
         channel_index = 0
         ddr_id = 0
         bd_id = 0
-        ipu_insts.extend(
-            aiex.ipu.writebd_shimtile(
+        npu_insts.extend(
+            aiex.npu.writebd_shimtile(
                 col,
                 bd_id,
                 buffer_length=M * K,
@@ -100,14 +100,14 @@ def ipu():
                 ddr_id=ddr_id,
             )
         )
-        ipu_insts.extend(aiex.ipu.shimtile_push_queue(MM2S, channel_index, col, bd_id))
+        npu_insts.extend(aiex.npu.shimtile_push_queue(MM2S, channel_index, col, bd_id))
 
         # in B
         channel_index = 1
         ddr_id = 1
         bd_id += 1
-        ipu_insts.extend(
-            aiex.ipu.writebd_shimtile(
+        npu_insts.extend(
+            aiex.npu.writebd_shimtile(
                 col,
                 bd_id,
                 buffer_length=K * N,
@@ -115,14 +115,14 @@ def ipu():
                 ddr_id=ddr_id,
             )
         )
-        ipu_insts.extend(aiex.ipu.shimtile_push_queue(MM2S, channel_index, col, bd_id))
+        npu_insts.extend(aiex.npu.shimtile_push_queue(MM2S, channel_index, col, bd_id))
 
         # out C
         channel_index = 0
         ddr_id = 2
         bd_id += 1
-        ipu_insts.extend(
-            aiex.ipu.writebd_shimtile(
+        npu_insts.extend(
+            aiex.npu.writebd_shimtile(
                 col,
                 bd_id,
                 buffer_length=M * N,
@@ -130,9 +130,9 @@ def ipu():
                 ddr_id=ddr_id,
             )
         )
-        ipu_insts.extend(aiex.ipu.shimtile_push_queue(S2MM, channel_index, col, bd_id))
-        ipu_insts.extend(
-            aiex.ipu.sync(
+        npu_insts.extend(aiex.npu.shimtile_push_queue(S2MM, channel_index, col, bd_id))
+        npu_insts.extend(
+            aiex.npu.sync(
                 channel=0,
                 column=0,
                 column_num=1,
@@ -235,9 +235,9 @@ def core():
 
     compile_without_vectorization(ctx.module, workdir)
     xclbin_path = make_xclbin(ctx.module, workdir)
-    with FileLock("/tmp/ipu.lock"):
+    with FileLock("/tmp/npu.lock"):
         xclbin = XCLBin(xclbin_path, "MLIR_AIE")
-        xclbin.load_ipu_instructions(ipu_insts)
+        xclbin.load_npu_instructions(npu_insts)
         views = xclbin.mmap_buffers([(M, K), (K, N), (M, N)], np.int32)
 
         wrap_A = np.asarray(views[0])
@@ -268,10 +268,10 @@ def core():
 def test_nonsquare_matrix_mult_sugar(ctx: MLIRContext, workdir: Path):
     M, K, N = 16, 32, 16
 
-    ipu_insts = aiex.ipu.get_prolog()
+    npu_insts = aiex.npu.get_prolog()
 
-    @aie.device(AIEDevice.ipu)
-    def ipu():
+    @aie.device(AIEDevice.npu)
+    def npu():
         tile_0_0 = aie.tile(0, 0)
         tile_0_1 = aie.tile(0, 1)
         tile_0_2 = aie.tile(0, 2)
@@ -309,8 +309,8 @@ def ipu():
         channel_index = 0
         ddr_id = 0
         bd_id = 0
-        ipu_insts.extend(
-            aiex.ipu.writebd_shimtile(
+        npu_insts.extend(
+            aiex.npu.writebd_shimtile(
                 col,
                 bd_id,
                 buffer_length=M * K,
@@ -318,14 +318,14 @@ def ipu():
                 ddr_id=ddr_id,
             )
         )
-        ipu_insts.extend(aiex.ipu.shimtile_push_queue(MM2S, channel_index, col, bd_id))
+        npu_insts.extend(aiex.npu.shimtile_push_queue(MM2S, channel_index, col, bd_id))
 
         # in B
         channel_index = 1
         ddr_id = 1
         bd_id += 1
-        ipu_insts.extend(
-            aiex.ipu.writebd_shimtile(
+        npu_insts.extend(
+            aiex.npu.writebd_shimtile(
                 col,
                 bd_id,
                 buffer_length=K * N,
@@ -333,14 +333,14 @@ def ipu():
                 ddr_id=ddr_id,
             )
         )
-        ipu_insts.extend(aiex.ipu.shimtile_push_queue(MM2S, channel_index, col, bd_id))
+        npu_insts.extend(aiex.npu.shimtile_push_queue(MM2S, channel_index, col, bd_id))
 
         # out C
         channel_index = 0
         ddr_id = 2
         bd_id += 1
-        ipu_insts.extend(
-            aiex.ipu.writebd_shimtile(
+        npu_insts.extend(
+            aiex.npu.writebd_shimtile(
                 col,
                 bd_id,
                 buffer_length=M * N,
@@ -348,9 +348,9 @@ def ipu():
                 ddr_id=ddr_id,
             )
         )
-        ipu_insts.extend(aiex.ipu.shimtile_push_queue(S2MM, channel_index, col, bd_id))
-        ipu_insts.extend(
-            aiex.ipu.sync(
+        npu_insts.extend(aiex.npu.shimtile_push_queue(S2MM, channel_index, col, bd_id))
+        npu_insts.extend(
+            aiex.npu.sync(
                 channel=0,
                 column=0,
                 column_num=1,
@@ -417,9 +417,9 @@ def core():
 
     compile_without_vectorization(ctx.module, workdir)
     xclbin_path = make_xclbin(ctx.module, workdir)
-    with FileLock("/tmp/ipu.lock"):
+    with FileLock("/tmp/npu.lock"):
         xclbin = XCLBin(xclbin_path, "MLIR_AIE")
-        xclbin.load_ipu_instructions(ipu_insts)
+        xclbin.load_npu_instructions(npu_insts)
         views = xclbin.mmap_buffers([(M, K), (K, N), (M, N)], np.int32)
 
         wrap_A = np.asarray(views[0])
diff --git a/test/ipu-xrt/e2e/test_nonsquare_matrix_mult_vectorized.py b/test/npu-xrt/e2e/test_nonsquare_matrix_mult_vectorized.py
similarity index 94%
rename from test/ipu-xrt/e2e/test_nonsquare_matrix_mult_vectorized.py
rename to test/npu-xrt/e2e/test_nonsquare_matrix_mult_vectorized.py
index ae1079fd4e..816ddba151 100644
--- a/test/ipu-xrt/e2e/test_nonsquare_matrix_mult_vectorized.py
+++ b/test/npu-xrt/e2e/test_nonsquare_matrix_mult_vectorized.py
@@ -64,11 +64,11 @@ def matmul_i32_i32(
 
 
 def test_nonsquare_matrix_mult_vectorized(ctx: MLIRContext, workdir: Path):
-    ipu_insts = aiex.ipu.get_prolog()
+    npu_insts = aiex.npu.get_prolog()
     mod_aie = ExplicitlyManagedModule()
 
-    @aie.device(AIEDevice.ipu)
-    def ipu():
+    @aie.device(AIEDevice.npu)
+    def npu():
         matmul_i32_i32.emit(decl=True)
         tile_0_0 = aie.tile(0, 0)
         tile_0_1 = aie.tile(0, 1)
@@ -112,8 +112,8 @@ def ipu():
         channel_index = 0
         ddr_id = 0
         bd_id = 0
-        ipu_insts.extend(
-            aiex.ipu.writebd_shimtile(
+        npu_insts.extend(
+            aiex.npu.writebd_shimtile(
                 col,
                 bd_id,
                 buffer_length=M * K,
@@ -121,14 +121,14 @@ def ipu():
                 ddr_id=ddr_id,
             )
         )
-        ipu_insts.extend(aiex.ipu.shimtile_push_queue(MM2S, channel_index, col, bd_id))
+        npu_insts.extend(aiex.npu.shimtile_push_queue(MM2S, channel_index, col, bd_id))
 
         # in B
         channel_index = 1
         ddr_id = 1
         bd_id += 1
-        ipu_insts.extend(
-            aiex.ipu.writebd_shimtile(
+        npu_insts.extend(
+            aiex.npu.writebd_shimtile(
                 col,
                 bd_id,
                 buffer_length=K * N,
@@ -136,14 +136,14 @@ def ipu():
                 ddr_id=ddr_id,
             )
         )
-        ipu_insts.extend(aiex.ipu.shimtile_push_queue(MM2S, channel_index, col, bd_id))
+        npu_insts.extend(aiex.npu.shimtile_push_queue(MM2S, channel_index, col, bd_id))
 
         # out C
         channel_index = 0
         ddr_id = 2
         bd_id += 1
-        ipu_insts.extend(
-            aiex.ipu.writebd_shimtile(
+        npu_insts.extend(
+            aiex.npu.writebd_shimtile(
                 col,
                 bd_id,
                 buffer_length=M * N,
@@ -151,9 +151,9 @@ def ipu():
                 ddr_id=ddr_id,
             )
         )
-        ipu_insts.extend(aiex.ipu.shimtile_push_queue(S2MM, channel_index, col, bd_id))
-        ipu_insts.extend(
-            aiex.ipu.sync(
+        npu_insts.extend(aiex.npu.shimtile_push_queue(S2MM, channel_index, col, bd_id))
+        npu_insts.extend(
+            aiex.npu.sync(
                 channel=0,
                 column=0,
                 column_num=1,
@@ -327,9 +327,9 @@ def super_vectorize(target: any_op_t()):
 
     compile_with_vectorization(mod_aie, mod_aievec, workdir)
     xclbin_path = make_xclbin(mod_aie, workdir)
-    with FileLock("/tmp/ipu.lock"):
+    with FileLock("/tmp/npu.lock"):
         xclbin = XCLBin(xclbin_path, "MLIR_AIE")
-        xclbin.load_ipu_instructions(ipu_insts)
+        xclbin.load_npu_instructions(npu_insts)
         views = xclbin.mmap_buffers([(M, K), (K, N), (M, N)], np.int32)
 
         wrap_A = np.asarray(views[0])
@@ -359,11 +359,11 @@ def super_vectorize(target: any_op_t()):
 
 
 def test_nonsquare_matrix_mult_vectorized_sugar(ctx: MLIRContext, workdir: Path):
-    ipu_insts = aiex.ipu.get_prolog()
+    npu_insts = aiex.npu.get_prolog()
     mod_aie = ExplicitlyManagedModule()
 
-    @aie.device(AIEDevice.ipu)
-    def ipu():
+    @aie.device(AIEDevice.npu)
+    def npu():
         matmul_i32_i32.emit(decl=True)
         tile_0_0 = aie.tile(0, 0)
         tile_0_1 = aie.tile(0, 1)
@@ -402,8 +402,8 @@ def ipu():
         channel_index = 0
         ddr_id = 0
         bd_id = 0
-        ipu_insts.extend(
-            aiex.ipu.writebd_shimtile(
+        npu_insts.extend(
+            aiex.npu.writebd_shimtile(
                 col,
                 bd_id,
                 buffer_length=M * K,
@@ -411,14 +411,14 @@ def ipu():
                 ddr_id=ddr_id,
             )
         )
-        ipu_insts.extend(aiex.ipu.shimtile_push_queue(MM2S, channel_index, col, bd_id))
+        npu_insts.extend(aiex.npu.shimtile_push_queue(MM2S, channel_index, col, bd_id))
 
         # in B
         channel_index = 1
         ddr_id = 1
         bd_id += 1
-        ipu_insts.extend(
-            aiex.ipu.writebd_shimtile(
+        npu_insts.extend(
+            aiex.npu.writebd_shimtile(
                 col,
                 bd_id,
                 buffer_length=K * N,
@@ -426,14 +426,14 @@ def ipu():
                 ddr_id=ddr_id,
             )
         )
-        ipu_insts.extend(aiex.ipu.shimtile_push_queue(MM2S, channel_index, col, bd_id))
+        npu_insts.extend(aiex.npu.shimtile_push_queue(MM2S, channel_index, col, bd_id))
 
         # out C
         channel_index = 0
         ddr_id = 2
         bd_id += 1
-        ipu_insts.extend(
-            aiex.ipu.writebd_shimtile(
+        npu_insts.extend(
+            aiex.npu.writebd_shimtile(
                 col,
                 bd_id,
                 buffer_length=M * N,
@@ -441,9 +441,9 @@ def ipu():
                 ddr_id=ddr_id,
             )
         )
-        ipu_insts.extend(aiex.ipu.shimtile_push_queue(S2MM, channel_index, col, bd_id))
-        ipu_insts.extend(
-            aiex.ipu.sync(
+        npu_insts.extend(aiex.npu.shimtile_push_queue(S2MM, channel_index, col, bd_id))
+        npu_insts.extend(
+            aiex.npu.sync(
                 channel=0,
                 column=0,
                 column_num=1,
@@ -579,9 +579,9 @@ def super_vectorize(target: any_op_t()):
     )
     compile_with_vectorization(mod_aie, mod_aievec, workdir)
     xclbin_path = make_xclbin(mod_aie, workdir)
-    with FileLock("/tmp/ipu.lock"):
+    with FileLock("/tmp/npu.lock"):
         xclbin = XCLBin(xclbin_path, "MLIR_AIE")
-        xclbin.load_ipu_instructions(ipu_insts)
+        xclbin.load_npu_instructions(npu_insts)
         views = xclbin.mmap_buffers([(M, K), (K, N), (M, N)], np.int32)
 
         wrap_A = np.asarray(views[0])
diff --git a/test/ipu-xrt/e2e/test_offsets_sizes_strides.py b/test/npu-xrt/e2e/test_offsets_sizes_strides.py
similarity index 92%
rename from test/ipu-xrt/e2e/test_offsets_sizes_strides.py
rename to test/npu-xrt/e2e/test_offsets_sizes_strides.py
index 1262d59bf3..ecae0293ba 100644
--- a/test/ipu-xrt/e2e/test_offsets_sizes_strides.py
+++ b/test/npu-xrt/e2e/test_offsets_sizes_strides.py
@@ -54,10 +54,10 @@ def test_offsets_sizes_strides(ctx: MLIRContext, workdir: Path):
     tile_m_B, tile_n_B = M // tile_rows_B, N // tile_cols_B
     tile_m_C, tile_n_C = M // tile_rows_C, N // tile_cols_C
 
-    ipu_insts = aiex.ipu.get_prolog()
+    npu_insts = aiex.npu.get_prolog()
 
-    @aie.device(AIEDevice.ipu)
-    def ipu():
+    @aie.device(AIEDevice.npu)
+    def npu():
         tile_0_0 = aie.tile(0, 0)
         tile_0_1 = aie.tile(0, 1)
         tile_0_2 = aie.tile(0, 2)
@@ -102,8 +102,8 @@ def ipu():
         channel_index = 0
         ddr_id = 0
         for i, bd_id in enumerate(range(4)):
-            ipu_insts.extend(
-                aiex.ipu.writebd_shimtile(
+            npu_insts.extend(
+                aiex.npu.writebd_shimtile(
                     col,
                     bd_id,
                     64,
@@ -115,16 +115,16 @@ def ipu():
                     d0_stride=1,
                 )
             )
-            ipu_insts.extend(
-                aiex.ipu.shimtile_push_queue(MM2S, channel_index, col, bd_id)
+            npu_insts.extend(
+                aiex.npu.shimtile_push_queue(MM2S, channel_index, col, bd_id)
             )
 
         # in B
         channel_index = 1
         ddr_id = 1
         for i, bd_id in enumerate(range(bd_id + 1, bd_id + 1 + 4)):
-            ipu_insts.extend(
-                aiex.ipu.writebd_shimtile(
+            npu_insts.extend(
+                aiex.npu.writebd_shimtile(
                     col,
                     bd_id,
                     64,
@@ -136,16 +136,16 @@ def ipu():
                     d0_stride=1,
                 )
             )
-            ipu_insts.extend(
-                aiex.ipu.shimtile_push_queue(MM2S, channel_index, col, bd_id)
+            npu_insts.extend(
+                aiex.npu.shimtile_push_queue(MM2S, channel_index, col, bd_id)
             )
 
         # out C
         channel_index = 0
         ddr_id = 2
         for i, bd_id in enumerate(range(bd_id + 1, bd_id + 1 + 4)):
-            ipu_insts.extend(
-                aiex.ipu.writebd_shimtile(
+            npu_insts.extend(
+                aiex.npu.writebd_shimtile(
                     col,
                     bd_id,
                     64,
@@ -157,11 +157,11 @@ def ipu():
                     d0_stride=1,
                 )
             )
-            ipu_insts.extend(
-                aiex.ipu.shimtile_push_queue(S2MM, channel_index, col, bd_id)
+            npu_insts.extend(
+                aiex.npu.shimtile_push_queue(S2MM, channel_index, col, bd_id)
             )
-            ipu_insts.extend(
-                aiex.ipu.sync(
+            npu_insts.extend(
+                aiex.npu.sync(
                     channel=0, column=0, column_num=1, direction=0, row=0, row_num=1
                 )
             )
@@ -257,9 +257,9 @@ def core():
 
     compile_without_vectorization(ctx.module, workdir)
     xclbin_path = make_xclbin(ctx.module, workdir)
-    with FileLock("/tmp/ipu.lock"):
+    with FileLock("/tmp/npu.lock"):
         xclbin = XCLBin(xclbin_path, "MLIR_AIE")
-        xclbin.load_ipu_instructions(ipu_insts)
+        xclbin.load_npu_instructions(npu_insts)
         views = xclbin.mmap_buffers([(M, N), (M, N), (M, N)], np.int32)
 
         wrap_A = np.asarray(views[0])
diff --git a/test/ipu-xrt/e2e/test_repeat_count.py b/test/npu-xrt/e2e/test_repeat_count.py
similarity index 90%
rename from test/ipu-xrt/e2e/test_repeat_count.py
rename to test/npu-xrt/e2e/test_repeat_count.py
index c769770283..e350931920 100644
--- a/test/ipu-xrt/e2e/test_repeat_count.py
+++ b/test/npu-xrt/e2e/test_repeat_count.py
@@ -53,10 +53,10 @@ def test_repeat_count(ctx: MLIRContext, workdir: Path):
     iters = 4
     loop = False
     RANDOM_WEIGHT = np.random.randint(0, 10, (K,), dtype=np.int32)
-    ipu_insts = aiex.ipu.get_prolog()
+    npu_insts = aiex.npu.get_prolog()
 
-    @aie.device(AIEDevice.ipu)
-    def ipu():
+    @aie.device(AIEDevice.npu)
+    def npu():
         tile_0_0 = aie.tile(0, 0)
         tile_0_1 = aie.tile(0, 1)
         tile_0_2 = aie.tile(0, 2)
@@ -109,8 +109,8 @@ def dma6():
         ddr_id = 0
         col = 0
         for i, bd_id in enumerate(range(iters)):
-            ipu_insts.extend(
-                aiex.ipu.writebd_shimtile(
+            npu_insts.extend(
+                aiex.npu.writebd_shimtile(
                     col,
                     bd_id,
                     buffer_length=K,
@@ -118,11 +118,11 @@ def dma6():
                     ddr_id=ddr_id,
                 )
             )
-            ipu_insts.extend(
-                aiex.ipu.shimtile_push_queue(S2MM, channel_index, col, bd_id)
+            npu_insts.extend(
+                aiex.npu.shimtile_push_queue(S2MM, channel_index, col, bd_id)
             )
-            ipu_insts.extend(
-                aiex.ipu.sync(
+            npu_insts.extend(
+                aiex.npu.sync(
                     channel=0,
                     column=col,
                     column_num=1,
@@ -136,9 +136,9 @@ def dma6():
 
     compile_without_vectorization(ctx.module, workdir)
     xclbin_path = make_xclbin(ctx.module, workdir)
-    with FileLock("/tmp/ipu.lock"):
+    with FileLock("/tmp/npu.lock"):
         xclbin = XCLBin(xclbin_path, "MLIR_AIE")
-        xclbin.load_ipu_instructions(ipu_insts)
+        xclbin.load_npu_instructions(npu_insts)
         views = xclbin.mmap_buffers([(iters * K,)], np.int32)
 
         wrap_C = np.asarray(views[0])
@@ -165,10 +165,10 @@ def test_no_loop(ctx: MLIRContext, workdir: Path):
     RANDOM_WEIGHT = np.ones((K,), dtype=np.int32) * random.randint(1, 100)
     col = 2
     iters = 10
-    ipu_insts = aiex.ipu.get_prolog()
+    npu_insts = aiex.npu.get_prolog()
 
-    @aie.device(AIEDevice.ipu)
-    def ipu():
+    @aie.device(AIEDevice.npu)
+    def npu():
         nonlocal col
 
         if col != 0:
@@ -206,28 +206,28 @@ def dma3():
     compile_without_vectorization(ctx.module, workdir)
     xclbin_path = make_xclbin(ctx.module, workdir)
 
-    with FileLock("/tmp/ipu.lock"):
+    with FileLock("/tmp/npu.lock"):
         xclbin = XCLBin(xclbin_path, "MLIR_AIE")
         views = xclbin.mmap_buffers([(K,)], np.int32)
 
         channel_index = 0
         ddr_id = 0
         bd_id = 0
-        ipu_insts.extend(
-            aiex.ipu.writebd_shimtile(
+        npu_insts.extend(
+            aiex.npu.writebd_shimtile(
                 col,
                 bd_id,
                 buffer_length=K,
                 ddr_id=ddr_id,
             )
         )
-        ipu_insts.extend(
-            aiex.ipu.shimtile_push_queue(
+        npu_insts.extend(
+            aiex.npu.shimtile_push_queue(
                 S2MM, channel_index, col, bd_id, repeats=iters - 1
             )
         )
-        ipu_insts.extend(
-            aiex.ipu.sync(
+        npu_insts.extend(
+            aiex.npu.sync(
                 channel=0,
                 column=col,
                 column_num=1,
@@ -237,7 +237,7 @@ def dma3():
             )
         )
 
-        xclbin.load_ipu_instructions(ipu_insts)
+        xclbin.load_npu_instructions(npu_insts)
 
         wraps = list(map(np.asarray, views))
 
diff --git a/test/ipu-xrt/e2e/test_shared_buffers_init_value.py b/test/npu-xrt/e2e/test_shared_buffers_init_value.py
similarity index 95%
rename from test/ipu-xrt/e2e/test_shared_buffers_init_value.py
rename to test/npu-xrt/e2e/test_shared_buffers_init_value.py
index 461031ebb3..e8fb6b4ebe 100644
--- a/test/ipu-xrt/e2e/test_shared_buffers_init_value.py
+++ b/test/npu-xrt/e2e/test_shared_buffers_init_value.py
@@ -38,10 +38,10 @@ def test_foursome(ctx: MLIRContext, workdir: Path):
     init_weights = [np.random.randint(0, 10, (K,), dtype=np.int32) for _ in range(7)]
     random_numbers = [random.randint(0, 10) for _ in range(7, 7 + 3)]
 
-    ipu_insts = aiex.ipu.get_prolog()
+    npu_insts = aiex.npu.get_prolog()
 
-    @aie.device(AIEDevice.ipu)
-    def ipu():
+    @aie.device(AIEDevice.npu)
+    def npu():
         _dummy_tile = aie.tile(0, 2)
 
         # west
@@ -170,8 +170,8 @@ def memtile_dma():
 
         ddr_id = 0
         bd_id = 0
-        ipu_insts.extend(
-            aiex.ipu.writebd_shimtile(
+        npu_insts.extend(
+            aiex.npu.writebd_shimtile(
                 column=shim_tile_column,
                 bd_id=bd_id,
                 buffer_length=K,
@@ -179,16 +179,16 @@ def memtile_dma():
                 ddr_id=ddr_id,
             )
         )
-        ipu_insts.extend(
-            aiex.ipu.shimtile_push_queue(
+        npu_insts.extend(
+            aiex.npu.shimtile_push_queue(
                 channel_dir=S2MM,
                 channel_index=flow_to_shim.dest_channel,
                 column=shim_tile_column,
                 bd_id=bd_id,
             )
         )
-        ipu_insts.extend(
-            aiex.ipu.sync(
+        npu_insts.extend(
+            aiex.npu.sync(
                 channel=flow_to_shim.dest_channel,
                 column=shim_tile_column,
                 direction=S2MM,
@@ -198,9 +198,9 @@ def memtile_dma():
 
     compile_without_vectorization(ctx.module, workdir)
     xclbin_path = make_xclbin(ctx.module, workdir)
-    with FileLock("/tmp/ipu.lock"):
+    with FileLock("/tmp/npu.lock"):
         xclbin = XCLBin(xclbin_path, "MLIR_AIE")
-        xclbin.load_ipu_instructions(ipu_insts)
+        xclbin.load_npu_instructions(npu_insts)
         [c] = xclbin.mmap_buffers([(K,)], np.int32)
         wrap_C = np.asarray(c)
         C = np.zeros((K,), dtype=np.int32)
diff --git a/test/ipu-xrt/e2e/test_square_matrix_mult.py b/test/npu-xrt/e2e/test_square_matrix_mult.py
similarity index 91%
rename from test/ipu-xrt/e2e/test_square_matrix_mult.py
rename to test/npu-xrt/e2e/test_square_matrix_mult.py
index 6f746fc490..b229c3a1a8 100644
--- a/test/ipu-xrt/e2e/test_square_matrix_mult.py
+++ b/test/npu-xrt/e2e/test_square_matrix_mult.py
@@ -45,10 +45,10 @@
 def test_square_matrix_mult(ctx: MLIRContext, workdir: Path):
     M = N = 16
 
-    ipu_insts = aiex.ipu.get_prolog()
+    npu_insts = aiex.npu.get_prolog()
 
-    @aie.device(AIEDevice.ipu)
-    def ipu():
+    @aie.device(AIEDevice.npu)
+    def npu():
         tile_0_0 = aie.tile(0, 0)
         tile_0_1 = aie.tile(0, 1)
         tile_0_2 = aie.tile(0, 2)
@@ -91,8 +91,8 @@ def ipu():
         channel_index = 0
         ddr_id = 0
         bd_id = 0
-        ipu_insts.extend(
-            aiex.ipu.writebd_shimtile(
+        npu_insts.extend(
+            aiex.npu.writebd_shimtile(
                 col,
                 bd_id,
                 buffer_length=M * N,
@@ -100,14 +100,14 @@ def ipu():
                 ddr_id=ddr_id,
             )
         )
-        ipu_insts.extend(aiex.ipu.shimtile_push_queue(MM2S, channel_index, col, bd_id))
+        npu_insts.extend(aiex.npu.shimtile_push_queue(MM2S, channel_index, col, bd_id))
 
         # in B
         channel_index = 1
         ddr_id = 1
         bd_id += 1
-        ipu_insts.extend(
-            aiex.ipu.writebd_shimtile(
+        npu_insts.extend(
+            aiex.npu.writebd_shimtile(
                 col,
                 bd_id,
                 buffer_length=M * N,
@@ -115,14 +115,14 @@ def ipu():
                 ddr_id=ddr_id,
             )
         )
-        ipu_insts.extend(aiex.ipu.shimtile_push_queue(MM2S, channel_index, col, bd_id))
+        npu_insts.extend(aiex.npu.shimtile_push_queue(MM2S, channel_index, col, bd_id))
 
         # out C
         channel_index = 0
         ddr_id = 2
         bd_id += 1
-        ipu_insts.extend(
-            aiex.ipu.writebd_shimtile(
+        npu_insts.extend(
+            aiex.npu.writebd_shimtile(
                 col,
                 bd_id,
                 buffer_length=M * N,
@@ -130,9 +130,9 @@ def ipu():
                 ddr_id=ddr_id,
             )
         )
-        ipu_insts.extend(aiex.ipu.shimtile_push_queue(S2MM, channel_index, col, bd_id))
-        ipu_insts.extend(
-            aiex.ipu.sync(
+        npu_insts.extend(aiex.npu.shimtile_push_queue(S2MM, channel_index, col, bd_id))
+        npu_insts.extend(
+            aiex.npu.sync(
                 channel=0,
                 column=0,
                 column_num=1,
@@ -229,9 +229,9 @@ def core():
 
     compile_without_vectorization(ctx.module, workdir)
     xclbin_path = make_xclbin(ctx.module, workdir)
-    with FileLock("/tmp/ipu.lock"):
+    with FileLock("/tmp/npu.lock"):
         xclbin = XCLBin(xclbin_path, "MLIR_AIE")
-        xclbin.load_ipu_instructions(ipu_insts)
+        xclbin.load_npu_instructions(npu_insts)
         views = xclbin.mmap_buffers([(M, N), (M, N), (M, N)], np.int32)
 
         wrap_A = np.asarray(views[0])
@@ -262,10 +262,10 @@ def core():
 def test_square_matrix_mult_sugar(ctx: MLIRContext, workdir: Path):
     M = N = 16
 
-    ipu_insts = aiex.ipu.get_prolog()
+    npu_insts = aiex.npu.get_prolog()
 
-    @aie.device(AIEDevice.ipu)
-    def ipu():
+    @aie.device(AIEDevice.npu)
+    def npu():
         tile_0_0 = aie.tile(0, 0)
         tile_0_1 = aie.tile(0, 1)
         tile_0_2 = aie.tile(0, 2)
@@ -299,8 +299,8 @@ def ipu():
         channel_index = 0
         ddr_id = 0
         bd_id = 0
-        ipu_insts.extend(
-            aiex.ipu.writebd_shimtile(
+        npu_insts.extend(
+            aiex.npu.writebd_shimtile(
                 col,
                 bd_id,
                 buffer_length=M * N,
@@ -308,14 +308,14 @@ def ipu():
                 ddr_id=ddr_id,
             )
         )
-        ipu_insts.extend(aiex.ipu.shimtile_push_queue(MM2S, channel_index, col, bd_id))
+        npu_insts.extend(aiex.npu.shimtile_push_queue(MM2S, channel_index, col, bd_id))
 
         # in B
         channel_index = 1
         ddr_id = 1
         bd_id += 1
-        ipu_insts.extend(
-            aiex.ipu.writebd_shimtile(
+        npu_insts.extend(
+            aiex.npu.writebd_shimtile(
                 col,
                 bd_id,
                 buffer_length=M * N,
@@ -323,14 +323,14 @@ def ipu():
                 ddr_id=ddr_id,
             )
         )
-        ipu_insts.extend(aiex.ipu.shimtile_push_queue(MM2S, channel_index, col, bd_id))
+        npu_insts.extend(aiex.npu.shimtile_push_queue(MM2S, channel_index, col, bd_id))
 
         # out C
         channel_index = 0
         ddr_id = 2
         bd_id += 1
-        ipu_insts.extend(
-            aiex.ipu.writebd_shimtile(
+        npu_insts.extend(
+            aiex.npu.writebd_shimtile(
                 col,
                 bd_id,
                 buffer_length=M * N,
@@ -338,9 +338,9 @@ def ipu():
                 ddr_id=ddr_id,
             )
         )
-        ipu_insts.extend(aiex.ipu.shimtile_push_queue(S2MM, channel_index, col, bd_id))
-        ipu_insts.extend(
-            aiex.ipu.sync(
+        npu_insts.extend(aiex.npu.shimtile_push_queue(S2MM, channel_index, col, bd_id))
+        npu_insts.extend(
+            aiex.npu.sync(
                 channel=0,
                 column=0,
                 column_num=1,
@@ -397,9 +397,9 @@ def core():
 
     compile_without_vectorization(ctx.module, workdir)
     xclbin_path = make_xclbin(ctx.module, workdir)
-    with FileLock("/tmp/ipu.lock"):
+    with FileLock("/tmp/npu.lock"):
         xclbin = XCLBin(xclbin_path, "MLIR_AIE")
-        xclbin.load_ipu_instructions(ipu_insts)
+        xclbin.load_npu_instructions(npu_insts)
         views = xclbin.mmap_buffers([(M, N), (M, N), (M, N)], np.int32)
 
         wrap_A = np.asarray(views[0])
diff --git a/test/ipu-xrt/e2e/test_square_matrix_mult_vectorized.py b/test/npu-xrt/e2e/test_square_matrix_mult_vectorized.py
similarity index 94%
rename from test/ipu-xrt/e2e/test_square_matrix_mult_vectorized.py
rename to test/npu-xrt/e2e/test_square_matrix_mult_vectorized.py
index b11e4463f8..55a8feeb3c 100644
--- a/test/ipu-xrt/e2e/test_square_matrix_mult_vectorized.py
+++ b/test/npu-xrt/e2e/test_square_matrix_mult_vectorized.py
@@ -64,12 +64,12 @@ def matmul_i32_i32(
 
 
 def test_square_matrix_mult_vectorized(ctx: MLIRContext, workdir: Path):
-    ipu_insts = aiex.ipu.get_prolog()
+    npu_insts = aiex.npu.get_prolog()
 
     mod_aie = ExplicitlyManagedModule()
 
-    @aie.device(AIEDevice.ipu)
-    def ipu():
+    @aie.device(AIEDevice.npu)
+    def npu():
         matmul_i32_i32.emit(decl=True)
         tile_0_0 = aie.tile(0, 0)
         tile_0_1 = aie.tile(0, 1)
@@ -113,8 +113,8 @@ def ipu():
         channel_index = 0
         ddr_id = 0
         bd_id = 0
-        ipu_insts.extend(
-            aiex.ipu.writebd_shimtile(
+        npu_insts.extend(
+            aiex.npu.writebd_shimtile(
                 col,
                 bd_id,
                 buffer_length=M * N,
@@ -122,14 +122,14 @@ def ipu():
                 ddr_id=ddr_id,
             )
         )
-        ipu_insts.extend(aiex.ipu.shimtile_push_queue(MM2S, channel_index, col, bd_id))
+        npu_insts.extend(aiex.npu.shimtile_push_queue(MM2S, channel_index, col, bd_id))
 
         # in B
         channel_index = 1
         ddr_id = 1
         bd_id += 1
-        ipu_insts.extend(
-            aiex.ipu.writebd_shimtile(
+        npu_insts.extend(
+            aiex.npu.writebd_shimtile(
                 col,
                 bd_id,
                 buffer_length=M * N,
@@ -137,14 +137,14 @@ def ipu():
                 ddr_id=ddr_id,
             )
         )
-        ipu_insts.extend(aiex.ipu.shimtile_push_queue(MM2S, channel_index, col, bd_id))
+        npu_insts.extend(aiex.npu.shimtile_push_queue(MM2S, channel_index, col, bd_id))
 
         # out C
         channel_index = 0
         ddr_id = 2
         bd_id += 1
-        ipu_insts.extend(
-            aiex.ipu.writebd_shimtile(
+        npu_insts.extend(
+            aiex.npu.writebd_shimtile(
                 col,
                 bd_id,
                 buffer_length=M * N,
@@ -152,9 +152,9 @@ def ipu():
                 ddr_id=ddr_id,
             )
         )
-        ipu_insts.extend(aiex.ipu.shimtile_push_queue(S2MM, channel_index, col, bd_id))
-        ipu_insts.extend(
-            aiex.ipu.sync(
+        npu_insts.extend(aiex.npu.shimtile_push_queue(S2MM, channel_index, col, bd_id))
+        npu_insts.extend(
+            aiex.npu.sync(
                 channel=0,
                 column=0,
                 column_num=1,
@@ -326,9 +326,9 @@ def super_vectorize(target: any_op_t()):
 
     compile_with_vectorization(mod_aie, mod_aievec, workdir)
     xclbin_path = make_xclbin(mod_aie, workdir)
-    with FileLock("/tmp/ipu.lock"):
+    with FileLock("/tmp/npu.lock"):
         xclbin = XCLBin(xclbin_path, "MLIR_AIE")
-        xclbin.load_ipu_instructions(ipu_insts)
+        xclbin.load_npu_instructions(npu_insts)
         views = xclbin.mmap_buffers([(M, N), (M, N), (M, N)], np.int32)
 
         wrap_A = np.asarray(views[0])
@@ -358,11 +358,11 @@ def super_vectorize(target: any_op_t()):
 
 
 def test_square_matrix_mult_vectorized_sugar(ctx: MLIRContext, workdir: Path):
-    ipu_insts = aiex.ipu.get_prolog()
+    npu_insts = aiex.npu.get_prolog()
     mod_aie = ExplicitlyManagedModule()
 
-    @aie.device(AIEDevice.ipu)
-    def ipu():
+    @aie.device(AIEDevice.npu)
+    def npu():
         matmul_i32_i32.emit(decl=True)
         tile_0_0 = aie.tile(0, 0)
         tile_0_1 = aie.tile(0, 1)
@@ -401,8 +401,8 @@ def ipu():
         channel_index = 0
         ddr_id = 0
         bd_id = 0
-        ipu_insts.extend(
-            aiex.ipu.writebd_shimtile(
+        npu_insts.extend(
+            aiex.npu.writebd_shimtile(
                 col,
                 bd_id,
                 buffer_length=M * N,
@@ -410,14 +410,14 @@ def ipu():
                 ddr_id=ddr_id,
             )
         )
-        ipu_insts.extend(aiex.ipu.shimtile_push_queue(MM2S, channel_index, col, bd_id))
+        npu_insts.extend(aiex.npu.shimtile_push_queue(MM2S, channel_index, col, bd_id))
 
         # in B
         channel_index = 1
         ddr_id = 1
         bd_id += 1
-        ipu_insts.extend(
-            aiex.ipu.writebd_shimtile(
+        npu_insts.extend(
+            aiex.npu.writebd_shimtile(
                 col,
                 bd_id,
                 buffer_length=M * N,
@@ -425,14 +425,14 @@ def ipu():
                 ddr_id=ddr_id,
             )
         )
-        ipu_insts.extend(aiex.ipu.shimtile_push_queue(MM2S, channel_index, col, bd_id))
+        npu_insts.extend(aiex.npu.shimtile_push_queue(MM2S, channel_index, col, bd_id))
 
         # out C
         channel_index = 0
         ddr_id = 2
         bd_id += 1
-        ipu_insts.extend(
-            aiex.ipu.writebd_shimtile(
+        npu_insts.extend(
+            aiex.npu.writebd_shimtile(
                 col,
                 bd_id,
                 buffer_length=M * N,
@@ -440,9 +440,9 @@ def ipu():
                 ddr_id=ddr_id,
             )
         )
-        ipu_insts.extend(aiex.ipu.shimtile_push_queue(S2MM, channel_index, col, bd_id))
-        ipu_insts.extend(
-            aiex.ipu.sync(
+        npu_insts.extend(aiex.npu.shimtile_push_queue(S2MM, channel_index, col, bd_id))
+        npu_insts.extend(
+            aiex.npu.sync(
                 channel=0,
                 column=0,
                 column_num=1,
@@ -581,9 +581,9 @@ def super_vectorize(target: any_op_t()):
 
     compile_with_vectorization(mod_aie, mod_aievec, workdir)
     xclbin_path = make_xclbin(mod_aie, workdir)
-    with FileLock("/tmp/ipu.lock"):
+    with FileLock("/tmp/npu.lock"):
         xclbin = XCLBin(xclbin_path, "MLIR_AIE")
-        xclbin.load_ipu_instructions(ipu_insts)
+        xclbin.load_npu_instructions(npu_insts)
         views = xclbin.mmap_buffers([(M, N), (M, N), (M, N)], np.int32)
 
         wrap_A = np.asarray(views[0])
diff --git a/test/ipu-xrt/e2e/test_tiled_matrix_add.py b/test/npu-xrt/e2e/test_tiled_matrix_add.py
similarity index 92%
rename from test/ipu-xrt/e2e/test_tiled_matrix_add.py
rename to test/npu-xrt/e2e/test_tiled_matrix_add.py
index 00755104ed..21d90f3231 100644
--- a/test/ipu-xrt/e2e/test_tiled_matrix_add.py
+++ b/test/npu-xrt/e2e/test_tiled_matrix_add.py
@@ -47,10 +47,10 @@ def test_tiled_matrix_add(ctx: MLIRContext, workdir: Path):
     _, _, (d1_size, d1_stride), (d0_size, d0_stride) = tiling_calculator_n_tiles(
         M, N, n_tile_rows=n_tile_rows, n_tile_cols=n_tile_cols
     )
-    ipu_insts = aiex.ipu.get_prolog()
+    npu_insts = aiex.npu.get_prolog()
 
-    @aie.device(AIEDevice.ipu)
-    def ipu():
+    @aie.device(AIEDevice.npu)
+    def npu():
         tile_0_0 = aie.tile(0, 0)
         tile_0_1 = aie.tile(0, 1)
         tile_0_2 = aie.tile(0, 2)
@@ -100,8 +100,8 @@ def ipu():
         channel_index = 0
         ddr_id = 0
         for i, bd_id in enumerate(range(4)):
-            ipu_insts.extend(
-                aiex.ipu.writebd_shimtile(
+            npu_insts.extend(
+                aiex.npu.writebd_shimtile(
                     col,
                     bd_id,
                     tile_rows * tile_cols,
@@ -113,16 +113,16 @@ def ipu():
                     d0_stride=d0_stride,
                 )
             )
-            ipu_insts.extend(
-                aiex.ipu.shimtile_push_queue(MM2S, channel_index, col, bd_id)
+            npu_insts.extend(
+                aiex.npu.shimtile_push_queue(MM2S, channel_index, col, bd_id)
             )
 
         # in B
         channel_index = 1
         ddr_id = 1
         for i, bd_id in enumerate(range(bd_id + 1, bd_id + 1 + 4)):
-            ipu_insts.extend(
-                aiex.ipu.writebd_shimtile(
+            npu_insts.extend(
+                aiex.npu.writebd_shimtile(
                     col,
                     bd_id,
                     tile_rows * tile_cols,
@@ -134,16 +134,16 @@ def ipu():
                     d0_stride=d0_stride,
                 )
             )
-            ipu_insts.extend(
-                aiex.ipu.shimtile_push_queue(MM2S, channel_index, col, bd_id)
+            npu_insts.extend(
+                aiex.npu.shimtile_push_queue(MM2S, channel_index, col, bd_id)
             )
 
         # out C
         channel_index = 0
         ddr_id = 2
         for i, bd_id in enumerate(range(bd_id + 1, bd_id + 1 + 4)):
-            ipu_insts.extend(
-                aiex.ipu.writebd_shimtile(
+            npu_insts.extend(
+                aiex.npu.writebd_shimtile(
                     col,
                     bd_id,
                     tile_rows * tile_cols,
@@ -155,11 +155,11 @@ def ipu():
                     d0_stride=d0_stride,
                 )
             )
-            ipu_insts.extend(
-                aiex.ipu.shimtile_push_queue(S2MM, channel_index, col, bd_id)
+            npu_insts.extend(
+                aiex.npu.shimtile_push_queue(S2MM, channel_index, col, bd_id)
             )
-            ipu_insts.extend(
-                aiex.ipu.sync(
+            npu_insts.extend(
+                aiex.npu.sync(
                     channel=0, column=0, column_num=1, direction=0, row=0, row_num=1
                 )
             )
@@ -258,9 +258,9 @@ def core():
 
     compile_without_vectorization(ctx.module, workdir)
     xclbin_path = make_xclbin(ctx.module, workdir)
-    with FileLock("/tmp/ipu.lock"):
+    with FileLock("/tmp/npu.lock"):
         xclbin = XCLBin(xclbin_path, "MLIR_AIE")
-        xclbin.load_ipu_instructions(ipu_insts)
+        xclbin.load_npu_instructions(npu_insts)
         views = xclbin.mmap_buffers([(M, N), (M, N), (M, N)], np.int32)
 
         wrap_A = np.asarray(views[0])
@@ -291,10 +291,10 @@ def test_matrix_add_sugar(ctx: MLIRContext, workdir: Path):
     _, _, (d1_size, d1_stride), (d0_size, d0_stride) = tiling_calculator_n_tiles(
         M, N, n_tile_rows=n_tile_rows, n_tile_cols=n_tile_cols
     )
-    ipu_insts = aiex.ipu.get_prolog()
+    npu_insts = aiex.npu.get_prolog()
 
-    @aie.device(AIEDevice.ipu)
-    def ipu():
+    @aie.device(AIEDevice.npu)
+    def npu():
         shim_tile_0_0 = aie.tile(0, 0)
         mem_tile_0_1 = aie.tile(0, 1)
         compute_tile_0_2 = aie.tile(0, 2)
@@ -359,8 +359,8 @@ def ipu():
         # in A
         ddr_id = 0
         for i, bd_id in enumerate(range(4)):
-            ipu_insts.extend(
-                aiex.ipu.writebd_shimtile(
+            npu_insts.extend(
+                aiex.npu.writebd_shimtile(
                     col,
                     bd_id,
                     tile_rows * tile_cols,
@@ -372,8 +372,8 @@ def ipu():
                     d0_stride=d0_stride,
                 )
             )
-            ipu_insts.extend(
-                aiex.ipu.shimtile_push_queue(
+            npu_insts.extend(
+                aiex.npu.shimtile_push_queue(
                     MM2S, input_a_tile_0_0_to_tile_0_1.source_channel, col, bd_id
                 )
             )
@@ -381,8 +381,8 @@ def ipu():
         # in B
         ddr_id = 1
         for i, bd_id in enumerate(range(bd_id + 1, bd_id + 1 + 4)):
-            ipu_insts.extend(
-                aiex.ipu.writebd_shimtile(
+            npu_insts.extend(
+                aiex.npu.writebd_shimtile(
                     col,
                     bd_id,
                     tile_rows * tile_cols,
@@ -394,8 +394,8 @@ def ipu():
                     d0_stride=d0_stride,
                 )
             )
-            ipu_insts.extend(
-                aiex.ipu.shimtile_push_queue(
+            npu_insts.extend(
+                aiex.npu.shimtile_push_queue(
                     MM2S, input_b_tile_0_0_to_tile_0_1.source_channel, col, bd_id
                 )
             )
@@ -403,8 +403,8 @@ def ipu():
         # out C
         ddr_id = 2
         for i, bd_id in enumerate(range(bd_id + 1, bd_id + 1 + 4)):
-            ipu_insts.extend(
-                aiex.ipu.writebd_shimtile(
+            npu_insts.extend(
+                aiex.npu.writebd_shimtile(
                     col,
                     bd_id,
                     tile_rows * tile_cols,
@@ -416,13 +416,13 @@ def ipu():
                     d0_stride=d0_stride,
                 )
             )
-            ipu_insts.extend(
-                aiex.ipu.shimtile_push_queue(
+            npu_insts.extend(
+                aiex.npu.shimtile_push_queue(
                     S2MM, output_c_tile_0_1_to_tile_0_0.dest_channel, col, bd_id
                 )
             )
-            ipu_insts.extend(
-                aiex.ipu.sync(
+            npu_insts.extend(
+                aiex.npu.sync(
                     channel=0, column=0, column_num=1, direction=0, row=0, row_num=1
                 )
             )
@@ -496,9 +496,9 @@ def core():
 
     compile_without_vectorization(ctx.module, workdir)
     xclbin_path = make_xclbin(ctx.module, workdir)
-    with FileLock("/tmp/ipu.lock"):
+    with FileLock("/tmp/npu.lock"):
         xclbin = XCLBin(xclbin_path, "MLIR_AIE")
-        xclbin.load_ipu_instructions(ipu_insts)
+        xclbin.load_npu_instructions(npu_insts)
         views = xclbin.mmap_buffers([(M, N), (M, N), (M, N)], np.int32)
 
         wrap_A = np.asarray(views[0])
diff --git a/test/ipu-xrt/e2e/test_tiled_nonsquare_spatial_tile_matrix_mult.py b/test/npu-xrt/e2e/test_tiled_nonsquare_spatial_tile_matrix_mult.py
similarity index 93%
rename from test/ipu-xrt/e2e/test_tiled_nonsquare_spatial_tile_matrix_mult.py
rename to test/npu-xrt/e2e/test_tiled_nonsquare_spatial_tile_matrix_mult.py
index 1b19015d33..4e5e41615b 100644
--- a/test/ipu-xrt/e2e/test_tiled_nonsquare_spatial_tile_matrix_mult.py
+++ b/test/npu-xrt/e2e/test_tiled_nonsquare_spatial_tile_matrix_mult.py
@@ -70,7 +70,7 @@ def shim_tensor_slice(
         M, N, n_tile_rows=n_tile_rows, n_tile_cols=n_tile_cols
     )
 
-    ipu_insts = aiex.ipu.writebd_shimtile(
+    npu_insts = aiex.npu.writebd_shimtile(
         column=column,
         bd_id=bd_id,
         ddr_id=ddr_id,
@@ -81,23 +81,23 @@ def shim_tensor_slice(
         d0_size=d0_size,
         d0_stride=d0_stride,
     )
-    ipu_insts.extend(
-        aiex.ipu.shimtile_push_queue(channel_dir, channel_index, column, bd_id=bd_id)
+    npu_insts.extend(
+        aiex.npu.shimtile_push_queue(channel_dir, channel_index, column, bd_id=bd_id)
     )
-    return ipu_insts
+    return npu_insts
 
 
 def shim_bd(direction, channel, buffer_length, column=0, bd_id=0, ddr_id=0):
-    ipu_insts = []
-    ipu_insts.extend(
-        aiex.ipu.writebd_shimtile(
+    npu_insts = []
+    npu_insts.extend(
+        aiex.npu.writebd_shimtile(
             column=column, bd_id=bd_id, ddr_id=ddr_id, buffer_length=buffer_length
         )
     )
-    ipu_insts.extend(
-        aiex.ipu.shimtile_push_queue(direction, channel, column, bd_id=bd_id)
+    npu_insts.extend(
+        aiex.npu.shimtile_push_queue(direction, channel, column, bd_id=bd_id)
     )
-    return ipu_insts
+    return npu_insts
 
 
 def test_tiled_nonsquare_tile_spatial_2x2(ctx: MLIRContext, workdir: Path):
@@ -136,10 +136,10 @@ def test_tiled_nonsquare_tile_spatial_2x2(ctx: MLIRContext, workdir: Path):
         M, N, n_tile_rows=tile_rows_C, n_tile_cols=tile_cols_C
     )
 
-    ipu_insts = aiex.ipu.get_prolog()
+    npu_insts = aiex.npu.get_prolog()
 
-    @aie.device(AIEDevice.ipu)
-    def ipu():
+    @aie.device(AIEDevice.npu)
+    def npu():
         # col a0 (top row of matrix products)
         tiles = np.empty((5, 6), dtype=object)
         for col in [0, 1]:
@@ -167,17 +167,17 @@ def ipu():
         # fmt: off
         column = 0
         # broadcast a0
-        ipu_insts.extend(shim_tensor_slice(M, N, tile_rows_A, tile_cols_A, 0, column, MM2S, broadcast_a0_flow_ep.source_channel, 0, 0))
+        npu_insts.extend(shim_tensor_slice(M, N, tile_rows_A, tile_cols_A, 0, column, MM2S, broadcast_a0_flow_ep.source_channel, 0, 0))
         # broadcast b0
-        ipu_insts.extend(shim_tensor_slice(M, N, tile_rows_B, tile_cols_B, 0, column, MM2S, broadcast_b0_flow_ep.source_channel, 1, 1))
+        npu_insts.extend(shim_tensor_slice(M, N, tile_rows_B, tile_cols_B, 0, column, MM2S, broadcast_b0_flow_ep.source_channel, 1, 1))
 
         column = 1
         # broadcast a1
-        ipu_insts.extend(
+        npu_insts.extend(
             shim_tensor_slice(M, N, tile_rows_A, tile_cols_A, d1_size_A * d1_stride_A, column, MM2S, broadcast_a1_flow_ep.source_channel, 0, 0)
         )
         # broadcast b1
-        ipu_insts.extend(
+        npu_insts.extend(
             shim_tensor_slice(M, N, tile_rows_B, tile_cols_B, d0_size_B * d0_stride_B, column, MM2S, broadcast_b1_flow_ep.source_channel, 1, 1)
         )
         # fmt: on
@@ -339,15 +339,15 @@ def memtile_dma_c_1():
 
         # fmt: off
         for i, (column, channel, bd_id) in enumerate(channels):
-            ipu_insts.extend(shim_tensor_slice(M, N, tile_rows_C, tile_cols_C, offsets[i], column, S2MM, channel, bd_id, 2))
-            ipu_insts.extend(aiex.ipu.sync(channel=channel, column=column))
+            npu_insts.extend(shim_tensor_slice(M, N, tile_rows_C, tile_cols_C, offsets[i], column, S2MM, channel, bd_id, 2))
+            npu_insts.extend(aiex.npu.sync(channel=channel, column=column))
         # fmt: on
 
     compile_without_vectorization(ctx.module, workdir)
     xclbin_path = make_xclbin(ctx.module, workdir)
-    with FileLock("/tmp/ipu.lock"):
+    with FileLock("/tmp/npu.lock"):
         xclbin = XCLBin(xclbin_path, "MLIR_AIE")
-        xclbin.load_ipu_instructions(ipu_insts)
+        xclbin.load_npu_instructions(npu_insts)
         views = xclbin.mmap_buffers([(M, N), (M, N), (M, N)], np.int32)
 
         wrap_A = np.asarray(views[0])
@@ -453,7 +453,7 @@ def test_tiled_nonsquare_tile_spatial_2x2_vectorized(ctx: MLIRContext, workdir:
         M, N, n_tile_rows=tile_rows_C, n_tile_cols=tile_cols_C
     )
 
-    ipu_insts = aiex.ipu.get_prolog()
+    npu_insts = aiex.npu.get_prolog()
 
     mod_aievec = ExplicitlyManagedModule()
     kernel = matmul_i32_i32_already_vectorized.emit(force=True)
@@ -462,8 +462,8 @@ def test_tiled_nonsquare_tile_spatial_2x2_vectorized(ctx: MLIRContext, workdir:
 
     mod_aie = ExplicitlyManagedModule()
 
-    @aie.device(AIEDevice.ipu)
-    def ipu():
+    @aie.device(AIEDevice.npu)
+    def npu():
         matmul_i32_i32_already_vectorized.emit(decl=True)
         # col a0 (top row of matrix products)
         tiles = np.empty((5, 6), dtype=object)
@@ -492,17 +492,17 @@ def ipu():
         # fmt: off
         column = 0
         # broadcast a0
-        ipu_insts.extend(shim_tensor_slice(M, N, tile_rows_A, tile_cols_A, 0, column, MM2S, broadcast_a0_flow_ep.source_channel, 0, 0))
+        npu_insts.extend(shim_tensor_slice(M, N, tile_rows_A, tile_cols_A, 0, column, MM2S, broadcast_a0_flow_ep.source_channel, 0, 0))
         # broadcast b0
-        ipu_insts.extend(shim_tensor_slice(M, N, tile_rows_B, tile_cols_B, 0, column, MM2S, broadcast_b0_flow_ep.source_channel, 1, 1))
+        npu_insts.extend(shim_tensor_slice(M, N, tile_rows_B, tile_cols_B, 0, column, MM2S, broadcast_b0_flow_ep.source_channel, 1, 1))
 
         column = 1
         # broadcast a1
-        ipu_insts.extend(
+        npu_insts.extend(
             shim_tensor_slice(M, N, tile_rows_A, tile_cols_A, d1_size_A * d1_stride_A, column, MM2S, broadcast_a1_flow_ep.source_channel, 0, 0)
         )
         # broadcast b1
-        ipu_insts.extend(
+        npu_insts.extend(
             shim_tensor_slice(M, N, tile_rows_B, tile_cols_B, d0_size_B * d0_stride_B, column, MM2S, broadcast_b1_flow_ep.source_channel, 1, 1)
         )
         # fmt: on
@@ -664,8 +664,8 @@ def memtile_dma_c_1():
 
         # fmt: off
         for i, (column, channel, bd_id) in enumerate(channels):
-            ipu_insts.extend(shim_tensor_slice(M, N, tile_rows_C, tile_cols_C, offsets[i], column, S2MM, channel, bd_id, 2))
-            ipu_insts.extend(aiex.ipu.sync(channel=channel, column=column))
+            npu_insts.extend(shim_tensor_slice(M, N, tile_rows_C, tile_cols_C, offsets[i], column, S2MM, channel, bd_id, 2))
+            npu_insts.extend(aiex.npu.sync(channel=channel, column=column))
         # fmt: on
 
     mod_aie = mod_aie.finish()
@@ -673,9 +673,9 @@ def memtile_dma_c_1():
     compile_with_vectorization(mod_aie, mod_aievec, workdir)
 
     xclbin_path = make_xclbin(mod_aie, workdir)
-    with FileLock("/tmp/ipu.lock"):
+    with FileLock("/tmp/npu.lock"):
         xclbin = XCLBin(xclbin_path, "MLIR_AIE")
-        xclbin.load_ipu_instructions(ipu_insts)
+        xclbin.load_npu_instructions(npu_insts)
         views = xclbin.mmap_buffers([(M, N), (M, N), (M, N)], np.int32)
 
         wrap_A = np.asarray(views[0])
@@ -712,8 +712,8 @@ def test_tiled_nonsquare_tile_spatial_4x4_weight_stationary_v1(
 
     dest_channels = {}
 
-    @aie.device(AIEDevice.ipu)
-    def ipu():
+    @aie.device(AIEDevice.npu)
+    def npu():
         tiles = TileArray(cols, rows)
         for i, ((col, row), t) in enumerate(tiles[:, 2:]):
             b = aie.buffer(
@@ -784,28 +784,28 @@ def memtile_dma():
     kernel_json = emit_design_kernel_json(buffer_args=buffer_args)
     xclbin_path = make_xclbin(ctx.module, workdir, kernel_json=kernel_json)
 
-    with FileLock("/tmp/ipu.lock"):
+    with FileLock("/tmp/npu.lock"):
         xclbin = XCLBin(xclbin_path, "MLIR_AIE")
         views = xclbin.mmap_buffers([(K,)] * len(cols), np.int32)
 
-        ipu_insts = aiex.ipu.get_prolog()
+        npu_insts = aiex.npu.get_prolog()
         bd_id = 0
         for col in cols:
             dest_channel = dest_channels[col]
-            writebd_shimtile_insts = aiex.ipu.writebd_shimtile(
+            writebd_shimtile_insts = aiex.npu.writebd_shimtile(
                 col, bd_id, buffer_length=K
             )
-            ipu_insts.extend(
-                aiex.ipu._exec_write_bd_extend_shim_tile_opt(
+            npu_insts.extend(
+                aiex.npu._exec_write_bd_extend_shim_tile_opt(
                     writebd_shimtile_insts,
                     tensor_addr=xclbin._get_buffer_host_address(col),
                 )
             )
-            ipu_insts.extend(
-                aiex.ipu.shimtile_push_queue(S2MM, dest_channel, col, bd_id)
+            npu_insts.extend(
+                aiex.npu.shimtile_push_queue(S2MM, dest_channel, col, bd_id)
             )
-            ipu_insts.extend(aiex.ipu.sync(column=col))
-        xclbin.load_ipu_instructions(ipu_insts)
+            npu_insts.extend(aiex.npu.sync(column=col))
+        xclbin.load_npu_instructions(npu_insts)
 
         wraps = list(map(np.asarray, views))
 
@@ -826,8 +826,8 @@ def test_double_pump_single_buffer(ctx: MLIRContext, workdir: Path):
     source_channels = {}
     # dest_channels = {}
 
-    @aie.device(AIEDevice.ipu)
-    def ipu():
+    @aie.device(AIEDevice.npu)
+    def npu():
         tiles = TileArray(cols=[0], rows=[0, 1, 2])
         buffer = tiles[0, 2].buffer([(K,)], [T.i32()], "double_buffer")
 
@@ -969,27 +969,27 @@ def memtile_dma():
     kernel_json = emit_design_kernel_json(buffer_args=buffer_args)
     xclbin_path = make_xclbin(ctx.module, workdir, kernel_json=kernel_json)
 
-    with FileLock("/tmp/ipu.lock"):
+    with FileLock("/tmp/npu.lock"):
         xclbin = XCLBin(xclbin_path, "MLIR_AIE")
         views = xclbin.mmap_buffers([(K,)] * 2, np.int32)
 
-        ipu_insts = aiex.ipu.get_prolog()
+        npu_insts = aiex.npu.get_prolog()
         col = 0
         for bd_id, player in enumerate(["player_a", "player_b"]):
             source_channel = source_channels[player]
-            writebd_shimtile_insts = aiex.ipu.writebd_shimtile(
+            writebd_shimtile_insts = aiex.npu.writebd_shimtile(
                 col, bd_id, buffer_length=K
             )
-            ipu_insts.extend(
-                aiex.ipu._exec_write_bd_extend_shim_tile_opt(
+            npu_insts.extend(
+                aiex.npu._exec_write_bd_extend_shim_tile_opt(
                     writebd_shimtile_insts,
                     tensor_addr=xclbin._get_buffer_host_address(col),
                 )
             )
-            ipu_insts.extend(
-                aiex.ipu.shimtile_push_queue(MM2S, source_channel, col, bd_id)
+            npu_insts.extend(
+                aiex.npu.shimtile_push_queue(MM2S, source_channel, col, bd_id)
             )
-        xclbin.load_ipu_instructions(ipu_insts)
+        xclbin.load_npu_instructions(npu_insts)
 
         wraps = list(map(np.asarray, views))
 
diff --git a/test/ipu-xrt/e2e/test_tiled_nonsquare_tile_matrix_mult.py b/test/npu-xrt/e2e/test_tiled_nonsquare_tile_matrix_mult.py
similarity index 91%
rename from test/ipu-xrt/e2e/test_tiled_nonsquare_tile_matrix_mult.py
rename to test/npu-xrt/e2e/test_tiled_nonsquare_tile_matrix_mult.py
index fdce41d8ae..cc460b6122 100644
--- a/test/ipu-xrt/e2e/test_tiled_nonsquare_tile_matrix_mult.py
+++ b/test/npu-xrt/e2e/test_tiled_nonsquare_tile_matrix_mult.py
@@ -79,10 +79,10 @@ def test_tiled_nonsquare_tile_matrix_mult(ctx: MLIRContext, workdir: Path):
         M, N, n_tile_rows=tile_rows_C, n_tile_cols=tile_cols_C
     )
 
-    ipu_insts = aiex.ipu.get_prolog()
+    npu_insts = aiex.npu.get_prolog()
 
-    @aie.device(AIEDevice.ipu)
-    def ipu():
+    @aie.device(AIEDevice.npu)
+    def npu():
         tile_0_0 = aie.tile(0, 0)
         tile_0_1 = aie.tile(0, 1)
         tile_0_2 = aie.tile(0, 2)
@@ -130,8 +130,8 @@ def ipu():
             0 + d1_size_A * d1_stride_A,
         ]
         for i, bd_id in enumerate(range(2)):
-            ipu_insts.extend(
-                aiex.ipu.writebd_shimtile(
+            npu_insts.extend(
+                aiex.npu.writebd_shimtile(
                     col,
                     bd_id,
                     buffer_length=tile_m_A * tile_n_A,
@@ -139,16 +139,16 @@ def ipu():
                     ddr_id=ddr_id,
                 )
             )
-            ipu_insts.extend(
-                aiex.ipu.shimtile_push_queue(MM2S, channel_index, col, bd_id)
+            npu_insts.extend(
+                aiex.npu.shimtile_push_queue(MM2S, channel_index, col, bd_id)
             )
 
         # in B
         channel_index = 1
         ddr_id = 1
         for bd_id in range(bd_id + 1, bd_id + 1 + 4, 2):
-            ipu_insts.extend(
-                aiex.ipu.writebd_shimtile(
+            npu_insts.extend(
+                aiex.npu.writebd_shimtile(
                     col,
                     bd_id,
                     buffer_length=tile_m_B * tile_n_B,
@@ -160,13 +160,13 @@ def ipu():
                     d0_stride=d0_stride_B,
                 )
             )
-            ipu_insts.extend(
-                aiex.ipu.shimtile_push_queue(MM2S, channel_index, col, bd_id)
+            npu_insts.extend(
+                aiex.npu.shimtile_push_queue(MM2S, channel_index, col, bd_id)
             )
             bd_id += 1
             # B tiles are "tall" so need to offset by cols (i.e. d0 dim)
-            ipu_insts.extend(
-                aiex.ipu.writebd_shimtile(
+            npu_insts.extend(
+                aiex.npu.writebd_shimtile(
                     col,
                     bd_id,
                     buffer_length=tile_m_B * tile_n_B,
@@ -178,8 +178,8 @@ def ipu():
                     d0_stride=d0_stride_B,
                 )
             )
-            ipu_insts.extend(
-                aiex.ipu.shimtile_push_queue(MM2S, channel_index, col, bd_id)
+            npu_insts.extend(
+                aiex.npu.shimtile_push_queue(MM2S, channel_index, col, bd_id)
             )
 
         # out C
@@ -193,8 +193,8 @@ def ipu():
         ]
 
         for i, bd_id in enumerate(range(bd_id + 1, bd_id + 1 + 4)):
-            ipu_insts.extend(
-                aiex.ipu.writebd_shimtile(
+            npu_insts.extend(
+                aiex.npu.writebd_shimtile(
                     col,
                     bd_id,
                     buffer_length=tile_m_C * tile_n_C,
@@ -206,11 +206,11 @@ def ipu():
                     d0_stride=d0_stride_C,
                 )
             )
-            ipu_insts.extend(
-                aiex.ipu.shimtile_push_queue(S2MM, channel_index, col, bd_id)
+            npu_insts.extend(
+                aiex.npu.shimtile_push_queue(S2MM, channel_index, col, bd_id)
             )
-            ipu_insts.extend(
-                aiex.ipu.sync(
+            npu_insts.extend(
+                aiex.npu.sync(
                     channel=0, column=0, column_num=1, direction=0, row=0, row_num=1
                 )
             )
@@ -312,9 +312,9 @@ def core():
 
     compile_without_vectorization(ctx.module, workdir)
     xclbin_path = make_xclbin(ctx.module, workdir)
-    with FileLock("/tmp/ipu.lock"):
+    with FileLock("/tmp/npu.lock"):
         xclbin = XCLBin(xclbin_path, "MLIR_AIE")
-        xclbin.load_ipu_instructions(ipu_insts)
+        xclbin.load_npu_instructions(npu_insts)
         views = xclbin.mmap_buffers([(M, N), (M, N), (M, N)], np.int32)
 
         wrap_A = np.asarray(views[0])
@@ -378,10 +378,10 @@ def test_tiled_nonsquare_tile_matrix_mult_sugar(ctx: MLIRContext, workdir: Path)
         M, N, n_tile_rows=tile_rows_C, n_tile_cols=tile_cols_C
     )
 
-    ipu_insts = aiex.ipu.get_prolog()
+    npu_insts = aiex.npu.get_prolog()
 
-    @aie.device(AIEDevice.ipu)
-    def ipu():
+    @aie.device(AIEDevice.npu)
+    def npu():
         tile_0_0 = aie.tile(0, 0)
         tile_0_1 = aie.tile(0, 1)
         tile_0_2 = aie.tile(0, 2)
@@ -424,8 +424,8 @@ def ipu():
             0 + d1_size_A * d1_stride_A,
         ]
         for i, bd_id in enumerate(range(2)):
-            ipu_insts.extend(
-                aiex.ipu.writebd_shimtile(
+            npu_insts.extend(
+                aiex.npu.writebd_shimtile(
                     col,
                     bd_id,
                     buffer_length=tile_m_A * tile_n_A,
@@ -433,16 +433,16 @@ def ipu():
                     ddr_id=ddr_id,
                 )
             )
-            ipu_insts.extend(
-                aiex.ipu.shimtile_push_queue(MM2S, channel_index, col, bd_id)
+            npu_insts.extend(
+                aiex.npu.shimtile_push_queue(MM2S, channel_index, col, bd_id)
             )
 
         # in B
         channel_index = 1
         ddr_id = 1
         for bd_id in range(bd_id + 1, bd_id + 1 + 4, 2):
-            ipu_insts.extend(
-                aiex.ipu.writebd_shimtile(
+            npu_insts.extend(
+                aiex.npu.writebd_shimtile(
                     col,
                     bd_id,
                     buffer_length=tile_m_B * tile_n_B,
@@ -454,13 +454,13 @@ def ipu():
                     d0_stride=d0_stride_B,
                 )
             )
-            ipu_insts.extend(
-                aiex.ipu.shimtile_push_queue(MM2S, channel_index, col, bd_id)
+            npu_insts.extend(
+                aiex.npu.shimtile_push_queue(MM2S, channel_index, col, bd_id)
             )
             bd_id += 1
             # B tiles are "tall" so need to offset by cols (i.e. d0 dim)
-            ipu_insts.extend(
-                aiex.ipu.writebd_shimtile(
+            npu_insts.extend(
+                aiex.npu.writebd_shimtile(
                     col,
                     bd_id,
                     buffer_length=tile_m_B * tile_n_B,
@@ -472,8 +472,8 @@ def ipu():
                     d0_stride=d0_stride_B,
                 )
             )
-            ipu_insts.extend(
-                aiex.ipu.shimtile_push_queue(MM2S, channel_index, col, bd_id)
+            npu_insts.extend(
+                aiex.npu.shimtile_push_queue(MM2S, channel_index, col, bd_id)
             )
 
         # out C
@@ -487,8 +487,8 @@ def ipu():
         ]
 
         for i, bd_id in enumerate(range(bd_id + 1, bd_id + 1 + 4)):
-            ipu_insts.extend(
-                aiex.ipu.writebd_shimtile(
+            npu_insts.extend(
+                aiex.npu.writebd_shimtile(
                     col,
                     bd_id,
                     buffer_length=tile_m_C * tile_n_C,
@@ -500,11 +500,11 @@ def ipu():
                     d0_stride=d0_stride_C,
                 )
             )
-            ipu_insts.extend(
-                aiex.ipu.shimtile_push_queue(S2MM, channel_index, col, bd_id)
+            npu_insts.extend(
+                aiex.npu.shimtile_push_queue(S2MM, channel_index, col, bd_id)
             )
-            ipu_insts.extend(
-                aiex.ipu.sync(
+            npu_insts.extend(
+                aiex.npu.sync(
                     channel=0, column=0, column_num=1, direction=0, row=0, row_num=1
                 )
             )
@@ -570,9 +570,9 @@ def core():
 
     compile_without_vectorization(ctx.module, workdir)
     xclbin_path = make_xclbin(ctx.module, workdir)
-    with FileLock("/tmp/ipu.lock"):
+    with FileLock("/tmp/npu.lock"):
         xclbin = XCLBin(xclbin_path, "MLIR_AIE")
-        xclbin.load_ipu_instructions(ipu_insts)
+        xclbin.load_npu_instructions(npu_insts)
         views = xclbin.mmap_buffers([(M, N), (M, N), (M, N)], np.int32)
 
         wrap_A = np.asarray(views[0])
diff --git a/test/ipu-xrt/e2e/test_tiled_nonsquare_tile_matrix_mult_vectorized.py b/test/npu-xrt/e2e/test_tiled_nonsquare_tile_matrix_mult_vectorized.py
similarity index 92%
rename from test/ipu-xrt/e2e/test_tiled_nonsquare_tile_matrix_mult_vectorized.py
rename to test/npu-xrt/e2e/test_tiled_nonsquare_tile_matrix_mult_vectorized.py
index 036400fb1a..3d5b85c45b 100644
--- a/test/ipu-xrt/e2e/test_tiled_nonsquare_tile_matrix_mult_vectorized.py
+++ b/test/npu-xrt/e2e/test_tiled_nonsquare_tile_matrix_mult_vectorized.py
@@ -101,12 +101,12 @@ def test_tiled_nonsquare_tile_matrix_mult_vectorized(ctx: MLIRContext, workdir:
         M, N, n_tile_rows=tile_rows_C, n_tile_cols=tile_cols_C
     )
 
-    ipu_insts = aiex.ipu.get_prolog()
+    npu_insts = aiex.npu.get_prolog()
 
     mod_aie = ExplicitlyManagedModule()
 
-    @aie.device(AIEDevice.ipu)
-    def ipu():
+    @aie.device(AIEDevice.npu)
+    def npu():
         matmul_i32_i32.emit(decl=True)
         tile_0_0 = aie.tile(0, 0)
         tile_0_1 = aie.tile(0, 1)
@@ -155,8 +155,8 @@ def ipu():
             0 + d1_size_A * d1_stride_A,
         ]
         for i, bd_id in enumerate(range(2)):
-            ipu_insts.extend(
-                aiex.ipu.writebd_shimtile(
+            npu_insts.extend(
+                aiex.npu.writebd_shimtile(
                     col,
                     bd_id,
                     buffer_length=tile_m_A * tile_n_A,
@@ -164,16 +164,16 @@ def ipu():
                     ddr_id=ddr_id,
                 )
             )
-            ipu_insts.extend(
-                aiex.ipu.shimtile_push_queue(MM2S, channel_index, col, bd_id)
+            npu_insts.extend(
+                aiex.npu.shimtile_push_queue(MM2S, channel_index, col, bd_id)
             )
 
         # in B
         channel_index = 1
         ddr_id = 1
         for bd_id in range(bd_id + 1, bd_id + 1 + 4, 2):
-            ipu_insts.extend(
-                aiex.ipu.writebd_shimtile(
+            npu_insts.extend(
+                aiex.npu.writebd_shimtile(
                     col,
                     bd_id,
                     buffer_length=tile_m_B * tile_n_B,
@@ -185,13 +185,13 @@ def ipu():
                     d0_stride=d0_stride_B,
                 )
             )
-            ipu_insts.extend(
-                aiex.ipu.shimtile_push_queue(MM2S, channel_index, col, bd_id)
+            npu_insts.extend(
+                aiex.npu.shimtile_push_queue(MM2S, channel_index, col, bd_id)
             )
             bd_id += 1
             # B tiles are "tall" so need to offset by cols (i.e. d0 dim)
-            ipu_insts.extend(
-                aiex.ipu.writebd_shimtile(
+            npu_insts.extend(
+                aiex.npu.writebd_shimtile(
                     col,
                     bd_id,
                     buffer_length=tile_m_B * tile_n_B,
@@ -203,8 +203,8 @@ def ipu():
                     d0_stride=d0_stride_B,
                 )
             )
-            ipu_insts.extend(
-                aiex.ipu.shimtile_push_queue(MM2S, channel_index, col, bd_id)
+            npu_insts.extend(
+                aiex.npu.shimtile_push_queue(MM2S, channel_index, col, bd_id)
             )
 
         # out C
@@ -218,8 +218,8 @@ def ipu():
         ]
 
         for i, bd_id in enumerate(range(bd_id + 1, bd_id + 1 + 4)):
-            ipu_insts.extend(
-                aiex.ipu.writebd_shimtile(
+            npu_insts.extend(
+                aiex.npu.writebd_shimtile(
                     col,
                     bd_id,
                     buffer_length=tile_m_C * tile_n_C,
@@ -231,11 +231,11 @@ def ipu():
                     d0_stride=d0_stride_C,
                 )
             )
-            ipu_insts.extend(
-                aiex.ipu.shimtile_push_queue(S2MM, channel_index, col, bd_id)
+            npu_insts.extend(
+                aiex.npu.shimtile_push_queue(S2MM, channel_index, col, bd_id)
             )
-            ipu_insts.extend(
-                aiex.ipu.sync(
+            npu_insts.extend(
+                aiex.npu.sync(
                     channel=0,
                     column=0,
                     column_num=1,
@@ -408,9 +408,9 @@ def super_vectorize(target: any_op_t()):
 
     compile_with_vectorization(mod_aie, mod_aievec, workdir)
     xclbin_path = make_xclbin(mod_aie, workdir)
-    with FileLock("/tmp/ipu.lock"):
+    with FileLock("/tmp/npu.lock"):
         xclbin = XCLBin(xclbin_path, "MLIR_AIE")
-        xclbin.load_ipu_instructions(ipu_insts)
+        xclbin.load_npu_instructions(npu_insts)
         views = xclbin.mmap_buffers([(M, N), (M, N), (M, N)], np.int32)
 
         wrap_A = np.asarray(views[0])
@@ -466,12 +466,12 @@ def test_tiled_nonsquare_tile_matrix_mult_vectorized_sugar(
         M, N, n_tile_rows=tile_rows_C, n_tile_cols=tile_cols_C
     )
 
-    ipu_insts = aiex.ipu.get_prolog()
+    npu_insts = aiex.npu.get_prolog()
 
     mod_aie = ExplicitlyManagedModule()
 
-    @aie.device(AIEDevice.ipu)
-    def ipu():
+    @aie.device(AIEDevice.npu)
+    def npu():
         matmul_i32_i32.emit(decl=True)
         tile_0_0 = aie.tile(0, 0)
         tile_0_1 = aie.tile(0, 1)
@@ -515,8 +515,8 @@ def ipu():
             0 + d1_size_A * d1_stride_A,
         ]
         for i, bd_id in enumerate(range(2)):
-            ipu_insts.extend(
-                aiex.ipu.writebd_shimtile(
+            npu_insts.extend(
+                aiex.npu.writebd_shimtile(
                     col,
                     bd_id,
                     buffer_length=tile_m_A * tile_n_A,
@@ -524,8 +524,8 @@ def ipu():
                     ddr_id=ddr_id,
                 )
             )
-            ipu_insts.extend(
-                aiex.ipu.shimtile_push_queue(MM2S, channel_index, col, bd_id)
+            npu_insts.extend(
+                aiex.npu.shimtile_push_queue(MM2S, channel_index, col, bd_id)
             )
 
         # in B
@@ -533,8 +533,8 @@ def ipu():
         col = 0
         ddr_id = 1
         for bd_id in range(bd_id + 1, bd_id + 1 + 4, 2):
-            ipu_insts.extend(
-                aiex.ipu.writebd_shimtile(
+            npu_insts.extend(
+                aiex.npu.writebd_shimtile(
                     col,
                     bd_id,
                     buffer_length=tile_m_B * tile_n_B,
@@ -546,13 +546,13 @@ def ipu():
                     d0_stride=d0_stride_B,
                 )
             )
-            ipu_insts.extend(
-                aiex.ipu.shimtile_push_queue(MM2S, channel_index, col, bd_id)
+            npu_insts.extend(
+                aiex.npu.shimtile_push_queue(MM2S, channel_index, col, bd_id)
             )
             bd_id += 1
             # B tiles are "tall" so need to offset by cols (i.e. d0 dim)
-            ipu_insts.extend(
-                aiex.ipu.writebd_shimtile(
+            npu_insts.extend(
+                aiex.npu.writebd_shimtile(
                     col,
                     bd_id,
                     buffer_length=tile_m_B * tile_n_B,
@@ -564,8 +564,8 @@ def ipu():
                     d0_stride=d0_stride_B,
                 )
             )
-            ipu_insts.extend(
-                aiex.ipu.shimtile_push_queue(MM2S, channel_index, col, bd_id)
+            npu_insts.extend(
+                aiex.npu.shimtile_push_queue(MM2S, channel_index, col, bd_id)
             )
 
         # out C
@@ -580,8 +580,8 @@ def ipu():
         ]
 
         for i, bd_id in enumerate(range(bd_id + 1, bd_id + 1 + 4)):
-            ipu_insts.extend(
-                aiex.ipu.writebd_shimtile(
+            npu_insts.extend(
+                aiex.npu.writebd_shimtile(
                     col,
                     bd_id,
                     buffer_length=tile_m_C * tile_n_C,
@@ -593,11 +593,11 @@ def ipu():
                     d0_stride=d0_stride_C,
                 )
             )
-            ipu_insts.extend(
-                aiex.ipu.shimtile_push_queue(S2MM, channel_index, col, bd_id)
+            npu_insts.extend(
+                aiex.npu.shimtile_push_queue(S2MM, channel_index, col, bd_id)
             )
-            ipu_insts.extend(
-                aiex.ipu.sync(
+            npu_insts.extend(
+                aiex.npu.sync(
                     channel=0,
                     column=0,
                     column_num=1,
@@ -736,9 +736,9 @@ def super_vectorize(target: any_op_t()):
 
     compile_with_vectorization(mod_aie, mod_aievec, workdir)
     xclbin_path = make_xclbin(mod_aie, workdir)
-    with FileLock("/tmp/ipu.lock"):
+    with FileLock("/tmp/npu.lock"):
         xclbin = XCLBin(xclbin_path, "MLIR_AIE")
-        xclbin.load_ipu_instructions(ipu_insts)
+        xclbin.load_npu_instructions(npu_insts)
         views = xclbin.mmap_buffers([(M, N), (M, N), (M, N)], np.int32)
 
         wrap_A = np.asarray(views[0])
@@ -828,12 +828,12 @@ def test_tiled_nonsquare_tile_matrix_mult_vectorized_sugar_already_vectorized(
         M, N, n_tile_rows=tile_rows_C, n_tile_cols=tile_cols_C
     )
 
-    ipu_insts = aiex.ipu.get_prolog()
+    npu_insts = aiex.npu.get_prolog()
 
     mod_aie = ExplicitlyManagedModule()
 
-    @aie.device(AIEDevice.ipu)
-    def ipu():
+    @aie.device(AIEDevice.npu)
+    def npu():
         matmul_i32_i32_already_vectorized.emit(decl=True)
         tile_0_0 = aie.tile(0, 0)
         tile_0_1 = aie.tile(0, 1)
@@ -877,8 +877,8 @@ def ipu():
             0 + d1_size_A * d1_stride_A,
         ]
         for i, bd_id in enumerate(range(2)):
-            ipu_insts.extend(
-                aiex.ipu.writebd_shimtile(
+            npu_insts.extend(
+                aiex.npu.writebd_shimtile(
                     col,
                     bd_id,
                     buffer_length=tile_m_A * tile_n_A,
@@ -886,15 +886,15 @@ def ipu():
                     ddr_id=ddr_id,
                 )
             )
-            ipu_insts.extend(aiex.ipu.write32(MM2S, channel_index, col, bd_id))
+            npu_insts.extend(aiex.npu.write32(MM2S, channel_index, col, bd_id))
 
         # in B
         channel_index = 1
         col = 0
         ddr_id = 1
         for bd_id in range(bd_id + 1, bd_id + 1 + 4, 2):
-            ipu_insts.extend(
-                aiex.ipu.writebd_shimtile(
+            npu_insts.extend(
+                aiex.npu.writebd_shimtile(
                     col,
                     bd_id,
                     buffer_length=tile_m_B * tile_n_B,
@@ -906,11 +906,11 @@ def ipu():
                     d0_stride=d0_stride_B,
                 )
             )
-            ipu_insts.extend(aiex.ipu.write32(MM2S, channel_index, col, bd_id))
+            npu_insts.extend(aiex.npu.write32(MM2S, channel_index, col, bd_id))
             bd_id += 1
             # B tiles are "tall" so need to offset by cols (i.e. d0 dim)
-            ipu_insts.extend(
-                aiex.ipu.writebd_shimtile(
+            npu_insts.extend(
+                aiex.npu.writebd_shimtile(
                     col,
                     bd_id,
                     buffer_length=tile_m_B * tile_n_B,
@@ -922,7 +922,7 @@ def ipu():
                     d0_stride=d0_stride_B,
                 )
             )
-            ipu_insts.extend(aiex.ipu.write32(MM2S, channel_index, col, bd_id))
+            npu_insts.extend(aiex.npu.write32(MM2S, channel_index, col, bd_id))
 
         # out C
         channel_index = 0
@@ -936,8 +936,8 @@ def ipu():
         ]
 
         for i, bd_id in enumerate(range(bd_id + 1, bd_id + 1 + 4)):
-            ipu_insts.extend(
-                aiex.ipu.writebd_shimtile(
+            npu_insts.extend(
+                aiex.npu.writebd_shimtile(
                     col,
                     bd_id,
                     buffer_length=tile_m_C * tile_n_C,
@@ -949,9 +949,9 @@ def ipu():
                     d0_stride=d0_stride_C,
                 )
             )
-            ipu_insts.extend(aiex.ipu.write32(S2MM, channel_index, col, bd_id))
-            ipu_insts.extend(
-                aiex.ipu.sync(
+            npu_insts.extend(aiex.npu.write32(S2MM, channel_index, col, bd_id))
+            npu_insts.extend(
+                aiex.npu.sync(
                     channel=0,
                     column=0,
                     column_num=1,
@@ -1030,9 +1030,9 @@ def core():
 
     compile_with_vectorization(mod_aie, mod_aievec, workdir)
     xclbin_path = make_xclbin(mod_aie, workdir)
-    with FileLock("/tmp/ipu.lock"):
+    with FileLock("/tmp/npu.lock"):
         xclbin = XCLBin(xclbin_path, "MLIR_AIE")
-        xclbin.load_ipu_instructions(ipu_insts)
+        xclbin.load_npu_instructions(npu_insts)
         wrap_A, wrap_B, wrap_C = map(
             np.asarray, xclbin.mmap_buffers([(M, N), (M, N), (M, N)], np.int32)
         )
diff --git a/test/ipu-xrt/e2e/test_tiled_vec_add.py b/test/npu-xrt/e2e/test_tiled_vec_add.py
similarity index 90%
rename from test/ipu-xrt/e2e/test_tiled_vec_add.py
rename to test/npu-xrt/e2e/test_tiled_vec_add.py
index ab0cd13769..ff8c1e77d1 100644
--- a/test/ipu-xrt/e2e/test_tiled_vec_add.py
+++ b/test/npu-xrt/e2e/test_tiled_vec_add.py
@@ -48,10 +48,10 @@ def test_vec_add(ctx: MLIRContext, workdir: Path):
     tiles = 4
     k = K // tiles
 
-    ipu_insts = aiex.ipu.get_prolog()
+    npu_insts = aiex.npu.get_prolog()
 
-    @aie.device(AIEDevice.ipu)
-    def ipu():
+    @aie.device(AIEDevice.npu)
+    def npu():
         tile_0_0 = aie.tile(0, 0)
         tile_0_1 = aie.tile(0, 1)
         tile_0_2 = aie.tile(0, 2)
@@ -95,8 +95,8 @@ def ipu():
         ddr_id = 0
         offsets = list(range(0, K, k))
         for i, bd_id in enumerate(range(tiles)):
-            ipu_insts.extend(
-                aiex.ipu.writebd_shimtile(
+            npu_insts.extend(
+                aiex.npu.writebd_shimtile(
                     col,
                     bd_id,
                     buffer_length=k,
@@ -104,16 +104,16 @@ def ipu():
                     ddr_id=ddr_id,
                 )
             )
-            ipu_insts.extend(
-                aiex.ipu.shimtile_push_queue(MM2S, channel_index, col, bd_id)
+            npu_insts.extend(
+                aiex.npu.shimtile_push_queue(MM2S, channel_index, col, bd_id)
             )
 
         # in B
         channel_index = 1
         ddr_id = 1
         for i, bd_id in enumerate(range(bd_id + 1, bd_id + 1 + tiles)):
-            ipu_insts.extend(
-                aiex.ipu.writebd_shimtile(
+            npu_insts.extend(
+                aiex.npu.writebd_shimtile(
                     col,
                     bd_id,
                     buffer_length=k,
@@ -121,16 +121,16 @@ def ipu():
                     ddr_id=ddr_id,
                 )
             )
-            ipu_insts.extend(
-                aiex.ipu.shimtile_push_queue(MM2S, channel_index, col, bd_id)
+            npu_insts.extend(
+                aiex.npu.shimtile_push_queue(MM2S, channel_index, col, bd_id)
             )
 
         # out C
         channel_index = 0
         ddr_id = 2
         for i, bd_id in enumerate(range(bd_id + 1, bd_id + 1 + tiles)):
-            ipu_insts.extend(
-                aiex.ipu.writebd_shimtile(
+            npu_insts.extend(
+                aiex.npu.writebd_shimtile(
                     col,
                     bd_id,
                     buffer_length=k,
@@ -138,11 +138,11 @@ def ipu():
                     ddr_id=ddr_id,
                 )
             )
-            ipu_insts.extend(
-                aiex.ipu.shimtile_push_queue(S2MM, channel_index, col, bd_id)
+            npu_insts.extend(
+                aiex.npu.shimtile_push_queue(S2MM, channel_index, col, bd_id)
             )
-            ipu_insts.extend(
-                aiex.ipu.sync(
+            npu_insts.extend(
+                aiex.npu.sync(
                     channel=0,
                     column=0,
                     column_num=1,
@@ -241,9 +241,9 @@ def core():
 
     compile_without_vectorization(ctx.module, workdir)
     xclbin_path = make_xclbin(ctx.module, workdir)
-    with FileLock("/tmp/ipu.lock"):
+    with FileLock("/tmp/npu.lock"):
         xclbin = XCLBin(xclbin_path, "MLIR_AIE")
-        xclbin.load_ipu_instructions(ipu_insts)
+        xclbin.load_npu_instructions(npu_insts)
         views = xclbin.mmap_buffers([(K,), (K,), (K,)], np.int32)
 
         wrap_A = np.asarray(views[0])
@@ -278,10 +278,10 @@ def test_vec_add_sugar(ctx: MLIRContext, workdir: Path):
     tiles = 4
     k = K // tiles
 
-    ipu_insts = aiex.ipu.get_prolog()
+    npu_insts = aiex.npu.get_prolog()
 
-    @aie.device(AIEDevice.ipu)
-    def ipu():
+    @aie.device(AIEDevice.npu)
+    def npu():
         tile_0_0 = aie.tile(0, 0)
         tile_0_1 = aie.tile(0, 1)
         tile_0_2 = aie.tile(0, 2)
@@ -316,8 +316,8 @@ def ipu():
         ddr_id = 0
         offsets = list(range(0, K, k))
         for i, bd_id in enumerate(range(tiles)):
-            ipu_insts.extend(
-                aiex.ipu.writebd_shimtile(
+            npu_insts.extend(
+                aiex.npu.writebd_shimtile(
                     col,
                     bd_id,
                     buffer_length=k,
@@ -325,8 +325,8 @@ def ipu():
                     ddr_id=ddr_id,
                 )
             )
-            ipu_insts.extend(
-                aiex.ipu.shimtile_push_queue(MM2S, channel_index, col, bd_id)
+            npu_insts.extend(
+                aiex.npu.shimtile_push_queue(MM2S, channel_index, col, bd_id)
             )
 
         # in B
@@ -334,8 +334,8 @@ def ipu():
         col = 0
         ddr_id = 1
         for i, bd_id in enumerate(range(bd_id + 1, bd_id + 1 + tiles)):
-            ipu_insts.extend(
-                aiex.ipu.writebd_shimtile(
+            npu_insts.extend(
+                aiex.npu.writebd_shimtile(
                     col,
                     bd_id,
                     buffer_length=k,
@@ -343,8 +343,8 @@ def ipu():
                     ddr_id=ddr_id,
                 )
             )
-            ipu_insts.extend(
-                aiex.ipu.shimtile_push_queue(MM2S, channel_index, col, bd_id)
+            npu_insts.extend(
+                aiex.npu.shimtile_push_queue(MM2S, channel_index, col, bd_id)
             )
 
         # out C
@@ -352,8 +352,8 @@ def ipu():
         col = 0
         ddr_id = 2
         for i, bd_id in enumerate(range(bd_id + 1, bd_id + 1 + tiles)):
-            ipu_insts.extend(
-                aiex.ipu.writebd_shimtile(
+            npu_insts.extend(
+                aiex.npu.writebd_shimtile(
                     col,
                     bd_id,
                     buffer_length=k,
@@ -361,11 +361,11 @@ def ipu():
                     ddr_id=ddr_id,
                 )
             )
-            ipu_insts.extend(
-                aiex.ipu.shimtile_push_queue(S2MM, channel_index, col, bd_id)
+            npu_insts.extend(
+                aiex.npu.shimtile_push_queue(S2MM, channel_index, col, bd_id)
             )
-            ipu_insts.extend(
-                aiex.ipu.sync(
+            npu_insts.extend(
+                aiex.npu.sync(
                     channel=0,
                     column=0,
                     column_num=1,
@@ -422,9 +422,9 @@ def core():
 
     compile_without_vectorization(ctx.module, workdir)
     xclbin_path = make_xclbin(ctx.module, workdir)
-    with FileLock("/tmp/ipu.lock"):
+    with FileLock("/tmp/npu.lock"):
         xclbin = XCLBin(xclbin_path, "MLIR_AIE")
-        xclbin.load_ipu_instructions(ipu_insts)
+        xclbin.load_npu_instructions(npu_insts)
         views = xclbin.mmap_buffers([(K,), (K,), (K,)], np.int32)
 
         wrap_A = np.asarray(views[0])
diff --git a/test/ipu-xrt/e2e/test_tiled_vec_add_vectorized.py b/test/npu-xrt/e2e/test_tiled_vec_add_vectorized.py
similarity index 92%
rename from test/ipu-xrt/e2e/test_tiled_vec_add_vectorized.py
rename to test/npu-xrt/e2e/test_tiled_vec_add_vectorized.py
index d0990390ae..bcc8beb2be 100644
--- a/test/ipu-xrt/e2e/test_tiled_vec_add_vectorized.py
+++ b/test/npu-xrt/e2e/test_tiled_vec_add_vectorized.py
@@ -66,11 +66,11 @@ def vec_add_i32_i32(
 
 
 def test_vec_add_vectorized(ctx: MLIRContext, workdir: Path):
-    ipu_insts = aiex.ipu.get_prolog()
+    npu_insts = aiex.npu.get_prolog()
     mod_aie = ExplicitlyManagedModule()
 
-    @aie.device(AIEDevice.ipu)
-    def ipu():
+    @aie.device(AIEDevice.npu)
+    def npu():
         vec_add_i32_i32.emit(decl=True)
         tile_0_0 = aie.tile(0, 0)
         tile_0_1 = aie.tile(0, 1)
@@ -115,8 +115,8 @@ def ipu():
         ddr_id = 0
         offsets = list(range(0, K, k))
         for i, bd_id in enumerate(range(tiles)):
-            ipu_insts.extend(
-                aiex.ipu.writebd_shimtile(
+            npu_insts.extend(
+                aiex.npu.writebd_shimtile(
                     col,
                     bd_id,
                     buffer_length=k,
@@ -124,8 +124,8 @@ def ipu():
                     ddr_id=ddr_id,
                 )
             )
-            ipu_insts.extend(
-                aiex.ipu.shimtile_push_queue(MM2S, channel_index, col, bd_id)
+            npu_insts.extend(
+                aiex.npu.shimtile_push_queue(MM2S, channel_index, col, bd_id)
             )
 
         # in B
@@ -133,8 +133,8 @@ def ipu():
         col = 0
         ddr_id = 1
         for i, bd_id in enumerate(range(bd_id + 1, bd_id + 1 + tiles)):
-            ipu_insts.extend(
-                aiex.ipu.writebd_shimtile(
+            npu_insts.extend(
+                aiex.npu.writebd_shimtile(
                     col,
                     bd_id,
                     buffer_length=k,
@@ -142,8 +142,8 @@ def ipu():
                     ddr_id=ddr_id,
                 )
             )
-            ipu_insts.extend(
-                aiex.ipu.shimtile_push_queue(MM2S, channel_index, col, bd_id)
+            npu_insts.extend(
+                aiex.npu.shimtile_push_queue(MM2S, channel_index, col, bd_id)
             )
 
         # out C
@@ -151,8 +151,8 @@ def ipu():
         col = 0
         ddr_id = 2
         for i, bd_id in enumerate(range(bd_id + 1, bd_id + 1 + tiles)):
-            ipu_insts.extend(
-                aiex.ipu.writebd_shimtile(
+            npu_insts.extend(
+                aiex.npu.writebd_shimtile(
                     col,
                     bd_id,
                     buffer_length=k,
@@ -160,11 +160,11 @@ def ipu():
                     ddr_id=ddr_id,
                 )
             )
-            ipu_insts.extend(
-                aiex.ipu.shimtile_push_queue(S2MM, channel_index, col, bd_id)
+            npu_insts.extend(
+                aiex.npu.shimtile_push_queue(S2MM, channel_index, col, bd_id)
             )
-            ipu_insts.extend(
-                aiex.ipu.sync(
+            npu_insts.extend(
+                aiex.npu.sync(
                     channel=0,
                     column=0,
                     column_num=1,
@@ -328,9 +328,9 @@ def super_vectorize(target: any_op_t()):
     )
     compile_with_vectorization(mod_aie, mod_aievec, workdir)
     xclbin_path = make_xclbin(mod_aie, workdir)
-    with FileLock("/tmp/ipu.lock"):
+    with FileLock("/tmp/npu.lock"):
         xclbin = XCLBin(xclbin_path, "MLIR_AIE")
-        xclbin.load_ipu_instructions(ipu_insts)
+        xclbin.load_npu_instructions(npu_insts)
         views = xclbin.mmap_buffers([(K,), (K,), (K,)], np.int32)
 
         wrap_A = np.asarray(views[0])
@@ -359,11 +359,11 @@ def super_vectorize(target: any_op_t()):
 
 
 def test_vec_add_vectorized_sugar(ctx: MLIRContext, workdir: Path):
-    ipu_insts = aiex.ipu.get_prolog()
+    npu_insts = aiex.npu.get_prolog()
     mod_aie = ExplicitlyManagedModule()
 
-    @aie.device(AIEDevice.ipu)
-    def ipu():
+    @aie.device(AIEDevice.npu)
+    def npu():
         vec_add_i32_i32.emit(decl=True)
         tile_0_0 = aie.tile(0, 0)
         tile_0_1 = aie.tile(0, 1)
@@ -399,8 +399,8 @@ def ipu():
         ddr_id = 0
         offsets = list(range(0, K, k))
         for i, bd_id in enumerate(range(tiles)):
-            ipu_insts.extend(
-                aiex.ipu.writebd_shimtile(
+            npu_insts.extend(
+                aiex.npu.writebd_shimtile(
                     col,
                     bd_id,
                     buffer_length=k,
@@ -408,8 +408,8 @@ def ipu():
                     ddr_id=ddr_id,
                 )
             )
-            ipu_insts.extend(
-                aiex.ipu.shimtile_push_queue(MM2S, channel_index, col, bd_id)
+            npu_insts.extend(
+                aiex.npu.shimtile_push_queue(MM2S, channel_index, col, bd_id)
             )
 
         # in B
@@ -417,8 +417,8 @@ def ipu():
         col = 0
         ddr_id = 1
         for i, bd_id in enumerate(range(bd_id + 1, bd_id + 1 + tiles)):
-            ipu_insts.extend(
-                aiex.ipu.writebd_shimtile(
+            npu_insts.extend(
+                aiex.npu.writebd_shimtile(
                     col,
                     bd_id,
                     buffer_length=k,
@@ -426,8 +426,8 @@ def ipu():
                     ddr_id=ddr_id,
                 )
             )
-            ipu_insts.extend(
-                aiex.ipu.shimtile_push_queue(MM2S, channel_index, col, bd_id)
+            npu_insts.extend(
+                aiex.npu.shimtile_push_queue(MM2S, channel_index, col, bd_id)
             )
 
         # out C
@@ -435,8 +435,8 @@ def ipu():
         col = 0
         ddr_id = 2
         for i, bd_id in enumerate(range(bd_id + 1, bd_id + 1 + tiles)):
-            ipu_insts.extend(
-                aiex.ipu.writebd_shimtile(
+            npu_insts.extend(
+                aiex.npu.writebd_shimtile(
                     col,
                     bd_id,
                     buffer_length=k,
@@ -444,11 +444,11 @@ def ipu():
                     ddr_id=ddr_id,
                 )
             )
-            ipu_insts.extend(
-                aiex.ipu.shimtile_push_queue(S2MM, channel_index, col, bd_id)
+            npu_insts.extend(
+                aiex.npu.shimtile_push_queue(S2MM, channel_index, col, bd_id)
             )
-            ipu_insts.extend(
-                aiex.ipu.sync(
+            npu_insts.extend(
+                aiex.npu.sync(
                     channel=0,
                     column=0,
                     column_num=1,
@@ -574,9 +574,9 @@ def super_vectorize(target: any_op_t()):
 
     compile_with_vectorization(mod_aie, mod_aievec, workdir)
     xclbin_path = make_xclbin(mod_aie, workdir)
-    with FileLock("/tmp/ipu.lock"):
+    with FileLock("/tmp/npu.lock"):
         xclbin = XCLBin(xclbin_path, "MLIR_AIE")
-        xclbin.load_ipu_instructions(ipu_insts)
+        xclbin.load_npu_instructions(npu_insts)
         views = xclbin.mmap_buffers([(K,), (K,), (K,)], np.int32)
 
         wrap_A = np.asarray(views[0])
diff --git a/test/ipu-xrt/e2e/test_vec_dot.py b/test/npu-xrt/e2e/test_vec_dot.py
similarity index 90%
rename from test/ipu-xrt/e2e/test_vec_dot.py
rename to test/npu-xrt/e2e/test_vec_dot.py
index 7a2012a1d0..f111316692 100644
--- a/test/ipu-xrt/e2e/test_vec_dot.py
+++ b/test/npu-xrt/e2e/test_vec_dot.py
@@ -52,10 +52,10 @@ def test_vec_dot(ctx: MLIRContext, workdir: Path):
     tiles = 4
     k = K // tiles
 
-    ipu_insts = aiex.ipu.get_prolog()
+    npu_insts = aiex.npu.get_prolog()
 
-    @aie.device(AIEDevice.ipu)
-    def ipu():
+    @aie.device(AIEDevice.npu)
+    def npu():
         tile_0_0 = aie.tile(0, 0)
         tile_0_1 = aie.tile(0, 1)
         tile_0_2 = aie.tile(0, 2)
@@ -99,8 +99,8 @@ def ipu():
         ddr_id = 0
         offsets = list(range(0, K, k))
         for i, bd_id in enumerate(range(tiles)):
-            ipu_insts.extend(
-                aiex.ipu.writebd_shimtile(
+            npu_insts.extend(
+                aiex.npu.writebd_shimtile(
                     col,
                     bd_id,
                     buffer_length=k,
@@ -108,8 +108,8 @@ def ipu():
                     ddr_id=ddr_id,
                 )
             )
-            ipu_insts.extend(
-                aiex.ipu.shimtile_push_queue(MM2S, channel_index, col, bd_id)
+            npu_insts.extend(
+                aiex.npu.shimtile_push_queue(MM2S, channel_index, col, bd_id)
             )
 
         # in B
@@ -117,8 +117,8 @@ def ipu():
         col = 0
         ddr_id = 1
         for i, bd_id in enumerate(range(bd_id + 1, bd_id + 1 + tiles)):
-            ipu_insts.extend(
-                aiex.ipu.writebd_shimtile(
+            npu_insts.extend(
+                aiex.npu.writebd_shimtile(
                     col,
                     bd_id,
                     buffer_length=k,
@@ -126,8 +126,8 @@ def ipu():
                     ddr_id=ddr_id,
                 )
             )
-            ipu_insts.extend(
-                aiex.ipu.shimtile_push_queue(MM2S, channel_index, col, bd_id)
+            npu_insts.extend(
+                aiex.npu.shimtile_push_queue(MM2S, channel_index, col, bd_id)
             )
 
         # out C
@@ -135,8 +135,8 @@ def ipu():
         col = 0
         ddr_id = 2
         for i, bd_id in enumerate(range(bd_id + 1, bd_id + 1 + tiles)):
-            ipu_insts.extend(
-                aiex.ipu.writebd_shimtile(
+            npu_insts.extend(
+                aiex.npu.writebd_shimtile(
                     col,
                     bd_id,
                     buffer_length=1,
@@ -144,11 +144,11 @@ def ipu():
                     ddr_id=ddr_id,
                 )
             )
-            ipu_insts.extend(
-                aiex.ipu.shimtile_push_queue(S2MM, channel_index, col, bd_id)
+            npu_insts.extend(
+                aiex.npu.shimtile_push_queue(S2MM, channel_index, col, bd_id)
             )
-            ipu_insts.extend(
-                aiex.ipu.sync(
+            npu_insts.extend(
+                aiex.npu.sync(
                     channel=0,
                     column=0,
                     column_num=1,
@@ -252,9 +252,9 @@ def core():
 
     compile_without_vectorization(ctx.module, workdir)
     xclbin_path = make_xclbin(ctx.module, workdir)
-    with FileLock("/tmp/ipu.lock"):
+    with FileLock("/tmp/npu.lock"):
         xclbin = XCLBin(xclbin_path, "MLIR_AIE")
-        xclbin.load_ipu_instructions(ipu_insts)
+        xclbin.load_npu_instructions(npu_insts)
         views = xclbin.mmap_buffers([(K,), (K,), (tiles,)], np.int32)
 
         wrap_A = np.asarray(views[0])
@@ -289,10 +289,10 @@ def test_vec_dot_sugar(ctx: MLIRContext, workdir: Path):
     tiles = 4
     k = K // tiles
 
-    ipu_insts = aiex.ipu.get_prolog()
+    npu_insts = aiex.npu.get_prolog()
 
-    @aie.device(AIEDevice.ipu)
-    def ipu():
+    @aie.device(AIEDevice.npu)
+    def npu():
         tile_0_0 = aie.tile(0, 0)
         tile_0_1 = aie.tile(0, 1)
         tile_0_2 = aie.tile(0, 2)
@@ -327,8 +327,8 @@ def ipu():
         ddr_id = 0
         offsets = list(range(0, K, k))
         for i, bd_id in enumerate(range(tiles)):
-            ipu_insts.extend(
-                aiex.ipu.writebd_shimtile(
+            npu_insts.extend(
+                aiex.npu.writebd_shimtile(
                     col,
                     bd_id,
                     buffer_length=k,
@@ -336,8 +336,8 @@ def ipu():
                     ddr_id=ddr_id,
                 )
             )
-            ipu_insts.extend(
-                aiex.ipu.shimtile_push_queue(MM2S, channel_index, col, bd_id)
+            npu_insts.extend(
+                aiex.npu.shimtile_push_queue(MM2S, channel_index, col, bd_id)
             )
 
         # in B
@@ -345,8 +345,8 @@ def ipu():
         col = 0
         ddr_id = 1
         for i, bd_id in enumerate(range(bd_id + 1, bd_id + 1 + tiles)):
-            ipu_insts.extend(
-                aiex.ipu.writebd_shimtile(
+            npu_insts.extend(
+                aiex.npu.writebd_shimtile(
                     col,
                     bd_id,
                     buffer_length=k,
@@ -354,8 +354,8 @@ def ipu():
                     ddr_id=ddr_id,
                 )
             )
-            ipu_insts.extend(
-                aiex.ipu.shimtile_push_queue(MM2S, channel_index, col, bd_id)
+            npu_insts.extend(
+                aiex.npu.shimtile_push_queue(MM2S, channel_index, col, bd_id)
             )
 
         # out C
@@ -363,8 +363,8 @@ def ipu():
         col = 0
         ddr_id = 2
         for i, bd_id in enumerate(range(bd_id + 1, bd_id + 1 + tiles)):
-            ipu_insts.extend(
-                aiex.ipu.writebd_shimtile(
+            npu_insts.extend(
+                aiex.npu.writebd_shimtile(
                     col,
                     bd_id,
                     buffer_length=1,
@@ -372,11 +372,11 @@ def ipu():
                     ddr_id=ddr_id,
                 )
             )
-            ipu_insts.extend(
-                aiex.ipu.shimtile_push_queue(S2MM, channel_index, col, bd_id)
+            npu_insts.extend(
+                aiex.npu.shimtile_push_queue(S2MM, channel_index, col, bd_id)
             )
-            ipu_insts.extend(
-                aiex.ipu.sync(
+            npu_insts.extend(
+                aiex.npu.sync(
                     channel=0,
                     column=0,
                     column_num=1,
@@ -440,9 +440,9 @@ def core():
 
     compile_without_vectorization(ctx.module, workdir)
     xclbin_path = make_xclbin(ctx.module, workdir)
-    with FileLock("/tmp/ipu.lock"):
+    with FileLock("/tmp/npu.lock"):
         xclbin = XCLBin(xclbin_path, "MLIR_AIE")
-        xclbin.load_ipu_instructions(ipu_insts)
+        xclbin.load_npu_instructions(npu_insts)
         views = xclbin.mmap_buffers([(K,), (K,), (tiles,)], np.int32)
 
         wrap_A = np.asarray(views[0])
diff --git a/test/ipu-xrt/e2e/tiled_matrix_add.ipynb b/test/npu-xrt/e2e/tiled_matrix_add.ipynb
similarity index 91%
rename from test/ipu-xrt/e2e/tiled_matrix_add.ipynb
rename to test/npu-xrt/e2e/tiled_matrix_add.ipynb
index 1d9a6f1c59..0c9a2247ac 100644
--- a/test/ipu-xrt/e2e/tiled_matrix_add.ipynb
+++ b/test/npu-xrt/e2e/tiled_matrix_add.ipynb
@@ -94,7 +94,7 @@
     "_, _, (d1_size, d1_stride), (d0_size, d0_stride) = tiling_calculator_n_tiles(\n",
     "    M, N, n_tile_rows=n_tile_rows, n_tile_cols=n_tile_cols\n",
     ")\n",
-    "ipu_insts = aiex.ipu.get_prolog()"
+    "npu_insts = aiex.npu.get_prolog()"
    ]
   },
   {
@@ -112,8 +112,8 @@
    "metadata": {},
    "outputs": [],
    "source": [
-    "@aie.device(AIEDevice.ipu)\n",
-    "def ipu():  # function name isn't load-bearing\n",
+    "@aie.device(AIEDevice.npu)\n",
+    "def npu():  # function name isn't load-bearing\n",
     "\n",
     "    # tiles that will participate\n",
     "    shim_tile_0_0 = aie.tile(0, 0)\n",
@@ -193,8 +193,8 @@
     "    # (yes this is a weird naming/assignment but it's due to a hack in implementation...)\n",
     "    ddr_id = 0\n",
     "    for i, bd_id in enumerate(range(4)):\n",
-    "        ipu_insts.extend(\n",
-    "            aiex.ipu.writebd_shimtile(\n",
+    "        npu_insts.extend(\n",
+    "            aiex.npu.writebd_shimtile(\n",
     "                col,\n",
     "                bd_id,\n",
     "                tile_rows * tile_cols,\n",
@@ -206,8 +206,8 @@
     "                d0_stride=d0_stride,\n",
     "            )\n",
     "        )\n",
-    "        ipu_insts.extend(\n",
-    "            aiex.ipu.shimtile_push_queue(\n",
+    "        npu_insts.extend(\n",
+    "            aiex.npu.shimtile_push_queue(\n",
     "                MM2S, input_a_tile_0_0_to_tile_0_1.source_channel, col, bd_id\n",
     "            )\n",
     "        )\n",
@@ -215,8 +215,8 @@
     "    # in B\n",
     "    ddr_id = 1\n",
     "    for i, bd_id in enumerate(range(bd_id + 1, bd_id + 1 + 4)):\n",
-    "        ipu_insts.extend(\n",
-    "            aiex.ipu.writebd_shimtile(\n",
+    "        npu_insts.extend(\n",
+    "            aiex.npu.writebd_shimtile(\n",
     "                col,\n",
     "                bd_id,\n",
     "                tile_rows * tile_cols,\n",
@@ -228,8 +228,8 @@
     "                d0_stride=d0_stride,\n",
     "            )\n",
     "        )\n",
-    "        ipu_insts.extend(\n",
-    "            aiex.ipu.shimtile_push_queue(\n",
+    "        npu_insts.extend(\n",
+    "            aiex.npu.shimtile_push_queue(\n",
     "                MM2S, input_b_tile_0_0_to_tile_0_1.source_channel, col, bd_id\n",
     "            )\n",
     "        )\n",
@@ -237,8 +237,8 @@
     "    # out C\n",
     "    ddr_id = 2\n",
     "    for i, bd_id in enumerate(range(bd_id + 1, bd_id + 1 + 4)):\n",
-    "        ipu_insts.extend(\n",
-    "            aiex.ipu.writebd_shimtile(\n",
+    "        npu_insts.extend(\n",
+    "            aiex.npu.writebd_shimtile(\n",
     "                bd_id,\n",
     "                tile_rows * tile_cols,\n",
     "                offsets[i],\n",
@@ -249,13 +249,13 @@
     "                d0_stride=d0_stride,\n",
     "            )\n",
     "        )\n",
-    "        ipu_insts.extend(\n",
-    "            aiex.ipu.shimtile_push_queue(\n",
+    "        npu_insts.extend(\n",
+    "            aiex.npu.shimtile_push_queue(\n",
     "                S2MM, output_c_tile_0_1_to_tile_0_0.dest_channel, col, bd_id\n",
     "            )\n",
     "        )\n",
-    "        ipu_insts.extend(\n",
-    "            aiex.ipu.sync(\n",
+    "        npu_insts.extend(\n",
+    "            aiex.npu.sync(\n",
     "                channel=0, column=0, column_num=1, direction=0, row=0, row_num=1\n",
     "            )\n",
     "        )\n",
@@ -365,7 +365,7 @@
      "output_type": "stream",
      "text": [
       "module {\n",
-      "  aie.device(ipu) {\n",
+      "  aie.device(npu) {\n",
       "    %tile_0_0 = aie.tile(0, 0)\n",
       "    %tile_0_1 = aie.tile(0, 1)\n",
       "    %tile_0_2 = aie.tile(0, 2)\n",
@@ -376,34 +376,34 @@
       "    aie.flow(%tile_0_2, DMA : 0, %tile_0_1, DMA : 2)\n",
       "    aie.flow(%tile_0_1, DMA : 2, %tile_0_0, DMA : 0)\n",
       "    func.func @bobsyouruncle() {\n",
-      "      aiex.ipu.writebd_shimtile {bd_id = 0 : i32, buffer_length = 64 : i32, buffer_offset = 0 : i32, column = 0 : i32, column_num = 1 : i32, d0_size = 8 : i32, d0_stride = 0 : i32, d1_size = 8 : i32, d1_stride = 15 : i32, d2_stride = 0 : i32, ddr_id = 0 : i32, enable_packet = 0 : i32, iteration_current = 0 : i32, iteration_size = 0 : i32, iteration_stride = 0 : i32, lock_acq_enable = 0 : i32, lock_acq_id = 0 : i32, lock_acq_val = 0 : i32, lock_rel_id = 0 : i32, lock_rel_val = 0 : i32, next_bd = 0 : i32, out_of_order_id = 0 : i32, packet_id = 0 : i32, packet_type = 0 : i32, use_next_bd = 0 : i32, valid_bd = 1 : i32}\n",
-      "      aiex.ipu.write32 {address = 119316 : ui32, column = 0 : i32, row = 0 : i32, value = 0 : ui32}\n",
-      "      aiex.ipu.writebd_shimtile {bd_id = 1 : i32, buffer_length = 64 : i32, buffer_offset = 32 : i32, column = 0 : i32, column_num = 1 : i32, d0_size = 8 : i32, d0_stride = 0 : i32, d1_size = 8 : i32, d1_stride = 15 : i32, d2_stride = 0 : i32, ddr_id = 0 : i32, enable_packet = 0 : i32, iteration_current = 0 : i32, iteration_size = 0 : i32, iteration_stride = 0 : i32, lock_acq_enable = 0 : i32, lock_acq_id = 0 : i32, lock_acq_val = 0 : i32, lock_rel_id = 0 : i32, lock_rel_val = 0 : i32, next_bd = 0 : i32, out_of_order_id = 0 : i32, packet_id = 0 : i32, packet_type = 0 : i32, use_next_bd = 0 : i32, valid_bd = 1 : i32}\n",
-      "      aiex.ipu.write32 {address = 119316 : ui32, column = 0 : i32, row = 0 : i32, value = 1 : ui32}\n",
-      "      aiex.ipu.writebd_shimtile {bd_id = 2 : i32, buffer_length = 64 : i32, buffer_offset = 512 : i32, column = 0 : i32, column_num = 1 : i32, d0_size = 8 : i32, d0_stride = 0 : i32, d1_size = 8 : i32, d1_stride = 15 : i32, d2_stride = 0 : i32, ddr_id = 0 : i32, enable_packet = 0 : i32, iteration_current = 0 : i32, iteration_size = 0 : i32, iteration_stride = 0 : i32, lock_acq_enable = 0 : i32, lock_acq_id = 0 : i32, lock_acq_val = 0 : i32, lock_rel_id = 0 : i32, lock_rel_val = 0 : i32, next_bd = 0 : i32, out_of_order_id = 0 : i32, packet_id = 0 : i32, packet_type = 0 : i32, use_next_bd = 0 : i32, valid_bd = 1 : i32}\n",
-      "      aiex.ipu.write32 {address = 119316 : ui32, column = 0 : i32, row = 0 : i32, value = 2 : ui32}\n",
-      "      aiex.ipu.writebd_shimtile {bd_id = 3 : i32, buffer_length = 64 : i32, buffer_offset = 544 : i32, column = 0 : i32, column_num = 1 : i32, d0_size = 8 : i32, d0_stride = 0 : i32, d1_size = 8 : i32, d1_stride = 15 : i32, d2_stride = 0 : i32, ddr_id = 0 : i32, enable_packet = 0 : i32, iteration_current = 0 : i32, iteration_size = 0 : i32, iteration_stride = 0 : i32, lock_acq_enable = 0 : i32, lock_acq_id = 0 : i32, lock_acq_val = 0 : i32, lock_rel_id = 0 : i32, lock_rel_val = 0 : i32, next_bd = 0 : i32, out_of_order_id = 0 : i32, packet_id = 0 : i32, packet_type = 0 : i32, use_next_bd = 0 : i32, valid_bd = 1 : i32}\n",
-      "      aiex.ipu.write32 {address = 119316 : ui32, column = 0 : i32, row = 0 : i32, value = 3 : ui32}\n",
-      "      aiex.ipu.writebd_shimtile {bd_id = 4 : i32, buffer_length = 64 : i32, buffer_offset = 0 : i32, column = 0 : i32, column_num = 1 : i32, d0_size = 8 : i32, d0_stride = 0 : i32, d1_size = 8 : i32, d1_stride = 15 : i32, d2_stride = 0 : i32, ddr_id = 1 : i32, enable_packet = 0 : i32, iteration_current = 0 : i32, iteration_size = 0 : i32, iteration_stride = 0 : i32, lock_acq_enable = 0 : i32, lock_acq_id = 0 : i32, lock_acq_val = 0 : i32, lock_rel_id = 0 : i32, lock_rel_val = 0 : i32, next_bd = 0 : i32, out_of_order_id = 0 : i32, packet_id = 0 : i32, packet_type = 0 : i32, use_next_bd = 0 : i32, valid_bd = 1 : i32}\n",
-      "      aiex.ipu.write32 {address = 119324 : ui32, column = 0 : i32, row = 0 : i32, value = 4 : ui32}\n",
-      "      aiex.ipu.writebd_shimtile {bd_id = 5 : i32, buffer_length = 64 : i32, buffer_offset = 32 : i32, column = 0 : i32, column_num = 1 : i32, d0_size = 8 : i32, d0_stride = 0 : i32, d1_size = 8 : i32, d1_stride = 15 : i32, d2_stride = 0 : i32, ddr_id = 1 : i32, enable_packet = 0 : i32, iteration_current = 0 : i32, iteration_size = 0 : i32, iteration_stride = 0 : i32, lock_acq_enable = 0 : i32, lock_acq_id = 0 : i32, lock_acq_val = 0 : i32, lock_rel_id = 0 : i32, lock_rel_val = 0 : i32, next_bd = 0 : i32, out_of_order_id = 0 : i32, packet_id = 0 : i32, packet_type = 0 : i32, use_next_bd = 0 : i32, valid_bd = 1 : i32}\n",
-      "      aiex.ipu.write32 {address = 119324 : ui32, column = 0 : i32, row = 0 : i32, value = 5 : ui32}\n",
-      "      aiex.ipu.writebd_shimtile {bd_id = 6 : i32, buffer_length = 64 : i32, buffer_offset = 512 : i32, column = 0 : i32, column_num = 1 : i32, d0_size = 8 : i32, d0_stride = 0 : i32, d1_size = 8 : i32, d1_stride = 15 : i32, d2_stride = 0 : i32, ddr_id = 1 : i32, enable_packet = 0 : i32, iteration_current = 0 : i32, iteration_size = 0 : i32, iteration_stride = 0 : i32, lock_acq_enable = 0 : i32, lock_acq_id = 0 : i32, lock_acq_val = 0 : i32, lock_rel_id = 0 : i32, lock_rel_val = 0 : i32, next_bd = 0 : i32, out_of_order_id = 0 : i32, packet_id = 0 : i32, packet_type = 0 : i32, use_next_bd = 0 : i32, valid_bd = 1 : i32}\n",
-      "      aiex.ipu.write32 {address = 119324 : ui32, column = 0 : i32, row = 0 : i32, value = 6 : ui32}\n",
-      "      aiex.ipu.writebd_shimtile {bd_id = 7 : i32, buffer_length = 64 : i32, buffer_offset = 544 : i32, column = 0 : i32, column_num = 1 : i32, d0_size = 8 : i32, d0_stride = 0 : i32, d1_size = 8 : i32, d1_stride = 15 : i32, d2_stride = 0 : i32, ddr_id = 1 : i32, enable_packet = 0 : i32, iteration_current = 0 : i32, iteration_size = 0 : i32, iteration_stride = 0 : i32, lock_acq_enable = 0 : i32, lock_acq_id = 0 : i32, lock_acq_val = 0 : i32, lock_rel_id = 0 : i32, lock_rel_val = 0 : i32, next_bd = 0 : i32, out_of_order_id = 0 : i32, packet_id = 0 : i32, packet_type = 0 : i32, use_next_bd = 0 : i32, valid_bd = 1 : i32}\n",
-      "      aiex.ipu.write32 {address = 119324 : ui32, column = 0 : i32, row = 0 : i32, value = 7 : ui32}\n",
-      "      aiex.ipu.writebd_shimtile {bd_id = 8 : i32, buffer_length = 64 : i32, buffer_offset = 0 : i32, column = 0 : i32, column_num = 1 : i32, d0_size = 8 : i32, d0_stride = 0 : i32, d1_size = 8 : i32, d1_stride = 15 : i32, d2_stride = 0 : i32, ddr_id = 2 : i32, enable_packet = 0 : i32, iteration_current = 0 : i32, iteration_size = 0 : i32, iteration_stride = 0 : i32, lock_acq_enable = 0 : i32, lock_acq_id = 0 : i32, lock_acq_val = 0 : i32, lock_rel_id = 0 : i32, lock_rel_val = 0 : i32, next_bd = 0 : i32, out_of_order_id = 0 : i32, packet_id = 0 : i32, packet_type = 0 : i32, use_next_bd = 0 : i32, valid_bd = 1 : i32}\n",
-      "      aiex.ipu.write32 {address = 119300 : ui32, column = 0 : i32, row = 0 : i32, value = 2147483656 : ui32}\n",
-      "      aiex.ipu.sync {channel = 0 : i32, column = 0 : i32, column_num = 1 : i32, direction = 0 : i32, row = 0 : i32, row_num = 1 : i32}\n",
-      "      aiex.ipu.writebd_shimtile {bd_id = 9 : i32, buffer_length = 64 : i32, buffer_offset = 32 : i32, column = 0 : i32, column_num = 1 : i32, d0_size = 8 : i32, d0_stride = 0 : i32, d1_size = 8 : i32, d1_stride = 15 : i32, d2_stride = 0 : i32, ddr_id = 2 : i32, enable_packet = 0 : i32, iteration_current = 0 : i32, iteration_size = 0 : i32, iteration_stride = 0 : i32, lock_acq_enable = 0 : i32, lock_acq_id = 0 : i32, lock_acq_val = 0 : i32, lock_rel_id = 0 : i32, lock_rel_val = 0 : i32, next_bd = 0 : i32, out_of_order_id = 0 : i32, packet_id = 0 : i32, packet_type = 0 : i32, use_next_bd = 0 : i32, valid_bd = 1 : i32}\n",
-      "      aiex.ipu.write32 {address = 119300 : ui32, column = 0 : i32, row = 0 : i32, value = 2147483657 : ui32}\n",
-      "      aiex.ipu.sync {channel = 0 : i32, column = 0 : i32, column_num = 1 : i32, direction = 0 : i32, row = 0 : i32, row_num = 1 : i32}\n",
-      "      aiex.ipu.writebd_shimtile {bd_id = 10 : i32, buffer_length = 64 : i32, buffer_offset = 512 : i32, column = 0 : i32, column_num = 1 : i32, d0_size = 8 : i32, d0_stride = 0 : i32, d1_size = 8 : i32, d1_stride = 15 : i32, d2_stride = 0 : i32, ddr_id = 2 : i32, enable_packet = 0 : i32, iteration_current = 0 : i32, iteration_size = 0 : i32, iteration_stride = 0 : i32, lock_acq_enable = 0 : i32, lock_acq_id = 0 : i32, lock_acq_val = 0 : i32, lock_rel_id = 0 : i32, lock_rel_val = 0 : i32, next_bd = 0 : i32, out_of_order_id = 0 : i32, packet_id = 0 : i32, packet_type = 0 : i32, use_next_bd = 0 : i32, valid_bd = 1 : i32}\n",
-      "      aiex.ipu.write32 {address = 119300 : ui32, column = 0 : i32, row = 0 : i32, value = 2147483658 : ui32}\n",
-      "      aiex.ipu.sync {channel = 0 : i32, column = 0 : i32, column_num = 1 : i32, direction = 0 : i32, row = 0 : i32, row_num = 1 : i32}\n",
-      "      aiex.ipu.writebd_shimtile {bd_id = 11 : i32, buffer_length = 64 : i32, buffer_offset = 544 : i32, column = 0 : i32, column_num = 1 : i32, d0_size = 8 : i32, d0_stride = 0 : i32, d1_size = 8 : i32, d1_stride = 15 : i32, d2_stride = 0 : i32, ddr_id = 2 : i32, enable_packet = 0 : i32, iteration_current = 0 : i32, iteration_size = 0 : i32, iteration_stride = 0 : i32, lock_acq_enable = 0 : i32, lock_acq_id = 0 : i32, lock_acq_val = 0 : i32, lock_rel_id = 0 : i32, lock_rel_val = 0 : i32, next_bd = 0 : i32, out_of_order_id = 0 : i32, packet_id = 0 : i32, packet_type = 0 : i32, use_next_bd = 0 : i32, valid_bd = 1 : i32}\n",
-      "      aiex.ipu.write32 {address = 119300 : ui32, column = 0 : i32, row = 0 : i32, value = 2147483659 : ui32}\n",
-      "      aiex.ipu.sync {channel = 0 : i32, column = 0 : i32, column_num = 1 : i32, direction = 0 : i32, row = 0 : i32, row_num = 1 : i32}\n",
+      "      aiex.npu.writebd_shimtile {bd_id = 0 : i32, buffer_length = 64 : i32, buffer_offset = 0 : i32, column = 0 : i32, column_num = 1 : i32, d0_size = 8 : i32, d0_stride = 0 : i32, d1_size = 8 : i32, d1_stride = 15 : i32, d2_stride = 0 : i32, ddr_id = 0 : i32, enable_packet = 0 : i32, iteration_current = 0 : i32, iteration_size = 0 : i32, iteration_stride = 0 : i32, lock_acq_enable = 0 : i32, lock_acq_id = 0 : i32, lock_acq_val = 0 : i32, lock_rel_id = 0 : i32, lock_rel_val = 0 : i32, next_bd = 0 : i32, out_of_order_id = 0 : i32, packet_id = 0 : i32, packet_type = 0 : i32, use_next_bd = 0 : i32, valid_bd = 1 : i32}\n",
+      "      aiex.npu.write32 {address = 119316 : ui32, column = 0 : i32, row = 0 : i32, value = 0 : ui32}\n",
+      "      aiex.npu.writebd_shimtile {bd_id = 1 : i32, buffer_length = 64 : i32, buffer_offset = 32 : i32, column = 0 : i32, column_num = 1 : i32, d0_size = 8 : i32, d0_stride = 0 : i32, d1_size = 8 : i32, d1_stride = 15 : i32, d2_stride = 0 : i32, ddr_id = 0 : i32, enable_packet = 0 : i32, iteration_current = 0 : i32, iteration_size = 0 : i32, iteration_stride = 0 : i32, lock_acq_enable = 0 : i32, lock_acq_id = 0 : i32, lock_acq_val = 0 : i32, lock_rel_id = 0 : i32, lock_rel_val = 0 : i32, next_bd = 0 : i32, out_of_order_id = 0 : i32, packet_id = 0 : i32, packet_type = 0 : i32, use_next_bd = 0 : i32, valid_bd = 1 : i32}\n",
+      "      aiex.npu.write32 {address = 119316 : ui32, column = 0 : i32, row = 0 : i32, value = 1 : ui32}\n",
+      "      aiex.npu.writebd_shimtile {bd_id = 2 : i32, buffer_length = 64 : i32, buffer_offset = 512 : i32, column = 0 : i32, column_num = 1 : i32, d0_size = 8 : i32, d0_stride = 0 : i32, d1_size = 8 : i32, d1_stride = 15 : i32, d2_stride = 0 : i32, ddr_id = 0 : i32, enable_packet = 0 : i32, iteration_current = 0 : i32, iteration_size = 0 : i32, iteration_stride = 0 : i32, lock_acq_enable = 0 : i32, lock_acq_id = 0 : i32, lock_acq_val = 0 : i32, lock_rel_id = 0 : i32, lock_rel_val = 0 : i32, next_bd = 0 : i32, out_of_order_id = 0 : i32, packet_id = 0 : i32, packet_type = 0 : i32, use_next_bd = 0 : i32, valid_bd = 1 : i32}\n",
+      "      aiex.npu.write32 {address = 119316 : ui32, column = 0 : i32, row = 0 : i32, value = 2 : ui32}\n",
+      "      aiex.npu.writebd_shimtile {bd_id = 3 : i32, buffer_length = 64 : i32, buffer_offset = 544 : i32, column = 0 : i32, column_num = 1 : i32, d0_size = 8 : i32, d0_stride = 0 : i32, d1_size = 8 : i32, d1_stride = 15 : i32, d2_stride = 0 : i32, ddr_id = 0 : i32, enable_packet = 0 : i32, iteration_current = 0 : i32, iteration_size = 0 : i32, iteration_stride = 0 : i32, lock_acq_enable = 0 : i32, lock_acq_id = 0 : i32, lock_acq_val = 0 : i32, lock_rel_id = 0 : i32, lock_rel_val = 0 : i32, next_bd = 0 : i32, out_of_order_id = 0 : i32, packet_id = 0 : i32, packet_type = 0 : i32, use_next_bd = 0 : i32, valid_bd = 1 : i32}\n",
+      "      aiex.npu.write32 {address = 119316 : ui32, column = 0 : i32, row = 0 : i32, value = 3 : ui32}\n",
+      "      aiex.npu.writebd_shimtile {bd_id = 4 : i32, buffer_length = 64 : i32, buffer_offset = 0 : i32, column = 0 : i32, column_num = 1 : i32, d0_size = 8 : i32, d0_stride = 0 : i32, d1_size = 8 : i32, d1_stride = 15 : i32, d2_stride = 0 : i32, ddr_id = 1 : i32, enable_packet = 0 : i32, iteration_current = 0 : i32, iteration_size = 0 : i32, iteration_stride = 0 : i32, lock_acq_enable = 0 : i32, lock_acq_id = 0 : i32, lock_acq_val = 0 : i32, lock_rel_id = 0 : i32, lock_rel_val = 0 : i32, next_bd = 0 : i32, out_of_order_id = 0 : i32, packet_id = 0 : i32, packet_type = 0 : i32, use_next_bd = 0 : i32, valid_bd = 1 : i32}\n",
+      "      aiex.npu.write32 {address = 119324 : ui32, column = 0 : i32, row = 0 : i32, value = 4 : ui32}\n",
+      "      aiex.npu.writebd_shimtile {bd_id = 5 : i32, buffer_length = 64 : i32, buffer_offset = 32 : i32, column = 0 : i32, column_num = 1 : i32, d0_size = 8 : i32, d0_stride = 0 : i32, d1_size = 8 : i32, d1_stride = 15 : i32, d2_stride = 0 : i32, ddr_id = 1 : i32, enable_packet = 0 : i32, iteration_current = 0 : i32, iteration_size = 0 : i32, iteration_stride = 0 : i32, lock_acq_enable = 0 : i32, lock_acq_id = 0 : i32, lock_acq_val = 0 : i32, lock_rel_id = 0 : i32, lock_rel_val = 0 : i32, next_bd = 0 : i32, out_of_order_id = 0 : i32, packet_id = 0 : i32, packet_type = 0 : i32, use_next_bd = 0 : i32, valid_bd = 1 : i32}\n",
+      "      aiex.npu.write32 {address = 119324 : ui32, column = 0 : i32, row = 0 : i32, value = 5 : ui32}\n",
+      "      aiex.npu.writebd_shimtile {bd_id = 6 : i32, buffer_length = 64 : i32, buffer_offset = 512 : i32, column = 0 : i32, column_num = 1 : i32, d0_size = 8 : i32, d0_stride = 0 : i32, d1_size = 8 : i32, d1_stride = 15 : i32, d2_stride = 0 : i32, ddr_id = 1 : i32, enable_packet = 0 : i32, iteration_current = 0 : i32, iteration_size = 0 : i32, iteration_stride = 0 : i32, lock_acq_enable = 0 : i32, lock_acq_id = 0 : i32, lock_acq_val = 0 : i32, lock_rel_id = 0 : i32, lock_rel_val = 0 : i32, next_bd = 0 : i32, out_of_order_id = 0 : i32, packet_id = 0 : i32, packet_type = 0 : i32, use_next_bd = 0 : i32, valid_bd = 1 : i32}\n",
+      "      aiex.npu.write32 {address = 119324 : ui32, column = 0 : i32, row = 0 : i32, value = 6 : ui32}\n",
+      "      aiex.npu.writebd_shimtile {bd_id = 7 : i32, buffer_length = 64 : i32, buffer_offset = 544 : i32, column = 0 : i32, column_num = 1 : i32, d0_size = 8 : i32, d0_stride = 0 : i32, d1_size = 8 : i32, d1_stride = 15 : i32, d2_stride = 0 : i32, ddr_id = 1 : i32, enable_packet = 0 : i32, iteration_current = 0 : i32, iteration_size = 0 : i32, iteration_stride = 0 : i32, lock_acq_enable = 0 : i32, lock_acq_id = 0 : i32, lock_acq_val = 0 : i32, lock_rel_id = 0 : i32, lock_rel_val = 0 : i32, next_bd = 0 : i32, out_of_order_id = 0 : i32, packet_id = 0 : i32, packet_type = 0 : i32, use_next_bd = 0 : i32, valid_bd = 1 : i32}\n",
+      "      aiex.npu.write32 {address = 119324 : ui32, column = 0 : i32, row = 0 : i32, value = 7 : ui32}\n",
+      "      aiex.npu.writebd_shimtile {bd_id = 8 : i32, buffer_length = 64 : i32, buffer_offset = 0 : i32, column = 0 : i32, column_num = 1 : i32, d0_size = 8 : i32, d0_stride = 0 : i32, d1_size = 8 : i32, d1_stride = 15 : i32, d2_stride = 0 : i32, ddr_id = 2 : i32, enable_packet = 0 : i32, iteration_current = 0 : i32, iteration_size = 0 : i32, iteration_stride = 0 : i32, lock_acq_enable = 0 : i32, lock_acq_id = 0 : i32, lock_acq_val = 0 : i32, lock_rel_id = 0 : i32, lock_rel_val = 0 : i32, next_bd = 0 : i32, out_of_order_id = 0 : i32, packet_id = 0 : i32, packet_type = 0 : i32, use_next_bd = 0 : i32, valid_bd = 1 : i32}\n",
+      "      aiex.npu.write32 {address = 119300 : ui32, column = 0 : i32, row = 0 : i32, value = 2147483656 : ui32}\n",
+      "      aiex.npu.sync {channel = 0 : i32, column = 0 : i32, column_num = 1 : i32, direction = 0 : i32, row = 0 : i32, row_num = 1 : i32}\n",
+      "      aiex.npu.writebd_shimtile {bd_id = 9 : i32, buffer_length = 64 : i32, buffer_offset = 32 : i32, column = 0 : i32, column_num = 1 : i32, d0_size = 8 : i32, d0_stride = 0 : i32, d1_size = 8 : i32, d1_stride = 15 : i32, d2_stride = 0 : i32, ddr_id = 2 : i32, enable_packet = 0 : i32, iteration_current = 0 : i32, iteration_size = 0 : i32, iteration_stride = 0 : i32, lock_acq_enable = 0 : i32, lock_acq_id = 0 : i32, lock_acq_val = 0 : i32, lock_rel_id = 0 : i32, lock_rel_val = 0 : i32, next_bd = 0 : i32, out_of_order_id = 0 : i32, packet_id = 0 : i32, packet_type = 0 : i32, use_next_bd = 0 : i32, valid_bd = 1 : i32}\n",
+      "      aiex.npu.write32 {address = 119300 : ui32, column = 0 : i32, row = 0 : i32, value = 2147483657 : ui32}\n",
+      "      aiex.npu.sync {channel = 0 : i32, column = 0 : i32, column_num = 1 : i32, direction = 0 : i32, row = 0 : i32, row_num = 1 : i32}\n",
+      "      aiex.npu.writebd_shimtile {bd_id = 10 : i32, buffer_length = 64 : i32, buffer_offset = 512 : i32, column = 0 : i32, column_num = 1 : i32, d0_size = 8 : i32, d0_stride = 0 : i32, d1_size = 8 : i32, d1_stride = 15 : i32, d2_stride = 0 : i32, ddr_id = 2 : i32, enable_packet = 0 : i32, iteration_current = 0 : i32, iteration_size = 0 : i32, iteration_stride = 0 : i32, lock_acq_enable = 0 : i32, lock_acq_id = 0 : i32, lock_acq_val = 0 : i32, lock_rel_id = 0 : i32, lock_rel_val = 0 : i32, next_bd = 0 : i32, out_of_order_id = 0 : i32, packet_id = 0 : i32, packet_type = 0 : i32, use_next_bd = 0 : i32, valid_bd = 1 : i32}\n",
+      "      aiex.npu.write32 {address = 119300 : ui32, column = 0 : i32, row = 0 : i32, value = 2147483658 : ui32}\n",
+      "      aiex.npu.sync {channel = 0 : i32, column = 0 : i32, column_num = 1 : i32, direction = 0 : i32, row = 0 : i32, row_num = 1 : i32}\n",
+      "      aiex.npu.writebd_shimtile {bd_id = 11 : i32, buffer_length = 64 : i32, buffer_offset = 544 : i32, column = 0 : i32, column_num = 1 : i32, d0_size = 8 : i32, d0_stride = 0 : i32, d1_size = 8 : i32, d1_stride = 15 : i32, d2_stride = 0 : i32, ddr_id = 2 : i32, enable_packet = 0 : i32, iteration_current = 0 : i32, iteration_size = 0 : i32, iteration_stride = 0 : i32, lock_acq_enable = 0 : i32, lock_acq_id = 0 : i32, lock_acq_val = 0 : i32, lock_rel_id = 0 : i32, lock_rel_val = 0 : i32, next_bd = 0 : i32, out_of_order_id = 0 : i32, packet_id = 0 : i32, packet_type = 0 : i32, use_next_bd = 0 : i32, valid_bd = 1 : i32}\n",
+      "      aiex.npu.write32 {address = 119300 : ui32, column = 0 : i32, row = 0 : i32, value = 2147483659 : ui32}\n",
+      "      aiex.npu.sync {channel = 0 : i32, column = 0 : i32, column_num = 1 : i32, direction = 0 : i32, row = 0 : i32, row_num = 1 : i32}\n",
       "      return\n",
       "    }\n",
       "    %memtile_dma_0_1 = aie.memtile_dma(%tile_0_1) {\n",
@@ -525,7 +525,7 @@
    "metadata": {},
    "outputs": [],
    "source": [
-    "# shim DMA as ipu instructions\n",
+    "# shim DMA as npu instructions\n",
     "compile_without_vectorization(ctx.module, workdir)\n",
     "xclbin_path = make_xclbin(ctx.module, workdir)"
    ]
@@ -546,11 +546,11 @@
    "outputs": [],
    "source": [
     "# FileLock because this runs in CI where multiple jobs might be attempting to run (and the device isn't multi-tenant yet)\n",
-    "with FileLock(\"/tmp/ipu.lock\"):\n",
+    "with FileLock(\"/tmp/npu.lock\"):\n",
     "    # XRT manager\n",
     "    xclbin = XCLBin(xclbin_path, \"MLIR_AIE\")\n",
     "    # configure shim dmas\n",
-    "    xclbin.load_ipu_instructions(ipu_insts)\n",
+    "    xclbin.load_npu_instructions(npu_insts)\n",
     "\n",
     "    # initialize input operands and zero out output\n",
     "    views = xclbin.mmap_buffers([(M, N), (M, N), (M, N)], np.int32)\n",
diff --git a/test/ipu-xrt/e2e/tiled_nonsquare_tile_matrix_mult_vectorized.ipynb b/test/npu-xrt/e2e/tiled_nonsquare_tile_matrix_mult_vectorized.ipynb
similarity index 95%
rename from test/ipu-xrt/e2e/tiled_nonsquare_tile_matrix_mult_vectorized.ipynb
rename to test/npu-xrt/e2e/tiled_nonsquare_tile_matrix_mult_vectorized.ipynb
index d6b4248284..2b377f3fba 100644
--- a/test/ipu-xrt/e2e/tiled_nonsquare_tile_matrix_mult_vectorized.ipynb
+++ b/test/npu-xrt/e2e/tiled_nonsquare_tile_matrix_mult_vectorized.ipynb
@@ -442,10 +442,10 @@
     "        # A tiles are \"fat\" so need to offset by rows (i.e. d1 dim)\n",
     "        0 + d1_size_A * d1_stride_A,\n",
     "    ]\n",
-    "    ipu_insts = aiex.ipu.get_prolog()\n",
+    "    npu_insts = aiex.npu.get_prolog()\n",
     "    for i, bd_id in enumerate(range(2)):\n",
-    "        ipu_insts.extend(\n",
-    "            aiex.ipu.writebd_shimtile(\n",
+    "        npu_insts.extend(\n",
+    "            aiex.npu.writebd_shimtile(\n",
     "                col,\n",
     "                bd_id,\n",
     "                buffer_length=tile_m_A * tile_n_A,\n",
@@ -453,14 +453,14 @@
     "                ddr_id=ddr_id,\n",
     "            )\n",
     "        )\n",
-    "        ipu_insts.extend(aiex.ipu.write32(MM2S, channel_index, col, bd_id))\n",
+    "        npu_insts.extend(aiex.npu.write32(MM2S, channel_index, col, bd_id))\n",
     "\n",
     "    # in B\n",
     "    channel_index = 1\n",
     "    ddr_id = 1\n",
     "    for bd_id in range(bd_id + 1, bd_id + 1 + 4, 2):\n",
-    "        ipu_insts.extend(\n",
-    "            aiex.ipu.writebd_shimtile(\n",
+    "        npu_insts.extend(\n",
+    "            aiex.npu.writebd_shimtile(\n",
     "                col,\n",
     "                bd_id,\n",
     "                buffer_length=tile_m_B * tile_n_B,\n",
@@ -472,11 +472,11 @@
     "                d0_stride=d0_stride_B,\n",
     "            )\n",
     "        )\n",
-    "        ipu_insts.extend(aiex.ipu.write32(MM2S, channel_index, col, bd_id))\n",
+    "        npu_insts.extend(aiex.npu.write32(MM2S, channel_index, col, bd_id))\n",
     "        bd_id += 1\n",
     "        # B tiles are \"tall\" so need to offset by cols (i.e. d0 dim)\n",
-    "        ipu_insts.extend(\n",
-    "            aiex.ipu.writebd_shimtile(\n",
+    "        npu_insts.extend(\n",
+    "            aiex.npu.writebd_shimtile(\n",
     "                col,\n",
     "                bd_id,\n",
     "                buffer_length=tile_m_B * tile_n_B,\n",
@@ -488,7 +488,7 @@
     "                d0_stride=d0_stride_B,\n",
     "            )\n",
     "        )\n",
-    "        ipu_insts.extend(aiex.ipu.write32(MM2S, channel_index, col, bd_id))\n",
+    "        npu_insts.extend(aiex.npu.write32(MM2S, channel_index, col, bd_id))\n",
     "\n",
     "    # out C\n",
     "    channel_index = 0\n",
@@ -501,8 +501,8 @@
     "    ]\n",
     "\n",
     "    for i, bd_id in enumerate(range(bd_id + 1, bd_id + 1 + 4)):\n",
-    "        ipu_insts.extend(\n",
-    "            aiex.ipu.writebd_shimtile(\n",
+    "        npu_insts.extend(\n",
+    "            aiex.npu.writebd_shimtile(\n",
     "                col,\n",
     "                bd_id,\n",
     "                buffer_length=tile_m_C * tile_n_C,\n",
@@ -514,9 +514,9 @@
     "                d0_stride=d0_stride_C,\n",
     "            )\n",
     "        )\n",
-    "        ipu_insts.extend(aiex.ipu.write32(S2MM, channel_index, col, bd_id))\n",
-    "        ipu_insts.extend(\n",
-    "            aiex.ipu.sync(\n",
+    "        npu_insts.extend(aiex.npu.write32(S2MM, channel_index, col, bd_id))\n",
+    "        npu_insts.extend(\n",
+    "            aiex.npu.sync(\n",
     "                channel=0,\n",
     "                column=0,\n",
     "                column_num=1,\n",
@@ -526,7 +526,7 @@
     "            )\n",
     "        )\n",
     "\n",
-    "    return ipu_insts"
+    "    return npu_insts"
    ]
   },
   {
@@ -559,8 +559,8 @@
    },
    "outputs": [],
    "source": [
-    "@aie.device(AIEDevice.ipu)\n",
-    "def ipu():\n",
+    "@aie.device(AIEDevice.npu)\n",
+    "def npu():\n",
     "    matmul_i32_i32.emit(decl=True)\n",
     "    tile_0_0 = aie.tile(0, 0)\n",
     "    tile_0_1 = aie.tile(0, 1)\n",
@@ -675,10 +675,10 @@
    "outputs": [],
    "source": [
     "xclbin_path = make_xclbin(mod_aie, workdir)\n",
-    "with FileLock(\"/tmp/ipu.lock\"):\n",
+    "with FileLock(\"/tmp/npu.lock\"):\n",
     "    xclbin = XCLBin(xclbin_path, \"MLIR_AIE\")\n",
-    "    ipu_insts = command_control()\n",
-    "    xclbin.load_ipu_instructions(ipu_insts)\n",
+    "    npu_insts = command_control()\n",
+    "    xclbin.load_npu_instructions(npu_insts)\n",
     "\n",
     "    wrap_A, wrap_B, wrap_C = map(\n",
     "        np.asarray, xclbin.mmap_buffers([(M, N), (M, N), (M, N)], np.int32)\n",
diff --git a/test/ipu-xrt/e2e/util.py b/test/npu-xrt/e2e/util.py
similarity index 100%
rename from test/ipu-xrt/e2e/util.py
rename to test/npu-xrt/e2e/util.py
diff --git a/test/ipu-xrt/lit.local.cfg b/test/npu-xrt/lit.local.cfg
similarity index 91%
rename from test/ipu-xrt/lit.local.cfg
rename to test/npu-xrt/lit.local.cfg
index 04b92ba609..2d7aa71633 100644
--- a/test/ipu-xrt/lit.local.cfg
+++ b/test/npu-xrt/lit.local.cfg
@@ -7,7 +7,7 @@
 config.suffixes = [".lit", ".py"]
 
 if "ryzen_ai" not in config.available_features:
-    config.unsupported = ["ipu-xrt"]
+    config.unsupported = ["npu-xrt"]
 else:
     config.unsupported = []
 
diff --git a/test/ipu-xrt/makefile-common b/test/npu-xrt/makefile-common
similarity index 92%
rename from test/ipu-xrt/makefile-common
rename to test/npu-xrt/makefile-common
index d9a0a69015..51e9a19245 100644
--- a/test/ipu-xrt/makefile-common
+++ b/test/npu-xrt/makefile-common
@@ -1,4 +1,4 @@
-# Contains common definitions used across the Makefiles of ipu-xrt tests.
+# Contains common definitions used across the Makefiles of npu-xrt tests.
 
 # VITIS related variables
 VITIS_ROOT ?= $(shell realpath $(dir $(shell which vitis))/../)
diff --git a/test/ipu-xrt/matrix_multiplication_using_dma/aie.mlir b/test/npu-xrt/matrix_multiplication_using_dma/aie.mlir
similarity index 97%
rename from test/ipu-xrt/matrix_multiplication_using_dma/aie.mlir
rename to test/npu-xrt/matrix_multiplication_using_dma/aie.mlir
index 01594e64cf..541b44ecea 100644
--- a/test/ipu-xrt/matrix_multiplication_using_dma/aie.mlir
+++ b/test/npu-xrt/matrix_multiplication_using_dma/aie.mlir
@@ -7,7 +7,7 @@
 //===----------------------------------------------------------------------===//
 
 module {
-  aie.device(ipu) {
+  aie.device(npu) {
     memref.global "public" @inA : memref<64x32xi16>
     memref.global "public" @inA_cons : memref<64x32xi16>
     memref.global "public" @inB : memref<32x64xi16>
@@ -111,12 +111,12 @@ module {
       %c64_i64 = arith.constant 64 : i64
       %c32_i64 = arith.constant 32 : i64
       %c4096_i64 = arith.constant 4096 : i64
-      aiex.ipu.dma_memcpy_nd(0, 0, %arg2[%c0_i64, %c0_i64, %c0_i64, %c0_i64] [%c2_i64, %c2_i64, %c64_i64, %c32_i64] [%c4096_i64, %c32_i64, %c64_i64]) {id = 0 : i64, metadata = @outC} : memref<8192xi32>
-      aiex.ipu.dma_memcpy_nd(0, 0, %arg0[%c0_i64, %c0_i64, %c0_i64, %c0_i64] [%c2_i64, %c4_i64, %c64_i64, %c16_i64] [%c0_i64, %c16_i64, %c64_i64]) {id = 1 : i64, metadata = @inA} : memref<8192xi32>
-      aiex.ipu.dma_memcpy_nd(0, 0, %arg1[%c0_i64, %c0_i64, %c0_i64, %c0_i64] [%c2_i64, %c4_i64, %c32_i64, %c32_i64] [%c32_i64, %c2048_i64, %c64_i64]) {id = 2 : i64, metadata = @inB} : memref<8192xi32>
-      aiex.ipu.dma_memcpy_nd(0, 0, %arg0[%c0_i64, %c0_i64, %c0_i64, %c4096_i64] [%c2_i64, %c4_i64, %c64_i64, %c16_i64] [%c0_i64, %c16_i64, %c64_i64]) {id = 3 : i64, metadata = @inA} : memref<8192xi32>
-      aiex.ipu.dma_memcpy_nd(0, 0, %arg1[%c0_i64, %c0_i64, %c0_i64, %c0_i64] [%c2_i64, %c4_i64, %c32_i64, %c32_i64] [%c32_i64, %c2048_i64, %c64_i64]) {id = 4 : i64, metadata = @inB} : memref<8192xi32>
-      aiex.ipu.sync {channel = 0 : i32, column = 0 : i32, column_num = 1 : i32, direction = 0 : i32, row = 0 : i32, row_num = 1 : i32}
+      aiex.npu.dma_memcpy_nd(0, 0, %arg2[%c0_i64, %c0_i64, %c0_i64, %c0_i64] [%c2_i64, %c2_i64, %c64_i64, %c32_i64] [%c4096_i64, %c32_i64, %c64_i64]) {id = 0 : i64, metadata = @outC} : memref<8192xi32>
+      aiex.npu.dma_memcpy_nd(0, 0, %arg0[%c0_i64, %c0_i64, %c0_i64, %c0_i64] [%c2_i64, %c4_i64, %c64_i64, %c16_i64] [%c0_i64, %c16_i64, %c64_i64]) {id = 1 : i64, metadata = @inA} : memref<8192xi32>
+      aiex.npu.dma_memcpy_nd(0, 0, %arg1[%c0_i64, %c0_i64, %c0_i64, %c0_i64] [%c2_i64, %c4_i64, %c32_i64, %c32_i64] [%c32_i64, %c2048_i64, %c64_i64]) {id = 2 : i64, metadata = @inB} : memref<8192xi32>
+      aiex.npu.dma_memcpy_nd(0, 0, %arg0[%c0_i64, %c0_i64, %c0_i64, %c4096_i64] [%c2_i64, %c4_i64, %c64_i64, %c16_i64] [%c0_i64, %c16_i64, %c64_i64]) {id = 3 : i64, metadata = @inA} : memref<8192xi32>
+      aiex.npu.dma_memcpy_nd(0, 0, %arg1[%c0_i64, %c0_i64, %c0_i64, %c0_i64] [%c2_i64, %c4_i64, %c32_i64, %c32_i64] [%c32_i64, %c2048_i64, %c64_i64]) {id = 4 : i64, metadata = @inB} : memref<8192xi32>
+      aiex.npu.sync {channel = 0 : i32, column = 0 : i32, column_num = 1 : i32, direction = 0 : i32, row = 0 : i32, row_num = 1 : i32}
       return
     }
 
diff --git a/test/ipu-xrt/matrix_multiplication_using_dma/mm.cc b/test/npu-xrt/matrix_multiplication_using_dma/mm.cc
similarity index 100%
rename from test/ipu-xrt/matrix_multiplication_using_dma/mm.cc
rename to test/npu-xrt/matrix_multiplication_using_dma/mm.cc
diff --git a/test/ipu-xrt/matrix_multiplication_using_dma/run-a2x.lit b/test/npu-xrt/matrix_multiplication_using_dma/run-a2x.lit
similarity index 77%
rename from test/ipu-xrt/matrix_multiplication_using_dma/run-a2x.lit
rename to test/npu-xrt/matrix_multiplication_using_dma/run-a2x.lit
index 483c7967b7..dd8a83ef1a 100644
--- a/test/ipu-xrt/matrix_multiplication_using_dma/run-a2x.lit
+++ b/test/npu-xrt/matrix_multiplication_using_dma/run-a2x.lit
@@ -4,8 +4,8 @@
 // REQUIRES: ryzen_ai
 //
 // RUN: xchesscc_wrapper aie2 -I %aietools/include -c %S/mm.cc -o ./mm.o
-// RUN: aie2xclbin --use-chess --xclbin-name=aie2.xclbin --ipu-insts-name=insts2.txt --tmpdir=aie2xclbin.prj -v %S/aie.mlir
+// RUN: aie2xclbin --use-chess --xclbin-name=aie2.xclbin --npu-insts-name=insts2.txt --tmpdir=aie2xclbin.prj -v %S/aie.mlir
 // RUN: clang %S/test.cpp -o test.exe -std=c++11 -Wall %xrt_flags -lrt -lstdc++ -lboost_program_options -lboost_filesystem
-// RUN: %run_on_ipu ./test.exe -x aie2.xclbin -k MLIR_AIE -i insts2.txt | FileCheck %s
+// RUN: %run_on_npu ./test.exe -x aie2.xclbin -k MLIR_AIE -i insts2.txt | FileCheck %s
 // CHECK: PASS!
 
diff --git a/test/ipu-xrt/matrix_multiplication_using_dma/run.lit b/test/npu-xrt/matrix_multiplication_using_dma/run.lit
similarity index 78%
rename from test/ipu-xrt/matrix_multiplication_using_dma/run.lit
rename to test/npu-xrt/matrix_multiplication_using_dma/run.lit
index ac347dcce6..850baf0a7d 100644
--- a/test/ipu-xrt/matrix_multiplication_using_dma/run.lit
+++ b/test/npu-xrt/matrix_multiplication_using_dma/run.lit
@@ -4,7 +4,7 @@
 // REQUIRES: ryzen_ai, chess
 //
 // RUN: xchesscc_wrapper aie2 -I %aietools/include -c %S/mm.cc -o ./mm.o
-// RUN: %python aiecc.py --xbridge --aie-generate-cdo --aie-generate-ipu --no-compile-host --xclbin-name=aie.xclbin --ipu-insts-name=insts.txt %S/aie.mlir
+// RUN: %python aiecc.py --xbridge --aie-generate-cdo --aie-generate-npu --no-compile-host --xclbin-name=aie.xclbin --npu-insts-name=insts.txt %S/aie.mlir
 // RUN: clang %S/test.cpp -o test.exe -std=c++11 -Wall %xrt_flags -lrt -lstdc++ -lboost_program_options -lboost_filesystem
-// RUN: %run_on_ipu ./test.exe -x aie.xclbin -k MLIR_AIE -i insts.txt | FileCheck %s
+// RUN: %run_on_npu ./test.exe -x aie.xclbin -k MLIR_AIE -i insts.txt | FileCheck %s
 // CHECK: PASS!
diff --git a/test/ipu-xrt/matrix_multiplication_using_dma/test.cpp b/test/npu-xrt/matrix_multiplication_using_dma/test.cpp
similarity index 100%
rename from test/ipu-xrt/matrix_multiplication_using_dma/test.cpp
rename to test/npu-xrt/matrix_multiplication_using_dma/test.cpp
diff --git a/test/ipu-xrt/matrix_multiplication_using_dma/zero.cc b/test/npu-xrt/matrix_multiplication_using_dma/zero.cc
similarity index 100%
rename from test/ipu-xrt/matrix_multiplication_using_dma/zero.cc
rename to test/npu-xrt/matrix_multiplication_using_dma/zero.cc
diff --git a/test/ipu-xrt/two_col/Makefile b/test/npu-xrt/two_col/Makefile
similarity index 72%
rename from test/ipu-xrt/two_col/Makefile
rename to test/npu-xrt/two_col/Makefile
index 9fe6d4d097..08c3e61293 100644
--- a/test/ipu-xrt/two_col/Makefile
+++ b/test/npu-xrt/two_col/Makefile
@@ -5,14 +5,14 @@ VPATH := $(VISION_KERNELS_VPATH_BASE)/threshold
 all: final.xclbin
 
 insts.txt: aie.mlir
-	aiecc.py -v --aie-only-generate-ipu --ipu-insts-name=$@ $<
+	aiecc.py -v --aie-only-generate-npu --npu-insts-name=$@ $<
 
 threshold.o: threshold.cc
 	xchesscc -d ${CHESSCC2_FLAGS} -DBIT_WIDTH=8 -c $< -o $@
 	
 final.xclbin: aie.mlir threshold.o
-	aiecc.py -v --aie-generate-cdo --aie-generate-ipu --no-compile-host \
-		--xclbin-name=$@ --ipu-insts-name=insts.txt $<
+	aiecc.py -v --aie-generate-cdo --aie-generate-npu --no-compile-host \
+		--xclbin-name=$@ --npu-insts-name=insts.txt $<
 
 clean:
 	rm -rf *.elf* *.bif aie.mlir.prj *.xclbin sim \
diff --git a/test/ipu-xrt/two_col/aie.mlir b/test/npu-xrt/two_col/aie.mlir
similarity index 91%
rename from test/ipu-xrt/two_col/aie.mlir
rename to test/npu-xrt/two_col/aie.mlir
index 10975fd06b..692ef5db0a 100644
--- a/test/ipu-xrt/two_col/aie.mlir
+++ b/test/npu-xrt/two_col/aie.mlir
@@ -1,5 +1,5 @@
 module {
-  aie.device(ipu) {
+  aie.device(npu) {
     %0 = aie.tile(0, 0)
     %1 = aie.tile(0, 1)
     %2 = aie.tile(0, 2)
@@ -123,17 +123,17 @@ module {
       %c0 = arith.constant 0 : i64
       %c1 = arith.constant 1 : i64
       %c2048 = arith.constant 2048 : i64
-      aiex.ipu.rtp_write(0, 2, 0, 50) { buffer_sym_name = "rtp0" }
-      aiex.ipu.rtp_write(0, 3, 0, 50) { buffer_sym_name = "rtp1" }
-      aiex.ipu.rtp_write(1, 4, 0, 50) { buffer_sym_name = "rtp2" }
-      aiex.ipu.rtp_write(1, 5, 0, 50) { buffer_sym_name = "rtp3" }
-      aiex.ipu.rtp_write(0, 2, 1, 0) { buffer_sym_name = "rtp0" }
-      aiex.ipu.rtp_write(0, 3, 1, 0) { buffer_sym_name = "rtp1" }
-      aiex.ipu.rtp_write(1, 4, 1, 0) { buffer_sym_name = "rtp2" }
-      aiex.ipu.rtp_write(1, 5, 1, 0) { buffer_sym_name = "rtp3" }
-      aiex.ipu.dma_memcpy_nd (0, 0, %out[%c0,%c0,%c0,%c0][%c1,%c1,%c1,%c2048][%c0,%c0,%c0]) { metadata = @objFifo_out0, id = 1 : i64 } : memref<2048xi32>
-      aiex.ipu.dma_memcpy_nd (0, 0, %in[%c0,%c0,%c0,%c0][%c1,%c1,%c1,%c2048][%c0,%c0,%c0]) { metadata = @objFifo_in0, id = 0 : i64 } : memref<2048xi32>
-      aiex.ipu.sync { column = 0 : i32, row = 0 : i32, direction = 0 : i32, channel = 0 : i32, column_num = 1 : i32, row_num = 1 : i32 }
+      aiex.npu.rtp_write(0, 2, 0, 50) { buffer_sym_name = "rtp0" }
+      aiex.npu.rtp_write(0, 3, 0, 50) { buffer_sym_name = "rtp1" }
+      aiex.npu.rtp_write(1, 4, 0, 50) { buffer_sym_name = "rtp2" }
+      aiex.npu.rtp_write(1, 5, 0, 50) { buffer_sym_name = "rtp3" }
+      aiex.npu.rtp_write(0, 2, 1, 0) { buffer_sym_name = "rtp0" }
+      aiex.npu.rtp_write(0, 3, 1, 0) { buffer_sym_name = "rtp1" }
+      aiex.npu.rtp_write(1, 4, 1, 0) { buffer_sym_name = "rtp2" }
+      aiex.npu.rtp_write(1, 5, 1, 0) { buffer_sym_name = "rtp3" }
+      aiex.npu.dma_memcpy_nd (0, 0, %out[%c0,%c0,%c0,%c0][%c1,%c1,%c1,%c2048][%c0,%c0,%c0]) { metadata = @objFifo_out0, id = 1 : i64 } : memref<2048xi32>
+      aiex.npu.dma_memcpy_nd (0, 0, %in[%c0,%c0,%c0,%c0][%c1,%c1,%c1,%c2048][%c0,%c0,%c0]) { metadata = @objFifo_in0, id = 0 : i64 } : memref<2048xi32>
+      aiex.npu.sync { column = 0 : i32, row = 0 : i32, direction = 0 : i32, channel = 0 : i32, column_num = 1 : i32, row_num = 1 : i32 }
       return
     }
   }
diff --git a/test/ipu-xrt/two_col/run.lit b/test/npu-xrt/two_col/run.lit
similarity index 73%
rename from test/ipu-xrt/two_col/run.lit
rename to test/npu-xrt/two_col/run.lit
index 01ff6afed4..5b2b54b291 100644
--- a/test/ipu-xrt/two_col/run.lit
+++ b/test/npu-xrt/two_col/run.lit
@@ -4,7 +4,7 @@
 // REQUIRES: ryzen_ai, chess
 //
 // RUN: xchesscc_wrapper aie2 -I %aietools/include -DBIT_WIDTH=8 -c %S/threshold.cc -o ./threshold.o
-// RUN: %python aiecc.py --xchesscc --xbridge --aie-generate-cdo --aie-generate-ipu --no-compile-host --xclbin-name=aie.xclbin --ipu-insts-name=insts.txt %S/aie.mlir
+// RUN: %python aiecc.py --xchesscc --xbridge --aie-generate-cdo --aie-generate-npu --no-compile-host --xclbin-name=aie.xclbin --npu-insts-name=insts.txt %S/aie.mlir
 // RUN: clang %S/test.cpp -o test.exe -std=c++11 -Wall %xrt_flags -lrt -lstdc++ -lboost_program_options -lboost_filesystem
-// RUN: %run_on_ipu ./test.exe -x aie.xclbin -k MLIR_AIE -i insts.txt | FileCheck %s
+// RUN: %run_on_npu ./test.exe -x aie.xclbin -k MLIR_AIE -i insts.txt | FileCheck %s
 // CHECK: PASS!
diff --git a/test/ipu-xrt/two_col/run.sh b/test/npu-xrt/two_col/run.sh
similarity index 100%
rename from test/ipu-xrt/two_col/run.sh
rename to test/npu-xrt/two_col/run.sh
diff --git a/test/ipu-xrt/two_col/test.cpp b/test/npu-xrt/two_col/test.cpp
similarity index 100%
rename from test/ipu-xrt/two_col/test.cpp
rename to test/npu-xrt/two_col/test.cpp
diff --git a/test/ipu-xrt/two_col/threshold.cc b/test/npu-xrt/two_col/threshold.cc
similarity index 100%
rename from test/ipu-xrt/two_col/threshold.cc
rename to test/npu-xrt/two_col/threshold.cc
diff --git a/test/ipu-xrt/vector_scalar_using_dma/aie.mlir b/test/npu-xrt/vector_scalar_using_dma/aie.mlir
similarity index 95%
rename from test/ipu-xrt/vector_scalar_using_dma/aie.mlir
rename to test/npu-xrt/vector_scalar_using_dma/aie.mlir
index ebdd9aaefb..e2e9643370 100644
--- a/test/ipu-xrt/vector_scalar_using_dma/aie.mlir
+++ b/test/npu-xrt/vector_scalar_using_dma/aie.mlir
@@ -7,7 +7,7 @@
 //===----------------------------------------------------------------------===//
 
 module {
-  aie.device(ipu) {
+  aie.device(npu) {
     memref.global "public" @in : memref<1024xi32>
     memref.global "public" @in_cons : memref<1024xi32>
     memref.global "public" @out : memref<1024xi32>
@@ -66,9 +66,9 @@ module {
       %c0_i64 = arith.constant 0 : i64
       %c1_i64 = arith.constant 1 : i64
       %c4096_i64 = arith.constant 4096 : i64
-      aiex.ipu.dma_memcpy_nd(0, 0, %arg2[%c0_i64, %c0_i64, %c0_i64, %c0_i64] [%c1_i64, %c1_i64, %c1_i64, %c4096_i64] [%c0_i64, %c0_i64, %c0_i64]) {id = 0 : i64, metadata = @out} : memref<4096xi32>
-      aiex.ipu.dma_memcpy_nd(0, 0, %arg0[%c0_i64, %c0_i64, %c0_i64, %c0_i64] [%c1_i64, %c1_i64, %c1_i64, %c4096_i64] [%c0_i64, %c0_i64, %c0_i64]) {id = 1 : i64, metadata = @in} : memref<4096xi32>
-      aiex.ipu.sync {channel = 0 : i32, column = 0 : i32, column_num = 1 : i32, direction = 0 : i32, row = 0 : i32, row_num = 1 : i32}
+      aiex.npu.dma_memcpy_nd(0, 0, %arg2[%c0_i64, %c0_i64, %c0_i64, %c0_i64] [%c1_i64, %c1_i64, %c1_i64, %c4096_i64] [%c0_i64, %c0_i64, %c0_i64]) {id = 0 : i64, metadata = @out} : memref<4096xi32>
+      aiex.npu.dma_memcpy_nd(0, 0, %arg0[%c0_i64, %c0_i64, %c0_i64, %c0_i64] [%c1_i64, %c1_i64, %c1_i64, %c4096_i64] [%c0_i64, %c0_i64, %c0_i64]) {id = 1 : i64, metadata = @in} : memref<4096xi32>
+      aiex.npu.sync {channel = 0 : i32, column = 0 : i32, column_num = 1 : i32, direction = 0 : i32, row = 0 : i32, row_num = 1 : i32}
       return
     }
 
diff --git a/test/ipu-xrt/vector_scalar_using_dma/run.lit b/test/npu-xrt/vector_scalar_using_dma/run.lit
similarity index 78%
rename from test/ipu-xrt/vector_scalar_using_dma/run.lit
rename to test/npu-xrt/vector_scalar_using_dma/run.lit
index 56b5153e7b..494056eba0 100644
--- a/test/ipu-xrt/vector_scalar_using_dma/run.lit
+++ b/test/npu-xrt/vector_scalar_using_dma/run.lit
@@ -4,7 +4,7 @@
 // REQUIRES: ryzen_ai, chess
 //
 // RUN: xchesscc_wrapper aie2 -I %aietools/include -c %S/scale.cc -o ./scale.o
-// RUN: %python aiecc.py --xbridge --aie-generate-cdo --aie-generate-ipu --no-compile-host --xclbin-name=aie.xclbin --ipu-insts-name=insts.txt %S/aie.mlir
+// RUN: %python aiecc.py --xbridge --aie-generate-cdo --aie-generate-npu --no-compile-host --xclbin-name=aie.xclbin --npu-insts-name=insts.txt %S/aie.mlir
 // RUN: clang %S/test.cpp -o test.exe -std=c++11 -Wall %xrt_flags -lrt -lstdc++ -lboost_program_options -lboost_filesystem
-// RUN: %run_on_ipu ./test.exe -x aie.xclbin -k MLIR_AIE -i insts.txt | FileCheck %s
+// RUN: %run_on_npu ./test.exe -x aie.xclbin -k MLIR_AIE -i insts.txt | FileCheck %s
 // CHECK: PASS!
diff --git a/test/ipu-xrt/vector_scalar_using_dma/scale.cc b/test/npu-xrt/vector_scalar_using_dma/scale.cc
similarity index 100%
rename from test/ipu-xrt/vector_scalar_using_dma/scale.cc
rename to test/npu-xrt/vector_scalar_using_dma/scale.cc
diff --git a/test/ipu-xrt/vector_scalar_using_dma/test.cpp b/test/npu-xrt/vector_scalar_using_dma/test.cpp
similarity index 100%
rename from test/ipu-xrt/vector_scalar_using_dma/test.cpp
rename to test/npu-xrt/vector_scalar_using_dma/test.cpp
diff --git a/test/objectFifo-stateful-transform/nested_loop_test.mlir b/test/objectFifo-stateful-transform/nested_loop_test.mlir
index 12d35fce7e..c2ba81e1cb 100644
--- a/test/objectFifo-stateful-transform/nested_loop_test.mlir
+++ b/test/objectFifo-stateful-transform/nested_loop_test.mlir
@@ -9,7 +9,7 @@
 
 // RUN: aie-opt --aie-objectFifo-stateful-transform %s | FileCheck %s
 
-// CHECK-LABEL: aie.device(ipu)
+// CHECK-LABEL: aie.device(npu)
 //       CHECK:   scf.for
 //       CHECK:   {
 //       CHECK:     aie.use_lock
@@ -74,7 +74,7 @@
 //       CHECK:     aie.use_lock
 //       CHECK:   }
 
-aie.device(ipu) {
+aie.device(npu) {
   %tile_0_1 = aie.tile(0, 1)
   %tile_1_2 = aie.tile(1, 2)
   %tile_0_2 = aie.tile(0, 2)
diff --git a/test/python/ipu.py b/test/python/npu.py
similarity index 95%
rename from test/python/ipu.py
rename to test/python/npu.py
index e2ad6959e8..79b8c64bb6 100644
--- a/test/python/ipu.py
+++ b/test/python/npu.py
@@ -23,7 +23,7 @@
     object_fifo_link,
     tile,
 )
-from aie.dialects.aiex import ipu_sync, ipu_dma_memcpy_nd
+from aie.dialects.aiex import npu_sync, npu_dma_memcpy_nd
 from aie.dialects.func import FuncOp
 from aie.dialects.scf import for_
 from aie.dialects.scf import yield_
@@ -49,7 +49,7 @@ def my_vector_scalar(module):
 
     buffer_depth = 2
 
-    @device(AIEDevice.ipu)
+    @device(AIEDevice.npu)
     def device_body():
         scale_int32 = external_func(
             "scale_int32", inputs=[T.memref(n, T.i32()), T.memref(n, T.i32())]
@@ -79,9 +79,9 @@ def core_body():
             T.memref(N, T.i32()), T.memref(N, T.i32()), T.memref(N, T.i32())
         )
         def sequence(A, B, C):
-            ipu_dma_memcpy_nd(metadata="out", bd_id=0, mem=C, sizes=[1, 1, 1, N])
-            ipu_dma_memcpy_nd(metadata="in", bd_id=1, mem=A, sizes=[1, 1, 1, N])
-            ipu_sync(column=0, row=0, direction=0, channel=0)
+            npu_dma_memcpy_nd(metadata="out", bd_id=0, mem=C, sizes=[1, 1, 1, N])
+            npu_dma_memcpy_nd(metadata="in", bd_id=1, mem=A, sizes=[1, 1, 1, N])
+            npu_sync(column=0, row=0, direction=0, channel=0)
 
     assert module.operation.verify()
 
@@ -124,7 +124,7 @@ def my_matmul(module):
 
     vectorized = True
 
-    @device(AIEDevice.ipu)
+    @device(AIEDevice.npu)
     def device_body():
         zero_scalar = external_func("zero_scalar_i16", inputs=[T.memref(m, n, T.i16())])
         zero = external_func("zero_i16", inputs=[T.memref(m, n, T.i16())])
@@ -194,7 +194,7 @@ def sequence(A, B, C):
                 num_tile_rows = min(
                     [rows_per_block, M_div_m - tile_row_block * rows_per_block]
                 )
-                ipu_dma_memcpy_nd(
+                npu_dma_memcpy_nd(
                     metadata="outC",
                     bd_id=0,
                     mem=C,
@@ -210,7 +210,7 @@ def sequence(A, B, C):
                         * word_size_in
                         // 4
                     )
-                    ipu_dma_memcpy_nd(
+                    npu_dma_memcpy_nd(
                         metadata="inA",
                         bd_id=2 * tile_row + 1,
                         mem=A,
@@ -218,7 +218,7 @@ def sequence(A, B, C):
                         sizes=[N_div_n, K_div_k, m, k_in_i32s],
                         strides=[0, k_in_i32s, K_in_i32s],
                     )
-                    ipu_dma_memcpy_nd(
+                    npu_dma_memcpy_nd(
                         metadata="inB",
                         bd_id=2 * tile_row + 2,
                         mem=B,
@@ -226,7 +226,7 @@ def sequence(A, B, C):
                         strides=[n_in_i32s, k_x_N_in_i32s, N_in_i32s],
                     )
 
-                ipu_sync(column=0, row=0, direction=0, channel=0)
+                npu_sync(column=0, row=0, direction=0, channel=0)
 
     assert module.operation.verify()
 
@@ -234,7 +234,7 @@ def sequence(A, B, C):
 # CHECK-LABEL: edge_detect
 @construct_and_print_module
 def edge_detect(module):
-    @device(AIEDevice.ipu)
+    @device(AIEDevice.npu)
     def device_body():
         rgba2gray_line = external_func(
             "rgba2gray_line",
@@ -441,21 +441,21 @@ def core_body():
             T.memref(2304, T.i32()), T.memref(2304, T.i32()), T.memref(2304, T.i32())
         )
         def sequence(I, B, O):
-            ipu_dma_memcpy_nd(
+            npu_dma_memcpy_nd(
                 metadata="outOF_L2L3",
                 bd_id=0,
                 mem=O,
                 sizes=[1, 1, 36, 64],
                 strides=[0, 0, 64],
             )
-            ipu_dma_memcpy_nd(
+            npu_dma_memcpy_nd(
                 metadata="inOF_L3L2",
                 bd_id=1,
                 mem=I,
                 sizes=[1, 1, 36, 64],
                 strides=[0, 0, 64],
             )
-            ipu_sync(column=0, row=0, direction=0, channel=0)
+            npu_sync(column=0, row=0, direction=0, channel=0)
 
     assert module.operation.verify()
 
@@ -463,7 +463,7 @@ def sequence(I, B, O):
 # CHECK-LABEL: my_add_one_objFifo
 @construct_and_print_module
 def my_add_one_objFifo(module):
-    @device(AIEDevice.ipu)
+    @device(AIEDevice.npu)
     def device_body():
         shim_tile = tile(0, 0)
         mem_tile = tile(0, 1)
@@ -496,12 +496,12 @@ def core_body():
             T.memref(64, T.i32()), T.memref(32, T.i32()), T.memref(64, T.i32())
         )
         def sequence(inTensor, notUsed, outTensor):
-            ipu_dma_memcpy_nd(
+            npu_dma_memcpy_nd(
                 metadata="out0", bd_id=0, mem=outTensor, sizes=[1, 1, 1, 64]
             )
-            ipu_dma_memcpy_nd(
+            npu_dma_memcpy_nd(
                 metadata="in0", bd_id=1, mem=inTensor, sizes=[1, 1, 1, 64]
             )
-            ipu_sync(column=0, row=0, direction=0, channel=0)
+            npu_sync(column=0, row=0, direction=0, channel=0)
 
     assert module.operation.verify()
diff --git a/test/python/tile_array.py b/test/python/tile_array.py
index 272b48832e..e6735a7565 100644
--- a/test/python/tile_array.py
+++ b/test/python/tile_array.py
@@ -31,8 +31,8 @@
 # CHECK-LABEL: broadcast
 @construct_and_print_module
 def broadcast(module):
-    @aie.device(AIEDevice.ipu)
-    def ipu():
+    @aie.device(AIEDevice.npu)
+    def npu():
         df = TileArray()
         assert df[[0, 1], 0].shape == (2, 1)
         assert df[[0, 1], 3:].shape == (2, 3)
@@ -125,7 +125,7 @@ def ipu():
             print(f)
 
         # CHECK: module {
-        # CHECK:   aie.device(ipu) {
+        # CHECK:   aie.device(npu) {
         # CHECK:     %tile_0_0 = aie.tile(0, 0)
         # CHECK:     %tile_0_1 = aie.tile(0, 1)
         # CHECK:     %tile_0_2 = aie.tile(0, 2)
@@ -194,8 +194,8 @@ def ipu():
 # CHECK-LABEL: lshift
 @construct_and_print_module
 def lshift(module):
-    @aie.device(AIEDevice.ipu)
-    def ipu():
+    @aie.device(AIEDevice.npu)
+    def npu():
         tiles = TileArray()
 
         fls = tiles[2, 1] << tiles[0, [2, 3]]
@@ -214,8 +214,8 @@ def ipu():
 # CHECK-LABEL: locks
 @construct_and_print_module
 def locks(module):
-    @aie.device(AIEDevice.ipu)
-    def ipu():
+    @aie.device(AIEDevice.npu)
+    def npu():
         tiles = TileArray()
 
         aie.lock(tiles[0, 1].tile)
@@ -249,8 +249,8 @@ def ipu():
 # CHECK-LABEL: neighbors
 @construct_and_print_module
 def neighbors(module):
-    @aie.device(AIEDevice.ipu)
-    def ipu():
+    @aie.device(AIEDevice.npu)
+    def npu():
         tiles = TileArray()
 
         # CHECK: Neighbors(north=%tile_2_3 = aie.tile(2, 3), west=%tile_1_2 = aie.tile(1, 2), south=None)
@@ -279,8 +279,8 @@ def channels_basic(module):
     # CHECK-LABEL: test-basic
     print("test-basic")
 
-    @aie.device(AIEDevice.ipu)
-    def ipu():
+    @aie.device(AIEDevice.npu)
+    def npu():
         tiles = TileArray()
 
         b = aie.buffer(tiles[2, 2].tile, (10, 10), T.i32(), name="bob")
@@ -295,13 +295,13 @@ def ipu():
     # CHECK: %alice = aie.buffer(%tile_2_2) {sym_name = "alice"} : memref<10x10xi32>
     # CHECK: %alice_producer_lock = aie.lock(%tile_2_2) {sym_name = "alice_producer_lock"}
     # CHECK: %alice_consumer_lock = aie.lock(%tile_2_2) {sym_name = "alice_consumer_lock"}
-    print(ipu)
+    print(npu)
 
     # CHECK-LABEL: test-context-manager
     print("test-context-manager")
 
-    @aie.device(AIEDevice.ipu)
-    def ipu():
+    @aie.device(AIEDevice.npu)
+    def npu():
         tiles = TileArray()
 
         c = Channel(
@@ -334,14 +334,14 @@ def core():
     # CHECK:   aie.use_lock(%alice_producer_lock, Release)
     # CHECK:   aie.end
     # CHECK: }
-    print(ipu)
+    print(npu)
 
 
 # CHECK-LABEL: nd_channels
 @construct_and_print_module
 def nd_channels(module):
-    @aie.device(AIEDevice.ipu)
-    def ipu():
+    @aie.device(AIEDevice.npu)
+    def npu():
         tiles = TileArray()
 
         shapes = np.array([(10, 10)], dtype="i,i").astype(object)
@@ -377,8 +377,8 @@ def ipu():
 def buffer_test_this_needs_to_distinct_from_all_other_mentions_of_buffer_in_this_file(
     module,
 ):
-    @aie.device(AIEDevice.ipu)
-    def ipu():
+    @aie.device(AIEDevice.npu)
+    def npu():
         tiles = TileArray()
 
         shapes = [(10, 10)]
diff --git a/test/python/trace_utils.py b/test/python/trace_utils.py
index 437df0f336..4837ea8748 100644
--- a/test/python/trace_utils.py
+++ b/test/python/trace_utils.py
@@ -7,14 +7,14 @@
 
 # RUN: %python %s | FileCheck %s --check-prefix TRACE
 #
-# TRACE: aiex.ipu.write32 {address = 213200 : ui32, column = 0 : i32, row = 2 : i32, value = 65536 : ui32}
-# TRACE: aiex.ipu.write32 {address = 213204 : ui32, column = 0 : i32, row = 2 : i32, value = 0 : ui32}
-# TRACE: aiex.ipu.write32 {address = 213216 : ui32, column = 0 : i32, row = 2 : i32, value = 1260527909 : ui32}
-# TRACE: aiex.ipu.write32 {address = 213220 : ui32, column = 0 : i32, row = 2 : i32, value = 757865039 : ui32}
-# TRACE: aiex.ipu.write32 {address = 261888 : ui32, column = 0 : i32, row = 2 : i32, value = 289 : ui32}
-# TRACE: aiex.ipu.write32 {address = 261892 : ui32, column = 0 : i32, row = 2 : i32, value = 0 : ui32}
-# TRACE: aiex.ipu.writebd_shimtile {bd_id = 3 : i32, buffer_length = 8192 : i32, buffer_offset = 1024 : i32, column = 0 : i32, column_num = 1 : i32, d0_size = 0 : i32, d0_stride = 0 : i32, d1_size = 0 : i32, d1_stride = 0 : i32, d2_stride = 0 : i32, ddr_id = 2 : i32, enable_packet = 0 : i32, iteration_current = 0 : i32, iteration_size = 0 : i32, iteration_stride = 0 : i32, lock_acq_enable = 0 : i32, lock_acq_id = 0 : i32, lock_acq_val = 0 : i32, lock_rel_id = 0 : i32, lock_rel_val = 0 : i32, next_bd = 0 : i32, out_of_order_id = 0 : i32, packet_id = 0 : i32, packet_type = 0 : i32, use_next_bd = 0 : i32, valid_bd = 1 : i32}
-# TRACE: aiex.ipu.write32 {address = 119308 : ui32, column = 0 : i32, row = 0 : i32, value = 3 : ui32}
+# TRACE: aiex.npu.write32 {address = 213200 : ui32, column = 0 : i32, row = 2 : i32, value = 65536 : ui32}
+# TRACE: aiex.npu.write32 {address = 213204 : ui32, column = 0 : i32, row = 2 : i32, value = 0 : ui32}
+# TRACE: aiex.npu.write32 {address = 213216 : ui32, column = 0 : i32, row = 2 : i32, value = 1260527909 : ui32}
+# TRACE: aiex.npu.write32 {address = 213220 : ui32, column = 0 : i32, row = 2 : i32, value = 757865039 : ui32}
+# TRACE: aiex.npu.write32 {address = 261888 : ui32, column = 0 : i32, row = 2 : i32, value = 289 : ui32}
+# TRACE: aiex.npu.write32 {address = 261892 : ui32, column = 0 : i32, row = 2 : i32, value = 0 : ui32}
+# TRACE: aiex.npu.writebd_shimtile {bd_id = 3 : i32, buffer_length = 8192 : i32, buffer_offset = 1024 : i32, column = 0 : i32, column_num = 1 : i32, d0_size = 0 : i32, d0_stride = 0 : i32, d1_size = 0 : i32, d1_stride = 0 : i32, d2_stride = 0 : i32, ddr_id = 2 : i32, enable_packet = 0 : i32, iteration_current = 0 : i32, iteration_size = 0 : i32, iteration_stride = 0 : i32, lock_acq_enable = 0 : i32, lock_acq_id = 0 : i32, lock_acq_val = 0 : i32, lock_rel_id = 0 : i32, lock_rel_val = 0 : i32, next_bd = 0 : i32, out_of_order_id = 0 : i32, packet_id = 0 : i32, packet_type = 0 : i32, use_next_bd = 0 : i32, valid_bd = 1 : i32}
+# TRACE: aiex.npu.write32 {address = 119308 : ui32, column = 0 : i32, row = 0 : i32, value = 3 : ui32}
 
 import sys
 
@@ -40,7 +40,7 @@
 def passthroughKernel():
     with mlir_mod_ctx() as ctx:
 
-        @device(AIEDevice.ipu)
+        @device(AIEDevice.npu)
         def device_body():
             # define types
             memRef_ty = T.memref(lineWidthInBytes, T.ui8())
@@ -96,19 +96,19 @@ def sequence(inTensor, outTensor, notUsed):
                         events=[0x4B, 0x22, 0x21, 0x25, 0x2D, 0x2C, 0x1A, 0x4F],
                     )
 
-                ipu_dma_memcpy_nd(
+                npu_dma_memcpy_nd(
                     metadata="in",
                     bd_id=0,
                     mem=inTensor,
                     sizes=[1, 1, 1, tensorSizeInInt32s],
                 )
-                ipu_dma_memcpy_nd(
+                npu_dma_memcpy_nd(
                     metadata="out",
                     bd_id=1,
                     mem=outTensor,
                     sizes=[1, 1, 1, tensorSizeInInt32s],
                 )
-                ipu_sync(column=0, row=0, direction=0, channel=0)
+                npu_sync(column=0, row=0, direction=0, channel=0)
 
     print(ctx.module)
 
diff --git a/tools/aie2xclbin/XCLBinGen.cpp b/tools/aie2xclbin/XCLBinGen.cpp
index 86c2966f6a..d8de0e06d6 100644
--- a/tools/aie2xclbin/XCLBinGen.cpp
+++ b/tools/aie2xclbin/XCLBinGen.cpp
@@ -815,7 +815,7 @@ static LogicalResult generateUnifiedObject(MLIRContext *context,
 }
 
 LogicalResult xilinx::aie2xclbin(MLIRContext *ctx, ModuleOp moduleOp,
-                                 XCLBinGenConfig &TK, StringRef OutputIPU,
+                                 XCLBinGenConfig &TK, StringRef OutputNPU,
                                  StringRef OutputXCLBin) {
   PassManager pm(ctx, moduleOp.getOperationName());
   applyConfigToPassManager(TK, pm);
@@ -842,25 +842,25 @@ LogicalResult xilinx::aie2xclbin(MLIRContext *ctx, ModuleOp moduleOp,
     return moduleOp.emitOpError()
            << "Unexpected target architecture: " << TK.TargetArch;
 
-  // generateIPUInstructions
+  // generateNPUInstructions
   {
     PassManager pm(ctx, moduleOp.getOperationName());
     applyConfigToPassManager(TK, pm);
 
-    pm.addNestedPass<AIE::DeviceOp>(AIEX::createAIEDmaToIpuPass());
+    pm.addNestedPass<AIE::DeviceOp>(AIEX::createAIEDmaToNpuPass());
     ModuleOp copy = moduleOp.clone();
     if (failed(pm.run(copy)))
-      return moduleOp.emitOpError("IPU Instruction pipeline failed");
+      return moduleOp.emitOpError("NPU Instruction pipeline failed");
 
     std::string errorMessage;
-    auto output = openOutputFile(OutputIPU, &errorMessage);
+    auto output = openOutputFile(OutputNPU, &errorMessage);
     if (!output) {
       llvm::errs() << errorMessage << "\n";
       return moduleOp.emitOpError("");
     }
 
-    if (failed(AIE::AIETranslateToIPU(copy, output->os())))
-      return moduleOp.emitOpError("IPU Instruction translation failed");
+    if (failed(AIE::AIETranslateToNPU(copy, output->os())))
+      return moduleOp.emitOpError("NPU Instruction translation failed");
 
     output->keep();
     copy->erase();
diff --git a/tools/aie2xclbin/XCLBinGen.h b/tools/aie2xclbin/XCLBinGen.h
index 25fcb07396..809daa101c 100644
--- a/tools/aie2xclbin/XCLBinGen.h
+++ b/tools/aie2xclbin/XCLBinGen.h
@@ -40,7 +40,7 @@ struct XCLBinGenConfig {
 void findVitis(XCLBinGenConfig &TK);
 
 mlir::LogicalResult aie2xclbin(mlir::MLIRContext *ctx, mlir::ModuleOp moduleOp,
-                               XCLBinGenConfig &TK, llvm::StringRef OutputIPU,
+                               XCLBinGenConfig &TK, llvm::StringRef OutputNPU,
                                llvm::StringRef OutputXCLBin);
 
 } // namespace xilinx
diff --git a/tools/aie2xclbin/aie2xclbin.cpp b/tools/aie2xclbin/aie2xclbin.cpp
index bbd34c5528..7bfe6a2982 100644
--- a/tools/aie2xclbin/aie2xclbin.cpp
+++ b/tools/aie2xclbin/aie2xclbin.cpp
@@ -70,9 +70,9 @@ cl::opt<std::string>
              cl::init(HOST_ARCHITECTURE), cl::cat(AIE2XCLBinCat));
 
 cl::opt<std::string>
-    IPUInstsName("ipu-insts-name",
-                 cl::desc("Output instructions filename for IPU target"),
-                 cl::init("ipu_insts.txt"), cl::cat(AIE2XCLBinCat));
+    NPUInstsName("npu-insts-name",
+                 cl::desc("Output instructions filename for NPU target"),
+                 cl::init("npu_insts.txt"), cl::cat(AIE2XCLBinCat));
 
 cl::opt<bool>
     PrintIRAfterAll("print-ir-after-all",
@@ -207,7 +207,7 @@ int main(int argc, char *argv[]) {
   if (!owning)
     return 1;
 
-  if (failed(aie2xclbin(&ctx, *owning, TK, IPUInstsName.getValue(),
+  if (failed(aie2xclbin(&ctx, *owning, TK, NPUInstsName.getValue(),
                         XCLBinName.getValue())))
     return 1;
 
diff --git a/utils/reset_ipu.sh b/utils/reset_npu.sh
similarity index 93%
rename from utils/reset_ipu.sh
rename to utils/reset_npu.sh
index 6a4f02e647..9a07adb48b 100755
--- a/utils/reset_ipu.sh
+++ b/utils/reset_npu.sh
@@ -12,6 +12,6 @@ if [ x"$NUMBER" != x"" ]; then
 #    /opt/xilinx/xrt/test/example_noop_test /lib/firmware/amdipu/1502/validate.xclbin
 #  fi
 else
-  echo "couldn't find ipu"
+  echo "couldn't find npu"
 fi
 
diff --git a/utils/run_on_ipu.sh b/utils/run_on_npu.sh
similarity index 100%
rename from utils/run_on_ipu.sh
rename to utils/run_on_npu.sh