diff --git a/.gitignore b/.gitignore
index 47aa4037dd..b59e080f32 100644
--- a/.gitignore
+++ b/.gitignore
@@ -5,6 +5,7 @@
/cmakeModules*
.vscode
__pycache__
+.DS_Store
/platforms/vck190_bare/petalinux/build
/platforms/vck190_bare/petalinux/components
diff --git a/README.md b/README.md
index 22f7858093..cbd8d78088 100644
--- a/README.md
+++ b/README.md
@@ -14,7 +14,7 @@
This repository contains an [MLIR-based](https://mlir.llvm.org/) toolchain for AI Engine-enabled devices, such as [AMD Ryzen™ AI](https://www.amd.com/en/products/ryzen-ai) and [Versal™](https://www.xilinx.com/products/technology/ai-engine.html). This repository can be used to generate low-level configurations for the AI Engine portion of these devices. AI Engines are organized as a spatial array of tiles, where each tile contains AI Engine cores and/or memories. The spatial array is connected by stream switches that can be configured to route data between AI Engine tiles scheduled by their programmable Data Movement Accelerators (DMAs). This repository contains MLIR representations, with multiple levels of abstraction, to target AI Engine devices. This enables compilers and developers to program AI Engine cores, as well as describe data movements and array connectivity. A Python API is made available as a convenient interface for generating MLIR design descriptions. Backend code generation is also included, targeting the [aie-rt](https://github.com/Xilinx/aie-rt/tree/main-aie) library. This toolchain uses the AI Engine compiler tool which is part of the AMD Vitis™ software installation: these tools require a free license for use from the [Product Licensing Site](https://www.xilinx.com/member/forms/license-form.html).
-This project is primarily intended to support the open-source community, particularly tool builders, with low-level access to AIE devices and enable the development of a wide variety of programming models from higher level abstractions. As such, although it contains some examples, this project is not intended to represent an end-to-end compilation flow for application design. If you're looking for an out-of-the-box experience for highly efficient machine learning, check out the [AMD Ryzen™ AI Software Platform](https://github.com/amd/RyzenAI-SW/).
+This project is primarily intended to support the open-source community, particularly tool builders, with low-level access to AIE devices and enable the development of a wide variety of programming models from higher-level abstractions. We provide an example programming flow: Interface Representation for hands-ON (IRON) close-to-metal programming of the AIE-array. IRON is an open-access toolkit enabling performance engineers to build fast and efficient, often specialized, designs through a set of Python language bindings around the mlir-aie dialect. As such, while this project contains some examples, it is not intended to represent an end-to-end compilation flow for all application designs. If you're looking for an out-of-the-box experience for highly efficient machine learning, check out the [AMD Ryzen™ AI Software Platform](https://github.com/amd/RyzenAI-SW/).
[Getting Started on a Versal™ board](docs/Building.md)
@@ -24,8 +24,10 @@ This project is primarily intended to support the open-source community, particu
[Getting Started and Running on Linux Ryzen™ AI](docs/buildHostLin.md)
-[Full Documentation](https://xilinx.github.io/mlir-aie/)
+[IRON AIE Application Programming Guide](programming_guide)
+
+[MLIR Dialect and Compiler Documentation](https://xilinx.github.io/mlir-aie/)
-----
-
diff --git a/aie_kernels/aie2/scale.cc b/aie_kernels/aie2/scale.cc
index bf8a430b19..b73ac3ff1f 100755
--- a/aie_kernels/aie2/scale.cc
+++ b/aie_kernels/aie2/scale.cc
@@ -15,6 +15,7 @@
#include <aie_api/aie.hpp>
+// Scalar scale template
template <typename T>
void scale_scalar(T *a, T *c, T factor, const int32_t N) {
event0();
@@ -24,20 +25,43 @@ void scale_scalar(T *a, T *c, T factor, const int32_t N) {
event1();
}
+// Vectorized scale template
-// Assume N is multiple of 16
+// Assume N is multiple of 32
template <typename T>
-void scale_vectorized(T *a, T *c, T factor, const int32_t N) {
- constexpr int vec_factor = 16;
+void scale_vectorized(T *a, T *c, int32_t factor, const int32_t N) {
event0();
+ constexpr int vec_factor = 32;
T *__restrict pA1 = a;
T *__restrict pC1 = c;
const int F = N / vec_factor;
+ T fac = factor;
for (int i = 0; i < F; i++)
chess_prepare_for_pipelining chess_loop_range(16, ) {
      aie::vector<T, vec_factor> A0 = aie::load_v<vec_factor>(pA1);
      pA1 += vec_factor;
+      aie::accum<acc32, vec_factor> cout = aie::mul(A0, fac);
+      aie::store_v(pC1, cout.template to_vector<T>(0));
+ pC1 += vec_factor;
+ }
+ event1();
+}
+
+// Vectorized scale template for int32_t (acc64 used)
+// Assume N is multiple of 32
+template <>
+void scale_vectorized<int32_t>(int32_t *a, int32_t *c, int32_t factor,
+ const int32_t N) {
+ event0();
+ constexpr int vec_factor = 32;
+ int32_t *__restrict pA1 = a;
+ int32_t *__restrict pC1 = c;
+ const int F = N / vec_factor;
+ for (int i = 0; i < F; i++)
+ chess_prepare_for_pipelining chess_loop_range(16, ) {
+      aie::vector<int32_t, vec_factor> A0 = aie::load_v<vec_factor>(pA1);
+      pA1 += vec_factor;
      aie::accum<acc64, vec_factor> cout = aie::mul(A0, factor);
-      aie::store_v(pC1, cout.to_vector<T>(0));
+      aie::store_v(pC1, cout.template to_vector<int32_t>(0));
pC1 += vec_factor;
}
event1();
@@ -45,14 +69,26 @@ void scale_vectorized(T *a, T *c, T factor, const int32_t N) {
extern "C" {
-void vector_scalar_mul_aie(int32_t *a_in, int32_t *c_out, int32_t *factor,
- int32_t N) {
+// 32-bit datatype
+void vector_scalar_mul_int32_scalar(int32_t *a_in, int32_t *c_out,
+ int32_t *factor, int32_t N) {
+ scale_scalar(a_in, c_out, *factor, N);
+}
+
+void vector_scalar_mul_int32_vector(int32_t *a_in, int32_t *c_out,
+ int32_t *factor, int32_t N) {
scale_vectorized(a_in, c_out, *factor, N);
}
-void vector_scalar_mul_aie_scalar(int32_t *a_in, int32_t *c_out,
- int32_t *factor, int32_t N) {
- scale_scalar(a_in, c_out, *factor, N);
+// 16-bit datatype
+void vector_scalar_mul_int16_scalar(int16_t *a_in, int16_t *c_out,
+ int32_t *factor, int32_t N) {
+ scale_scalar(a_in, c_out, *factor, N);
+}
+
+void vector_scalar_mul_int16_vector(int16_t *a_in, int16_t *c_out,
+ int32_t *factor, int32_t N) {
+ scale_vectorized(a_in, c_out, *factor, N);
}
} // extern "C"
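
The int16 kernels accumulate in 32 bits (`acc32`), while the int32 specialization switches to a 64-bit accumulator (`acc64`) so products cannot overflow before truncation back to the element type. A minimal Python model of the int16 path (illustration only; wraparound is assumed here, whereas the hardware's exact rounding/saturation follows the core's SRS configuration):

```python
def scale_ref_int16(a, factor):
    """Reference for vector_scalar_mul_int16_*: multiply in 32 bits, wrap to int16."""
    def wrap_int16(x):
        x &= 0xFFFF
        return x - 0x10000 if x >= 0x8000 else x
    return [wrap_int16(x * factor) for x in a]

assert scale_ref_int16([1, 2, 30000], 3) == [3, 6, 24464]  # 90000 wraps to 24464
```
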
diff --git a/docs/buildHostLin.md b/docs/buildHostLin.md
index 422b48b64e..0bd91487d7 100644
--- a/docs/buildHostLin.md
+++ b/docs/buildHostLin.md
@@ -1,11 +1,30 @@
# Linux Setup and Build Instructions
-These instructions will guide you through everything required for building and executing a program on the Ryzen AI NPU, starting from a fresh bare-bones **Ubuntu 22.04 LTS** install. Only Ubuntu 22.04 LTS is supported. The instructions were tested on a ASUS Vivobook Pro 15.
+These instructions will guide you through everything required for building and executing a program on the Ryzen™ AI NPU, starting from a fresh bare-bones **Ubuntu 22.04 LTS** install. Only Ubuntu 22.04 LTS is supported.
+
+## Initial Setup
+
+#### Update BIOS:
+
+Be sure you have the latest BIOS for your laptop or mini PC; this will ensure the NPU (sometimes referred to as IPU) is enabled in the system. You may need to manually enable the NPU:
+
+   ```Advanced → CPU Configuration → IPU```
+
+> **NOTE:** Some manufacturers only provide Windows executables to update the BIOS; please do this before installing Ubuntu.
+
+#### BIOS Settings:
+1. Turn off Secure Boot (allows unsigned drivers to be installed)
+
+ ```BIOS → Security → Secure boot → Disable```
+
+1. Set AC Power Loss to "Always On" (can be used for PDU reset; turns the computer back on after power loss)
+
+ ```BIOS → Advanced → AMD CBS → FCH Common Options → Ac Power Loss Options → Set Ac Power Loss to "Always On"```
## Overview
You will...
-1. Install a driver for the Ryzen AI. As part of this, you will need to...
+1. Install a driver for the Ryzen™ AI. As part of this, you will need to...
1. [...compile and install a more recent Linux kernel.](#update-linux)
@@ -15,7 +34,7 @@ You will...
1. [...install Xilinx Vitis and obtain a license.](#install-xilinx-vitis-20232-and-other-mlir-aie-prerequisites)
- 1. ...install MLIR-AIE [from precompiled binaries (fast)](#option-a---quick-setup-for-ryzen-ai-application-development) or [from source (slow)](#option-b---build-mlir-aie-tools-from-source-for-development).
+ 1. ...install mlir-aie [from precompiled binaries (fast)](#option-a---quick-setup-for-ryzen-ai-application-development) or [from source (slow)](#option-b---build-mlir-aie-tools-from-source-for-development).
1. Build and execute one of the example designs. This consists of...
@@ -25,7 +44,7 @@ You will...
3. [...building and executing host (x86) code and device (NPU) code.](#build-and-run-host-part)
-> Be advised that two of the steps (Linux compilation and Vitis install) may take hours. If you decide to build MLIR-AIE from source, this will also take a long time as it contains an LLVM build. Allocate enough time and patience. Once done, you will have an amazing toolchain allowing you to harness this great hardware at your hands.
+> Be advised that two of the steps (Linux compilation and Vitis install) may take hours. If you decide to build mlir-aie from source, this will also take a long time as it contains an LLVM build. Allocate enough time and patience. Once done, you will have an amazing toolchain allowing you to harness this great hardware at your hands.
## Prerequisites
@@ -203,7 +222,7 @@ You will...
> [0000:66:00.1] : RyzenAI-Phoenix
> ```
-### Install Xilinx Vitis 2023.2 and Other MLIR-AIE Prerequisites
+### Install Xilinx Vitis 2023.2 and Other mlir-aie Prerequisites
1. Install Vitis from [Xilinx Downloads](https://www.xilinx.com/support/download/index.html/content/xilinx/en/downloadNav/vitis.html). You will need to run the installer as root. We will assume you use the default installation directory, `/tools/Xilinx`.
@@ -225,17 +244,17 @@ You will...
export LM_LICENSE_FILE=/opt/Xilinx.lic
```
-1. Install the following packages needed for building MLIR-AIE:
+1. Install the following packages needed for building mlir-aie:
```
sudo apt install \
build-essential clang clang-14 lld lld-14 cmake python3-venv python3-pip libxrender1 libxtst6 libxi6
```
-1. Choose *one* of the two options (A or B) below for installing MLIR-AIE.
+1. Choose *one* of the two options (A or B) below for installing mlir-aie.
-### Option A - Quick Setup for Ryzen AI Application Development
+### Option A - Quick Setup for Ryzen™ AI Application Development
-1. Clone [the MLIR-AIE repository](https://github.com/Xilinx/mlir-aie.git), best under /home/username for speed (yourPathToBuildMLIR-AIE):
+1. Clone [the mlir-aie repository](https://github.com/Xilinx/mlir-aie.git), best under /home/username for speed (yourPathToBuildMLIR-AIE):
```
git clone https://github.com/Xilinx/mlir-aie.git
cd mlir-aie
@@ -246,7 +265,7 @@ You will...
1. Jump ahead to [Build Device AIE Part](#build-device-aie-part) step 2 below.
-### Option B - Build MLIR-AIE Tools from Source for Development
+### Option B - Build mlir-aie Tools from Source for Development
1. Clone [https://github.com/Xilinx/mlir-aie.git](https://github.com/Xilinx/mlir-aie.git) best under /home/username for speed (yourPathToBuildMLIR-AIE), with submodules:
```
@@ -275,7 +294,7 @@ source ${MLIR_AIE_BUILD_DIR}/ironenv/bin/activate
source ${MLIR_AIE_BUILD_DIR}/utils/env_setup.sh ${MLIR_AIE_BUILD_DIR}/my_install/mlir_aie ${MLIR_AIE_BUILD_DIR}/my_install/mlir
```
-> Replace `${MLIR_AIE_BUILD_DIR}` with the directory in which you *built* MLIR-AIE above. Replace `${NEW_CMAKE_DIR}` with the directory in which you installed CMake 3.28 above. Instead of search and replace, you can also define these values as environment variables.
+> Replace `${MLIR_AIE_BUILD_DIR}` with the directory in which you *built* mlir-aie above. Replace `${NEW_CMAKE_DIR}` with the directory in which you installed CMake 3.28 above. Instead of search and replace, you can also define these values as environment variables.
> For quick setup, this step is only needed if you are starting with a new terminal. If you are continuing in the same terminal you used to install the prerequisites, the environment variables should all be set.
@@ -289,7 +308,7 @@ source /opt/xilinx/xrt/setup.sh
source ${MLIR_AIE_BUILD_DIR}/utils/env_setup.sh ${MLIR_AIE_BUILD_DIR}/install ${MLIR_AIE_BUILD_DIR}/llvm/install
```
-> Replace `${MLIR_AIE_BUILD_DIR}` with the directory in which you *built* MLIR-AIE above. Instead of search and replace, you can also define `MLIR_AIE_BUILD_DIR` as an environment variable.
+> Replace `${MLIR_AIE_BUILD_DIR}` with the directory in which you *built* mlir-aie above. Instead of search and replace, you can also define `MLIR_AIE_BUILD_DIR` as an environment variable.
## Build a Design
@@ -297,7 +316,7 @@ For your design of interest, for instance [vector_add](../programming_examples/b
### Build Device AIE Part
-1. Prepare your enviroment with the MLIR-AIE tools (built during prerequisites part of this guide) - see **"Setting Up Your Environment"** avove.
+1. Prepare your environment with the mlir-aie tools (built during the prerequisites part of this guide) - see **"Setting Up Your Environment"** above.
2. Go to the design of interest and run `make`
diff --git a/docs/buildHostWin.md b/docs/buildHostWin.md
index d6daf03704..f51a8014a2 100644
--- a/docs/buildHostWin.md
+++ b/docs/buildHostWin.md
@@ -1,14 +1,34 @@
# Windows Setup and Build Instructions
-These instructions will guide you through everything required for building and executing a program on the Ryzen AI NPU on Windows. The instructions were tested on a ASUS Vivobook Pro 15.
+These instructions will guide you through everything required for building and executing a program on the Ryzen™ AI NPU on Windows. The instructions were tested on an ASUS Vivobook Pro 15.
You will set up a Windows Subsystem for Linux (WSL) Ubuntu install, which will be used for building NPU device code. For building the host (x86) code, you will use MS Visual Studio Community.
-- Rely on WSL Ubuntu 22.04 LTS for Vitis tool install and to build and run our MLIR-AIE tools
+- Rely on WSL Ubuntu 22.04 LTS for Vitis tool install and to build and run our mlir-aie tools
- Rely on MS Visual Studio 17 2022 to natively build the host code (aka test.cpp)
+## Initial Setup
+
+#### Update BIOS:
+
+Be sure you have the latest BIOS for your laptop or mini PC; this will ensure the NPU (sometimes referred to as IPU) is enabled in the system. You may need to manually enable the NPU:
+
+ ```Advanced → CPU Configuration → IPU```
+
+> **NOTE:** Some manufacturers only provide Windows executables to update the BIOS.
+
+#### BIOS Settings:
+1. Turn off Secure Boot (allows unsigned drivers to be installed)
+
+ ```BIOS → Security → Secure boot → Disable```
+
+1. Set AC Power Loss to "Always On" (can be used for PDU reset; turns the computer back on after power loss)
+
+ ```BIOS → Advanced → AMD CBS → FCH Common Options → Ac Power Loss Options → Set Ac Power Loss to "Always On"```
+
+
## Prerequisites
-### MLIR-AIE tools: WSL Ubuntu 22.04
+### mlir-aie tools: WSL Ubuntu 22.04
All steps in WSL Ubuntu terminal.
1. Clone [https://github.com/Xilinx/mlir-aie.git](https://github.com/Xilinx/mlir-aie.git) best under /home/username for speed (yourPathToBuildMLIR-AIE), with submodules:
```
@@ -44,7 +64,7 @@ All steps in WSL Ubuntu terminal.
ip link set vmnic0 addr
```
-1. Install or Build MLIR-AIE tools under WSL2:
+1. Install or Build mlir-aie tools under WSL2:
* Use quick setup script to install from whls:
```
@@ -56,7 +76,7 @@ All steps in WSL Ubuntu terminal.
* [Optional] Build from source following regular get started instructions [https://xilinx.github.io/mlir-aie/Building.html](https://xilinx.github.io/mlir-aie/Building.html)
-1. After installing the updated RyzenAI driver (see next subsection), use the gendef tool (from the mingw-w64-tools package) to create a .def file with the symbols:
+1. After installing the updated Ryzen™ AI driver (see next subsection), use the gendef tool (from the mingw-w64-tools package) to create a .def file with the symbols:
```
mkdir /mnt/c/Technical/xrtNPUfromDLL; cd /mnt/c/Technical/xrtNPUfromDLL
cp /mnt/c/Windows/System32/AMD/xrt_coreutil.dll .
@@ -83,7 +103,7 @@ All steps in Win11 (powershell where needed).
```
lib /def:xrt_coreutil.def /machine:x64 /out:xrt_coreutil.lib
```
-1. Clone [https://github.com/Xilinx/mlir-aie.git]([https://gitenterprise.xilinx.com/XRLabs/pynqMLIR-AIE](https://github.com/Xilinx/mlir-aie.git)) for instance under C:\Technical to be used to build designs (yourPathToDesignsWithMLIR-AIE)
+1. Clone [https://github.com/Xilinx/mlir-aie.git](https://github.com/Xilinx/mlir-aie.git) for instance under C:\Technical to be used to build designs (yourPathToDesignsWithMLIR-AIE)
## Set up your environment
@@ -91,7 +111,7 @@ To make the compilation toolchain available for use in your WSL terminal, you wi
### `setup.sh` - Option A - Using Quick Setup
-If you used the quick setup script (precompiled MLIR-AIE binaries), use this setup script.
+If you used the quick setup script (precompiled mlir-aie binaries), use this setup script.
```
# NOTE: if you did NOT exit the terminal you can skip this step.
diff --git a/docs/conferenceDescriptions/asplos24TutorialDescription.md b/docs/conferenceDescriptions/asplos24TutorialDescription.md
new file mode 100644
index 0000000000..b7e1d85a79
--- /dev/null
+++ b/docs/conferenceDescriptions/asplos24TutorialDescription.md
@@ -0,0 +1,51 @@
+# ASPLOS'24 Tutorial: Leveraging MLIR to Design for AI Engines on Ryzen AI
+
+## Introduction
+
+The AI Engine array in the NPU of the AMD Ryzen AI device includes a set of VLIW vector processors with adaptable interconnect. This tutorial is targeted at performance engineers and tool developers who are looking for fast and completely open source design tools to support their research. Participants will first get insight into the AI Engine compute and data movement capabilities. Through small design examples expressed in the MLIR-AIE python language bindings and executed on a Ryzen AI device, participants will leverage AI Engine features for optimizing performance of increasingly complex designs. The labs will be done on Ryzen AI-enabled miniPCs, giving participants the ability to execute their own designs on real hardware.
+
+
+This tutorial will cover the following key topics:
+1. AI Engine architecture introduction
+1. AIE core, array configuration and host application code compilation
+1. Data movement and communication abstraction layers
+1. Tracing for performance monitoring
+1. Putting it all together on larger examples: matrix multiplication, convolutions as building blocks for ML and computer vision examples
+
+## Agenda
+
+Date: Saturday April 27th 2024 (morning)
+Location: Hilton La Jolla Torrey Pines, San Diego, California (with ASPLOS’24)
+Prerequisite: please bring your laptop, so that you can ssh into our Ryzen AI enabled miniPCs for the hands-on exercises.
+
+### Contents and Timeline (tentative)
+
+| Time | Topic | Presenter | Slides or Code |
+|------|-------|-----------|----------------|
+| 08:30am | Intro to spatial compute and explicit data movement | Kristof | [Programming Guide](../../programming_guide/) |
+| 08:45am | "Hello World" from Ryzen AI | Joe | [AI Engine Basic Building Blocks](../../programming_guide/section-1/) |
+| 09:00am | Data movement on Ryzen AI with objectFIFOs | Joe | [Data Movement](../../programming_guide/section-2/) |
+| 09:30am | Your First Program | Kristof | [My First Program](../../programming_guide/section-3) |
+| 09:50am | Exercise 1: Build and run your first program | All | [Passthrough](../../programming_examples/basic/passthrough_kernel/) |
+| 10:00am | Break | | |
+| 10:30am | Exercise 2: Vector-Scalar Mul | All | [Vector Scalar Mul](../../programming_examples/basic/vector_scalar_mul/) |
+| 10:40am | Tracing and performance analysis | Jack | [Timers](../../programming_guide/section-4/section-4a/) and [Tracing](../../programming_guide/section-4/section-4b/) |
+| 11:10am | Exercise 3: Tracing vector-scalar | All | [Vector Scalar Mul](../../programming_examples/basic/vector_scalar_mul/) |
+| 11:30am | Vectorizing on AIE | Jack | [Kernel Vectorization](../../programming_guide/section-4/section-4c/) |
+| 11:40am | Exercise 4: Vectorized vector-scalar | All | [Vector Scalar Mul](../../programming_examples/basic/vector_scalar_mul/) |
+| 12:00pm | Dataflow and larger designs | Joe | [Example Vector Designs](../../programming_guide/section-5/) and [Large Example Designs](../../programming_guide/section-6/) |
+| 12:15pm | Exercises | All | [Programming Examples](../../programming_examples/) |
+| 12:30pm | Close Tutorial | All | |
+
+
+## Organizers
+
+*Jack Lo* is a Senior Member of Technical Staff in AMD’s Research and Advanced Development group. At AMD, he is focused on developing tool frameworks and optimizing applications for current and future AMD devices, particularly in the area of adaptive computing and AI processing.
+
+*Joseph Melber* is a Senior Member of Technical Staff in AMD’s Research and Advanced Development group. At AMD, he is working on hardware architectures and compiler technologies for current and future AMD devices. He received a BS in electrical engineering from the University at Buffalo, as well as MS and PhD degrees from the electrical and computer engineering department at Carnegie Mellon University. His research interests include runtime systems, compiler abstractions for data movement, and hardware prototypes for future adaptive heterogeneous computing architectures.
+
+*Kristof Denolf* is a Fellow in AMD's Research and Advanced Development group, where he is working on energy-efficient computer vision and video processing applications to shape future AMD devices. He earned an M.Eng. in electronics from the Katholieke Hogeschool Brugge-Oostende (1998), now part of KULeuven, an M.Sc. in electronic system design from Leeds Beckett University (2000) and a Ph.D. from the Technical University Eindhoven (2007). He has over 25 years of combined research and industry experience at IMEC, Philips, Barco, Apple, Xilinx and AMD. His main research interests are all aspects of the cost-efficient and dataflow-oriented design of video, vision and graphics systems.
+
+*Phil James-Roxby* is a Senior Fellow in AMD’s Research and Advanced Development group, working on compilers and runtimes to support current and future AMD devices, particularly in the domain of AI processing. In the past, he has been responsible for a number of software enablement activities for hardware devices, including SDNet and SDAccel at Xilinx, and the original development environment for the AI Engines. He holds a PhD from the University of Manchester on hardware acceleration of embedded machine learning applications, and his main research interest continues to be how to enable users to efficiently use diverse hardware in heterogeneous systems.
+
+*Samuel Bayliss* is a Fellow in the Research and Advanced Development group at AMD. His academic experience includes formative study at Imperial College London, for which he earned MEng and PhD degrees in 2006 and 2012 respectively. He is energized by his current work in advancing compiler tooling using MLIR, developing programming abstractions for parallel compute and evolving hardware architectures for efficient machine learning.
\ No newline at end of file
diff --git a/programming_examples/basic/dma_transpose/CMakeLists.txt b/programming_examples/basic/dma_transpose/CMakeLists.txt
new file mode 100644
index 0000000000..c17d3d365b
--- /dev/null
+++ b/programming_examples/basic/dma_transpose/CMakeLists.txt
@@ -0,0 +1,75 @@
+# This file is licensed under the Apache License v2.0 with LLVM Exceptions.
+# See https://llvm.org/LICENSE.txt for license information.
+# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+#
+# (c) Copyright 2023 Advanced Micro Devices, Inc.
+
+# parameters
+# -DBOOST_ROOT: Path to Boost install
+# -DXRT_INC_DIR: Full path to src/runtime_src/core/include in XRT cloned repo
+# -DXRT_LIB_DIR: Path to xrt_coreutil.lib
+# -DTARGET_NAME: Target name to be built
+
+# cmake needs this line
+cmake_minimum_required(VERSION 3.1)
+
+set(CMAKE_CXX_STANDARD 23)
+set(CMAKE_CXX_STANDARD_REQUIRED YES)
+
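+# Detect WSL: powershell.exe is reachable from the Linux PATH under WSL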
+find_program(WSL NAMES powershell.exe)
+
+if (NOT WSL)
+ set(CMAKE_C_COMPILER gcc-13)
+ set(CMAKE_CXX_COMPILER g++-13)
+ set(BOOST_ROOT /usr/include/boost CACHE STRING "Path to Boost install")
+ set(XRT_INC_DIR /opt/xilinx/xrt/include CACHE STRING "Path to XRT cloned repo")
+ set(XRT_LIB_DIR /opt/xilinx/xrt/lib CACHE STRING "Path to xrt_coreutil.lib")
+else()
+ set(BOOST_ROOT C:/Technical/thirdParty/boost_1_83_0 CACHE STRING "Path to Boost install")
+ set(XRT_INC_DIR C:/Technical/XRT/src/runtime_src/core/include CACHE STRING "Path to XRT cloned repo")
+ set(XRT_LIB_DIR C:/Technical/xrtNPUfromDLL CACHE STRING "Path to xrt_coreutil.lib")
+endif()
+
+set(TARGET_NAME test CACHE STRING "Target to be built")
+
+SET (ProjectName proj_${TARGET_NAME})
+SET (currentTarget ${TARGET_NAME})
+
+if ( WSL )
+ set(CMAKE_RUNTIME_OUTPUT_DIRECTORY_RELEASE ${CMAKE_BINARY_DIR})
+endif ()
+
+project(${ProjectName})
+
+# Find packages
+find_package(Boost REQUIRED)
+
+add_executable(${currentTarget}
+ ${CMAKE_CURRENT_SOURCE_DIR}/../../../runtime_lib/test_lib/test_utils.cpp
+ test.cpp
+)
+
+target_compile_definitions(${currentTarget} PUBLIC DISABLE_ABI_CHECK=1)
+
+target_include_directories (${currentTarget} PUBLIC
+ ${XRT_INC_DIR}
+ ${Boost_INCLUDE_DIRS}
+ ${CMAKE_CURRENT_SOURCE_DIR}/../../../runtime_lib/test_lib
+)
+
+target_link_directories(${currentTarget} PUBLIC
+ ${XRT_LIB_DIR}
+ ${Boost_LIBRARY_DIRS}
+)
+
+if (NOT WSL)
+ target_link_libraries(${currentTarget} PUBLIC
+ xrt_coreutil
+ boost_program_options
+ boost_filesystem
+ )
+else()
+ target_link_libraries(${currentTarget} PUBLIC
+ xrt_coreutil
+ )
+endif()
diff --git a/programming_examples/basic/dma_transpose/Makefile b/programming_examples/basic/dma_transpose/Makefile
new file mode 100644
index 0000000000..39274a8a48
--- /dev/null
+++ b/programming_examples/basic/dma_transpose/Makefile
@@ -0,0 +1,51 @@
+##===- Makefile -----------------------------------------------------------===##
+#
+# This file licensed under the Apache License v2.0 with LLVM Exceptions.
+# See https://llvm.org/LICENSE.txt for license information.
+# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+#
+##===----------------------------------------------------------------------===##
+
+include ../../makefile-common
+
+SHELL := /bin/bash
+
+all: build/final.xclbin build/insts.txt
+
+targetname = dmaTranspose
+M ?= 64
+K ?= 32
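+# input matrix is M rows x K columns (row-major); N = M * K elements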
+
+build/aie.mlir: aie2.py
+ mkdir -p ${@D}
+ python3 $< ${M} ${K} > $@
+
+.PHONY: inst/insts.txt
+inst/insts.txt: aie2.py
+ rm -rf inst
+ mkdir -p inst
+	python3 $< ${M} ${K} > inst/aie.mlir
+ pushd inst && aiecc.py --aie-only-generate-npu --npu-insts-name=insts.txt aie.mlir && popd
+	${powershell} ./build/${targetname}.exe -x build/final.xclbin -i inst/insts.txt -k MLIR_AIE --M ${M} --K ${K}
+
+build/final.xclbin: build/aie.mlir
+ mkdir -p ${@D}
+ cd ${@D} && aiecc.py --aie-generate-cdo --no-compile-host --xclbin-name=${@F} \
+ --aie-generate-npu --npu-insts-name=insts.txt $(<:%=../%)
+
+${targetname}.exe: test.cpp
+ rm -rf _build
+ mkdir -p _build
+ cd _build && ${powershell} cmake .. -DTARGET_NAME=${targetname}
+ cd _build && ${powershell} cmake --build . --config Release
+ifeq "${powershell}" "powershell.exe"
+ cp _build/${targetname}.exe $@
+else
+ cp _build/${targetname} $@
+endif
+
+run: ${targetname}.exe build/final.xclbin build/insts.txt
+ ${powershell} ./$< -x build/final.xclbin -i build/insts.txt -k MLIR_AIE --M ${M} --K ${K}
+
+clean:
+ rm -rf build _build inst ${targetname}.exe
diff --git a/programming_examples/basic/dma_transpose/README.md b/programming_examples/basic/dma_transpose/README.md
new file mode 100644
index 0000000000..32dd7ac3d3
--- /dev/null
+++ b/programming_examples/basic/dma_transpose/README.md
@@ -0,0 +1,25 @@
+<!---//===- README.md --------------------------*- Markdown -*-===//
+//
+// This file is licensed under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+// Copyright (C) 2024, Advanced Micro Devices, Inc.
+//
+//===----------------------------------------------------------------------===//-->
+
+# 2-D Array Transpose using AIE DMAs
+
+This reference design can be run on a Ryzen™ AI NPU.
+
+In the [design](./aie2.py), a 2-D array in row-major layout is read from external memory to `ComputeTile2` with a transposed layout,
+using an implicit copy via the compute tile's Data Movement Accelerator (DMA). The data is read from and written to external memory through the Shim tile (`col`, 0).
+
+The implicit copy is performed using the `object_fifo_link` operation that specifies how input data arriving via `of_in` should be sent further via `of_out` by specifically leveraging the compute tile's DMA. This operation and its functionality are described in more depth in [Section-2b](../../../programming_guide/section-2/section-2b/README.md/#object-fifo-link) of the programming guide.
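+
+For reference, the transpose performed by this strided DMA corresponds to the following minimal Python sketch (an illustrative model of the access pattern, not code used by the design):
+
+```python
+def transpose_ref(src, M, K):
+    # src holds a row-major M x K matrix; the result is its K x M transpose
+    return [src[i * K + j] for j in range(K) for i in range(M)]
+```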
+
+
+To compile and run the design for NPU:
+```
+make
+make run
+```
\ No newline at end of file
diff --git a/programming_examples/basic/dma_transpose/aie2.py b/programming_examples/basic/dma_transpose/aie2.py
new file mode 100755
index 0000000000..5ebc7c27bf
--- /dev/null
+++ b/programming_examples/basic/dma_transpose/aie2.py
@@ -0,0 +1,66 @@
+#
+# This file is licensed under the Apache License v2.0 with LLVM Exceptions.
+# See https://llvm.org/LICENSE.txt for license information.
+# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+#
+# (c) Copyright 2023 AMD Inc.
+
+import sys
+
+from aie.dialects.aie import *
+from aie.dialects.aiex import *
+from aie.dialects.scf import *
+from aie.extras.dialects.ext import memref, arith
+from aie.extras.context import mlir_mod_ctx
+
+N = 4096
+M = 64
+K = 64
+
+if len(sys.argv) == 3:
+ M = int(sys.argv[1])
+ K = int(sys.argv[2])
+ N = M * K
+
+
+def my_passthrough():
+ with mlir_mod_ctx() as ctx:
+
+ @device(AIEDevice.npu)
+ def device_body():
+ memRef_ty = T.memref(M, K, T.i32())
+
+ # Tile declarations
+ ShimTile = tile(0, 0)
+ ComputeTile2 = tile(0, 2)
+
+ # AIE-array data movement with object fifos
+ of_in = object_fifo("in", ShimTile, ComputeTile2, 2, memRef_ty)
+ of_out = object_fifo("out", ComputeTile2, ShimTile, 2, memRef_ty)
+ object_fifo_link(of_in, of_out)
+
+ # Set up compute tiles
+
+ # Compute tile 2
+ @core(ComputeTile2)
+ def core_body():
+ for _ in for_(sys.maxsize):
+ yield_([])
+
+ # To/from AIE-array data movement
+ tensor_ty = T.memref(N, T.i32())
+
+ @FuncOp.from_py_func(tensor_ty, tensor_ty, tensor_ty)
+ def sequence(A, B, C):
+ npu_dma_memcpy_nd(metadata="out", bd_id=0, mem=C, sizes=[1, 1, 1, N])
+ # The strides below are configured to read across all rows in the same column
+ # Stride of K in dim/wrap 2 skips an entire row to read a full column
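+            # equivalently: for j in range(K): for i in range(M): read A[i * K + j]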
+ npu_dma_memcpy_nd(
+ metadata="in", bd_id=1, mem=A, sizes=[1, K, M, 1], strides=[1, 1, K]
+ )
+ npu_sync(column=0, row=0, direction=0, channel=0)
+
+ print(ctx.module)
+
+
+my_passthrough()
diff --git a/programming_examples/basic/dma_transpose/run.lit b/programming_examples/basic/dma_transpose/run.lit
new file mode 100644
index 0000000000..d513e9bca8
--- /dev/null
+++ b/programming_examples/basic/dma_transpose/run.lit
@@ -0,0 +1,10 @@
+// (c) Copyright 2023 Advanced Micro Devices, Inc.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+// REQUIRES: ryzen_ai, chess
+//
+// RUN: %python %S/aie2.py 64 32 > ./aie.mlir
+// RUN: %python aiecc.py --aie-generate-cdo --aie-generate-npu --no-compile-host --xclbin-name=aie.xclbin --npu-insts-name=insts.txt ./aie.mlir
+// RUN: clang %S/test.cpp -o test.exe -std=c++11 -Wall %xrt_flags -lrt -lstdc++ -lboost_program_options -lboost_filesystem
+// RUN: %run_on_npu ./test.exe -x aie.xclbin -k MLIR_AIE -i insts.txt --M 64 --K 32 | FileCheck %s
+// CHECK: PASS!
diff --git a/programming_examples/basic/dma_transpose/test.cpp b/programming_examples/basic/dma_transpose/test.cpp
new file mode 100644
index 0000000000..fa9a918669
--- /dev/null
+++ b/programming_examples/basic/dma_transpose/test.cpp
@@ -0,0 +1,214 @@
+//===- test.cpp -------------------------------------------*- C++ -*-===//
+//
+// This file is licensed under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+// Copyright (C) 2023, Advanced Micro Devices, Inc.
+//
+//===----------------------------------------------------------------------===//
+
+#include <algorithm>
+#include <boost/program_options.hpp>
+#include <cstdint>
+#include <cstring>
+#include <fstream>
+#include <iostream>
+#include <sstream>
+#include <vector>
+
+#include "xrt/xrt_bo.h"
+#include "xrt/xrt_device.h"
+#include "xrt/xrt_kernel.h"
+
+namespace po = boost::program_options;
+
+void check_arg_file_exists(po::variables_map &vm_in, std::string name) {
+ if (!vm_in.count(name)) {
+ throw std::runtime_error("Error: no " + name + " file was provided\n");
+ } else {
+    std::ifstream test(vm_in[name].as<std::string>());
+    if (!test) {
+      throw std::runtime_error("The " + name + " file " +
+                               vm_in[name].as<std::string>() +
+                               " does not exist.\n");
+ }
+ }
+}
+
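+// Parse the NPU instruction stream from a text file: one 32-bit hex word per line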
+std::vector<uint32_t> load_instr_sequence(std::string instr_path) {
+  std::ifstream instr_file(instr_path);
+  std::string line;
+  std::vector<uint32_t> instr_v;
+ while (std::getline(instr_file, line)) {
+ std::istringstream iss(line);
+ uint32_t a;
+ if (!(iss >> std::hex >> a)) {
+ throw std::runtime_error("Unable to parse instruction file\n");
+ }
+ instr_v.push_back(a);
+ }
+ return instr_v;
+}
+
+int main(int argc, const char *argv[]) {
+ // Program arguments parsing
+ po::options_description desc("Allowed options");
+
+  desc.add_options()("help,h", "produce help message")(
+      "xclbin,x", po::value<std::string>()->required(),
+      "the input xclbin path")(
+      "kernel,k", po::value<std::string>()->required(),
+      "the kernel name in the XCLBIN (for instance PP_PRE_FD)")(
+      "verbosity,v", po::value<int>()->default_value(0),
+      "the verbosity of the output")(
+      "instr,i", po::value<std::string>()->required(),
+      "path of file containing userspace instructions to be sent to the LX6")(
+      "M", po::value<int>()->default_value(64),
+      "M, number of rows in the input matrix")(
+      "K", po::value<int>()->default_value(64),
+      "K, number of columns in the input matrix");
+
+ po::variables_map vm;
+
+ try {
+ po::store(po::parse_command_line(argc, argv, desc), vm);
+ po::notify(vm);
+
+ if (vm.count("help")) {
+ std::cout << desc << std::endl;
+ return 1;
+ }
+ } catch (const std::exception &ex) {
+ std::cerr << ex.what() << "\n\n";
+ std::cerr << "Usage:\n" << desc << std::endl;
+ return 1;
+ }
+
+ check_arg_file_exists(vm, "xclbin");
+ check_arg_file_exists(vm, "instr");
+
+  std::vector<uint32_t> instr_v =
+      load_instr_sequence(vm["instr"].as<std::string>());
+
+  int verbosity = vm["verbosity"].as<int>();
+ if (verbosity >= 1)
+ std::cout << "Sequence instr count: " << instr_v.size() << std::endl;
+
+  uint32_t M = vm["M"].as<int>();
+  uint32_t K = vm["K"].as<int>();
+ uint32_t N = M * K;
+
+ if ((N % 1024)) {
+ std::cerr
+ << "Length (M * K) must be a multiple of 1024. Change M and K inputs"
+ << std::endl;
+ return 1;
+ }
+
+ // Start the XRT test code
+ // Get a device handle
+ unsigned int device_index = 0;
+ auto device = xrt::device(device_index);
+
+ // Load the xclbin
+ if (verbosity >= 1)
+    std::cout << "Loading xclbin: " << vm["xclbin"].as<std::string>()
+              << std::endl;
+  auto xclbin = xrt::xclbin(vm["xclbin"].as<std::string>());
+
+ if (verbosity >= 1)
+    std::cout << "Kernel opcode: " << vm["kernel"].as<std::string>()
+              << std::endl;
+  std::string Node = vm["kernel"].as<std::string>();
+
+ // Get the kernel from the xclbin
+ auto xkernels = xclbin.get_kernels();
+ auto xkernel = *std::find_if(xkernels.begin(), xkernels.end(),
+ [Node](xrt::xclbin::kernel &k) {
+ auto name = k.get_name();
+ std::cout << "Name: " << name << std::endl;
+ return name.rfind(Node, 0) == 0;
+ });
+ auto kernelName = xkernel.get_name();
+
+ if (verbosity >= 1)
+    std::cout << "Registering xclbin: " << vm["xclbin"].as<std::string>()
+ << "\n";
+
+ device.register_xclbin(xclbin);
+
+ // get a hardware context
+ if (verbosity >= 1)
+ std::cout << "Getting hardware context." << std::endl;
+ xrt::hw_context context(device, xclbin.get_uuid());
+
+ // get a kernel handle
+ if (verbosity >= 1)
+ std::cout << "Getting handle to kernel:" << kernelName << std::endl;
+ auto kernel = xrt::kernel(context, kernelName);
+
+ auto bo_instr = xrt::bo(device, instr_v.size() * sizeof(int),
+ XCL_BO_FLAGS_CACHEABLE, kernel.group_id(0));
+ auto bo_inA = xrt::bo(device, N * sizeof(int32_t), XRT_BO_FLAGS_HOST_ONLY,
+ kernel.group_id(2));
+ auto bo_inB = xrt::bo(device, N * sizeof(int32_t), XRT_BO_FLAGS_HOST_ONLY,
+ kernel.group_id(3));
+ auto bo_out = xrt::bo(device, N * sizeof(int32_t), XRT_BO_FLAGS_HOST_ONLY,
+ kernel.group_id(4));
+
+ if (verbosity >= 1)
+ std::cout << "Writing data into buffer objects." << std::endl;
+
+  int32_t *bufInA = bo_inA.map<int32_t>();
+  std::vector<uint32_t> srcVecA;
+ for (int i = 0; i < N; i++)
+ srcVecA.push_back(i + 1);
+ memcpy(bufInA, srcVecA.data(), (srcVecA.size() * sizeof(uint32_t)));
+
+  void *bufInstr = bo_instr.map<void *>();
+ memcpy(bufInstr, instr_v.data(), instr_v.size() * sizeof(int));
+
+ bo_instr.sync(XCL_BO_SYNC_BO_TO_DEVICE);
+ bo_inA.sync(XCL_BO_SYNC_BO_TO_DEVICE);
+
+ if (verbosity >= 1)
+ std::cout << "Running Kernel." << std::endl;
+ auto run = kernel(bo_instr, instr_v.size(), bo_inA, bo_inB, bo_out);
+ run.wait();
+
+ bo_out.sync(XCL_BO_SYNC_BO_FROM_DEVICE);
+
+  uint32_t *bufOut = bo_out.map<uint32_t>();
+ int errors = 0;
+
+  std::vector<uint32_t> refVecA(N);
+
+ // Doing a transpose on the source vector to produce a ref vector
+ for (uint32_t i = 0; i < M; i++) {
+ for (uint32_t j = 0; j < K; j++) {
+ uint32_t src_index = i * K + j;
+ uint32_t dst_index = j * M + i;
+ refVecA[dst_index] = srcVecA[src_index];
+ }
+ }
+
+ for (uint32_t i = 0; i < N; i++) {
+ uint32_t ref = refVecA[i];
+ if (*(bufOut + i) != ref) {
+ std::cout << "ref = " << ref << " result = " << *(bufOut + i) << "\n";
+ errors++;
+ }
+ }
+
+ if (!errors) {
+ std::cout << std::endl << "PASS!" << std::endl << std::endl;
+ return 0;
+ } else {
+ std::cout << std::endl
+ << errors << " mismatches." << std::endl
+ << std::endl;
+ std::cout << std::endl << "fail." << std::endl << std::endl;
+ return 1;
+ }
+}
diff --git a/programming_examples/basic/matrix_multiplication/makefile-common b/programming_examples/basic/matrix_multiplication/makefile-common
index 6149657e1b..a57dff389c 100644
--- a/programming_examples/basic/matrix_multiplication/makefile-common
+++ b/programming_examples/basic/matrix_multiplication/makefile-common
@@ -38,6 +38,8 @@ M?=512
K?=512
N?=512
+trace_size=16384
+
mlir_target?=build/aie_${M}x${K}x${N}.mlir
xclbin_target?=build/final_${M}x${K}x${N}.xclbin
insts_target?=build/insts_${M}x${K}x${N}.txt
@@ -83,14 +85,19 @@ run: ${targetname}.exe ${xclbin_target} ${insts_target} #sign
export XRT_HACK_UNSECURE_LOADING_XCLBIN=1 && \
${powershell} ./$< -x ${xclbin_target} -i ${insts_target} -k MLIR_AIE -M $M -K $K -N $N ${runargs}
-.PHONY: clean
-clean:
- rm -rf build _build ${targetname}.exe
+trace: ${targetname}.exe ${xclbin_target} ${insts_target} # sign
+ export XRT_HACK_UNSECURE_LOADING_XCLBIN=1 && \
+ ${powershell} ./$< -x ${xclbin_target} -i ${insts_target} -k MLIR_AIE -M $M -K $K -N $N -v 1 --warmup 0 --iters 1 -t ${trace_size}
+ ../../../utils/parse_trace.py --filename trace.txt --mlir ${mlir_target} --colshift 1 > parse_trace_mm.json
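+# trace.txt is written by the traced run above; parse_trace.py converts it to a JSON trace (viewable in chrome://tracing or Perfetto)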
.PHONY: parse_trace
parse_trace:
- ../../../utils/parse_eventIR.py --filename trace.txt --mlir ./build/aie.mlir --colshift 1 > trace_eventIR.json
+ ../../../utils/parse_trace.py --filename trace.txt --mlir ${mlir_target} --colshift 1 > parse_trace_mm.json
+
+.PHONY: clean
+clean: clean_trace
+ rm -rf build _build ${targetname}.exe
.PHONY: clean_trace
clean_trace:
- rm -rf tmpTrace trace_eventIR.json
+ rm -rf tmpTrace parse*.json
diff --git a/programming_examples/basic/matrix_multiplication/whole_array/run.lit b/programming_examples/basic/matrix_multiplication/whole_array/run.lit
index fc23355630..0a15bd164e 100644
--- a/programming_examples/basic/matrix_multiplication/whole_array/run.lit
+++ b/programming_examples/basic/matrix_multiplication/whole_array/run.lit
@@ -9,4 +9,3 @@
// RUN: g++-13 %S/test.cpp -o test.exe -std=c++23 -Wall %xrt_flags -lrt -lstdc++ -lboost_program_options -lboost_filesystem
// RUN: %run_on_npu ./test.exe -x aie.xclbin -k MLIR_AIE -i insts.txt -v 1 -M 512 -K 512 -N 512 | FileCheck %s
// CHECK: PASS!
-
diff --git a/programming_examples/basic/passthrough_kernel/CMakeLists.txt b/programming_examples/basic/passthrough_kernel/CMakeLists.txt
index fddc513396..52431b80ea 100644
--- a/programming_examples/basic/passthrough_kernel/CMakeLists.txt
+++ b/programming_examples/basic/passthrough_kernel/CMakeLists.txt
@@ -41,7 +41,7 @@ project(${ProjectName})
find_package(Boost REQUIRED)
add_executable(${currentTarget}
-${CMAKE_CURRENT_SOURCE_DIR}/../../../runtime_lib/test_lib/test_utils.cpp
+ ${CMAKE_CURRENT_SOURCE_DIR}/../../../runtime_lib/test_lib/test_utils.cpp
test.cpp
)
diff --git a/programming_examples/basic/passthrough_kernel/aie2.py b/programming_examples/basic/passthrough_kernel/aie2.py
index 5b187a7d94..ddde187360 100644
--- a/programming_examples/basic/passthrough_kernel/aie2.py
+++ b/programming_examples/basic/passthrough_kernel/aie2.py
@@ -69,8 +69,6 @@ def core_body():
tensorSizeInInt32s = tensorSize // 4
tensor_ty = T.memref(lineWidthInInt32s, T.i32())
- compute_tile2_col, compute_tile2_row = 0, 2
-
@FuncOp.from_py_func(tensor_ty, tensor_ty, tensor_ty)
def sequence(inTensor, outTensor, notUsed):
if enableTrace:
diff --git a/programming_examples/basic/vector_exp/Makefile b/programming_examples/basic/vector_exp/Makefile
index 5b471771ba..751779d2ee 100644
--- a/programming_examples/basic/vector_exp/Makefile
+++ b/programming_examples/basic/vector_exp/Makefile
@@ -10,17 +10,17 @@ include ../../makefile-common
targetname = testExp
-KERNELLIB = ${REPO_ROOT}/aie_kernels/aie2
-
all: build/final.xclbin build/insts.txt
-build/exp.o: ${KERNELLIB}/bf16_exp.cc
+VPATH := ../../../aie_kernels/aie2
+
+build/exp.o: bf16_exp.cc
mkdir -p ${@D}
- cd ${@D} && xchesscc_wrapper ${CHESSCCWRAP2_FLAGS} -I../../../../aie_runtime_lib/AIE2 -c $< -o ${@F}
+ cd ${@D} && xchesscc_wrapper ${CHESSCCWRAP2_FLAGS} -I../../../../aie_runtime_lib/AIE2 -c $(<:%=../%) -o ${@F}
-build/lut_based_ops.o:
+build/lut_based_ops.o: ../../../aie_runtime_lib/AIE2/lut_based_ops.cpp
mkdir -p ${@D}
- cd ${@D} && xchesscc_wrapper ${CHESSCCWRAP2_FLAGS} -I. -c ../../../../aie_runtime_lib/AIE2/lut_based_ops.cpp -o ${@F}
+ cd ${@D} && xchesscc_wrapper ${CHESSCCWRAP2_FLAGS} -I. -c $(<:%=../%) -o ${@F}
build/kernels.a: build/exp.o build/lut_based_ops.o
ar rvs $@ $+
diff --git a/programming_examples/basic/vector_exp/README.md b/programming_examples/basic/vector_exp/README.md
index 8ab0602545..7b6fe0eb23 100644
--- a/programming_examples/basic/vector_exp/README.md
+++ b/programming_examples/basic/vector_exp/README.md
@@ -30,12 +30,18 @@ The design also uses a single file from the AIE runtime, in order to initialize
### C++ Testbench
-To compile the design and C++ testbench:
+To compile the design:
```
make
```
+To compile the C++ testbench:
+
+```
+make testExp.exe
+```
+
To run the design:
```
diff --git a/programming_examples/basic/vector_reduce_add/CMakeLists.txt b/programming_examples/basic/vector_reduce_add/CMakeLists.txt
index 024b4cfd54..23ca6a61e5 100644
--- a/programming_examples/basic/vector_reduce_add/CMakeLists.txt
+++ b/programming_examples/basic/vector_reduce_add/CMakeLists.txt
@@ -68,4 +68,4 @@ else()
target_link_libraries(${currentTarget} PUBLIC
xrt_coreutil
)
-endif()
\ No newline at end of file
+endif()
diff --git a/programming_examples/basic/vector_reduce_add/Makefile b/programming_examples/basic/vector_reduce_add/Makefile
index b0f8eebe0c..b9f45c87a7 100644
--- a/programming_examples/basic/vector_reduce_add/Makefile
+++ b/programming_examples/basic/vector_reduce_add/Makefile
@@ -10,23 +10,24 @@ include ../../makefile-common
ACDC_AIE = $(dir $(shell which aie-opt))/..
-targetname = vector_max
+targetname = reduce_add
devicename = npu
col = 0
CHESS_FLAGS=${CHESSCCWRAP2_FLAGS}
-KERNEL_LIB=${ACDC_AIE}/../../aie_kernels/aie2/
all: build/final.xclbin build/insts.txt
-build/i32_add_reduce.o: ${KERNEL_LIB}/i32_add_reduce.cc
+VPATH := ../../../aie_kernels/aie2
+
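+# VPATH lets the pattern rule below find kernel sources in the shared aie_kernels directory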
+build/%.cc.o: %.cc
mkdir -p ${@D}
- cd ${@D} && xchesscc_wrapper ${CHESS_FLAGS} -c $< -o ${@F}
+ cd ${@D} && xchesscc_wrapper ${CHESSCCWRAP2_FLAGS} -c $(<:%=../%) -o ${@F}
build/aie.mlir: aie2.py
mkdir -p ${@D}
python3 $< ${devicename} ${col} > $@
-build/final.xclbin: build/aie.mlir build/i32_add_reduce.o
+build/final.xclbin: build/aie.mlir build/reduce_add.cc.o
mkdir -p ${@D}
cd ${@D} && aiecc.py --aie-generate-cdo --no-compile-host --xclbin-name=${@F} \
--aie-generate-npu --npu-insts-name=insts.txt $(<:%=../%)
@@ -73,4 +74,4 @@ vck5000: build/aie.mlir build/scale.o
-Wl,--whole-archive -Wl,--no-whole-archive -lstdc++ -ldl -lelf -o test.elf
clean: clean_trace
- rm -rf build _build inst aie.mlir.prj core_* test.elf ${targetname}.exe vector_max.o* vector_max.cc
\ No newline at end of file
+ rm -rf build _build inst aie.mlir.prj core_* test.elf ${targetname}.exe
diff --git a/programming_examples/basic/vector_reduce_add/README.md b/programming_examples/basic/vector_reduce_add/README.md
index 4e53dce2cf..7548165a1a 100644
--- a/programming_examples/basic/vector_reduce_add/README.md
+++ b/programming_examples/basic/vector_reduce_add/README.md
@@ -32,7 +32,7 @@ To compile the design and C++ testbench:
```
make
-make build/reduce_add.exe
+make reduce_add.exe
```
To run the design:
diff --git a/programming_examples/basic/vector_reduce_add/run_vck5000.lit b/programming_examples/basic/vector_reduce_add/run_vck5000.lit
index ca267e6084..3e162a8ed3 100644
--- a/programming_examples/basic/vector_reduce_add/run_vck5000.lit
+++ b/programming_examples/basic/vector_reduce_add/run_vck5000.lit
@@ -8,4 +8,4 @@
// RUN: %run_on_vck5000 ./test.elf
// this file needs update
-// XFAIL: *
\ No newline at end of file
+// XFAIL: *
diff --git a/programming_examples/basic/vector_reduce_max/Makefile b/programming_examples/basic/vector_reduce_max/Makefile
index 5e47d478b2..37bcaf43e8 100755
--- a/programming_examples/basic/vector_reduce_max/Makefile
+++ b/programming_examples/basic/vector_reduce_max/Makefile
@@ -14,13 +14,14 @@ targetname = reduce_max
devicename = npu
col = 0
CHESS_FLAGS=${CHESSCCWRAP2_FLAGS}
-KERNEL_LIB=../../../aie_kernels/aie2
all: build/final.xclbin build/insts.txt
-build/reduce_max.cc.o: ${KERNEL_LIB}/reduce_max.cc
+VPATH := ../../../aie_kernels/aie2
+
+build/%.cc.o: %.cc
mkdir -p ${@D}
- cd ${@D} && xchesscc_wrapper ${CHESS_FLAGS} -c $(<:%=../%) -o ${@F}
+ cd ${@D} && xchesscc_wrapper ${CHESSCCWRAP2_FLAGS} -c $(<:%=../%) -o ${@F}
build/aie.mlir: aie2.py
mkdir -p ${@D}
diff --git a/programming_examples/basic/vector_reduce_max/README.md b/programming_examples/basic/vector_reduce_max/README.md
index 6bfa382fb4..ac2756f2dc 100644
--- a/programming_examples/basic/vector_reduce_max/README.md
+++ b/programming_examples/basic/vector_reduce_max/README.md
@@ -32,7 +32,7 @@ To compile the design and C++ testbench:
```
make
-make build/reduce_max.exe
+make reduce_max.exe
```
To run the design:
diff --git a/programming_examples/basic/vector_reduce_max/run_vck5000.lit b/programming_examples/basic/vector_reduce_max/run_vck5000.lit
index ca267e6084..3e162a8ed3 100644
--- a/programming_examples/basic/vector_reduce_max/run_vck5000.lit
+++ b/programming_examples/basic/vector_reduce_max/run_vck5000.lit
@@ -8,4 +8,4 @@
// RUN: %run_on_vck5000 ./test.elf
// this file needs update
-// XFAIL: *
\ No newline at end of file
+// XFAIL: *
diff --git a/programming_examples/basic/vector_reduce_min/CMakeLists.txt b/programming_examples/basic/vector_reduce_min/CMakeLists.txt
index 820bc8059d..fba5a600d5 100644
--- a/programming_examples/basic/vector_reduce_min/CMakeLists.txt
+++ b/programming_examples/basic/vector_reduce_min/CMakeLists.txt
@@ -40,15 +40,17 @@ project(${ProjectName})
find_package(Boost REQUIRED)
add_executable(${currentTarget}
+ ${CMAKE_CURRENT_SOURCE_DIR}/../../../runtime_lib/test_lib/test_utils.cpp
test.cpp
)
target_compile_definitions(${currentTarget} PUBLIC DISABLE_ABI_CHECK=1)
-target_include_directories (${currentTarget} PUBLIC
+target_include_directories (${currentTarget} PUBLIC
+ ../../utils
+ ${CMAKE_CURRENT_SOURCE_DIR}/../../../runtime_lib/test_lib
${XRT_INC_DIR}
${Boost_INCLUDE_DIRS}
- ../../../programming_examples/utils
)
target_link_directories(${currentTarget} PUBLIC
diff --git a/programming_examples/basic/vector_reduce_min/Makefile b/programming_examples/basic/vector_reduce_min/Makefile
index b4321855e1..3f4d1105df 100755
--- a/programming_examples/basic/vector_reduce_min/Makefile
+++ b/programming_examples/basic/vector_reduce_min/Makefile
@@ -14,13 +14,14 @@ targetname = reduce_min
devicename = npu
col = 0
CHESS_FLAGS=${CHESSCCWRAP2_FLAGS}
-KERNEL_LIB=../../../aie_kernels/aie2
all: build/final.xclbin build/insts.txt
-build/reduce_min.cc.o: ${KERNEL_LIB}/reduce_min.cc
+VPATH := ../../../aie_kernels/aie2
+
+build/%.cc.o: %.cc
mkdir -p ${@D}
- cd ${@D} && xchesscc_wrapper ${CHESS_FLAGS} -c $(<:%=../%) -o ${@F}
+ cd ${@D} && xchesscc_wrapper ${CHESSCCWRAP2_FLAGS} -c $(<:%=../%) -o ${@F}
build/aie.mlir: aie2.py
mkdir -p ${@D}
diff --git a/programming_examples/basic/vector_reduce_min/README.md b/programming_examples/basic/vector_reduce_min/README.md
index 1d5e9677d1..feb6328142 100644
--- a/programming_examples/basic/vector_reduce_min/README.md
+++ b/programming_examples/basic/vector_reduce_min/README.md
@@ -32,7 +32,7 @@ To compile the design and C++ testbench:
```
make
-make build/reduce_min.exe
+make reduce_min.exe
```
To run the design:
diff --git a/programming_examples/basic/vector_scalar_add/README.md b/programming_examples/basic/vector_scalar_add/README.md
index 3cf2a7cfcd..b1cb33333f 100644
--- a/programming_examples/basic/vector_scalar_add/README.md
+++ b/programming_examples/basic/vector_scalar_add/README.md
@@ -30,7 +30,7 @@ To compile the design and C++ testbench:
```
make
-make build/vectorScalarAdd.exe
+make vectorScalarAdd.exe
```
To run the design:
diff --git a/programming_examples/basic/vector_scalar_add/aie2.py b/programming_examples/basic/vector_scalar_add/aie2.py
index ef36a8a891..b4e2a84b97 100644
--- a/programming_examples/basic/vector_scalar_add/aie2.py
+++ b/programming_examples/basic/vector_scalar_add/aie2.py
@@ -13,63 +13,67 @@
def my_vector_bias_add():
- with mlir_mod_ctx() as ctx:
- @device(AIEDevice.npu)
- def device_body():
- memRef_16_ty = T.memref(16, T.i32())
- memRef_8_ty = T.memref(8, T.i32())
+ @device(AIEDevice.npu)
+ def device_body():
+ memRef_16_ty = T.memref(16, T.i32())
+ memRef_8_ty = T.memref(8, T.i32())
- # Tile declarations
- ShimTile = tile(0, 0)
- MemTile = tile(0, 1)
- ComputeTile2 = tile(0, 2)
+ # Tile declarations
+ ShimTile = tile(0, 0)
+ MemTile = tile(0, 1)
+ ComputeTile2 = tile(0, 2)
- # AIE-array data movement with object fifos
- # Input
- of_in0 = object_fifo("in0", ShimTile, MemTile, 2, memRef_16_ty)
- of_in1 = object_fifo("in1", MemTile, ComputeTile2, 2, memRef_8_ty)
- object_fifo_link(of_in0, of_in1)
+ # AIE-array data movement with object fifos
+ # Input
+ of_in0 = object_fifo("in0", ShimTile, MemTile, 2, memRef_16_ty)
+ of_in1 = object_fifo("in1", MemTile, ComputeTile2, 2, memRef_8_ty)
+ object_fifo_link(of_in0, of_in1)
- # Output
- of_out0 = object_fifo("out0", MemTile, ShimTile, 2, memRef_16_ty)
- of_out1 = object_fifo("out1", ComputeTile2, MemTile, 2, memRef_8_ty)
- object_fifo_link(of_out1, of_out0)
+ # Output
+ of_out0 = object_fifo("out0", MemTile, ShimTile, 2, memRef_16_ty)
+ of_out1 = object_fifo("out1", ComputeTile2, MemTile, 2, memRef_8_ty)
+ object_fifo_link(of_out1, of_out0)
- # Set up compute tiles
+ # Set up compute tiles
- # Compute tile 2
- @core(ComputeTile2)
- def core_body():
- # Effective while(1)
- for _ in for_(8):
- elem_in = of_in1.acquire(ObjectFifoPort.Consume, 1)
- elem_out = of_out1.acquire(ObjectFifoPort.Produce, 1)
- for i in for_(8):
- v0 = memref.load(elem_in, [i])
- v1 = arith.addi(v0, arith.constant(1, T.i32()))
- memref.store(v1, elem_out, [i])
- yield_([])
- of_in1.release(ObjectFifoPort.Consume, 1)
- of_out1.release(ObjectFifoPort.Produce, 1)
+ # Compute tile 2
+ @core(ComputeTile2)
+ def core_body():
+ # Effective while(1)
+ for _ in for_(8):
+ elem_in = of_in1.acquire(ObjectFifoPort.Consume, 1)
+ elem_out = of_out1.acquire(ObjectFifoPort.Produce, 1)
+ for i in for_(8):
+ v0 = memref.load(elem_in, [i])
+ v1 = arith.addi(v0, arith.constant(1, T.i32()))
+ memref.store(v1, elem_out, [i])
yield_([])
+ of_in1.release(ObjectFifoPort.Consume, 1)
+ of_out1.release(ObjectFifoPort.Produce, 1)
+ yield_([])
- # To/from AIE-array data movement
+ # To/from AIE-array data movement
- memRef_64_ty = T.memref(64, T.i32())
- memRef_32_ty = T.memref(32, T.i32())
+ memRef_64_ty = T.memref(64, T.i32())
+ memRef_32_ty = T.memref(32, T.i32())
- @FuncOp.from_py_func(memRef_64_ty, memRef_32_ty, memRef_64_ty)
- def sequence(inTensor, notUsed, outTensor):
- npu_dma_memcpy_nd(
- metadata="out0", bd_id=0, mem=outTensor, sizes=[1, 1, 1, 64]
- )
- npu_dma_memcpy_nd(
- metadata="in0", bd_id=1, mem=inTensor, sizes=[1, 1, 1, 64]
- )
- npu_sync(column=0, row=0, direction=0, channel=0)
+ @FuncOp.from_py_func(memRef_64_ty, memRef_32_ty, memRef_64_ty)
+ def sequence(inTensor, notUsed, outTensor):
+ npu_dma_memcpy_nd(
+ metadata="out0", bd_id=0, mem=outTensor, sizes=[1, 1, 1, 64]
+ )
+ npu_dma_memcpy_nd(
+ metadata="in0", bd_id=1, mem=inTensor, sizes=[1, 1, 1, 64]
+ )
+ npu_sync(column=0, row=0, direction=0, channel=0)
- print(ctx.module)
-
-my_vector_bias_add()
+# Declares that subsequent code is in mlir-aie context
+with mlir_mod_ctx() as ctx:
+ my_vector_bias_add()
+ res = ctx.module.operation.verify()
+ if res == True:
+ print(ctx.module)
+ else:
+ print(res)
diff --git a/programming_examples/basic/vector_scalar_mul/CMakeLists.txt b/programming_examples/basic/vector_scalar_mul/CMakeLists.txt
index e7b0f3d539..69edfb687b 100644
--- a/programming_examples/basic/vector_scalar_mul/CMakeLists.txt
+++ b/programming_examples/basic/vector_scalar_mul/CMakeLists.txt
@@ -33,8 +33,6 @@ endif()
set(VECTORSCALARMUL_SIZE 4096 CACHE STRING "vector size")
set(TARGET_NAME test CACHE STRING "Target to be built")
-message(STATUS "NOLF NOLF VECTORSCALARMUL_SIZE: ${VECTORSCALARMUL_SIZE}")
-
SET (ProjectName ${TARGET_NAME})
SET (currentTarget ${TARGET_NAME})
diff --git a/programming_examples/basic/vector_scalar_mul/Makefile b/programming_examples/basic/vector_scalar_mul/Makefile
old mode 100755
new mode 100644
index 8af81834f7..ee55a2ae2b
--- a/programming_examples/basic/vector_scalar_mul/Makefile
+++ b/programming_examples/basic/vector_scalar_mul/Makefile
@@ -11,9 +11,7 @@ include ../../makefile-common
VPATH := ../../../aie_kernels/aie2
targetname = vectorScalar
-#data_size = 4096
-data_size = 512
-#data_size = 1024
+data_size = 4096
trace_size = 8192
all: build/final_${data_size}.xclbin build/insts_${data_size}.txt
@@ -38,7 +36,7 @@ build/final_${data_size}.xclbin: build/aie_${data_size}.mlir build/scale.o
cd ${@D} && aiecc.py --aie-generate-cdo --no-compile-host --xclbin-name=${@F} \
--aie-generate-npu --npu-insts-name=insts_${data_size}.txt $(<:%=../%)
-build/final_trace_${data_size}.xclbin: build/aie_trace.mlir build/scale.o
+build/final_trace_${data_size}.xclbin: build/aie_trace_${data_size}.mlir build/scale.o
mkdir -p ${@D}
cd ${@D} && aiecc.py --aie-generate-cdo --no-compile-host --xclbin-name=${@F} \
--aie-generate-npu --npu-insts-name=insts_${data_size}.txt $(<:%=../%)
@@ -62,15 +60,15 @@ run_py: build/final_${data_size}.xclbin build/insts_${data_size}.txt
trace: ${targetname}_${data_size}.exe build/final_trace_${data_size}.xclbin build/insts_${data_size}.txt
${powershell} ./$< -x build/final_trace_${data_size}.xclbin -i build/insts_${data_size}.txt -k MLIR_AIE -t ${trace_size}
- ../../utils/parse_eventIR.py --filename trace.txt --mlir build/aie_trace__${data_size}.mlir --colshift 1 > parse_eventIR_vs.json
+ ../../utils/parse_trace.py --filename trace.txt --mlir build/aie_trace_${data_size}.mlir --colshift 1 > trace_vs.json
trace_py: build/final_trace_${data_size}.xclbin build/insts_${data_size}.txt
${powershell} python3 test.py -x build/final_trace_${data_size}.xclbin -i build/insts_${data_size}.txt -k MLIR_AIE -t ${trace_size} -s ${data_size}
- ../../utils/parse_eventIR.py --filename trace.txt --mlir build/aie_trace_${data_size}.mlir --colshift 1 > parse_eventIR_vs.json
+ ../../utils/parse_trace.py --filename trace.txt --mlir build/aie_trace_${data_size}.mlir --colshift 1 > trace_vs.json
clean_trace:
- rm -rf tmpTrace trace.txt
+ rm -rf tmpTrace trace.txt parse*json trace*json
clean: clean_trace
- rm -rf build _build ${targetname}_*.exe
+ rm -rf build _build ${targetname}*.exe
diff --git a/programming_examples/basic/vector_scalar_mul/README.md b/programming_examples/basic/vector_scalar_mul/README.md
index b1e78561d4..2ee29e2e19 100644
--- a/programming_examples/basic/vector_scalar_mul/README.md
+++ b/programming_examples/basic/vector_scalar_mul/README.md
@@ -78,7 +78,7 @@ To compile the design and C++ testbench:
```
make
-make build/vectorScalar.exe
+make vectorScalar.exe
```
To run the design:
diff --git a/programming_examples/basic/vector_scalar_mul/aie2.py b/programming_examples/basic/vector_scalar_mul/aie2.py
old mode 100755
new mode 100644
index 1f49b8d5a2..fec869ae6b
--- a/programming_examples/basic/vector_scalar_mul/aie2.py
+++ b/programming_examples/basic/vector_scalar_mul/aie2.py
@@ -16,8 +16,10 @@
def my_vector_scalar(vector_size, trace_size):
+ word_size_in = 2 # input elements are int16 (2 bytes each)
N = vector_size
- N_in_bytes = N * 4
+ N_in_i32s = N * word_size_in // 4 # host-side DMA transfers are sized in 32-bit words
+ N_in_bytes = N_in_i32s * 4
N_div_n = 4 # chop input vector into 4 sub-vectors
n = N // N_div_n
@@ -27,17 +29,18 @@ def my_vector_scalar(vector_size, trace_size):
@device(AIEDevice.npu)
def device_body():
- memRef_ty = T.memref(n, T.i32())
+ memRef_ty = T.memref(n, T.i16())
memRef_ty2 = T.memref(1, T.i32())
# AIE Core Function declarations
scale_scalar = external_func(
- "vector_scalar_mul_aie_scalar",
+ "vector_scalar_mul_int16_scalar",
inputs=[memRef_ty, memRef_ty, memRef_ty2, T.i32()],
)
scale = external_func(
- "vector_scalar_mul_aie", inputs=[memRef_ty, memRef_ty, memRef_ty2, T.i32()]
+ "vector_scalar_mul_int16_vector",
+ inputs=[memRef_ty, memRef_ty, memRef_ty2, T.i32()],
)
# Tile declarations
@@ -78,7 +81,7 @@ def core_body():
yield_([])
# To/from AIE-array data movement
- tensor_ty = T.memref(N, T.i32())
+ tensor_ty = T.memref(N_in_i32s, T.i32())
scalar_ty = T.memref(1, T.i32())
@FuncOp.from_py_func(tensor_ty, scalar_ty, tensor_ty)
@@ -92,15 +95,17 @@ def sequence(A, F, C):
size=trace_size,
offset=N_in_bytes,
)
- npu_dma_memcpy_nd(metadata="out", bd_id=0, mem=C, sizes=[1, 1, 1, N])
- npu_dma_memcpy_nd(metadata="in", bd_id=1, mem=A, sizes=[1, 1, 1, N])
+ npu_dma_memcpy_nd(
+ metadata="out", bd_id=0, mem=C, sizes=[1, 1, 1, N_in_i32s]
+ )
+ npu_dma_memcpy_nd(metadata="in", bd_id=1, mem=A, sizes=[1, 1, 1, N_in_i32s])
npu_dma_memcpy_nd(metadata="infactor", bd_id=2, mem=F, sizes=[1, 1, 1, 1])
npu_sync(column=0, row=0, direction=0, channel=0)
try:
vector_size = int(sys.argv[1])
- if vector_size % 64 != 0 or vector_size <= 512:
+ if vector_size % 64 != 0 or vector_size < 512:
print("Vector size must be a multiple of 64 and greater than or equal to 512")
raise ValueError
trace_size = 0 if (len(sys.argv) != 3) else int(sys.argv[2])
diff --git a/programming_examples/basic/vector_scalar_mul/test.cpp b/programming_examples/basic/vector_scalar_mul/test.cpp
index e147d91fa4..fe81d3ba9e 100644
--- a/programming_examples/basic/vector_scalar_mul/test.cpp
+++ b/programming_examples/basic/vector_scalar_mul/test.cpp
@@ -22,7 +22,8 @@
// Configure this to match your buffer data type
// ------------------------------------------------------
// using DATATYPE = std::uint8_t;
-using DATATYPE = std::uint32_t;
+// using DATATYPE = std::uint32_t;
+using DATATYPE = std::uint16_t;
#endif
const int scaleFactor = 3;
@@ -67,7 +68,7 @@ int main(int argc, const char *argv[]) {
XCL_BO_FLAGS_CACHEABLE, kernel.group_id(0));
auto bo_inA =
xrt::bo(device, IN_SIZE, XRT_BO_FLAGS_HOST_ONLY, kernel.group_id(2));
- auto bo_inFactor = xrt::bo(device, 1 * sizeof(DATATYPE),
+ auto bo_inFactor = xrt::bo(device, 1 * sizeof(int32_t),
XRT_BO_FLAGS_HOST_ONLY, kernel.group_id(3));
auto bo_outC =
xrt::bo(device, OUT_SIZE, XRT_BO_FLAGS_HOST_ONLY, kernel.group_id(4));
@@ -85,8 +86,8 @@ int main(int argc, const char *argv[]) {
bufInA[i] = i + 1;
// Initialize buffer bo_inFactor
- DATATYPE *bufInFactor = bo_inFactor.map();
- *bufInFactor = scaleFactor;
+ int32_t *bufInFactor = bo_inFactor.map();
+ *bufInFactor = (DATATYPE)scaleFactor;
// Zero out buffer bo_outC
DATATYPE *bufOut = bo_outC.map();
diff --git a/programming_examples/basic/vector_scalar_mul/test.py b/programming_examples/basic/vector_scalar_mul/test.py
index e0ada9be1e..996bb90c78 100644
--- a/programming_examples/basic/vector_scalar_mul/test.py
+++ b/programming_examples/basic/vector_scalar_mul/test.py
@@ -33,9 +33,9 @@ def main(opts):
INOUT1_VOLUME = int(1) # Input only, 1 uint32_t scale factor
INOUT2_VOLUME = int(opts.size) # Output only, 64x uint32_t in this example
- INOUT0_DATATYPE = np.int32
+ INOUT0_DATATYPE = np.int16
INOUT1_DATATYPE = np.int32
- INOUT2_DATATYPE = np.int32
+ INOUT2_DATATYPE = np.int16
INOUT0_SIZE = INOUT0_VOLUME * INOUT0_DATATYPE().itemsize
INOUT1_SIZE = INOUT1_VOLUME * INOUT1_DATATYPE().itemsize
@@ -90,7 +90,7 @@ def main(opts):
bo_inout2.sync(xrt.xclBOSyncDirection.XCL_BO_SYNC_BO_FROM_DEVICE)
# Copy output results and verify they are correct
- entire_buffer = bo_inout2.read(OUT_SIZE, 0).view(np.uint32)
+ entire_buffer = bo_inout2.read(OUT_SIZE, 0).view(np.uint16)
output_buffer = entire_buffer[:INOUT2_VOLUME]
if opts.verify:
if opts.verbosity >= 1:
diff --git a/programming_examples/lit.cfg.py b/programming_examples/lit.cfg.py
index 06db63f291..d7ef6ebd17 100755
--- a/programming_examples/lit.cfg.py
+++ b/programming_examples/lit.cfg.py
@@ -19,7 +19,7 @@
# Configuration file for the 'lit' test runner.
# name: The name of this test suite.
-config.name = "AIE_REFERENCE_DESIGNS"
+config.name = "AIE_TUTORIALS"
config.test_format = lit.formats.ShTest(not llvm_config.use_lit_shell)
diff --git a/programming_examples/makefile-common b/programming_examples/makefile-common
index b5007535b8..42565c463f 100644
--- a/programming_examples/makefile-common
+++ b/programming_examples/makefile-common
@@ -2,7 +2,6 @@
REPO_ROOT ?= $(shell realpath $(dir $(shell which aie-opt))/../../..)
INSTALL_ROOT ?= $(shell realpath $(dir $(shell which aie-opt))/..)
-
# VITIS related variables
VITIS_ROOT ?= $(shell realpath $(dir $(shell which vitis))/../)
VITIS_AIETOOLS_DIR ?= ${VITIS_ROOT}/aietools
diff --git a/programming_examples/ml/conv2d/Makefile b/programming_examples/ml/conv2d/Makefile
index c5f3576a1b..a09be78dd5 100755
--- a/programming_examples/ml/conv2d/Makefile
+++ b/programming_examples/ml/conv2d/Makefile
@@ -9,6 +9,8 @@ include ../../makefile-common
mlirFileName = aieWithTrace_1core
+VPATH := ../../../aie_kernels/aie2
+
all: build/conv2dk1_i8.o build/final.xclbin
@@ -20,7 +22,7 @@ build/${mlirFileName}.mlir: aie2.py
insts.txt: build/${mlirFileName}.mlir
aiecc.py -v --aie-only-generate-npu --npu-insts-name=$@ $<
-build/conv2dk1_i8.o: ../../../aie_kernels/aie2/conv2dk1_i8.cc
+build/conv2dk1_i8.o: conv2dk1_i8.cc
xchesscc -d ${CHESSCC2_FLAGS} -DINT8_ACT -c $< -o $@
build/final.xclbin: build/${mlirFileName}.mlir
diff --git a/programming_examples/ml/conv2d_fused_relu/Makefile b/programming_examples/ml/conv2d_fused_relu/Makefile
index 0ee3a81d07..22ebe63104 100755
--- a/programming_examples/ml/conv2d_fused_relu/Makefile
+++ b/programming_examples/ml/conv2d_fused_relu/Makefile
@@ -9,6 +9,8 @@ include ../../makefile-common
mlirFileName = aieWithTrace_1core
+VPATH := ../../../aie_kernels/aie2
+
all: build/conv2dk1.o build/final.xclbin
build/${mlirFileName}.mlir: aie2.py
@@ -19,7 +21,7 @@ build/${mlirFileName}.mlir: aie2.py
insts.txt: build/${mlirFileName}.mlir
aiecc.py -v --aie-only-generate-npu --npu-insts-name=$@ $<
-build/conv2dk1.o: ../../../aie_kernels/aie2/conv2dk1.cc
+build/conv2dk1.o: conv2dk1.cc
xchesscc -d ${CHESSCC2_FLAGS} -DINT8_ACT -c $< -o $@
build/final.xclbin: build/${mlirFileName}.mlir
diff --git a/programming_examples/ml/eltwise_add/Makefile b/programming_examples/ml/eltwise_add/Makefile
index 702bd770ff..3597fc276b 100644
--- a/programming_examples/ml/eltwise_add/Makefile
+++ b/programming_examples/ml/eltwise_add/Makefile
@@ -17,7 +17,7 @@ VPATH := ../../../aie_kernels/aie2
build/%.o: %.cc
mkdir -p ${@D}
- cd ${@D} && xchesscc_wrapper ${CHESSCCWRAP2_FLAGS} -I. -c ../../../../aie_kernels/aie2/add.cc -o ${@F}
+ cd ${@D} && xchesscc_wrapper ${CHESSCCWRAP2_FLAGS} -c $(<:%=../%) -o ${@F}
build/aie.mlir: aie2.py
mkdir -p ${@D}
diff --git a/programming_examples/ml/eltwise_mul/Makefile b/programming_examples/ml/eltwise_mul/Makefile
index 60a39bb29b..9f24b09350 100644
--- a/programming_examples/ml/eltwise_mul/Makefile
+++ b/programming_examples/ml/eltwise_mul/Makefile
@@ -13,9 +13,11 @@ all: build/final.xclbin
targetname = myEltwiseMul
trace_size = 8192
-build/mul.o:
+VPATH := ../../../aie_kernels/aie2
+
+build/%.o: %.cc
mkdir -p ${@D}
- cd ${@D} && xchesscc_wrapper ${CHESSCCWRAP2_FLAGS} -I. -c ../../../../aie_kernels/aie2/mul.cc -o ${@F}
+ cd ${@D} && xchesscc_wrapper ${CHESSCCWRAP2_FLAGS} -c $(<:%=../%) -o ${@F}
build/aie.mlir: aie2.py
mkdir -p ${@D}
diff --git a/programming_examples/ml/relu/Makefile b/programming_examples/ml/relu/Makefile
index 4d52e165e1..dbd72f5213 100644
--- a/programming_examples/ml/relu/Makefile
+++ b/programming_examples/ml/relu/Makefile
@@ -13,9 +13,11 @@ all: build/final.xclbin
targetname = myReLU
trace_size = 8192
-build/relu.o:
+VPATH := ../../../aie_kernels/aie2
+
+build/%.o: %.cc
mkdir -p ${@D}
- cd ${@D} && xchesscc_wrapper ${CHESSCCWRAP2_FLAGS} -I. -c ../../../../aie_kernels/aie2/relu.cc -o ${@F}
+ cd ${@D} && xchesscc_wrapper ${CHESSCCWRAP2_FLAGS} -c $(<:%=../%) -o ${@F}
build/aie.mlir: aie2.py
mkdir -p ${@D}
diff --git a/programming_examples/utils/README.md b/programming_examples/utils/README.md
index 1d59d46e08..01f8d6e2ae 100644
--- a/programming_examples/utils/README.md
+++ b/programming_examples/utils/README.md
@@ -13,10 +13,10 @@
These utilities are helpful in the current programming examples context and include helpful C/C++ libraries, and python and shell scripts.
-- [Open CV Utilities](#Open-CV-Utilities-(OpenCVUtils.h)) ([OpenCVUtils.h](./OpenCVUtils.h))
-- [Clean microcode shell script](#Clean-microcode-shell-script) ([clean_microcode.sh](./clean_microcode.sh))
-- [Trace parser - eventIR based](#Trace-parser---eventIR-based-(parse_eventIR.py)) ([parse_eventIR.py](./parse_eventIR.py))
-- [Trace parser, custom](#Trace-parser,-custom) ([parse_trace.py](./parse_trace.py))
+- [Open CV Utilities](#open-cv-utilities-opencvutilsh) ([OpenCVUtils.h](./OpenCVUtils.h))
+- [Clean microcode shell script](#clean-microcode-shell-script-clean_microcodesh) ([clean_microcode.sh](./clean_microcode.sh))
+- [Trace parser](#trace-parser-parse_tracepy) ([parse_trace.py](./parse_trace.py))
+- [Trace parser - eventIR based](#trace-parser---eventir-based-parse_eventirpy) ([parse_eventIR.py](./parse_eventIR.py))
## Open CV Utilities ([OpenCVUtils.h](./OpenCVUtils.h))
OpenCV utilities used in vision processing pipelines to help read and/or initialize images and video. Currently supported functions include the following. Please view header for more specific function information.
@@ -32,8 +32,24 @@ OpenCV utilities used in vision processing pipelines to help read and/or initial
## Clean microcode shell script ([clean_microcode.sh](./clean_microcode.sh))
Shell script to do in-place cleanup of microcode files (e.g. core_*.lst). When viewing microcode, it's helpful for some of the extra information like hardware and software breakpoints to be removed so it's easier to see back-to-back lines of microcode.
+## Trace parser ([parse_trace.py](./parse_trace.py))
+The trace text file generated by the host code (`test.cpp` or `test.py`) is formatted as 32-bit hex values, one per line. This Python script parses the raw trace packet data and creates a waveform JSON file for viewing in Perfetto (http://ui.perfetto.dev). The script syntax is:
+
+```bash
+parse_trace.py --filename trace.txt --mlir build/aie_trace.mlir --colshift 1 > trace_vs.json
+```
+
+* **--filename** : Input trace packet text file. This is generated while running our Python host code.
+* **--mlir** : MLIR source. This is needed to parse what events and tiles we are monitoring to generate labels for our waveform visualizer.
+* **--colshift** : Runtime column shift. This specifies how much the actual design was shifted from its default position when it was scheduled and loaded. We need this because even if our design is configured for column 0, the actual loading and execution of the design may place it in column 1, 2, 3, etc., and the parser needs to match the actual column location of the generated trace data. Usually 1 is the right value. **NOTE** - the underlying tools currently default to column 1 to avoid using column 0 on Ryzen AI, since that column does not have a shimDMA and is therefore avoided at the moment.
+
+
## Trace parser - eventIR based ([parse_eventIR.py](./parse_eventIR.py))
-The text file generated by the host code (`test.cpp` or `test.py`) are formatted as 32-bit hex values, one per line. This python script executes a number of steps in order to transform it from trace packet text file into a waveform json file. The script syntax is:
+The trace text file generated by the host code (`test.cpp` or `test.py`) is formatted as 32-bit hex values, one per line. This Python script executes a number of steps to transform the trace packet text file into a waveform JSON file.
+
+**NOTE** - There seem to be some inconsistencies in the results generated by this parser. For now, it is used only for comparison against the existing `hwfrontend` tool.
+
+The script syntax is:
```bash
parse_eventIR.py --filename trace.txt --mlir build/aie_trace.mlir --colshift 1 > parse_eventIR_vs.json
@@ -43,12 +59,13 @@ parse_eventIR.py --filename trace.txt --mlir build/aie_trace.mlir --colshift 1 >
* **--colshift** : runtime column shift. This specifies how much the actual design was shifted from the default position when it was scheduled and called. The reason we need this is because even if our design is configured for column 0, the actual loading and execution of the design may place it in column 1, 2, 3 etc. We account for this shift since the parser needs to match the actual column location of the generated trace data. Usually 1 is the right value. **NOTE** - the underlying tools currently default to column 1 to avoid using column 0 on Ryzen AI since that column does not have a shimDMA and is therefore avoided at the moment.
The parse script creates a temporary directory `tmpTrace` and performs the following steps within that folder:
-1. Fixes raw trace data
-1. Parse MLIR to build event table
-1. Create .target file
-1. Create config.json
-1. Run Vitis/aietools hwfrontend utility to parse raw trace data --> generates eventIR.txt
-1. Convert eventIR.txt to perfetto_compatible.json
+1. [Fixes raw trace data](#1-fixes-raw-trace-data)
+1. [Parse MLIR to build event table](#2-parse-mlir-to-build-event-table)
+1. [Create .target file](#3-create-target-file)
+1. [Create config.json](#4-create-configjson)
+1. [Run Vitis/aietools hwfrontend utility to parse raw trace data --> generates eventIR.txt](#5-run-vitisaietools-hwfrontend-utility-to-parse-raw-trace-data----generates-eventirtxt)
+1. [Convert eventIR.txt to perfetto_compatible.json](#6-convert-eventirtxt-to-perfetto_compatiblejson)
+* [Additional Tips](#tips)
### 1. Fixes raw trace data
We prepend `0x` to each hex line and save the result with a `prep.` prefix, since the `hwfrontend` utility expects it.
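+
+As a rough illustration, this first step amounts to something like the following sketch (a hypothetical helper; the actual filenames and handling in `parse_eventIR.py` may differ):
+
+```python
+# Hypothetical sketch of step 1: prepend "0x" to each 32-bit hex word,
+# since hwfrontend expects prefixed hex values.
+def prep_trace_file(filename):
+    with open(filename) as f:
+        words = [line.strip() for line in f if line.strip()]
+    prepped_name = "prep." + filename
+    with open(prepped_name, "w") as f:
+        for word in words:
+            f.write("0x" + word + "\n")
+    return prepped_name
+```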
@@ -89,9 +106,3 @@ to
```
which reduces the timer from 11,091,042 cycles to 381,175 and seems to fix it.
-## Trace parser, custom ([parse_trace.py](./parse_trace.py))
-This is our custom trace packet parser based on the trace packet spec, but it's currently a work in progress as some inconsisteancies with the generated waveform have cropped up. It is run in the same way as `parse_eventIR.py` but does not generate the intermediate directory `tmpTrace` or the other intermediate files use by `parse_eventIR.py`.
-
-```bash
-parse_eventIR.py --filename trace.txt --mlir build/aie_trace.mlir --colshift 1 > parse_eventIR_vs.json
-```
diff --git a/programming_examples/utils/parse_eventIR.py b/programming_examples/utils/parse_eventIR.py
index b41ff9c74a..2e8b0474f6 100755
--- a/programming_examples/utils/parse_eventIR.py
+++ b/programming_examples/utils/parse_eventIR.py
@@ -18,7 +18,7 @@
rowoffset = 1 # TODO temporary workaround to figure out row offset for AIE2 for tiles
DEBUG = False
-verbose = False
+verbose = True
eventIRFile = "eventIR.txt"
tmpTraceDirName = "tmpTrace"
@@ -733,6 +733,8 @@ def lookup_event_name_by_type(trace_type, code):
if trace_type == 0:
if code == 0x1:
event = "True"
+ elif code == 23: # 0x17:
+ event = "MemoryStall"
elif code == 24: # 0x18:
event = "StreamStall"
elif code == 26: # 0x1A:
diff --git a/programming_examples/utils/xrtUtils.cpp b/programming_examples/utils/xrtUtils.cpp
deleted file mode 100644
index ebd106f505..0000000000
--- a/programming_examples/utils/xrtUtils.cpp
+++ /dev/null
@@ -1,89 +0,0 @@
-//===- xrtUtils.cpp --------------------------------------------*- C++ -*-===//
-//
-// This file is licensed under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-// Copyright (C) 2023, Advanced Micro Devices, Inc.
-//
-//===----------------------------------------------------------------------===//
-
-#include "xrtUtils.h"
-
-#include
-#include
-#include
-
-namespace po = boost::program_options;
-
-void check_arg_file_exists(po::variables_map &vm_in, std::string name) {
- if (!vm_in.count(name)) {
- throw std::runtime_error("Error: no " + name + " file was provided\n");
- } else {
- std::ifstream test(vm_in[name].as());
- if (!test) {
- throw std::runtime_error("The " + name + " file " +
- vm_in[name].as() +
- " does not exist.\n");
- }
- }
-}
-
-std::vector load_instr_sequence(std::string instr_path) {
- std::ifstream instr_file(instr_path);
- std::string line;
- std::vector instr_v;
- while (std::getline(instr_file, line)) {
- std::istringstream iss(line);
- uint32_t a;
- if (!(iss >> std::hex >> a)) {
- throw std::runtime_error("Unable to parse instruction file\n");
- }
- instr_v.push_back(a);
- }
- return instr_v;
-}
-
-void initXrtLoadKernel(xrt::device &device, xrt::kernel &kernel, int verbosity,
- std::string xclbinFileName,
- std::string kernelNameInXclbin) {
- // Start the XRT test code
- // Get a device handle
- unsigned int device_index = 0;
- device = xrt::device(device_index);
-
- // Load the xclbin
- if (verbosity >= 1)
- std::cout << "Loading xclbin: " << xclbinFileName << "\n";
- auto xclbin = xrt::xclbin(xclbinFileName);
-
- if (verbosity >= 1)
- std::cout << "Kernel opcode: " << kernelNameInXclbin << "\n";
-
- // Get the kernel from the xclbin
- auto xkernels = xclbin.get_kernels();
- auto xkernel = *std::find_if(xkernels.begin(), xkernels.end(),
- [kernelNameInXclbin](xrt::xclbin::kernel &k) {
- auto name = k.get_name();
- std::cout << "Name: " << name << std::endl;
- return name.rfind(kernelNameInXclbin, 0) == 0;
- });
- auto kernelName = xkernel.get_name();
-
- if (verbosity >= 1)
- std::cout << "Registering xclbin: " << xclbinFileName << "\n";
-
- device.register_xclbin(xclbin);
-
- // get a hardware context
- if (verbosity >= 1)
- std::cout << "Getting hardware context.\n";
- xrt::hw_context context(device, xclbin.get_uuid());
-
- // get a kernel handle
- if (verbosity >= 1)
- std::cout << "Getting handle to kernel:" << kernelName << "\n";
- kernel = xrt::kernel(context, kernelName);
-
- return;
-}
\ No newline at end of file
diff --git a/programming_examples/utils/xrtUtils.h b/programming_examples/utils/xrtUtils.h
deleted file mode 100644
index dc1d47dc09..0000000000
--- a/programming_examples/utils/xrtUtils.h
+++ /dev/null
@@ -1,30 +0,0 @@
-//===- xrtUtils.h -----------------------------------------------*- C++ -*-===//
-//
-// This file is licensed under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-// Copyright (C) 2023, Advanced Micro Devices, Inc.
-//
-//===----------------------------------------------------------------------===//
-
-#ifndef _XRTUTILS_H_
-#define _XRTUTILS_H_
-
-#include
-#include
-
-#include
-
-#include "xrt/xrt_device.h"
-#include "xrt/xrt_kernel.h"
-
-std::vector load_instr_sequence(std::string instr_path);
-void check_arg_file_exists(boost::program_options::variables_map &vm_in,
- std::string name);
-
-void initXrtLoadKernel(xrt::device &device, xrt::kernel &kernel, int verbosity,
- std::string xclbinFileName,
- std::string kernelNameInXclbin);
-
-#endif //_XRTUTILS_H_
\ No newline at end of file
diff --git a/programming_examples/vision/color_detect/Makefile b/programming_examples/vision/color_detect/Makefile
index ffb8ca55d1..0d8091620f 100755
--- a/programming_examples/vision/color_detect/Makefile
+++ b/programming_examples/vision/color_detect/Makefile
@@ -42,7 +42,7 @@ build/final_${COLORDETECT_WIDTH}.xclbin: build/aie2_lineBased_8b_${COLORDETECT_W
cd ${@D} && aiecc.py --aie-generate-cdo --aie-generate-npu --no-compile-host \
--xclbin-name=${@F} --npu-insts-name=insts.txt $(<:%=../%)
-build/${targetname}.exe: test.cpp
+${targetname}.exe: test.cpp
mkdir -p ${@D}
rm -rf _build
mkdir -p _build
@@ -55,7 +55,7 @@ else
cp _build/${targetname} $@
endif
-run: build/${targetname}.exe build/final_${COLORDETECT_WIDTH}.xclbin build/insts.txt
+run: ${targetname}.exe build/final_${COLORDETECT_WIDTH}.xclbin build/insts.txt
${powershell} ./$< -x build/final_${COLORDETECT_WIDTH}.xclbin -i build/insts.txt -k MLIR_AIE
clean:
diff --git a/programming_examples/vision/vision_passthrough/CMakeLists.txt b/programming_examples/vision/vision_passthrough/CMakeLists.txt
index a2bb8ac761..ecbeea0ba9 100644
--- a/programming_examples/vision/vision_passthrough/CMakeLists.txt
+++ b/programming_examples/vision/vision_passthrough/CMakeLists.txt
@@ -2,7 +2,7 @@
# See https://llvm.org/LICENSE.txt for license information.
# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
#
-# (c) Copyright 2023 Xilinx Inc.
+# Copyright (C) 2024, Advanced Micro Devices, Inc.
# parameters
# -DBOOST_ROOT: Path to Boost install
diff --git a/programming_examples/vision/vision_passthrough/README.md b/programming_examples/vision/vision_passthrough/README.md
index 31d4add65f..ebb86bc0f2 100644
--- a/programming_examples/vision/vision_passthrough/README.md
+++ b/programming_examples/vision/vision_passthrough/README.md
@@ -15,7 +15,7 @@ Single tile applies a pass through kernel on data from local memory. There are t
To compile the design in Windows:
```
make
-make build/passThrough.exe
+make passThrough.exe
```
To run the design:
diff --git a/programming_guide/README.md b/programming_guide/README.md
new file mode 100644
index 0000000000..df0471ebc0
--- /dev/null
+++ b/programming_guide/README.md
@@ -0,0 +1,77 @@
+
+
+# IRON AIE Programming Guide
+
+
+
+The AI Engine (AIE) array is a spatial compute architecture: a modular and scalable system with spatially distributed compute and memories. Its compute-dense vector processing runs independently of, and concurrently with, explicitly scheduled data movement. Since the vector compute core (green) of each AIE can only operate on data in its L1 scratchpad memory (light blue), data movement accelerators (purple) bi-directionally transport this data over a switched (dark blue) interconnect network from any level in the memory hierarchy.
+
+Programming the AIE-array configures all its spatial building blocks: the compute cores' program memory, the data movers' buffer descriptors, interconnect with switches, etc. This guide introduces our Interface Representation for hands-ON (IRON) close-to-metal programming of the AIE-array. IRON is an open access toolkit enabling performance engineers to build fast and efficient, often specialized designs through a set of Python language bindings around mlir-aie, our MLIR-based representation of the AIE-array. mlir-aie provides the foundation from which complex and performant AI Engine designs can be defined and is supported by simulation and hardware implementation infrastructure.
+
+> **NOTE:** For those interested in better understanding how AI Engine designs are defined at the MLIR level, take a look through the [MLIR tutorial](../mlir_tutorials/) material. mlir-aie also serves as a lower layer for other higher-level abstraction MLIR layers such as [mlir-air](https://github.com/Xilinx/mlir-air).
+
+This IRON AIE programming guide first introduces the language bindings for the AIE-array's structural elements ([section 1](./section-1/README.md)). After explaining how to set up explicit data movement ([section 2](./section-2/README.md)) to transport the necessary data, you can run your first program on the AIE compute core ([section 3](./section-3/README.md)). [Section 4](./section-4/README.md) adds tracing for performance analysis and explains how to exploit the compute-dense vector operations. More vector design examples, basic and larger (ML or computer vision), are given in sections [5](./section-5/README.md) and [6](./section-6/README.md). Finally, the [quick reference](./quick_reference.md) summarizes the most important API elements.
+
+## Outline
+Section 0 - Getting Set Up for IRON
+
+* Introduce recommended hardware to target with IRON
+* Simple instructions to set up your hardware, tools and environment
+
+Section 1 - Basic AI Engine building blocks
+
+* Introduce the AI Engine building blocks for expressing an application design
+* Give an example of Python bindings for MLIR source that define AIE tiles
+
+Section 2 - Data Movement (Object FIFOs)
+
+* Introduce the topic of Object FIFOs and how they abstract connections between tiles and data in the AIE array memories
+* Explain key Object FIFO data movement patterns
+* Introduce more complex Object FIFO connection patterns (link/broadcast, join/distribute)
+* Demonstrate Object FIFOs with practical examples
+* Explain runtime data movement between the host and AIE array
+
+Section 3 - My First Program
+
+* Introduce example of first simple program (Vector Scalar Multiplication)
+* Illustrate how to run designs on Ryzen™ AI enabled hardware
+
+Section 4 - Vector Programming & Performance Measurement
+
+* Discuss the topic of vector programming at the kernel level
+* Introduce performance measurement (trace) and how we measure cycle count and efficiency
+* Performant Vector Scalar Multiplication design example
+
+Section 5 - Example Vector Designs
+
+* Introduce additional vector design examples with exercises to measure performance on each
+ * Passthrough
+ * Vector $e^x$
+ * Vector Scalar Addition
+ * GEMM
+ * CONV2D
+ * ...
+
+Section 6 - Larger Example Designs
+
+* Introduce larger design examples with performance measured over multiple cores
+ * Edge Detect
+ * Resnet
+ * ...
+
+
+### [Quick Reference](./quick_reference.md)
+
+## AI Engine architecture documentation
+* [AIE1 Architecture Manual - AM009](https://docs.amd.com/r/en-US/am009-versal-ai-engine/Overview)
+* [AIE2 Architecture Manual - AM020](https://docs.amd.com/r/en-US/am020-versal-aie-ml/Overview)
+
+
diff --git a/programming_guide/assets/AIEarray.svg b/programming_guide/assets/AIEarray.svg
new file mode 100644
index 0000000000..79cc03f3be
--- /dev/null
+++ b/programming_guide/assets/AIEarray.svg
@@ -0,0 +1 @@
+
\ No newline at end of file
diff --git a/programming_guide/assets/AcquireRelease.png b/programming_guide/assets/AcquireRelease.png
new file mode 100644
index 0000000000..b39af303af
Binary files /dev/null and b/programming_guide/assets/AcquireRelease.png differ
diff --git a/programming_guide/assets/Broadcast.png b/programming_guide/assets/Broadcast.png
new file mode 100644
index 0000000000..acf20950e0
Binary files /dev/null and b/programming_guide/assets/Broadcast.png differ
diff --git a/programming_guide/assets/ComputeTile.png b/programming_guide/assets/ComputeTile.png
new file mode 100644
index 0000000000..065fed189f
Binary files /dev/null and b/programming_guide/assets/ComputeTile.png differ
diff --git a/programming_guide/assets/ComputeTile_2.png b/programming_guide/assets/ComputeTile_2.png
new file mode 100644
index 0000000000..6141e4edd7
Binary files /dev/null and b/programming_guide/assets/ComputeTile_2.png differ
diff --git a/programming_guide/assets/DMA_BDs.png b/programming_guide/assets/DMA_BDs.png
new file mode 100644
index 0000000000..d31023d433
Binary files /dev/null and b/programming_guide/assets/DMA_BDs.png differ
diff --git a/programming_guide/assets/DataLayoutTransformation.svg b/programming_guide/assets/DataLayoutTransformation.svg
new file mode 100644
index 0000000000..9b9d21c68c
--- /dev/null
+++ b/programming_guide/assets/DataLayoutTransformation.svg
@@ -0,0 +1 @@
+
\ No newline at end of file
diff --git a/programming_guide/assets/Distribute.png b/programming_guide/assets/Distribute.png
new file mode 100644
index 0000000000..4c2b051f17
Binary files /dev/null and b/programming_guide/assets/Distribute.png differ
diff --git a/programming_guide/assets/DistributeL2.svg b/programming_guide/assets/DistributeL2.svg
new file mode 100644
index 0000000000..c5322a9fed
--- /dev/null
+++ b/programming_guide/assets/DistributeL2.svg
@@ -0,0 +1,21 @@
+
\ No newline at end of file
diff --git a/programming_guide/assets/DoubleBuffer.svg b/programming_guide/assets/DoubleBuffer.svg
new file mode 100644
index 0000000000..03c42f5a82
--- /dev/null
+++ b/programming_guide/assets/DoubleBuffer.svg
@@ -0,0 +1,21 @@
+
\ No newline at end of file
diff --git a/programming_guide/assets/ExtMemToCore.svg b/programming_guide/assets/ExtMemToCore.svg
new file mode 100644
index 0000000000..899c673933
--- /dev/null
+++ b/programming_guide/assets/ExtMemToCore.svg
@@ -0,0 +1,21 @@
+
\ No newline at end of file
diff --git a/programming_guide/assets/ExtMemToCoreL2.svg b/programming_guide/assets/ExtMemToCoreL2.svg
new file mode 100644
index 0000000000..43d333e3ae
--- /dev/null
+++ b/programming_guide/assets/ExtMemToCoreL2.svg
@@ -0,0 +1,21 @@
+
\ No newline at end of file
diff --git a/programming_guide/assets/Join.png b/programming_guide/assets/Join.png
new file mode 100644
index 0000000000..44af1d5770
Binary files /dev/null and b/programming_guide/assets/Join.png differ
diff --git a/programming_guide/assets/JoinL2.svg b/programming_guide/assets/JoinL2.svg
new file mode 100644
index 0000000000..126347f379
--- /dev/null
+++ b/programming_guide/assets/JoinL2.svg
@@ -0,0 +1,21 @@
+
\ No newline at end of file
diff --git a/programming_guide/assets/MultiDesign.svg b/programming_guide/assets/MultiDesign.svg
new file mode 100644
index 0000000000..1cca8a3097
--- /dev/null
+++ b/programming_guide/assets/MultiDesign.svg
@@ -0,0 +1,21 @@
+
\ No newline at end of file
diff --git a/programming_guide/assets/ObjectFifo.svg b/programming_guide/assets/ObjectFifo.svg
new file mode 100644
index 0000000000..459d017b60
--- /dev/null
+++ b/programming_guide/assets/ObjectFifo.svg
@@ -0,0 +1,21 @@
+
\ No newline at end of file
diff --git a/programming_guide/assets/Reuse.png b/programming_guide/assets/Reuse.png
new file mode 100644
index 0000000000..897e9f7bf8
Binary files /dev/null and b/programming_guide/assets/Reuse.png differ
diff --git a/programming_guide/assets/SingleBuffer.svg b/programming_guide/assets/SingleBuffer.svg
new file mode 100644
index 0000000000..1efa304f82
--- /dev/null
+++ b/programming_guide/assets/SingleBuffer.svg
@@ -0,0 +1,21 @@
+
\ No newline at end of file
diff --git a/programming_guide/assets/SingleDesign.svg b/programming_guide/assets/SingleDesign.svg
new file mode 100644
index 0000000000..cb70886552
--- /dev/null
+++ b/programming_guide/assets/SingleDesign.svg
@@ -0,0 +1,21 @@
+
\ No newline at end of file
diff --git a/programming_guide/assets/SkipBroadcast.png b/programming_guide/assets/SkipBroadcast.png
new file mode 100644
index 0000000000..7412c86948
Binary files /dev/null and b/programming_guide/assets/SkipBroadcast.png differ
diff --git a/programming_guide/assets/SkipBroadcastFix.png b/programming_guide/assets/SkipBroadcastFix.png
new file mode 100644
index 0000000000..2284f5afce
Binary files /dev/null and b/programming_guide/assets/SkipBroadcastFix.png differ
diff --git a/programming_guide/assets/SkipBroadcastNoFix.png b/programming_guide/assets/SkipBroadcastNoFix.png
new file mode 100644
index 0000000000..10f43601f1
Binary files /dev/null and b/programming_guide/assets/SkipBroadcastNoFix.png differ
diff --git a/programming_guide/assets/aie-ml_shift_adder_path.png b/programming_guide/assets/aie-ml_shift_adder_path.png
new file mode 100644
index 0000000000..5723cd6b6f
Binary files /dev/null and b/programming_guide/assets/aie-ml_shift_adder_path.png differ
diff --git a/programming_guide/assets/aie-ml_srs_ups.png b/programming_guide/assets/aie-ml_srs_ups.png
new file mode 100644
index 0000000000..1d45408bfe
Binary files /dev/null and b/programming_guide/assets/aie-ml_srs_ups.png differ
diff --git a/programming_guide/assets/aie-ml_vector_unit.png b/programming_guide/assets/aie-ml_vector_unit.png
new file mode 100644
index 0000000000..2151f8fecc
Binary files /dev/null and b/programming_guide/assets/aie-ml_vector_unit.png differ
diff --git a/programming_guide/assets/aie_compute_details1.png b/programming_guide/assets/aie_compute_details1.png
new file mode 100755
index 0000000000..3bcc968d94
Binary files /dev/null and b/programming_guide/assets/aie_compute_details1.png differ
diff --git a/programming_guide/assets/aie_vector_scalar_ml_opt1.png b/programming_guide/assets/aie_vector_scalar_ml_opt1.png
new file mode 100755
index 0000000000..712fa28814
Binary files /dev/null and b/programming_guide/assets/aie_vector_scalar_ml_opt1.png differ
diff --git a/programming_guide/assets/binaryArtifacts.svg b/programming_guide/assets/binaryArtifacts.svg
new file mode 100644
index 0000000000..3584466377
--- /dev/null
+++ b/programming_guide/assets/binaryArtifacts.svg
@@ -0,0 +1 @@
+
\ No newline at end of file
diff --git a/programming_guide/assets/passthrough_simple.svg b/programming_guide/assets/passthrough_simple.svg
new file mode 100644
index 0000000000..d53d259afb
--- /dev/null
+++ b/programming_guide/assets/passthrough_simple.svg
@@ -0,0 +1,21 @@
+
\ No newline at end of file
diff --git a/programming_guide/assets/trace_vector_scalar_add1.png b/programming_guide/assets/trace_vector_scalar_add1.png
new file mode 100644
index 0000000000..fd6e9e8e42
Binary files /dev/null and b/programming_guide/assets/trace_vector_scalar_add1.png differ
diff --git a/programming_guide/assets/vectorScalarMul.svg b/programming_guide/assets/vectorScalarMul.svg
new file mode 100644
index 0000000000..9c924cafa0
--- /dev/null
+++ b/programming_guide/assets/vectorScalarMul.svg
@@ -0,0 +1 @@
+
\ No newline at end of file
diff --git a/programming_guide/assets/vectorScalarMulPhysicalDataFlow.svg b/programming_guide/assets/vectorScalarMulPhysicalDataFlow.svg
new file mode 100644
index 0000000000..83e74838aa
--- /dev/null
+++ b/programming_guide/assets/vectorScalarMulPhysicalDataFlow.svg
@@ -0,0 +1 @@
+
\ No newline at end of file
diff --git a/programming_guide/assets/vector_scalar.svg b/programming_guide/assets/vector_scalar.svg
new file mode 100644
index 0000000000..030229bfaa
--- /dev/null
+++ b/programming_guide/assets/vector_scalar.svg
@@ -0,0 +1,21 @@
+
\ No newline at end of file
diff --git a/programming_guide/quick_reference.md b/programming_guide/quick_reference.md
new file mode 100644
index 0000000000..4e0c5d11b7
--- /dev/null
+++ b/programming_guide/quick_reference.md
@@ -0,0 +1,81 @@
+
+
+# IRON Quick Reference
+
+* [Python Bindings](#python-bindings)
+* [Python Helper Functions](#python-helper-functions)
+* [Helpful AI Engine Architecture References and Tables](#helpful-ai-engine-architecture-references-and-tables)
+* [AI Engine documentation](#ai-engine-documentation)
+
+----
+
+## Python Bindings
+
+| Function Signature | Definition | Parameters | Return Type | Example |
+|---------------------|------------|------------|-------------|---------|
+| `tile(column, row)` | Declare AI Engine tile | `column`: column index number `row`: row index number | `tile` object | ComputeTile = tile(1,3) |
+| `external_func(name, inputs, output)` | Declare external kernel function that will run on AIE cores | `name`: external function name `inputs`: list of input types `output`: list of output types | function handle | scale_scalar = external_func("vector_scalar_mul_aie_scalar", inputs=[memRef_ty, memRef_ty, T.memref(1, T.i32()), T.i32()]) |
+| `npu_dma_memcpy_nd(metadata, bd_id, mem, sizes)` | configure n-dimensional DMA accessing external memory | `metadata`: String with name of `object_fifo` `bd_id`: Identifier number `mem`: memory for transfer `sizes`: 4-D transfer size in 4B granularity | `None` | npu_dma_memcpy_nd(metadata="out", bd_id=0, mem=C, sizes=[1, 1, 1, N]) |
+| `npu_sync(column, row, direction, channel, column_num=1, row_num=1)` | configure host-ShimDMA synchronization for accessing external memory | `column` and `row`: Specify the tile location for initiating the synchronization. `direction`: Indicates the DMA direction (0 for write to host, 1 for read from host). `channel`: Identifies the DMA channel (0 or 1) for the synchronization token `column_num` and `row_num` (optional): Define the range of tiles to wait for synchronization | `None` | npu_sync(column=0, row=0, direction=0, channel=1) |
+| **Object FIFO** |||
+| `object_fifo(name, producerTile, consumerTiles, depth, datatype)` | Construct Object FIFO | `name`: Object FIFO name `producerTile`: producer tile object `consumerTiles`: list of consumer tile objects `depth`: number of objects in Object FIFO `datatype`: type of the objects in the Object FIFO | `object_fifo` object | of0 = object_fifo("objfifo0", A, B, 3, T.memref(256, T.i32())) |
+| `.acquire(port, num_elem)` | Acquire from Object FIFO | `port`: `ObjectFifoPort.Produce` or `ObjectFifoPort.Consume` `num_elem`: number of objects to acquire | object, or array of objects | elem0 = of0.acquire(ObjectFifoPort.Produce, 1) |
+| `.release(port, num_elem)` | Release from Object FIFO | `port`: `ObjectFifoPort.Produce` or `ObjectFifoPort.Consume` `num_elem`: number of objects to release | `None` | of0.release(ObjectFifoPort.Consume, 2) |
+| `object_fifo_link(fifoIns, fifoOuts)` | Create a link between Object FIFOs | `fifoIns`: list of Object FIFOs (variables or names) `fifoOuts`: list of Object FIFOs (variables or names) | `None` | object_fifo_link(of0, of1) |
+| **Routing Bindings (relevant for trace and low-level design)** |||
+| `flow(source, source_bundle, source_channel, dest, dest_bundle, dest_channel)` | Create a circuit-switched flow between src and dest | `source`: source tile of the flow `source_bundle`: type of source WireBundle (see full list in [AIEAttrs.td](../include/aie/Dialect/AIE/IR/AIEAttrs.td)) `source_channel`: source channel index `dest`: destination tile of the flow `dest_bundle`: type of destination WireBundle (see full list in [AIEAttrs.td](../include/aie/Dialect/AIE/IR/AIEAttrs.td)) `dest_channel`: destination channel index | `None` | flow(ComputeTile, WireBundle.DMA, 0, ShimTile, WireBundle.DMA, 1) (when routing for trace, the source bundle and channel can be WireBundle.Trace and 0, respectively) |
+| `packetflow(pkt_id, source, source_port, source_channel, dest, dest_port, dest_channel, keep_pkt_header)` | Create a packet-switched flow between src and dest | `pkt_id`: unique packet ID `source`: source tile of the packet flow `source_port`: type of source WireBundle (see full list in [AIEAttrs.td](../include/aie/Dialect/AIE/IR/AIEAttrs.td)) `source_channel`: source channel index `dest`: destination tile of the packet flow `dest_port`: type of destination WireBundle (see full list in [AIEAttrs.td](../include/aie/Dialect/AIE/IR/AIEAttrs.td)) `dest_channel`: destination channel index `keep_pkt_header`: boolean flag to keep the packet header | `None` | packetflow(1, ComputeTile2, WireBundle.Trace, 0, ShimTile, WireBundle.DMA, 1, keep_pkt_hdr=True) (this example shows trace routing; to route from the core memory trace unit, use channel 1) |
+|||||
+
+> **NOTE:** `tile`: The actual tile coordinates used on the device may deviate from the ones declared here. On Ryzen AI, for example, these coordinates tend to be relative coordinates, as the runtime scheduler may assign the design to a different available column.
+
+> **NOTE:** `object_fifo`: The `producerTile` and `consumerTiles` inputs are AI Engine tiles. The `consumerTiles` may also be specified as an array of tiles for multiple consumers.
+
+> **NOTE:** `.{acquire,release}`: The output may be either a single object or an array of objects which can then be indexed in an array-like fashion.
+
+> **NOTE:** `object_fifo_link`: The tile that is used as the shared tile in the link must currently be a Mem tile. The inputs `fifoIns` and `fifoOuts` may each be either a single Object FIFO or a list of them. Both can be specified either using their Python variables or their names. Currently, if one of the two inputs is a list of Object FIFOs, then the other can only be a single Object FIFO.
+
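+As a combined illustration of several of these bindings, a minimal design skeleton might look as follows (a sketch only: coordinates, names, and sizes are arbitrary, and the `T` type helper is assumed to be available as in the table examples above):
+
+```python
+from aie.dialects.aie import *  # primary mlir-aie dialect definitions
+from aie.extras.context import mlir_mod_ctx  # mlir-aie context
+
+
+def sketch():
+    @device(AIEDevice.npu)
+    def device_body():
+        ShimTile = tile(0, 0)
+        ComputeTile = tile(0, 2)
+        # Object FIFO moving 256xi32 objects from the shim tile to the compute tile
+        of_in = object_fifo("in", ShimTile, ComputeTile, 2, T.memref(256, T.i32()))
+
+
+with mlir_mod_ctx() as ctx:
+    sketch()
+    print(ctx.module)
+```
+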
+## Python helper functions
+| Function Signature | Description |
+|--------------------|-------------|
+| `print(ctx.module)` | Converts our ctx-wrapped structural code to MLIR and prints it to stdout |
+| `ctx.module.operation.verify()` | Runs additional structural verification on the Python-bound source code and returns the result |
+
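+A typical pattern combining these helpers (mirroring the programming examples in this repository; `my_design` is a placeholder for your own design function) is:
+
+```python
+with mlir_mod_ctx() as ctx:
+    my_design()
+    res = ctx.module.operation.verify()
+    if res == True:
+        print(ctx.module)  # emit the generated MLIR on success
+    else:
+        print(res)  # otherwise report the verification error
+```
+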
+## Helpful AI Engine Architecture References and Tables
+* [AIE2 - Table of supported data types and vector sizes (AIE API)](https://www.xilinx.com/htmldocs/xilinx2023_2/aiengine_api/aie_api/doc/group__group__basic__types.html)
+
+* Some useful Tile core Trace Events
+ | Some common events | event ID | dec value |
+ |--------------------|----------|-----------|
+ | True |0x01| 1 |
+ | Stream stalls |0x18| 24 |
+ | Core Instruction - Event 0 |0x21| 33|
+ | Core Instruction - Event 1 |0x22| 34 |
+ | Vector Instructions (e.g. VMAC, VADD, VCMP) |0x25| 37 |
+ | Lock acquire requests |0x2C| 44 |
+ | Lock release requests |0x2D| 45 |
+ | Lock stall |0x1A| 26 |
+ | Core Port Running 1 |0x4F| 79 |
+ | Core Port Running 0 |0x4B| 75 |
+ * A more exhaustive list of events for core tile, core memory, memtile and shim tile can be found in [this header file](https://github.com/Xilinx/aie-rt/blob/main-aie/driver/src/events/xaie_events_aie.h)
+
+## AI Engine documentation
+* [Summary Documentation Links in UG1076](https://docs.amd.com/r/en-US/ug1076-ai-engine-environment/Documentation)
+* [AIE1 Architecture Manual - AM009](https://docs.amd.com/r/en-US/am009-versal-ai-engine/Overview)
+* [AIE1 Register Reference - AM015](https://docs.amd.com/r/en-US/am015-versal-aie-register-reference/Overview)
+* [AIE2 Architecture Manual - AM020](https://docs.amd.com/r/en-US/am020-versal-aie-ml/Overview)
+* [AIE2 Register Reference - AM025](https://docs.amd.com/r/en-US/am025-versal-aie-ml-register-reference/Overview)
+* [AIE API User Guide - v2023.2](https://www.xilinx.com/htmldocs/xilinx2023_2/aiengine_intrinsics/intrinsics/index.html)
+* [AIE1 Intrinsics User Guide - v2023.2](https://www.xilinx.com/htmldocs/xilinx2023_2/aiengine_intrinsics/intrinsics/index.html)
+* [AIE2 Intrinsics User Guide - v2023.2](https://www.xilinx.com/htmldocs/xilinx2023_2/aiengine_ml_intrinsics/intrinsics/index.html)
+
+## AIE Detailed References
+* [AIE2 - Table of supported data types and vector sizes (AIE API)](https://www.xilinx.com/htmldocs/xilinx2023_2/aiengine_api/aie_api/doc/group__group__basic__types.html)
diff --git a/programming_guide/section-0/README.md b/programming_guide/section-0/README.md
new file mode 100644
index 0000000000..c4b5558927
--- /dev/null
+++ b/programming_guide/section-0/README.md
@@ -0,0 +1,36 @@
+
+
+# Section 0 - Getting Set Up for IRON
+
+This programming guide focuses on application programming for the NPU found in Ryzen™ AI laptops and mini PCs. The latest information on Ryzen AI CPUs can be found [here](https://www.amd.com/en/products/processors/consumer/ryzen-ai.html).
+
+## Recommended Hardware
+
+* [Phoenix Point Mini PC: Minisforum UM790 Pro : AMD Ryzen™ 9 7940HS](https://store.minisforum.com/products/minisforum-um790-pro?variant=43865372786933)
+
+* [Hawk Point Mini PC: GMKtec NucBox K8 : AMD Ryzen™ 7 8845HS](https://www.gmktec.com/products/amd-ryzen-7-8845hs-mini-pc-nucbox-k8?spm=..product_fe40bedf-d378-40fc-a60a-54d18f1dbc53.header_1.1&variant=71118fa4-6acb-4d6e-abcb-1d3cf00e6438)
+
+* [Phoenix Point Laptop: ASUS Vivobook Pro 15 M6500XV-EB96 : AMD Ryzen™ 9 7940HS](https://www.asus.com/us/laptops/for-creators/vivobook/asus-vivobook-pro-15-oled-m6500/)
+
+## AMD Ryzen™ AI Initial Setup
+
+#### **Please be sure to follow the quick setup path.**
+
+### [Getting Started on AMD Ryzen™ AI with Linux](../../docs/buildHostLin.md) **\*Recommended**
+
+### [Getting Started on AMD Ryzen™ AI with Windows](../../docs/buildHostWin.md)
+
+## VCK5000 Versal™ Initial Setup
+
+A few programming examples are provided to target the previous generation AIEs in a [VCK5000 Versal™ Development PCIe card](https://www.xilinx.com/products/boards-and-kits/vck5000.html). To get started with VCK5000 please visit the [ROCm-air-platforms](https://github.com/Xilinx/ROCm-air-platforms/tree/main) and [rocm-5.6.x-air](https://github.com/ROCm/ROCR-Runtime/tree/experimental/rocm-5.6.x-air) repositories.
+
+-----
+[[Top](..)] [[Next - Section 1](../section-1/)]
diff --git a/programming_guide/section-1/Makefile b/programming_guide/section-1/Makefile
new file mode 100644
index 0000000000..9a89112879
--- /dev/null
+++ b/programming_guide/section-1/Makefile
@@ -0,0 +1,18 @@
+##===- Makefile -----------------------------------------------------------===##
+#
+# This file licensed under the Apache License v2.0 with LLVM Exceptions.
+# See https://llvm.org/LICENSE.txt for license information.
+# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+#
+##===----------------------------------------------------------------------===##
+
+include ../../tutorials/makefile-common
+
+build/aie.mlir: aie2.py
+ mkdir -p ${@D}
+ python3 $< > $@
+
+all: build/aie.mlir
+
+clean:
+ rm -rf build
diff --git a/programming_guide/section-1/README.md b/programming_guide/section-1/README.md
new file mode 100644
index 0000000000..3444d9e392
--- /dev/null
+++ b/programming_guide/section-1/README.md
@@ -0,0 +1,89 @@
+
+
+# Section 1 - Basic AI Engine building blocks
+
+When we program the AIE-array, we need to declare and configure its structural building blocks: compute tiles for vector processing, memory tiles as larger level-2 shared scratchpads, and shim tiles supporting data movement to external memory. In this programming guide, we will be utilizing the IRON Python bindings for MLIR-AIE components to describe our design at the tile level of granularity. Later on, when we focus on kernel programming, we will explore vector programming in C/C++. But let's first look at a basic Python source file (named [aie2.py](./aie2.py)) for an IRON design.
+
+## Walkthrough of Python source file (aie2.py)
+At the top of this Python source, we include modules that define the IRON AIE language bindings `aie.dialects.aie` and the mlir-aie context `aie.extras.context` which binds to MLIR definitions for AI Engines.
+
+```
+from aie.dialects.aie import * # primary mlir-aie dialect definitions
+from aie.extras.context import mlir_mod_ctx # mlir-aie context
+```
+Then we declare a structural design function that will expand into MLIR code when it is called from within an mlir-aie context (see the last part of this subsection).
+```
+# AI Engine structural design function
+def mlir_aie_design():
+ <... AI Engine device, blocks and connections ...>
+```
+Let's look at how we declare the AI Engine device, blocks and connections. We start off by declaring our AIE device via `@device(AIEDevice.npu)` or `@device(AIEDevice.xcvc1902)`. The blocks and connections themselves will then be declared inside the `def device_body():`. Here, we instantiate our AI Engine blocks, which in this first example are AIE compute tiles.
+
+The arguments for the tile declaration are the tile coordinates (column, row). We assign each declared tile to a variable in our Python program.
+
+> **NOTE:** The actual tile coordinates used on the device when the program is run may deviate from the ones declared here. For example, on the NPU on Ryzen™ AI (`@device(AIEDevice.npu)`), these coordinates tend to be relative coordinates, as the runtime scheduler may assign the design to a different available column at runtime.
+
+```
+ # Device declaration - here using aie2 device NPU
+ @device(AIEDevice.npu)
+ def device_body():
+
+ # Tile declarations
+ ComputeTile1 = tile(1, 3)
+ ComputeTile2 = tile(2, 3)
+ ComputeTile3 = tile(2, 4)
+```
+```
+Once we are done declaring our blocks (and connections) within our design function, we move on to the main body of our program where we call the function and output our design in MLIR. This is done by first declaring the MLIR context via the `with mlir_mod_ctx() as ctx:` line. This indicates that subsequent indented Python code is in the MLIR context, and we follow this by calling our previously defined design function `mlir_aie_design()`. This means all the code within the design function is understood to be in the MLIR context and contains the IRON custom Python binding definitions of the more detailed MLIR block definitions. The final line, `print(ctx.module)`, converts the Python-bound code defined in our MLIR context to its MLIR equivalent and prints it to stdout.
+```
+# Declares that subsequent code is in mlir-aie context
+with mlir_mod_ctx() as ctx:
+ mlir_aie_design() # Call design function within the mlir-aie context
+ print(ctx.module) # Print the Python-to-MLIR conversion to stdout
+```
+
+## Other Tile Types
+In addition to compute tiles, an AIE-array also contains data movers for accessing L3 memory (also called shim DMAs) and larger L2 scratchpads (called mem tiles), which have been available since the AIE-ML generation - see [the introduction of this programming guide](../README.md). Declaring these other types of structural blocks follows the same syntax but requires physical layout details for the specific target device. Shim DMAs typically occupy row 0, while mem tiles (when available) often reside in row 1. The following code segment declares all the different tile types found in a single NPU column.
+
+```
+ # Device declaration - here using aie2 device NPU
+ @device(AIEDevice.npu)
+ def device_body():
+
+ # Tile declarations
+ ShimTile = tile(0, 0)
+ MemTile = tile(0, 1)
+ ComputeTile1 = tile(0, 2)
+ ComputeTile2 = tile(0, 3)
+ ComputeTile3 = tile(0, 4)
+ ComputeTile4 = tile(0, 5)
+```
+
+## Exercises
+1. To run our Python program from the command line, we type `python3 aie2.py` which converts our Python structural design into MLIR source code. This works from the command line if our design environment already contains the mlir-aie Python-bound dialect module. We included this in the [Makefile](./Makefile), so go ahead and run `make` now. Then take a look at the generated MLIR source under `build/aie.mlir`.
+
+2. Run `make clean` to remove the generated files. Then introduce an error to the Python source, such as misspelling `tile` as `tilex`, and run `make` again. What messages do you see?
+
+3. Run `make clean` again. Now correct the misspelling by renaming `tilex` back to `tile`, but change the coordinates to (-1,3), which is an invalid location. Run `make` again. What messages do you see now?
+
+4. No error is generated but our code is invalid. Take a look at the generated MLIR code under `build/aie.mlir`. This generated output is invalid MLIR syntax and running our mlir-aie tools on this MLIR source will generate an error. We do, however, have some additional Python structural syntax checks that can be enabled if we use the function `ctx.module.operation.verify()`. This verifies that our Python-bound code contains valid operations within the mlir-aie context.
+
+ Qualify the `print(ctx.module)` call with a check on `ctx.module.operation.verify()` using a code block like the following:
+ ```
+ res = ctx.module.operation.verify()
+ if res == True:
+ print(ctx.module)
+ else:
+ print(res)
+ ```
+ Make this change and run `make` again. What message do you see now?
+
+-----
+[[Prev - Section 0](../section-0/)] [[Top](..)] [[Next - Section 2](../section-2/)]
diff --git a/programming_guide/section-1/aie2.py b/programming_guide/section-1/aie2.py
new file mode 100644
index 0000000000..f3d6cc58bd
--- /dev/null
+++ b/programming_guide/section-1/aie2.py
@@ -0,0 +1,28 @@
+#
+# This file is licensed under the Apache License v2.0 with LLVM Exceptions.
+# See https://llvm.org/LICENSE.txt for license information.
+# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+#
+# (c) Copyright 2023 AMD Inc.
+
+from aie.dialects.aie import * # primary mlir-aie dialect definitions
+from aie.extras.context import mlir_mod_ctx # mlir-aie context
+
+
+# AI Engine structural design function
+def mlir_aie_design():
+
+ # Device declaration - aie2 device NPU
+ @device(AIEDevice.npu)
+ def device_body():
+
+ # Tile(s) declarations
+ ComputeTile1 = tile(1, 3)
+ ComputeTile2 = tile(2, 3)
+ ComputeTile3 = tile(2, 4)
+
+
+# Declares that subsequent code is in mlir-aie context
+with mlir_mod_ctx() as ctx:
+ mlir_aie_design() # Call design function within the mlir-aie context
+ print(ctx.module) # Print the python-to-mlir conversion to stdout
diff --git a/programming_guide/section-2/README.md b/programming_guide/section-2/README.md
new file mode 100644
index 0000000000..0679fb2cb1
--- /dev/null
+++ b/programming_guide/section-2/README.md
@@ -0,0 +1,67 @@
+
+
+# Section 2 - Data Movement (Object FIFOs)
+
+In this section of the programming guide, we introduce the Object FIFO high-level communication primitive used to describe the data movement within the AIE array. By the end of this section you will:
+1. have a high-level understanding of the communication primitive API,
+2. have learned how to initialize and access an Object FIFO through meaningful design examples,
+3. understand the design decisions which led to current limitations and/or restrictions in the Object FIFO design,
+4. know where to find more in-depth material of the Object FIFO implementation and lower-level lowering.
+
+To understand the need for a data movement abstraction we must first understand the hardware architecture with which we are working. The AIE array is a [spatial compute architecture](../README.md) with explicit data movement requirements. Each compute unit of the array works on data that is stored within its L1 memory module, and that data needs to be explicitly moved there as part of the AIE array's global data movement configuration. This configuration involves several specialized hardware resources which handle the data movement over the entire array in such a way that data arrives at its destination without loss. The Object FIFO provides users with a way to specify the data movement in a more human-comprehensible and accessible manner, without sacrificing the more advanced control possibilities which the hardware provides.
+
+> **NOTE:** For more in-depth, low-level material on Object FIFO programming in MLIR please see the MLIR-AIE [tutorials](../mlir_tutorials).
+
+This guide is split into seven subsections (2a through 2g), where each builds on top of the previous ones:
+> **NOTE:** Section 2e contains several practical code examples with common design patterns using the Object FIFO which can be quickly picked up and tweaked for desired use.
+
+Section 2a - Introduction
+
+* Initializing an Object FIFO
+* Accessing the objects of an Object FIFO
+* Object FIFOs with same producer / consumer
+
+Section 2b - Key Object FIFO Patterns
+
+* Introduce data movement patterns supported by the Object FIFO
+ * Reuse
+ * Broadcast
+ * Distribute
+ * Join
+
+Section 2c - Data Layout Transformations
+
+* Introduce data layout transformation capabilities
+
+Section 2d - Programming for multiple cores
+
+* Walkthrough of the process of efficiently upgrading to designs with multiple cores
+
+Section 2e - Practical Examples
+
+* Practical examples using Object FIFOs
+ * Single / Double buffer
+ * External memory to core
+ * External memory to core using L2
+ * Distribute in L2
+ * Join in L2
+
+Section 2f - Data Movement Without Object FIFOs
+
+* Walkthrough of the process of programming DMA regions
+
+Section 2g - Runtime Data Movement
+
+* Walkthrough of the process of managing runtime data movement from/to host memory to/from the AIE array
+
+
+-----
+[[Prev - Section 1](../section-1/)] [[Top](..)] [[Next - Section 3](../section-3/)]
diff --git a/programming_guide/section-2/section-2a/README.md b/programming_guide/section-2/section-2a/README.md
new file mode 100644
index 0000000000..2eb790472e
--- /dev/null
+++ b/programming_guide/section-2/section-2a/README.md
@@ -0,0 +1,190 @@
+
+
+# Section 2a - Introduction
+
+### Initializing an Object FIFO
+
+An Object FIFO represents the data movement connection between a point A and a point B. In the AIE array, these points are AIE tiles (see [Section 1 - Basic AI Engine building blocks](../../section-1/)). Under the hood, the data movement configuration for different types of tiles (Shim tiles, Mem tiles, and compute tiles) is different, but there is no difference between them when using an Object FIFO.
+
+To initialize an Object FIFO, users can use the `object_fifo` class constructor (defined in [aie.py](../../../python/dialects/aie.py)):
+```python
+class object_fifo:
+ def __init__(
+ self,
+ name,
+ producerTile,
+ consumerTiles,
+ depth,
+ datatype,
+ dimensionsToStream=None,
+ dimensionsFromStreamPerConsumer=None,
+ )
+```
+We will now go over each of the inputs: what they represent and why they are required by the abstraction. We first focus on the mandatory inputs; the default-valued ones are covered in a later section of the guide (see Data Layout Transformations in [section-2c](../section-2c/README.md#data-layout-transformations)).
+
+First of all, an Object FIFO has a unique `name` which is required for the lowering steps. The Object FIFO functions as an ordered buffer that has a count of `depth` objects of specified `datatype`. Currently, all objects in an Object FIFO have to be of the same datatype. The `datatype` is a tensor-like attribute where the size of the tensor and the type of the individual elements are specified at the same time (i.e. `<16xi32>`). The `depth` can be either an integer or an array of integers. The latter is explained further down in this section.
+
+An Object FIFO is created between a producer, or source tile, and a consumer, or destination tile. The tiles are where the producer and consumer processes accessing the Object FIFO will be executed. These processes are also referred to as the "actors" of the Object FIFO, based on dataflow theory terminology. Below, you can see an example of an Object FIFO created between producer tile A and consumer tile B:
+```python
+A = tile(1, 3)
+B = tile(2, 4)
+of0 = object_fifo("objfifo0", A, B, 3, T.memref(256, T.i32()))
+```
+The created Object FIFO is stored in the `of0` variable and is named `objfifo0`. It has a depth of `3` objects of datatype `<256xi32>`. The figure below represents a logical view of `of0` where no assumptions are made about where the tiles and the Object FIFO resources are placed:
+
+
+
+As you will see in the ["Key Object FIFO Patterns" section](../section-2b/README.md#key-object-fifo-patterns), an Object FIFO can have multiple consumer tiles, which describes a broadcast connection from the source tile to all of the consumer tiles. As such, the `consumerTiles` input can be either a single tile or an array of tiles. This is not the case for the `producerTile` input, as currently the Object FIFO does not support multiple producers.
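+
+As a quick sketch of the array form (the tile coordinates and the name `of_bcast` are illustrative; the broadcast pattern itself is covered in Section 2b):
+```python
+A = tile(1, 3)
+B = tile(2, 3)
+C = tile(2, 4)
+# one producer, two consumers: a broadcast from A to both B and C
+of_bcast = object_fifo("objfifo_bcast", A, [B, C], 2, T.memref(256, T.i32()))
+```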
+
+### Accessing the objects of an Object FIFO
+
+An Object FIFO can be accessed by the processes running on the producer and consumer tiles registered to it. Before a process can have access to the objects, it has to acquire them from the Object FIFO. This is because the Object FIFO is a synchronized communication primitive that leverages the synchronization mechanism available in the target hardware architecture to ensure that two processes cannot access the same object at the same time. Once a process has finished working with an object and has no further use for it, it must release the object so that another process can acquire and access it. The patterns in which a producer or a consumer process acquires and releases objects from an Object FIFO are called "access patterns"; we can refer specifically to the acquire and release patterns as well.
+
+To acquire one or multiple objects users should use the acquire function of the `object_fifo` class:
+```python
+def acquire(self, port, num_elem)
+```
+Based on the `num_elem` input representing the number of acquired elements, the acquire function will return either a single object or an array of objects.
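+
+For illustration, the two return forms look as follows (a sketch; the two calls are alternatives, not a sequence, and the `port` argument is explained further below):
+```python
+elem = of0.acquire(ObjectFifoPort.Consume, 1)   # num_elem = 1: returns a single object
+elems = of0.acquire(ObjectFifoPort.Consume, 2)  # num_elem > 1: returns an array of objects
+```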
+
+The Object FIFO is an ordered primitive and the API keeps track, for each process, of which object is the next one it will have access to when acquiring, based on how many objects that process has already acquired and released. Specifically, the first time a process acquires an object it will have access to the first object of the Object FIFO; after releasing it and acquiring a new one, it will have access to the second object, and so on until the last object, after which the order starts again from the first one. When acquiring multiple objects and accessing them in the returned array, the object at index 0 will always be the oldest object that the process has access to, which may not be the first object in the pool of that Object FIFO.
+
+To release one or multiple objects users should use the release function of the `object_fifo` class:
+```python
+def release(self, port, num_elem)
+```
+A process may release one, some or all of the objects it has acquired. The release function will release objects from oldest to youngest in acquired order. If a process does not release all of the objects it has acquired, then the next time it acquires objects the oldest objects will be those that were not released. This functionality is intended to achieve the behaviour of a sliding window through the Object FIFO primitive. This is described further in the ["Key Object FIFO Patterns" section](../section-2b/01_Reuse/README.md#object-fifo-reuse-pattern).
+
+When acquiring objects using the acquire function, it is important to note that any unreleased objects from a previous acquire will also be returned by the most recent acquire call. Unreleased objects are not reacquired as such: the synchronization mechanism used under the hood has already granted the process sole access rights to them since the previous acquire. As such, two acquire calls back-to-back without a release call in-between will return the same objects. This decision was made to facilitate the understanding of releasing objects between calls to the acquire function, as well as to ensure a proper lowering of the Object FIFO primitive. A code example of this behaviour is available in the ["Key Object FIFO Patterns" section](../section-2b/01_Reuse/README.md#object-fifo-reuse-pattern).
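+
+A minimal sketch of this behaviour on the consumer side (variable names are illustrative):
+```python
+elems_a = of0.acquire(ObjectFifoPort.Consume, 2)
+elems_b = of0.acquire(ObjectFifoPort.Consume, 2)
+# without a release in-between, elems_b refers to the same two objects as elems_a
+of0.release(ObjectFifoPort.Consume, 2)
+```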
+
+The `port` input of both the acquire and the release functions indicates whether that process is a producer or a consumer process; it is an important indication for the Object FIFO lowering to properly leverage the underlying synchronization mechanism. Its value may be either `ObjectFifoPort.Produce` or `ObjectFifoPort.Consume`. Note, however, that the terms producer and consumer are used mainly as a logical reference for a human user to keep track of which process is at which end of the data movement; they do not restrict the behaviour of that process, i.e., a producer process may simply access an object to read it and is not required to modify it.
+
+Below you can see an example of two processes that are iterating over the objects of the Object FIFO `of0` that we initialized in the previous section, one running on the producer tile and the other on the consumer tile. To do this, the producer process runs a loop of three iterations, equal to the depth of `of0`, and during each iteration it acquires one object from `of0`, calls a `test_func` function on the acquired object, and releases the object. The consumer process only runs once and acquires all three objects from `of0` at once and stores them in the `elems` array, from which it can access each object individually in any order. It then calls a `test_func2` function three times and in each call it gives as input one of the objects it acquired, before releasing all three objects at the end.
+```python
+A = tile(1, 3)
+B = tile(2, 4)
+of0 = object_fifo("objfifo0", A, B, 3, T.memref(256, T.i32()))
+
+@core(A)
+def core_body():
+ for _ in range_(3):
+ elem0 = of0.acquire(ObjectFifoPort.Produce, 1)
+ call(test_func, [elem0])
+ of0.release(ObjectFifoPort.Produce, 1)
+ yield_([])
+
+@core(B)
+def core_body():
+ elems = of0.acquire(ObjectFifoPort.Consume, 3)
+ call(test_func2, [elems[0]])
+ call(test_func2, [elems[1]])
+ call(test_func2, [elems[2]])
+ of0.release(ObjectFifoPort.Consume, 3)
+```
+
+The figure below illustrates this code: each of the four drawings represents the state of the system during one iteration of execution. In the first three iterations, the producer process on tile A, drawn in blue, progressively acquires the elements of `of0` one by one. Once the third element has been released in the fourth iteration, the consumer process on tile B, drawn in green, is able to acquire all three objects at once.
+
+
+
+Examples of designs that use these features are available in Section 2e: [01_single_double_buffer](../section-2e/01_single_double_buffer/) and [02_external_mem_to_core](../section-2e/02_external_mem_to_core/).
+
+### Object FIFOs with the same producer / consumer
+
+An Object FIFO can be created with the same tile as both its producer and consumer tile. This is mostly done to ensure proper synchronization within the process itself, as opposed to synchronization across multiple processes running on different tiles, as we have seen in examples up until this point. Composing two kernels with access to a shared buffer is an application that leverages this property of the Object FIFO, as showcased in the code snippet below, where `test_func` and `test_func2` are composed using `of0`:
+```python
+A = tile(1, 3)
+of0 = object_fifo("objfifo0", A, A, 3, T.memref(256, T.i32()))
+
+@core(A)
+def core_body():
+ for _ in range_(3):
+ elem0 = of0.acquire(ObjectFifoPort.Produce, 1)
+ call(test_func, [elem0])
+ of0.release(ObjectFifoPort.Produce, 1)
+
+ elem1 = of0.acquire(ObjectFifoPort.Consume, 1)
+ call(test_func2, [elem1])
+ of0.release(ObjectFifoPort.Consume, 1)
+ yield_([])
+```
+
+### Specifying the Object FIFO Depth as an Array
+
+As mentioned at the beginning of this section, the AIE architecture is a spatial architecture that requires explicit data movement. As such, while the Object FIFO's conceptual design is that of an ordered buffer between two or more AIE tiles, in reality its conceptual depth is spread out over multiple resource pools that may be located at different levels of the memory hierarchy and on different tiles.
+
+A more in-depth, yet still abstract, view of the Object FIFO's depth is that the producer and each consumer have their own working resource pool available in their local memory modules which they can use to send and receive data in relation to the data movement described by the Object FIFO. The Object FIFO primitive and its lowering typically allocate the depth of each of these pools such that the resulting behaviour matches that of the conceptual depth.
+
+The user does, however, have the option to manually choose the depth of these pools. This feature is available because, while the Object FIFO primitive tries to offer a unified representation of the data movement across the AIE array, it also aims to provide performance programmers with the tools to control it more finely.
+
+For example, in the code snippet below `of0` describes the data movement between producer A and consumer B:
+```python
+A = tile(1, 3)
+B = tile(2, 4)
+of0 = object_fifo("objfifo0", A, B, 3, T.memref(256, T.i32()))
+```
+The conceptual depth of the Object FIFO is `3`. The reasoning behind this choice of depth can be understood by looking at the acquire and release patterns of the two actors:
+```python
+@core(A)
+def core_body():
+    for _ in range_(18):
+ elem0 = of0.acquire(ObjectFifoPort.Produce, 1)
+ call(produce_func, [elem0])
+ of0.release(ObjectFifoPort.Produce, 1)
+ yield_([])
+
+@core(B)
+def core_body():
+ for _ in range_(9):
+ elems = of0.acquire(ObjectFifoPort.Consume, 2)
+ call(consume_func, [elems[0], elems[1]])
+ of0.release(ObjectFifoPort.Consume, 2)
+ yield_([])
+```
+Each iteration:
+* producer A acquires one object to produce into, calls the kernel function `produce_func` to store new data in it for B to consume, and releases the object,
+* consumer B acquires two objects to consume, reads the data and applies kernel function `consume_func`, then releases both objects.
+
+A conceptual depth of `2` would have sufficed for this system to function without deadlocking. However, with a depth of `3`, A and B can execute concurrently, i.e., while B consumes two objects and applies the kernel function, A has one object available into which it can produce at the same time.
+
+The equivalent of this conceptual depth of `3` using an array of depths would be:
+```python
+of0 = object_fifo("objfifo0", A, B, [1, 2], T.memref(256, T.i32()))
+```
+where `1` is the number of resources available locally to producer A and `2` is the number available to consumer B.
+
+> **NOTE:** For a correct lowering, this feature should be used in situations where the producers and consumers of the Object FIFO are running on different tiles.
+
+The feature of specifying the depths of the resource pools for different actors of the Object FIFO is used to support a specific dependency that can arise when working with multiple Object FIFOs and it is further explained in the ["Key Object FIFO Patterns" section](../section-2b/02_Broadcast/README.md#object-fifo-broadcast-pattern).
+
+### Advanced Topic: Data Movement Accelerators
+
+**The following topic is not required to understand the rest of this guide.**
+
+This part of the guide introduces a few lower-level concepts of the AIE hardware and takes a closer look at the individual resource pools on each tile and the reasoning behind their depths.
+
+Every tile in the AIE array has its own dedicated Data Movement Accelerator (or "DMA"). The DMAs are responsible for moving data from the tile's memory module to the AXI stream interconnect, or from the stream to the memory module. In the case of compute tiles, both the compute core and the tile's DMA are able to access the tile's memory module. Because of this, there is a need for a **synchronization mechanism** that will allow the compute core and the DMA to signal to each other when data is available for the other party to read or write in order to avoid data corruption. This is very similar to the concept of the Object FIFO where producers and consumers must first acquire objects before they can access them, and release them when they are done so they may be acquired by the other party.
+
+The figure below showcases a high-level view of a compute tile, where the compute core and the DMA are both reading and writing data to a location `buff` in the local memory module:
+
+
+
+This high-level view shows that the DMA can interact with memory buffers while the compute core is simultaneously accessing them. The DMA can send data from a buffer onto the AXI stream, and receive data from the stream to write into a buffer which the core is processing. Because this concurrency can lead to data races, a ping-pong buffer (also called a double buffer) is often used instead of a single buffer. This is showcased in the figure below where `buff` has been extended to a `buff_ping` and `buff_pong`:
+
+
+
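+In Object FIFO terms, this ping-pong scheme is what a per-tile pool of depth `2` provides; a sketch (the tile coordinates and the name `of_pp` are illustrative):
+```python
+A = tile(1, 3)
+B = tile(2, 4)
+# a depth of 2 gives each side a ping and a pong buffer, so the core and its DMA
+# can work on different buffers at the same time
+of_pp = object_fifo("objfifo_pp", A, B, 2, T.memref(256, T.i32()))
+```
+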
+> **NOTE:** It is possible to directly configure the DMAs without the use of the Object FIFO primitive to setup data movement between tiles. This is described in [Section 2f](../section-2f/README.md).
+
+## Exercises
+1. In the previous [subsection](./README.md/#specifying-the-object-fifo-depth-as-an-array) it was explained that the conceptual depth of `3` for `of0` could be represented as an array of depths `[1, 2]`. With the advanced knowledge on the topic of DMAs, do you think those depths suffice for the compute cores on tiles A and B to run concurrently with their local DMAs?
+
+1. How would you update the depths?
+
+-----
+[[Up](..)] [[Next - Section 2b](../section-2b/)]
diff --git a/programming_guide/section-2/section-2b/01_Reuse/README.md b/programming_guide/section-2/section-2b/01_Reuse/README.md
new file mode 100644
index 0000000000..5ffb69d2d4
--- /dev/null
+++ b/programming_guide/section-2/section-2b/01_Reuse/README.md
@@ -0,0 +1,67 @@
+
+
+# Object FIFO Reuse Pattern
+
+In the previous [section](../../section-2a/README.md#accessing-the-objects-of-an-object-fifo) it was mentioned that the Object FIFO acquire and release functions can be paired together to achieve the behaviour of a sliding window with data reuse. Specifically, this communication pattern occurs when a producer or a consumer of an Object FIFO releases fewer objects than it had previously acquired. As acquiring from an Object FIFO does not destroy the data, unreleased objects can continue to be used without requiring new copies of the data.
+
+It is important to note that each new acquire function will return a new object or array of objects that a process can access, which **includes unreleased objects from previous acquire calls**. The process should always use the result of the **most recent** acquire call to access unreleased objects to ensure a proper lowering through the Object FIFO primitive.
+
+In the example below `of0` is created between producer A and consumer B with a depth of 3 objects: object0, object1, and object2. The process running on the core of tile B is showcased in the next figure and explained in-depth below.
+```python
+A = tile(1, 3)
+B = tile(2, 4)
+of0 = object_fifo("objfifo0", A, B, 3, T.memref(256, T.i32())) # 3 objects: object0, object1, object2
+
+@core(B)
+def core_body():
+ ### Situation 1
+ elems = of0.acquire(ObjectFifoPort.Consume, 2) # acquires object0 and object1
+ call(test_func2, [elems[0], elems[1]])
+ of0.release(ObjectFifoPort.Consume, 1) # releases object0
+
+ ### Situation 2
+ elems_2 = of0.acquire(ObjectFifoPort.Consume, 2) # acquires object2; object1 was previously acquired
+ call(test_func2, [elems_2[0], elems_2[1]])
+ of0.release(ObjectFifoPort.Consume, 1) # releases object1
+
+ ### Situation 3
+ elems_3 = of0.acquire(ObjectFifoPort.Consume, 2) # acquires object0; object2 was previously acquired
+ call(test_func2, [elems_3[0], elems_3[1]])
+ of0.release(ObjectFifoPort.Consume, 1) # releases object2
+
+ ### Situation 4
+ elems_4 = of0.acquire(ObjectFifoPort.Consume, 2) # acquires object1; object0 was previously acquired
+```
+
+The figure below represents the status of the system in each of the marked situations 1 through 4:
+1. Consumer B first acquires 2 elements from `of0` in the variable `elems`. As this is the first time that B acquires, it will have access to object0 and object1. B then applies `test_func2` on the two acquired elements. Finally, B releases a single object, the oldest acquired one, and keeps object1.
+2. B acquires 2 elements in variable `elems_2`. It now has access to object1 (which remains acquired from the first acquire call at step 1), and also to the newly acquired object2. B again applies the function, after which it only releases a single object and keeps object2.
+3. B acquires 2 objects in `elems_3` and has access to object2 and object0. B releases a single object and keeps object0.
+4. B acquires 2 objects in `elems_4` and has access to object0 and object1 thus returning to the situation at the beginning of step 1.
+
+
+
+The situations above can be fused into a `for`-loop with 4 iterations. By releasing one fewer object than it acquires in each iteration, the consumer process running on tile B implements the behaviour of a sliding window of 2 objects that slides down by 1 in each iteration.
+```python
+A = tile(1, 3)
+B = tile(2, 4)
+of0 = object_fifo("objfifo0", A, B, 3, T.memref(256, T.i32())) # 3 objects: object0, object1, object2
+
+@core(B)
+def core_body():
+ for _ in range_(4):
+ elems = of0.acquire(ObjectFifoPort.Consume, 2)
+ call(test_func2, [elems[0], elems[1]])
+ of0.release(ObjectFifoPort.Consume, 1)
+```
+
+-----
+[[Up](..)] [[Next](../02_Broadcast/)]
diff --git a/programming_guide/section-2/section-2b/02_Broadcast/README.md b/programming_guide/section-2/section-2b/02_Broadcast/README.md
new file mode 100644
index 0000000000..1381619588
--- /dev/null
+++ b/programming_guide/section-2/section-2b/02_Broadcast/README.md
@@ -0,0 +1,77 @@
+
+
+# Object FIFO Broadcast Pattern
+
+As was explained in the Introduction [section](../../section-2a/README.md#initializing-an-object-fifo), the `consumerTiles` input can be either a single tile or an array of tiles. When the input is specified as an array of tiles, this creates a broadcast communication from a single producer tile to multiple consumer tiles. The data from the producer tile's memory module is sent to each of the consumer tiles' memory modules via the AXI stream interconnect, which handles the back-pressure from consumers with different execution times. The AXI stream is also where the low-level copy of the data is made before being sent to each of the consumers. To achieve the broadcast, the lowering will use one output port of the producer tile to establish a connection to all consumer tiles, as shown in the figure below:
+
+
+
+For more low-level details regarding how the objects in the Object FIFO are transferred via the AXI stream through the DMAs of the producer and consumer tiles please see the mlir-aie [tutorials](/mlir-aie/tutorials/tutorial-7/). They are, however, not required to understand or use the Object FIFO API.
+
+Below is an example of the Object FIFO `of0` shown in the previous figure. It has a depth of `3` with one producer tile A and three consumer tiles B, C and D:
+```python
+A = tile(1, 1)
+B = tile(1, 3)
+C = tile(2, 3)
+D = tile(3, 3)
+of0 = object_fifo("objfifo0", A, [B, C, D], 3, T.memref(256, T.i32()))
+```
+
+The `depth` input of an Object FIFO can also be specified as an array of integers, which describe the number of objects that are available to each tile (the producer tile plus each consumer tile) when accessing the Object FIFO. For the previous example, each of the four tiles has a resource pool of 3 objects available to perform the data movement of `of0`.
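+
+As a sketch, the same per-tile pools written out explicitly as an array (producer first, then the consumers in the order they were listed, following the convention used in the skip-connection example below) would be:
+```python
+of0 = object_fifo("objfifo0", A, [B, C, D], [3, 3, 3, 3], T.memref(256, T.i32()))
+```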
+
+> **NOTE:** This functionality of the Object FIFO primitive exposes what is actually going on at the hardware level when the data movement is established for a broadcast. The object pool of the Object FIFO is not a single structure but rather composed of several pools of objects that are allocated in the memory module of each tile involved in the data movement. Specifying the `depth` as an array of integers allows the user full control to set the sizes of the pools on each individual tile. Please see [Section 2a](../../section-2a/README.md/#specifying-the-object-fifo-depth-as-an-array) for more details.
+
+The main advantage of this feature comes to light during a situation like the one showcased in the example below, which we refer to as a broadcast with a skip-connection. In the example below two Object FIFOs are created: `of0` is a broadcast from producer tile A to consumer tiles B and C, while `of1` is a 1-to-1 data movement from producer tile B to consumer tile C. We refer to `of0` as a skip-connection because it skips over B in the A → B → C chain when connecting A → C.
+```python
+A = tile(1, 3)
+B = tile(2, 3)
+C = tile(2, 4)
+of0 = object_fifo("objfifo0", A, [B, C], 1, T.memref(256, T.i32()))
+of1 = object_fifo("objfifo1", B, C, 1, T.memref(256, T.i32()))
+```
+
+The situation can be viewed as in the figure below:
+
+
+
+Now assume that the processes running on the two consumers of `of0` are as in the code snippet below.
+```python
+@core(B)
+def core_body():
+ elem0 = of0.acquire(ObjectFifoPort.Consume, 1)
+ elem1 = of1.acquire(ObjectFifoPort.Produce, 1)
+ call(test_func2, [elem0, elem1])
+ of0.release(ObjectFifoPort.Consume, 1)
+ of1.release(ObjectFifoPort.Produce, 1)
+
+@core(C)
+def core_body():
+ elem0 = of0.acquire(ObjectFifoPort.Consume, 1)
+ elem1 = of1.acquire(ObjectFifoPort.Consume, 1)
+ call(test_func2, [elem0, elem1])
+ of0.release(ObjectFifoPort.Consume, 1)
+ of1.release(ObjectFifoPort.Consume, 1)
+```
+We can see that C requires one object from both `of0` and `of1` before it can proceed with its execution. However, B also requires an object from `of0` before it can produce the data for `of1`. Because C is waiting on B, the two tiles do not have the same rate of consumption from the broadcast connection and this results in the production rate of A being impacted.
+
+To further represent this we can take the slightly lower-level view that the consumer tiles each have a pool of objects allocated for their Object FIFOs. To simplify things, only the pools used by the consumers are shown (for example, for `of1` only the pool on the side of consumer tile C is visible). Currently, all the pools have a depth of `1`.
+
+
+To avoid having the production of A impacted by the skip-connection, an additional object is required by C for `of0`. It can be used as buffering space for data coming from `of0` while waiting for the data from B via `of1`. To achieve this `of0` is created with an array of integers for its `depth`:
+```python
+of0 = object_fifo("objfifo0", A, [B, C], [1, 1, 2], T.memref(256, T.i32()))
+```
+where tiles A and B retain the original depth of 1 while C now has a depth of 2 objects. This change can be visualized as in the figure below where the pool of objects of `of0` in tile C has increased:
+
+
+
+-----
+[[Prev](../01_Reuse/)] [[Up](..)] [[Next](../03_Link_Distribute_Join/)]
diff --git a/programming_guide/section-2/section-2b/03_Link_Distribute_Join/README.md b/programming_guide/section-2/section-2b/03_Link_Distribute_Join/README.md
new file mode 100644
index 0000000000..d550f214f1
--- /dev/null
+++ b/programming_guide/section-2/section-2b/03_Link_Distribute_Join/README.md
@@ -0,0 +1,87 @@
+
+
+# Object FIFO Distribute & Join Patterns with Object FIFO Link
+
+### Object FIFO Link
+
+By design, an Object FIFO handles both the configuration of the data movement between the producer and consumer tiles, as well as the allocation of objects over the memory modules of the tiles. In order to put data consumed from one Object FIFO into another Object FIFO, the user could explicitly do this in the core code of a shared tile between the two FIFOs. However, if the goal is to simply copy data from one Object FIFO to the other without modifying it, doing it in the manner described above results in allocating more objects than necessary, i.e., the data being copied to the second Object FIFO is already available in the first one. Additionally, Shim tiles and Mem tiles do not have a core on which the copy can be done explicitly.
+
+Instead of an explicit copy, the Object FIFO API provides an implicit copy via an `object_fifo_link`, which can be initialized using its class constructor (defined in [aie.py](../../../../python/dialects/aie.py)):
+```python
+class object_fifo_link(ObjectFifoLinkOp):
+ def __init__(
+ self,
+ fifoIns,
+ fifoOuts,
+ )
+```
+A link allows the user to specify a set of input Object FIFOs via the `fifoIns` input and a set of output ones via the `fifoOuts` input. Each Object FIFO may be specified either using its `name` or its Python object. Both inputs can be either a single Object FIFO or an array of them. It is required that there exists at least one shared tile between the consumer tiles of `fifoIns` and the producer tiles of `fifoOuts` for a link to be valid. This is because the implicit copy of data will be done using the Data Movement Accelerators (DMAs) of that tile.
+
+Below is an example of a link created between two FIFOs `of0` and `of1`, where tile B is the shared tile between them:
+```python
+A = tile(1, 0)
+B = tile(1, 1)
+C = tile(1, 3)
+of0 = object_fifo("objfifo0", A, B, 2, T.memref(256, T.i32()))
+of1 = object_fifo("objfifo1", B, C, 2, T.memref(256, T.i32()))
+object_fifo_link(of0, of1)
+```
+
+Depending on how many Object FIFOs are specified in `fifoIns` and `fifoOuts`, two different data patterns can be achieved: a Distribute or a Join. They are described in the two next subsections. Currently, it is not possible to do both patterns at once, i.e., if `fifoIns` is an array then `fifoOuts` can only be a single Object FIFO, and the other way around.
+
+A full design example that uses this feature is available in Section 2e: [03_external_mem_to_core_L2](../../section-2e/03_external_mem_to_core_L2/).
+
+### Link & Distribute
+
+By using the link with one input Object FIFO and multiple output Object FIFOs, a user can describe a distribute pattern where parts of data in every object from the producer tile are distributed to each output FIFO. The `datatype` of the output FIFOs should be of a smaller size than the input one, and the sum of the sizes of the output FIFOs should equal the size of the `datatype` of the input FIFO.
+
+Currently, the Object FIFO lowering uses the order in which the output FIFOs are specified in the `fifoOuts` to know which part of the input object should go to each output FIFO. To achieve the distribute, the lowering will use one output port of the shared tile to establish a connection per output FIFO, as in the figure below:
+
+
+
+The following code snippet describes the figure above. There are three Object FIFOs: `of0` has a producer tile A and a consumer tile B, while `of1` and `of2` have B as their producer tile and C and D respectively as their consumer tiles. The link specifies that data from `of0` is distributed to `of1` and `of2`. In this link, B is the shared tile where the implicit data copy will take place via B's DMAs. We can also note how `of1` and `of2`'s datatypes are half of `of0`'s, which means that the first half of each object in `of0` will go to `of1` and the second half to `of2`, based on their order in the link.
+```python
+A = tile(1, 0)
+B = tile(1, 1)
+C = tile(1, 3)
+D = tile(2, 3)
+of0 = object_fifo("objfifo0", A, B, 2, T.memref(256, T.i32()))
+of1 = object_fifo("objfifo1", B, C, 2, T.memref(128, T.i32()))
+of2 = object_fifo("objfifo2", B, D, 2, T.memref(128, T.i32()))
+object_fifo_link(of0, [of1, of2])
+```
+
+A full design example that uses this feature is available in Section 2e: [04_distribute_L2](../../section-2e/04_distribute_L2/).
+
+### Link & Join
+
+The join pattern is the opposite of the distribute pattern in that the link will have multiple input Object FIFOs and a single output Object FIFO. With this pattern the user can combine the smaller inputs from multiple sources into a single bigger output data movement. The `datatype` of the input FIFOs should be of a smaller size than the output one, and the sum of the sizes of the input FIFOs should be equal to the size of the `datatype` of the output FIFO.
+
+Similarly, the order in `fifoIns` specifies which input object will make up which part of the larger objects of the output Object FIFO. To achieve the join, the lowering will use one input port of the shared tile to establish a connection per input FIFO, as in the figure below:
+
+
+
+The following code snippet describes the figure above. There are three Object FIFOs: `of0` has a producer tile B and a consumer tile A, while `of1` and `of2` have C and D respectively as their producer tiles and B as their consumer tile. The link specifies that data from `of1` and `of2` is joined into `of0`. In this link, B is the shared tile where the implicit data copy will take place via B's DMAs. We can also note how `of1` and `of2`'s datatypes are half of `of0`'s, which means that objects from `of1` will become the first half of each object in `of0` while objects from `of2` will become the second half, based on their order in the link.
+```python
+A = tile(1, 0)
+B = tile(1, 1)
+C = tile(1, 3)
+D = tile(2, 3)
+of0 = object_fifo("objfifo0", B, A, 2, T.memref(256, T.i32()))
+of1 = object_fifo("objfifo1", C, B, 2, T.memref(128, T.i32()))
+of2 = object_fifo("objfifo2", D, B, 2, T.memref(128, T.i32()))
+object_fifo_link([of1, of2], of0)
+```
+
+A full design example that uses these features is available in Section 2e: [05_join_L2](../../section-2e/05_join_L2/).
+
+-----
+[[Prev](../02_Broadcast/)] [[Up](..)] [[Next - Section 2c](../../section-2c/)]
diff --git a/programming_guide/section-2/section-2b/README.md b/programming_guide/section-2/section-2b/README.md
new file mode 100644
index 0000000000..ac883aef17
--- /dev/null
+++ b/programming_guide/section-2/section-2b/README.md
@@ -0,0 +1,31 @@
+
+
+# Section 2b - Key Object FIFO Patterns
+
+The Object FIFO primitive supports several data movement patterns. We will now describe each of the currently supported patterns in three subsections and provide links to more in-depth practical code examples that showcase each of them.
+
+Object FIFO Reuse Pattern
+
+* Reuse the unreleased objects of an Object FIFO
+
+Object FIFO Broadcast Pattern
+
+* Broadcast data from one producer to multiple consumers
+
+Object FIFO Distribute & Join Patterns with Object FIFO Link
+
+* Implicit copy of data from one Object FIFO to another via an Object FIFO Link
+* Distribute different pieces of the input data to multiple consumers
+* Join outputs from different consumers into a bigger data tensor
+
+
+-----
+[[Prev - Section 2a](../section-2a/)] [[Up](..)] [[Next - Section 2c](../section-2c/)]
diff --git a/programming_guide/section-2/section-2c/README.md b/programming_guide/section-2/section-2c/README.md
new file mode 100644
index 0000000000..d1a7ff98ad
--- /dev/null
+++ b/programming_guide/section-2/section-2c/README.md
@@ -0,0 +1,123 @@
+
+
+# Section 2c - Data Layout Transformations
+
+While the Object FIFO primitive aims to reduce the complexity tied to data movement configuration on the AI Engine array, it also gives the user control over some advanced features of the underlying architecture. One such feature is the ability to do data layout transformations on the fly using the tile's dedicated hardware: the Data Movement Accelerators (DMAs). **This is available on AIE-ML devices.**
+
+Tile DMAs interact directly with the memory modules of their tiles and are responsible for pushing and retrieving data to and from the AXI stream interconnect. When data is pushed onto the stream, the user can program the DMA's n-dimensional address generation scheme such that the data's layout when pushed may be different than how it is stored in the tile's local memory. In the same way, a user can also specify in what layout a DMA should store the data retrieved from the AXI stream.
+
+DMA blocks contain buffer descriptor operations that summarize what data is being moved, from what offset, how much of it, and in what layout. These buffer descriptors are the `AIE_DMABDOp` operations in MLIR and have their own auto-generated python binding (available under `/python/aie/dialects/_aie_ops_gen.py` after the repository is built):
+```python
+def dma_bd(
+    buffer,
+    *,
+    offset=None,
+    len=None,
+    dimensions=None,
+    bd_id=None,
+    next_bd_id=None,
+    loc=None,
+    ip=None
+)
+```
+It is not necessary to understand these low-level operations in order to use the data layout transformations with the Object FIFO primitive.
+
+A data layout transformation is presented as a tuple of pairs, where each pair represents a `size` and a `stride` for a particular dimension of the data:
+```c
+[<size_2, stride_2>, <size_1, stride_1>, <size_0, stride_0>]
+```
+Transformations can be expressed in up to three dimensions on each compute and Shim tile, and in up to four dimensions on Mem tiles. The first pair of this array gives the outer-most dimension's size and stride `<size_2, stride_2>`, while the last pair of the array gives the inner-most dimension's size and stride `<size_0, stride_0>`. All strides are expressed in **multiples of the element width**.
+
+> **NOTE:** By design, for 4B data types the inner-most dimension's stride must be 1.
+
+Data layout transformations can be viewed as a way to specify to the hardware which location in the data to access next, and as such it is possible to model the access pattern using a series of nested loops. For example, the transformation using the sizes and strides from above can be expressed as:
+```c
+int *buffer;
+for(int i = 0; i < size_2; i++)
+    for(int j = 0; j < size_1; j++)
+        for(int k = 0; k < size_0; k++)
+            // access/store element at/to buffer[  i * stride_2
+            //                                   + j * stride_1
+            //                                   + k * stride_0]
+```
+
+As a practical example, here is an access pattern that corresponds to alternating between even and odd elements every 8 elements in a 128 element buffer/stream:
+```mlir
+aie.dma_bd(%buf : memref<128xi32>, 0, 128, [<8, 16>, <2, 1>, <8, 2>])
+```
+which translates to:
+```c
+for(int i = 0; i < 8; i++)         // size_2
+    for(int j = 0; j < 2; j++)     // size_1
+        for(int k = 0; k < 8; k++) // size_0
+            // access/store element at/to index:
+            // (
+            //     i * 16  // stride_2
+            //   + j * 1   // stride_1
+            //   + k * 2   // stride_0
+            // )
+```
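+
+As a quick sanity check, the same index sequence can be enumerated on the host in plain Python (illustrative only, not part of the design code):
+```python
+idx = [i * 16 + j * 1 + k * 2 for i in range(8) for j in range(2) for k in range(8)]
+print(idx[:16])  # [0, 2, 4, 6, 8, 10, 12, 14, 1, 3, 5, 7, 9, 11, 13, 15]
+```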
+
+### Data Layout Transformations with the Object FIFO
+
+Remember that the Object FIFO class constructor has two default-valued inputs: `dimensionsToStream` and `dimensionsFromStreamPerConsumer`.
+```python
+class object_fifo:
+ def __init__(
+ self,
+ name,
+ producerTile,
+ consumerTiles,
+ depth,
+ datatype,
+ dimensionsToStream=None,
+ dimensionsFromStreamPerConsumer=None,
+ )
+```
+
+Our compiler directly lowers Object FIFOs that make use of the aforementioned data layout transformations to `AIE_DMABDOp` operations. You can use the `dimensionsToStream` input to describe the order in which the `producerTile`'s DMA should push the objects onto the stream. Similarly, the `dimensionsFromStreamPerConsumer` input describes, for the DMA of each individual tile in `consumerTiles`, in what layout to retrieve the objects from the stream.
+
+As an example, the Object FIFO in the code below contains objects with datatype `<4x8xi8>`. Using the `dimensionsToStream` input it performs a data layout transformation on the producer tile side that pushes elements from memory onto the stream as follows: For every even length-8 row, select the first three even-indexed elements.
+```python
+A = tile(1, 1)
+B = tile(1, 3)
+of0 = object_fifo(
+    "objfifo0",
+    A,
+    B,
+    3,
+    T.memref((4, 8), T.i8()),
+    [
+        (2, 16),
+        (3, 2),
+    ],
+)
+```
+The access pattern of the transformation can be written as:
+```c
+for(int i = 0; i < 2; i++)     // size_1
+    for(int j = 0; j < 3; j++) // size_0
+        // access/store element at/to index:
+        // (
+        //     i * 16  // stride_1
+        //   + j * 2   // stride_0
+        // )
+```
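+
+The selected linear indices can again be enumerated on the host for verification (illustrative only):
+```python
+idx = [i * 16 + j * 2 for i in range(2) for j in range(3)]
+print(idx)  # [0, 2, 4, 16, 18, 20] -> elements 0, 2 and 4 of rows 0 and 2 in a <4x8> buffer
+```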
+and further represented as in the image below:
+
+
+
+Other examples containing data layout transformations are available in the [programming_examples](../../../programming_examples/). A few notable ones are [matrix_vector_multiplication](../../../programming_examples/basic/matrix_multiplication/matrix_vector/) and [matrix_multiplication_whole_array](../../../programming_examples/basic/matrix_multiplication/whole_array/).
+
+-----
+[[Prev - Section 2b](../section-2b/)] [[Up](..)] [[Next - Section 2d](../section-2d/)]
diff --git a/programming_guide/section-2/section-2d/README.md b/programming_guide/section-2/section-2d/README.md
new file mode 100644
index 0000000000..d9008649c2
--- /dev/null
+++ b/programming_guide/section-2/section-2d/README.md
@@ -0,0 +1,137 @@
+
+
+# Section 2d - Programming for multiple cores
+
+This section focuses on the process of taking code written for a single core and transforming it into a design with multiple cores relatively quickly. For this we will start with the code in [aie2.py](./aie2.py), which contains a simple design running on a single compute tile, and progressively turn it into the code in [aie2_multi.py](./aie2_multi.py), which distributes the same work across three compute tiles.
+
+The first step in the design is the tile declaration. In the simple design we use one Shim tile to bring data from external memory into a Mem tile in the AIE array; the Mem tile then sends the data to a compute tile, waits for the output, and sends it back to external memory through the Shim tile. Below is how those tiles are declared in the simple design:
+```python
+ShimTile = tile(0, 0)
+MemTile = tile(0, 1)
+ComputeTile = tile(0, 2)
+```
+For our scale out design we will keep using a single Shim tile and a single Mem tile, but we will increase the number of compute tiles to three. We can do so cleanly and efficiently in the following way:
+```python
+n_cores = 3
+
+ShimTile = tile(0, 0)
+MemTile = tile(0, 1)
+ComputeTiles = [tile(0, 2 + i) for i in range(n_cores)]
+```
+Each compute tile can now be accessed by indexing into the `ComputeTiles` array.
+
+Once the tiles have been declared, the next step is to set up the data movement using Object FIFOs. The simple design has a total of four double-buffered Object FIFOs and two `object_fifo_links`. The Object FIFOs move objects of datatype `<48xi32>`. `of_in` brings data from the Shim tile to the Mem tile and is linked to `of_in0` which brings data from the Mem tile to the compute tile. For the output side, `of_out0` brings data from the compute tile to the Mem tile where it is linked to `of_out` to bring the data out through the Shim tile. The corresponding code is shown below:
+```python
+data_size = 48
+buffer_depth = 2
+memRef_data_ty = T.memref(data_size, T.i32())
+
+
+# Input data movement
+
+of_in = object_fifo("in", ShimTile, MemTile, buffer_depth, memRef_data_ty)
+of_in0 = object_fifo("in0", MemTile, ComputeTile, buffer_depth, memRef_data_ty)
+object_fifo_link(of_in, of_in0)
+
+
+# Output data movement
+
+of_out = object_fifo("out", MemTile, ShimTile, buffer_depth, memRef_data_ty)
+of_out0 = object_fifo("out0", ComputeTile, MemTile, buffer_depth, memRef_data_ty)
+object_fifo_link(of_out0, of_out)
+```
+
+
+
+We can apply the same method as in the tile declaration to generate the data movement from the Mem tile to the three compute tiles and back ([see distribute and join patterns](../section-2b/03_Link_Distribute_Join/README.md)). The `object_fifo_link` operations change from the 1-to-1 case to distributing the original `<48xi32>` data tensors to the three compute tiles as smaller `<16xi32>` tensors on the input side, and to joining the output from each compute tile to the Mem tile on the output side. A list of names and a map from names to Object FIFO is used in order to keep track of the input and output Object FIFOs. With these changes the code becomes:
+```python
+n_cores = 3
+data_size = 48
+tile_size = data_size // n_cores
+
+buffer_depth = 2
+memRef_data_ty = T.memref(data_size, T.i32())
+memRef_tiles_ty = T.memref(tile_size, T.i32())
+
+# Input data movement
+
+inX_fifo_names = [f"in{i}" for i in range(n_cores)] # list of input object FIFO names
+inX_fifos = {} # map name to its object FIFO
+
+of_in = object_fifo("in", ShimTile, MemTile, buffer_depth, memRef_data_ty)
+for i in range(n_cores):
+ inX_fifos[inX_fifo_names[i]] = object_fifo(
+ inX_fifo_names[i], MemTile, ComputeTiles[i], buffer_depth, memRef_tiles_ty
+ )
+object_fifo_link(of_in, inX_fifo_names)
+
+
+# Output data movement
+
+outX_fifo_names = [f"out{i}" for i in range(n_cores)] # list of output object FIFO names
+outX_fifos = {} # map name to its object FIFO
+
+of_out = object_fifo("out", ShimTile, MemTile, buffer_depth, memRef_data_ty)
+for i in range(n_cores):
+ outX_fifos[outX_fifo_names[i]] = object_fifo(
+ outX_fifo_names[i], ComputeTiles[i], MemTile, buffer_depth, memRef_tiles_ty
+ )
+object_fifo_link(outX_fifo_names, of_out)
+```
+
+
+
+The core of this simple design acquires one object from each Object FIFO, adds `1` to each entry of the incoming data, stores the result in the object of the outgoing Object FIFO, then releases both objects:
+```python
+@core(ComputeTile)
+def core_body():
+ # Effective while(1)
+ for _ in for_(0xFFFFFFFF):
+ elem_in = of_in0.acquire(ObjectFifoPort.Consume, 1)
+ elem_out = of_out0.acquire(ObjectFifoPort.Produce, 1)
+ for i in for_(data_size):
+ v0 = memref.load(elem_in, [i])
+ v1 = arith.addi(v0, arith.constant(1, T.i32()))
+ memref.store(v1, elem_out, [i])
+ yield_([])
+ of_in0.release(ObjectFifoPort.Consume, 1)
+ of_out0.release(ObjectFifoPort.Produce, 1)
+ yield_([])
+```
+Once again we apply the same logic and use a `for`-loop over our three cores to write the code which will be executed on the three compute tiles. Each tile will index the `inX_fifos` and `outX_fifos` maps to retrieve the Object FIFOs it will acquire and release from. This process results in the following code:
+```python
+for i in range(n_cores):
+ # Compute tile i
+ @core(ComputeTiles[i])
+ def core_body():
+ for _ in for_(0xFFFFFFFF):
+ elem_in = inX_fifos[inX_fifo_names[i]].acquire(
+ ObjectFifoPort.Consume, 1
+ )
+ elem_out = outX_fifos[outX_fifo_names[i]].acquire(
+ ObjectFifoPort.Produce, 1
+ )
+            for j in for_(tile_size):
+                v0 = memref.load(elem_in, [j])
+                v1 = arith.addi(v0, arith.constant(1, T.i32()))
+                memref.store(v1, elem_out, [j])
+ yield_([])
+ inX_fifos[inX_fifo_names[i]].release(
+ ObjectFifoPort.Consume, 1
+ )
+ outX_fifos[outX_fifo_names[i]].release(
+ ObjectFifoPort.Produce, 1
+ )
+ yield_([])
+```
+
+-----
+[[Prev - Section 2c](../section-2c/)] [[Up](..)] [[Next - Section 2e](../section-2e/)]
diff --git a/programming_guide/section-2/section-2d/aie2.py b/programming_guide/section-2/section-2d/aie2.py
new file mode 100644
index 0000000000..6382ddeef2
--- /dev/null
+++ b/programming_guide/section-2/section-2d/aie2.py
@@ -0,0 +1,73 @@
+#
+# This file is licensed under the Apache License v2.0 with LLVM Exceptions.
+# See https://llvm.org/LICENSE.txt for license information.
+# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+#
+# (c) Copyright 2023 AMD Inc.
+
+from aie.dialects.aie import * # primary mlir-aie dialect definitions
+from aie.extras.context import mlir_mod_ctx # mlir ctx wrapper
+
+from aie.dialects.aiex import * # extended mlir-aie dialect definitions
+from aie.dialects.scf import *  # scf (structured control flow) dialect
+from aie.extras.dialects.ext import memref, arith  # memref and arithmetic dialects
+
+buffer_depth = 2
+data_size = 48
+
+
+# AI Engine structural design function
+def mlir_aie_design():
+ # ctx wrapper - to convert python to mlir
+ with mlir_mod_ctx() as ctx:
+
+ # Device declaration - aie2 device xcvc1902
+ @device(AIEDevice.xcvc1902)
+ def device_body():
+ memRef_48_ty = T.memref(data_size, T.i32())
+
+ # Tile(s) declarations
+ ShimTile = tile(0, 0)
+ MemTile = tile(0, 1)
+ ComputeTile = tile(0, 2)
+
+ # Data movement with object FIFOs
+
+ # Input data movement
+
+ of_in = object_fifo("in", ShimTile, MemTile, buffer_depth, memRef_48_ty)
+            of_in0 = object_fifo(
+                "in0", MemTile, ComputeTile, buffer_depth, memRef_48_ty
+            )
+            object_fifo_link(of_in, of_in0)
+
+ # Output data movement
+
+ of_out = object_fifo("out", MemTile, ShimTile, buffer_depth, memRef_48_ty)
+            of_out0 = object_fifo(
+                "out0", ComputeTile, MemTile, buffer_depth, memRef_48_ty
+            )
+            object_fifo_link(of_out0, of_out)
+
+ # Set up compute tiles
+ @core(ComputeTile)
+ def core_body():
+ # Effective while(1)
+ for _ in for_(0xFFFFFFFF):
+                    elem_in = of_in0.acquire(ObjectFifoPort.Consume, 1)
+                    elem_out = of_out0.acquire(ObjectFifoPort.Produce, 1)
+ for i in for_(data_size):
+ v0 = memref.load(elem_in, [i])
+ v1 = arith.addi(v0, arith.constant(1, T.i32()))
+ memref.store(v1, elem_out, [i])
+ yield_([])
+                    of_in0.release(ObjectFifoPort.Consume, 1)
+                    of_out0.release(ObjectFifoPort.Produce, 1)
+ yield_([])
+
+ # Print the mlir conversion
+ print(ctx.module)
+
+
+# Call design function to generate mlir code to stdout
+mlir_aie_design()
diff --git a/programming_guide/section-2/section-2d/aie2_multi.py b/programming_guide/section-2/section-2d/aie2_multi.py
new file mode 100644
index 0000000000..04727e7b00
--- /dev/null
+++ b/programming_guide/section-2/section-2d/aie2_multi.py
@@ -0,0 +1,103 @@
+#
+# This file is licensed under the Apache License v2.0 with LLVM Exceptions.
+# See https://llvm.org/LICENSE.txt for license information.
+# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+#
+# (c) Copyright 2023 AMD Inc.
+
+from aie.dialects.aie import * # primary mlir-aie dialect definitions
+from aie.extras.context import mlir_mod_ctx # mlir ctx wrapper
+
+from aie.dialects.aiex import * # extended mlir-aie dialect definitions
+from aie.dialects.scf import *  # scf (structured control flow) dialect
+from aie.extras.dialects.ext import memref, arith  # memref and arithmetic dialects
+
+n_cores = 3
+buffer_depth = 2
+data_size = 48
+tile_size = data_size // n_cores
+
+
+# AI Engine structural design function
+def mlir_aie_design():
+ # ctx wrapper - to convert python to mlir
+ with mlir_mod_ctx() as ctx:
+
+ # Device declaration - aie2 device xcvc1902
+ @device(AIEDevice.xcvc1902)
+ def device_body():
+ memRef_16_ty = T.memref(16, T.i32())
+ memRef_48_ty = T.memref(48, T.i32())
+
+ # Tile(s) declarations
+ ShimTile = tile(0, 0)
+ MemTile = tile(0, 1)
+ ComputeTiles = [tile(0, 2 + i) for i in range(n_cores)]
+
+ # Data movement with object FIFOs
+
+ # Input data movement
+
+ inX_fifo_names = [
+ f"in{i}" for i in range(n_cores)
+ ] # list of input object FIFO names
+ inX_fifos = {} # map name to its object FIFO
+
+ of_in = object_fifo("in", ShimTile, MemTile, buffer_depth, memRef_48_ty)
+ for i in range(n_cores):
+ inX_fifos[inX_fifo_names[i]] = object_fifo(
+ inX_fifo_names[i],
+ MemTile,
+ ComputeTiles[i],
+ buffer_depth,
+ memRef_16_ty,
+ )
+ object_fifo_link(of_in, inX_fifo_names[0:n_cores])
+
+ # Output data movement
+
+ outX_fifo_names = [
+ f"out{i}" for i in range(n_cores)
+ ] # list of output object FIFO names
+ outX_fifos = {} # map name to its object FIFO
+
+ of_out = object_fifo("out", MemTile, ShimTile, buffer_depth, memRef_48_ty)
+ for i in range(n_cores):
+ outX_fifos[outX_fifo_names[i]] = object_fifo(
+ outX_fifo_names[i],
+ ComputeTiles[i],
+ MemTile,
+ buffer_depth,
+ memRef_16_ty,
+ )
+ object_fifo_link(outX_fifo_names[0:n_cores], of_out)
+
+ # Set up compute tiles
+ for i in range(n_cores):
+ # Compute tile i
+ @core(ComputeTiles[i])
+ def core_body():
+ for _ in for_(0xFFFFFFFF):
+ elem_in = inX_fifos[inX_fifo_names[i]].acquire(
+ ObjectFifoPort.Consume, 1
+ )
+ elem_out = outX_fifos[outX_fifo_names[i]].acquire(
+ ObjectFifoPort.Produce, 1
+ )
+ for j in for_(tile_size):
+ v0 = memref.load(elem_in, [j])
+ v1 = arith.addi(v0, arith.constant(1, T.i32()))
+ memref.store(v1, elem_out, [j])
+ yield_([])
+ inX_fifos[inX_fifo_names[i]].release(ObjectFifoPort.Consume, 1)
+ outX_fifos[outX_fifo_names[i]].release(
+ ObjectFifoPort.Produce, 1
+ )
+ yield_([])
+
+ # Print the mlir conversion
+ print(ctx.module)
+
+
+# Call design function to generate mlir code to stdout
+mlir_aie_design()
diff --git a/programming_guide/section-2/section-2e/01_single_double_buffer/README.md b/programming_guide/section-2/section-2e/01_single_double_buffer/README.md
new file mode 100644
index 0000000000..ace7c9cb8f
--- /dev/null
+++ b/programming_guide/section-2/section-2e/01_single_double_buffer/README.md
@@ -0,0 +1,30 @@
+
+
+# Single / Double Buffer
+
+The design in [single_buffer.py](./single_buffer.py) uses an Object FIFO `of_in` to transfer data from producer tile `ComputeTile2` to consumer tile `ComputeTile3`. The Object FIFO has a depth of `1` which describes a single buffer between the two tiles as shown in the figure below.
+
+
+
+Both the producer and the consumer processes in this design have trivial tasks. The producer process running on `ComputeTile2` acquires the single buffer and writes `1` into all of its entries before releasing it for consumption. The consumer process running on `ComputeTile3` acquires the single buffer and immediately releases it back to the producer.
+
+To have this design use a double, or ping-pong, buffer for the data transfer instead, the user need only set the depth of the Object FIFO to `2`. No other change is required as the Object FIFO lowering will take care of properly cycling between the ping and pong buffers. To change the depth of `of_in` the user should write:
+```python
+of_in = object_fifo("in", ComputeTile2, ComputeTile3, 2, memRef_16_ty) # double buffer
+```
+This change effectively increases the number of available resources of the Object FIFO as is shown in the figure below:
+
+
+
+All examples available in the [programming_examples](../../../../programming_examples/) contain this data movement pattern.
+
+-----
+[[Up](..)] [[Next](../02_external_mem_to_core/)]
diff --git a/programming_guide/section-2/section-2e/01_single_double_buffer/single_buffer.py b/programming_guide/section-2/section-2e/01_single_double_buffer/single_buffer.py
new file mode 100644
index 0000000000..46a5f763a8
--- /dev/null
+++ b/programming_guide/section-2/section-2e/01_single_double_buffer/single_buffer.py
@@ -0,0 +1,58 @@
+#
+# This file is licensed under the Apache License v2.0 with LLVM Exceptions.
+# See https://llvm.org/LICENSE.txt for license information.
+# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+#
+# (c) Copyright 2024 AMD Inc.
+
+from aie.dialects.aie import *
+from aie.dialects.aiex import *
+from aie.dialects.scf import *
+from aie.extras.dialects.ext import memref, arith
+from aie.extras.context import mlir_mod_ctx
+
+
+def single_buffer():
+ with mlir_mod_ctx() as ctx:
+
+ @device(AIEDevice.npu)
+ def device_body():
+ memRef_16_ty = T.memref(16, T.i32())
+
+ # Tile declarations
+ ComputeTile2 = tile(0, 2)
+ ComputeTile3 = tile(0, 3)
+
+ # AIE-array data movement with object fifos
+ # Input
+ of_in = object_fifo(
+ "in", ComputeTile2, ComputeTile3, 1, memRef_16_ty
+ ) # single buffer
+
+ # Set up compute tiles
+ # Compute tile 2
+ @core(ComputeTile2)
+ def core_body():
+ # Effective while(1)
+                # Run for 8 iterations
+ elem_out = of_in.acquire(ObjectFifoPort.Produce, 1)
+ for i in for_(16):
+ v1 = arith.constant(1, T.i32())
+ memref.store(v1, elem_out, [i])
+ yield_([])
+ of_in.release(ObjectFifoPort.Produce, 1)
+ yield_([])
+
+ # Compute tile 3
+ @core(ComputeTile3)
+ def core_body():
+                # Run for 8 iterations
+ for _ in for_(8):
+ elem_in = of_in.acquire(ObjectFifoPort.Consume, 1)
+ of_in.release(ObjectFifoPort.Consume, 1)
+ yield_([])
+
+ print(ctx.module)
+
+
+single_buffer()
diff --git a/programming_guide/section-2/section-2e/02_external_mem_to_core/CMakeLists.txt b/programming_guide/section-2/section-2e/02_external_mem_to_core/CMakeLists.txt
new file mode 100644
index 0000000000..5da03ef2b2
--- /dev/null
+++ b/programming_guide/section-2/section-2e/02_external_mem_to_core/CMakeLists.txt
@@ -0,0 +1,75 @@
+# This file is licensed under the Apache License v2.0 with LLVM Exceptions.
+# See https://llvm.org/LICENSE.txt for license information.
+# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+#
+# (c) Copyright 2023 Advanced Micro Devices, Inc.
+
+# parameters
+# -DBOOST_ROOT: Path to Boost install
+# -DXRT_INC_DIR: Full path to src/runtime_src/core/include in XRT cloned repo
+# -DXRT_LIB_DIR: Path to xrt_coreutil.lib
+# -DTARGET_NAME: Target name to be built
+
+# cmake needs this line
+cmake_minimum_required(VERSION 3.1)
+
+set(CMAKE_CXX_STANDARD 23)
+set(CMAKE_CXX_STANDARD_REQUIRED YES)
+
+find_program(WSL NAMES powershell.exe)
+
+if (NOT WSL)
+ set(CMAKE_C_COMPILER gcc-13)
+ set(CMAKE_CXX_COMPILER g++-13)
+ set(BOOST_ROOT /usr/include/boost CACHE STRING "Path to Boost install")
+ set(XRT_INC_DIR /opt/xilinx/xrt/include CACHE STRING "Path to XRT cloned repo")
+ set(XRT_LIB_DIR /opt/xilinx/xrt/lib CACHE STRING "Path to xrt_coreutil.lib")
+else()
+ set(BOOST_ROOT C:/Technical/thirdParty/boost_1_83_0 CACHE STRING "Path to Boost install")
+ set(XRT_INC_DIR C:/Technical/XRT/src/runtime_src/core/include CACHE STRING "Path to XRT cloned repo")
+ set(XRT_LIB_DIR C:/Technical/xrtNPUfromDLL CACHE STRING "Path to xrt_coreutil.lib")
+endif()
+
+set(TARGET_NAME test CACHE STRING "Target to be built")
+
+SET (ProjectName proj_${TARGET_NAME})
+SET (currentTarget ${TARGET_NAME})
+
+if ( WSL )
+ set(CMAKE_RUNTIME_OUTPUT_DIRECTORY_RELEASE ${CMAKE_BINARY_DIR})
+endif ()
+
+project(${ProjectName})
+
+# Find packages
+find_package(Boost REQUIRED)
+
+add_executable(${currentTarget}
+ ${CMAKE_CURRENT_SOURCE_DIR}/../../../../runtime_lib/test_lib/test_utils.cpp
+ test.cpp
+)
+
+target_compile_definitions(${currentTarget} PUBLIC DISABLE_ABI_CHECK=1)
+
+target_include_directories (${currentTarget} PUBLIC
+ ${XRT_INC_DIR}
+ ${Boost_INCLUDE_DIRS}
+ ${CMAKE_CURRENT_SOURCE_DIR}/../../../../runtime_lib/test_lib
+)
+
+target_link_directories(${currentTarget} PUBLIC
+ ${XRT_LIB_DIR}
+ ${Boost_LIBRARY_DIRS}
+)
+
+if (NOT WSL)
+ target_link_libraries(${currentTarget} PUBLIC
+ xrt_coreutil
+ boost_program_options
+ boost_filesystem
+ )
+else()
+ target_link_libraries(${currentTarget} PUBLIC
+ xrt_coreutil
+ )
+endif()
diff --git a/programming_guide/section-2/section-2e/02_external_mem_to_core/Makefile b/programming_guide/section-2/section-2e/02_external_mem_to_core/Makefile
new file mode 100644
index 0000000000..db95484c3d
--- /dev/null
+++ b/programming_guide/section-2/section-2e/02_external_mem_to_core/Makefile
@@ -0,0 +1,41 @@
+##===- Makefile -----------------------------------------------------------===##
+#
+# This file licensed under the Apache License v2.0 with LLVM Exceptions.
+# See https://llvm.org/LICENSE.txt for license information.
+# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+#
+##===----------------------------------------------------------------------===##
+
+include ../../../../programming_examples/makefile-common
+
+targetname = ext_to_core
+devicename = npu
+col = 0
+
+all: build/final.xclbin build/insts.txt
+
+build/aie.mlir: ext_to_core.py
+ mkdir -p ${@D}
+ python3 $< ${devicename} ${col} > $@
+
+build/final.xclbin: build/aie.mlir
+ mkdir -p ${@D}
+ cd ${@D} && aiecc.py --aie-generate-cdo --no-compile-host --xclbin-name=${@F} \
+ --aie-generate-npu --npu-insts-name=insts.txt $(<:%=../%)
+
+${targetname}.exe: test.cpp
+ rm -rf _build
+ mkdir -p _build
+ cd _build && ${powershell} cmake .. -DTARGET_NAME=${targetname}
+ cd _build && ${powershell} cmake --build . --config Release
+ifeq "${powershell}" "powershell.exe"
+ cp _build/${targetname}.exe $@
+else
+ cp _build/${targetname} $@
+endif
+
+run: ${targetname}.exe build/final.xclbin build/insts.txt
+ ${powershell} ./$< -x build/final.xclbin -i build/insts.txt -k MLIR_AIE
+
+clean:
+ rm -rf build _build ${targetname}.exe
diff --git a/programming_guide/section-2/section-2e/02_external_mem_to_core/README.md b/programming_guide/section-2/section-2e/02_external_mem_to_core/README.md
new file mode 100644
index 0000000000..9d0baf0c23
--- /dev/null
+++ b/programming_guide/section-2/section-2e/02_external_mem_to_core/README.md
@@ -0,0 +1,35 @@
+
+
+# External Memory to Core
+
+The design in [ext_to_core.py](./ext_to_core.py) uses an Object FIFO `of_in` to bring data from the `ShimTile` to `ComputeTile2` and another Object FIFO `of_out` to send the data from the compute tile back to external memory. Each Object FIFO uses a double buffer.
+
+
+
+```python
+ # AIE-array data movement with object fifos
+ of_in = object_fifo("in", ShimTile, ComputeTile2, 2, memRef_24_ty) # Input
+ of_out = object_fifo("out", ComputeTile2, ShimTile, 2, memRef_24_ty) # Output
+```
+
+A single process on `ComputeTile2` acts as both consumer and producer: it acquires one object from `of_in` to consume and one object from `of_out` to produce into, reads the values of the input object and adds `1` to each entry, then releases both objects.
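+
+This corresponds to the following loop in [ext_to_core.py](./ext_to_core.py), shown here without the surrounding `for_` iteration:
+```python
+elem_in = of_in.acquire(ObjectFifoPort.Consume, 1)
+elem_out = of_out.acquire(ObjectFifoPort.Produce, 1)
+for i in for_(24):
+    v0 = memref.load(elem_in, [i])
+    v1 = arith.addi(v0, arith.constant(1, T.i32()))
+    memref.store(v1, elem_out, [i])
+    yield_([])
+of_in.release(ObjectFifoPort.Consume, 1)
+of_out.release(ObjectFifoPort.Produce, 1)
+```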
+
+It is possible to build, run and test this design with the following commands:
+```
+make
+make run
+```
+The [test.cpp](./test.cpp) as well as the `# To/from AIE-array data movement` section of the design code will be described in detail in [Section 2g](../../section-2g/).
+
+Other examples containing this data movement pattern are available in the [programming_examples](../../../../programming_examples/). A few notable ones are [vector_reduce_add](../../../../programming_examples/basic/vector_reduce_add/) and [vector_scalar_add](../../../../programming_examples/basic/vector_scalar_add/).
+
+-----
+[[Prev](../01_single_double_buffer/)] [[Up](..)] [[Next](../03_external_mem_to_core_L2/)]
diff --git a/programming_guide/section-2/section-2e/02_external_mem_to_core/ext_to_core.py b/programming_guide/section-2/section-2e/02_external_mem_to_core/ext_to_core.py
new file mode 100644
index 0000000000..b49d1ce5fd
--- /dev/null
+++ b/programming_guide/section-2/section-2e/02_external_mem_to_core/ext_to_core.py
@@ -0,0 +1,68 @@
+#
+# This file is licensed under the Apache License v2.0 with LLVM Exceptions.
+# See https://llvm.org/LICENSE.txt for license information.
+# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+#
+# (c) Copyright 2024 AMD Inc.
+
+from aie.dialects.aie import *
+from aie.dialects.aiex import *
+from aie.dialects.scf import *
+from aie.extras.dialects.ext import memref, arith
+from aie.extras.context import mlir_mod_ctx
+
+
+def external_mem_to_core():
+ with mlir_mod_ctx() as ctx:
+
+ @device(AIEDevice.npu)
+ def device_body():
+ memRef_24_ty = T.memref(24, T.i32())
+
+ # Tile declarations
+ ShimTile = tile(0, 0)
+ ComputeTile2 = tile(0, 2)
+
+ # AIE-array data movement with object fifos
+ # Input
+ of_in = object_fifo("in", ShimTile, ComputeTile2, 2, memRef_24_ty)
+
+ # Output
+ of_out = object_fifo("out", ComputeTile2, ShimTile, 2, memRef_24_ty)
+
+ # Set up compute tiles
+
+ # Compute tile 2
+ @core(ComputeTile2)
+ def core_body():
+ # Effective while(1)
+ for _ in for_(2):
+ elem_in = of_in.acquire(ObjectFifoPort.Consume, 1)
+ elem_out = of_out.acquire(ObjectFifoPort.Produce, 1)
+ for i in for_(24):
+ v0 = memref.load(elem_in, [i])
+ v1 = arith.addi(v0, arith.constant(1, T.i32()))
+ memref.store(v1, elem_out, [i])
+ yield_([])
+ of_in.release(ObjectFifoPort.Consume, 1)
+ of_out.release(ObjectFifoPort.Produce, 1)
+ yield_([])
+
+ # To/from AIE-array data movement
+
+ memRef_48_ty = T.memref(48, T.i32())
+
+ @FuncOp.from_py_func(memRef_48_ty, memRef_48_ty, memRef_48_ty)
+ def sequence(inTensor, notUsed, outTensor):
+ npu_dma_memcpy_nd(
+ metadata="out", bd_id=0, mem=outTensor, sizes=[1, 1, 1, 48]
+ )
+ npu_dma_memcpy_nd(
+ metadata="in", bd_id=1, mem=inTensor, sizes=[1, 1, 1, 48]
+ )
+ npu_sync(column=0, row=0, direction=0, channel=0)
+
+ print(ctx.module)
+
+
+external_mem_to_core()
diff --git a/programming_guide/section-2/section-2e/02_external_mem_to_core/run.lit b/programming_guide/section-2/section-2e/02_external_mem_to_core/run.lit
new file mode 100644
index 0000000000..543fdcc70b
--- /dev/null
+++ b/programming_guide/section-2/section-2e/02_external_mem_to_core/run.lit
@@ -0,0 +1,10 @@
+// (c) Copyright 2024 Advanced Micro Devices, Inc.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+// REQUIRES: ryzen_ai
+//
+// RUN: %python %S/ext_to_core.py npu 0 | aie-opt -cse -canonicalize -o ./aie.mlir
+// RUN: %python aiecc.py --aie-generate-cdo --aie-generate-npu --no-compile-host --xclbin-name=aie.xclbin --npu-insts-name=insts.txt ./aie.mlir
+// RUN: g++ %S/test.cpp -o test.exe -std=c++23 -Wall -I%S/../../../../runtime_lib/test_lib %S/../../../../runtime_lib/test_lib/test_utils.cpp %xrt_flags -lrt -lstdc++ -lboost_program_options -lboost_filesystem
+// RUN: %run_on_npu ./test.exe -x aie.xclbin -k MLIR_AIE -i insts.txt | FileCheck %s
+// CHECK: PASS!
diff --git a/programming_guide/section-2/section-2e/02_external_mem_to_core/test.cpp b/programming_guide/section-2/section-2e/02_external_mem_to_core/test.cpp
new file mode 100644
index 0000000000..c14ca0f131
--- /dev/null
+++ b/programming_guide/section-2/section-2e/02_external_mem_to_core/test.cpp
@@ -0,0 +1,189 @@
+//===- test.cpp -------------------------------------------------*- C++ -*-===//
+//
+// This file is licensed under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+// Copyright (C) 2023, Advanced Micro Devices, Inc.
+//
+//===----------------------------------------------------------------------===//
+
+#include <algorithm>
+#include <boost/program_options.hpp>
+#include <cstdint>
+#include <cstring>
+#include <fstream>
+#include <iostream>
+#include <sstream>
+#include <vector>
+
+#include "xrt/xrt_bo.h"
+#include "xrt/xrt_device.h"
+#include "xrt/xrt_kernel.h"
+
+constexpr int IN_SIZE = 48;
+constexpr int OUT_SIZE = 48;
+
+namespace po = boost::program_options;
+
+void check_arg_file_exists(po::variables_map &vm_in, std::string name) {
+  if (!vm_in.count(name)) {
+    throw std::runtime_error("Error: no " + name + " file was provided\n");
+  } else {
+    std::ifstream test(vm_in[name].as<std::string>());
+    if (!test) {
+      throw std::runtime_error("The " + name + " file " +
+                               vm_in[name].as<std::string>() +
+                               " does not exist.\n");
+    }
+  }
+}
+
+std::vector<uint32_t> load_instr_sequence(std::string instr_path) {
+  std::ifstream instr_file(instr_path);
+  std::string line;
+  std::vector<uint32_t> instr_v;
+ while (std::getline(instr_file, line)) {
+ std::istringstream iss(line);
+ uint32_t a;
+ if (!(iss >> std::hex >> a)) {
+ throw std::runtime_error("Unable to parse instruction file\n");
+ }
+ instr_v.push_back(a);
+ }
+ return instr_v;
+}
+
+int main(int argc, const char *argv[]) {
+
+ // Program arguments parsing
+  po::options_description desc("Allowed options");
+  desc.add_options()("help,h", "produce help message")(
+      "xclbin,x", po::value<std::string>()->required(),
+      "the input xclbin path")(
+      "kernel,k", po::value<std::string>()->required(),
+      "the kernel name in the XCLBIN (for instance PP_PRE_FD)")(
+      "verbosity,v", po::value<int>()->default_value(0),
+      "the verbosity of the output")(
+      "instr,i", po::value<std::string>()->required(),
+      "path of file containing userspace instructions to be sent to the LX6");
+ po::variables_map vm;
+
+ try {
+ po::store(po::parse_command_line(argc, argv, desc), vm);
+ po::notify(vm);
+
+ if (vm.count("help")) {
+ std::cout << desc << "\n";
+ return 1;
+ }
+ } catch (const std::exception &ex) {
+ std::cerr << ex.what() << "\n\n";
+ std::cerr << "Usage:\n" << desc << "\n";
+ return 1;
+ }
+
+ check_arg_file_exists(vm, "xclbin");
+ check_arg_file_exists(vm, "instr");
+
+  std::vector<uint32_t> instr_v =
+      load_instr_sequence(vm["instr"].as<std::string>());
+
+  int verbosity = vm["verbosity"].as<int>();
+ if (verbosity >= 1)
+ std::cout << "Sequence instr count: " << instr_v.size() << "\n";
+
+ // Start the XRT test code
+ // Get a device handle
+ unsigned int device_index = 0;
+ auto device = xrt::device(device_index);
+
+ // Load the xclbin
+ if (verbosity >= 1)
+    std::cout << "Loading xclbin: " << vm["xclbin"].as<std::string>() << "\n";
+  auto xclbin = xrt::xclbin(vm["xclbin"].as<std::string>());
+
+  if (verbosity >= 1)
+    std::cout << "Kernel opcode: " << vm["kernel"].as<std::string>() << "\n";
+  std::string Node = vm["kernel"].as<std::string>();
+
+ // Get the kernel from the xclbin
+ auto xkernels = xclbin.get_kernels();
+ auto xkernel = *std::find_if(xkernels.begin(), xkernels.end(),
+ [Node](xrt::xclbin::kernel &k) {
+ auto name = k.get_name();
+ std::cout << "Name: " << name << std::endl;
+ return name.rfind(Node, 0) == 0;
+ });
+ auto kernelName = xkernel.get_name();
+
+ if (verbosity >= 1)
+    std::cout << "Registering xclbin: " << vm["xclbin"].as<std::string>()
+ << "\n";
+
+ device.register_xclbin(xclbin);
+
+ // get a hardware context
+ if (verbosity >= 1)
+ std::cout << "Getting hardware context.\n";
+ xrt::hw_context context(device, xclbin.get_uuid());
+
+ // get a kernel handle
+ if (verbosity >= 1)
+ std::cout << "Getting handle to kernel:" << kernelName << "\n";
+ auto kernel = xrt::kernel(context, kernelName);
+
+ auto bo_instr = xrt::bo(device, instr_v.size() * sizeof(int),
+ XCL_BO_FLAGS_CACHEABLE, kernel.group_id(0));
+ auto bo_inA = xrt::bo(device, IN_SIZE * sizeof(int32_t),
+ XRT_BO_FLAGS_HOST_ONLY, kernel.group_id(2));
+ auto bo_inB = xrt::bo(device, IN_SIZE * sizeof(int32_t),
+ XRT_BO_FLAGS_HOST_ONLY, kernel.group_id(3));
+ auto bo_out = xrt::bo(device, OUT_SIZE * sizeof(int32_t),
+ XRT_BO_FLAGS_HOST_ONLY, kernel.group_id(4));
+
+ if (verbosity >= 1)
+ std::cout << "Writing data into buffer objects.\n";
+
+  uint32_t *bufInA = bo_inA.map<uint32_t *>();
+  std::vector<uint32_t> srcVecA;
+ for (int i = 0; i < IN_SIZE; i++)
+ srcVecA.push_back(1);
+ memcpy(bufInA, srcVecA.data(), (srcVecA.size() * sizeof(uint32_t)));
+
+  void *bufInstr = bo_instr.map<void *>();
+ memcpy(bufInstr, instr_v.data(), instr_v.size() * sizeof(int));
+
+ bo_instr.sync(XCL_BO_SYNC_BO_TO_DEVICE);
+ bo_inA.sync(XCL_BO_SYNC_BO_TO_DEVICE);
+
+ if (verbosity >= 1)
+ std::cout << "Running Kernel.\n";
+ auto run = kernel(bo_instr, instr_v.size(), bo_inA, bo_inB, bo_out);
+ run.wait();
+
+ bo_out.sync(XCL_BO_SYNC_BO_FROM_DEVICE);
+
+  uint32_t *bufOut = bo_out.map<uint32_t *>();
+
+ int errors = 0;
+
+ for (uint32_t i = 0; i < 48; i++) {
+ uint32_t ref = 2;
+ if (*(bufOut + i) != ref) {
+ std::cout << "Error in output " << *(bufOut + i) << " != " << ref
+ << std::endl;
+ errors++;
+ } else {
+ std::cout << "Correct output " << *(bufOut + i) << " == " << ref
+ << std::endl;
+ }
+ }
+
+ if (!errors) {
+ std::cout << "\nPASS!\n\n";
+ return 0;
+ } else {
+ std::cout << "\nfailed.\n\n";
+ return 1;
+ }
+}
diff --git a/programming_guide/section-2/section-2e/03_external_mem_to_core_L2/CMakeLists.txt b/programming_guide/section-2/section-2e/03_external_mem_to_core_L2/CMakeLists.txt
new file mode 100644
index 0000000000..5da03ef2b2
--- /dev/null
+++ b/programming_guide/section-2/section-2e/03_external_mem_to_core_L2/CMakeLists.txt
@@ -0,0 +1,75 @@
+# This file is licensed under the Apache License v2.0 with LLVM Exceptions.
+# See https://llvm.org/LICENSE.txt for license information.
+# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+#
+# (c) Copyright 2023 Advanced Micro Devices, Inc.
+
+# parameters
+# -DBOOST_ROOT: Path to Boost install
+# -DXRT_INC_DIR: Full path to src/runtime_src/core/include in XRT cloned repo
+# -DXRT_LIB_DIR: Path to xrt_coreutil.lib
+# -DTARGET_NAME: Target name to be built
+
+# cmake needs this line
+cmake_minimum_required(VERSION 3.1)
+
+set(CMAKE_CXX_STANDARD 23)
+set(CMAKE_CXX_STANDARD_REQUIRED YES)
+
+find_program(WSL NAMES powershell.exe)
+
+if (NOT WSL)
+ set(CMAKE_C_COMPILER gcc-13)
+ set(CMAKE_CXX_COMPILER g++-13)
+ set(BOOST_ROOT /usr/include/boost CACHE STRING "Path to Boost install")
+ set(XRT_INC_DIR /opt/xilinx/xrt/include CACHE STRING "Path to XRT cloned repo")
+ set(XRT_LIB_DIR /opt/xilinx/xrt/lib CACHE STRING "Path to xrt_coreutil.lib")
+else()
+ set(BOOST_ROOT C:/Technical/thirdParty/boost_1_83_0 CACHE STRING "Path to Boost install")
+ set(XRT_INC_DIR C:/Technical/XRT/src/runtime_src/core/include CACHE STRING "Path to XRT cloned repo")
+ set(XRT_LIB_DIR C:/Technical/xrtNPUfromDLL CACHE STRING "Path to xrt_coreutil.lib")
+endif()
+
+set(TARGET_NAME test CACHE STRING "Target to be built")
+
+SET (ProjectName proj_${TARGET_NAME})
+SET (currentTarget ${TARGET_NAME})
+
+if ( WSL )
+ set(CMAKE_RUNTIME_OUTPUT_DIRECTORY_RELEASE ${CMAKE_BINARY_DIR})
+endif ()
+
+project(${ProjectName})
+
+# Find packages
+find_package(Boost REQUIRED)
+
+add_executable(${currentTarget}
+ ${CMAKE_CURRENT_SOURCE_DIR}/../../../../runtime_lib/test_lib/test_utils.cpp
+ test.cpp
+)
+
+target_compile_definitions(${currentTarget} PUBLIC DISABLE_ABI_CHECK=1)
+
+target_include_directories (${currentTarget} PUBLIC
+ ${XRT_INC_DIR}
+ ${Boost_INCLUDE_DIRS}
+ ${CMAKE_CURRENT_SOURCE_DIR}/../../../../runtime_lib/test_lib
+)
+
+target_link_directories(${currentTarget} PUBLIC
+ ${XRT_LIB_DIR}
+ ${Boost_LIBRARY_DIRS}
+)
+
+if (NOT WSL)
+ target_link_libraries(${currentTarget} PUBLIC
+ xrt_coreutil
+ boost_program_options
+ boost_filesystem
+ )
+else()
+ target_link_libraries(${currentTarget} PUBLIC
+ xrt_coreutil
+ )
+endif()
diff --git a/programming_guide/section-2/section-2e/03_external_mem_to_core_L2/Makefile b/programming_guide/section-2/section-2e/03_external_mem_to_core_L2/Makefile
new file mode 100644
index 0000000000..82fdd0057d
--- /dev/null
+++ b/programming_guide/section-2/section-2e/03_external_mem_to_core_L2/Makefile
@@ -0,0 +1,41 @@
+##===- Makefile -----------------------------------------------------------===##
+#
+# This file licensed under the Apache License v2.0 with LLVM Exceptions.
+# See https://llvm.org/LICENSE.txt for license information.
+# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+#
+##===----------------------------------------------------------------------===##
+
+include ../../../../programming_examples/makefile-common
+
+targetname = ext_to_core_L2
+devicename = npu
+col = 0
+
+all: build/final.xclbin build/insts.txt
+
+build/aie.mlir: ext_to_core_L2.py
+ mkdir -p ${@D}
+ python3 $< ${devicename} ${col} > $@
+
+build/final.xclbin: build/aie.mlir
+ mkdir -p ${@D}
+ cd ${@D} && aiecc.py --aie-generate-cdo --no-compile-host --xclbin-name=${@F} \
+ --aie-generate-npu --npu-insts-name=insts.txt $(<:%=../%)
+
+${targetname}.exe: test.cpp
+ rm -rf _build
+ mkdir -p _build
+ cd _build && ${powershell} cmake .. -DTARGET_NAME=${targetname}
+ cd _build && ${powershell} cmake --build . --config Release
+ifeq "${powershell}" "powershell.exe"
+ cp _build/${targetname}.exe $@
+else
+ cp _build/${targetname} $@
+endif
+
+run: ${targetname}.exe build/final.xclbin build/insts.txt
+ ${powershell} ./$< -x build/final.xclbin -i build/insts.txt -k MLIR_AIE
+
+clean:
+ rm -rf build _build ${targetname}.exe
diff --git a/programming_guide/section-2/section-2e/03_external_mem_to_core_L2/README.md b/programming_guide/section-2/section-2e/03_external_mem_to_core_L2/README.md
new file mode 100644
index 0000000000..31560688cb
--- /dev/null
+++ b/programming_guide/section-2/section-2e/03_external_mem_to_core_L2/README.md
@@ -0,0 +1,42 @@
+
+
+# External Memory to Core through L2
+
+The design in [ext_to_core_L2.py](./ext_to_core_L2.py) is very similar to the one in the previous [example](../02_external_mem_to_core/), except that here the `24xi32` data is first brought from external memory to the `MemTile` with `of_in0`. We then use `of_in1` to bring smaller `8xi32` slices of the data from the `MemTile` to `ComputeTile2`. On the way out, two Object FIFOs move the data first to the `MemTile` via `of_out1` as `8xi32` tensors, then to the `ShimTile` via `of_out0` as `24xi32` ones. All Object FIFOs use double buffers.
+
+
+
+```python
+ # AIE-array data movement with object fifos
+ # Input
+ of_in0 = object_fifo("in0", ShimTile, MemTile, 2, memRef_24_ty)
+ of_in1 = object_fifo("in1", MemTile, ComputeTile2, 2, memRef_8_ty)
+ object_fifo_link(of_in0, of_in1)
+
+ # Output
+ of_out0 = object_fifo("out0", MemTile, ShimTile, 2, memRef_24_ty)
+ of_out1 = object_fifo("out1", ComputeTile2, MemTile, 2, memRef_8_ty)
+ object_fifo_link(of_out1, of_out0)
+```
+
+The process on the compute tile works the same way as in the previous design: it acquires one object from `of_in1` to consume and one object from `of_out1` to produce into, reads the values of the input object and adds `1` to each entry, then releases both objects.
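+
+The corresponding loop in [ext_to_core_L2.py](./ext_to_core_L2.py) now iterates over the smaller `8xi32` slices:
+```python
+elem_in = of_in1.acquire(ObjectFifoPort.Consume, 1)
+elem_out = of_out1.acquire(ObjectFifoPort.Produce, 1)
+for i in for_(8):
+    v0 = memref.load(elem_in, [i])
+    v1 = arith.addi(v0, arith.constant(1, T.i32()))
+    memref.store(v1, elem_out, [i])
+    yield_([])
+of_in1.release(ObjectFifoPort.Consume, 1)
+of_out1.release(ObjectFifoPort.Produce, 1)
+```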
+
+It is possible to build, run and test this design with the following commands:
+```
+make
+make run
+```
+The [test.cpp](./test.cpp) as well as the `# To/from AIE-array data movement` section of the design code will be described in detail in [Section 2g](../../section-2g/).
+
+Other examples containing this data movement pattern are available in the [programming_examples/matrix_multiplication/](../../../../programming_examples/basic/matrix_multiplication/).
+
+-----
+[[Prev](../02_external_mem_to_core/)] [[Up](..)] [[Next](../04_distribute_L2/)]
diff --git a/programming_guide/section-2/section-2e/03_external_mem_to_core_L2/ext_to_core_L2.py b/programming_guide/section-2/section-2e/03_external_mem_to_core_L2/ext_to_core_L2.py
new file mode 100644
index 0000000000..f97061615b
--- /dev/null
+++ b/programming_guide/section-2/section-2e/03_external_mem_to_core_L2/ext_to_core_L2.py
@@ -0,0 +1,72 @@
+#
+# This file is licensed under the Apache License v2.0 with LLVM Exceptions.
+# See https://llvm.org/LICENSE.txt for license information.
+# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+#
+# (c) Copyright 2024 AMD Inc.
+
+from aie.dialects.aie import *
+from aie.dialects.aiex import *
+from aie.dialects.scf import *
+from aie.extras.dialects.ext import memref, arith
+from aie.extras.context import mlir_mod_ctx
+
+
+def external_mem_to_core_L2():
+ with mlir_mod_ctx() as ctx:
+
+ @device(AIEDevice.npu)
+ def device_body():
+ memRef_24_ty = T.memref(24, T.i32())
+ memRef_8_ty = T.memref(8, T.i32())
+
+ # Tile declarations
+ ShimTile = tile(0, 0)
+ MemTile = tile(0, 1)
+ ComputeTile2 = tile(0, 2)
+
+ # AIE-array data movement with object fifos
+ # Input
+ of_in0 = object_fifo("in0", ShimTile, MemTile, 2, memRef_24_ty)
+ of_in1 = object_fifo("in1", MemTile, ComputeTile2, 2, memRef_8_ty)
+ object_fifo_link(of_in0, of_in1)
+
+ # Output
+ of_out0 = object_fifo("out0", MemTile, ShimTile, 2, memRef_24_ty)
+ of_out1 = object_fifo("out1", ComputeTile2, MemTile, 2, memRef_8_ty)
+ object_fifo_link(of_out1, of_out0)
+
+ # Set up compute tiles
+ # Compute tile 2
+ @core(ComputeTile2)
+ def core_body():
+ # Effective while(1)
+ for _ in for_(6):
+ elem_in = of_in1.acquire(ObjectFifoPort.Consume, 1)
+ elem_out = of_out1.acquire(ObjectFifoPort.Produce, 1)
+ for i in for_(8):
+ v0 = memref.load(elem_in, [i])
+ v1 = arith.addi(v0, arith.constant(1, T.i32()))
+ memref.store(v1, elem_out, [i])
+ yield_([])
+ of_in1.release(ObjectFifoPort.Consume, 1)
+ of_out1.release(ObjectFifoPort.Produce, 1)
+ yield_([])
+
+ memRef_48_ty = T.memref(48, T.i32())
+
+ # To/from AIE-array data movement
+ @FuncOp.from_py_func(memRef_48_ty, memRef_48_ty, memRef_48_ty)
+ def sequence(inTensor, notUsed, outTensor):
+ npu_dma_memcpy_nd(
+ metadata="out0", bd_id=0, mem=outTensor, sizes=[1, 1, 1, 48]
+ )
+ npu_dma_memcpy_nd(
+ metadata="in0", bd_id=1, mem=inTensor, sizes=[1, 1, 1, 48]
+ )
+ npu_sync(column=0, row=0, direction=0, channel=0)
+
+ print(ctx.module)
+
+
+external_mem_to_core_L2()
diff --git a/programming_guide/section-2/section-2e/03_external_mem_to_core_L2/run.lit b/programming_guide/section-2/section-2e/03_external_mem_to_core_L2/run.lit
new file mode 100644
index 0000000000..937ed0dd9f
--- /dev/null
+++ b/programming_guide/section-2/section-2e/03_external_mem_to_core_L2/run.lit
@@ -0,0 +1,10 @@
+// (c) Copyright 2024 Advanced Micro Devices, Inc.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+// REQUIRES: ryzen_ai
+//
+// RUN: %python %S/ext_to_core_L2.py npu 0 | aie-opt -cse -canonicalize -o ./aie.mlir
+// RUN: %python aiecc.py --aie-generate-cdo --aie-generate-npu --no-compile-host --xclbin-name=aie.xclbin --npu-insts-name=insts.txt ./aie.mlir
+// RUN: g++ %S/test.cpp -o test.exe -std=c++23 -Wall -I%S/../../../../runtime_lib/test_lib %S/../../../../runtime_lib/test_lib/test_utils.cpp %xrt_flags -lrt -lstdc++ -lboost_program_options -lboost_filesystem
+// RUN: %run_on_npu ./test.exe -x aie.xclbin -k MLIR_AIE -i insts.txt | FileCheck %s
+// CHECK: PASS!
diff --git a/programming_guide/section-2/section-2e/03_external_mem_to_core_L2/test.cpp b/programming_guide/section-2/section-2e/03_external_mem_to_core_L2/test.cpp
new file mode 100644
index 0000000000..ba0e2b954d
--- /dev/null
+++ b/programming_guide/section-2/section-2e/03_external_mem_to_core_L2/test.cpp
@@ -0,0 +1,190 @@
+//===- test.cpp -------------------------------------------------*- C++ -*-===//
+//
+// This file is licensed under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+// Copyright (C) 2023, Advanced Micro Devices, Inc.
+//
+//===----------------------------------------------------------------------===//
+
+#include <algorithm>
+#include <boost/program_options.hpp>
+#include <cstdint>
+#include <cstring>
+#include <fstream>
+#include <iostream>
+#include <sstream>
+#include <vector>
+
+#include "xrt/xrt_bo.h"
+#include "xrt/xrt_device.h"
+#include "xrt/xrt_kernel.h"
+
+constexpr int IN_SIZE = 48;
+constexpr int OUT_SIZE = 48;
+
+namespace po = boost::program_options;
+
+void check_arg_file_exists(po::variables_map &vm_in, std::string name) {
+  if (!vm_in.count(name)) {
+    throw std::runtime_error("Error: no " + name + " file was provided\n");
+  } else {
+    std::ifstream test(vm_in[name].as<std::string>());
+    if (!test) {
+      throw std::runtime_error("The " + name + " file " +
+                               vm_in[name].as<std::string>() +
+                               " does not exist.\n");
+    }
+  }
+}
+
+std::vector<uint32_t> load_instr_sequence(std::string instr_path) {
+  std::ifstream instr_file(instr_path);
+  std::string line;
+  std::vector<uint32_t> instr_v;
+ while (std::getline(instr_file, line)) {
+ std::istringstream iss(line);
+ uint32_t a;
+ if (!(iss >> std::hex >> a)) {
+ throw std::runtime_error("Unable to parse instruction file\n");
+ }
+ instr_v.push_back(a);
+ }
+ return instr_v;
+}
+
+int main(int argc, const char *argv[]) {
+
+ // Program arguments parsing
+  po::options_description desc("Allowed options");
+  desc.add_options()("help,h", "produce help message")(
+      "xclbin,x", po::value<std::string>()->required(),
+      "the input xclbin path")(
+      "kernel,k", po::value<std::string>()->required(),
+      "the kernel name in the XCLBIN (for instance PP_PRE_FD)")(
+      "verbosity,v", po::value<int>()->default_value(0),
+      "the verbosity of the output")(
+      "instr,i", po::value<std::string>()->required(),
+      "path of file containing userspace instructions to be sent to the LX6");
+ po::variables_map vm;
+
+ try {
+ po::store(po::parse_command_line(argc, argv, desc), vm);
+ po::notify(vm);
+
+ if (vm.count("help")) {
+ std::cout << desc << "\n";
+ return 1;
+ }
+ } catch (const std::exception &ex) {
+ std::cerr << ex.what() << "\n\n";
+ std::cerr << "Usage:\n" << desc << "\n";
+ return 1;
+ }
+
+ check_arg_file_exists(vm, "xclbin");
+ check_arg_file_exists(vm, "instr");
+
+  std::vector<uint32_t> instr_v =
+      load_instr_sequence(vm["instr"].as<std::string>());
+
+  int verbosity = vm["verbosity"].as<int>();
+ if (verbosity >= 1)
+ std::cout << "Sequence instr count: " << instr_v.size() << "\n";
+
+ // Start the XRT test code
+ // Get a device handle
+ unsigned int device_index = 0;
+ auto device = xrt::device(device_index);
+
+ // Load the xclbin
+ if (verbosity >= 1)
+    std::cout << "Loading xclbin: " << vm["xclbin"].as<std::string>() << "\n";
+  auto xclbin = xrt::xclbin(vm["xclbin"].as<std::string>());
+
+  if (verbosity >= 1)
+    std::cout << "Kernel opcode: " << vm["kernel"].as<std::string>() << "\n";
+  std::string Node = vm["kernel"].as<std::string>();
+
+ // Get the kernel from the xclbin
+ auto xkernels = xclbin.get_kernels();
+ auto xkernel = *std::find_if(xkernels.begin(), xkernels.end(),
+ [Node](xrt::xclbin::kernel &k) {
+ auto name = k.get_name();
+ std::cout << "Name: " << name << std::endl;
+ return name.rfind(Node, 0) == 0;
+ });
+ auto kernelName = xkernel.get_name();
+
+ if (verbosity >= 1)
+    std::cout << "Registering xclbin: " << vm["xclbin"].as<std::string>()
+ << "\n";
+
+ device.register_xclbin(xclbin);
+
+ // get a hardware context
+ if (verbosity >= 1)
+ std::cout << "Getting hardware context.\n";
+ xrt::hw_context context(device, xclbin.get_uuid());
+
+ // get a kernel handle
+ if (verbosity >= 1)
+ std::cout << "Getting handle to kernel:" << kernelName << "\n";
+ auto kernel = xrt::kernel(context, kernelName);
+
+ auto bo_instr = xrt::bo(device, instr_v.size() * sizeof(int),
+ XCL_BO_FLAGS_CACHEABLE, kernel.group_id(0));
+ auto bo_inA = xrt::bo(device, IN_SIZE * sizeof(int32_t),
+ XRT_BO_FLAGS_HOST_ONLY, kernel.group_id(2));
+ auto bo_inB = xrt::bo(device, IN_SIZE * sizeof(int32_t),
+ XRT_BO_FLAGS_HOST_ONLY, kernel.group_id(3));
+ auto bo_out = xrt::bo(device, OUT_SIZE * sizeof(int32_t),
+ XRT_BO_FLAGS_HOST_ONLY, kernel.group_id(4));
+
+ if (verbosity >= 1)
+ std::cout << "Writing data into buffer objects.\n";
+
+  uint32_t *bufInA = bo_inA.map<uint32_t *>();
+  std::vector<uint32_t> srcVecA;
+ for (int i = 0; i < IN_SIZE; i++)
+ srcVecA.push_back(1);
+ memcpy(bufInA, srcVecA.data(), (srcVecA.size() * sizeof(uint32_t)));
+
+  void *bufInstr = bo_instr.map<void *>();
+ memcpy(bufInstr, instr_v.data(), instr_v.size() * sizeof(int));
+
+ bo_instr.sync(XCL_BO_SYNC_BO_TO_DEVICE);
+ bo_inA.sync(XCL_BO_SYNC_BO_TO_DEVICE);
+
+ if (verbosity >= 1)
+ std::cout << "Running Kernel.\n";
+ auto run = kernel(bo_instr, instr_v.size(), bo_inA, bo_inB, bo_out);
+ run.wait();
+
+ bo_out.sync(XCL_BO_SYNC_BO_FROM_DEVICE);
+
+  uint32_t *bufOut = bo_out.map<uint32_t *>();
+
+ int errors = 0;
+
+ for (uint32_t i = 0; i < 48; i++) {
+ uint32_t ref = 2;
+ if (*(bufOut + i) != ref) {
+ std::cout << "Error in output " << *(bufOut + i) << " != " << ref
+ << std::endl;
+ errors++;
+ } else {
+ std::cout << "Correct output " << *(bufOut + i) << " == " << ref
+ << std::endl;
+ }
+ }
+
+ if (!errors) {
+ std::cout << "\nPASS!\n\n";
+ return 0;
+ } else {
+ std::cout << "\nfailed.\n\n";
+ return 1;
+ }
+}
diff --git a/programming_guide/section-2/section-2e/04_distribute_L2/README.md b/programming_guide/section-2/section-2e/04_distribute_L2/README.md
new file mode 100644
index 0000000000..5c2e93c276
--- /dev/null
+++ b/programming_guide/section-2/section-2e/04_distribute_L2/README.md
@@ -0,0 +1,32 @@
+
+
+# Distribute from L2
+
+The design in [distribute_L2.py](./distribute_L2.py) uses an Object FIFO `of_in` to bring data from external memory via the `ShimTile` to the `MemTile` as `24xi32` tensors. From there, three Object FIFOs distribute smaller `8xi32` parts of the data to each of the three compute tiles. Each tile receives a different part of the larger data based on the order of the Object FIFOs in the `object_fifo_link`.
+
+
+
+```python
+ # AIE-array data movement with object fifos
+ # Input
+ of_in = object_fifo("in", ShimTile, MemTile, 2, memRef_24_ty)
+ of_in0 = object_fifo("in0", MemTile, ComputeTile0, 2, memRef_8_ty)
+ of_in1 = object_fifo("in1", MemTile, ComputeTile1, 2, memRef_8_ty)
+ of_in2 = object_fifo("in2", MemTile, ComputeTile2, 2, memRef_8_ty)
+ object_fifo_link(of_in, [of_in0, of_in1, of_in2])
+```
+
+All compute tiles run the same process: each acquires one object from its respective input Object FIFO to consume, adds `1` to all of its entries, and releases the object. The [join design](../05_join_L2/) shows how the data is sent back out to external memory and tested.
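+
+For example, the core on `ComputeTile0` runs the following loop, excerpted from [distribute_L2.py](./distribute_L2.py):
+```python
+elem = of_in0.acquire(ObjectFifoPort.Consume, 1)
+for i in for_(8):
+    v0 = memref.load(elem, [i])
+    v1 = arith.addi(v0, arith.constant(1, T.i32()))
+    memref.store(v1, elem, [i])
+    yield_([])
+of_in0.release(ObjectFifoPort.Consume, 1)
+```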
+
+Other examples containing this data movement pattern are available in the [programming_examples/matrix_multiplication/](../../../../programming_examples/basic/matrix_multiplication/).
+
+-----
+[[Prev](../03_external_mem_to_core_L2/)] [[Up](..)] [[Next](../05_join_L2/)]
diff --git a/programming_guide/section-2/section-2e/04_distribute_L2/distribute_L2.py b/programming_guide/section-2/section-2e/04_distribute_L2/distribute_L2.py
new file mode 100644
index 0000000000..a873be1655
--- /dev/null
+++ b/programming_guide/section-2/section-2e/04_distribute_L2/distribute_L2.py
@@ -0,0 +1,84 @@
+#
+# This file is licensed under the Apache License v2.0 with LLVM Exceptions.
+# See https://llvm.org/LICENSE.txt for license information.
+# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+#
+# (c) Copyright 2024 AMD Inc.
+
+from aie.dialects.aie import *
+from aie.dialects.aiex import *
+from aie.dialects.scf import *
+from aie.extras.dialects.ext import memref, arith
+from aie.extras.context import mlir_mod_ctx
+
+
+def distribute_L2():
+ with mlir_mod_ctx() as ctx:
+
+ @device(AIEDevice.npu)
+ def device_body():
+ memRef_24_ty = T.memref(24, T.i32())
+ memRef_8_ty = T.memref(8, T.i32())
+
+ # Tile declarations
+ ShimTile = tile(0, 0)
+ MemTile = tile(0, 1)
+ ComputeTile0 = tile(0, 2)
+ ComputeTile1 = tile(0, 3)
+ ComputeTile2 = tile(0, 4)
+
+ # AIE-array data movement with object fifos
+ # Input
+ of_in = object_fifo("in", ShimTile, MemTile, 2, memRef_24_ty)
+ of_in0 = object_fifo("in0", MemTile, ComputeTile0, 2, memRef_8_ty)
+ of_in1 = object_fifo("in1", MemTile, ComputeTile1, 2, memRef_8_ty)
+ of_in2 = object_fifo("in2", MemTile, ComputeTile2, 2, memRef_8_ty)
+ object_fifo_link(of_in, [of_in0, of_in1, of_in2])
+
+ # Set up compute tiles
+ # Compute tile 2
+ @core(ComputeTile0)
+ def core_body():
+ # Effective while(1)
+ for _ in for_(8):
+ elem = of_in0.acquire(ObjectFifoPort.Consume, 1)
+ for i in for_(8):
+ v0 = memref.load(elem, [i])
+ v1 = arith.addi(v0, arith.constant(1, T.i32()))
+ memref.store(v1, elem, [i])
+ yield_([])
+ of_in0.release(ObjectFifoPort.Consume, 1)
+ yield_([])
+
+ # Compute tile 3
+ @core(ComputeTile1)
+ def core_body():
+ # Effective while(1)
+ for _ in for_(8):
+ elem = of_in1.acquire(ObjectFifoPort.Consume, 1)
+ for i in for_(8):
+ v0 = memref.load(elem, [i])
+ v1 = arith.addi(v0, arith.constant(1, T.i32()))
+ memref.store(v1, elem, [i])
+ yield_([])
+ of_in1.release(ObjectFifoPort.Consume, 1)
+ yield_([])
+
+ # Compute tile 4
+ @core(ComputeTile2)
+ def core_body():
+ # Effective while(1)
+ for _ in for_(8):
+ elem = of_in2.acquire(ObjectFifoPort.Consume, 1)
+ for i in for_(8):
+ v0 = memref.load(elem, [i])
+ v1 = arith.addi(v0, arith.constant(1, T.i32()))
+                        memref.store(v1, elem, [i])
+ yield_([])
+ of_in2.release(ObjectFifoPort.Consume, 1)
+ yield_([])
+
+ print(ctx.module)
+
+
+distribute_L2()
diff --git a/programming_guide/section-2/section-2e/05_join_L2/CMakeLists.txt b/programming_guide/section-2/section-2e/05_join_L2/CMakeLists.txt
new file mode 100644
index 0000000000..5da03ef2b2
--- /dev/null
+++ b/programming_guide/section-2/section-2e/05_join_L2/CMakeLists.txt
@@ -0,0 +1,75 @@
+# This file is licensed under the Apache License v2.0 with LLVM Exceptions.
+# See https://llvm.org/LICENSE.txt for license information.
+# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+#
+# (c) Copyright 2023 Advanced Micro Devices, Inc.
+
+# parameters
+# -DBOOST_ROOT: Path to Boost install
+# -DXRT_INC_DIR: Full path to src/runtime_src/core/include in XRT cloned repo
+# -DXRT_LIB_DIR: Path to xrt_coreutil.lib
+# -DTARGET_NAME: Target name to be built
+
+# cmake needs this line
+cmake_minimum_required(VERSION 3.1)
+
+set(CMAKE_CXX_STANDARD 23)
+set(CMAKE_CXX_STANDARD_REQUIRED YES)
+
+find_program(WSL NAMES powershell.exe)
+
+if (NOT WSL)
+ set(CMAKE_C_COMPILER gcc-13)
+ set(CMAKE_CXX_COMPILER g++-13)
+ set(BOOST_ROOT /usr/include/boost CACHE STRING "Path to Boost install")
+ set(XRT_INC_DIR /opt/xilinx/xrt/include CACHE STRING "Path to XRT cloned repo")
+ set(XRT_LIB_DIR /opt/xilinx/xrt/lib CACHE STRING "Path to xrt_coreutil.lib")
+else()
+ set(BOOST_ROOT C:/Technical/thirdParty/boost_1_83_0 CACHE STRING "Path to Boost install")
+ set(XRT_INC_DIR C:/Technical/XRT/src/runtime_src/core/include CACHE STRING "Path to XRT cloned repo")
+ set(XRT_LIB_DIR C:/Technical/xrtNPUfromDLL CACHE STRING "Path to xrt_coreutil.lib")
+endif()
+
+set(TARGET_NAME test CACHE STRING "Target to be built")
+
+SET (ProjectName proj_${TARGET_NAME})
+SET (currentTarget ${TARGET_NAME})
+
+if ( WSL )
+ set(CMAKE_RUNTIME_OUTPUT_DIRECTORY_RELEASE ${CMAKE_BINARY_DIR})
+endif ()
+
+project(${ProjectName})
+
+# Find packages
+find_package(Boost REQUIRED)
+
+add_executable(${currentTarget}
+ ${CMAKE_CURRENT_SOURCE_DIR}/../../../../runtime_lib/test_lib/test_utils.cpp
+ test.cpp
+)
+
+target_compile_definitions(${currentTarget} PUBLIC DISABLE_ABI_CHECK=1)
+
+target_include_directories (${currentTarget} PUBLIC
+ ${XRT_INC_DIR}
+ ${Boost_INCLUDE_DIRS}
+ ${CMAKE_CURRENT_SOURCE_DIR}/../../../../runtime_lib/test_lib
+)
+
+target_link_directories(${currentTarget} PUBLIC
+ ${XRT_LIB_DIR}
+ ${Boost_LIBRARY_DIRS}
+)
+
+if (NOT WSL)
+ target_link_libraries(${currentTarget} PUBLIC
+ xrt_coreutil
+ boost_program_options
+ boost_filesystem
+ )
+else()
+ target_link_libraries(${currentTarget} PUBLIC
+ xrt_coreutil
+ )
+endif()
diff --git a/programming_guide/section-2/section-2e/05_join_L2/Makefile b/programming_guide/section-2/section-2e/05_join_L2/Makefile
new file mode 100644
index 0000000000..9de7d68f08
--- /dev/null
+++ b/programming_guide/section-2/section-2e/05_join_L2/Makefile
@@ -0,0 +1,41 @@
+##===- Makefile -----------------------------------------------------------===##
+#
+# This file licensed under the Apache License v2.0 with LLVM Exceptions.
+# See https://llvm.org/LICENSE.txt for license information.
+# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+#
+##===----------------------------------------------------------------------===##
+
+include ../../../../programming_examples/makefile-common
+
+targetname = distribute_and_join_L2
+devicename = npu
+col = 0
+
+all: build/final.xclbin build/insts.txt
+
+build/aie.mlir: distribute_and_join_L2.py
+ mkdir -p ${@D}
+ python3 $< ${devicename} ${col} > $@
+
+build/final.xclbin: build/aie.mlir
+ mkdir -p ${@D}
+ cd ${@D} && aiecc.py --aie-generate-cdo --no-compile-host --xclbin-name=${@F} \
+ --aie-generate-npu --npu-insts-name=insts.txt $(<:%=../%)
+
+${targetname}.exe: test.cpp
+ rm -rf _build
+ mkdir -p _build
+ cd _build && ${powershell} cmake .. -DTARGET_NAME=${targetname}
+ cd _build && ${powershell} cmake --build . --config Release
+ifeq "${powershell}" "powershell.exe"
+ cp _build/${targetname}.exe $@
+else
+ cp _build/${targetname} $@
+endif
+
+run: ${targetname}.exe build/final.xclbin build/insts.txt
+ ${powershell} ./$< -x build/final.xclbin -i build/insts.txt -k MLIR_AIE
+
+clean:
+ rm -rf build _build ${targetname}.exe
diff --git a/programming_guide/section-2/section-2e/05_join_L2/README.md b/programming_guide/section-2/section-2e/05_join_L2/README.md
new file mode 100644
index 0000000000..64f1ef902a
--- /dev/null
+++ b/programming_guide/section-2/section-2e/05_join_L2/README.md
@@ -0,0 +1,41 @@
+
+
+# Join in L2
+
+The design in [join_L2.py](./join_L2.py) uses three Object FIFOs, one from each of the three compute tiles, to send `8xi32` pieces of data to the `MemTile`. There, the data is joined together into `24xi32` tensors based on the order of the Object FIFOs in the `object_fifo_link`. The data is then sent out to external memory using `of_out`.
+
+
+
+```python
+ # AIE-array data movement with object fifos
+ # Output
+ of_out = object_fifo("out", MemTile, ShimTile, 2, memRef_24_ty)
+ of_out0 = object_fifo("out0", ComputeTile0, MemTile, 2, memRef_8_ty)
+ of_out1 = object_fifo("out1", ComputeTile1, MemTile, 2, memRef_8_ty)
+ of_out2 = object_fifo("out2", ComputeTile2, MemTile, 2, memRef_8_ty)
+ object_fifo_link([of_out0, of_out1, of_out2], of_out)
+```
+
+All compute tiles run the same process: each acquires one object from its respective output Object FIFO to produce into, writes `1` to all of its entries, and releases the object.
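+
+For example, the core on `ComputeTile0` runs the following loop, excerpted from [join_L2.py](./join_L2.py):
+```python
+elem = of_out0.acquire(ObjectFifoPort.Produce, 1)
+for i in for_(8):
+    v1 = arith.constant(1, T.i32())
+    memref.store(v1, elem, [i])
+    yield_([])
+of_out0.release(ObjectFifoPort.Produce, 1)
+```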
+
+This design is combined with the previous [distribute](../04_distribute_L2/distribute_L2.py) design to achieve a full data movement from external memory to the AIE array and back. The resulting code is available in [distribute_and_join_L2.py](./distribute_and_join_L2.py). It is possible to build, run and test it with the following commands:
+```
+make
+make run
+```
+The [test.cpp](./test.cpp) as well as the `# To/from AIE-array data movement` section of the design code will be described in detail in [Section 2g](../../section-2g/).
+
+> **NOTE:** The design in [distribute_and_join_L2.py](./distribute_and_join_L2.py) takes [ext_to_core_L2](../03_external_mem_to_core_L2/) and distributes smaller pieces of the input data to three compute tiles. This pattern is typically used when the input data is too large for a single core's memory module and needs to be processed in smaller chunks, the results of which are then joined together to produce the final output.
+
+Other examples containing this data movement pattern are available in the [programming_examples](../../../../programming_examples/). A notable one is [vector_exp](../../../../programming_examples/basic/vector_exp/).
+
+-----
+[[Prev](../04_distribute_L2/)] [[Up](..)] [[Next - Section 2f](../../section-2f/)]
diff --git a/programming_guide/section-2/section-2e/05_join_L2/distribute_and_join_L2.py b/programming_guide/section-2/section-2e/05_join_L2/distribute_and_join_L2.py
new file mode 100644
index 0000000000..815334e5b4
--- /dev/null
+++ b/programming_guide/section-2/section-2e/05_join_L2/distribute_and_join_L2.py
@@ -0,0 +1,109 @@
+#
+# This file is licensed under the Apache License v2.0 with LLVM Exceptions.
+# See https://llvm.org/LICENSE.txt for license information.
+# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+#
+# (c) Copyright 2024 AMD Inc.
+
+from aie.dialects.aie import *
+from aie.dialects.aiex import *
+from aie.dialects.scf import *
+from aie.extras.dialects.ext import memref, arith
+from aie.extras.context import mlir_mod_ctx
+
+
+def distribute_join_L2():
+ with mlir_mod_ctx() as ctx:
+
+ @device(AIEDevice.npu)
+ def device_body():
+ memRef_24_ty = T.memref(24, T.i32())
+ memRef_8_ty = T.memref(8, T.i32())
+
+ # Tile declarations
+ ShimTile = tile(0, 0)
+ MemTile = tile(0, 1)
+ ComputeTile0 = tile(0, 2)
+ ComputeTile1 = tile(0, 3)
+ ComputeTile2 = tile(0, 4)
+
+ # AIE-array data movement with object fifos
+ # Input
+ of_in = object_fifo("in", ShimTile, MemTile, 2, memRef_24_ty)
+ of_in0 = object_fifo("in0", MemTile, ComputeTile0, 2, memRef_8_ty)
+ of_in1 = object_fifo("in1", MemTile, ComputeTile1, 2, memRef_8_ty)
+ of_in2 = object_fifo("in2", MemTile, ComputeTile2, 2, memRef_8_ty)
+ object_fifo_link(of_in, [of_in0, of_in1, of_in2])
+
+ # Output
+ of_out = object_fifo("out", MemTile, ShimTile, 2, memRef_24_ty)
+ of_out0 = object_fifo("out0", ComputeTile0, MemTile, 2, memRef_8_ty)
+ of_out1 = object_fifo("out1", ComputeTile1, MemTile, 2, memRef_8_ty)
+ of_out2 = object_fifo("out2", ComputeTile2, MemTile, 2, memRef_8_ty)
+ object_fifo_link([of_out0, of_out1, of_out2], of_out)
+
+ # Set up compute tiles
+ # Compute tile 2
+ @core(ComputeTile0)
+ def core_body():
+ # Effective while(1)
+ for _ in for_(2):
+ elem_in = of_in0.acquire(ObjectFifoPort.Consume, 1)
+ elem_out = of_out0.acquire(ObjectFifoPort.Produce, 1)
+ for i in for_(8):
+ v0 = memref.load(elem_in, [i])
+ v1 = arith.addi(v0, arith.constant(1, T.i32()))
+ memref.store(v1, elem_out, [i])
+ yield_([])
+ of_in0.release(ObjectFifoPort.Consume, 1)
+ of_out0.release(ObjectFifoPort.Produce, 1)
+ yield_([])
+
+ # Compute tile 3
+ @core(ComputeTile1)
+ def core_body():
+ # Effective while(1)
+ for _ in for_(2):
+ elem_in = of_in1.acquire(ObjectFifoPort.Consume, 1)
+ elem_out = of_out1.acquire(ObjectFifoPort.Produce, 1)
+ for i in for_(8):
+ v0 = memref.load(elem_in, [i])
+ v1 = arith.addi(v0, arith.constant(1, T.i32()))
+ memref.store(v1, elem_out, [i])
+ yield_([])
+ of_in1.release(ObjectFifoPort.Consume, 1)
+ of_out1.release(ObjectFifoPort.Produce, 1)
+ yield_([])
+
+ # Compute tile 4
+ @core(ComputeTile2)
+ def core_body():
+ # Effective while(1)
+ for _ in for_(2):
+ elem_in = of_in2.acquire(ObjectFifoPort.Consume, 1)
+ elem_out = of_out2.acquire(ObjectFifoPort.Produce, 1)
+ for i in for_(8):
+ v0 = memref.load(elem_in, [i])
+ v1 = arith.addi(v0, arith.constant(1, T.i32()))
+ memref.store(v1, elem_out, [i])
+ yield_([])
+ of_in2.release(ObjectFifoPort.Consume, 1)
+ of_out2.release(ObjectFifoPort.Produce, 1)
+ yield_([])
+
+ memRef_48_ty = T.memref(48, T.i32())
+
+ @FuncOp.from_py_func(memRef_48_ty, memRef_48_ty, memRef_48_ty)
+ def sequence(inTensor, notUsed, outTensor):
+ npu_dma_memcpy_nd(
+ metadata="out", bd_id=0, mem=outTensor, sizes=[1, 1, 1, 48]
+ )
+ npu_dma_memcpy_nd(
+ metadata="in", bd_id=1, mem=inTensor, sizes=[1, 1, 1, 48]
+ )
+ npu_sync(column=0, row=0, direction=0, channel=0)
+
+ print(ctx.module)
+
+
+distribute_join_L2()
diff --git a/programming_guide/section-2/section-2e/05_join_L2/join_L2.py b/programming_guide/section-2/section-2e/05_join_L2/join_L2.py
new file mode 100644
index 0000000000..e79cad4bf1
--- /dev/null
+++ b/programming_guide/section-2/section-2e/05_join_L2/join_L2.py
@@ -0,0 +1,84 @@
+#
+# This file is licensed under the Apache License v2.0 with LLVM Exceptions.
+# See https://llvm.org/LICENSE.txt for license information.
+# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+#
+# (c) Copyright 2024 AMD Inc.
+
+from aie.dialects.aie import *
+from aie.dialects.aiex import *
+from aie.dialects.scf import *
+from aie.extras.dialects.ext import memref, arith
+from aie.extras.context import mlir_mod_ctx
+
+
+def join_L2():
+ with mlir_mod_ctx() as ctx:
+
+ @device(AIEDevice.npu)
+ def device_body():
+ memRef_24_ty = T.memref(24, T.i32())
+ memRef_8_ty = T.memref(8, T.i32())
+
+ # Tile declarations
+ ShimTile = tile(0, 0)
+ MemTile = tile(0, 1)
+ ComputeTile0 = tile(0, 2)
+ ComputeTile1 = tile(0, 3)
+ ComputeTile2 = tile(0, 4)
+
+ # AIE-array data movement with object fifos
+ # Output
+ of_out = object_fifo("out", MemTile, ShimTile, 2, memRef_24_ty)
+ of_out0 = object_fifo("out0", ComputeTile0, MemTile, 2, memRef_8_ty)
+ of_out1 = object_fifo("out1", ComputeTile1, MemTile, 2, memRef_8_ty)
+ of_out2 = object_fifo("out2", ComputeTile2, MemTile, 2, memRef_8_ty)
+ object_fifo_link([of_out0, of_out1, of_out2], of_out)
+
+ # Set up compute tiles
+ # Compute tile 2
+ @core(ComputeTile0)
+ def core_body():
+ # Effective while(1)
+ for _ in for_(6):
+ elem = of_out0.acquire(ObjectFifoPort.Produce, 1)
+ for i in for_(8):
+                        v1 = arith.constant(1, T.i32())
+ memref.store(v1, elem, [i])
+ yield_([])
+ of_out0.release(ObjectFifoPort.Produce, 1)
+ yield_([])
+
+ # Compute tile 3
+ @core(ComputeTile1)
+ def core_body():
+ # Effective while(1)
+ for _ in for_(6):
+ elem = of_out1.acquire(ObjectFifoPort.Produce, 1)
+ for i in for_(8):
+                        v1 = arith.constant(1, T.i32())
+ memref.store(v1, elem, [i])
+ yield_([])
+ of_out1.release(ObjectFifoPort.Produce, 1)
+ yield_([])
+
+ # Compute tile 4
+ @core(ComputeTile2)
+ def core_body():
+ # Effective while(1)
+ for _ in for_(6):
+ elem = of_out2.acquire(ObjectFifoPort.Produce, 1)
+ for i in for_(8):
+                        v1 = arith.constant(1, T.i32())
+ memref.store(v1, elem, [i])
+ yield_([])
+ of_out2.release(ObjectFifoPort.Produce, 1)
+ yield_([])
+
+ print(ctx.module)
+
+
+join_L2()
diff --git a/programming_guide/section-2/section-2e/05_join_L2/run.lit b/programming_guide/section-2/section-2e/05_join_L2/run.lit
new file mode 100644
index 0000000000..34c7a2f9f7
--- /dev/null
+++ b/programming_guide/section-2/section-2e/05_join_L2/run.lit
@@ -0,0 +1,10 @@
+// (c) Copyright 2024 Advanced Micro Devices, Inc.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+// REQUIRES: ryzen_ai
+//
+// RUN: %python %S/distribute_and_join_L2.py npu 0 | aie-opt -cse -canonicalize -o ./aie.mlir
+// RUN: %python aiecc.py --aie-generate-cdo --aie-generate-npu --no-compile-host --xclbin-name=aie.xclbin --npu-insts-name=insts.txt ./aie.mlir
+// RUN: g++ %S/test.cpp -o test.exe -std=c++23 -Wall -I%S/../../../../runtime_lib/test_lib %S/../../../../runtime_lib/test_lib/test_utils.cpp %xrt_flags -lrt -lstdc++ -lboost_program_options -lboost_filesystem
+// RUN: %run_on_npu ./test.exe -x aie.xclbin -k MLIR_AIE -i insts.txt | FileCheck %s
+// CHECK: PASS!
diff --git a/programming_guide/section-2/section-2e/05_join_L2/test.cpp b/programming_guide/section-2/section-2e/05_join_L2/test.cpp
new file mode 100644
index 0000000000..c14ca0f131
--- /dev/null
+++ b/programming_guide/section-2/section-2e/05_join_L2/test.cpp
@@ -0,0 +1,189 @@
+//===- test.cpp -------------------------------------------------*- C++ -*-===//
+//
+// This file is licensed under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+// Copyright (C) 2023, Advanced Micro Devices, Inc.
+//
+//===----------------------------------------------------------------------===//
+
+#include <algorithm>
+#include <boost/program_options.hpp>
+#include <cstdint>
+#include <cstring>
+#include <fstream>
+#include <iostream>
+#include <sstream>
+#include <vector>
+
+#include "xrt/xrt_bo.h"
+#include "xrt/xrt_device.h"
+#include "xrt/xrt_kernel.h"
+
+constexpr int IN_SIZE = 48;
+constexpr int OUT_SIZE = 48;
+
+namespace po = boost::program_options;
+
+void check_arg_file_exists(po::variables_map &vm_in, std::string name) {
+  if (!vm_in.count(name)) {
+    throw std::runtime_error("Error: no " + name + " file was provided\n");
+  } else {
+    std::ifstream test(vm_in[name].as<std::string>());
+    if (!test) {
+      throw std::runtime_error("The " + name + " file " +
+                               vm_in[name].as<std::string>() +
+                               " does not exist.\n");
+    }
+  }
+}
+
+std::vector<uint32_t> load_instr_sequence(std::string instr_path) {
+  std::ifstream instr_file(instr_path);
+  std::string line;
+  std::vector<uint32_t> instr_v;
+ while (std::getline(instr_file, line)) {
+ std::istringstream iss(line);
+ uint32_t a;
+ if (!(iss >> std::hex >> a)) {
+ throw std::runtime_error("Unable to parse instruction file\n");
+ }
+ instr_v.push_back(a);
+ }
+ return instr_v;
+}
+
+int main(int argc, const char *argv[]) {
+
+ // Program arguments parsing
+  po::options_description desc("Allowed options");
+  desc.add_options()("help,h", "produce help message")(
+      "xclbin,x", po::value<std::string>()->required(),
+      "the input xclbin path")(
+      "kernel,k", po::value<std::string>()->required(),
+      "the kernel name in the XCLBIN (for instance PP_PRE_FD)")(
+      "verbosity,v", po::value<int>()->default_value(0),
+      "the verbosity of the output")(
+      "instr,i", po::value<std::string>()->required(),
+      "path of file containing userspace instructions to be sent to the LX6");
+ po::variables_map vm;
+
+ try {
+ po::store(po::parse_command_line(argc, argv, desc), vm);
+ po::notify(vm);
+
+ if (vm.count("help")) {
+ std::cout << desc << "\n";
+ return 1;
+ }
+ } catch (const std::exception &ex) {
+ std::cerr << ex.what() << "\n\n";
+ std::cerr << "Usage:\n" << desc << "\n";
+ return 1;
+ }
+
+ check_arg_file_exists(vm, "xclbin");
+ check_arg_file_exists(vm, "instr");
+
+  std::vector<uint32_t> instr_v =
+      load_instr_sequence(vm["instr"].as<std::string>());
+
+  int verbosity = vm["verbosity"].as<int>();
+ if (verbosity >= 1)
+ std::cout << "Sequence instr count: " << instr_v.size() << "\n";
+
+ // Start the XRT test code
+ // Get a device handle
+ unsigned int device_index = 0;
+ auto device = xrt::device(device_index);
+
+ // Load the xclbin
+ if (verbosity >= 1)
+    std::cout << "Loading xclbin: " << vm["xclbin"].as<std::string>() << "\n";
+  auto xclbin = xrt::xclbin(vm["xclbin"].as<std::string>());
+
+  if (verbosity >= 1)
+    std::cout << "Kernel opcode: " << vm["kernel"].as<std::string>() << "\n";
+  std::string Node = vm["kernel"].as<std::string>();
+
+ // Get the kernel from the xclbin
+ auto xkernels = xclbin.get_kernels();
+ auto xkernel = *std::find_if(xkernels.begin(), xkernels.end(),
+ [Node](xrt::xclbin::kernel &k) {
+ auto name = k.get_name();
+ std::cout << "Name: " << name << std::endl;
+ return name.rfind(Node, 0) == 0;
+ });
+ auto kernelName = xkernel.get_name();
+
+ if (verbosity >= 1)
+    std::cout << "Registering xclbin: " << vm["xclbin"].as<std::string>()
+ << "\n";
+
+ device.register_xclbin(xclbin);
+
+ // get a hardware context
+ if (verbosity >= 1)
+ std::cout << "Getting hardware context.\n";
+ xrt::hw_context context(device, xclbin.get_uuid());
+
+ // get a kernel handle
+ if (verbosity >= 1)
+ std::cout << "Getting handle to kernel:" << kernelName << "\n";
+ auto kernel = xrt::kernel(context, kernelName);
+
+ auto bo_instr = xrt::bo(device, instr_v.size() * sizeof(int),
+ XCL_BO_FLAGS_CACHEABLE, kernel.group_id(0));
+ auto bo_inA = xrt::bo(device, IN_SIZE * sizeof(int32_t),
+ XRT_BO_FLAGS_HOST_ONLY, kernel.group_id(2));
+ auto bo_inB = xrt::bo(device, IN_SIZE * sizeof(int32_t),
+ XRT_BO_FLAGS_HOST_ONLY, kernel.group_id(3));
+ auto bo_out = xrt::bo(device, OUT_SIZE * sizeof(int32_t),
+ XRT_BO_FLAGS_HOST_ONLY, kernel.group_id(4));
+
+ if (verbosity >= 1)
+ std::cout << "Writing data into buffer objects.\n";
+
+  uint32_t *bufInA = bo_inA.map<uint32_t *>();
+  std::vector<uint32_t> srcVecA;
+ for (int i = 0; i < IN_SIZE; i++)
+ srcVecA.push_back(1);
+ memcpy(bufInA, srcVecA.data(), (srcVecA.size() * sizeof(uint32_t)));
+
+  void *bufInstr = bo_instr.map<void *>();
+ memcpy(bufInstr, instr_v.data(), instr_v.size() * sizeof(int));
+
+ bo_instr.sync(XCL_BO_SYNC_BO_TO_DEVICE);
+ bo_inA.sync(XCL_BO_SYNC_BO_TO_DEVICE);
+
+ if (verbosity >= 1)
+ std::cout << "Running Kernel.\n";
+ auto run = kernel(bo_instr, instr_v.size(), bo_inA, bo_inB, bo_out);
+ run.wait();
+
+ bo_out.sync(XCL_BO_SYNC_BO_FROM_DEVICE);
+
+  uint32_t *bufOut = bo_out.map<uint32_t *>();
+
+ int errors = 0;
+
+ for (uint32_t i = 0; i < 48; i++) {
+ uint32_t ref = 2;
+ if (*(bufOut + i) != ref) {
+ std::cout << "Error in output " << *(bufOut + i) << " != " << ref
+ << std::endl;
+ errors++;
+ } else {
+ std::cout << "Correct output " << *(bufOut + i) << " == " << ref
+ << std::endl;
+ }
+ }
+
+ if (!errors) {
+ std::cout << "\nPASS!\n\n";
+ return 0;
+ } else {
+ std::cout << "\nfailed.\n\n";
+ return 1;
+ }
+}
diff --git a/programming_guide/section-2/section-2e/README.md b/programming_guide/section-2/section-2e/README.md
new file mode 100644
index 0000000000..18dea5ecea
--- /dev/null
+++ b/programming_guide/section-2/section-2e/README.md
@@ -0,0 +1,37 @@
+
+
+# Section 2e - Practical Examples
+
+This section introduces several examples with common Object FIFO data movement patterns. These examples are intended to be simple enough to be easily imported and adapted into other designs.
+
+Example 01 - Single / Double Buffer
+
+* Core to core data movement using single / double buffer
+
+Example 02 - External Memory to Core
+
+* External memory to core and back using double buffers
+
+Example 03 - External Memory to Core through L2
+
+* External memory to core and back through L2 using double buffers
+
+Example 04 - Distribute from L2
+
+* Distribute data from external memory to cores through L2
+
+Example 05 - Join in L2
+
+* Join data from cores to external memory through L2
+
+
+-----
+[[Prev - Section 2d](../section-2d/)] [[Up](..)] [[Next - Section 2f](../section-2f/)]
diff --git a/programming_guide/section-2/section-2f/README.md b/programming_guide/section-2/section-2f/README.md
new file mode 100644
index 0000000000..9f06315a1a
--- /dev/null
+++ b/programming_guide/section-2/section-2f/README.md
@@ -0,0 +1,148 @@
+
+
+# Section 2f - Data Movement Without Object FIFOs
+
+Not all data movement patterns can be described with Object FIFOs. This **advanced** section goes into detail about how a user can express data movement using the Data Movement Accelerators (DMAs) on AIE tiles. To better understand the code and concepts introduced in this section, it is recommended to first read the [Advanced Topic of Section - 2a on DMAs](../section-2a/README.md/#advanced-topic--data-movement-accelerators).
+
+The AIE architecture currently has three different types of tiles: compute tiles, referred to as "tiles"; memory tiles, referred to as "Mem tiles"; and external memory interface tiles, referred to as "Shim tiles". Each of these tiles has its own attributes regarding compute capabilities and memory capacity, but the base design of their DMAs is the same. The different types of DMAs can be initialized using the constructors in [aie.py](../../../python/dialects/aie.py):
+```python
+@mem(tile) # compute tile DMA
+@shim_dma(tile) # Shim tile DMA
+@memtile_dma(tile) # Mem tile DMA
+```
+
+The DMA hardware component has a certain number of input and output `channels`, and each one has a direction and a port index. Input channels are denoted with the keyword `S2MM` and output ones with `MM2S`. Port indices vary per tile. For example, compute and Shim tiles have two input and two output ports, whereas Mem tiles have six input and six output ports.
+
+A channel in any tile's DMA can be initialized using the unified `dma` constructor:
+```python
+def dma(
+ channel_dir,
+ channel_index,
+ *,
+ num_blocks=1,
+ loop=None,
+ repeat_count=None,
+ sym_name=None,
+ loc=None,
+ ip=None,
+)
+```
+
+The data movement on each channel is described by a chain of Buffer Descriptors (or "BDs"), where each BD describes what data is being moved and configures its synchronization mechanism. The `dma` constructor already creates space for one such BD, as can be seen from its `num_blocks=1` default parameter.
+
+The code snippet below shows how to configure the DMA on `tile_a` such that data coming in on input channel 0 is written into `buff_in`:
+```python
+tile_a = tile(1, 3)
+
+prod_lock = lock(tile_a, lock_id=0, init=1)
+cons_lock = lock(tile_a, lock_id=1, init=0)
+buff_in = buffer(tile=tile_a, shape=(256,), dtype=T.i32()) # 256xi32
+
+@mem(tile_a)
+def mem_body():
+ @dma(S2MM, 0) # input channel, port 0
+ def dma_in_0():
+ use_lock(prod_lock, AcquireGreaterEqual)
+ dma_bd(buff_in)
+ use_lock(cons_lock, Release)
+```
+The locks `prod_lock` and `cons_lock` follow AIE-ML architecture semantics. Their task is to mark synchronization points in the execution of the tile's core and its DMA: for example, if the core is currently using `buff_in`, it will only release `prod_lock` when it is done, and only then is the DMA allowed to overwrite the data in `buff_in` with new input. Similarly, the core can query `cons_lock` to know when new data is ready to be read (i.e., when the DMA releases the lock so the core can acquire it).
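+
+A minimal sketch of the core side of this handshake could look as follows (illustrative only; it assumes the `@core` decorator from earlier sections and reuses the lock objects declared above):
+```python
+@core(tile_a)
+def core_body():
+    # Wait until the DMA releases cons_lock, i.e. new data has arrived in buff_in
+    use_lock(cons_lock, AcquireGreaterEqual)
+    # ... compute on buff_in ...
+    # Release prod_lock so the DMA may overwrite buff_in with new input
+    use_lock(prod_lock, Release)
+```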
+
+In the previous code, the channel only had one BD in its chain. To add additional BDs to the chain, users can use the following constructor, which takes as input what would be the previous BD in the chain it should be added to:
+```python
+@another_bd(dma_bd)
+```
+
+This next code snippet shows how to extend the previous input channel with a double, or ping-pong, buffer using the previous constructor:
+```python
+tile_a = tile(1, 3)
+
+prod_lock = lock(tile_a, lock_id=0, init=2) # note that the producer lock now has 2 tokens
+cons_lock = lock(tile_a, lock_id=1, init=0)
+buff_ping = buffer(tile=tile_a, shape=(256,), dtype=T.i32()) # 256xi32
+buff_pong = buffer(tile=tile_a, shape=(256,), dtype=T.i32()) # 256xi32
+
+@mem(tile_a)
+def mem_body():
+ @dma(S2MM, 0, num_blocks=2) # note the additional BD
+ def dma_in_0():
+ use_lock(prod_lock, AcquireGreaterEqual)
+ dma_bd(buff_ping)
+ use_lock(cons_lock, Release)
+
+ @another_bd(dma_in_0)
+ def dma_in_1():
+ use_lock(prod_lock, AcquireGreaterEqual)
+ dma_bd(buff_pong)
+ use_lock(cons_lock, Release)
+```
+> **NOTE:** This DMA configuration is equivalent to what the Object FIFO lowering looks like for double buffers.
+
+The code above can be visualized as in the following figure, where the two BDs ping-pong to each other:
+
+
+
+The last step to configure the data movement is to establish its endpoints, similar to how the Object FIFO has producer and consumer tiles. To do this, users should use the `flow` constructor:
+```python
+def flow(
+ source,
+ source_bundle=None,
+ source_channel=None,
+ dest=None,
+ dest_bundle=None,
+ dest_channel=None,
+)
+```
+The `flow` is established between channels of two DMAs (other endpoints are available, but they are beyond the scope of this section) and as such it requires:
+* its `source` and `dest` tiles,
+* its `source_bundle` and `dest_bundle`, which represent the type of endpoints (for our scope, these will be `WireBundle.DMA`),
+* and its `source_channel` and `dest_channel`, which represent the index of the channel.
+
+For example, to create a flow between tile `tile_a` and tile `tile_b`, where `tile_a` is sending data on its output channel 0 to `tile_b`'s input channel 1, the user can write:
+```python
+aie.flow(tile_a, WireBundle.DMA, 0, tile_b, WireBundle.DMA, 1)
+```
+Note how the directions of the two channels are not required by the flow, only the indices. This is because the flow lowering can infer each direction from the `source` and `dest` inputs.
+
+The following code snippet shows a full example of two tiles where `tile_a` is sending data to `tile_b`:
+```python
+tile_a = tile(1, 2)
+tile_b = tile(1, 3)
+
+prod_lock_a = lock(tile_a, lock_id=0, init=1)
+cons_lock_a = lock(tile_a, lock_id=1, init=0)
+buff_a = buffer(tile=tile_a, shape=(256,), dtype=T.i32()) # 256xi32
+
+prod_lock_b = lock(tile_b, lock_id=0, init=1)
+cons_lock_b = lock(tile_b, lock_id=1, init=0)
+buff_b = buffer(tile=tile_b, shape=(256,), dtype=T.i32()) # 256xi32
+
+aie.flow(tile_a, WireBundle.DMA, 0, tile_b, WireBundle.DMA, 1)
+
+@mem(tile_a)
+def mem_body():
+    @dma(MM2S, 0) # output channel, port 0
+    def dma_out_0():
+ use_lock(cons_lock_a, AcquireGreaterEqual)
+ dma_bd(buff_a)
+ use_lock(prod_lock_a, Release)
+
+@mem(tile_b)
+def mem_body():
+    @dma(S2MM, 1) # input channel, port 1
+ def dma_in_0():
+ use_lock(prod_lock_b, AcquireGreaterEqual)
+ dma_bd(buff_b)
+ use_lock(cons_lock_b, Release)
+```
+
+-----
+[[Prev - Section 2e](../section-2e/)] [[Up](..)] [[Next - Section 2g](../section-2g/)]
diff --git a/programming_guide/section-2/section-2g/README.md b/programming_guide/section-2/section-2g/README.md
new file mode 100644
index 0000000000..db52d0b827
--- /dev/null
+++ b/programming_guide/section-2/section-2g/README.md
@@ -0,0 +1,130 @@
+
+
+# Section 2g - Runtime Data Movement
+
+In the preceding sections, we looked at how to describe data movement between tiles *within* the AIE-array. However, to do anything useful, we need to get data from outside the array, i.e., from the "host", into the AIE-array and back. On NPU devices, we can achieve this with the operations described in this section.
+
+The operations that will be described in this section must be placed in a separate `sequence` function. The arguments to this function describe buffers that will be available on the host side; the body of the function describes how those buffers are moved into the AIE-array. [Section 3](../../../programming_examples/) contains an example.
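+
+As a preview, a minimal sketch of such a sequence function, using the two operations detailed below, might look like this (the buffer type, object FIFO names and sizes are placeholders):
+```python
+tensor_ty = T.memref(4096, T.i32())  # placeholder type: 4096 int32 elements
+
+@FuncOp.from_py_func(tensor_ty, tensor_ty)
+def sequence(A, C):
+    npu_dma_memcpy_nd(metadata="out", bd_id=0, mem=C, sizes=[1, 1, 1, 4096])
+    npu_dma_memcpy_nd(metadata="in", bd_id=1, mem=A, sizes=[1, 1, 1, 4096])
+    npu_sync(column=0, row=0, direction=0, channel=0)
+```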
+
+### Guide to Managing Runtime Data Movement to/from Host Memory
+
+In high-performance computing applications, efficiently managing data movement and synchronization is crucial. This guide provides a comprehensive overview of how to utilize the `npu_dma_memcpy_nd` and `npu_sync` functions to manage runtime data movement between host memory and the AIE array (for example, in the Ryzen™ AI NPU).
+
+#### **Efficient Data Movement with `npu_dma_memcpy_nd`**
+
+The `npu_dma_memcpy_nd` function is key to enabling non-blocking, multi-dimensional data transfers between the AI Engine array and external memory. This function is essential for developing real applications such as signal processing, machine learning, and video processing.
+
+**Function Signature and Parameters**:
+```python
+npu_dma_memcpy_nd(metadata, bd_id, mem, offsets=None, sizes=None, strides=None)
+```
+- **`metadata`**: This string is a reference to the metadata generated by the object FIFO that records a Shim Tile and one of its DMA channels allocated for the host-side memory transfer. In order to associate the memcpy operation with an object FIFO, this metadata string needs to match the object FIFO name string.
+- **`bd_id`**: Identifier integer for the particular Buffer Descriptor control registers used for this memcpy. A buffer descriptor contains all information needed for a DMA transfer described in the parameters below.
+- **`mem`**: Reference to a host buffer, given as an argument to the sequence function, that this transfer will read from or write to.
+- **`offsets`** (optional): Start points for data transfer in each dimension. There is a maximum of four offset dimensions.
+- **`sizes`**: The extent of data to be transferred across each dimension. There is a maximum of four size dimensions.
+- **`strides`** (optional): Interval steps between data points in each dimension, useful for striding across and reshaping data. There is a maximum of three stride dimensions that can be expressed because dimension 0 has an implicit stride of one 4B element.
+
+It is important to note that dimension 0 of the **`sizes`** and all **`strides`** are expressed at a 4B granularity. Higher dimensions of the **`sizes`** are integer repeat counts for the lower dimensions. The **`offsets`** are expressed in multiples of the **`sizes`**; however, the dimension 0 offset is at a 4B granularity. The strides and wraps express data transformations analogous to those described in [Section 2C](../section-2c).
+
+**Example Usage**:
+```python
+npu_dma_memcpy_nd("of_in", 0, input_buffer, sizes=[1, 1, 1, 30])
+```
+
+The example above describes a linear transfer of 30 data elements, or 120 Bytes, from the `input_buffer` in host memory into an object FIFO with matching metadata labeled "of_in". The `sizes` dimensions are expressed right to left, where the rightmost is dimension 0 and the leftmost is dimension 3. Unused higher dimensions should be set to `1`.
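+
+Because dimension 0 counts 4B words, smaller data types are packed: for example, a hypothetical transfer of 64 `int16` elements (128 Bytes) would set dimension 0 to `32`:
+```python
+# 64 int16 elements = 128 Bytes = 32 4B words
+npu_dma_memcpy_nd("of_in", 0, input_buffer, sizes=[1, 1, 1, 32])
+```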
+
+
+#### **Advanced Techniques for Multi-dimensional `npu_dma_memcpy_nd`**
+
+For high-performance computing applications on AMD's AI Engine, mastering the `npu_dma_memcpy_nd` function for complex data movements is crucial. Here, we focus on using the `sizes`, `strides`, and `offsets` parameters to effectively manage intricate data transfers.
+
+##### **Tiling a Large Matrix**
+
+A common task such as tiling a 2D matrix can be implemented using the `npu_dma_memcpy_nd` operation. Here is a simplified example that demonstrates the configuration.
+
+**Scenario**: Tiling a 2D matrix of shape [100, 200] into [20, 20] tiles, with data type `int16` and the convention [row, col].
+
+**1. Configuration to transfer one tile**:
+```python
+metadata = "of_in"
+bd_id = 3
+mem = matrix_memory # Memory object for the matrix
+
+# Sizes define the extent of the tile to copy
+sizes = [1, 1, 20, 10]
+
+# Strides set to '0' in the higher (unused) dimensions and to '100'
+# (the length of one matrix row in 4B "i32" words) to step between consecutive rows
+strides = [0, 0, 0, 100]
+
+# Offsets set to zero since we start from the beginning
+offsets = [0, 0, 0, 0]
+
+npu_dma_memcpy_nd(metadata, bd_id, mem, offsets, sizes, strides)
+```
+
+**2. Configuration to tile the whole matrix**:
+```python
+metadata = "of_in"
+bd_id = 3
+mem = matrix_memory # Memory object for the matrix
+
+# Sizes define the extent of the tile to copy.
+# Dimension 0 is 10 to transfer 20 int16s for one row of the tile,
+# Dimension 1 repeats that row transfer 20 times to complete a [20, 20] tile,
+# Dimension 2 repeats that tile transfer 10 times along a row,
+# Dimension 3 repeats the row-of-tiles transfer 5 times to complete the matrix.
+sizes = [5, 10, 20, 10]
+
+# Strides set to '0' in the highest (unused) dimension,
+# '2000' to step to the next row of tiles below the last (200 x 20 x 2B / 4B),
+# '10' to step to the next [20, 20] tile to the 'right' of the last,
+# and '100' (the length of one matrix row in 4B "i32" words) to step between consecutive rows.
+strides = [0, 2000, 10, 100]
+
+# Offsets set to zero since we start from the beginning
+offsets = [0, 0, 0, 0]
+
+npu_dma_memcpy_nd(metadata, bd_id, mem, offsets, sizes, strides)
+```
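+
+As a quick sanity check on this configuration (a hypothetical host-side calculation), the product of all `sizes` entries should cover the whole matrix in 4B words:
+```python
+total_words = 5 * 10 * 20 * 10       # product of the sizes = 10000 4B words
+assert total_words * 2 == 100 * 200  # two int16 elements per 4B word: the full [100, 200] matrix
+```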
+
+#### **Host Synchronization with `npu_sync`**
+
+Synchronization between DMA channels and the host is facilitated by the `npu_sync` operation, ensuring data consistency and proper execution order. The `npu_sync` operation waits until the BD in the DMA channel completes and issues a task-complete token.
+
+**Function Signature**:
+```python
+npu_sync(column, row, direction, channel, column_num=1, row_num=1)
+```
+- **`column`** and **`row`**: Specify the tile location for initiating or targeting the synchronization. Currently, task-complete tokens are only emitted by Shim Tiles (row = 0).
+- **`direction`**: Indicates the DMA direction (0 for write to host, 1 for read from host).
+- **`channel`**: Identifies the DMA channel (0 or 1) for the synchronization token.
+- **`column_num`** and **`row_num`** (optional): Define the range of tiles from which to await synchronization task-complete tokens, defaulting to the single specified tile.
+
+**Example Usage**:
+```python
+# Synchronizes operations at column 0, row 0 (Shim Tile)
+# using DMA channel 1 in direction 0 (write to host).
+npu_sync(0, 0, 0, 1)
+```
+
+#### **Best Practices for Data Movement and Synchronization**
+
+- **Sync to Reuse Buffer Descriptors**: Each `npu_dma_memcpy_nd` is assigned a `bd_id`. There is a maximum of `16` BDs available in each Shim Tile. It is "safe" to reuse a BD once all of its transfers are complete. This can be managed by synchronizing properly: take into account the BDs that must have completed to transfer data into the array for a compute operation, and then sync on the BD that receives the data produced by that compute operation to write it back to host memory (see the sketch after this list).
+- **Note Non-blocking Transfers**: Overlap data transfers with computation by leveraging the non-blocking nature of `npu_dma_memcpy_nd`.
+- **Minimize Synchronization Overhead**: Synchronize judiciously to avoid excessive overhead that might degrade performance.
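+
+A hypothetical sketch of the BD-reuse pattern from the first bullet above, assuming object FIFOs "of_in" and "of_out" and host buffers `input_a`, `input_b` and `output_c` (the channel index in the sync is a placeholder):
+```python
+npu_dma_memcpy_nd("of_in", 0, input_a, sizes=[1, 1, 1, 1024])    # BD 0: first input
+npu_dma_memcpy_nd("of_out", 1, output_c, sizes=[1, 1, 1, 1024])  # BD 1: result
+npu_sync(0, 0, 0, 0)  # wait on BD 1: the compute must have consumed BD 0's data, so...
+npu_dma_memcpy_nd("of_in", 0, input_b, sizes=[1, 1, 1, 1024])    # ...BD 0 is now safe to reuse
+```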
+
+#### **Conclusion**
+
+The `npu_dma_memcpy_nd` and `npu_sync` functions are powerful tools for managing data transfers and synchronization with AI Engines in the Ryzen™ AI NPU. By understanding and effectively applying these functions, developers can enhance the performance and efficiency of their high-performance computing applications.
+
+-----
+[[Prev - Section 2f](../section-2f/)] [[Up](..)] [[Next - Section 3](../../section-3/)]
diff --git a/programming_guide/section-3/CMakeLists.txt b/programming_guide/section-3/CMakeLists.txt
new file mode 100644
index 0000000000..6a91438203
--- /dev/null
+++ b/programming_guide/section-3/CMakeLists.txt
@@ -0,0 +1,75 @@
+# This file is licensed under the Apache License v2.0 with LLVM Exceptions.
+# See https://llvm.org/LICENSE.txt for license information.
+# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+#
+# (c) Copyright 2023 Advanced Micro Devices, Inc.
+
+# parameters
+# -DBOOST_ROOT: Path to Boost install
+# -DXRT_INC_DIR: Full path to src/runtime_src/core/include in XRT cloned repo
+# -DXRT_LIB_DIR: Path to xrt_coreutil.lib
+# -DTARGET_NAME: Target name to be built
+
+# cmake needs this line
+cmake_minimum_required(VERSION 3.1)
+
+set(CMAKE_CXX_STANDARD 23)
+set(CMAKE_CXX_STANDARD_REQUIRED YES)
+
+find_program(WSL NAMES powershell.exe)
+
+if (NOT WSL)
+ set(CMAKE_C_COMPILER gcc-13)
+ set(CMAKE_CXX_COMPILER g++-13)
+ set(BOOST_ROOT /usr/include/boost CACHE STRING "Path to Boost install")
+ set(XRT_INC_DIR /opt/xilinx/xrt/include CACHE STRING "Path to XRT cloned repo")
+ set(XRT_LIB_DIR /opt/xilinx/xrt/lib CACHE STRING "Path to xrt_coreutil.lib")
+else()
+ set(BOOST_ROOT C:/Technical/thirdParty/boost_1_83_0 CACHE STRING "Path to Boost install")
+ set(XRT_INC_DIR C:/Technical/XRT/src/runtime_src/core/include CACHE STRING "Path to XRT cloned repo")
+ set(XRT_LIB_DIR C:/Technical/xrtNPUfromDLL CACHE STRING "Path to xrt_coreutil.lib")
+endif()
+
+set(TARGET_NAME test CACHE STRING "Target to be built")
+
+SET (ProjectName ${TARGET_NAME})
+SET (currentTarget ${TARGET_NAME})
+
+if ( WSL )
+ set(CMAKE_RUNTIME_OUTPUT_DIRECTORY_RELEASE ${CMAKE_BINARY_DIR})
+endif ()
+
+project(${ProjectName})
+
+# Find packages
+find_package(Boost REQUIRED)
+
+add_executable(${currentTarget}
+ ${CMAKE_CURRENT_SOURCE_DIR}/../../runtime_lib/test_lib/test_utils.cpp
+ test.cpp
+)
+
+target_compile_definitions(${currentTarget} PUBLIC DISABLE_ABI_CHECK=1)
+
+target_include_directories (${currentTarget} PUBLIC
+ ${XRT_INC_DIR}
+ ${Boost_INCLUDE_DIRS}
+ ${CMAKE_CURRENT_SOURCE_DIR}/../../runtime_lib/test_lib
+)
+
+target_link_directories(${currentTarget} PUBLIC
+ ${XRT_LIB_DIR}
+ ${Boost_LIBRARY_DIRS}
+)
+
+if (NOT WSL)
+ target_link_libraries(${currentTarget} PUBLIC
+ xrt_coreutil
+ boost_program_options
+ boost_filesystem
+ )
+else()
+ target_link_libraries(${currentTarget} PUBLIC
+ xrt_coreutil
+ )
+endif()
diff --git a/programming_guide/section-3/Makefile b/programming_guide/section-3/Makefile
new file mode 100644
index 0000000000..eb57eeb40b
--- /dev/null
+++ b/programming_guide/section-3/Makefile
@@ -0,0 +1,43 @@
+##===- Makefile -----------------------------------------------------------===##
+#
+# This file licensed under the Apache License v2.0 with LLVM Exceptions.
+# See https://llvm.org/LICENSE.txt for license information.
+# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+#
+##===----------------------------------------------------------------------===##
+
+include ../../programming_examples/makefile-common
+
+all: build/final.xclbin build/insts.txt
+
+targetname = vectorScalar
+
+build/aie.mlir: aie2.py
+	mkdir -p ${@D}
+	python3 $< > $@
+
+build/scale.o: vector_scalar_mul.cc
+	mkdir -p ${@D}
+	cd ${@D} && xchesscc_wrapper ${CHESSCCWRAP2_FLAGS} -c $(<:%=../%) -o ${@F}
+
+build/final.xclbin: build/aie.mlir build/scale.o
+ mkdir -p ${@D}
+ cd ${@D} && aiecc.py --aie-generate-cdo --no-compile-host --xclbin-name=${@F} \
+ --aie-generate-npu --npu-insts-name=insts.txt $(<:%=../%)
+
+${targetname}.exe: test.cpp
+ rm -rf _build
+ mkdir -p _build
+ cd _build && ${powershell} cmake .. -DTARGET_NAME=${targetname}
+ cd _build && ${powershell} cmake --build . --config Release
+ifeq "${powershell}" "powershell.exe"
+ cp _build/${targetname}.exe $@
+else
+ cp _build/${targetname} $@
+endif
+
+run: ${targetname}.exe build/final.xclbin build/insts.txt
+ ${powershell} ./$< -x build/final.xclbin -i build/insts.txt -k MLIR_AIE
+
+run_py: build/final.xclbin build/insts.txt
+ ${powershell} python3 test.py -x build/final.xclbin -i build/insts.txt -k MLIR_AIE
+
+clean:
+ rm -rf build _build ${targetname}.exe
+
diff --git a/programming_guide/section-3/README.md b/programming_guide/section-3/README.md
new file mode 100644
index 0000000000..c91095d68f
--- /dev/null
+++ b/programming_guide/section-3/README.md
@@ -0,0 +1,175 @@
+
+
+# Section 3 - My First Program
+
+
+
+This section creates a first program that will run on the AIE-array. As shown in the figure on the right, we will have to create binaries for both the AIE-array (device) and the CPU (host). For the AIE-array, a structural description and kernel code are compiled into the AIE-array binaries: an XCLBIN file ("final.xclbin") and an instruction sequence ("insts.txt"). The host code ("test.exe") loads these AIE-array binaries and contains the test functionality.
+
+For the AIE-array structural description we will combine what you learned in [section-1](../section-1) for defining a basic structural design in Python with the data movement part from [section-2](../section-2).
+
+For the AIE kernel code, we will start with non-vectorized code that will run on the scalar processor part of an AIE. [section-4](../section-4) will introduce how to vectorize a compute kernel to harvest the compute density of the AIE.
+
+The host code can be written in either C++ (as shown in the figure) or in Python. We will also introduce some convenience utility libraries for typical test functionality and to simplify context and buffer creation when the [Xilinx RunTime (XRT)](https://github.com/Xilinx/XRT) is used, for instance in the [AMD XDNA Driver](https://github.com/amd/xdna-driver) for Ryzen™ AI devices.
+
+
+
+Throughout this section, a [vector scalar multiplication](../../programming_examples/basic/vector_scalar_mul/) (c = a * factor) will be used as an example. Vector scalar multiplication takes an input vector a and computes the output vector c by multiplying each element of a with a factor. In our example, the total vector size is set to 4096 integers (32b) that will be processed in chunks of 1024.
+This design is also available in the [programming_examples](../../programming_examples) of this repository. We will first introduce the AIE-array structural description, then review the kernel code, and finally introduce the host code. Lastly, we will show how to run the design on Ryzen™ AI enabled hardware.
+
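+As a reference for what we are offloading, here is a hypothetical numpy sketch of the computation, matching the values used by the testbench later in this section:
+```python
+import numpy as np
+
+a = np.arange(1, 4097, dtype=np.int32)  # input vector of 4096 int32 elements
+factor = 3                              # scale factor
+c = a * factor                          # golden output; the AIE processes it in 4 chunks of 1024
+```
+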
+## AIE-array Structural Description
+
+
+
+The [aie2.py](../../programming_examples/basic/vector_scalar_mul/aie2.py) AIE-array structural description (see [section-1](../section-1)) deploys both a compute core (green) for the multiplication and a shimDMA (purple) for data movement of both input vector a and output vector c residing in external memory.
+
+```python
+# Device declaration - here using aie2 device NPU
+@device(AIEDevice.npu)
+def device_body():
+
+ # Tile declarations
+ ShimTile = tile(0, 0)
+ ComputeTile2 = tile(0, 2)
+```
+
+We also need to declare that the compute core will run an external function: a kernel written in C++ that will be linked into the design as a pre-compiled kernel (more details in the next subsection). To get our initial design running on the AIE-array, we will use a generic version of the vector scalar multiply that runs on the scalar processor of the AIE.
+
+```python
+ # Type declarations
+ memRef_ty = T.memref(1024, T.i32())
+
+ # AIE Core Function declarations
+ scale_scalar = external_func("vector_scalar_mul_aie_scalar",
+ inputs=[memRef_ty, memRef_ty, T.memref(1, T.i32()), T.i32()],
+ )
+```
+
+Since the compute core can only access L1 memory, input data needs to be explicitly moved to (yellow arrow) and from (orange arrow) the L1 memory of the AIE. We will use the objectFIFO data movement primitive (introduced in [section-2](../section-2/)).
+
+
+
+This enables looking at the data movement in the AIE-array from a logical view, where we deploy 3 objectFIFOs: "of_in" to bring in the vector a, "of_factor" to bring in the scalar factor, and "of_out" to move the output vector c, all using the shimDMA. Note that the objects for "of_in" and "of_out" are declared to have the `memRef_ty` type: 1024 int32 elements, while the factor is an object containing a single integer. All objectFIFOs are set up with a depth of 2 to enable the Shim Tile and Compute Tile DMA data movement to run concurrently with the processing on the compute core.
+
+```python
+ # AIE-array data movement with object fifos
+ of_in = object_fifo("in", ShimTile, ComputeTile2, 2, memRef_ty)
+ of_factor = object_fifo("infactor", ShimTile, ComputeTile2, 2, T.memref(1, T.i32()))
+ of_out = object_fifo("out", ComputeTile2, ShimTile, 2, memRef_ty)
+
+```
+We also need to set up the data movement to/from the AIE-array: configure n-dimensional DMA transfers in the shimDMAs to read/write to/from L3 external memory. For NPU, this is done with the `npu_dma_memcpy_nd` function (more details in [section 2-g](../section-2/section-2g)). Note that the n-dimensional transfer has a size of 4096 int32 elements and that the `metadata` argument in the `npu_dma_memcpy_nd` needs to match the `name` argument of the corresponding object FIFO.
+
+```python
+ # To/from AIE-array data movement
+ tensor_ty = T.memref(4096, T.i32())
+ scalar_ty = T.memref(1, T.i32())
+
+ @FuncOp.from_py_func(tensor_ty, scalar_ty, tensor_ty)
+ def sequence(A, F, C):
+ npu_dma_memcpy_nd(metadata="out", bd_id=0, mem=C, sizes=[1, 1, 1, 4096])
+ npu_dma_memcpy_nd(metadata="in", bd_id=1, mem=A, sizes=[1, 1, 1, 4096])
+ npu_dma_memcpy_nd(metadata="infactor", bd_id=2, mem=F, sizes=[1, 1, 1, 1])
+ npu_sync(column=0, row=0, direction=0, channel=0)
+```
+
+Finally, we need to configure how the compute core accesses the data moved to its L1 memory, in objectFIFO terminology: we need to program the acquire and release patterns of "of_in", "of_factor" and "of_out". Only a single factor is needed for the complete 4096-element vector, while for every processing iteration on a sub-vector, we need to acquire an object of 1024 integers to read from "of_in" and one similarly sized object from "of_out". Then we call our previously declared external function with the acquired objects as operands. After the vector scalar operation, we release both objects back to their respective "of_in" and "of_out" objectFIFOs. After the 4 sub-vector iterations, we release the "of_factor" objectFIFO.
+
+This access and execute pattern runs on the AIE compute core `ComputeTile2` and needs to be linked against the precompiled external function "scale.o". We run this pattern in a very large loop to enable enqueuing multiple rounds of vector scalar multiply work from the host code.
+
+```python
+ @core(ComputeTile2, "scale.o")
+ def core_body():
+ # Effective while(1)
+ for _ in for_(sys.maxsize):
+ elem_factor = of_factor.acquire(ObjectFifoPort.Consume, 1)
+ # Number of sub-vector "tile" iterations
+ for _ in for_(4):
+ elem_out = of_out.acquire(ObjectFifoPort.Produce, 1)
+ elem_in = of_in.acquire(ObjectFifoPort.Consume, 1)
+ call(scale_scalar, [elem_in, elem_out, elem_factor, 1024])
+ of_in.release(ObjectFifoPort.Consume, 1)
+ of_out.release(ObjectFifoPort.Produce, 1)
+ yield_([])
+ of_factor.release(ObjectFifoPort.Consume, 1)
+ yield_([])
+```
+
+## Kernel Code
+
+We can program the AIE compute core using C++ code and compile it with `xchesscc` into a kernel object file. In this section, we will use a generic implementation of the vector scalar multiplication that can run on the scalar processor part of the AIE. The `vector_scalar_mul_aie_scalar` function processes one data element at a time, taking advantage of the AIE scalar datapath to load, multiply and store data elements.
+
+```c
+void vector_scalar_mul_aie_scalar(int32_t *a, int32_t *c,
+                                  int32_t *factor, int32_t N) {
+ for (int i = 0; i < N; i++) {
+ c[i] = *factor * a[i];
+ }
+}
+```
+
+[Section-4](../section-4/) will introduce how to exploit the compute dense vector processor.
+
+Note that since the scalar factor is communicated through an object, it is provided as an array of size one to the C++ kernel code and hence needs to be dereferenced.
+
+## Host Code
+
+The host code acts as the environment setup and testbench for the Vector Scalar Multiplication design example. The code is responsible for loading the compiled XCLBIN file, configuring the AIE module, providing input data, and kicking off the execution of the AIE design on the NPU. After running, it verifies the results and optionally outputs trace data. Both a C++ [test.cpp](./test.cpp) and a Python [test.py](./test.py) variant of this code are available.
+
+For convenience, a set of test utilities supports common elements of command line parsing, XRT-based environment setup, and testbench functionality: [test_utils.h](../../runtime_lib/test_lib/test_utils.h) or [test.py](../../python/utils/test.py).
+
+The host code contains the following elements:
+
+1. *Parse program arguments and set up constants*: the host code typically takes at least the following arguments: `-x` the XCLBIN file, `-k` the kernel name (with default name "MLIR_AIE"), and `-i` the instruction sequence file. It is the host code's task to load those files and set the kernel name. Both the XCLBIN and the instruction sequence are generated when compiling the AIE-array structural description and kernel code with `aiecc.py`.
+
+1. *Read instruction sequence*: load the instruction sequence from the specified file into memory
+
+1. *Create XRT environment*: so that we can use the XRT runtime
+
+1. *Create XRT buffer objects* for the instruction sequence, inputs (vector a and factor) and output (vector c). Note that the `kernel.group_id()` needs to match the order of the `def sequence(A, F, C):` arguments in the to/from AIE-array data movement part of the Python structural description, starting with ID number 2 for the first sequence argument and incrementing by 1 from there (see the sketch after this list).
+
+1. *Initialize and synchronize*: host to device XRT buffer objects
+
+1. *Run on AIE and synchronize*: Execute the kernel, wait to finish, and synchronize device to host XRT buffer objects
+
+1. *Run testbench checks*: Compare device results to reference and report test status
+
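+For instance, the Python variant maps the `sequence(A, F, C)` arguments to XRT buffer objects as in this condensed sketch of what [test.py](./test.py) does:
+```python
+bo_instr = xrt.bo(device, len(instr_v) * 4, xrt.bo.cacheable, kernel.group_id(0))
+bo_a = xrt.bo(device, 4096 * 4, xrt.bo.host_only, kernel.group_id(2))  # A -> group_id 2
+bo_f = xrt.bo(device, 1 * 4, xrt.bo.host_only, kernel.group_id(3))     # F -> group_id 3
+bo_c = xrt.bo(device, 4096 * 4, xrt.bo.host_only, kernel.group_id(4))  # C -> group_id 4
+```
+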
+
+## Running the Program
+
+To compile the design and C++ testbench:
+
+```sh
+make
+make vectorScalar.exe
+```
+
+To run the design:
+
+```sh
+make run
+```
+
+### Python Testbench
+
+To compile the design and run the Python testbench:
+
+```sh
+make
+```
+
+To run the design:
+
+```sh
+make run_py
+```
+-----
+[[Prev - Section 2](../section-2/)] [[Top](..)] [[Next - Section 4](../section-4/)]
diff --git a/programming_guide/section-3/aie2.py b/programming_guide/section-3/aie2.py
new file mode 100644
index 0000000000..b09f9d0637
--- /dev/null
+++ b/programming_guide/section-3/aie2.py
@@ -0,0 +1,77 @@
+#
+# This file is licensed under the Apache License v2.0 with LLVM Exceptions.
+# See https://llvm.org/LICENSE.txt for license information.
+# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+#
+# (c) Copyright 2023 AMD Inc.
+
+import sys
+
+from aie.dialects.aie import *
+from aie.dialects.aiex import *
+from aie.dialects.scf import *
+from aie.extras.context import mlir_mod_ctx
+
+import aie.utils.trace as trace_utils
+
+
+def my_vector_scalar():
+
+ @device(AIEDevice.npu)
+ def device_body():
+ memRef_ty = T.memref(1024, T.i32())
+
+ # AIE Core Function declarations
+ scale_scalar = external_func(
+ "vector_scalar_mul_aie_scalar",
+ inputs=[memRef_ty, memRef_ty, T.memref(1, T.i32()), T.i32()],
+ )
+
+ # Tile declarations
+ ShimTile = tile(0, 0)
+ ComputeTile2 = tile(0, 2)
+
+ # AIE-array data movement with object fifos
+ of_in = object_fifo("in", ShimTile, ComputeTile2, 2, memRef_ty)
+ of_factor = object_fifo(
+ "infactor", ShimTile, ComputeTile2, 2, T.memref(1, T.i32())
+ )
+ of_out = object_fifo("out", ComputeTile2, ShimTile, 2, memRef_ty)
+
+ # Set up compute tiles
+ # Compute tile 2
+ @core(ComputeTile2, "scale.o")
+ def core_body():
+ # Effective while(1)
+ for _ in for_(sys.maxsize):
+ elem_factor = of_factor.acquire(ObjectFifoPort.Consume, 1)
+ # Number of sub-vector "tile" iterations
+ for _ in for_(4):
+ elem_out = of_out.acquire(ObjectFifoPort.Produce, 1)
+ elem_in = of_in.acquire(ObjectFifoPort.Consume, 1)
+ call(scale_scalar, [elem_in, elem_out, elem_factor, 1024])
+ of_in.release(ObjectFifoPort.Consume, 1)
+ of_out.release(ObjectFifoPort.Produce, 1)
+ yield_([])
+ of_factor.release(ObjectFifoPort.Consume, 1)
+ yield_([])
+
+ # To/from AIE-array data movement
+ tensor_ty = T.memref(4096, T.i32())
+ scalar_ty = T.memref(1, T.i32())
+
+ @FuncOp.from_py_func(tensor_ty, scalar_ty, tensor_ty)
+ def sequence(A, F, C):
+ npu_dma_memcpy_nd(metadata="out", bd_id=0, mem=C, sizes=[1, 1, 1, 4096])
+ npu_dma_memcpy_nd(metadata="in", bd_id=1, mem=A, sizes=[1, 1, 1, 4096])
+ npu_dma_memcpy_nd(metadata="infactor", bd_id=2, mem=F, sizes=[1, 1, 1, 1])
+ npu_sync(column=0, row=0, direction=0, channel=0)
+
+
+with mlir_mod_ctx() as ctx:
+ my_vector_scalar()
+ res = ctx.module.operation.verify()
+ if res == True:
+ print(ctx.module)
+ else:
+ print(res)
diff --git a/programming_guide/section-3/test.cpp b/programming_guide/section-3/test.cpp
new file mode 100644
index 0000000000..c5690e127d
--- /dev/null
+++ b/programming_guide/section-3/test.cpp
@@ -0,0 +1,139 @@
+//===- test.cpp -------------------------------------------------*- C++ -*-===//
+//
+// This file is licensed under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+// Copyright (C) 2023, Advanced Micro Devices, Inc.
+//
+//===----------------------------------------------------------------------===//
+
+#include <boost/program_options.hpp>
+#include <cstdint>
+#include <cstring>
+#include <iostream>
+#include <vector>
+
+#include "test_utils.h"
+#include "xrt/xrt_bo.h"
+
+#ifndef DATATYPES_USING_DEFINED
+#define DATATYPES_USING_DEFINED
+using DATATYPE = std::uint32_t; // Configure this to match your buffer data type
+#endif
+
+const int scaleFactor = 3;
+
+namespace po = boost::program_options;
+
+int main(int argc, const char *argv[]) {
+
+ // Program arguments parsing
+ po::options_description desc("Allowed options");
+ po::variables_map vm;
+ test_utils::add_default_options(desc);
+
+ test_utils::parse_options(argc, argv, desc, vm);
+ int verbosity = vm["verbosity"].as();
+ int trace_size = vm["trace_sz"].as();
+
+ constexpr bool VERIFY = true;
+ constexpr bool ENABLE_TRACING = false;
+ // constexpr int TRACE_SIZE = 8192;
+ constexpr int IN_SIZE = 4096;
+  int OUT_SIZE = ENABLE_TRACING ? IN_SIZE + trace_size / 4 : IN_SIZE;
+
+ // Load instruction sequence
+  std::vector<uint32_t> instr_v =
+      test_utils::load_instr_sequence(vm["instr"].as<std::string>());
+
+ if (verbosity >= 1)
+ std::cout << "Sequence instr count: " << instr_v.size() << "\n";
+
+ // Start the XRT context and load the kernel
+ xrt::device device;
+ xrt::kernel kernel;
+
+  test_utils::init_xrt_load_kernel(device, kernel, verbosity,
+                                   vm["xclbin"].as<std::string>(),
+                                   vm["kernel"].as<std::string>());
+
+ // set up the buffer objects
+ auto bo_instr = xrt::bo(device, instr_v.size() * sizeof(int),
+ XCL_BO_FLAGS_CACHEABLE, kernel.group_id(0));
+ auto bo_inA = xrt::bo(device, IN_SIZE * sizeof(DATATYPE),
+ XRT_BO_FLAGS_HOST_ONLY, kernel.group_id(2));
+ auto bo_inFactor = xrt::bo(device, 1 * sizeof(DATATYPE),
+ XRT_BO_FLAGS_HOST_ONLY, kernel.group_id(3));
+ auto bo_outC = xrt::bo(device, OUT_SIZE * sizeof(DATATYPE) + trace_size,
+ XRT_BO_FLAGS_HOST_ONLY, kernel.group_id(4));
+
+ if (verbosity >= 1)
+ std::cout << "Writing data into buffer objects.\n";
+
+ // Copy instruction stream to xrt buffer object
+  void *bufInstr = bo_instr.map<void *>();
+ memcpy(bufInstr, instr_v.data(), instr_v.size() * sizeof(int));
+
+ // Initialize buffer bo_inA
+  DATATYPE *bufInA = bo_inA.map<DATATYPE *>();
+ for (int i = 0; i < IN_SIZE; i++)
+ bufInA[i] = i + 1;
+
+ // Initialize buffer bo_inFactor
+  DATATYPE *bufInFactor = bo_inFactor.map<DATATYPE *>();
+ *bufInFactor = scaleFactor;
+
+ // Zero out buffer bo_outC
+  DATATYPE *bufOut = bo_outC.map<DATATYPE *>();
+ memset(bufOut, 0, OUT_SIZE * sizeof(DATATYPE) + trace_size);
+
+ // sync host to device memories
+ bo_instr.sync(XCL_BO_SYNC_BO_TO_DEVICE);
+ bo_inA.sync(XCL_BO_SYNC_BO_TO_DEVICE);
+ bo_inFactor.sync(XCL_BO_SYNC_BO_TO_DEVICE);
+ bo_outC.sync(XCL_BO_SYNC_BO_TO_DEVICE);
+
+ // Execute the kernel and wait to finish
+ if (verbosity >= 1)
+ std::cout << "Running Kernel.\n";
+ auto run = kernel(bo_instr, instr_v.size(), bo_inA, bo_inFactor, bo_outC);
+ run.wait();
+
+ // Sync device to host memories
+ bo_outC.sync(XCL_BO_SYNC_BO_FROM_DEVICE);
+
+ // Compare out to golden
+ int errors = 0;
+ if (verbosity >= 1) {
+ std::cout << "Verifying results ..." << std::endl;
+ }
+ for (uint32_t i = 0; i < IN_SIZE; i++) {
+ int32_t ref = bufInA[i] * scaleFactor;
+ int32_t test = bufOut[i];
+ if (test != ref) {
+ if (verbosity >= 1)
+ std::cout << "Error in output " << test << " != " << ref << std::endl;
+ errors++;
+ } else {
+ if (verbosity >= 1)
+ std::cout << "Correct output " << test << " == " << ref << std::endl;
+ }
+ }
+
+ if (trace_size > 0) {
+ test_utils::write_out_trace(((char *)bufOut) + (IN_SIZE * sizeof(DATATYPE)),
+ trace_size, vm["trace_file"].as());
+ }
+
+ // Print Pass/Fail result of our test
+ if (!errors) {
+ std::cout << std::endl << "PASS!" << std::endl << std::endl;
+ return 0;
+ } else {
+ std::cout << std::endl
+ << errors << " mismatches." << std::endl
+ << std::endl;
+ std::cout << std::endl << "fail." << std::endl << std::endl;
+ return 1;
+ }
+}
diff --git a/programming_guide/section-3/test.py b/programming_guide/section-3/test.py
new file mode 100644
index 0000000000..bfdc33cbea
--- /dev/null
+++ b/programming_guide/section-3/test.py
@@ -0,0 +1,123 @@
+# test.py -*- Python -*-
+#
+# Copyright (C) 2024, Advanced Micro Devices, Inc. All rights reserved.
+# SPDX-License-Identifier: MIT
+
+import numpy as np
+import pyxrt as xrt
+import sys
+import time
+
+from aie.dialects.aie import *
+from aie.dialects.aiex import *
+from aie.dialects.scf import *
+from aie.extras.context import mlir_mod_ctx
+from aie.extras.dialects.ext import memref, arith
+
+import aie.utils.test as test_utils
+import aie.utils.trace as trace_utils
+
+
+def main(opts):
+
+ # Load instruction sequence
+ with open(opts.instr, "r") as f:
+ instr_text = f.read().split("\n")
+ instr_text = [l for l in instr_text if l != ""]
+ instr_v = np.array([int(i, 16) for i in instr_text], dtype=np.uint32)
+
+ # ------------------------------------------------------------
+ # Configure this to match your design's buffer size and type
+ # ------------------------------------------------------------
+    INOUT0_VOLUME = int(4096)  # Input only, 4096x int32_t in this example
+    INOUT1_VOLUME = int(1)  # Input only, 1 int32_t scale factor
+    INOUT2_VOLUME = int(4096)  # Output only, 4096x int32_t in this example
+
+ INOUT0_DATATYPE = np.int32
+ INOUT1_DATATYPE = np.int32
+ INOUT2_DATATYPE = np.int32
+
+ INOUT0_SIZE = INOUT0_VOLUME * INOUT0_DATATYPE().itemsize
+ INOUT1_SIZE = INOUT1_VOLUME * INOUT1_DATATYPE().itemsize
+ INOUT2_SIZE = INOUT2_VOLUME * INOUT2_DATATYPE().itemsize
+
+ OUT_SIZE = INOUT2_SIZE + int(opts.trace_size)
+
+ # ------------------------------------------------------
+ # Get device, load the xclbin & kernel and register them
+ # ------------------------------------------------------
+ (device, kernel) = test_utils.init_xrt_load_kernel(opts)
+
+ # ------------------------------------------------------
+ # Initialize input/ output buffer sizes and sync them
+ # ------------------------------------------------------
+ bo_instr = xrt.bo(device, len(instr_v) * 4, xrt.bo.cacheable, kernel.group_id(0))
+ bo_inout0 = xrt.bo(device, INOUT0_SIZE, xrt.bo.host_only, kernel.group_id(2))
+ bo_inout1 = xrt.bo(device, INOUT1_SIZE, xrt.bo.host_only, kernel.group_id(3))
+ bo_inout2 = xrt.bo(device, OUT_SIZE, xrt.bo.host_only, kernel.group_id(4))
+
+ # Initialize instruction buffer
+ bo_instr.write(instr_v, 0)
+
+ # Initialize data buffers
+ inout0 = np.arange(1, INOUT0_VOLUME + 1, dtype=INOUT0_DATATYPE)
+ scale_factor = np.array([3], dtype=INOUT1_DATATYPE)
+ inout2 = np.zeros(OUT_SIZE, dtype=np.uint8)
+ bo_inout0.write(inout0, 0)
+ bo_inout1.write(scale_factor, 0)
+ bo_inout2.write(inout2, 0)
+
+ # Sync buffers to update input buffer values
+ bo_instr.sync(xrt.xclBOSyncDirection.XCL_BO_SYNC_BO_TO_DEVICE)
+ bo_inout0.sync(xrt.xclBOSyncDirection.XCL_BO_SYNC_BO_TO_DEVICE)
+ bo_inout1.sync(xrt.xclBOSyncDirection.XCL_BO_SYNC_BO_TO_DEVICE)
+ bo_inout2.sync(xrt.xclBOSyncDirection.XCL_BO_SYNC_BO_TO_DEVICE)
+
+ # ------------------------------------------------------
+ # Initialize run configs
+ # ------------------------------------------------------
+ errors = 0
+
+ # ------------------------------------------------------
+ # Main run loop
+ # ------------------------------------------------------
+
+ # Run kernel
+ if opts.verbosity >= 1:
+ print("Running Kernel.")
+ h = kernel(bo_instr, len(instr_v), bo_inout0, bo_inout1, bo_inout2)
+ h.wait()
+ bo_inout2.sync(xrt.xclBOSyncDirection.XCL_BO_SYNC_BO_FROM_DEVICE)
+
+ # Copy output results and verify they are correct
+ entire_buffer = bo_inout2.read(OUT_SIZE, 0).view(np.uint32)
+ output_buffer = entire_buffer[:INOUT2_VOLUME]
+ if opts.verify:
+ if opts.verbosity >= 1:
+ print("Verifying results ...")
+ ref = np.arange(1, INOUT0_VOLUME + 1, dtype=INOUT0_DATATYPE) * scale_factor
+ e = np.equal(output_buffer, ref)
+ errors = errors + np.size(e) - np.count_nonzero(e)
+
+ # Write trace values if trace_size > 0
+ if opts.trace_size > 0:
+ trace_buffer = entire_buffer[INOUT2_VOLUME:]
+ trace_utils.write_out_trace(trace_buffer, str(opts.trace_file))
+
+ # ------------------------------------------------------
+ # Print verification and timing results
+ # ------------------------------------------------------
+
+ if not errors:
+ print("\nPASS!\n")
+ exit(0)
+ else:
+ print("\nError count: ", errors)
+ print("\nFailed.\n")
+ exit(-1)
+
+
+if __name__ == "__main__":
+ p = test_utils.create_default_argparser()
+ opts = p.parse_args(sys.argv[1:])
+ main(opts)
diff --git a/programming_guide/section-3/vector_scalar_mul.cc b/programming_guide/section-3/vector_scalar_mul.cc
new file mode 100755
index 0000000000..10c0aecbbc
--- /dev/null
+++ b/programming_guide/section-3/vector_scalar_mul.cc
@@ -0,0 +1,25 @@
+//===- vector_scalar_mul.cc -------------------------------------*- C++ -*-===//
+//
+// This file is licensed under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+// Copyright (C) 2024, Advanced Micro Devices, Inc.
+//
+//===----------------------------------------------------------------------===//
+
+#include <stdint.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+
+extern "C" {
+
+void vector_scalar_mul_aie_scalar(int32_t *a, int32_t *c, int32_t *factor,
+ int32_t N) {
+ for (int i = 0; i < N; i++) {
+ c[i] = *factor * a[i];
+ }
+}
+
+} // extern "C"
diff --git a/programming_guide/section-4/CMakeLists.txt b/programming_guide/section-4/CMakeLists.txt
new file mode 100644
index 0000000000..6b330f21c1
--- /dev/null
+++ b/programming_guide/section-4/CMakeLists.txt
@@ -0,0 +1,70 @@
+# This file is licensed under the Apache License v2.0 with LLVM Exceptions.
+# See https://llvm.org/LICENSE.txt for license information.
+# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+#
+# (c) Copyright 2024 Advanced Micro Devices, Inc.
+
+# parameters
+# -DBOOST_ROOT: Path to Boost install
+# -DXRT_INC_DIR: Full path to src/runtime_src/core/include in XRT cloned repo
+# -DXRT_LIB_DIR: Path to xrt_coreutil.lib
+# -DTARGET_NAME: Target name to be built
+
+# cmake needs this line
+cmake_minimum_required(VERSION 3.1)
+
+find_program(WSL NAMES powershell.exe)
+
+if (NOT WSL)
+ set(BOOST_ROOT /usr/include/boost CACHE STRING "Path to Boost install")
+ set(XRT_INC_DIR /opt/xilinx/xrt/include CACHE STRING "Path to XRT cloned repo")
+ set(XRT_LIB_DIR /opt/xilinx/xrt/lib CACHE STRING "Path to xrt_coreutil.lib")
+else()
+ set(BOOST_ROOT C:/Technical/thirdParty/boost_1_83_0 CACHE STRING "Path to Boost install")
+ set(XRT_INC_DIR C:/Technical/XRT/src/runtime_src/core/include CACHE STRING "Path to XRT cloned repo")
+ set(XRT_LIB_DIR C:/Technical/xrtNPUfromDLL CACHE STRING "Path to xrt_coreutil.lib")
+endif()
+
+set(TARGET_NAME test CACHE STRING "Target to be built")
+
+SET (ProjectName ${TARGET_NAME})
+SET (currentTarget ${TARGET_NAME})
+
+if ( WSL )
+ set(CMAKE_RUNTIME_OUTPUT_DIRECTORY_RELEASE ${CMAKE_BINARY_DIR})
+endif ()
+
+project(${ProjectName})
+
+# Find packages
+find_package(Boost REQUIRED)
+
+add_executable(${currentTarget}
+ ${CMAKE_CURRENT_SOURCE_DIR}/../../runtime_lib/test_lib/test_utils.cpp
+ test.cpp
+)
+
+target_compile_definitions(${currentTarget} PUBLIC DISABLE_ABI_CHECK=1)
+
+target_include_directories (${currentTarget} PUBLIC
+ ${XRT_INC_DIR}
+ ${Boost_INCLUDE_DIRS}
+ ${CMAKE_CURRENT_SOURCE_DIR}/../../runtime_lib/test_lib
+)
+
+target_link_directories(${currentTarget} PUBLIC
+ ${XRT_LIB_DIR}
+ ${Boost_LIBRARY_DIRS}
+)
+
+if (NOT WSL)
+ target_link_libraries(${currentTarget} PUBLIC
+ xrt_coreutil
+ boost_program_options
+ boost_filesystem
+ )
+else()
+ target_link_libraries(${currentTarget} PUBLIC
+ xrt_coreutil
+ )
+endif()
diff --git a/programming_guide/section-4/README.md b/programming_guide/section-4/README.md
new file mode 100644
index 0000000000..e4a787afae
--- /dev/null
+++ b/programming_guide/section-4/README.md
@@ -0,0 +1,22 @@
+
+
+# Section 4 - Vector Programming & Performance Measurement
+
+Now that you've had a chance to walk through the components of compiling and running a program on the Ryzen AI hardware in [section-3](../section-3), we will start looking at how we measure performance and utilize vector programming techniques to fully leverage the power of the AI Engines for parallel compute.
+
+It's helpful to first examine performance measurement before we delve into vector programming in order to get a baseline for where our application performance is. There are many factors that contribute to performance, including latency, throughput and power efficiency. Performance measurement is an active area of research aiming to provide more powerful tools for users to measure the speedup of their application on AIEs. In [section-4a](./section-4a) and [section-4b](./section-4b/), we look at performance from the perspective of timers and trace. Then in [section-4c](./section-4c), we look more closely at how to vectorize AIE kernel code.
+
+* [Section 4a - Timers](./section-4a)
+* [Section 4b - Trace](./section-4b)
+* [Section 4c - Kernel vectorization and optimization](./section-4c)
+
+-----
+[[Prev - Section 3](../section-3/)] [[Top](..)] [[Next - Section 5](../section-5/)]
\ No newline at end of file
diff --git a/programming_guide/section-4/aie2.py b/programming_guide/section-4/aie2.py
new file mode 100644
index 0000000000..4231179c36
--- /dev/null
+++ b/programming_guide/section-4/aie2.py
@@ -0,0 +1,74 @@
+#
+# This file is licensed under the Apache License v2.0 with LLVM Exceptions.
+# See https://llvm.org/LICENSE.txt for license information.
+# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+#
+# (c) Copyright 2023 AMD Inc.
+
+from aie.dialects.aie import * # primary mlir-aie dialect definitions
+from aie.extras.context import mlir_mod_ctx # mlir-aie context
+
+from aie.dialects.aiex import * # extended mlir-aie dialect definitions
+from aie.dialects.scf import *  # scf (structured control flow) dialect
+from aie.extras.dialects.ext import memref, arith  # memref and arithmetic dialects
+
+
+# AI Engine structural design function
+def my_first_aie_program():
+
+    # Device declaration - aie2 device NPU
+ @device(AIEDevice.npu)
+ def device_body():
+ # Memref types
+ memRef_8_ty = T.memref(8, T.i32())
+ memRef_16_ty = T.memref(16, T.i32())
+ memRef_32_ty = T.memref(32, T.i32())
+ memRef_64_ty = T.memref(64, T.i32())
+
+ # Tile declarations
+ ComputeTile = tile(0, 2)
+ ShimTile = tile(0, 0)
+
+ # Data movement with object FIFOs
+ # Input (from shim tile to compute tile)
+ of_in0 = object_fifo("in0", ShimTile, ComputeTile, 2, memRef_8_ty)
+
+ # Output (from compute tile to shim tile)
+ of_out0 = object_fifo("out0", ComputeTile, ShimTile, 2, memRef_8_ty)
+
+ # Compute tile body
+ @core(ComputeTile)
+ def core_body():
+ for _ in for_(8):
+ # Acquire input and output object FIFO objects
+ elem_in = of_in0.acquire(ObjectFifoPort.Consume, 1)
+ elem_out = of_out0.acquire(ObjectFifoPort.Produce, 1)
+
+ # Core functionality - load, add 1, store
+ for i in for_(8):
+ v0 = memref.load(elem_in, [i])
+ v1 = arith.addi(v0, arith.constant(1, T.i32()))
+ memref.store(v1, elem_out, [i])
+ yield_([])
+
+ # Release input and output object FIFO objects
+ of_in0.release(ObjectFifoPort.Consume, 1)
+ of_out0.release(ObjectFifoPort.Produce, 1)
+ yield_([])
+
+ # To/from AIE-array data movement
+ @FuncOp.from_py_func(memRef_64_ty, memRef_64_ty, memRef_64_ty)
+ def sequence(inTensor, unused, outTensor):
+ npu_dma_memcpy_nd(
+ metadata="out0", bd_id=0, mem=outTensor, sizes=[1, 1, 1, 64]
+ )
+ npu_dma_memcpy_nd(
+ metadata="in0", bd_id=1, mem=inTensor, sizes=[1, 1, 1, 64]
+ )
+ npu_sync(column=0, row=0, direction=0, channel=0)
+
+
+# Declares that subsequent code is in mlir-aie context
+with mlir_mod_ctx() as ctx:
+ my_first_aie_program() # Call design function within the mlir-aie context
+ print(ctx.module) # Print the python-to-mlir conversion
diff --git a/programming_guide/section-4/section-4a/CMakeLists.txt b/programming_guide/section-4/section-4a/CMakeLists.txt
new file mode 100644
index 0000000000..d299de85d7
--- /dev/null
+++ b/programming_guide/section-4/section-4a/CMakeLists.txt
@@ -0,0 +1,72 @@
+# This file is licensed under the Apache License v2.0 with LLVM Exceptions.
+# See https://llvm.org/LICENSE.txt for license information.
+# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+#
+# (c) Copyright 2023 Advanced Micro Devices, Inc.
+
+# parameters
+# -DBOOST_ROOT: Path to Boost install
+# -DXRT_INC_DIR: Full path to src/runtime_src/core/include in XRT cloned repo
+# -DXRT_LIB_DIR: Path to xrt_coreutil.lib
+# -DTARGET_NAME: Target name to be built
+
+# cmake needs this line
+cmake_minimum_required(VERSION 3.1)
+
+find_program(WSL NAMES powershell.exe)
+
+if (NOT WSL)
+ set(BOOST_ROOT /usr/include/boost CACHE STRING "Path to Boost install")
+ set(XRT_INC_DIR /opt/xilinx/xrt/include CACHE STRING "Path to XRT cloned repo")
+ set(XRT_LIB_DIR /opt/xilinx/xrt/lib CACHE STRING "Path to xrt_coreutil.lib")
+else()
+ set(BOOST_ROOT C:/Technical/thirdParty/boost_1_83_0 CACHE STRING "Path to Boost install")
+ set(XRT_INC_DIR C:/Technical/XRT/src/runtime_src/core/include CACHE STRING "Path to XRT cloned repo")
+ set(XRT_LIB_DIR C:/Technical/xrtNPUfromDLL CACHE STRING "Path to xrt_coreutil.lib")
+endif()
+
+set(TARGET_NAME test CACHE STRING "Target to be built")
+
+SET (ProjectName ${TARGET_NAME})
+SET (currentTarget ${TARGET_NAME})
+
+if ( WSL )
+ set(CMAKE_RUNTIME_OUTPUT_DIRECTORY_RELEASE ${CMAKE_BINARY_DIR})
+endif ()
+
+project(${ProjectName})
+
+# Find packages
+find_package(Boost REQUIRED)
+
+add_executable(${currentTarget}
+ ${CMAKE_CURRENT_SOURCE_DIR}/../../../runtime_lib/test_lib/test_utils.cpp
+ test.cpp
+)
+
+target_compile_definitions(${currentTarget} PUBLIC DISABLE_ABI_CHECK=1)
+
+target_include_directories (${currentTarget} PUBLIC
+ ../../utils
+ ${CMAKE_CURRENT_SOURCE_DIR}/../../../runtime_lib/test_lib
+ ${XRT_INC_DIR}
+ ${Boost_INCLUDE_DIRS}
+ ${CMAKE_CURRENT_SOURCE_DIR}/../../../runtime_lib/test_lib
+)
+
+target_link_directories(${currentTarget} PUBLIC
+ ${XRT_LIB_DIR}
+ ${Boost_LIBRARY_DIRS}
+)
+
+if (NOT WSL)
+ target_link_libraries(${currentTarget} PUBLIC
+ xrt_coreutil
+ boost_program_options
+ boost_filesystem
+ )
+else()
+ target_link_libraries(${currentTarget} PUBLIC
+ xrt_coreutil
+ )
+endif()
\ No newline at end of file
diff --git a/programming_guide/section-4/section-4a/Makefile b/programming_guide/section-4/section-4a/Makefile
new file mode 100644
index 0000000000..ee28c567c4
--- /dev/null
+++ b/programming_guide/section-4/section-4a/Makefile
@@ -0,0 +1,48 @@
+##===- Makefile -----------------------------------------------------------===##
+#
+# This file licensed under the Apache License v2.0 with LLVM Exceptions.
+# See https://llvm.org/LICENSE.txt for license information.
+# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+#
+##===----------------------------------------------------------------------===##
+
+include ../../../programming_examples/makefile-common
+
+all: build/final.xclbin
+
+targetname = myFirstProgram
+
+build/aie.mlir: aie2.py
+ mkdir -p ${@D}
+ python3 $< > $@
+
+build/final.xclbin: build/aie.mlir
+	mkdir -p ${@D}
+	cd ${@D} && aiecc.py --aie-generate-cdo --aie-generate-npu --no-compile-host \
+		--xclbin-name=${@F} --npu-insts-name=insts.txt $(<:%=../%)
diff --git a/programming_guide/section-4/section-4a/README.md b/programming_guide/section-4/section-4a/README.md
new file mode 100644
--- /dev/null
+++ b/programming_guide/section-4/section-4a/README.md
+
+# Section 4a - Timers
+
+* [Section 4 - Vector Programming & Performance Measurement](../../section-4)
+ * Section 4a - Timers
+ * [Section 4b - Trace](../section-4b)
+ * [Section 4c - Kernel Vectorization and Optimization](../section-4c)
+
+-----
+
+We begin by first looking at timers for measuring application performance and what that tells us. The performance of an accelerated AI Engine application involves a number of components on the software stack, from invoking the application at the OS level, to passing control on to the kernel drivers, moving and dispatching work to the AIE array, running the accelerated application on AIE cores, and finally returning the data to the application for next-step processing. The most straightforward way to capture the performance of this entire stack of communication and processing is with an application timer, also known as the "wall clock" time. This gives us an upper bound for how long an AIE accelerated application takes, but adds to it the OS and kernel driver overhead. That overhead can be amortized by running multiple iterations of an accelerated program or running a sufficiently compute intensive application. Let's take a look at how we add the "wall clock" timer to an example program.
+
+## Application timer - Modifying [test.cpp](./test.cpp)
+Adding the application timer is as simple as noting a start and stop time surrounding the call of the kernel function. We can use the clock timer from the chrono library, which is included via `#include <chrono>`, though it may already be included by other libraries (in our `test.cpp`, this is the case). Then we record the start and stop time of our chrono timer with function calls surrounding our kernel function call like the following:
+
+```c++
+ auto start = std::chrono::high_resolution_clock::now();
+ auto run = kernel(bo_instr, instr_v.size(), bo_inout0, bo_inout1, bo_inout2);
+ run.wait();
+ auto stop = std::chrono::high_resolution_clock::now();
+
+    float npu_time = std::chrono::duration_cast<std::chrono::microseconds>(stop - start).count();
+ std::cout << "NPU time: " << npu_time << "us." << std::endl;
+```
+This provides us with a good baseline for how long our accelerated kernel function takes.
+
+## Multiple iterations
+A timer for a single kernel function call is a useful starting data point for understanding performance, but there can be a lot of variability and overhead in a single call that is smoothed out when run multiple times. In order to benchmark the steady-state kernel run time, we can add code around our kernel call to execute it multiple times and capture the minimum, maximum and average time that our kernel takes.
+
+In our example [test.cpp](./test.cpp), we wrap our calls within a for loop (over `num_iter`, the total number of iterations).
+
+```c++
+ unsigned num_iter = n_iterations + n_warmup_iterations;
+ for (unsigned iter = 0; iter < num_iter; iter++) {
+ <... kernel run code ...>
+ }
+```
+It is also useful to run the kernel a number of times prior to recording the steady-state average. This hides initial startup timing overhead that sometimes occurs during the first few runs. We call these initial loops warmup iterations; they skip result verification and do not count toward the measured kernel function time.
+```c++
+ for (unsigned iter = 0; iter < num_iter; iter++) {
+ <... kernel run code ...>
+ if (iter < n_warmup_iterations) {
+ /* Warmup iterations do not count towards average runtime. */
+ continue;
+ }
+ <... verify and measure timers ...>
+ }
+```
+Finally, we accumulate relevant timer data to calculate and track average, minimum, and maximum times.
+```c++
+ for (unsigned iter = 0; iter < num_iter; iter++) {
+ <... kernel run code, warmup conditional, verify, and measure timers ...>
+ npu_time_total += npu_time;
+ npu_time_min = (npu_time < npu_time_min) ? npu_time : npu_time_min;
+ npu_time_max = (npu_time > npu_time_max) ? npu_time : npu_time_max;
+ }
+```
+We can then compute and print the actual average, minimum and maximum run times.
+```c++
+ std::cout << "Avg NPU time: " << npu_time_total / n_iterations << "us." << std::endl;
+ std::cout << "Min NPU time: " << npu_time_min << "us." << std::endl;
+ std::cout << "Max NPU time: " << npu_time_max << "us." << std::endl;
+```
+
+## Exercises
+1. Take a look at the timer code in our example [test.cpp](./test.cpp). Then build and run the design by calling `make; make run` and note the reported average "wall clock" time. What value did you see?
+
+1. Our design was run once with a single iteration and no warmup. Let's run our design again by calling `make run` again. What reported Avg NPU time did you see this time?
+
+1. Let's set our iterations to 10 and run again with `make run` which recompiles our host code for `test.cpp`. What reported Avg NPU time do you see this time?
+
+1. Let's change our design and increase the loop size of our kernel by a factor of 10. This involves changing the outer loop from 8 to 80. What reported times do you see now?
+
+
+-----
+[[Up]](../../section-4) [[Next]](../section-4b)
+
diff --git a/programming_guide/section-4/section-4a/aie2.py b/programming_guide/section-4/section-4a/aie2.py
new file mode 100644
index 0000000000..3e1f7e59ab
--- /dev/null
+++ b/programming_guide/section-4/section-4a/aie2.py
@@ -0,0 +1,79 @@
+#
+# This file is licensed under the Apache License v2.0 with LLVM Exceptions.
+# See https://llvm.org/LICENSE.txt for license information.
+# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+#
+# (c) Copyright 2023 AMD Inc.
+
+from aie.dialects.aie import * # primary mlir-aie dialect definitions
+from aie.extras.context import mlir_mod_ctx # mlir-aie context
+
+from aie.dialects.aiex import * # extended mlir-aie dialect definitions
+from aie.dialects.scf import *  # scf (structured control flow) dialect
+from aie.extras.dialects.ext import memref, arith  # memref and arithmetic dialects
+
+
+# AI Engine structural design function
+def my_first_aie_program():
+
+    # Device declaration - aie2 device NPU
+ @device(AIEDevice.npu)
+ def device_body():
+ # Memref types
+ memRef_8_ty = T.memref(8, T.i32())
+ memRef_16_ty = T.memref(16, T.i32())
+ memRef_32_ty = T.memref(32, T.i32())
+ memRef_64_ty = T.memref(64, T.i32())
+ memRef_640_ty = T.memref(640, T.i32())
+
+ # Tile declarations
+ ComputeTile = tile(0, 2)
+ ShimTile = tile(0, 0)
+
+ # Data movement with object FIFOs
+ # Input (from shim tile to compute tile)
+ of_in0 = object_fifo("in0", ShimTile, ComputeTile, 2, memRef_8_ty)
+
+ # Output (from compute tile to shim tile)
+ of_out0 = object_fifo("out0", ComputeTile, ShimTile, 2, memRef_8_ty)
+
+ # Compute tile body
+ @core(ComputeTile)
+ def core_body():
+ for _ in for_(8):
+ # Acquire input and output object FIFO objects
+ elem_in = of_in0.acquire(ObjectFifoPort.Consume, 1)
+ elem_out = of_out0.acquire(ObjectFifoPort.Produce, 1)
+
+ # Core functionality - load, add 1, store
+ for i in for_(8):
+ v0 = memref.load(elem_in, [i])
+ v1 = arith.addi(v0, arith.constant(1, T.i32()))
+ memref.store(v1, elem_out, [i])
+ yield_([])
+
+ # Release input and output object FIFO objects
+ of_in0.release(ObjectFifoPort.Consume, 1)
+ of_out0.release(ObjectFifoPort.Produce, 1)
+ yield_([])
+
+ # To/from AIE-array data movement
+ @FuncOp.from_py_func(memRef_64_ty, memRef_64_ty, memRef_64_ty)
+ def sequence(inTensor, unused, outTensor):
+ npu_dma_memcpy_nd(
+ metadata="out0", bd_id=0, mem=outTensor, sizes=[1, 1, 1, 64]
+ )
+ npu_dma_memcpy_nd(
+ metadata="in0", bd_id=1, mem=inTensor, sizes=[1, 1, 1, 64]
+ )
+ npu_sync(column=0, row=0, direction=0, channel=0)
+
+
+# Declares that subsequent code is in mlir-aie context
+with mlir_mod_ctx() as ctx:
+ my_first_aie_program() # Call design function within the mlir-aie context
+ res = ctx.module.operation.verify() # Verify mlir context
+ if res == True:
+ print(ctx.module) # Print the python-to-mlir conversion
+ else:
+ print(res)
diff --git a/programming_guide/section-4/section-4a/answers/aie2.py b/programming_guide/section-4/section-4a/answers/aie2.py
new file mode 100644
index 0000000000..595e0c11d2
--- /dev/null
+++ b/programming_guide/section-4/section-4a/answers/aie2.py
@@ -0,0 +1,79 @@
+#
+# This file is licensed under the Apache License v2.0 with LLVM Exceptions.
+# See https://llvm.org/LICENSE.txt for license information.
+# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+#
+# (c) Copyright 2023 AMD Inc.
+
+from aie.dialects.aie import * # primary mlir-aie dialect definitions
+from aie.extras.context import mlir_mod_ctx # mlir-aie context
+
+from aie.dialects.aiex import * # extended mlir-aie dialect definitions
+from aie.dialects.scf import *  # scf (structured control flow) dialect
+from aie.extras.dialects.ext import memref, arith  # memref and arithmetic dialects
+
+
+# AI Engine structural design function
+def my_first_aie_program():
+
+    # Device declaration - aie2 device NPU
+ @device(AIEDevice.npu)
+ def device_body():
+ # Memref types
+ memRef_8_ty = T.memref(8, T.i32())
+ memRef_16_ty = T.memref(16, T.i32())
+ memRef_32_ty = T.memref(32, T.i32())
+ memRef_64_ty = T.memref(64, T.i32())
+ memRef_640_ty = T.memref(640, T.i32())
+
+ # Tile declarations
+ ComputeTile = tile(0, 2)
+ ShimTile = tile(0, 0)
+
+ # Data movement with object FIFOs
+ # Input (from shim tile to compute tile)
+ of_in0 = object_fifo("in0", ShimTile, ComputeTile, 2, memRef_8_ty)
+
+ # Output (from compute tile to shim tile)
+ of_out0 = object_fifo("out0", ComputeTile, ShimTile, 2, memRef_8_ty)
+
+ # Compute tile body
+ @core(ComputeTile)
+ def core_body():
+ for _ in for_(80):
+ # Acquire input and output object FIFO objects
+ elem_in = of_in0.acquire(ObjectFifoPort.Consume, 1)
+ elem_out = of_out0.acquire(ObjectFifoPort.Produce, 1)
+
+ # Core functionality - load, add 1, store
+ for i in for_(8):
+ v0 = memref.load(elem_in, [i])
+ v1 = arith.addi(v0, arith.constant(1, T.i32()))
+ memref.store(v1, elem_out, [i])
+ yield_([])
+
+ # Release input and output object FIFO objects
+ of_in0.release(ObjectFifoPort.Consume, 1)
+ of_out0.release(ObjectFifoPort.Produce, 1)
+ yield_([])
+
+ # To/from AIE-array data movement
+ @FuncOp.from_py_func(memRef_640_ty, memRef_64_ty, memRef_640_ty)
+ def sequence(inTensor, unused, outTensor):
+ npu_dma_memcpy_nd(
+ metadata="out0", bd_id=0, mem=outTensor, sizes=[1, 1, 1, 640]
+ )
+ npu_dma_memcpy_nd(
+ metadata="in0", bd_id=1, mem=inTensor, sizes=[1, 1, 1, 640]
+ )
+ npu_sync(column=0, row=0, direction=0, channel=0)
+
+
+# Declares that subsequent code is in mlir-aie context
+with mlir_mod_ctx() as ctx:
+ my_first_aie_program() # Call design function within the mlir-aie context
+ res = ctx.module.operation.verify() # Verify mlir context
+ if res == True:
+ print(ctx.module) # Print the python-to-mlir conversion
+ else:
+ print(res)
diff --git a/programming_guide/section-4/section-4a/answers/test.cpp b/programming_guide/section-4/section-4a/answers/test.cpp
new file mode 100644
index 0000000000..d154a97425
--- /dev/null
+++ b/programming_guide/section-4/section-4a/answers/test.cpp
@@ -0,0 +1,256 @@
+//===- test.cpp -------------------------------------------------*- C++ -*-===//
+//
+// This file is licensed under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+// Copyright (C) 2023, Advanced Micro Devices, Inc.
+//
+//===----------------------------------------------------------------------===//
+
+#include <boost/program_options.hpp>
+#include <cstdint>
+#include <fstream>
+#include <iostream>
+#include <sstream>
+#include <string>
+#include <vector>
+
+#include "xrt/xrt_bo.h"
+#include "xrt/xrt_device.h"
+#include "xrt/xrt_kernel.h"
+
+#include "test_utils.h"
+
+#ifndef DATATYPES_USING_DEFINED
+#define DATATYPES_USING_DEFINED
+// ------------------------------------------------------
+// Configure this to match your buffer data type
+// ------------------------------------------------------
+using INOUT0_DATATYPE = std::uint32_t;
+using INOUT1_DATATYPE = std::uint32_t;
+using INOUT2_DATATYPE = std::uint32_t;
+#endif
+
+namespace po = boost::program_options;
+
+// ----------------------------------------------------------------------------
+// Verify results (specific to our design example)
+// ----------------------------------------------------------------------------
+template <typename T>
+int verify(int CSize, std::vector<T> C, int verbosity) {
+ int errors = 0;
+ for (uint32_t i = 0; i < CSize; i++) {
+ uint32_t ref = i + 2;
+ if (C[i] != ref) {
+ std::cout << "Error in output " << C[i] << " != " << ref << std::endl;
+ errors++;
+ } else {
+ if (verbosity > 1)
+ std::cout << "Correct output " << C[i] << " == " << ref << std::endl;
+ }
+ }
+ return errors;
+}
+
+// ----------------------------------------------------------------------------
+// Main
+// ----------------------------------------------------------------------------
+int main(int argc, const char *argv[]) {
+
+ // ------------------------------------------------------
+ // Parse program arguments
+ // ------------------------------------------------------
+ po::options_description desc("Allowed options");
+ po::variables_map vm;
+ test_utils::add_default_options(desc);
+
+ test_utils::parse_options(argc, argv, desc, vm);
+ int verbosity = vm["verbosity"].as<int>();
+ int do_verify = vm["verify"].as<bool>();
+ int n_iterations = vm["iters"].as<int>();
+ int n_warmup_iterations = vm["warmup"].as<int>();
+ int trace_size = vm["trace_sz"].as<int>();
+
+ // ------------------------------------------------------
+ // Configure this to match your design's buffer size
+ // ------------------------------------------------------
+ int INOUT0_VOLUME = 640; // Input only, 640x uint32_t in this example
+ int INOUT1_VOLUME = 640; // Not used in this example
+ int INOUT2_VOLUME = 640; // Output only, 640x uint32_t in this example
+
+ size_t INOUT0_SIZE = INOUT0_VOLUME * sizeof(INOUT0_DATATYPE);
+ size_t INOUT1_SIZE = INOUT1_VOLUME * sizeof(INOUT1_DATATYPE);
+ size_t INOUT2_SIZE = INOUT2_VOLUME * sizeof(INOUT2_DATATYPE);
+
+ // TODO Remove trace for now?
+ size_t OUT_SIZE = INOUT2_SIZE + trace_size;
+
+ srand(time(NULL));
+
+ // Load instruction sequence
+ std::vector<uint32_t> instr_v =
+ test_utils::load_instr_sequence(vm["instr"].as<std::string>());
+ if (verbosity >= 1)
+ std::cout << "Sequence instr count: " << instr_v.size() << "\n";
+
+ // ------------------------------------------------------
+ // Get device, load the xclbin & kernel and register them
+ // ------------------------------------------------------
+ xrt::device device;
+ xrt::kernel kernel;
+
+ test_utils::init_xrt_load_kernel(device, kernel, verbosity,
+ vm["xclbin"].as(),
+ vm["kernel"].as());
+
+ // ------------------------------------------------------
+ // Initialize input/ output buffer sizes and sync them
+ // ------------------------------------------------------
+ auto bo_instr = xrt::bo(device, instr_v.size() * sizeof(int),
+ XCL_BO_FLAGS_CACHEABLE, kernel.group_id(0));
+ auto bo_inout0 =
+ xrt::bo(device, INOUT0_SIZE, XRT_BO_FLAGS_HOST_ONLY, kernel.group_id(2));
+ auto bo_inout1 =
+ xrt::bo(device, INOUT1_SIZE, XRT_BO_FLAGS_HOST_ONLY, kernel.group_id(3));
+ // Assumes trace will only be added to inout2
+ auto bo_inout2 =
+ xrt::bo(device, OUT_SIZE, XRT_BO_FLAGS_HOST_ONLY, kernel.group_id(4));
+
+ if (verbosity >= 1)
+ std::cout << "Writing data into buffer objects.\n";
+
+ // Initialize instruction buffer
+ void *bufInstr = bo_instr.map<void *>();
+ memcpy(bufInstr, instr_v.data(), instr_v.size() * sizeof(int));
+
+ // Initialize Inout buffer 0
+ INOUT0_DATATYPE *bufInOut0 = bo_inout0.map<INOUT0_DATATYPE *>();
+ std::vector<INOUT0_DATATYPE> AVec(INOUT0_VOLUME);
+ for (int i = 0; i < INOUT0_VOLUME; i++)
+ AVec[i] = i + 1;
+ // AVec.push_back(i + 1);
+ memcpy(bufInOut0, AVec.data(), (AVec.size() * sizeof(INOUT0_DATATYPE)));
+
+ // Initialize Inout buffer 1
+ // INOUT1_DATATYPE *bufInOut1 = bo_inout1.map<INOUT1_DATATYPE *>();
+ // std::vector<INOUT1_DATATYPE> BVec(INOUT1_VOLUME);
+ // for (int i = 0; i < INOUT1_VOLUME; i++)
+ // BVec[i] = i + 1
+ // //BVec.push_back(i + 1);
+ // memcpy(bufInOut1, BVec.data(), (BVec.size() * sizeof(INOUT1_DATATYPE)));
+
+ // Initialize Inout buffer 2
+ char *bufInOut2 = bo_inout2.map<char *>();
+ std::vector<INOUT2_DATATYPE> CVec(INOUT2_VOLUME);
+ memset(bufInOut2, 0, OUT_SIZE); // Zeroes out INOUT2_VOLUME + trace_size
+
+ // Sync buffers to update input buffer values
+ bo_instr.sync(XCL_BO_SYNC_BO_TO_DEVICE);
+ bo_inout0.sync(XCL_BO_SYNC_BO_TO_DEVICE);
+ // bo_inout1.sync(XCL_BO_SYNC_BO_TO_DEVICE);
+ bo_inout2.sync(XCL_BO_SYNC_BO_TO_DEVICE);
+
+ // ------------------------------------------------------
+ // Initialize run configs
+ // ------------------------------------------------------
+ unsigned num_iter = n_iterations + n_warmup_iterations;
+ float npu_time_total = 0;
+ float npu_time_min = 9999999;
+ float npu_time_max = 0;
+
+ int errors = 0;
+
+ // ------------------------------------------------------
+ // Main run loop
+ // ------------------------------------------------------
+ for (unsigned iter = 0; iter < num_iter; iter++) {
+
+ // Run kernel
+ if (verbosity >= 1)
+ std::cout << "Running Kernel.\n";
+ auto start = std::chrono::high_resolution_clock::now();
+ auto run =
+ kernel(bo_instr, instr_v.size(), bo_inout0, bo_inout1, bo_inout2);
+ run.wait();
+ auto stop = std::chrono::high_resolution_clock::now();
+ bo_inout2.sync(XCL_BO_SYNC_BO_FROM_DEVICE);
+
+ if (iter < n_warmup_iterations) {
+ /* Warmup iterations do not count towards average runtime. */
+ continue;
+ }
+
+ // Copy output results and verify they are correct
+ memcpy(CVec.data(), bufInOut2, (CVec.size() * sizeof(INOUT2_DATATYPE)));
+ if (do_verify) {
+ if (verbosity >= 1) {
+ std::cout << "Verifying results ..." << std::endl;
+ }
+ auto vstart = std::chrono::system_clock::now();
+ errors = verify(INOUT2_VOLUME, CVec, verbosity);
+ auto vstop = std::chrono::system_clock::now();
+ float vtime =
+ std::chrono::duration_cast<std::chrono::seconds>(vstop - vstart)
+ .count();
+ if (verbosity >= 1) {
+ std::cout << "Verify time: " << vtime << "secs." << std::endl;
+ }
+ } else {
+ if (verbosity >= 1)
+ std::cout << "WARNING: results not verified." << std::endl;
+ }
+
+ // Write trace values if trace_size > 0
+ if (trace_size > 0) {
+ // test_utils::write_out_trace(((char *)bufInOut2) + INOUT2_SIZE,
+ // trace_size,
+ test_utils::write_out_trace(((char *)bufInOut2), trace_size,
+ vm["trace_file"].as());
+ }
+
+ // Accumulate run times
+ float npu_time =
+ std::chrono::duration_cast<std::chrono::microseconds>(stop - start)
+ .count();
+
+ npu_time_total += npu_time;
+ npu_time_min = (npu_time < npu_time_min) ? npu_time : npu_time_min;
+ npu_time_max = (npu_time > npu_time_max) ? npu_time : npu_time_max;
+ }
+
+ // ------------------------------------------------------
+ // Print verification and timing results
+ // ------------------------------------------------------
+
+ // TODO - Mac count to guide gflops
+ float macs = 0;
+
+ std::cout << std::endl
+ << "Avg NPU time: " << npu_time_total / n_iterations << "us."
+ << std::endl;
+ if (macs > 0)
+ std::cout << "Avg NPU gflops: "
+ << macs / (1000 * npu_time_total / n_iterations) << std::endl;
+
+ std::cout << std::endl
+ << "Min NPU time: " << npu_time_min << "us." << std::endl;
+ if (macs > 0)
+ std::cout << "Max NPU gflops: " << macs / (1000 * npu_time_min)
+ << std::endl;
+
+ std::cout << std::endl
+ << "Max NPU time: " << npu_time_max << "us." << std::endl;
+ if (macs > 0)
+ std::cout << "Min NPU gflops: " << macs / (1000 * npu_time_max)
+ << std::endl;
+
+ if (!errors) {
+ std::cout << "\nPASS!\n\n";
+ return 0;
+ } else {
+ std::cout << "\nError count: " << errors << "\n\n";
+ std::cout << "\nFailed.\n\n";
+ return 1;
+ }
+}
diff --git a/programming_guide/section-4/section-4a/test.cpp b/programming_guide/section-4/section-4a/test.cpp
new file mode 100644
index 0000000000..2ec8a0d1c3
--- /dev/null
+++ b/programming_guide/section-4/section-4a/test.cpp
@@ -0,0 +1,256 @@
+//===- test.cpp -------------------------------------------------*- C++ -*-===//
+//
+// This file is licensed under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+// Copyright (C) 2023, Advanced Micro Devices, Inc.
+//
+//===----------------------------------------------------------------------===//
+
+#include <boost/program_options.hpp>
+#include <cstdint>
+#include <fstream>
+#include <iostream>
+#include <sstream>
+#include <string>
+#include <vector>
+
+#include "xrt/xrt_bo.h"
+#include "xrt/xrt_device.h"
+#include "xrt/xrt_kernel.h"
+
+#include "test_utils.h"
+
+#ifndef DATATYPES_USING_DEFINED
+#define DATATYPES_USING_DEFINED
+// ------------------------------------------------------
+// Configure this to match your buffer data type
+// ------------------------------------------------------
+using INOUT0_DATATYPE = std::uint32_t;
+using INOUT1_DATATYPE = std::uint32_t;
+using INOUT2_DATATYPE = std::uint32_t;
+#endif
+
+namespace po = boost::program_options;
+
+// ----------------------------------------------------------------------------
+// Verify results (specific to our design example)
+// ----------------------------------------------------------------------------
+template <typename T>
+int verify(int CSize, std::vector<T> C, int verbosity) {
+ int errors = 0;
+ for (uint32_t i = 0; i < CSize; i++) {
+ uint32_t ref = i + 2;
+ if (C[i] != ref) {
+ std::cout << "Error in output " << C[i] << " != " << ref << std::endl;
+ errors++;
+ } else {
+ if (verbosity > 1)
+ std::cout << "Correct output " << C[i] << " == " << ref << std::endl;
+ }
+ }
+ return errors;
+}
+
+// ----------------------------------------------------------------------------
+// Main
+// ----------------------------------------------------------------------------
+int main(int argc, const char *argv[]) {
+
+ // ------------------------------------------------------
+ // Parse program arguments
+ // ------------------------------------------------------
+ po::options_description desc("Allowed options");
+ po::variables_map vm;
+ test_utils::add_default_options(desc);
+
+ test_utils::parse_options(argc, argv, desc, vm);
+ int verbosity = vm["verbosity"].as<int>();
+ int do_verify = vm["verify"].as<bool>();
+ int n_iterations = vm["iters"].as<int>();
+ int n_warmup_iterations = vm["warmup"].as<int>();
+ int trace_size = vm["trace_sz"].as<int>();
+
+ // ------------------------------------------------------
+ // Configure this to match your design's buffer size
+ // ------------------------------------------------------
+ int INOUT0_VOLUME = 64; // Input only, 64x uint32_t in this example
+ int INOUT1_VOLUME = 64; // Not used in this example
+ int INOUT2_VOLUME = 64; // Output only, 64x uint32_t in this example
+
+ size_t INOUT0_SIZE = INOUT0_VOLUME * sizeof(INOUT0_DATATYPE);
+ size_t INOUT1_SIZE = INOUT1_VOLUME * sizeof(INOUT1_DATATYPE);
+ size_t INOUT2_SIZE = INOUT2_VOLUME * sizeof(INOUT2_DATATYPE);
+
+ // TODO Remove trace for now?
+ size_t OUT_SIZE = INOUT2_SIZE + trace_size;
+
+ srand(time(NULL));
+
+ // Load instruction sequence
+ std::vector<uint32_t> instr_v =
+ test_utils::load_instr_sequence(vm["instr"].as<std::string>());
+ if (verbosity >= 1)
+ std::cout << "Sequence instr count: " << instr_v.size() << "\n";
+
+ // ------------------------------------------------------
+ // Get device, load the xclbin & kernel and register them
+ // ------------------------------------------------------
+ xrt::device device;
+ xrt::kernel kernel;
+
+ test_utils::init_xrt_load_kernel(device, kernel, verbosity,
+ vm["xclbin"].as(),
+ vm["kernel"].as());
+
+ // ------------------------------------------------------
+ // Initialize input/ output buffer sizes and sync them
+ // ------------------------------------------------------
+ auto bo_instr = xrt::bo(device, instr_v.size() * sizeof(int),
+ XCL_BO_FLAGS_CACHEABLE, kernel.group_id(0));
+ auto bo_inout0 =
+ xrt::bo(device, INOUT0_SIZE, XRT_BO_FLAGS_HOST_ONLY, kernel.group_id(2));
+ auto bo_inout1 =
+ xrt::bo(device, INOUT1_SIZE, XRT_BO_FLAGS_HOST_ONLY, kernel.group_id(3));
+ // Assumes trace will only be added to inout2
+ auto bo_inout2 =
+ xrt::bo(device, OUT_SIZE, XRT_BO_FLAGS_HOST_ONLY, kernel.group_id(4));
+
+ if (verbosity >= 1)
+ std::cout << "Writing data into buffer objects.\n";
+
+ // Initialize instruction buffer
+ void *bufInstr = bo_instr.map<void *>();
+ memcpy(bufInstr, instr_v.data(), instr_v.size() * sizeof(int));
+
+ // Initialize Inout buffer 0
+ INOUT0_DATATYPE *bufInOut0 = bo_inout0.map<INOUT0_DATATYPE *>();
+ std::vector<INOUT0_DATATYPE> AVec(INOUT0_VOLUME);
+ for (int i = 0; i < INOUT0_VOLUME; i++)
+ AVec[i] = i + 1;
+ // AVec.push_back(i + 1);
+ memcpy(bufInOut0, AVec.data(), (AVec.size() * sizeof(INOUT0_DATATYPE)));
+
+ // Initialize Inout buffer 1
+ // INOUT1_DATATYPE *bufInOut1 = bo_inout1.map<INOUT1_DATATYPE *>();
+ // std::vector<INOUT1_DATATYPE> BVec(INOUT1_VOLUME);
+ // for (int i = 0; i < INOUT1_VOLUME; i++)
+ // BVec[i] = i + 1
+ // //BVec.push_back(i + 1);
+ // memcpy(bufInOut1, BVec.data(), (BVec.size() * sizeof(INOUT1_DATATYPE)));
+
+ // Initialize Inout buffer 2
+ char *bufInOut2 = bo_inout2.map<char *>();
+ std::vector<INOUT2_DATATYPE> CVec(INOUT2_VOLUME);
+ memset(bufInOut2, 0, OUT_SIZE); // Zeroes out INOUT2_VOLUME + trace_size
+
+ // Sync buffers to update input buffer values
+ bo_instr.sync(XCL_BO_SYNC_BO_TO_DEVICE);
+ bo_inout0.sync(XCL_BO_SYNC_BO_TO_DEVICE);
+ // bo_inout1.sync(XCL_BO_SYNC_BO_TO_DEVICE);
+ bo_inout2.sync(XCL_BO_SYNC_BO_TO_DEVICE);
+
+ // ------------------------------------------------------
+ // Initialize run configs
+ // ------------------------------------------------------
+ unsigned num_iter = n_iterations + n_warmup_iterations;
+ float npu_time_total = 0;
+ float npu_time_min = 9999999;
+ float npu_time_max = 0;
+
+ int errors = 0;
+
+ // ------------------------------------------------------
+ // Main run loop
+ // ------------------------------------------------------
+ for (unsigned iter = 0; iter < num_iter; iter++) {
+
+ // Run kernel
+ if (verbosity >= 1)
+ std::cout << "Running Kernel.\n";
+ auto start = std::chrono::high_resolution_clock::now();
+ auto run =
+ kernel(bo_instr, instr_v.size(), bo_inout0, bo_inout1, bo_inout2);
+ run.wait();
+ auto stop = std::chrono::high_resolution_clock::now();
+ bo_inout2.sync(XCL_BO_SYNC_BO_FROM_DEVICE);
+
+ if (iter < n_warmup_iterations) {
+ /* Warmup iterations do not count towards average runtime. */
+ continue;
+ }
+
+ // Copy output results and verify they are correct
+ memcpy(CVec.data(), bufInOut2, (CVec.size() * sizeof(INOUT2_DATATYPE)));
+ if (do_verify) {
+ if (verbosity >= 1) {
+ std::cout << "Verifying results ..." << std::endl;
+ }
+ auto vstart = std::chrono::system_clock::now();
+ errors = verify(INOUT2_VOLUME, CVec, verbosity);
+ auto vstop = std::chrono::system_clock::now();
+ float vtime =
+ std::chrono::duration_cast<std::chrono::seconds>(vstop - vstart)
+ .count();
+ if (verbosity >= 1) {
+ std::cout << "Verify time: " << vtime << "secs." << std::endl;
+ }
+ } else {
+ if (verbosity >= 1)
+ std::cout << "WARNING: results not verified." << std::endl;
+ }
+
+ // Write trace values if trace_size > 0
+ if (trace_size > 0) {
+ // test_utils::write_out_trace(((char *)bufInOut2) + INOUT2_SIZE,
+ // trace_size,
+ test_utils::write_out_trace(((char *)bufInOut2), trace_size,
+ vm["trace_file"].as());
+ }
+
+ // Accumulate run times
+ float npu_time =
+ std::chrono::duration_cast<std::chrono::microseconds>(stop - start)
+ .count();
+
+ npu_time_total += npu_time;
+ npu_time_min = (npu_time < npu_time_min) ? npu_time : npu_time_min;
+ npu_time_max = (npu_time > npu_time_max) ? npu_time : npu_time_max;
+ }
+
+ // ------------------------------------------------------
+ // Print verification and timing results
+ // ------------------------------------------------------
+
+ // TODO - Mac count to guide gflops
+ float macs = 0;
+
+ std::cout << std::endl
+ << "Avg NPU time: " << npu_time_total / n_iterations << "us."
+ << std::endl;
+ if (macs > 0)
+ std::cout << "Avg NPU gflops: "
+ << macs / (1000 * npu_time_total / n_iterations) << std::endl;
+
+ std::cout << std::endl
+ << "Min NPU time: " << npu_time_min << "us." << std::endl;
+ if (macs > 0)
+ std::cout << "Max NPU gflops: " << macs / (1000 * npu_time_min)
+ << std::endl;
+
+ std::cout << std::endl
+ << "Max NPU time: " << npu_time_max << "us." << std::endl;
+ if (macs > 0)
+ std::cout << "Min NPU gflops: " << macs / (1000 * npu_time_max)
+ << std::endl;
+
+ if (!errors) {
+ std::cout << "\nPASS!\n\n";
+ return 0;
+ } else {
+ std::cout << "\nError count: " << errors << "\n\n";
+ std::cout << "\nFailed.\n\n";
+ return 1;
+ }
+}
diff --git a/programming_guide/section-4/section-4a/test.py b/programming_guide/section-4/section-4a/test.py
new file mode 100644
index 0000000000..0e82d741cb
--- /dev/null
+++ b/programming_guide/section-4/section-4a/test.py
@@ -0,0 +1,132 @@
+# test.py -*- Python -*-
+#
+# Copyright (C) 2024, Advanced Micro Devices, Inc. All rights reserved.
+# SPDX-License-Identifier: MIT
+
+import numpy as np
+import pyxrt as xrt
+import sys
+import time
+
+import aie.utils.test as test_utils
+
+# ------------------------------------------------------
+# Configure this to match your design's buffer size
+# ------------------------------------------------------
+INOUT0_VOLUME = 64 # Input only, 64x uint32_t in this example
+INOUT1_VOLUME = 64 # Not used in this example
+INOUT2_VOLUME = 64 # Output only, 64x uint32_t in this example
+
+INOUT0_DATATYPE = np.uint32
+INOUT1_DATATYPE = np.uint32
+INOUT2_DATATYPE = np.uint32
+
+INOUT0_SIZE = INOUT0_VOLUME * INOUT0_DATATYPE().itemsize
+INOUT1_SIZE = INOUT1_VOLUME * INOUT1_DATATYPE().itemsize
+INOUT2_SIZE = INOUT2_VOLUME * INOUT2_DATATYPE().itemsize
+
+
+def main(opts):
+
+ # Load instruction sequence
+ with open(opts.instr, "r") as f:
+ instr_text = f.read().split("\n")
+ instr_text = [l for l in instr_text if l != ""]
+ instr_v = np.array([int(i, 16) for i in instr_text], dtype=np.uint32)
+
+ OUT_SIZE = INOUT2_SIZE
+
+ # ------------------------------------------------------
+ # Get device, load the xclbin & kernel and register them
+ # ------------------------------------------------------
+ (device, kernel) = test_utils.init_xrt_load_kernel(opts)
+
+ # ------------------------------------------------------
+ # Initialize input/ output buffer sizes and sync them
+ # ------------------------------------------------------
+ bo_instr = xrt.bo(device, len(instr_v) * 4, xrt.bo.cacheable, kernel.group_id(0))
+ bo_inout0 = xrt.bo(device, INOUT0_SIZE, xrt.bo.host_only, kernel.group_id(2))
+ bo_inout1 = xrt.bo(device, INOUT1_SIZE, xrt.bo.host_only, kernel.group_id(3))
+ # bo_inout2 = xrt.bo(device, INOUT2_SIZE, xrt.bo.host_only, kernel.group_id(4))
+ bo_inout2 = xrt.bo(device, OUT_SIZE, xrt.bo.host_only, kernel.group_id(4))
+
+ # Initialize instruction buffer
+ bo_instr.write(instr_v, 0)
+
+ # Initialize data buffers
+ inout0 = np.arange(1, INOUT0_VOLUME + 1, dtype=INOUT0_DATATYPE)
+ inout1 = np.zeros(INOUT1_VOLUME, dtype=INOUT1_DATATYPE)
+ inout2 = np.zeros(INOUT2_VOLUME, dtype=INOUT2_DATATYPE)
+ bo_inout0.write(inout0, 0)
+ bo_inout1.write(inout1, 0)
+ bo_inout2.write(inout2, 0)
+
+ # Sync buffers to update input buffer values
+ bo_instr.sync(xrt.xclBOSyncDirection.XCL_BO_SYNC_BO_TO_DEVICE)
+ bo_inout0.sync(xrt.xclBOSyncDirection.XCL_BO_SYNC_BO_TO_DEVICE)
+ bo_inout1.sync(xrt.xclBOSyncDirection.XCL_BO_SYNC_BO_TO_DEVICE)
+ bo_inout2.sync(xrt.xclBOSyncDirection.XCL_BO_SYNC_BO_TO_DEVICE)
+
+ # ------------------------------------------------------
+ # Initialize run configs
+ # ------------------------------------------------------
+ num_iter = opts.iters + opts.warmup_iters
+ npu_time_total = 0
+ npu_time_min = 9999999
+ npu_time_max = 0
+ errors = 0
+
+ # ------------------------------------------------------
+ # Main run loop
+ # ------------------------------------------------------
+ for i in range(num_iter):
+ # Run kernel
+ if opts.verbosity >= 1:
+ print("Running Kernel.")
+ start = time.time_ns()
+ h = kernel(bo_instr, len(instr_v), bo_inout0, bo_inout1, bo_inout2)
+ h.wait()
+ stop = time.time_ns()
+ bo_inout2.sync(xrt.xclBOSyncDirection.XCL_BO_SYNC_BO_FROM_DEVICE)
+
+ # Warmup iterations do not count towards average runtime.
+ if i < opts.warmup_iters:
+ continue
+
+ # Copy output results and verify they are correct
+ out_size = INOUT2_SIZE
+ output_buffer = bo_inout2.read(out_size, 0).view(INOUT2_DATATYPE)
+ if opts.verify:
+ if opts.verbosity >= 1:
+ print("Verifying results ...")
+ ref = np.arange(2, INOUT0_VOLUME + 2, dtype=INOUT0_DATATYPE)
+ e = np.equal(output_buffer, ref)
+ errors = errors + np.size(e) - np.count_nonzero(e)
+
+ npu_time = stop - start
+ npu_time_total = npu_time_total + npu_time
+ npu_time_min = min(npu_time_min, npu_time)
+ npu_time_max = max(npu_time_max, npu_time)
+
+ # ------------------------------------------------------
+ # Print verification and timing results
+ # ------------------------------------------------------
+
+ # TODO - Mac count to guide gflops
+
+ print("\nAvg NPU time: {}us.".format(int((npu_time_total / opts.iters) / 1000)))
+ print("\nMin NPU time: {}us.".format(int((npu_time_min / opts.iters) / 1000)))
+ print("\nMax NPU time: {}us.".format(int((npu_time_max / opts.iters) / 1000)))
+
+ if not errors:
+ print("\nPASS!\n")
+ exit(0)
+ else:
+ print("\nError count: ", errors)
+ print("\nFailed.\n")
+ exit(-1)
+
+
+if __name__ == "__main__":
+ opts = test_utils.parse_args(sys.argv[1:])
+ main(opts)
diff --git a/programming_guide/section-4/section-4b/CMakeLists.txt b/programming_guide/section-4/section-4b/CMakeLists.txt
new file mode 100644
index 0000000000..78f4e10e9e
--- /dev/null
+++ b/programming_guide/section-4/section-4b/CMakeLists.txt
@@ -0,0 +1,70 @@
+# This file is licensed under the Apache License v2.0 with LLVM Exceptions.
+# See https://llvm.org/LICENSE.txt for license information.
+# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+#
+# (c) Copyright 2023 Advanced Micro Devices, Inc.
+
+# parameters
+# -DBOOST_ROOT: Path to Boost install
+# -DXRT_INC_DIR: Full path to src/runtime_src/core/include in XRT cloned repo
+# -DXRT_LIB_DIR: Path to xrt_coreutil.lib
+# -DTARGET_NAME: Target name to be built
+
+# cmake needs this line
+cmake_minimum_required(VERSION 3.1)
+
+find_program(WSL NAMES powershell.exe)
+
+if (NOT WSL)
+ set(BOOST_ROOT /usr/include/boost CACHE STRING "Path to Boost install")
+ set(XRT_INC_DIR /opt/xilinx/xrt/include CACHE STRING "Path to XRT cloned repo")
+ set(XRT_LIB_DIR /opt/xilinx/xrt/lib CACHE STRING "Path to xrt_coreutil.lib")
+else()
+ set(BOOST_ROOT C:/Technical/thirdParty/boost_1_83_0 CACHE STRING "Path to Boost install")
+ set(XRT_INC_DIR C:/Technical/XRT/src/runtime_src/core/include CACHE STRING "Path to XRT cloned repo")
+ set(XRT_LIB_DIR C:/Technical/xrtNPUfromDLL CACHE STRING "Path to xrt_coreutil.lib")
+endif()
+
+set(TARGET_NAME test CACHE STRING "Target to be built")
+
+SET (ProjectName ${TARGET_NAME})
+SET (currentTarget ${TARGET_NAME})
+
+if ( WSL )
+ set(CMAKE_RUNTIME_OUTPUT_DIRECTORY_RELEASE ${CMAKE_BINARY_DIR})
+endif ()
+
+project(${ProjectName})
+
+# Find packages
+find_package(Boost REQUIRED)
+
+add_executable(${currentTarget}
+ ${CMAKE_CURRENT_SOURCE_DIR}/../../../runtime_lib/test_lib/test_utils.cpp
+ test.cpp
+)
+
+target_compile_definitions(${currentTarget} PUBLIC DISABLE_ABI_CHECK=1)
+
+target_include_directories (${currentTarget} PUBLIC
+ ${XRT_INC_DIR}
+ ${Boost_INCLUDE_DIRS}
+ ${CMAKE_CURRENT_SOURCE_DIR}/../../../runtime_lib/test_lib
+)
+
+target_link_directories(${currentTarget} PUBLIC
+ ${XRT_LIB_DIR}
+ ${Boost_LIBRARY_DIRS}
+)
+
+if (NOT WSL)
+ target_link_libraries(${currentTarget} PUBLIC
+ xrt_coreutil
+ boost_program_options
+ boost_filesystem
+ )
+else()
+ target_link_libraries(${currentTarget} PUBLIC
+ xrt_coreutil
+ )
+endif()
\ No newline at end of file
diff --git a/programming_guide/section-4/section-4b/Makefile b/programming_guide/section-4/section-4b/Makefile
new file mode 100644
index 0000000000..09126e5289
--- /dev/null
+++ b/programming_guide/section-4/section-4b/Makefile
@@ -0,0 +1,56 @@
+##===- Makefile -----------------------------------------------------------===##
+#
+# This file licensed under the Apache License v2.0 with LLVM Exceptions.
+# See https://llvm.org/LICENSE.txt for license information.
+# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+#
+##===----------------------------------------------------------------------===##
+
+include ../../../programming_examples/makefile-common
+
+all: build/final.xclbin
+
+targetname = myFirstProgram
+
+trace_size = 8192
+
+build/aie.mlir: aie2.py
+ mkdir -p ${@D}
+ python3 $< > $@
+
+build/final.xclbin: build/aie.mlir
+ mkdir -p ${@D}
+ cd ${@D} && aiecc.py --aie-generate-cdo --aie-generate-npu --no-compile-host \
+ --xclbin-name=${@F} --npu-insts-name=insts.txt ${<F}
+
+${targetname}.exe: test.cpp
+ rm -rf _build
+ mkdir -p _build
+ cd _build && ${powershell} cmake .. -DTARGET_NAME=${targetname}
+ cd _build && ${powershell} cmake --build . --config Release
+ifeq ($(OS),Windows_NT)
+ cp _build/${targetname}.exe $@
+else
+ cp _build/${targetname} $@
+endif
+
+run: ${targetname}.exe build/final.xclbin build/insts.txt
+ ${powershell} ./$< -x build/final.xclbin -i build/insts.txt -k MLIR_AIE
+
+trace: ${targetname}.exe build/final.xclbin build/insts.txt
+ ${powershell} ./$< -x build/final.xclbin -i build/insts.txt -k MLIR_AIE -t ${trace_size}
+ ../../../programming_examples/utils/parse_trace.py --filename trace.txt --mlir build/aie.mlir --colshift 1 > trace_4b.json
+
+trace_py: build/final.xclbin build/insts.txt
+ ${powershell} python3 test.py -x build/final.xclbin -i build/insts.txt -k MLIR_AIE -t ${trace_size}
+ ../../../programming_examples/utils/parse_trace.py --filename trace.txt --mlir build/aie.mlir --colshift 1 > trace_4b.json
+
+clean_trace:
+ rm -rf tmpTrace trace.txt trace*json
+
+clean: clean_trace
+ rm -rf build _build ${targetname}.exe
diff --git a/programming_guide/section-4/section-4b/README.md b/programming_guide/section-4/section-4b/README.md
new file mode 100644
index 0000000000..e57029ae0c
--- /dev/null
+++ b/programming_guide/section-4/section-4b/README.md
@@ -0,0 +1,207 @@
+<!---//===- README.md --------------------------*- Markdown -*-===//
+//
+// This file is licensed under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+// Copyright (C) 2024, Advanced Micro Devices, Inc.
+//
+//===----------------------------------------------------------------------===//-->
+
+# Section 4b - Trace
+
+* [Section 4 - Vector Programming & Performance Measurement](../../section-4)
+ * [Section 4a - Timers](../section-4a)
+ * Section 4b - Trace
+ * [Section 4c - Kernel Vectorization and Optimization](../section-4c)
+
+-----
+
+In the previous [section-4a](../section-4a), we looked at how timers can be used to get an overview of application performance. However, for kernel programmers who want to optimize the AIE hardware to its fullest potential, it is important to see how efficiently the AIE cores and data movers are running. To that end, the AIEs are equipped with tracing hardware that provides a cycle-accurate view of hardware events. A more detailed specification of the AIE2 trace unit can be found in [AM020](https://docs.amd.com/r/en-US/am020-versal-aie-ml/Trace).
+
+Enabling trace support can be done with the following steps:
+
+## Steps to Enable Trace Support
+1. [Enable and configure AIE trace units](#1-enable-and-configure-aie-trace-units)
+1. [Configure host code to read trace data and write it to a text file](#2-configure-host-code-to-read-trace-data-and-write-it-to-a-text-file)
+1. [Parse text file to generate a waveform json file](#3-parse-text-file-to-generate-a-waveform-json-file)
+1. [Open json file in a visualization tool like Perfetto](#4-open-json-file-in-a-visualization-tool-like-perfetto)
+* [Additional Debug Hints](#additional-debug-hints)
+
+
+## 1. Enable and configure AIE trace units
+
+Enabling tracing means (1a) configuring the trace units for a given tile and then (1b) routing the generated event packets through the stream switches to the shim DMA, where we can write them to a buffer in DDR for post-runtime processing.
+
+### (1a) Configure trace units for an AIE tile
+The first necessary component for trace configuration is setting the right values in the trace control registers for each tile that we want to enable tracing for. In addition, the generated trace packets need to be routed to the shimDMA and then written to one of the 3 inout buffers. We have abstracted these two steps with the python wrapper function `configure_simple_tracing_aie2`, which is in [python/utils/test.py](../../../python/utils/test.py) and is described in more detail in the [README.md under python/utils](../../../python/utils). An example of how this function is used is shown below for quick reference:
+```python
+ trace_utils.configure_simple_tracing_aie2(
+ ComputeTile2,
+ ShimTile,
+ ddr_id=1,
+ size=traceSizeInBytes,
+ offset=tensorSize,
+ )
+```
+This block is defined within the sequence definition for `@FuncOp.from_py_func` where we define the shimDMA data movement to the 3 inout buffers.
+**Note** that this simplification works very well for the trace buffer from a single tile to the shimDMA. However, if we want to do something more complicated like allocating the trace buffer from multiple tiles into a single larger buffer, this function will not be able to express that. For that, please consult the [README.md under python/utils](../../../python/utils) for more guidance on how to customize the trace configuration.
+
+### (1b) Define trace event routes from tile to shimDMA
+Once the trace units and shimDMA are configured, we need to define how the trace packets are routed from compute tile to shim tile. This is done via circuit switched flows or packet switched flows as described below.
+
+#### Circuit switched flows
+An example of a simple circuit switch routing flow to route trace event packets from a compute tile to a shimDMA would be:
+
+```python
+flow(ComputeTile, WireBundle.Trace, 0, ShimTile, WireBundle.DMA, 1)
+```
+
+It is important to consider the path this routing takes and how many other streams might be using that same path, as this determines whether or not our design may experience stream routing congestion. While capturing trace events is non-intrusive (it does not affect the performance of the AIE cores), the routing of these trace packets is not, and it needs to be balanced in your design to prevent congestion.
+
+#### Packet switched flows
+The alternative to circuit switched routes is packet switched routes. The benefit of these is the ability to share a single stream switch routing channel between multiple routes. The drawback is the slight overhead of data packet headers, as well as needing to gauge how much congestion might be present on a shared route given the data movement requirements of the AIE array design. This means that if multiple flows share the same channel, any particular flow might experience backpressure while another flow is serviced. Depending on the performance requirements of the design, this may or may not have a performance impact.
+
+To support packet switched flows, we need to declare packet flows and attach both a `packet ID` and `packet type` to the packets. The `packet type` in particular is needed to distinguish packets coming from different tile types (tile core, tile memory, memtile, shimtile). The association between tile trace units and packet types is as follows:
+
+| Tile trace unit | packet type |
+|-----------------|-------------|
+| Tile core | 0 |
+| Tile memory | 1 |
+| ShimTile | 2 |
+| MemTile | 3 |
+
+**NOTE**: Quick reminder that most source flow channels from `WireBundle.Trace` will use channel 0, but the `Tile memory` actually uses channel 1.
+
+The `packet IDs`, on the other hand, can be anything you want as long as they are globally unique to distinguish routes from one another. An example is shown below for two tiles where both tile core and tile memory trace units are routed. Note the `packet ID` used after the `packetflow` keyword. Also note that we set `keep_pkt_hdr = true` as we would like to keep the packet headers when they are moved to DDR so we can distinguish the packets during post-run parsing.
+
+In IRON python bindings, we declare packet flows with the following syntax:
+
+`packetflow(packet ID, Source Tile, Source Port, Source Port Channel, Destination Tile, Destination Port, Destination Port Channel, Keep Packet Header boolean)`
+
+```python
+packetflow(1, ComputeTile2, WireBundle.Trace, 0, ShimTile, WireBundle.DMA, 1, keep_pkt_hdr=True) # core trace
+packetflow(2, ComputeTile2, WireBundle.Trace, 1, ShimTile, WireBundle.DMA, 1, keep_pkt_hdr=True) # core mem trace
+packetflow(3, ComputeTile3, WireBundle.Trace, 0, ShimTile, WireBundle.DMA, 1, keep_pkt_hdr=True) # core trace
+packetflow(4, ComputeTile3, WireBundle.Trace, 1, ShimTile, WireBundle.DMA, 1, keep_pkt_hdr=True) # core mem trace
+```
+* packet ID - The first argument that uniquely identifies each packet flow.
+
+Then we have 3 arguments for the source and 3 for the destination.
+* `Tile` - Previously defined tile
+* `Port` - Wire bundles for the port including `WireBundle.Trace`, `WireBundle.DMA`, `WireBundle.North`, etc.
+* `Channel` # - For a given port, we often use multiple channels, such as DMA channel 0 and DMA channel 1. As another example, in AIE2 the trace ports use channel 0 for the tile core and channel 1 for the tile memory.
+
+MLIR examples are similar and are included below for quick reference, but they are more fully defined in the [AIE Dialect online documentation](https://xilinx.github.io/mlir-aie/AIE.html):
+```mlir
+packetflow(1) {
+ aie.packet_source<%tile02, Trace : 0> // core trace
+ aie.packet_dest<%tile00, DMA : 1>
+} {keep_pkt_header = "true"}
+```
+
+## 2. Configure host code to read trace data and write it to a text file
+
+Once the trace units are configured and enabled, we want the host code to read the trace data from DDR and write it out to a text file for post-run processing. To give a better sense of how this comes together, this section provides example design source files and a Makefile whose kernel is based on the [Vector Scalar Add example](../../../programming_examples/basic/vector_scalar_add/).
+
+### AIE structural design code ([aie2.py](./aie2.py))
+In order to write the DDR data to a text file, we need to decide where we want the DDR data to first be stored and then read from that location, before writing to a text file. This starts inside the [aie2.py](./aie2.py) file where we use the `configure_simple_tracing_aie2` function call to configure the trace units and program the shimDMA to write to one of the 3 inout buffers. There are many ways to configure our structural design to write this data out but one pattern is the following: `inout0` is for input data, `inout1` is for output data, and `inout2` is for output trace data as illustrated below:
+
+| inout0 | inout1 | inout2 |
+|--------|--------|--------|
+| input A | output C | trace |
+
+Another common pattern is the case where we have two input buffers and one output buffer. Since we only have 3 inout buffers, we need to append trace data to the end of the output buffer.
+
+| inout0 | inout1 | inout2 |
+|--------|--------|--------|
+| input A | input B | (output C + trace) |
+
+As described in [python/utils](../../../python/utils) for `trace.py`, we configure the `ddr_id` argument of `configure_simple_tracing_aie2` to match the buffer the trace is written out to.
+| ddr ID value | buffer |
+|-----------|--------|
+| 0 | inout0 |
+| 1 | inout1 |
+| 2 | inout2 |
+
+An example of this is the Vector Scalar Multiply example ([aie2.py](../../../programming_examples/basic/vector_scalar_mul/aie2.py)), which uses the 2nd pattern above (input A, input B, output C + trace). In the vector scalar multiply case, A is used for the input vector and B for the scalar factor. Since we're sharing the trace data with the output buffer on `inout2`, we set `ddr_id=2`. In addition, we set the offset to the output data buffer size, since the trace data is appended after the data (`offset=N_in_bytes`). Our local design ([aie2.py](./aie2.py)) is a variation of the 2nd pattern where the second inout buffer is unused (input A, unused, output C + trace). `ddr_id=2` is still used since our output buffer is mapped to `inout2`, and our trace data offset is specified as `C_sz_in_bytes`.
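+
+For quick reference, here is a minimal sketch of that call for the local design, matching the `ddr_id` and `offset` choices just described (names as used in [aie2.py](./aie2.py)):
+```python
+    trace_utils.configure_simple_tracing_aie2(
+        ComputeTile,
+        ShimTile,
+        ddr_id=2,  # output C and trace share inout2
+        size=trace_size,  # trace buffer size in bytes
+        offset=C_sz_in_bytes,  # trace data starts after output C
+    )
+```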
+
+Once [aie2.py](./aie2.py) is configured to output trace data through one of the 3 inout buffers with matching `ddr_id` config and `offset`, we turn our attention to the host code to read the DDR data and write it to a file.
+
+**NOTE**: In our example design ([aie2.py](./aie2.py) and the associated [Makefile](./Makefile)), we provide a Makefile target `run` for the standard build and `trace` for the trace-enabled build. The trace-enabled build passes the trace buffer size as an argument to [aie2.py](./aie2.py), which conditionally enables the trace `flow` and calls `configure_simple_tracing_aie2` as long as `trace_size` is > 0. This is also true for the [Vector Scalar Multiply example](../../../programming_examples/basic/vector_scalar_mul).
+
+### (2a) C/C++ Host code ([test.cpp](./test.cpp))
+The main changes needed in [test.cpp](./test.cpp) are increasing the output buffer size to account for the trace buffer size, and taking care to read only the output buffer portion when verifying correctness of the results. We also need to pass the correct buffer offset, which points to the start of the trace data, when calling `write_out_trace`.
+
+You can see in [test.cpp](./test.cpp) that `trace_size` is set based on the input argument `-t $(trace_size)`, which is defined and passed in the [Makefile](./Makefile). The `trace` target from the [Makefile](./Makefile) is shown below.
+
+```Makefile
+trace: ${targetname}.exe build/final.xclbin build/insts.txt
+ ${powershell} ./$< -x build/final.xclbin -i build/insts.txt -k MLIR_AIE -t 8192
+ ../../../programming_examples/utils/parse_trace.py --filename trace.txt --mlir build/aie.mlir --colshift 1 > trace_4b.json
+```
+Following the invocation of the executable, we call the `parse_trace.py` python script which we will cover in more detail in step 3.
+Within the [test.cpp](./test.cpp), we redefine OUT_SIZE to be the sum of output buffer size (in bytes) and the trace buffer size.
+```c++
+ int OUT_SIZE = INOUT2_SIZE + trace_size;
+```
+All subsequent references to the output buffer size should use `OUT_SIZE`. The exception is when we verify the output results, which should be bounded by the original output buffer size, in this case `INOUT2_VOLUME`.
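+
+For example, the verification call stays bounded by the original output volume; a sketch matching [test.cpp](./test.cpp):
+```c++
+// Check only the real output data; the trace bytes appended after
+// INOUT2_SIZE are written out separately by write_out_trace.
+errors = verify(INOUT2_VOLUME, CVec, verbosity);
+```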
+
+Finally, the function to write the trace output to a file is `write_out_trace` (from `test_utils` in the C/C++ host code; the python equivalent is in `aie.utils.trace`), and we need to pass it the pointer to where the trace data begins in the output buffer, the trace buffer size, and the trace file name (default is `trace.txt`).
+```c++
+ test_utils::write_out_trace(
+ ((char *)bufInOut2) + INOUT2_SIZE,
+ trace_size, vm["trace_file"].as());
+```
+
+### (2b) Python Host code ([test.py](./test.py))
+In the [Makefile](./Makefile), we also have a `trace_py` target which calls the python host code `test.py`. Here in addition to the `-t ${trace_size}`, we also define the `-s ${data_size}` which is the data size (in uint32) for our Vector Scalar Multiply kernel.
+```Makefile
+trace_py: build/final_trace_${data_size}.xclbin build/insts_${data_size}.txt
+ ${powershell} python3 test.py -x build/final_trace_${data_size}.xclbin -i build/insts_${data_size}.txt -k MLIR_AIE -t ${trace_size} -s ${data_size}
+ ../../utils/parse_eventIR.py --filename trace.txt --mlir build/aie_trace_${data_size}.mlir --colshift 1 > parse_eventIR_vs.json
+```
+The python equivalent host code performs the same steps as the C/C++ host code as we redefine `OUT_SIZE` to include the `trace_size`.
+```python
+ OUT_SIZE = INOUT1_SIZE + int(opts.trace_size)
+```
+During verification, the `output_buffer` excludes the trace data and uses the `read` function as follows:
+```python
+ entire_buffer = bo_inout2.read(OUT_SIZE, 0).view(np.uint32)
+ output_buffer = entire_buffer[:INOUT2_VOLUME]
+```
+Finally, we read the `trace buffer` from the `entire_buffer` starting at the offset of `INOUT2_VOLUME`, and pass the trace buffer to the python equivalent of `write_out_trace`, which is defined in `aie.utils.trace`. **Note**: this version doesn't need the `trace_size` argument, as the python function recognizes when the array is empty.
+```python
+ if opts.trace_size > 0:
+ trace_buffer = entire_buffer[INOUT2_VOLUME:]
+ trace_utils.write_out_trace(trace_buffer, str(opts.trace_file))
+```
+
+## 3. Parse text file to generate a waveform json file
+Once the packet trace text file is generated (`trace.txt`), we use a python-based trace parser ([parse_trace.py](../../../programming_examples/utils/parse_trace.py)) to interpret the trace values and generate a waveform json file for visualization (with Perfetto).
+```Makefile
+ ../../../programming_examples/utils/parse_trace.py --filename trace.txt --mlir build/aie_trace.mlir --colshift 1 > trace_vs.json
+```
+This leverages the python parse scripts under [programming_examples/utils](../../../programming_examples/utils/). Follow [this link](../../../programming_examples/utils/) to get more details about how to use the python parse scripts and how they are coded.
+
+## 4. Open json file in a visualization tool like Perfetto
+Open https://ui.perfetto.dev in your browser and then open the waveform json file generated in step 3. You can navigate it as you would a standard waveform viewer, and zoom/pan the waveform with the a, s, w, d keyboard keys.
+
+## Additional Debug Hints
+* If you are getting 0's in your trace outputs, check these things:
+ * The buffer offset for the DMA is the right size (based on the output buffer size)
+ * The correct tile is being routed to the correct shim DMA. It's not uncommon in a multi-core design to route the wrong tile, especially if the tile symbols/names are very similar or confusing.
+ * Matching packet IDs for packet-routed flows. The packet flow ID must match the ID value configured in the Trace Control 1 register, or else the packets don't get routed (see the sketch below).
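+
+As a minimal sketch (tile names as in the earlier examples), the packet ID to check is the first argument of the packet flow declaration:
+```python
+packetflow(1, ComputeTile2, WireBundle.Trace, 0, ShimTile, WireBundle.DMA, 1, keep_pkt_hdr=True)
+```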
+
+## Exercises
+1. Let's give tracing a try. In this directory, we've been examining a design based on the `Vector Scalar Add` example. Run `make trace` to compile the design, generate a trace file, and run the `parse_trace.py` script on it to generate the `trace_4b.json` waveform file. Open this file in https://ui.perfetto.dev. Zoom into the region of interest with W and S (zoom in and out) and pan with A and D (left and right). You should see a waveform like the following:
+
+
+
+ Based on this waveform, you can mouse over each chunk of contiguous data for `PortRunning0` (input DMA port) and `PortRunning1` (output DMA port). What is the chunk size? How many input and output chunks are there? This should match the iteration loop bounds in our example design.
+
+1. **TODO** Additional questions about routing congestion for circuit switch and packet switch routes for trace packets?
+
+
+-----
+[[Prev]](../section-4a) [[Up]](../../section-4) [[Next]](../section-4c)
diff --git a/programming_guide/section-4/section-4b/aie2.py b/programming_guide/section-4/section-4b/aie2.py
new file mode 100644
index 0000000000..a629daa0ce
--- /dev/null
+++ b/programming_guide/section-4/section-4b/aie2.py
@@ -0,0 +1,99 @@
+#
+# This file is licensed under the Apache License v2.0 with LLVM Exceptions.
+# See https://llvm.org/LICENSE.txt for license information.
+# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+#
+# (c) Copyright 2023 AMD Inc.
+
+from aie.dialects.aie import * # primary mlir-aie dialect definitions
+from aie.extras.context import mlir_mod_ctx # mlir-aie context
+
+from aie.dialects.aiex import * # extended mlir-aie dialect definitions
+from aie.dialects.scf import * # scf (structured control flow) dialect
+from aie.extras.dialects.ext import memref, arith # memref and arithmetic dialects
+
+import aie.utils.trace as trace_utils
+
+
+# AI Engine structural design function
+def my_first_aie_program():
+
+ enableTrace = True
+ trace_size = 8192
+ C_sz_in_bytes = 64 * 4
+
+ # Device declaration - aie2 device NPU
+ @device(AIEDevice.npu)
+ def device_body():
+ # Memref types
+ memRef_8_ty = T.memref(8, T.i32())
+ memRef_16_ty = T.memref(16, T.i32())
+ memRef_32_ty = T.memref(32, T.i32())
+ memRef_64_ty = T.memref(64, T.i32())
+
+ # Tile declarations
+ ComputeTile = tile(0, 2)
+ ShimTile = tile(0, 0)
+
+ # Data movement with object FIFOs
+ # Input (from shim tile to compute tile)
+ of_in0 = object_fifo("in0", ShimTile, ComputeTile, 2, memRef_8_ty)
+
+ # Output (from compute tile to shim tile)
+ of_out0 = object_fifo("out0", ComputeTile, ShimTile, 2, memRef_8_ty)
+
+ # Compute tile body
+ @core(ComputeTile)
+ def core_body():
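+ # Effectively-infinite loop bound, presumably so the core keeps running
+ # while trace captures events (the finite bound is kept commented below).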
+ for _ in for_(0xFFFFFFFF):
+ # for _ in for_(8):
+ # Acquire input and output object FIFO objects
+ elem_in = of_in0.acquire(ObjectFifoPort.Consume, 1)
+ elem_out = of_out0.acquire(ObjectFifoPort.Produce, 1)
+
+ # Core functionality - load, add 1, store
+ for i in for_(8):
+ v0 = memref.load(elem_in, [i])
+ v1 = arith.addi(v0, arith.constant(1, T.i32()))
+ memref.store(v1, elem_out, [i])
+ yield_([])
+
+ # Release input and output object FIFO objects
+ of_in0.release(ObjectFifoPort.Consume, 1)
+ of_out0.release(ObjectFifoPort.Produce, 1)
+ yield_([])
+
+ # Set up a circuit-switched flow from core to shim for tracing information
+ if enableTrace:
+ flow(ComputeTile, WireBundle.Trace, 0, ShimTile, WireBundle.DMA, 1)
+
+ # To/from AIE-array data movement
+ @FuncOp.from_py_func(memRef_64_ty, memRef_64_ty, memRef_64_ty)
+ def sequence(inTensor, notUsed, outTensor):
+
+ if enableTrace:
+ trace_utils.configure_simple_tracing_aie2(
+ ComputeTile,
+ ShimTile,
+ ddr_id=2,
+ size=trace_size,
+ offset=C_sz_in_bytes,
+ )
+
+ npu_dma_memcpy_nd(
+ metadata="out0", bd_id=0, mem=outTensor, sizes=[1, 1, 1, 64]
+ )
+ npu_dma_memcpy_nd(
+ metadata="in0", bd_id=1, mem=inTensor, sizes=[1, 1, 1, 64]
+ )
+ npu_sync(column=0, row=0, direction=0, channel=0)
+
+
+# Declares that subsequent code is in mlir-aie context
+with mlir_mod_ctx() as ctx:
+ my_first_aie_program() # Call design function within the mlir-aie context
+ res = ctx.module.operation.verify() # Verify mlir context
+ if res == True:
+ print(ctx.module) # Print the python-to-mlir conversion
+ else:
+ print(res)
diff --git a/programming_guide/section-4/section-4b/test.cpp b/programming_guide/section-4/section-4b/test.cpp
new file mode 100644
index 0000000000..6f775e5b54
--- /dev/null
+++ b/programming_guide/section-4/section-4b/test.cpp
@@ -0,0 +1,266 @@
+//===- test.cpp -------------------------------------------------*- C++ -*-===//
+//
+// This file is licensed under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+// Copyright (C) 2023, Advanced Micro Devices, Inc.
+//
+//===----------------------------------------------------------------------===//
+
+#include <boost/program_options.hpp>
+#include <cstdint>
+#include <fstream>
+#include <iostream>
+#include <sstream>
+#include <string>
+#include <vector>
+
+#include "xrt/xrt_bo.h"
+#include "xrt/xrt_device.h"
+#include "xrt/xrt_kernel.h"
+
+#include "test_utils.h"
+
+#ifndef DATATYPES_USING_DEFINED
+#define DATATYPES_USING_DEFINED
+// ------------------------------------------------------
+// Configure this to match your buffer data type
+// ------------------------------------------------------
+using INOUT0_DATATYPE = std::uint32_t;
+using INOUT1_DATATYPE = std::uint32_t;
+using INOUT2_DATATYPE = std::uint32_t;
+#endif
+
+namespace po = boost::program_options;
+
+// ----------------------------------------------------------------------------
+// Verify results (specific to our design example)
+// ----------------------------------------------------------------------------
+template <typename T>
+int verify(int CSize, std::vector<T> C, int verbosity) {
+ int errors = 0;
+ for (uint32_t i = 0; i < CSize; i++) {
+ uint32_t ref = i + 2;
+ if (C[i] != ref) {
+ std::cout << "Error in output " << C[i] << " != " << ref << std::endl;
+ errors++;
+ } else {
+ if (verbosity > 1)
+ std::cout << "Correct output " << C[i] << " == " << ref << std::endl;
+ }
+ }
+ return errors;
+}
+
+// ----------------------------------------------------------------------------
+// Main
+// ----------------------------------------------------------------------------
+int main(int argc, const char *argv[]) {
+
+ // ------------------------------------------------------
+ // Parse program arguments
+ // ------------------------------------------------------
+ po::options_description desc("Allowed options");
+ po::variables_map vm;
+ test_utils::add_default_options(desc);
+
+ test_utils::parse_options(argc, argv, desc, vm);
+ int verbosity = vm["verbosity"].as<int>();
+ int do_verify = vm["verify"].as<bool>();
+ int n_iterations = vm["iters"].as<int>();
+ int n_warmup_iterations = vm["warmup"].as<int>();
+ int trace_size = vm["trace_sz"].as<int>();