Skip to content
This repository has been archived by the owner on Nov 5, 2024. It is now read-only.

[wip] replace pyxrt which does unexplainable things... #15

Open
wants to merge 4 commits into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
33 changes: 30 additions & 3 deletions CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -187,15 +187,42 @@ set_target_properties(

add_dependencies(xaie pyxrt)

# ##############################################################################
# own xrt bindings
# ##############################################################################

# Build the in-tree XRT Python bindings (_xrt) from xaiepy/xrt.cpp as a
# pybind11 extension module.
pybind11_add_module(_xrt xaiepy/xrt.cpp)
# Headers come from the XRT source tree, its generated headers under
# ${XRT_BINARY_DIR}/gen, and Boost.
target_include_directories(_xrt PRIVATE ${XRT_INCLUDE_DIR}
${XRT_BINARY_DIR}/gen
${Boost_INCLUDE_DIRS}
)
target_link_directories(_xrt PRIVATE ${XRT_LIB_DIR})
target_link_libraries(_xrt PRIVATE xrt_coreutil uuid)
set_target_properties(
_xrt
PROPERTIES
# Do not embed the build-tree RPATH in the module; build it with the install
# RPATH instead.
# NOTE(review): presumably this is so that the XRT libraries are resolved at
# run time from the environment (e.g. LD_LIBRARY_PATH) and XRT's own driver
# lookup, linked below, sees the intended library location -- confirm.
# https://github.com/Xilinx/XRT/blob/edcae12640ce96ec597c4c0cc1b2a850cfcc5c8b/src/runtime_src/core/common/module_loader.cpp#L201-L205
SKIP_BUILD_RPATH ON
BUILD_WITH_INSTALL_RPATH ON)
# Building xaie also builds _xrt.
add_dependencies(xaie _xrt)

# ##############################################################################
# finish
# ##############################################################################

# Every listed target is built as position-independent code.
# NOTE(review): this first call looks like the pre-change side of the diff; the
# call that follows lists the same targets plus _xrt -- confirm only one of the
# two is meant to remain.
set_target_properties(_bootgen _xclbinutil bootgen-lib cdo_driver xaie
xclbinutil-lib PROPERTIES POSITION_INDEPENDENT_CODE ON)
set_target_properties(
_bootgen
_xclbinutil
_xrt
bootgen-lib
cdo_driver
xaie
xclbinutil-lib
PROPERTIES POSITION_INDEPENDENT_CODE ON)

# Libraries, archives and runtime artifacts of these targets all go to
# ${OUTPUT_DIR}.
set_target_properties(
# NOTE(review): the next two lines look like the old and new target lists of
# the diff shown together (the second adds _xrt) -- confirm.
_bootgen _xclbinutil pyxrt xaie xclbinutil-lib
_bootgen _xclbinutil pyxrt xaie xclbinutil-lib _xrt
PROPERTIES LIBRARY_OUTPUT_DIRECTORY ${OUTPUT_DIR}
ARCHIVE_OUTPUT_DIRECTORY ${OUTPUT_DIR}
RUNTIME_OUTPUT_DIRECTORY ${OUTPUT_DIR})
51 changes: 51 additions & 0 deletions examples/harness.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,51 @@
from pathlib import Path

import numpy as np
from filelock import FileLock
from xaiepy.xrt import XCLBin, list_kernels

# don't forget LD_LIBRARY_PATH=/opt/xilinx/xrt/lib:/usr/lib/x86_64-linux-gnu


# Problem size: C[M] = A[M, K] @ B[K].
M = K = N = 64

TEST = "basic_matrix_multiplication_matrix_vector"
WORKDIR = Path(__file__).parent.absolute() / TEST / "module_dummy1_amdaie_xclbin_fb"
NPU_INSTS_FP = f"{WORKDIR}/module_dummy1_amdaie_xclbin_fb.npu.txt"
XCLBIN_PATH = f"{WORKDIR}/module_dummy1_amdaie_xclbin_fb.xclbin"
KERNEL_NAME = "dummy2"
NUM_ARGS = 3

# The instruction file holds one hexadecimal word per line. It is read once;
# blank lines are skipped so that a trailing newline cannot break int(..., 16).
with open(NPU_INSTS_FP, "r") as f:
    npu_insts = [int(line, 16) for line in f if line.strip()]

# uint32 view of the instruction words (not used below; kept for inspection).
instr_v = np.array(npu_insts, dtype=np.uint32)

list_kernels(XCLBIN_PATH)

# Hold the lock for the whole session, including reading the results back
# through the mapped buffers.
with FileLock("/tmp/npu.lock"):
    xclbin = XCLBin(XCLBIN_PATH, KERNEL_NAME)
    views = xclbin.mmap_buffers([(M, K), (K,), (M,)], np.float32)

    xclbin.load_npu_instructions(npu_insts)

    A = np.ones((M, K), dtype=np.float32)
    B = 2 * np.ones((K,), dtype=np.float32)
    C = np.zeros((M,), dtype=np.float32)

    # Wrap the memoryviews as arrays and fill them in place with the inputs.
    wraps = list(map(np.asarray, views))
    np.copyto(wraps[0], A, casting="no")
    np.copyto(wraps[1], B, casting="no")
    np.copyto(wraps[2], C, casting="no")

    xclbin.sync_buffers_to_device()
    xclbin.run()
    print("Running kernel")
    xclbin.wait(30)
    xclbin.sync_buffers_from_device()

    print(wraps)
    assert np.allclose(A @ B, wraps[2])
2 changes: 1 addition & 1 deletion third_party/XRT
Submodule XRT updated 87 files
+1 −1 .github/workflows/xrt_ci.yml
+7 −1 .gitignore
+1 −1 build/petalinux.build
+3 −3 src/CMake/nativeTests.cmake
+2 −1 src/runtime_src/core/common/api/bo_int.h
+33 −8 src/runtime_src/core/common/api/hw_queue.cpp
+8 −5 src/runtime_src/core/common/api/hw_queue.h
+14 −5 src/runtime_src/core/common/api/module_int.h
+5 −1 src/runtime_src/core/common/api/xrt_bo.cpp
+16 −1 src/runtime_src/core/common/api/xrt_device.cpp
+4 −0 src/runtime_src/core/common/api/xrt_hw_context.cpp
+169 −104 src/runtime_src/core/common/api/xrt_kernel.cpp
+157 −64 src/runtime_src/core/common/api/xrt_module.cpp
+29 −22 src/runtime_src/core/common/bo_cache.h
+7 −0 src/runtime_src/core/common/config_reader.h
+2 −1 src/runtime_src/core/common/drv/kds_core.c
+4 −3 src/runtime_src/core/common/shim/buffer_handle.h
+0 −9 src/runtime_src/core/common/shim/hwqueue_handle.h
+1 −0 src/runtime_src/core/edge/user/aie/common_layer/adf_api_config.h
+1 −0 src/runtime_src/core/edge/user/zynq_dev.h
+179 −36 src/runtime_src/core/include/ert.h
+2 −0 src/runtime_src/core/include/experimental/xrt_elf.h
+6 −0 src/runtime_src/core/include/xrt/xrt_bo.h
+3 −2 src/runtime_src/core/include/xrt/xrt_device.h
+28 −0 src/runtime_src/core/include/xrt/xrt_hw_context.h
+56 −0 src/runtime_src/core/include/xrt/xrt_kernel.h
+1 −1 src/runtime_src/core/pcie/driver/linux/xocl/lib/libqdma
+15 −3 src/runtime_src/core/pcie/driver/linux/xocl/mgmtpf/mgmt-core.c
+3 −0 src/runtime_src/core/pcie/driver/linux/xocl/mgmtpf/mgmt-core.h
+2 −1 src/runtime_src/core/pcie/driver/linux/xocl/mgmtpf/mgmt-sysfs.c
+144 −16 src/runtime_src/core/pcie/driver/linux/xocl/mgmtpf/mgmt-utils.c
+2 −21 src/runtime_src/core/pcie/driver/linux/xocl/subdev/qdma.c
+2 −0 src/runtime_src/core/pcie/driver/linux/xocl/userpf/xocl_kds.c
+1 −5 src/runtime_src/core/pcie/linux/shim.h
+1 −1 src/runtime_src/core/tools/common/Table2D.cpp
+1 −1 src/runtime_src/core/tools/common/XBMain.cpp
+2 −19 src/runtime_src/core/tools/common/reports/ReportAiePartitions.cpp
+0 −2 src/runtime_src/core/tools/common/reports/ReportElectrical.cpp
+14 −10 src/runtime_src/core/tools/common/reports/ReportHost.cpp
+8 −4 src/runtime_src/core/tools/common/tests/TestDF_bandwidth.cpp
+18 −23 src/runtime_src/core/tools/common/tests/TestGemm.cpp
+6 −3 src/runtime_src/core/tools/common/tests/TestNPULatency.cpp
+7 −4 src/runtime_src/core/tools/common/tests/TestNPUThroughput.cpp
+8 −4 src/runtime_src/core/tools/common/tests/TestTCTAllColumn.cpp
+8 −5 src/runtime_src/core/tools/common/tests/TestTCTOneColumn.cpp
+4 −5 src/runtime_src/core/tools/xbutil2/CMakeLists.txt
+2 −2 src/runtime_src/core/tools/xbutil2/OO_AieClockFreq.cpp
+2 −2 src/runtime_src/core/tools/xbutil2/OO_AieRegRead.cpp
+1 −2 src/runtime_src/core/tools/xbutil2/SubCmdAdvanced.cpp
+0 −1 src/runtime_src/core/tools/xbutil2/SubCmdAdvanced.h
+1 −1 src/runtime_src/core/tools/xbutil2/SubCmdConfigure.cpp
+9 −15 src/runtime_src/core/tools/xbutil2/xbutil
+2 −2 src/runtime_src/core/tools/xbutil2/xbutil-bash-completion
+2 −2 src/runtime_src/core/tools/xbutil2/xbutil-csh-completion-wrapper
+3 −3 src/runtime_src/core/tools/xbutil2/xbutil.cpp
+51 −0 src/runtime_src/core/tools/xbutil2/xrt-smi
+1 −1 src/runtime_src/core/tools/xbutil2/xrt-smi.bat
+70 −37 src/runtime_src/hip/api/hip_module.cpp
+1 −1 src/runtime_src/hip/core/event.cpp
+19 −34 src/runtime_src/hip/core/module.cpp
+66 −18 src/runtime_src/hip/core/module.h
+24 −11 src/runtime_src/tools/scripts/boost.sh
+1 −1 src/runtime_src/tools/scripts/pkgapu.sh
+11 −3 src/runtime_src/tools/xclbinutil/XclBinUtilities.cxx
+104 −95 src/runtime_src/tools/xclbinutil/aie-pdi-transform/src/pdi-parsing.c
+253 −245 src/runtime_src/tools/xclbinutil/aie-pdi-transform/src/pdi-transform.c
+1 −0 src/runtime_src/tools/xclbinutil/unittests/AIEPartition/2220.hex
+1 −0 src/runtime_src/tools/xclbinutil/unittests/AIEPartition/2220_expected.hex
+1 −0 src/runtime_src/tools/xclbinutil/unittests/AIEPartition/2221.hex
+1 −0 src/runtime_src/tools/xclbinutil/unittests/AIEPartition/2221_expected.hex
+0 −2 src/runtime_src/tools/xclbinutil/unittests/AIEPartition/AIEPartition.py
+9 −1 src/runtime_src/xdp/profile/database/static_info/aie_constructs.h
+2 −22 src/runtime_src/xdp/profile/device/common/client_transaction.cpp
+2 −3 src/runtime_src/xdp/profile/device/common/client_transaction.h
+36 −71 src/runtime_src/xdp/profile/plugin/aie_debug/aie_debug_plugin.cpp
+2 −2 src/runtime_src/xdp/profile/plugin/aie_debug/aie_debug_plugin.h
+16 −10 src/runtime_src/xdp/profile/plugin/aie_profile/client/aie_profile.cpp
+1 −1 src/runtime_src/xdp/profile/plugin/aie_profile/client/aie_profile.h
+32 −0 src/runtime_src/xdp/profile/plugin/aie_trace/client/aie_trace.cpp
+7 −0 src/runtime_src/xdp/profile/plugin/aie_trace/edge/aie_trace.cpp
+29 −0 src/runtime_src/xdp/profile/plugin/aie_trace/util/aie_trace_util.cpp
+81 −43 src/runtime_src/xdp/profile/plugin/ml_timeline/clientDev/ml_timeline.cpp
+3 −0 src/runtime_src/xdp/profile/plugin/ml_timeline/clientDev/ml_timeline.h
+7 −1 src/runtime_src/xdp/profile/plugin/ml_timeline/ml_timeline_impl.h
+37 −0 src/runtime_src/xdp/profile/plugin/ml_timeline/ml_timeline_plugin.cpp
+1 −0 src/runtime_src/xdp/profile/plugin/ml_timeline/ml_timeline_plugin.h
+1 −1 src/runtime_src/xocl/core/kernel.cpp
169 changes: 169 additions & 0 deletions xaiepy/xrt.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,169 @@
//===- xrt.cpp --------------------------------------------------*- C++ -*-===//
//
// This file is licensed under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
// Copyright (C) 2023, Advanced Micro Devices, Inc.
//
//===----------------------------------------------------------------------===//

#include "xrt/xrt_bo.h"
#include "xrt/xrt_device.h"
#include "xrt/xrt_kernel.h"

#include <pybind11/numpy.h>
#include <pybind11/pybind11.h>
#include <pybind11/pytypes.h>
#include <pybind11/stl.h>

#include <algorithm>
#include <cstddef>
#include <cstdint>
#include <cstring>
#include <functional>
#include <iterator>
#include <memory>
#include <numeric>
#include <optional>
#include <stdexcept>
#include <string>
#include <utility>
#include <vector>

namespace py = pybind11;
using namespace py::literals;

// Value passed to the kernel in the op-code argument slot
// (TRANSACTION_OP_CODE_IDX).
constexpr size_t TRANSACTION_API_OP_CODE = 3;
// Kernel argument layout (see aiecc.main.emit_design_kernel_json):
//   group_id 0 is for the op code
//   group_id 1 is for npu instructions
//   group_id 2 is for number of npu instructions
//   host side buffers/args follow starting from position 3
// Argument index of the op code.
constexpr size_t TRANSACTION_OP_CODE_IDX = 0;
// Argument index of the buffer object holding the npu instructions.
constexpr size_t INSTRUCTION_BO_IDX = 1;
// Argument index of the npu instruction count.
constexpr size_t INSTRUCTION_LEN_IDX = 2;
// Argument index of the first host-side buffer.
constexpr size_t HOST_BUFFERS_START_IDX = 3;

class PyXCLBin {
public:
PyXCLBin(const std::string &xclBinPath, const std::string &kernelName,
int deviceIndex)
: xclBin(std::make_unique<xrt::xclbin>(xclBinPath)),
device(std::make_unique<xrt::device>(deviceIndex)) {
device->register_xclbin(*xclBin);
context = std::make_unique<xrt::hw_context>(*device, xclBin->get_uuid());
kernel = std::make_unique<xrt::kernel>(*context, kernelName);
}

void loadNPUInstructions(const std::vector<uint32_t> &insts) {
npuInstructions = std::make_unique<xrt::bo>(
*device, insts.size() * sizeof(uint32_t), XCL_BO_FLAGS_CACHEABLE,
kernel->group_id(INSTRUCTION_BO_IDX));
npuInstructions->write(insts.data());
npuInstructions->sync(XCL_BO_SYNC_BO_TO_DEVICE);
}

template <typename ElementT>
std::vector<py::memoryview>
mmapBuffers(std::vector<std::vector<int>> shapes) {
this->buffers.reserve(shapes.size());
std::vector<py::memoryview> views;
views.reserve(shapes.size());

auto initAndViewBuffer = [this](
std::vector<int> shape, int groupId,
std::vector<std::unique_ptr<xrt::bo>> &buffers,
std::vector<py::memoryview> &views) {
int nElements =
std::accumulate(shape.begin(), shape.end(), 1, std::multiplies<>());
int nBytes = nElements * sizeof(ElementT);
xrt::bo xrtBuf(*device, nBytes, XRT_BO_FLAGS_HOST_ONLY,
kernel->group_id(groupId));
buffers.push_back(std::make_unique<xrt::bo>(xrtBuf));

ElementT *buf = xrtBuf.map<ElementT *>();
memset(buf, 0, nElements);

std::vector strides_{1};
for (int i = shape.size() - 1; i > 0; i--)
strides_.push_back(strides_.back() * shape[i]);
std::vector<int> strides;
// stride in bytes
std::transform(strides_.rbegin(), strides_.rend(),
std::back_inserter(strides),
[](int s) { return s * sizeof(ElementT); });
views.push_back(py::memoryview::from_buffer(buf, shape, strides));
};

for (size_t i = 0; i < shapes.size(); ++i)
initAndViewBuffer(shapes[i], HOST_BUFFERS_START_IDX + i, this->buffers,
views);
return views;
}

uint64_t getBufferHostAddress(size_t idx) { return buffers[idx]->address(); }

void syncBuffersToDevice() {
for (auto &buf : this->buffers)
buf->sync(XCL_BO_SYNC_BO_TO_DEVICE);
}

void syncBuffersFromDevice() {
for (auto &buf : this->buffers)
buf->sync(XCL_BO_SYNC_BO_FROM_DEVICE);
}

void run() {
run_ = std::make_unique<xrt::run>(*kernel);
run_->set_arg(TRANSACTION_OP_CODE_IDX, TRANSACTION_API_OP_CODE);
run_->set_arg(INSTRUCTION_BO_IDX, *npuInstructions);
run_->set_arg(INSTRUCTION_LEN_IDX, npuInstructions->size());
for (size_t i = 0; i < buffers.size(); ++i)
run_->set_arg(HOST_BUFFERS_START_IDX + i, *buffers[i]);
run_->start();
}

void wait(const std::optional<int> timeout) { run_->wait2(); }

std::unique_ptr<xrt::xclbin> xclBin;
std::unique_ptr<xrt::device> device;
std::unique_ptr<xrt::hw_context> context;
std::unique_ptr<xrt::kernel> kernel;
std::unique_ptr<xrt::bo> npuInstructions;

std::vector<std::unique_ptr<xrt::bo>> buffers;

std::unique_ptr<xrt::run> run_;
};

// Python module `_xrt`: exposes PyXCLBin as `XCLBin` plus the free function
// `list_kernels`.
PYBIND11_MODULE(_xrt, m) {
  // Picks the mmapBuffers instantiation matching the numpy scalar type object
  // passed from Python (compared by identity against numpy's attributes).
  auto mmapByFormat = [](PyXCLBin &self,
                         const std::vector<std::vector<int>> &shapes,
                         const py::object &npFormat) {
    auto numpy = py::module_::import("numpy");
    if (npFormat.is(numpy.attr("int16")))
      return self.mmapBuffers<int16_t>(shapes);
    if (npFormat.is(numpy.attr("int32")))
      return self.mmapBuffers<int32_t>(shapes);
    if (npFormat.is(numpy.attr("float32")))
      return self.mmapBuffers<float>(shapes);
    if (npFormat.is(numpy.attr("int64")))
      return self.mmapBuffers<int64_t>(shapes);
    if (npFormat.is(numpy.attr("float64")))
      return self.mmapBuffers<double>(shapes);
    throw std::runtime_error("unsupported np format: " +
                             py::repr(npFormat).cast<std::string>());
  };

  py::class_<PyXCLBin> xclBinClass(m, "XCLBin", py::module_local());
  xclBinClass.def(py::init<const std::string &, const std::string &, int>(),
                  "xclbin_path"_a, "kernel_name"_a, "device_index"_a = 0);
  xclBinClass.def("load_npu_instructions", &PyXCLBin::loadNPUInstructions,
                  "insts"_a);
  xclBinClass.def("sync_buffers_to_device", &PyXCLBin::syncBuffersToDevice);
  xclBinClass.def("sync_buffers_from_device",
                  &PyXCLBin::syncBuffersFromDevice);
  xclBinClass.def("run", &PyXCLBin::run);
  xclBinClass.def("wait", &PyXCLBin::wait, "timeout"_a = py::none());
  xclBinClass.def("mmap_buffers", mmapByFormat, "shapes"_a, "np_format"_a);
  xclBinClass.def("_get_buffer_host_address",
                  [](PyXCLBin &self, size_t idx) {
                    return self.getBufferHostAddress(idx);
                  });

  // Prints the name of every kernel contained in the xclbin at `fp`.
  m.def("list_kernels", [](std::string fp) {
    xrt::xclbin loaded(fp);
    const auto kernels = loaded.get_kernels();
    for (const auto &k : kernels)
      py::print(k.get_name());
  });
}
1 change: 1 addition & 0 deletions xaiepy/xrt.py
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
from ._xrt import *
Loading