diff --git a/CMakeLists.txt b/CMakeLists.txt
index e344140..9cd0a24 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -187,15 +187,42 @@ set_target_properties(
 
 add_dependencies(xaie pyxrt)
 
+# ##############################################################################
+# own xrt bindings
+# ##############################################################################
+
+pybind11_add_module(_xrt xaiepy/xrt.cpp)
+target_include_directories(_xrt PRIVATE ${XRT_INCLUDE_DIR}
+                                        ${XRT_BINARY_DIR}/gen
+                                        ${Boost_INCLUDE_DIRS}
+)
+target_link_directories(_xrt PRIVATE ${XRT_LIB_DIR})
+target_link_libraries(_xrt PRIVATE xrt_coreutil uuid)
+set_target_properties(
+  _xrt
+  PROPERTIES
+    # pyxrt and xrt in general do a ridiculous dance with drivers
+    # https://github.com/Xilinx/XRT/blob/edcae12640ce96ec597c4c0cc1b2a850cfcc5c8b/src/runtime_src/core/common/module_loader.cpp#L201-L205
+    SKIP_BUILD_RPATH ON
+    BUILD_WITH_INSTALL_RPATH ON)
+add_dependencies(xaie _xrt)
+
 # ##############################################################################
 # finish
 # ##############################################################################
 
-set_target_properties(_bootgen _xclbinutil bootgen-lib cdo_driver xaie
-                      xclbinutil-lib PROPERTIES POSITION_INDEPENDENT_CODE ON)
+set_target_properties(
+  _bootgen
+  _xclbinutil
+  _xrt
+  bootgen-lib
+  cdo_driver
+  xaie
+  xclbinutil-lib
+  PROPERTIES POSITION_INDEPENDENT_CODE ON)
 
 set_target_properties(
-  _bootgen _xclbinutil pyxrt xaie xclbinutil-lib
+  _bootgen _xclbinutil pyxrt xaie xclbinutil-lib _xrt
   PROPERTIES LIBRARY_OUTPUT_DIRECTORY ${OUTPUT_DIR}
              ARCHIVE_OUTPUT_DIRECTORY ${OUTPUT_DIR}
              RUNTIME_OUTPUT_DIRECTORY ${OUTPUT_DIR})
diff --git a/examples/harness.py b/examples/harness.py
new file mode 100644
index 0000000..8fbcc03
--- /dev/null
+++ b/examples/harness.py
@@ -0,0 +1,49 @@
+from pathlib import Path
+
+import numpy as np
+from filelock import FileLock
+from xaiepy.xrt import XCLBin, list_kernels
+
+# don't forget LD_LIBRARY_PATH=/opt/xilinx/xrt/lib:/usr/lib/x86_64-linux-gnu
+
+
+M = K = N = 64
+
+TEST = "basic_matrix_multiplication_matrix_vector"
+WORKDIR = Path(__file__).parent.absolute() / TEST / "module_dummy1_amdaie_xclbin_fb"
+NPU_INSTS_FP = f"{WORKDIR}/module_dummy1_amdaie_xclbin_fb.npu.txt"
+XCLBIN_PATH = f"{WORKDIR}/module_dummy1_amdaie_xclbin_fb.xclbin"
+KERNEL_NAME = "dummy2"
+NUM_ARGS = 3
+
+# One hexadecimal instruction word per line; blank lines are skipped.
+with open(NPU_INSTS_FP, "r") as f:
+    npu_insts = [int(line, 16) for line in f if line.strip()]
+
+list_kernels(XCLBIN_PATH)
+
+# presumably serializes NPU access between processes - confirm
+with FileLock("/tmp/npu.lock"):
+    xclbin = XCLBin(XCLBIN_PATH, KERNEL_NAME)
+    views = xclbin.mmap_buffers([(M, K), (K,), (M,)], np.float32)
+
+    xclbin.load_npu_instructions(npu_insts)
+
+    A = np.ones((M, K), dtype=np.float32)
+    B = 2 * np.ones((K,), dtype=np.float32)
+    C = np.zeros((M,), dtype=np.float32)
+
+    # numpy arrays wrapping the memoryviews returned by mmap_buffers
+    wraps = list(map(np.asarray, views))
+    np.copyto(wraps[0], A, casting="no")
+    np.copyto(wraps[1], B, casting="no")
+    np.copyto(wraps[2], C, casting="no")
+
+    xclbin.sync_buffers_to_device()
+    xclbin.run()
+    print("Running kernel")
+    xclbin.wait(30)
+    xclbin.sync_buffers_from_device()
+
+    print(wraps)
+    assert np.allclose(A @ B, wraps[2])
diff --git a/third_party/XRT b/third_party/XRT
index c678a94..fdba05d 160000
--- a/third_party/XRT
+++ b/third_party/XRT
@@ -1 +1 @@
-Subproject commit c678a9469f9b20fcb9a04bbedb5c51f8473faec0
+Subproject commit fdba05d985fc9a083af6dee1a18688e6f9796f5a
diff --git a/xaiepy/xrt.cpp b/xaiepy/xrt.cpp
new file mode 100644
index 0000000..636315f
--- /dev/null
+++ b/xaiepy/xrt.cpp
@@ -0,0 +1,207 @@
+//===- xrt.cpp --------------------------------------------------*- C++ -*-===//
+//
+// This file is licensed under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+// Copyright (C) 2023, Advanced Micro Devices, Inc.
+//
+//===----------------------------------------------------------------------===//
+
+#include "xrt/xrt_bo.h"
+#include "xrt/xrt_device.h"
+#include "xrt/xrt_kernel.h"
+
+#include <pybind11/numpy.h>
+#include <pybind11/pybind11.h>
+#include <pybind11/pytypes.h>
+#include <pybind11/stl.h>
+
+#include <algorithm>
+#include <cstddef>
+#include <cstdint>
+#include <cstring>
+#include <memory>
+#include <numeric>
+#include <optional>
+#include <stdexcept>
+#include <string>
+#include <vector>
+
+namespace py = pybind11;
+using namespace py::literals;
+
+constexpr size_t TRANSACTION_API_OP_CODE = 3;
+// group_id 0 is for the op code
+// group_id 1 is for npu instructions
+// group_id 2 is for number of npu instructions
+// host side buffers/args follow starting from position 3
+// see aiecc.main.emit_design_kernel_json
+constexpr size_t TRANSACTION_OP_CODE_IDX = 0;
+constexpr size_t INSTRUCTION_BO_IDX = 1;
+constexpr size_t INSTRUCTION_LEN_IDX = 2;
+constexpr size_t HOST_BUFFERS_START_IDX = 3;
+
+// Owns the XRT objects needed to run one kernel from an xclbin (device,
+// hardware context, kernel, instruction buffer, host argument buffers) and
+// exposes the argument buffers to Python as memoryviews.
+class PyXCLBin {
+public:
+  // Loads the xclbin at `xclBinPath`, registers it with device `deviceIndex`
+  // and opens `kernelName` in a new hardware context.
+  PyXCLBin(const std::string &xclBinPath, const std::string &kernelName,
+           int deviceIndex)
+      : xclBin(std::make_unique<xrt::xclbin>(xclBinPath)),
+        device(std::make_unique<xrt::device>(deviceIndex)) {
+    device->register_xclbin(*xclBin);
+    context = std::make_unique<xrt::hw_context>(*device, xclBin->get_uuid());
+    kernel = std::make_unique<xrt::kernel>(*context, kernelName);
+  }
+
+  // Copies `insts` into a fresh instruction buffer and syncs it to the device.
+  // Must be called before run().
+  void loadNPUInstructions(const std::vector<uint32_t> &insts) {
+    npuInstructions = std::make_unique<xrt::bo>(
+        *device, insts.size() * sizeof(uint32_t), XCL_BO_FLAGS_CACHEABLE,
+        kernel->group_id(INSTRUCTION_BO_IDX));
+    npuInstructions->write(insts.data());
+    npuInstructions->sync(XCL_BO_SYNC_BO_TO_DEVICE);
+  }
+
+  // Allocates one host-only buffer per entry of `shapes`, zero-fills it and
+  // returns a row-major memoryview over each mapping, in the same order.
+  // Throws std::invalid_argument if a dimension is negative.
+  // NOTE(review): the views do not keep this object alive; presumably they
+  // dangle once the underlying buffers are released - confirm.
+  template <typename ElementT>
+  std::vector<py::memoryview>
+  mmapBuffers(std::vector<std::vector<int>> shapes) {
+    buffers.reserve(buffers.size() + shapes.size());
+    std::vector<py::memoryview> views;
+    views.reserve(shapes.size());
+
+    for (size_t i = 0; i < shapes.size(); ++i) {
+      const std::vector<int> &shape = shapes[i];
+      if (std::any_of(shape.begin(), shape.end(),
+                      [](int dim) { return dim < 0; }))
+        throw std::invalid_argument("buffer dimensions must be non-negative");
+
+      // Accumulate in size_t so large shapes do not overflow int.
+      const size_t nElements = std::accumulate(
+          shape.begin(), shape.end(), size_t{1},
+          [](size_t acc, int dim) { return acc * static_cast<size_t>(dim); });
+      const size_t nBytes = nElements * sizeof(ElementT);
+
+      xrt::bo xrtBuf(*device, nBytes, XRT_BO_FLAGS_HOST_ONLY,
+                     kernel->group_id(HOST_BUFFERS_START_IDX + i));
+      ElementT *buf = xrtBuf.map<ElementT *>();
+      // Clear the whole mapping: the length is in bytes, not elements.
+      std::memset(buf, 0, nBytes);
+      buffers.push_back(std::make_unique<xrt::bo>(xrtBuf));
+
+      // Row-major strides in bytes, filled from the innermost dimension out.
+      std::vector<py::ssize_t> dims(shape.begin(), shape.end());
+      std::vector<py::ssize_t> strides(shape.size());
+      py::ssize_t stride = sizeof(ElementT);
+      for (size_t d = shape.size(); d-- > 0;) {
+        strides[d] = stride;
+        stride *= dims[d];
+      }
+      views.push_back(py::memoryview::from_buffer(buf, dims, strides));
+    }
+    return views;
+  }
+
+  // Returns the address reported by XRT for host buffer `idx`; throws
+  // std::out_of_range for an invalid index.
+  uint64_t getBufferHostAddress(size_t idx) {
+    return buffers.at(idx)->address();
+  }
+
+  void syncBuffersToDevice() {
+    for (auto &buf : this->buffers)
+      buf->sync(XCL_BO_SYNC_BO_TO_DEVICE);
+  }
+
+  void syncBuffersFromDevice() {
+    for (auto &buf : this->buffers)
+      buf->sync(XCL_BO_SYNC_BO_FROM_DEVICE);
+  }
+
+  // Binds the op code, instruction buffer, instruction length and every host
+  // buffer as kernel arguments, then starts the run without waiting for it.
+  void run() {
+    if (!npuInstructions)
+      throw std::runtime_error(
+          "load_npu_instructions must be called before run");
+    run_ = std::make_unique<xrt::run>(*kernel);
+    run_->set_arg(TRANSACTION_OP_CODE_IDX, TRANSACTION_API_OP_CODE);
+    run_->set_arg(INSTRUCTION_BO_IDX, *npuInstructions);
+    // NOTE(review): xrt::bo::size() is a byte count, while the comment above
+    // the index constants says "number of npu instructions" - confirm.
+    run_->set_arg(INSTRUCTION_LEN_IDX, npuInstructions->size());
+    for (size_t i = 0; i < buffers.size(); ++i)
+      run_->set_arg(HOST_BUFFERS_START_IDX + i, *buffers[i]);
+    run_->start();
+  }
+
+  // Waits on the run started by run(); throws if run() was never called.
+  // NOTE(review): `timeout` is accepted but not forwarded to wait2().
+  void wait(const std::optional<int> timeout) {
+    if (!run_)
+      throw std::runtime_error("run must be called before wait");
+    run_->wait2();
+  }
+
+  std::unique_ptr<xrt::xclbin> xclBin;
+  std::unique_ptr<xrt::device> device;
+  std::unique_ptr<xrt::hw_context> context;
+  std::unique_ptr<xrt::kernel> kernel;
+  std::unique_ptr<xrt::bo> npuInstructions;
+
+  std::vector<std::unique_ptr<xrt::bo>> buffers;
+
+  std::unique_ptr<xrt::run> run_;
+};
+
+PYBIND11_MODULE(_xrt, m) {
+
+  py::class_<PyXCLBin>(m, "XCLBin", py::module_local())
+      .def(py::init<const std::string &, const std::string &, int>(),
+           "xclbin_path"_a, "kernel_name"_a, "device_index"_a = 0)
+      .def("load_npu_instructions", &PyXCLBin::loadNPUInstructions, "insts"_a)
+      .def("sync_buffers_to_device", &PyXCLBin::syncBuffersToDevice)
+      .def("sync_buffers_from_device", &PyXCLBin::syncBuffersFromDevice)
+      .def("run", &PyXCLBin::run)
+      .def("wait", &PyXCLBin::wait, "timeout"_a = py::none())
+      .def(
+          "mmap_buffers",
+          // Dispatch on the numpy scalar type to pick the element type.
+          [](PyXCLBin &self, const std::vector<std::vector<int>> &shapes,
+             const py::object &npFormat) {
+            auto npy = py::module_::import("numpy");
+            if (npFormat.is(npy.attr("int16")))
+              return self.mmapBuffers<int16_t>(shapes);
+            if (npFormat.is(npy.attr("int32")))
+              return self.mmapBuffers<int32_t>(shapes);
+            if (npFormat.is(npy.attr("float32")))
+              return self.mmapBuffers<float>(shapes);
+            if (npFormat.is(npy.attr("int64")))
+              return self.mmapBuffers<int64_t>(shapes);
+            if (npFormat.is(npy.attr("float64")))
+              return self.mmapBuffers<double>(shapes);
+            throw std::runtime_error("unsupported np format: " +
+                                     py::repr(npFormat).cast<std::string>());
+          },
+          "shapes"_a, "np_format"_a)
+      .def("_get_buffer_host_address", [](PyXCLBin &self, size_t idx) {
+        return self.getBufferHostAddress(idx);
+      });
+  // Prints the name of every kernel contained in the xclbin at `fp`.
+  m.def("list_kernels", [](const std::string &fp) {
+    auto xclbin = xrt::xclbin(fp);
+    auto xkernels = xclbin.get_kernels();
+    for (const auto &item : xkernels)
+      py::print(item.get_name());
+  });
+}
diff --git a/xaiepy/xrt.py b/xaiepy/xrt.py
new file mode 100644
index 0000000..ae076db
--- /dev/null
+++ b/xaiepy/xrt.py
@@ -0,0 +1 @@
+from ._xrt import *
\ No newline at end of file