Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Kernel attributes #360

Merged
merged 34 commits into from
Feb 8, 2025
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
34 commits
Select commit Hold shift + click to select a range
4703716
add first iter of the attributes
ksimpson-work Dec 27, 2024
da61e9c
Merge remote-tracking branch 'upstream/main' into kernel-attributes
ksimpson-work Jan 6, 2025
8c24631
update the kernel attributes branch
ksimpson-work Jan 6, 2025
71de911
complete the update
ksimpson-work Jan 6, 2025
5bbf259
remove unrelated files
ksimpson-work Jan 6, 2025
d4e966e
remove file
ksimpson-work Jan 6, 2025
41c4407
leverage fixture in fixture
ksimpson-work Jan 6, 2025
2feadfa
skip test if cuda < 12
ksimpson-work Jan 7, 2025
1913a73
Merge branch 'main' into kernel-attributes
ksimpson-work Jan 8, 2025
bcc2c4e
Merge remote-tracking branch 'upstream/main' into kernel-attributes
ksimpson-work Jan 9, 2025
46f648c
handle exceptions better
ksimpson-work Jan 10, 2025
1a2dc73
Merge remote-tracking branch 'upstream/main' into kernel-attributes
ksimpson-work Jan 10, 2025
4302df9
Merge remote-tracking branch 'origin/kernel-attributes' into kernel-a…
ksimpson-work Jan 10, 2025
c8d473e
remove the context manager and improve the docs
ksimpson-work Jan 10, 2025
887f6ea
unremove the copyright header
ksimpson-work Jan 10, 2025
0d55bc4
merge main
ksimpson-work Jan 21, 2025
2f617eb
slight modifications
ksimpson-work Jan 21, 2025
caebb92
update test
ksimpson-work Jan 21, 2025
72031e3
update test
ksimpson-work Jan 21, 2025
69c9633
add to release notes
ksimpson-work Jan 21, 2025
1ccdd81
add subclass
ksimpson-work Jan 21, 2025
3d6f30e
replace todo comment
ksimpson-work Jan 21, 2025
7f0a673
'Merge remote-tracking branch 'origin/main' into kernel-attributes
ksimpson-work Jan 27, 2025
f13fd1b
reformat the kernel attributes
ksimpson-work Jan 27, 2025
71aabcc
Merge branch 'main' into kernel-attributes
ksimpson-work Jan 31, 2025
a8f9387
Merge remote-tracking branch 'upstream/main' into kernel-attributes
ksimpson-work Jan 31, 2025
1c6fa9e
Merge remote-tracking branch 'origin/kernel-attributes' into kernel-a…
ksimpson-work Jan 31, 2025
ab2b587
Merge branch 'main' into kernel-attributes
ksimpson-work Feb 3, 2025
e5332e8
take device argument
ksimpson-work Feb 5, 2025
169df54
Merge remote-tracking branch 'origin/kernel-attributes' into kernel-a…
ksimpson-work Feb 5, 2025
6ceb5db
use cache. results: platform linux -- Python 3.12.7, pytest-8.3.3, p…
ksimpson-work Feb 5, 2025
8c5a14a
remove bench from test
ksimpson-work Feb 6, 2025
2ae7cfb
Merge branch 'main' into kernel-attributes
ksimpson-work Feb 6, 2025
86a536a
fallback to cuFuncAPI
ksimpson-work Feb 7, 2025
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
149 changes: 144 additions & 5 deletions cuda_core/cuda/core/experimental/_module.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,13 +3,16 @@
# SPDX-License-Identifier: LicenseRef-NVIDIA-SOFTWARE-LICENSE


from warnings import warn

from cuda.core.experimental._utils import driver, get_binding_version, handle_return, precondition

_backend = {
"old": {
"file": driver.cuModuleLoad,
"data": driver.cuModuleLoadDataEx,
"kernel": driver.cuModuleGetFunction,
"attribute": driver.cuFuncGetAttribute,
},
}

Expand All @@ -34,6 +37,7 @@ def _lazy_init():
"file": driver.cuLibraryLoadFromFile,
"data": driver.cuLibraryLoadData,
"kernel": driver.cuLibraryGetKernel,
"attribute": driver.cuKernelGetAttribute,
}
_kernel_ctypes = (driver.CUfunction, driver.CUkernel)
else:
Expand All @@ -42,6 +46,136 @@ def _lazy_init():
_inited = True


class KernelAttributes:
    """Read-only driver attributes of a loaded kernel.

    Instances are created lazily by ``Kernel.attributes`` through :meth:`_init`;
    direct construction is disallowed. Each attribute is fetched from the driver
    at most once per ``(device_id, attribute)`` pair and cached thereafter.
    """

    # Was ``slots = (...)`` — an inert class attribute with no effect. Using
    # ``__slots__`` actually restricts instances to the fields _init assigns.
    __slots__ = ("_handle", "_cache", "_backend_version", "_loader")

    def __init__(self):
        raise RuntimeError("KernelAttributes should not be instantiated directly")

    @classmethod
    def _init(cls, handle):
        """Internal constructor wrapping a driver kernel/function handle.

        Decorated as a classmethod so the existing call form
        ``KernelAttributes._init(handle)`` keeps working (``cls`` is bound
        automatically), while the intent is now explicit.
        """
        self = cls.__new__(cls)
        self._handle = handle
        self._cache = {}  # {device_id: {attribute_enum: value}}

        # cuKernelGetAttribute (per-device lookup) requires both the binding
        # and the driver to be >= 12; otherwise fall back to cuFuncGetAttribute.
        self._backend_version = "new" if (_py_major_ver >= 12 and _driver_ver >= 12000) else "old"
        self._loader = _backend[self._backend_version]
        return self

    def _get_cached_attribute(self, device_id: int, attribute: driver.CUfunction_attribute) -> int:
        """Helper function to get a cached attribute or fetch and cache it if not present."""
        per_device = self._cache.get(device_id)
        if per_device is not None and attribute in per_device:
            return per_device[attribute]
        if self._backend_version == "new":
            result = handle_return(self._loader["attribute"](attribute, self._handle, device_id))
        else:  # "old" backend: cuFuncGetAttribute has no device argument
            warn(
                "Device ID argument is ignored when getting attribute from kernel when cuda version < 12. ",
                RuntimeWarning,
                stacklevel=2,
            )
            result = handle_return(self._loader["attribute"](attribute, self._handle))
        self._cache.setdefault(device_id, {})[attribute] = result
        return result

    def max_threads_per_block(self, device_id: int = None) -> int:
        """int : The maximum number of threads per block.
        This attribute is read-only."""
        return self._get_cached_attribute(
            device_id, driver.CUfunction_attribute.CU_FUNC_ATTRIBUTE_MAX_THREADS_PER_BLOCK
        )

    def shared_size_bytes(self, device_id: int = None) -> int:
        """int : The size in bytes of statically-allocated shared memory required by this function.
        This attribute is read-only."""
        return self._get_cached_attribute(device_id, driver.CUfunction_attribute.CU_FUNC_ATTRIBUTE_SHARED_SIZE_BYTES)

    def const_size_bytes(self, device_id: int = None) -> int:
        """int : The size in bytes of user-allocated constant memory required by this function.
        This attribute is read-only."""
        return self._get_cached_attribute(device_id, driver.CUfunction_attribute.CU_FUNC_ATTRIBUTE_CONST_SIZE_BYTES)

    def local_size_bytes(self, device_id: int = None) -> int:
        """int : The size in bytes of local memory used by each thread of this function.
        This attribute is read-only."""
        return self._get_cached_attribute(device_id, driver.CUfunction_attribute.CU_FUNC_ATTRIBUTE_LOCAL_SIZE_BYTES)

    def num_regs(self, device_id: int = None) -> int:
        """int : The number of registers used by each thread of this function.
        This attribute is read-only."""
        return self._get_cached_attribute(device_id, driver.CUfunction_attribute.CU_FUNC_ATTRIBUTE_NUM_REGS)

    def ptx_version(self, device_id: int = None) -> int:
        """int : The PTX virtual architecture version for which the function was compiled.
        This attribute is read-only."""
        return self._get_cached_attribute(device_id, driver.CUfunction_attribute.CU_FUNC_ATTRIBUTE_PTX_VERSION)

    def binary_version(self, device_id: int = None) -> int:
        """int : The binary architecture version for which the function was compiled.
        This attribute is read-only."""
        return self._get_cached_attribute(device_id, driver.CUfunction_attribute.CU_FUNC_ATTRIBUTE_BINARY_VERSION)

    def cache_mode_ca(self, device_id: int = None) -> bool:
        """bool : Whether the function has been compiled with user specified option "-Xptxas --dlcm=ca" set.
        This attribute is read-only."""
        return bool(self._get_cached_attribute(device_id, driver.CUfunction_attribute.CU_FUNC_ATTRIBUTE_CACHE_MODE_CA))

    def max_dynamic_shared_size_bytes(self, device_id: int = None) -> int:
        """int : The maximum size in bytes of dynamically-allocated shared memory that can be used
        by this function."""
        return self._get_cached_attribute(
            device_id, driver.CUfunction_attribute.CU_FUNC_ATTRIBUTE_MAX_DYNAMIC_SHARED_SIZE_BYTES
        )

    def preferred_shared_memory_carveout(self, device_id: int = None) -> int:
        """int : The shared memory carveout preference, in percent of the total shared memory."""
        return self._get_cached_attribute(
            device_id, driver.CUfunction_attribute.CU_FUNC_ATTRIBUTE_PREFERRED_SHARED_MEMORY_CARVEOUT
        )

    def cluster_size_must_be_set(self, device_id: int = None) -> bool:
        """bool : The kernel must launch with a valid cluster size specified.
        This attribute is read-only."""
        return bool(
            self._get_cached_attribute(
                device_id, driver.CUfunction_attribute.CU_FUNC_ATTRIBUTE_CLUSTER_SIZE_MUST_BE_SET
            )
        )

    def required_cluster_width(self, device_id: int = None) -> int:
        """int : The required cluster width in blocks."""
        return self._get_cached_attribute(
            device_id, driver.CUfunction_attribute.CU_FUNC_ATTRIBUTE_REQUIRED_CLUSTER_WIDTH
        )

    def required_cluster_height(self, device_id: int = None) -> int:
        """int : The required cluster height in blocks."""
        return self._get_cached_attribute(
            device_id, driver.CUfunction_attribute.CU_FUNC_ATTRIBUTE_REQUIRED_CLUSTER_HEIGHT
        )

    def required_cluster_depth(self, device_id: int = None) -> int:
        """int : The required cluster depth in blocks."""
        return self._get_cached_attribute(
            device_id, driver.CUfunction_attribute.CU_FUNC_ATTRIBUTE_REQUIRED_CLUSTER_DEPTH
        )

    def non_portable_cluster_size_allowed(self, device_id: int = None) -> bool:
        """bool : Whether the function can be launched with non-portable cluster size."""
        return bool(
            self._get_cached_attribute(
                device_id, driver.CUfunction_attribute.CU_FUNC_ATTRIBUTE_NON_PORTABLE_CLUSTER_SIZE_ALLOWED
            )
        )

    def cluster_scheduling_policy_preference(self, device_id: int = None) -> int:
        """int : The block scheduling policy of a function."""
        return self._get_cached_attribute(
            device_id, driver.CUfunction_attribute.CU_FUNC_ATTRIBUTE_CLUSTER_SCHEDULING_POLICY_PREFERENCE
        )


class Kernel:
"""Represent a compiled kernel that had been loaded onto the device.

Expand All @@ -53,13 +187,10 @@ class Kernel:

"""

__slots__ = (
"_handle",
"_module",
)
__slots__ = ("_handle", "_module", "_attributes")

def __init__(self):
raise NotImplementedError("directly constructing a Kernel instance is not supported")
raise RuntimeError("directly constructing a Kernel instance is not supported")

@staticmethod
def _from_obj(obj, mod):
Expand All @@ -68,8 +199,16 @@ def _from_obj(obj, mod):
ker = Kernel.__new__(Kernel)
ker._handle = obj
ker._module = mod
ker._attributes = None
return ker

@property
def attributes(self):
"""Get the read-only attributes of this kernel."""
if self._attributes is None:
self._attributes = KernelAttributes._init(self._handle)
return self._attributes

# TODO: implement from_handle()


Expand Down
1 change: 1 addition & 0 deletions cuda_core/docs/source/release/0.2.0-notes.rst
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,7 @@ Highlights
----------

- Add :class:`~ProgramOptions` to facilitate the passing of runtime compile options to :obj:`~Program`.
- Add kernel attributes to :class:`~_module.Kernel`.

Limitations
-----------
Expand Down
65 changes: 63 additions & 2 deletions cuda_core/tests/test_module.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,14 +10,75 @@
import pytest
from conftest import can_load_generated_ptx

from cuda.core.experimental import Program, ProgramOptions
from cuda.core.experimental import Program, ProgramOptions, system


@pytest.fixture(scope="function")
def get_saxpy_kernel(init_cuda):
    # Templated SAXPY with a grid-stride loop so any launch configuration
    # covers all N elements. Fix: the loop body must index with the loop
    # variable ``i`` — the original wrote ``out[tid] = a*x[tid] + y[tid]``,
    # recomputing the same element every iteration and leaving the rest of
    # ``out`` unwritten whenever gridDim.x * blockDim.x < N.
    code = """
    template<typename T>
    __global__ void saxpy(const T a,
                    const T* x,
                    const T* y,
                    T* out,
                    size_t N) {
        const unsigned int tid = threadIdx.x + blockIdx.x * blockDim.x;
        for (size_t i=tid; i<N; i+=gridDim.x*blockDim.x) {
            out[i] = a * x[i] + y[i];
        }
    }
    """

    # prepare program
    prog = Program(code, code_type="c++")
    mod = prog.compile(
        "cubin",
        name_expressions=("saxpy<float>", "saxpy<double>"),
    )

    # run in single precision
    return mod.get_kernel("saxpy<float>")


@pytest.mark.xfail(not can_load_generated_ptx(), reason="PTX version too new")
def test_get_kernel(init_cuda):
    """Module loading is lazy: no handle exists until get_kernel() forces it."""
    # (Diff residue had left a second, parameterless ``def test_get_kernel():``
    # line above this one — a syntax error; the init_cuda-fixture version is
    # the current signature.)
    kernel = """extern "C" __global__ void ABC() { }"""
    object_code = Program(kernel, "c++", options=ProgramOptions(relocatable_device_code=True)).compile("ptx")
    assert object_code._handle is None
    kernel = object_code.get_kernel("ABC")
    assert object_code._handle is not None
    assert kernel._handle is not None


@pytest.mark.parametrize(
    "attr, expected_type",
    [
        ("max_threads_per_block", int),
        ("shared_size_bytes", int),
        ("const_size_bytes", int),
        ("local_size_bytes", int),
        ("num_regs", int),
        ("ptx_version", int),
        ("binary_version", int),
        ("cache_mode_ca", bool),
        ("cluster_size_must_be_set", bool),
        ("max_dynamic_shared_size_bytes", int),
        ("preferred_shared_memory_carveout", int),
        ("required_cluster_width", int),
        ("required_cluster_height", int),
        ("required_cluster_depth", int),
        ("non_portable_cluster_size_allowed", bool),
        ("cluster_scheduling_policy_preference", int),
    ],
)
def test_read_only_kernel_attributes(get_saxpy_kernel, attr, expected_type):
    """Each attribute accessor yields a value of the documented type."""
    method = getattr(get_saxpy_kernel.attributes, attr)
    # First without a device ordinal: the call must succeed and return a value.
    assert method() is not None
    # Then once per device present on this system, checking the value's type.
    for device in system.devices:
        value = method(device.device_id)
        assert isinstance(value, expected_type), f"Expected {attr} to be of type {expected_type}, but got {type(value)}"
Loading