diff --git a/.test-conda-env-py3.yml b/.test-conda-env-py3.yml index 5aca7fccc..6f268dff4 100644 --- a/.test-conda-env-py3.yml +++ b/.test-conda-env-py3.yml @@ -1,5 +1,8 @@ name: test-conda-env channels: +# For https://github.com/pocl/pocl/pull/1069 +# See https://github.com/conda-forge/pocl-feedstock/pull/80 +- conda-forge/label/pocl_dev - conda-forge - nodefaults diff --git a/doc/index.rst b/doc/index.rst index bc4af44cc..54c38c21e 100644 --- a/doc/index.rst +++ b/doc/index.rst @@ -110,11 +110,11 @@ Contents runtime_memory runtime_program runtime_gl + tools array types algorithm howto - tools misc 🚀 Github 💾 Download Releases diff --git a/doc/misc.rst b/doc/misc.rst index ff71ae7a4..0ebd67346 100644 --- a/doc/misc.rst +++ b/doc/misc.rst @@ -29,7 +29,6 @@ Then run:: You can install these pieces of software in your user account and do not need root/administrator privileges. - Enabling access to CPUs and GPUs via (Py)OpenCL ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ @@ -283,6 +282,13 @@ other software to be turned into the corresponding :mod:`pyopencl` objects. User-visible Changes ==================== +Version 2022.2 +-------------- + +- Added :ref:`opaque-style SVM ` and :class:`pyopencl.SVMPointer`. +- Added :class:`pyopencl.tools.SVMPool`. +- Added automatic queue-synchronized deallocation of SVM. + Version 2020.3 -------------- .. note:: @@ -728,7 +734,9 @@ Funding Work on pytential was supported in part by * the US National Science Foundation under grant numbers DMS-1418961, - DMS-1654756, SHF-1911019, and OAC-1931577. + DMS-1654756, SHF-1911019, and OAC-1931577, and +* the Department of Energy, National Nuclear Security Administration, + under Award Number DE-NA0003963. AK also gratefully acknowledges a hardware gift from Nvidia Corporation. diff --git a/doc/runtime_const.rst b/doc/runtime_const.rst index 864a641c7..b6f34ea60 100644 --- a/doc/runtime_const.rst +++ b/doc/runtime_const.rst @@ -6,6 +6,7 @@ OpenCL Runtime: Constants .. include:: constants.inc .. class:: NameVersion + Describes the version of a specific feature. .. note:: @@ -19,6 +20,7 @@ OpenCL Runtime: Constants .. attribute:: name .. class:: DeviceTopologyAmd + .. method:: __init__(bus, device, function) .. attribute:: type .. attribute:: bus diff --git a/doc/runtime_memory.rst b/doc/runtime_memory.rst index f4e01f266..f9c31a8bf 100644 --- a/doc/runtime_memory.rst +++ b/doc/runtime_memory.rst @@ -116,14 +116,109 @@ by both the host and the device. *Coarse-grain* SVM requires that buffers be mapped before being accessed on the host, *fine-grain* SVM does away with that requirement. +.. warning:: + + Compared to :class:`Buffer`\ s, SVM brings with it a new concern: the + synchronization of memory deallocation. Unlike other objects in OpenCL, + SVM is represented by a plain (C-language) pointer and thus has no ability for + reference counting. + + As a result, it is perfectly legal to allocate a :class:`Buffer`, enqueue an + operation on it, and release the buffer, without worrying about whether the + operation has completed. The OpenCL implementation will keep the buffer alive + until the operation has completed. This is *not* the case with SVM: Unless + otherwise specified, memory deallocation is performed immediately when + requested, and so SVM will be deallocated whenever the Python + garbage collector sees fit, even if the operation has not completed, + immediately leading to undefined behavior (i.e., typically, memory corruption and, + before too long, a crash). + + Version 2022.2 of PyOpenCL offers substantially improved tools + for dealing with this. In particular, all means for allocating SVM + allow specifying a :class:`CommandQueue`, so that deallocation + is enqueued and performed after previously-enqueued operations + have completed. + SVM requires OpenCL 2.0. +.. _opaque-svm: + +Opaque and "Wrapped-:mod:`numpy`" Styles of Referencing SVM +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +When trying to pass SVM pointers to functionality in :mod:`pyopencl`, +two styles are supported: + +- First, the opaque style. This style most closely resembles + :class:`Buffer`-based allocation available in OpenCL 1.x. + SVM pointers are held in opaque "handle" objects such as :class:`SVMAllocation`. + +- Second, the wrapped-:mod:`numpy` style. In this case, a :class:`numpy.ndarray` + (or another object implementing the :c:func:`Python buffer protocol + `) serves as the reference to an area of SVM. + This style permits using memory areas with :mod:`pyopencl`'s SVM + interfaces even if they were allocated outside of :mod:`pyopencl`. + + Since passing a :class:`numpy.ndarray` (or another type of object obeying the + buffer interface) already has existing semantics in most settings in + :mod:`pyopencl` (such as when passing arguments to a kernel or calling + :func:`enqueue_copy`), there exists a wrapper object, :class:`SVM`, that may + be "wrapped around" these objects to mark them as SVM. + +The commonality between the two styles is that both ultimately implement +the :class:`SVMPointer` interface, which :mod:`pyopencl` uses to obtain +the actual SVM pointer. + +Note that it is easily possible to obtain a :class:`numpy.ndarray` view of SVM +areas held in the opaque style, see :attr:`SVMPointer.buf`, permitting +transitions from opaque to wrapped-:mod:`numpy` style. The opposite transition +(from wrapped-:mod:`numpy` to opaque) is not necessarily straightforward, +as it would require "fishing" the opaque SVM handle out of a chain of +:attr:`numpy.ndarray.base` attributes (or similar, depending on +the actual object serving as the main SVM reference). + +See :ref:`numpy-svm-helpers` for helper functions that ease setting up the +wrapped-:mod:`numpy` structure. + +Wrapped-:mod:`numpy` SVM tends to be a good fit for fine-grain SVM because of +the ease of direct host-side access, but the creation of the nested structure +that makes this possible is associated with a certain amount of cost. + +By comparison, opaque SVM access tends to be a good fit for coarse-grain +SVM, because direct host access is not possible without mapping the array +anyway, and it has lower setup cost. It is of course entirely possible to use +opaque SVM access with fine-grain SVM. + +.. versionchanged:: 2022.2 + + This version adds the opaque style of SVM access. + +Using SVM with Arrays +^^^^^^^^^^^^^^^^^^^^^ + +While all types of SVM can be used as the memory backing +:class:`pyopencl.array.Array` objects, ensuring that new arrays returned +by array operations (e.g. arithmetic) also use SVM is easiest to accomplish +by passing an :class:`~pyopencl.tools.SVMAllocator` (or +:class:`~pyopencl.tools.SVMPool`) as the *allocator* parameter in functions +returning new arrays. + +SVM Pointers, Allocations, and Maps +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +.. autoclass:: SVMPointer + +.. autoclass:: SVMAllocation + .. autoclass:: SVM .. autoclass:: SVMMap -Allocating SVM -^^^^^^^^^^^^^^ + +.. _numpy-svm-helpers: + +Helper functions for :mod:`numpy`-based SVM allocation +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ .. autofunction:: svm_empty .. autofunction:: svm_empty_like @@ -140,11 +235,6 @@ Operations on SVM .. autofunction:: enqueue_svm_memfill .. autofunction:: enqueue_svm_migratemem -SVM Allocation Holder -^^^^^^^^^^^^^^^^^^^^^ - -.. autoclass:: SVMAllocation - Image ----- @@ -281,6 +371,8 @@ Transfers .. autofunction:: enqueue_copy(queue, dest, src, **kwargs) +.. autofunction:: enqueue_fill(queue, dest, src, **kwargs) + Mapping Memory into Host Address Space -------------------------------------- @@ -406,3 +498,11 @@ Pipes See :class:`pipe_info` for values of *param*. +Type aliases +------------ + +.. currentmodule:: pyopencl._cl + +.. class:: Buffer + + See :class:`pyopencl.Buffer`. diff --git a/doc/tools.rst b/doc/tools.rst index 7fdde084e..080d1c89a 100644 --- a/doc/tools.rst +++ b/doc/tools.rst @@ -1,203 +1,4 @@ Built-in Utilities ================== -.. module:: pyopencl.tools - -.. _memory-pools: - -Memory Pools ------------- - -The constructor :func:`pyopencl.Buffer` can consume a fairly large amount of -processing time if it is invoked very frequently. For example, code based on -:class:`pyopencl.array.Array` can easily run into this issue because a -fresh memory area is allocated for each intermediate result. Memory pools are a -remedy for this problem based on the observation that often many of the block -allocations are of the same sizes as previously used ones. - -Then, instead of fully returning the memory to the system and incurring the -associated reallocation overhead, the pool holds on to the memory and uses it -to satisfy future allocations of similarly-sized blocks. The pool reacts -appropriately to out-of-memory conditions as long as all memory allocations -are made through it. Allocations performed from outside of the pool may run -into spurious out-of-memory conditions due to the pool owning much or all of -the available memory. - -Using :class:`pyopencl.array.Array` instances with a :class:`MemoryPool` is -not complicated:: - - mem_pool = pyopencl.tools.MemoryPool(pyopencl.tools.ImmediateAllocator(queue)) - a_dev = cl_array.arange(queue, 2000, dtype=np.float32, allocator=mem_pool) - -.. class:: PooledBuffer - - An object representing a :class:`MemoryPool`-based allocation of - device memory. Once this object is deleted, its associated device - memory is returned to the pool. This supports the same interface - as :class:`pyopencl.Buffer`. - -.. class:: AllocatorInterface - - An interface implemented by various memory allocation functions - in :mod:`pyopencl`. - - .. method:: __call__(size) - - Allocate and return a :class:`pyopencl.Buffer` of the given *size*. - -.. class:: DeferredAllocator(context, mem_flags=pyopencl.mem_flags.READ_WRITE) - - *mem_flags* takes its values from :class:`pyopencl.mem_flags` and corresponds - to the *flags* argument of :class:`pyopencl.Buffer`. DeferredAllocator - has the same semantics as regular OpenCL buffer allocation, i.e. it may - promise memory to be available that may (in any call to a buffer-using - CL function) turn out to not exist later on. (Allocations in CL are - bound to contexts, not devices, and memory availability depends on which - device the buffer is used with.) - - Implements :class:`AllocatorInterface`. - - .. versionchanged :: 2013.1 - - ``CLAllocator`` was deprecated and replaced - by :class:`DeferredAllocator`. - - .. method:: __call__(size) - - Allocate a :class:`pyopencl.Buffer` of the given *size*. - - .. versionchanged :: 2020.2 - - The allocator will succeed even for allocations of size zero, - returning *None*. - -.. class:: ImmediateAllocator(queue, mem_flags=pyopencl.mem_flags.READ_WRITE) - - *mem_flags* takes its values from :class:`pyopencl.mem_flags` and corresponds - to the *flags* argument of :class:`pyopencl.Buffer`. - :class:`ImmediateAllocator` will attempt to ensure at allocation time that - allocated memory is actually available. If no memory is available, an out-of-memory - error is reported at allocation time. - - Implements :class:`AllocatorInterface`. - - .. versionadded:: 2013.1 - - .. method:: __call__(size) - - Allocate a :class:`pyopencl.Buffer` of the given *size*. - - .. versionchanged :: 2020.2 - - The allocator will succeed even for allocations of size zero, - returning *None*. - -.. class:: MemoryPool(allocator[, leading_bits_in_bin_id]) - - A memory pool for OpenCL device memory. *allocator* must be an instance of - one of the above classes, and should be an :class:`ImmediateAllocator`. - The memory pool assumes that allocation failures are reported - by the allocator immediately, and not in the OpenCL-typical - deferred manner. - - Implements :class:`AllocatorInterface`. - - .. note:: - - The current implementation of the memory pool will retain allocated - memory after it is returned by the application and keep it in a bin - identified by the leading *leading_bits_in_bin_id* bits of the - allocation size. To ensure that allocations within each bin are - interchangeable, allocation sizes are rounded up to the largest size - that shares the leading bits of the requested allocation size. - - The current default value of *leading_bits_in_bin_id* is - four, but this may change in future versions and is not - guaranteed. - - *leading_bits_in_bin_id* must be passed by keyword, - and its role is purely advisory. It is not guaranteed - that future versions of the pool will use the - same allocation scheme and/or honor *leading_bits_in_bin_id*. - - .. versionchanged:: 2019.1 - - Current bin allocation behavior documented, *leading_bits_in_bin_id* - added. - - .. attribute:: held_blocks - - The number of unused blocks being held by this pool. - - .. attribute:: active_blocks - - The number of blocks in active use that have been allocated - through this pool. - - .. attribute:: managed_bytes - - "Managed" memory is "active" and "held" memory. - - .. versionadded: 2021.1.2 - - .. attribute:: active_bytes - - "Active" bytes are bytes under the control of the application. - This may be smaller than the actual allocated size reflected - in :attr:`managed_bytes`. - - .. versionadded: 2021.1.2 - - .. method:: allocate(size) - - Return a :class:`PooledBuffer` of the given *size*. - - .. method:: __call__(size) - - Synonym for :meth:`allocate` to match the :class:`AllocatorInterface`. - - .. versionadded: 2011.2 - - .. method:: free_held - - Free all unused memory that the pool is currently holding. - - .. method:: stop_holding - - Instruct the memory to start immediately freeing memory returned - to it, instead of holding it for future allocations. - Implicitly calls :meth:`free_held`. - This is useful as a cleanup action when a memory pool falls out - of use. - -CL-Object-dependent Caching ---------------------------- - -.. autofunction:: first_arg_dependent_memoize -.. autofunction:: clear_first_arg_caches - -Testing -------- - -.. function:: pytest_generate_tests_for_pyopencl(metafunc) - - Using the line:: - - from pyopencl.tools import pytest_generate_tests_for_pyopencl \ - as pytest_generate_tests - - in your `pytest `_ test scripts allows you to use the - arguments *ctx_factory*, *device*, or *platform* in your test functions, - and they will automatically be run for each OpenCL device/platform in the - system, as appropriate. - - The following two environment variables are also supported to control - device/platform choice:: - - PYOPENCL_TEST=0:0,1;intel=i5,i7 - -Device Characterization ------------------------ - -.. automodule:: pyopencl.characterize - :members: +.. automodule:: pyopencl.tools diff --git a/examples/demo_array_svm.py b/examples/demo_array_svm.py new file mode 100644 index 000000000..724d32b63 --- /dev/null +++ b/examples/demo_array_svm.py @@ -0,0 +1,33 @@ +import pyopencl as cl +import pyopencl.array as cl_array +from pyopencl.tools import SVMAllocator, SVMPool +import numpy as np + +n = 50000 +a = np.random.rand(n).astype(np.float32) +b = np.random.rand(n).astype(np.float32) + + +ctx = cl.create_some_context() +queue = cl.CommandQueue(ctx) + +alloc = SVMAllocator(ctx, alignment=0, queue=queue) +alloc = SVMPool(alloc) + +a_dev = cl_array.to_device(queue, a, allocator=alloc) +b_dev = cl_array.to_device(queue, b, allocator=alloc) +dest_dev = cl_array.empty_like(a_dev) + +prg = cl.Program(ctx, """ + __kernel void sum(__global const float *a, + __global const float *b, __global float *c) + { + int gid = get_global_id(0); + c[gid] = a[gid] + b[gid]; + } + """).build() + +knl = prg.sum +knl(queue, a.shape, None, a_dev.data, b_dev.data, dest_dev.data) + +np.testing.assert_allclose(dest_dev.get(), (a_dev+b_dev).get()) diff --git a/pyopencl/__init__.py b/pyopencl/__init__.py index eaf909633..ef96cb9bf 100644 --- a/pyopencl/__init__.py +++ b/pyopencl/__init__.py @@ -21,6 +21,8 @@ """ from sys import intern +from warnings import warn +from typing import Union, Any, Optional, Sequence from pyopencl.version import VERSION, VERSION_STATUS, VERSION_TEXT # noqa @@ -43,7 +45,6 @@ import os from os.path import dirname, join, realpath if realpath(join(os.getcwd(), "pyopencl")) == realpath(dirname(__file__)): - from warnings import warn warn("It looks like you are importing PyOpenCL from " "its source directory. This likely won't work.") raise @@ -199,11 +200,9 @@ if get_cl_header_version() >= (2, 0): from pyopencl._cl import ( # noqa - SVMAllocation, + SVMPointer, SVM, - - # FIXME - #enqueue_svm_migratemem, + SVMAllocation, ) if _cl.have_gl(): @@ -267,7 +266,6 @@ class CommandQueueUsedAfterExit(UserWarning): def compiler_output(text): import os - from warnings import warn if int(os.environ.get("PYOPENCL_COMPILER_OUTPUT", "0")): warn(text, CompilerWarning) else: @@ -389,7 +387,6 @@ def enable_debugging(platform_or_context): import os os.environ["CPU_MAX_COMPUTE_UNITS"] = "1" else: - from warnings import warn warn("do not know how to enable debugging on '%s'" % platform.name) @@ -428,7 +425,6 @@ def _get_prg(self): return self._prg else: # "no program" can only happen in from-source case. - from warnings import warn warn("Pre-build attribute access defeats compiler caching.", stacklevel=3) @@ -662,7 +658,6 @@ def device_hashable_model_and_version_identifier(self): return ("v1", self.vendor, self.vendor_id, self.name, self.version) def device_persistent_unique_id(self): - from warnings import warn warn("Device.persistent_unique_id is deprecated. " "Use Device.hashable_model_and_version_identifier instead.", DeprecationWarning, stacklevel=2) @@ -684,7 +679,6 @@ def device_persistent_unique_id(self): def context_init(self, devices, properties, dev_type, cache_dir=None): if cache_dir is not None: - from warnings import warn warn("The 'cache_dir' argument to the Context constructor " "is deprecated and no longer has an effect. " "It was removed because it only applied to the wrapper " @@ -970,7 +964,6 @@ def image_init(self, context, flags, format, shape=None, pitches=None, if hostbuf is not None and not \ (flags & (mem_flags.USE_HOST_PTR | mem_flags.COPY_HOST_PTR)): - from warnings import warn warn("'hostbuf' was passed, but no memory flags to make use of it.") if hostbuf is None and pitches is not None: @@ -1043,7 +1036,6 @@ def image_init(self, context, flags, format, shape=None, pitches=None, class _ImageInfoGetter: def __init__(self, event): - from warnings import warn warn("Image.image.attr is deprecated and will go away in 2021. " "Use Image.attr directly, instead.") @@ -1131,44 +1123,166 @@ def memory_map_exit(self, exc_type, exc_val, exc_tb): # }}} - # {{{ SVMAllocation + # {{{ SVMPointer if get_cl_header_version() >= (2, 0): - SVMAllocation.__doc__ = """An object whose lifetime is tied to an - allocation of shared virtual memory. + SVMPointer.__doc__ = """A base class for things that can be passed to + functions that allow an SVM pointer, e.g. kernel enqueues and memory + copies. - .. note:: + Objects of this type cannot currently be directly created or + implemented in Python. To obtain objects implementing this type, + consider its subtypes :class:`SVMAllocation` and :class:`SVM`. - Most likely, you will not want to use this directly, but rather - :func:`svm_empty` and related functions which allow access to this - functionality using a friendlier, more Pythonic interface. - .. versionadded:: 2016.2 + .. property:: svm_ptr - .. automethod:: __init__(self, ctx, size, alignment, flags=None) - .. automethod:: release - .. automethod:: enqueue_release + Gives the SVM pointer as an :class:`int`. + + .. property:: size + + An :class:`int` denoting the size in bytes, or *None*, if the size + of the SVM pointed to is not known. + + *Most* objects of this type (e.g. instances of + :class:`SVMAllocation` and :class:`SVM` know their size, so that, + for example :class:`enqueue_copy` will automatically copy an entire + :class:`SVMAllocation` when a size is not explicitly specified. + + .. automethod:: map + .. automethod:: map_ro + .. automethod:: map_rw + .. automethod:: as_buffer + .. property:: buf + + An opaque object implementing the :c:func:`Python buffer protocol + `. It exposes the pointed-to memory as + a one-dimensional buffer of bytes, with the size matching + :attr:`size`. + + No guarantee is provided that two references to this attribute + result in the same object. """ - if get_cl_header_version() >= (2, 0): - svmallocation_old_init = SVMAllocation.__init__ + def svmptr_map(self, queue: CommandQueue, *, flags: int, is_blocking: bool = + True, wait_for: Optional[Sequence[Event]] = None, + size: Optional[Event] = None) -> "SVMMap": + """ + :arg is_blocking: If *False*, subsequent code must wait on + :attr:`SVMMap.event` in the returned object before accessing the + mapped memory. + :arg flags: a combination of :class:`pyopencl.map_flags`. + :arg size: The size of the map in bytes. If not provided, defaults to + :attr:`size`. - def svmallocation_init(self, ctx, size, alignment, flags, _interface=None): + |std-enqueue-blurb| + """ + return SVMMap(self, + np.asarray(self.buf), + queue, + _cl._enqueue_svm_map(queue, is_blocking, flags, self, wait_for, + size=size)) + + def svmptr_map_ro(self, queue: CommandQueue, *, is_blocking: bool = True, + wait_for: Optional[Sequence[Event]] = None, + size: Optional[int] = None) -> "SVMMap": + """Like :meth:`map`, but with *flags* set for a read-only map. + """ + + return self.map(queue, flags=map_flags.READ, + is_blocking=is_blocking, wait_for=wait_for, size=size) + + def svmptr_map_rw(self, queue: CommandQueue, *, is_blocking: bool = True, + wait_for: Optional[Sequence[Event]] = None, + size: Optional[int] = None) -> "SVMMap": + """Like :meth:`map`, but with *flags* set for a read-only map. + """ + + return self.map(queue, flags=map_flags.READ | map_flags.WRITE, + is_blocking=is_blocking, wait_for=wait_for, size=size) + + def svmptr__enqueue_unmap(self, queue, wait_for=None): + return _cl._enqueue_svm_unmap(queue, self, wait_for) + + def svmptr_as_buffer(self, ctx: Context, *, flags: Optional[int] = None, + size: Optional[int] = None) -> Buffer: """ :arg ctx: a :class:`Context` - :arg flags: some of :class:`svm_mem_flags`. + :arg flags: a combination of :class:`pyopencl.map_flags`, defaults to + read-write. + :arg size: The size of the map in bytes. If not provided, defaults to + :attr:`size`. + :returns: a :class:`Buffer` corresponding to *self*. + + The memory referred to by this object must not be freed before + the returned :class:`Buffer` is released. """ - svmallocation_old_init(self, ctx, size, alignment, flags) - # mem_flags.READ_ONLY applies to kernels, not the host - read_write = True - _interface["data"] = ( - int(self._ptr_as_int()), not read_write) + if flags is None: + flags = mem_flags.READ_WRITE | mem_flags.USE_HOST_PTR + + if size is None: + size = self.size + + return Buffer(ctx, flags, size=size, hostbuf=self.buf) + + if get_cl_header_version() >= (2, 0): + SVMPointer.map = svmptr_map + SVMPointer.map_ro = svmptr_map_ro + SVMPointer.map_rw = svmptr_map_rw + SVMPointer._enqueue_unmap = svmptr__enqueue_unmap + SVMPointer.as_buffer = svmptr_as_buffer + + # }}} - self.__array_interface__ = _interface + # {{{ SVMAllocation if get_cl_header_version() >= (2, 0): - SVMAllocation.__init__ = svmallocation_init + SVMAllocation.__doc__ = """ + Is a :class:`SVMPointer`. + + .. versionadded:: 2016.2 + + .. automethod:: __init__ + + :arg flags: See :class:`svm_mem_flags`. + :arg queue: If not specified, the allocation will be freed + eagerly, irrespective of whether pending/enqueued operations + are still using this memory. + + If specified, deallocation of the memory will be enqueued + with the given queue, and will only be performed + after previously-enqueue operations in the queue have + completed. + + It is an error to specify an out-of-order queue. + + .. warning:: + + Not specifying a queue will typically lead to undesired + behavior, including crashes and memory corruption. + See the warning in :ref:`svm`. + + .. automethod:: enqueue_release + + Enqueue the release of this allocation into *queue*. + If *queue* is not specified, enqueue the deallocation + into the queue provided at allocation time or via + :class:`bind_to_queue`. + + .. automethod:: bind_to_queue + + Change the queue used for implicit enqueue of deallocation + to *queue*. Sufficient synchronization is ensured by + enqueuing a marker into the old queue and waiting on this + marker in the new queue. + + .. automethod:: unbind_from_queue + + Configure the allocation to no longer implicitly enqueue + memory allocation. If such a queue was previously provided, + :meth:`~CommandQueue.finish` is automatically called on it. + """ # }}} @@ -1179,23 +1293,14 @@ def svmallocation_init(self, ctx, size, alignment, flags, _interface=None): (such as a :class:`numpy.ndarray`) as referring to shared virtual memory. + Is a :class:`SVMPointer`, hence objects of this type may be passed + to kernel calls and :func:`enqueue_copy`, and all methods declared + there are also available there. Note that :meth:`map` differs + slightly from :meth:`SVMPointer.map`. + Depending on the features of the OpenCL implementation, the following types of objects may be passed to/wrapped in this type: - * coarse-grain shared memory as returned by (e.g.) :func:`csvm_empty` - for any implementation of OpenCL 2.0. - - This is how coarse-grain SVM may be used from both host and device:: - - svm_ary = cl.SVM( - cl.csvm_empty(ctx, 1000, np.float32, alignment=64)) - assert isinstance(svm_ary.mem, np.ndarray) - - with svm_ary.map_rw(queue) as ary: - ary.fill(17) # use from host - - prg.twice(queue, svm_ary.mem.shape, None, svm_ary) - * fine-grain shared memory as returned by (e.g.) :func:`fsvm_empty`, if the implementation supports fine-grained shared virtual memory. This memory may directly be passed to a kernel:: @@ -1222,10 +1327,28 @@ def svmallocation_init(self, ctx, size, alignment, flags, _interface=None): queue.finish() # synchronize print(ary) # access from host - Objects of this type may be passed to kernel calls and - :func:`enqueue_copy`. Coarse-grain shared-memory *must* be mapped - into host address space using :meth:`map` before being accessed - through the :mod:`numpy` interface. + * coarse-grain shared memory as returned by (e.g.) :func:`csvm_empty` + for any implementation of OpenCL 2.0. + + .. note:: + + Applications making use of coarse-grain SVM may be better + served by opaque-style SVM. See :ref:`opaque-svm`. + + This is how coarse-grain SVM may be used from both host and device:: + + svm_ary = cl.SVM( + cl.csvm_empty(ctx, 1000, np.float32, alignment=64)) + assert isinstance(svm_ary.mem, np.ndarray) + + with svm_ary.map_rw(queue) as ary: + ary.fill(17) # use from host + + prg.twice(queue, svm_ary.mem.shape, None, svm_ary) + + Coarse-grain shared-memory *must* be mapped into host address space + using :meth:`~SVMPointer.map` before being accessed through the + :mod:`numpy` interface. .. note:: @@ -1246,9 +1369,10 @@ def svmallocation_init(self, ctx, size, alignment, flags, _interface=None): .. automethod:: map .. automethod:: map_ro .. automethod:: map_rw - .. automethod:: as_buffer """ + # }}} + if get_cl_header_version() >= (2, 0): svm_old_init = SVM.__init__ @@ -1262,14 +1386,18 @@ def svm_map(self, queue, flags, is_blocking=True, wait_for=None): :arg is_blocking: If *False*, subsequent code must wait on :attr:`SVMMap.event` in the returned object before accessing the mapped memory. - :arg flags: a combination of :class:`pyopencl.map_flags`, defaults to - read-write. + :arg flags: a combination of :class:`pyopencl.map_flags`. :returns: an :class:`SVMMap` instance + This differs from the inherited :class:`SVMPointer.map` in that no size + can be specified, and that :attr:`mem` is the exact array produced + when the :class:`SVMMap` is used as a context manager. + |std-enqueue-blurb| """ return SVMMap( self, + self.mem, queue, _cl._enqueue_svm_map(queue, is_blocking, flags, self, wait_for)) @@ -1288,29 +1416,12 @@ def svm_map_rw(self, queue, is_blocking=True, wait_for=None): def svm__enqueue_unmap(self, queue, wait_for=None): return _cl._enqueue_svm_unmap(queue, self, wait_for) - def svm_as_buffer(self, ctx, flags=None): - """ - :arg ctx: a :class:`Context` - :arg flags: a combination of :class:`pyopencl.map_flags`, defaults to - read-write. - :returns: a :class:`Buffer` corresponding to *self*. - - The memory referred to by this object must not be freed before - the returned :class:`Buffer` is released. - """ - - if flags is None: - flags = mem_flags.READ_WRITE - - return Buffer(ctx, flags, size=self.mem.nbytes, hostbuf=self.mem) - if get_cl_header_version() >= (2, 0): SVM.__init__ = svm_init SVM.map = svm_map SVM.map_ro = svm_map_ro SVM.map_rw = svm_map_rw SVM._enqueue_unmap = svm__enqueue_unmap - SVM.as_buffer = svm_as_buffer # }}} @@ -1413,6 +1524,27 @@ def gl_object_get_gl_object(self): # }}} +# {{{ _OverriddenArrayInterfaceSVMAllocation + +if get_cl_header_version() >= (2, 0): + class _OverriddenArrayInterfaceSVMAllocation(SVMAllocation): + def __init__(self, ctx, size, alignment, flags, *, _interface, + queue=None): + """ + :arg ctx: a :class:`Context` + :arg flags: some of :class:`svm_mem_flags`. + """ + super().__init__(ctx, size, alignment, flags, queue) + + # mem_flags.READ_ONLY applies to kernels, not the host + read_write = True + _interface["data"] = (int(self.svm_ptr), not read_write) + + self.__array_interface__ = _interface + +# }}} + + # {{{ create_some_context def create_some_context(interactive=None, answers=None): @@ -1553,19 +1685,24 @@ def parse_device(choice): class SVMMap: """ - .. attribute:: event + Returned by :func:`SVMPointer.map` and :func:`SVM.map`. + This class may also be used as a context manager in a ``with`` statement. + :meth:`release` will be called upon exit from the ``with`` region. + The value returned to the ``as`` part of the context manager is the + mapped Python object (e.g. a :mod:`numpy` array). .. versionadded:: 2016.2 + .. property:: event + + The :class:`Event` returned when mapping the memory. + .. automethod:: release - This class may also be used as a context manager in a ``with`` statement. - :meth:`release` will be called upon exit from the ``with`` region. - The value returned to the ``as`` part of the context manager is the - mapped Python object (e.g. a :mod:`numpy` array). """ - def __init__(self, svm, queue, event): + def __init__(self, svm, array, queue, event): self.svm = svm + self.array = array self.queue = queue self.event = event @@ -1574,7 +1711,7 @@ def __del__(self): self.release() def __enter__(self): - return self.svm.mem + return self.array def __exit__(self, exc_type, exc_val, exc_tb): self.release() @@ -1633,7 +1770,13 @@ def enqueue_copy(queue, dest, src, **kwargs): .. rubric :: Transfer :class:`Buffer` ↔ host .. ------------------------------------------------------------------------ - :arg device_offset: offset in bytes (optional) + :arg src_offset: offset in bytes (optional) + + May only be nonzero if applied on the device side. + + :arg dst_offset: offset in bytes (optional) + + May only be nonzero if applied on the device side. .. note:: @@ -1653,7 +1796,7 @@ def enqueue_copy(queue, dest, src, **kwargs): and to the minimum of the size of the source and target from 2013.1 on. :arg src_offset: (optional) - :arg dest_offset: (optional) + :arg dst_offset: (optional) .. ------------------------------------------------------------------------ .. rubric :: Rectangular :class:`Buffer` ↔ host transfers (CL 1.1 and newer) @@ -1719,7 +1862,7 @@ def enqueue_copy(queue, dest, src, **kwargs): three or shorter. (mandatory) .. ------------------------------------------------------------------------ - .. rubric :: Transfer :class:`SVM`/host ↔ :class:`SVM`/host + .. rubric :: Transfer :class:`SVMPointer`/host ↔ :class:`SVMPointer`/host .. ------------------------------------------------------------------------ :arg byte_count: (optional) If not specified, defaults to the @@ -1736,25 +1879,61 @@ def enqueue_copy(queue, dest, src, **kwargs): if dest.type == mem_object_type.BUFFER: if isinstance(src, MemoryObjectHolder): if src.type == mem_object_type.BUFFER: + # {{{ buffer -> buffer + if "src_origin" in kwargs: + # rectangular return _cl._enqueue_copy_buffer_rect( queue, src, dest, **kwargs) else: - kwargs["dst_offset"] = kwargs.pop("dest_offset", 0) + # linear + dest_offset = kwargs.pop("dest_offset", None) + if dest_offset is not None: + if "dst_offset" in kwargs: + raise TypeError("may not specify both 'dst_offset' " + "and 'dest_offset'") + + warn("The 'dest_offset' argument of enqueue_copy " + "is deprecated. Use 'dst_offset' instead. " + "'dest_offset' will stop working in 2023.x.", + DeprecationWarning, stacklevel=2) + + kwargs["dst_offset"] = dest_offset + return _cl._enqueue_copy_buffer(queue, src, dest, **kwargs) + + # }}} elif src.type in [mem_object_type.IMAGE2D, mem_object_type.IMAGE3D]: return _cl._enqueue_copy_image_to_buffer( queue, src, dest, **kwargs) else: raise ValueError("invalid src mem object type") else: - # assume from-host + # {{{ host -> buffer + if "buffer_origin" in kwargs: return _cl._enqueue_write_buffer_rect(queue, dest, src, **kwargs) else: + device_offset = kwargs.pop("device_offset", None) + if device_offset is not None: + if "dst_offset" in kwargs: + raise TypeError("may not specify both 'device_offset' " + "and 'dst_offset'") + + warn("The 'device_offset' argument of enqueue_copy " + "is deprecated. Use 'dst_offset' instead. " + "'dst_offset' will stop working in 2023.x.", + DeprecationWarning, stacklevel=2) + + kwargs["dst_offset"] = device_offset + return _cl._enqueue_write_buffer(queue, dest, src, **kwargs) + # }}} + elif dest.type in [mem_object_type.IMAGE2D, mem_object_type.IMAGE3D]: + # {{{ ... -> image + if isinstance(src, MemoryObjectHolder): if src.type == mem_object_type.BUFFER: return _cl._enqueue_copy_buffer_to_image( @@ -1776,17 +1955,28 @@ def enqueue_copy(queue, dest, src, **kwargs): return _cl._enqueue_write_image( queue, dest, origin, region, src, **kwargs) + + # }}} else: raise ValueError("invalid dest mem object type") - elif get_cl_header_version() >= (2, 0) and isinstance(dest, SVM): - # to SVM - if not isinstance(src, SVM): + elif get_cl_header_version() >= (2, 0) and isinstance(dest, SVMPointer): + # {{{ ... -> SVM + + if not isinstance(src, SVMPointer): src = SVM(src) is_blocking = kwargs.pop("is_blocking", True) + + # These are NOT documented. They only support consistency with the + # Buffer-based API for the sake of the Array. + assert kwargs.pop("src_offset", 0) == 0 + assert kwargs.pop("dst_offset", 0) == 0 + return _cl._enqueue_svm_memcpy(queue, is_blocking, dest, src, **kwargs) + # }}} + else: # assume to-host @@ -1795,7 +1985,21 @@ def enqueue_copy(queue, dest, src, **kwargs): if "buffer_origin" in kwargs: return _cl._enqueue_read_buffer_rect(queue, src, dest, **kwargs) else: + device_offset = kwargs.pop("device_offset", None) + if device_offset is not None: + if "src_offset" in kwargs: + raise TypeError("may not specify both 'device_offset' " + "and 'src_offset'") + + warn("The 'device_offset' argument of enqueue_copy " + "is deprecated. Use 'src_offset' instead. " + "'dst_offset' will stop working in 2023.x.", + DeprecationWarning, stacklevel=2) + + kwargs["src_offset"] = device_offset + return _cl._enqueue_read_buffer(queue, src, dest, **kwargs) + elif src.type in [mem_object_type.IMAGE2D, mem_object_type.IMAGE3D]: origin = kwargs.pop("origin") region = kwargs.pop("region") @@ -1810,12 +2014,20 @@ def enqueue_copy(queue, dest, src, **kwargs): queue, src, origin, region, dest, **kwargs) else: raise ValueError("invalid src mem object type") - elif isinstance(src, SVM): - # from svm + elif isinstance(src, SVMPointer): + # {{{ svm -> host + # dest is not a SVM instance, otherwise we'd be in the branch above + + # This is NOT documented. They only support consistency with the + # Buffer-based API for the sake of the Array. + assert kwargs.pop("src_offset", 0) == 0 + is_blocking = kwargs.pop("is_blocking", True) return _cl._enqueue_svm_memcpy( queue, is_blocking, SVM(dest), src, **kwargs) + + # }}} else: # assume from-host raise TypeError("enqueue_copy cannot perform host-to-host transfers") @@ -1823,6 +2035,28 @@ def enqueue_copy(queue, dest, src, **kwargs): # }}} +# {{{ enqueue_fill + +def enqueue_fill(queue: CommandQueue, + dest: "Union[MemoryObjectHolder, SVMPointer]", + pattern: Any, size: int, *, offset: int = 0, + wait_for: Optional[Sequence[Event]] = None) -> Event: + """ + .. versionadded:: 2022.2 + """ + if isinstance(dest, MemoryObjectHolder): + return enqueue_fill_buffer(queue, dest, pattern, offset, size, wait_for) + elif isinstance(dest, SVMPointer): + if offset: + raise NotImplementedError("enqueue_fill with SVM does not yet support " + "offsets") + return enqueue_svm_memfill(queue, dest, pattern, size, wait_for) + else: + raise TypeError(f"enqueue_fill does not know how to fill '{type(dest)}'") + +# }}} + + # {{{ image creation DTYPE_TO_CHANNEL_TYPE = { @@ -1928,7 +2162,6 @@ def enqueue_barrier(queue, wait_for=None): def enqueue_fill_buffer(queue, mem, pattern, offset, size, wait_for=None): if not (queue._get_cl_version() >= (1, 2) and get_cl_header_version() >= (1, 2)): - from warnings import warn warn("The context for this queue does not declare OpenCL 1.2 support, so " "the next thing you might see is a crash") @@ -1945,7 +2178,7 @@ def enqueue_fill_buffer(queue, mem, pattern, offset, size, wait_for=None): def enqueue_svm_memfill(queue, dest, pattern, byte_count=None, wait_for=None): """Fill shared virtual memory with a pattern. - :arg dest: a Python buffer object, optionally wrapped in an :class:`SVM` object + :arg dest: a Python buffer object, or any implementation of :class:`SVMPointer`. :arg pattern: a Python buffer object (e.g. a :class:`numpy.ndarray` with the fill pattern to be used. :arg byte_count: The size of the memory to be fill. Defaults to the @@ -1956,17 +2189,17 @@ def enqueue_svm_memfill(queue, dest, pattern, byte_count=None, wait_for=None): .. versionadded:: 2016.2 """ - if not isinstance(dest, SVM): + if not isinstance(dest, SVMPointer): dest = SVM(dest) return _cl._enqueue_svm_memfill( - queue, dest, pattern, byte_count=None, wait_for=None) + queue, dest, pattern, byte_count=byte_count, wait_for=wait_for) def enqueue_svm_migratemem(queue, svms, flags, wait_for=None): """ :arg svms: a collection of Python buffer objects (e.g. :mod:`numpy` - arrays), optionally wrapped in :class:`SVM` objects. + arrays), or any implementation of :class:`SVMPointer`. :arg flags: a combination of :class:`mem_migration_flags` |std-enqueue-blurb| @@ -1976,15 +2209,10 @@ def enqueue_svm_migratemem(queue, svms, flags, wait_for=None): This function requires OpenCL 2.1. """ - return _cl._enqueue_svm_migratemem( - queue, - [svm.mem if isinstance(svm, SVM) else svm - for svm in svms], - flags, - wait_for) + return _cl._enqueue_svm_migratemem(queue, svms, flags, wait_for) -def svm_empty(ctx, flags, shape, dtype, order="C", alignment=None): +def svm_empty(ctx, flags, shape, dtype, order="C", alignment=None, queue=None): """Allocate an empty :class:`numpy.ndarray` of the given *shape*, *dtype* and *order*. (See :func:`numpy.empty` for the meaning of these arguments.) The array will be allocated in shared virtual memory belonging @@ -2002,6 +2230,10 @@ def svm_empty(ctx, flags, shape, dtype, order="C", alignment=None): will likely want to wrap the returned array in an :class:`SVM` tag. .. versionadded:: 2016.2 + + .. versionchanged:: 2022.2 + + *queue* argument added. """ dtype = np.dtype(dtype) @@ -2048,7 +2280,9 @@ def svm_empty(ctx, flags, shape, dtype, order="C", alignment=None): if alignment is None: alignment = itemsize - svm_alloc = SVMAllocation(ctx, nbytes, alignment, flags, _interface=interface) + svm_alloc = _OverriddenArrayInterfaceSVMAllocation( + ctx, nbytes, alignment, flags, _interface=interface, + queue=queue) return np.asarray(svm_alloc) diff --git a/pyopencl/array.py b/pyopencl/array.py index 15ed2bbbf..699466aa4 100644 --- a/pyopencl/array.py +++ b/pyopencl/array.py @@ -31,6 +31,7 @@ from dataclasses import dataclass, field from typing import Any, List, Optional from functools import reduce +from warnings import warn import numpy as np import pyopencl.elementwise as elementwise @@ -49,6 +50,11 @@ SCALAR_CLASSES = (Number, np.bool_, bool) +if cl.get_cl_header_version() >= (2, 0): + _SVMPointer_or_nothing = cl.SVMPointer +else: + _SVMPointer_or_nothing = () + _COMMON_DTYPE_CACHE = {} @@ -130,7 +136,6 @@ class InconsistentOpenCLQueueWarning(UserWarning): class VecLookupWarner: def __getattr__(self, name): - from warnings import warn warn("pyopencl.array.vec is deprecated. " "Please use pyopencl.cltypes for OpenCL vector and scalar types", DeprecationWarning, 2) @@ -226,7 +231,6 @@ def kernel_runner(*args, **kwargs): class DefaultAllocator(cl.tools.DeferredAllocator): def __init__(self, *args, **kwargs): - from warnings import warn warn("pyopencl.array.DefaultAllocator is deprecated. " "It will be continue to exist throughout the 2013.x " "versions of PyOpenCL.", @@ -255,6 +259,7 @@ class _copy_queue: # noqa _ARRAY_GET_SIZES_CACHE = {} _BOOL_DTYPE = np.dtype(np.int8) +_NOT_PRESENT = object() class Array: @@ -591,6 +596,16 @@ def __init__(self, cq, shape, dtype, order="C", allocator=None, self.context = context self._flags = _flags + if __debug__: + if queue is not None and isinstance( + self.base_data, _SVMPointer_or_nothing): + mem_queue = getattr(self.base_data, "_queue", _NOT_PRESENT) + if mem_queue is not _NOT_PRESENT and mem_queue != queue: + warn("Array has different queue from backing SVM memory. " + "This may lead to the array getting deallocated sooner " + "than expected, potentially leading to crashes.", + InconsistentOpenCLQueueWarning, stacklevel=2) + @property def ndim(self): return len(self.shape) @@ -714,7 +729,6 @@ def set(self, ary, queue=None, async_=None, **kwargs): raise RuntimeError("cannot set from non-contiguous array") if not _equal_strides(ary.strides, self.strides, self.shape): - from warnings import warn warn("Setting array from one with different " "strides/storage order. This will cease to work " "in 2013.x.", @@ -722,8 +736,9 @@ def set(self, ary, queue=None, async_=None, **kwargs): if self.size: event1 = cl.enqueue_copy(queue or self.queue, self.base_data, ary, - device_offset=self.offset, + dst_offset=self.offset, is_blocking=not async_) + self.add_event(event1) def _get(self, queue=None, ary=None, async_=None, **kwargs): @@ -756,7 +771,6 @@ def _get(self, queue=None, ary=None, async_=None, **kwargs): raise TypeError("'ary' has non-matching type") if self.shape != ary.shape: - from warnings import warn warn("get() between arrays of different shape is deprecated " "and will be removed in PyCUDA 2017.x", DeprecationWarning, stacklevel=2) @@ -772,8 +786,9 @@ def _get(self, queue=None, ary=None, async_=None, **kwargs): if self.size: event1 = cl.enqueue_copy(queue, ary, self.base_data, - device_offset=self.offset, + src_offset=self.offset, wait_for=self.events, is_blocking=not async_) + self.add_event(event1) else: event1 = None @@ -806,7 +821,6 @@ def get(self, queue=None, ary=None, async_=None, **kwargs): """ if async_: - from warnings import warn warn("calling pyopencl.Array.get with `async_=True` is deprecated. " "Please use pyopencl.Array.get_async for asynchronous " "device-to-host transfers", @@ -877,7 +891,6 @@ def __repr__(self): if result[:5] == "array": result = f"cl.{type(self).__name__}" + result[5:] else: - from warnings import warn warn(f"{type(result).__name__}.__repr__ was expected to return a " f"string starting with 'array', got '{result[:10]!r}'") @@ -1459,8 +1472,8 @@ def _zero_fill(self, queue=None, wait_for=None): # https://github.com/inducer/pyopencl/issues/395 if cl_version_gtr_1_2 and not (on_nvidia and self.nbytes >= 2**31): self.add_event( - cl.enqueue_fill_buffer(queue, self.base_data, np.int8(0), - self.offset, self.nbytes, wait_for=wait_for)) + cl.enqueue_fill(queue, self.base_data, np.int8(0), + self.nbytes, offset=self.offset, wait_for=wait_for)) else: zero = np.zeros((), self.dtype) self.fill(zero, queue=queue) @@ -2121,7 +2134,7 @@ def setitem(self, subscript, value, queue=None, wait_for=None): if subarray.shape == value.shape and subarray.strides == value.strides: self.add_event( cl.enqueue_copy(queue, subarray.base_data, - value, device_offset=subarray.offset, wait_for=wait_for)) + value, dst_offset=subarray.offset, wait_for=wait_for)) return else: value = to_device(queue, value, self.allocator) @@ -2675,7 +2688,6 @@ def concatenate(arrays, axis=0, queue=None, allocator=None): import builtins if builtins.any(type(ary) != type(arrays[0]) # noqa: E721 for ary in arrays[1:]): - from warnings import warn warn("Elements of 'arrays' not of the same type, returning " "an instance of the type of arrays[0]", stacklevel=2) @@ -2744,7 +2756,6 @@ def hstack(arrays, queue=None): import builtins if builtins.any(type(ary) != type(arrays[0]) # noqa: E721 for ary in arrays[1:]): - from warnings import warn warn("Elements of 'arrays' not of the same type, returning " "an instance of the type of arrays[0]", stacklevel=2) @@ -2804,7 +2815,6 @@ def stack(arrays, axis=0, queue=None): import builtins if builtins.any(type(ary) != type(arrays[0]) # noqa: E721 for ary in arrays[1:]): - from warnings import warn warn("Elements of 'arrays' not of the same type, returning " "an instance of the type of arrays[0]", stacklevel=2) diff --git a/pyopencl/tools.py b/pyopencl/tools.py index 27adac75b..fb4a91e14 100644 --- a/pyopencl/tools.py +++ b/pyopencl/tools.py @@ -1,4 +1,92 @@ -"""Various helpful bits and pieces without much of a common theme.""" +r""" +.. _memory-pools: + +Memory Pools +------------ + +Memory allocation (e.g. in the form of the :func:`pyopencl.Buffer` constructor) +can be expensive if used frequently. For example, code based on +:class:`pyopencl.array.Array` can easily run into this issue because a fresh +memory area is allocated for each intermediate result. Memory pools are a +remedy for this problem based on the observation that often many of the block +allocations are of the same sizes as previously used ones. + +Then, instead of fully returning the memory to the system and incurring the +associated reallocation overhead, the pool holds on to the memory and uses it +to satisfy future allocations of similarly-sized blocks. The pool reacts +appropriately to out-of-memory conditions as long as all memory allocations +are made through it. Allocations performed from outside of the pool may run +into spurious out-of-memory conditions due to the pool owning much or all of +the available memory. + +There are two flavors of allocators and memory pools: + +- :ref:`buf-mempool` +- :ref:`svm-mempool` + +Using :class:`pyopencl.array.Array`\ s can be used with memory pools in a +straightforward manner:: + + mem_pool = pyopencl.tools.MemoryPool(pyopencl.tools.ImmediateAllocator(queue)) + a_dev = cl_array.arange(queue, 2000, dtype=np.float32, allocator=mem_pool) + +Likewise, SVM-based allocators are directly usable with +:class:`pyopencl.array.Array`. + +.. _buf-mempool: + +:class:`~pyopencl.Buffer`-based Allocators and Memory Pools +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +.. autoclass:: PooledBuffer + +.. autoclass:: AllocatorBase + +.. autoclass:: DeferredAllocator + +.. autoclass:: ImmediateAllocator + +.. autoclass:: MemoryPool + +.. _svm-mempool: + +:ref:`SVM `-Based Allocators and Memory Pools +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +SVM functionality requires OpenCL 2.0. + +.. autoclass:: PooledSVM + +.. autoclass:: SVMAllocator + +.. autoclass:: SVMPool + +CL-Object-dependent Caching +--------------------------- + +.. autofunction:: first_arg_dependent_memoize +.. autofunction:: clear_first_arg_caches + +Testing +------- + +.. autofunction:: pytest_generate_tests_for_pyopencl + +Device Characterization +----------------------- + +.. automodule:: pyopencl.characterize + :members: + +Type aliases +------------ + +.. currentmodule:: pyopencl._cl + +.. class:: AllocatorBase + + See :class:`pyopencl.tools.AllocatorBase`. +""" __copyright__ = "Copyright (C) 2010 Andreas Kloeckner" @@ -33,7 +121,7 @@ import numpy as np from pytools import memoize, memoize_method -from pyopencl._cl import bitlog2 # noqa: F401 +from pyopencl._cl import bitlog2, get_cl_header_version # noqa: F401 from pytools.persistent_dict import KeyBuilder as KeyBuilderBase import re @@ -59,10 +147,293 @@ def _register_types(): # {{{ imported names from pyopencl._cl import ( # noqa - PooledBuffer as PooledBuffer, - _tools_DeferredAllocator as DeferredAllocator, - _tools_ImmediateAllocator as ImmediateAllocator, - MemoryPool as MemoryPool) + PooledBuffer, AllocatorBase, DeferredAllocator, + ImmediateAllocator, MemoryPool, + ) + + +if get_cl_header_version() >= (2, 0): + from pyopencl._cl import ( # noqa + SVMPool, + PooledSVM, + SVMAllocator, + ) + +# }}} + + +# {{{ monkeypatch docstrings into imported interfaces + +_MEMPOOL_IFACE_DOCS = """ +.. note:: + + The current implementation of the memory pool will retain allocated + memory after it is returned by the application and keep it in a bin + identified by the leading *leading_bits_in_bin_id* bits of the + allocation size. To ensure that allocations within each bin are + interchangeable, allocation sizes are rounded up to the largest size + that shares the leading bits of the requested allocation size. + + The current default value of *leading_bits_in_bin_id* is + four, but this may change in future versions and is not + guaranteed. + + *leading_bits_in_bin_id* must be passed by keyword, + and its role is purely advisory. It is not guaranteed + that future versions of the pool will use the + same allocation scheme and/or honor *leading_bits_in_bin_id*. + +.. attribute:: held_blocks + + The number of unused blocks being held by this pool. + +.. attribute:: active_blocks + + The number of blocks in active use that have been allocated + through this pool. + +.. attribute:: managed_bytes + + "Managed" memory is "active" and "held" memory. + + .. versionadded:: 2021.1.2 + +.. attribute:: active_bytes + + "Active" bytes are bytes under the control of the application. + This may be smaller than the actual allocated size reflected + in :attr:`managed_bytes`. + + .. versionadded:: 2021.1.2 + + +.. method:: free_held + + Free all unused memory that the pool is currently holding. + +.. method:: stop_holding + + Instruct the memory to start immediately freeing memory returned + to it, instead of holding it for future allocations. + Implicitly calls :meth:`free_held`. + This is useful as a cleanup action when a memory pool falls out + of use. +""" + + +def _monkeypatch_docstrings(): + + PooledBuffer.__doc__ = """ + An object representing a :class:`MemoryPool`-based allocation of + :class:`~pyopencl.Buffer`-style device memory. Analogous to + :class:`~pyopencl.Buffer`, however once this object is deleted, its + associated device memory is returned to the pool. + + Is a :class:`pyopencl.MemoryObject`. + """ + + AllocatorBase.__doc__ = """ + An interface implemented by various memory allocation functions + in :mod:`pyopencl`. + + .. automethod:: __call__ + + Allocate and return a :class:`pyopencl.Buffer` of the given *size*. + """ + + # {{{ DeferredAllocator + + DeferredAllocator.__doc__ = """ + *mem_flags* takes its values from :class:`pyopencl.mem_flags` and corresponds + to the *flags* argument of :class:`pyopencl.Buffer`. DeferredAllocator + has the same semantics as regular OpenCL buffer allocation, i.e. it may + promise memory to be available that may (in any call to a buffer-using + CL function) turn out to not exist later on. (Allocations in CL are + bound to contexts, not devices, and memory availability depends on which + device the buffer is used with.) + + Implements :class:`AllocatorBase`. + + .. versionchanged :: 2013.1 + + ``CLAllocator`` was deprecated and replaced + by :class:`DeferredAllocator`. + + .. method:: __init__(context, mem_flags=pyopencl.mem_flags.READ_WRITE) + + .. automethod:: __call__ + + Allocate a :class:`pyopencl.Buffer` of the given *size*. + + .. versionchanged :: 2020.2 + + The allocator will succeed even for allocations of size zero, + returning *None*. + """ + + # }}} + + # {{{ ImmediateAllocator + + ImmediateAllocator.__doc__ = """ + *mem_flags* takes its values from :class:`pyopencl.mem_flags` and corresponds + to the *flags* argument of :class:`pyopencl.Buffer`. + :class:`ImmediateAllocator` will attempt to ensure at allocation time that + allocated memory is actually available. If no memory is available, an + out-of-memory error is reported at allocation time. + + Implements :class:`AllocatorBase`. + + .. versionadded:: 2013.1 + + .. method:: __init__(queue, mem_flags=pyopencl.mem_flags.READ_WRITE) + + .. automethod:: __call__ + + Allocate a :class:`pyopencl.Buffer` of the given *size*. + + .. versionchanged :: 2020.2 + + The allocator will succeed even for allocations of size zero, + returning *None*. + """ + + # }}} + + # {{{ MemoryPool + + MemoryPool.__doc__ = """ + A memory pool for OpenCL device memory in :class:`pyopencl.Buffer` form. + *allocator* must be an instance of one of the above classes, and should be + an :class:`ImmediateAllocator`. The memory pool assumes that allocation + failures are reported by the allocator immediately, and not in the + OpenCL-typical deferred manner. + + Implements :class:`AllocatorBase`. + + .. versionchanged:: 2019.1 + + Current bin allocation behavior documented, *leading_bits_in_bin_id* + added. + + .. automethod:: __init__ + + .. automethod:: allocate + + Return a :class:`PooledBuffer` of the given *size*. + + .. automethod:: __call__ + + Synonym for :meth:`allocate` to match :class:`AllocatorBase`. + + .. versionadded:: 2011.2 + """ + _MEMPOOL_IFACE_DOCS + + # }}} + + +_monkeypatch_docstrings() + + +def _monkeypatch_svm_docstrings(): + # {{{ PooledSVM + + PooledSVM.__doc__ = """ + An object representing a :class:`SVMPool`-based allocation of + :ref:`svm`. Analogous to :class:`~pyopencl.SVMAllocation`, however once + this object is deleted, its associated device memory is returned to the + pool from which it came. + + .. versionadded:: 2022.2 + + .. note:: + + If the :class:`SVMAllocator` for the :class:`SVMPool` that allocated an + object of this type is associated with an (in-order) + :class:`~pyopencl.CommandQueue`, sufficient synchronization is provided + to ensure operations enqueued before deallocation complete before + operations from a different use (possibly in a different queue) are + permitted to start. This applies when :class:`release` is called and + also when the object is freed automatically by the garbage collector. + + Is a :class:`pyopencl.SVMPointer`. + + Supports structural equality and hashing. + + .. automethod:: release + + Return the held memory to the pool. See the note about synchronization + behavior during deallocation above. + + .. automethod:: enqueue_release + + Synonymous to :meth;`release`, for consistency with + :class:`~pyopencl.SVMAllocation`. Note that, unlike + :meth:`pyopencl.SVMAllocation.enqueue_release`, specifying a queue + or events to be waited for is not supported. + + .. automethod:: bind_to_queue + + Analogous to :meth:`pyopencl.SVMAllocation.bind_to_queue`. + + .. automethod:: unbind_from_queue + + Analogous to :meth:`pyopencl.SVMAllocation.unbind_from_queue`. + """ + + # }}} + + # {{{ SVMAllocator + + SVMAllocator.__doc__ = """ + .. versionadded:: 2022.2 + + .. automethod:: __init__ + + :arg flags: See :class:`~pyopencl.svm_mem_flags`. + :arg queue: If not specified, allocations will be freed + eagerly, irrespective of whether pending/enqueued operations + are still using the memory. + + If specified, deallocation of memory will be enqueued + with the given queue, and will only be performed + after previously-enqueue operations in the queue have + completed. + + It is an error to specify an out-of-order queue. + + .. warning:: + + Not specifying a queue will typically lead to undesired + behavior, including crashes and memory corruption. + See the warning in :ref:`svm`. + + .. automethod:: __call__ + + Return a :class:`~pyopencl.SVMAllocation` of the given *size*. + """ + + # }}} + + # {{{ SVMPool + + SVMPool.__doc__ = """ + A memory pool for OpenCL device memory in :ref:`SVM ` form. + *allocator* must be an instance of :class:`SVMAllocator`. + + .. versionadded:: 2022.2 + + .. automethod:: __init__ + .. automethod:: __call__ + + Return a :class:`PooledSVM` of the given *size*. + """ + _MEMPOOL_IFACE_DOCS + + # }}} + + +if get_cl_header_version() >= (2, 0): + _monkeypatch_svm_docstrings() # }}} @@ -310,6 +681,22 @@ def idfn(val): def pytest_generate_tests_for_pyopencl(metafunc): + """Using the line:: + + from pyopencl.tools import pytest_generate_tests_for_pyopencl + as pytest_generate_tests + + in your `pytest `_ test scripts allows you to use the + arguments *ctx_factory*, *device*, or *platform* in your test functions, + and they will automatically be run for each OpenCL device/platform in the + system, as appropriate. + + The following two environment variabls is also supported to control + device/platform choice:: + + PYOPENCL_TEST=0:0,1;intel=i5,i7 + """ + arg_names = get_pyopencl_fixture_arg_names(metafunc) if not arg_names: return @@ -605,7 +992,7 @@ def match_dtype_to_c_struct(device, name, dtype, context=None): the given *device* to ensure that :mod:`numpy` and C offsets and sizes match.) - .. versionadded: 2013.1 + .. versionadded:: 2013.1 This example explains the use of this function:: diff --git a/pyopencl/version.py b/pyopencl/version.py index d4fcac4f0..41bb1ecc8 100644 --- a/pyopencl/version.py +++ b/pyopencl/version.py @@ -1,3 +1,3 @@ -VERSION = (2022, 1, 6) +VERSION = (2022, 2) VERSION_STATUS = "" VERSION_TEXT = ".".join(str(x) for x in VERSION) + VERSION_STATUS diff --git a/src/mempool.hpp b/src/mempool.hpp index 44f0fd643..a0eca827e 100644 --- a/src/mempool.hpp +++ b/src/mempool.hpp @@ -102,7 +102,7 @@ namespace PYGPU_PACKAGE container_t m_container; typedef typename container_t::value_type bin_pair_t; - std::unique_ptr m_allocator; + std::shared_ptr m_allocator; // A held block is one that's been released by the application, but that // we are keeping around to dish out again. @@ -125,8 +125,8 @@ namespace PYGPU_PACKAGE unsigned m_leading_bits_in_bin_id; public: - memory_pool(Allocator const &alloc=Allocator(), unsigned leading_bits_in_bin_id=4) - : m_allocator(alloc.copy()), + memory_pool(std::shared_ptr alloc, unsigned leading_bits_in_bin_id=4) + : m_allocator(alloc), m_held_blocks(0), m_active_blocks(0), m_managed_bytes(0), m_active_bytes(0), m_stop_holding(false), @@ -233,7 +233,8 @@ namespace PYGPU_PACKAGE std::cout << "[pool] allocation of size " << size << " served from bin " << bin_nr << " which contained " << bin.size() << " entries" << std::endl; - return pop_block_from_bin(bin, size); + return m_allocator->hand_out_existing_block( + pop_block_from_bin(bin, size)); } size_type alloc_sz = alloc_size(bin_nr); @@ -256,7 +257,8 @@ namespace PYGPU_PACKAGE m_allocator->try_release_blocks(); if (bin.size()) - return pop_block_from_bin(bin, size); + return m_allocator->hand_out_existing_block( + pop_block_from_bin(bin, size)); if (m_trace) std::cout << "[pool] allocation still OOM after GC" << std::endl; @@ -282,7 +284,7 @@ namespace PYGPU_PACKAGE "failed to free memory for allocation"); } - void free(pointer_type p, size_type size) + void free(pointer_type &&p, size_type size) { --m_active_blocks; m_active_bytes -= size; @@ -291,7 +293,7 @@ namespace PYGPU_PACKAGE if (!m_stop_holding) { inc_held_blocks(); - get_bin(bin_nr).push_back(p); + get_bin(bin_nr).push_back(std::move(p)); if (m_trace) std::cout << "[pool] block of size " << size << " returned to bin " @@ -300,7 +302,7 @@ namespace PYGPU_PACKAGE } else { - m_allocator->free(p); + m_allocator->free(std::move(p)); m_managed_bytes -= alloc_size(bin_nr); } } @@ -313,7 +315,7 @@ namespace PYGPU_PACKAGE while (bin.size()) { - m_allocator->free(bin.back()); + m_allocator->free(std::move(bin.back())); m_managed_bytes -= alloc_size(bin_pair.first); bin.pop_back(); @@ -353,7 +355,7 @@ namespace PYGPU_PACKAGE if (bin.size()) { - m_allocator->free(bin.back()); + m_allocator->free(std::move(bin.back())); m_managed_bytes -= alloc_size(bin_pair.first); bin.pop_back(); @@ -379,7 +381,7 @@ namespace PYGPU_PACKAGE pointer_type pop_block_from_bin(bin_t &bin, size_type size) { - pointer_type result = bin.back(); + pointer_type result(std::move(bin.back())); bin.pop_back(); dec_held_blocks(); @@ -399,7 +401,7 @@ namespace PYGPU_PACKAGE typedef typename Pool::pointer_type pointer_type; typedef typename Pool::size_type size_type; - private: + protected: PYGPU_SHARED_PTR m_pool; pointer_type m_ptr; @@ -421,7 +423,7 @@ namespace PYGPU_PACKAGE { if (m_valid) { - m_pool->free(m_ptr, m_size); + m_pool->free(std::move(m_ptr), m_size); m_valid = false; } else @@ -435,16 +437,8 @@ namespace PYGPU_PACKAGE #endif ); } - - pointer_type ptr() const - { return m_ptr; } - - size_type size() const - { return m_size; } }; } - - #endif diff --git a/src/wrap_cl.hpp b/src/wrap_cl.hpp index 98964056f..413b8452b 100644 --- a/src/wrap_cl.hpp +++ b/src/wrap_cl.hpp @@ -137,10 +137,8 @@ // }}} +// {{{ macros and typedefs for wrappers - - -// {{{ tools #if PY_VERSION_HEX >= 0x02050000 typedef Py_ssize_t PYOPENCL_BUFFER_SIZE_T; #else @@ -229,8 +227,6 @@ } - - #define PYOPENCL_RETRY_IF_MEM_ERROR(OPERATION) \ { \ bool failed_with_mem_error = false; \ @@ -260,8 +256,20 @@ } \ } + +#define PYOPENCL_GET_SVM_SIZE(NAME) \ + size_t NAME##_size; \ + bool NAME##_has_size = false; \ + try \ + { \ + NAME##_size = NAME.size(); \ + NAME##_has_size = true; \ + } \ + catch (size_not_available) { } + // }}} + // {{{ tracing and error reporting #ifdef PYOPENCL_TRACE #define PYOPENCL_PRINT_CALL_TRACE(NAME) \ @@ -329,6 +337,7 @@ // }}} + // {{{ get_info helpers #define PYOPENCL_GET_OPAQUE_INFO(WHAT, FIRST_ARG, SECOND_ARG, CL_TYPE, TYPE) \ { \ @@ -383,6 +392,7 @@ // }}} + // {{{ event helpers -------------------------------------------------------------- #define PYOPENCL_PARSE_WAIT_FOR \ cl_uint num_events_in_wait_list = 0; \ @@ -424,7 +434,9 @@ // }}} + // {{{ equality testing + #define PYOPENCL_EQUALITY_TESTS(cls) \ bool operator==(cls const &other) const \ { return data() == other.data(); } \ @@ -432,8 +444,8 @@ { return data() != other.data(); } \ long hash() const \ { return (long) (intptr_t) data(); } -// }}} +// }}} namespace pyopencl @@ -496,6 +508,19 @@ namespace pyopencl // }}} + // {{{ utility functions + + inline bool is_queue_out_of_order(cl_command_queue queue) + { + cl_command_queue_properties param_value; + PYOPENCL_CALL_GUARDED(clGetCommandQueueInfo, + (queue, CL_QUEUE_PROPERTIES, sizeof(param_value), ¶m_value, 0)); + return param_value & CL_QUEUE_OUT_OF_ORDER_EXEC_MODE_ENABLE; + } + + // }}} + + // {{{ buffer interface helper @@ -1655,6 +1680,101 @@ namespace pyopencl // }}} + // {{{ command_queue_ref + + // In contrast to command_queue, command_queue_ref is "nullable", i.e. + // it is a RAII *optional* reference to a command queue. + + class command_queue_ref + { + private: + bool m_valid; + cl_command_queue m_queue; + + public: + command_queue_ref() + : m_valid(false) + {} + + command_queue_ref(cl_command_queue queue) + : m_valid(queue != nullptr), m_queue(queue) + { + // E.g. SVM allocations of size zero use a NULL queue. Tolerate that. + if (m_valid) + PYOPENCL_CALL_GUARDED(clRetainCommandQueue, (m_queue)); + } + + command_queue_ref(command_queue_ref &&src) noexcept + : m_valid(src.m_valid), m_queue(src.m_queue) + { + src.m_valid = false; + } + + command_queue_ref(const command_queue_ref &src) + : m_valid(src.m_valid), m_queue(src.m_queue) + { + // Note that there isn't anything per se wrong with this + // copy constructor, the refcounting is just potentially + // expensive. + // + // All code in current use moves these, it does not copy them, + // so this should never get called. + // + // Unfortunately, we can't delete this copy constructor, + // because we would like to return these from functions. + // This makes at least gcc require copy constructors, even + // if those are never called due to NRVO. + std::cerr << "COPYING A COMMAND_QUEUE_REF." << std::endl; + + if (m_valid) + PYOPENCL_CALL_GUARDED(clRetainCommandQueue, (m_queue)); + } + + command_queue_ref &operator=(const command_queue_ref &) = delete; + + ~command_queue_ref() + { + reset(); + } + + bool is_valid() const + { + return m_valid; + } + + cl_command_queue data() const + { + if (m_valid) + return m_queue; + else + throw error("command_queue_ref.data", CL_INVALID_VALUE, + "command_queue_ref is not valid"); + } + + void reset() + { + if (m_valid) + PYOPENCL_CALL_GUARDED_CLEANUP(clReleaseCommandQueue, (m_queue)); + m_valid = false; + } + + void set(cl_command_queue queue) + { + if (!queue) + throw error("command_queue_ref.set", CL_INVALID_VALUE, + "cannot set to NULL command queue"); + + if (m_valid) + PYOPENCL_CALL_GUARDED(clReleaseCommandQueue, (m_queue)); + m_queue = queue; + PYOPENCL_CALL_GUARDED(clRetainCommandQueue, (m_queue)); + m_valid = true; + } + }; + + // }}} + + // {{{ event/synchronization class event : noncopyable @@ -2046,6 +2166,9 @@ namespace pyopencl } py::object get_info(cl_mem_info param_name) const; + + virtual ~memory_object_holder() + { } }; @@ -2093,7 +2216,7 @@ namespace pyopencl m_valid = false; } - virtual ~memory_object() + ~memory_object() { if (m_valid) release(); @@ -2337,7 +2460,7 @@ namespace pyopencl command_queue &cq, memory_object_holder &mem, py::object buffer, - size_t device_offset, + size_t src_offset, py::object py_wait_for, bool is_blocking) { @@ -2361,7 +2484,7 @@ namespace pyopencl queue, mem.data(), PYOPENCL_CAST_BOOL(is_blocking), - device_offset, len, buf, + src_offset, len, buf, PYOPENCL_WAITLIST_ARGS, &evt )) ); @@ -2376,7 +2499,7 @@ namespace pyopencl command_queue &cq, memory_object_holder &mem, py::object buffer, - size_t device_offset, + size_t dst_offset, py::object py_wait_for, bool is_blocking) { @@ -2400,7 +2523,7 @@ namespace pyopencl queue, mem.data(), PYOPENCL_CAST_BOOL(is_blocking), - device_offset, len, buf, + dst_offset, len, buf, PYOPENCL_WAITLIST_ARGS, &evt )) ); @@ -3441,11 +3564,28 @@ namespace pyopencl // }}} - // {{{ svm - #if PYOPENCL_CL_VERSION >= 0x2000 - class svm_arg_wrapper + // {{{ svm pointer + + class size_not_available { }; + + class svm_pointer + { + public: + virtual void *svm_ptr() const = 0; + // may throw size_not_available + virtual size_t size() const = 0; + virtual ~svm_pointer() + { } + }; + + // }}} + + + // {{{ svm_arg_wrapper + + class svm_arg_wrapper : public svm_pointer { private: void *m_ptr; @@ -3468,7 +3608,7 @@ namespace pyopencl m_size = ward->m_buf.len; } - void *ptr() const + void *svm_ptr() const { return m_ptr; } @@ -3478,26 +3618,71 @@ namespace pyopencl } }; + // }}} + + + // {{{ svm_allocation - class svm_allocation : noncopyable + class svm_allocation : public svm_pointer { private: std::shared_ptr m_context; void *m_allocation; + size_t m_size; + command_queue_ref m_queue; + // FIXME Should maybe also allow keeping a list of events so that we can + // wait for users to finish in the case of out-of-order queues. public: - svm_allocation(std::shared_ptr const &ctx, size_t size, cl_uint alignment, cl_svm_mem_flags flags) - : m_context(ctx) + svm_allocation(std::shared_ptr const &ctx, size_t size, cl_uint alignment, + cl_svm_mem_flags flags, const command_queue *queue = nullptr) + : m_context(ctx), m_size(size) { - PYOPENCL_PRINT_CALL_TRACE("clSVMalloc"); - m_allocation = clSVMAlloc( - ctx->data(), - flags, size, alignment); + if (queue) + { + m_queue.set(queue->data()); + if (is_queue_out_of_order(m_queue.data())) + throw error("SVMAllocation.__init__", CL_INVALID_VALUE, + "supplying an out-of-order queue to SVMAllocation is invalid"); + } + + int try_count = 0; + while (try_count < 2) + { + PYOPENCL_PRINT_CALL_TRACE("clSVMalloc"); + m_allocation = clSVMAlloc( + ctx->data(), + flags, size, alignment); + if (m_allocation) + return; + + ++try_count; + run_python_gc(); + } if (!m_allocation) throw pyopencl::error("clSVMAlloc", CL_OUT_OF_RESOURCES); } + svm_allocation(std::shared_ptr const &ctx, void *allocation, size_t size, + const cl_command_queue queue) + : m_context(ctx), m_allocation(allocation), m_size(size) + { + if (queue) + { + if (is_queue_out_of_order(queue)) + { + release(); + throw error("SVMAllocation.__init__", CL_INVALID_VALUE, + "supplying an out-of-order queue to SVMAllocation is invalid"); + } + m_queue.set(queue); + } + } + + svm_allocation(const svm_allocation &) = delete; + svm_allocation &operator=(const svm_allocation &) = delete; + ~svm_allocation() { if (m_allocation) @@ -3510,36 +3695,62 @@ namespace pyopencl throw error("SVMAllocation.release", CL_INVALID_VALUE, "trying to double-unref svm allocation"); - clSVMFree(m_context->data(), m_allocation); + if (m_queue.is_valid()) + { + PYOPENCL_CALL_GUARDED_CLEANUP(clEnqueueSVMFree, ( + m_queue.data(), 1, &m_allocation, + nullptr, nullptr, + 0, nullptr, nullptr)); + m_queue.reset(); + } + else + { + PYOPENCL_PRINT_CALL_TRACE("clSVMFree"); + clSVMFree(m_context->data(), m_allocation); + } m_allocation = nullptr; } - void enqueue_release(command_queue &queue, py::object py_wait_for) + event *enqueue_release(command_queue *queue, py::object py_wait_for) { PYOPENCL_PARSE_WAIT_FOR; if (!m_allocation) - throw error("SVMAllocation.release", CL_INVALID_VALUE, - "trying to double-unref svm allocation"); + throw error("SVMAllocation.enqueue_release", CL_INVALID_VALUE, + "trying to enqueue_release on an already-freed allocation"); + + cl_command_queue use_queue; + if (queue) + use_queue = queue->data(); + else + { + if (m_queue.is_valid()) + use_queue = m_queue.data(); + else + throw error("SVMAllocation.enqueue_release", CL_INVALID_VALUE, + "no implicit queue available, must be provided explicitly"); + } cl_event evt; PYOPENCL_CALL_GUARDED_CLEANUP(clEnqueueSVMFree, ( - queue.data(), 1, &m_allocation, + use_queue, 1, &m_allocation, nullptr, nullptr, PYOPENCL_WAITLIST_ARGS, &evt)); m_allocation = nullptr; + + PYOPENCL_RETURN_NEW_EVENT(evt); } - void *ptr() const + void *svm_ptr() const { return m_allocation; } - intptr_t ptr_as_int() const + size_t size() const { - return (intptr_t) m_allocation; + return m_size; } bool operator==(svm_allocation const &other) const @@ -3551,22 +3762,108 @@ namespace pyopencl { return m_allocation != other.m_allocation; } + + void bind_to_queue(command_queue const &queue) + { + if (is_queue_out_of_order(queue.data())) + throw error("SVMAllocation.bind_to_queue", CL_INVALID_VALUE, + "supplying an out-of-order queue to SVMAllocation is invalid"); + + if (m_queue.is_valid()) + { + if (m_queue.data() != queue.data()) + { + // make sure synchronization promises stay valid in new queue + cl_event evt; + + PYOPENCL_CALL_GUARDED(clEnqueueMarker, (m_queue.data(), &evt)); + PYOPENCL_CALL_GUARDED(clEnqueueMarkerWithWaitList, + (queue.data(), 1, &evt, nullptr)); + } + } + + m_queue.set(queue.data()); + } + + void unbind_from_queue() + { + if (m_queue.is_valid()) + PYOPENCL_CALL_GUARDED_THREADED(clFinish, (m_queue.data())); + + m_queue.reset(); + } + + // only use for testing/diagnostic/debugging purposes! + cl_command_queue queue() const + { + if (m_queue.is_valid()) + return m_queue.data(); + else + return nullptr; + } }; + // }}} + + + // {{{ svm operations inline event *enqueue_svm_memcpy( command_queue &cq, cl_bool is_blocking, - svm_arg_wrapper &dst, svm_arg_wrapper &src, - py::object py_wait_for + svm_pointer &dst, svm_pointer &src, + py::object py_wait_for, + py::object byte_count_py ) { PYOPENCL_PARSE_WAIT_FOR; - if (src.size() != dst.size()) + // {{{ process size + + PYOPENCL_GET_SVM_SIZE(src); + PYOPENCL_GET_SVM_SIZE(dst); + + size_t size; + bool have_size = false; + + if (src_has_size) + { + size = src_size; + have_size = true; + } + if (dst_has_size) + { + if (have_size) + { + if (!byte_count_py.is_none()) + size = std::min(size, dst_size); + else if (size != dst_size) + throw error("_enqueue_svm_memcpy", CL_INVALID_VALUE, + "sizes of source and destination buffer do not match"); + } + else + { + size = dst_size; + have_size = true; + } + } + + if (!byte_count_py.is_none()) + { + size_t byte_count = byte_count_py.cast(); + if (have_size && byte_count > size) + throw error("_enqueue_svm_memcpy", CL_INVALID_VALUE, + "specified byte_count larger than size of source or destination buffers"); + size = byte_count; + have_size = true; + } + + if (!have_size) throw error("_enqueue_svm_memcpy", CL_INVALID_VALUE, - "sizes of source and destination buffer do not match"); + "size not passed and could not be determined"); + + // }}} cl_event evt; PYOPENCL_CALL_GUARDED( @@ -3574,8 +3871,8 @@ namespace pyopencl ( cq.data(), is_blocking, - dst.ptr(), src.ptr(), - dst.size(), + dst.svm_ptr(), src.svm_ptr(), + size, PYOPENCL_WAITLIST_ARGS, &evt )); @@ -3587,7 +3884,7 @@ namespace pyopencl inline event *enqueue_svm_memfill( command_queue &cq, - svm_arg_wrapper &dst, py::object py_pattern, + svm_pointer &dst, py::object py_pattern, py::object byte_count, py::object py_wait_for ) @@ -3604,18 +3901,41 @@ namespace pyopencl pattern_ptr = pattern_ward->m_buf.buf; pattern_len = pattern_ward->m_buf.len; - size_t fill_size = dst.size(); + // {{{ process size + + PYOPENCL_GET_SVM_SIZE(dst); + + size_t size; + bool have_size = false; + if (dst_has_size) + { + size = dst_size; + have_size = true; + } if (!byte_count.is_none()) - fill_size = py::cast(byte_count); + { + size_t user_size = py::cast(byte_count); + if (have_size && user_size > size) + throw error("enqueue_svm_memfill", CL_INVALID_VALUE, + "byte_count too large for specified SVM buffer"); + } + + if (!have_size) + { + throw error("enqueue_svm_memfill", CL_INVALID_VALUE, + "byte_count not passed and could not be determined"); + } + + // }}} cl_event evt; PYOPENCL_CALL_GUARDED( clEnqueueSVMMemFill, ( cq.data(), - dst.ptr(), pattern_ptr, + dst.svm_ptr(), pattern_ptr, pattern_len, - fill_size, + size, PYOPENCL_WAITLIST_ARGS, &evt )); @@ -3629,12 +3949,40 @@ namespace pyopencl command_queue &cq, cl_bool is_blocking, cl_map_flags flags, - svm_arg_wrapper &svm, - py::object py_wait_for + svm_pointer &svm, + py::object py_wait_for, + py::object user_size_py ) { PYOPENCL_PARSE_WAIT_FOR; + // {{{ process size + + PYOPENCL_GET_SVM_SIZE(svm); + + size_t size; + bool have_size = false; + if (svm_has_size) + { + size = svm_size; + have_size = true; + } + if (!user_size_py.is_none()) + { + size_t user_size = py::cast(user_size_py); + if (have_size && user_size > size) + throw error("enqueue_svm_memfill", CL_INVALID_VALUE, + "user-provided size too large for specified SVM buffer"); + } + + if (!have_size) + { + throw error("enqueue_svm_mem_map", CL_INVALID_VALUE, + "size not passed and could not be determined"); + } + + // }}} + cl_event evt; PYOPENCL_CALL_GUARDED( clEnqueueSVMMap, @@ -3642,7 +3990,7 @@ namespace pyopencl cq.data(), is_blocking, flags, - svm.ptr(), svm.size(), + svm.svm_ptr(), size, PYOPENCL_WAITLIST_ARGS, &evt )); @@ -3654,7 +4002,7 @@ namespace pyopencl inline event *enqueue_svm_unmap( command_queue &cq, - svm_arg_wrapper &svm, + svm_pointer &svm, py::object py_wait_for ) { @@ -3665,7 +4013,7 @@ namespace pyopencl clEnqueueSVMUnmap, ( cq.data(), - svm.ptr(), + svm.svm_ptr(), PYOPENCL_WAITLIST_ARGS, &evt )); @@ -3691,9 +4039,9 @@ namespace pyopencl for (py::handle py_svm: svms) { - svm_arg_wrapper &svm(py::cast(py_svm)); + svm_pointer &svm(py::cast(py_svm)); - svm_pointers.push_back(svm.ptr()); + svm_pointers.push_back(svm.svm_ptr()); sizes.push_back(svm.size()); } @@ -4336,16 +4684,18 @@ namespace pyopencl { private: cl_kernel m_kernel; + bool m_set_arg_prefer_svm; public: kernel(cl_kernel knl, bool retain) - : m_kernel(knl) + : m_kernel(knl), m_set_arg_prefer_svm(false) { if (retain) PYOPENCL_CALL_GUARDED(clRetainKernel, (knl)); } kernel(program const &prg, std::string const &kernel_name) + : m_set_arg_prefer_svm(false) { cl_int status_code; @@ -4486,10 +4836,10 @@ namespace pyopencl } #if PYOPENCL_CL_VERSION >= 0x2000 - void set_arg_svm(cl_uint arg_index, svm_arg_wrapper const &wrp) + void set_arg_svm(cl_uint arg_index, svm_pointer const &wrp) { PYOPENCL_CALL_GUARDED(clSetKernelArgSVMPointer, - (m_kernel, arg_index, wrp.ptr())); + (m_kernel, arg_index, wrp.svm_ptr())); } #endif @@ -4501,21 +4851,47 @@ namespace pyopencl return; } - try + // It turns out that a taken 'catch' has a relatively high cost, so + // in deciding which of "mem object" and "svm" to try first, we use + // whatever we were given last time around. + if (m_set_arg_prefer_svm) { - set_arg_mem(arg_index, arg.cast()); - return; +#if PYOPENCL_CL_VERSION >= 0x2000 + try + { + set_arg_svm(arg_index, arg.cast()); + return; + } + catch (py::cast_error &) { } +#endif + + try + { + set_arg_mem(arg_index, arg.cast()); + m_set_arg_prefer_svm = false; + return; + } + catch (py::cast_error &) { } } - catch (py::cast_error &) { } + else + { + try + { + set_arg_mem(arg_index, arg.cast()); + return; + } + catch (py::cast_error &) { } #if PYOPENCL_CL_VERSION >= 0x2000 - try - { - set_arg_svm(arg_index, arg.cast()); - return; - } - catch (py::cast_error &) { } + try + { + set_arg_svm(arg_index, arg.cast()); + m_set_arg_prefer_svm = true; + return; + } + catch (py::cast_error &) { } #endif + } try { diff --git a/src/wrap_cl_part_1.cpp b/src/wrap_cl_part_1.cpp index 8d62ef5d1..3b9f79eda 100644 --- a/src/wrap_cl_part_1.cpp +++ b/src/wrap_cl_part_1.cpp @@ -282,11 +282,12 @@ void pyopencl_expose_part_1(py::module &m) // {{{ transfers // {{{ byte-for-byte + m.def("_enqueue_read_buffer", enqueue_read_buffer, py::arg("queue"), py::arg("mem"), py::arg("hostbuf"), - py::arg("device_offset")=0, + py::arg("src_offset")=0, py::arg("wait_for")=py::none(), py::arg("is_blocking")=true ); @@ -294,7 +295,7 @@ void pyopencl_expose_part_1(py::module &m) py::arg("queue"), py::arg("mem"), py::arg("hostbuf"), - py::arg("device_offset")=0, + py::arg("dst_offset")=0, py::arg("wait_for")=py::none(), py::arg("is_blocking")=true ); diff --git a/src/wrap_cl_part_2.cpp b/src/wrap_cl_part_2.cpp index 0c9a0d1b1..80560bd76 100644 --- a/src/wrap_cl_part_2.cpp +++ b/src/wrap_cl_part_2.cpp @@ -24,6 +24,7 @@ // OTHER DEALINGS IN THE SOFTWARE. +#include #define NO_IMPORT_ARRAY #define PY_ARRAY_UNIQUE_SYMBOL pyopencl_ARRAY_API @@ -64,6 +65,22 @@ namespace pyopencl { } #endif + +#if PYOPENCL_CL_VERSION >= 0x2000 + class svm_pointer_as_buffer + { + private: + svm_pointer &m_ptr; + + public: + svm_pointer_as_buffer(svm_pointer &ptr) + : m_ptr(ptr) + { } + + svm_pointer &ptr() const + { return m_ptr; } + }; +#endif } @@ -292,37 +309,130 @@ void pyopencl_expose_part_2(py::module &m) // }}} - // {{{ svm + // {{{ svm_pointer #if PYOPENCL_CL_VERSION >= 0x2000 + { + typedef svm_pointer cls; + py::class_(m, "SVMPointer", py::dynamic_attr()) + // For consistency, it may seem appropriate to use int_ptr here, but + // that would work on both buffers and SVM, and passing a buffer pointer to + // a kernel is going to lead to a bad time. + .def_property_readonly("svm_ptr", + [](cls &self) { return (intptr_t) self.svm_ptr(); }) + .def_property_readonly("size", [](cls &self) -> py::object + { + try + { + return py::cast(self.size()); + } + catch (size_not_available) + { + return py::none(); + } + }) + .def_property_readonly("buf", [](cls &self) -> svm_pointer_as_buffer * { + return new svm_pointer_as_buffer(self); + }, py::return_value_policy::reference_internal) + ; + } + + { + typedef svm_pointer_as_buffer cls; + py::class_(m, "_SVMPointerAsBuffer", pybind11::buffer_protocol()) + .def_buffer([](cls &self) -> pybind11::buffer_info + { + size_t size; + try + { + size = self.ptr().size(); + } + catch (size_not_available) + { + throw pyopencl::error("SVMPointer buffer protocol", CL_INVALID_VALUE, + "size of SVM is not known"); + } + return pybind11::buffer_info( + // Pointer to buffer + self.ptr().svm_ptr(), + // Size of one scalar + sizeof(unsigned char), + // Python struct-style format descriptor + pybind11::format_descriptor::format(), + // Number of dimensions + 1, + // Buffer dimensions + { size }, + // Strides (in bytes) for each index + { sizeof(unsigned char) } + ); + }) + ; + } + + // }}} + + // {{{ svm_arg_wrapper + { typedef svm_arg_wrapper cls; - py::class_(m, "SVM", py::dynamic_attr()) + py::class_(m, "SVM", py::dynamic_attr()) .def(py::init()) ; } + // }}} + + // {{{ svm_allocation + { typedef svm_allocation cls; - py::class_(m, "SVMAllocation", py::dynamic_attr()) - .def(py::init, size_t, cl_uint, cl_svm_mem_flags>()) + py::class_(m, "SVMAllocation", py::dynamic_attr()) + .def(py::init, size_t, cl_uint, cl_svm_mem_flags, const command_queue *>(), + py::arg("context"), + py::arg("size"), + py::arg("alignment"), + py::arg("flags"), + py::arg("queue").none(true)=py::none() + ) .DEF_SIMPLE_METHOD(release) .def("enqueue_release", &cls::enqueue_release, ":returns: a :class:`pyopencl.Event`\n\n" - "|std-enqueue-blurb|") - .def("_ptr_as_int", &cls::ptr_as_int) + "|std-enqueue-blurb|", + py::arg("queue").none(true)=py::none(), + py::arg("wait_for").none(true)=py::none() + ) .def(py::self == py::self) .def(py::self != py::self) - .def("__hash__", &cls::ptr_as_int) + .def("__hash__", [](cls &self) { return (intptr_t) self.svm_ptr(); }) + .def("bind_to_queue", &cls::bind_to_queue, + py::arg("queue")) + .DEF_SIMPLE_METHOD(unbind_from_queue) + + // only for diagnostic/debugging/testing purposes! + .def_property_readonly("_queue", + [](cls const &self) -> py::object + { + cl_command_queue queue = self.queue(); + if (queue) + return py::cast(new command_queue(queue, true)); + else + return py::none(); + }) ; } + // }}} + + // {{{ svm operations + m.def("_enqueue_svm_memcpy", enqueue_svm_memcpy, py::arg("queue"), py::arg("is_blocking"), py::arg("dst"), py::arg("src"), - py::arg("wait_for")=py::none() + py::arg("wait_for")=py::none(), + py::arg("byte_count")=py::none() ); m.def("_enqueue_svm_memfill", enqueue_svm_memfill, @@ -338,7 +448,8 @@ void pyopencl_expose_part_2(py::module &m) py::arg("is_blocking"), py::arg("flags"), py::arg("svm"), - py::arg("wait_for")=py::none() + py::arg("wait_for")=py::none(), + py::arg("size")=py::none() ); m.def("_enqueue_svm_unmap", enqueue_svm_unmap, diff --git a/src/wrap_mempool.cpp b/src/wrap_mempool.cpp index f40832bbf..36f6b4106 100644 --- a/src/wrap_mempool.cpp +++ b/src/wrap_mempool.cpp @@ -40,44 +40,53 @@ -namespace -{ +namespace pyopencl { + // {{{ test_allocator + class test_allocator { public: typedef void *pointer_type; typedef size_t size_type; - virtual test_allocator *copy() const + bool is_deferred() const { - return new test_allocator(); + return false; } - virtual bool is_deferred() const + pointer_type allocate(size_type s) { - return false; + return nullptr; } - virtual pointer_type allocate(size_type s) + + pointer_type hand_out_existing_block(pointer_type &&p) { - return nullptr; + return p; } - void free(pointer_type p) + ~test_allocator() + { } + + void free(pointer_type &&p) { } void try_release_blocks() { } }; + // }}} + + + // {{{ buffer allocators - class cl_allocator_base + class buffer_allocator_base { protected: std::shared_ptr m_context; cl_mem_flags m_flags; public: - cl_allocator_base(std::shared_ptr const &ctx, + buffer_allocator_base(std::shared_ptr const &ctx, cl_mem_flags flags=CL_MEM_READ_WRITE) : m_context(ctx), m_flags(flags) { @@ -86,21 +95,25 @@ namespace "cannot specify USE_HOST_PTR or COPY_HOST_PTR flags"); } - cl_allocator_base(cl_allocator_base const &src) + buffer_allocator_base(buffer_allocator_base const &src) : m_context(src.m_context), m_flags(src.m_flags) { } - virtual ~cl_allocator_base() + virtual ~buffer_allocator_base() { } typedef cl_mem pointer_type; typedef size_t size_type; - virtual cl_allocator_base *copy() const = 0; virtual bool is_deferred() const = 0; virtual pointer_type allocate(size_type s) = 0; - void free(pointer_type p) + pointer_type hand_out_existing_block(pointer_type &&p) + { + return p; + } + + void free(pointer_type &&p) { PYOPENCL_CALL_GUARDED(clReleaseMemObject, (p)); } @@ -111,22 +124,18 @@ namespace } }; - class cl_deferred_allocator : public cl_allocator_base + + class deferred_buffer_allocator : public buffer_allocator_base { private: - typedef cl_allocator_base super; + typedef buffer_allocator_base super; public: - cl_deferred_allocator(std::shared_ptr const &ctx, + deferred_buffer_allocator(std::shared_ptr const &ctx, cl_mem_flags flags=CL_MEM_READ_WRITE) : super(ctx, flags) { } - cl_allocator_base *copy() const - { - return new cl_deferred_allocator(*this); - } - bool is_deferred() const { return true; } @@ -139,30 +148,24 @@ namespace } }; - const unsigned zero = 0; - class cl_immediate_allocator : public cl_allocator_base + class immediate_buffer_allocator : public buffer_allocator_base { private: - typedef cl_allocator_base super; + typedef buffer_allocator_base super; pyopencl::command_queue m_queue; public: - cl_immediate_allocator(pyopencl::command_queue &queue, + immediate_buffer_allocator(pyopencl::command_queue &queue, cl_mem_flags flags=CL_MEM_READ_WRITE) : super(std::shared_ptr(queue.get_context()), flags), m_queue(queue.data(), /*retain*/ true) { } - cl_immediate_allocator(cl_immediate_allocator const &src) + immediate_buffer_allocator(immediate_buffer_allocator const &src) : super(src), m_queue(src.m_queue) { } - cl_allocator_base *copy() const - { - return new cl_immediate_allocator(*this); - } - bool is_deferred() const { return false; } @@ -210,13 +213,47 @@ namespace } }; + // }}} + // {{{ pooled_buffer + + class pooled_buffer + : public pyopencl::pooled_allocation >, + public pyopencl::memory_object_holder + { + private: + typedef + pyopencl::pooled_allocation > + super; + + public: + pooled_buffer( + std::shared_ptr p, super::size_type s) + : super(p, s) + { } + + virtual ~pooled_buffer() + { } + + const super::pointer_type data() const + { return m_ptr; } + + size_t size() const + { + return m_size; + } + }; + + // }}} + + + // {{{ allocate_from_buffer_allocator inline - pyopencl::buffer *allocator_call(cl_allocator_base &alloc, size_t size) + buffer *allocate_from_buffer_allocator(buffer_allocator_base &alloc, size_t size) { - cl_mem mem; + cl_mem mem = nullptr; int try_count = 0; while (try_count < 2) { @@ -256,41 +293,264 @@ namespace } } + // }}} + // {{{ allocate_from_buffer_pool - class pooled_buffer - : public pyopencl::pooled_allocation >, - public pyopencl::memory_object_holder + pooled_buffer *allocate_from_buffer_pool( + std::shared_ptr > pool, + memory_pool::size_type sz) + { + return new pooled_buffer(pool, sz); + } + + // }}} + + +#if PYOPENCL_CL_VERSION >= 0x2000 + + struct svm_held_pointer + { + void *ptr; + pyopencl::command_queue_ref queue; + }; + + + // {{{ svm allocator + + class svm_allocator + { + public: + typedef svm_held_pointer pointer_type; + typedef size_t size_type; + + protected: + std::shared_ptr m_context; + cl_uint m_alignment; + cl_svm_mem_flags m_flags; + pyopencl::command_queue_ref m_queue; + + public: + svm_allocator(std::shared_ptr const &ctx, + cl_uint alignment=0, cl_svm_mem_flags flags=CL_MEM_READ_WRITE, + pyopencl::command_queue *queue=nullptr) + : m_context(ctx), m_alignment(alignment), m_flags(flags) + { + if (queue) + m_queue.set(queue->data()); + } + + svm_allocator(svm_allocator const &src) + : m_context(src.m_context), m_alignment(src.m_alignment), + m_flags(src.m_flags) + { } + + ~svm_allocator() + { } + + bool is_deferred() const + { + // According to experiments with the Nvidia implementation (and based + // on my reading of the CL spec), clSVMalloc will return an error + // immedaitely upon being out of memory. Therefore the + // immediate/deferred split on the buffer side is not needed here. + // -AK, 2022-09-07 + + return false; + } + + std::shared_ptr context() const + { + return m_context; + } + + pointer_type allocate(size_type size) + { + if (size == 0) + return { nullptr, nullptr }; + + PYOPENCL_PRINT_CALL_TRACE("clSVMalloc"); + return { + clSVMAlloc(m_context->data(), m_flags, size, m_alignment), + pyopencl::command_queue_ref(m_queue.is_valid() ? m_queue.data() : nullptr) + }; + } + + pointer_type hand_out_existing_block(pointer_type &&p) + { + if (m_queue.is_valid()) + { + if (p.queue.is_valid()) + { + if (p.queue.data() != m_queue.data()) + { + // make sure synchronization promises stay valid in new queue + cl_event evt; + + PYOPENCL_CALL_GUARDED(clEnqueueMarker, (p.queue.data(), &evt)); + PYOPENCL_CALL_GUARDED(clEnqueueMarkerWithWaitList, + (m_queue.data(), 1, &evt, nullptr)); + } + } + p.queue.set(m_queue.data()); + } + else + { + if (p.queue.is_valid()) + { + PYOPENCL_CALL_GUARDED_THREADED(clFinish, (p.queue.data())); + p.queue.reset(); + } + } + + return std::move(p); + } + + void free(pointer_type &&p) + { + if (p.queue.is_valid()) + { + PYOPENCL_CALL_GUARDED_CLEANUP(clEnqueueSVMFree, ( + p.queue.data(), 1, &p.ptr, + nullptr, nullptr, + 0, nullptr, nullptr)); + p.queue.reset(); + } + else + { + PYOPENCL_PRINT_CALL_TRACE("clSVMFree"); + clSVMFree(m_context->data(), p.ptr); + } + } + + void try_release_blocks() + { + pyopencl::run_python_gc(); + } + }; + + // }}} + + + // {{{ pooled_svm + + class pooled_svm + : public pyopencl::pooled_allocation>, + public pyopencl::svm_pointer { private: typedef - pyopencl::pooled_allocation > + pyopencl::pooled_allocation> super; public: - pooled_buffer( + pooled_svm( std::shared_ptr p, super::size_type s) : super(p, s) { } - const super::pointer_type data() const - { return ptr(); } + virtual ~pooled_svm() + { } + + void *svm_ptr() const + { return m_ptr.ptr; } + + size_t size() const + { return m_size; } + + void bind_to_queue(pyopencl::command_queue const &queue) + { + if (pyopencl::is_queue_out_of_order(queue.data())) + throw pyopencl::error("PooledSVM.bind_to_queue", CL_INVALID_VALUE, + "supplying an out-of-order queue to SVMAllocation is invalid"); + + if (m_ptr.queue.is_valid()) + { + if (m_ptr.queue.data() != queue.data()) + { + // make sure synchronization promises stay valid in new queue + cl_event evt; + + PYOPENCL_CALL_GUARDED(clEnqueueMarker, (m_ptr.queue.data(), &evt)); + PYOPENCL_CALL_GUARDED(clEnqueueMarkerWithWaitList, + (queue.data(), 1, &evt, nullptr)); + } + } + + m_ptr.queue.set(queue.data()); + } + + void unbind_from_queue() + { + if (m_ptr.queue.is_valid()) + PYOPENCL_CALL_GUARDED_THREADED(clFinish, (m_ptr.queue.data())); + + m_ptr.queue.reset(); + } + + // only use for testing/diagnostic/debugging purposes! + cl_command_queue queue() const + { + if (m_ptr.queue.is_valid()) + return m_ptr.queue.data(); + else + return nullptr; + } }; + // }}} + // {{{ svm_allocator_call - pooled_buffer *device_pool_allocate( - std::shared_ptr > pool, - pyopencl::memory_pool::size_type sz) + inline + pyopencl::svm_allocation *svm_allocator_call(svm_allocator &alloc, size_t size) { - return new pooled_buffer(pool, sz); + int try_count = 0; + while (true) + { + try + { + svm_held_pointer mem(alloc.allocate(size)); + if (mem.queue.is_valid()) + return new pyopencl::svm_allocation( + alloc.context(), mem.ptr, size, mem.queue.data()); + else + return new pyopencl::svm_allocation( + alloc.context(), mem.ptr, size, nullptr); + } + catch (pyopencl::error &e) + { + if (!e.is_out_of_memory()) + throw; + if (++try_count == 2) + throw; + } + + alloc.try_release_blocks(); + } + } + + // }}} + + + // {{{ allocate_from_svm_ppol + + pooled_svm *allocate_from_svm_ppol( + std::shared_ptr > pool, + pyopencl::memory_pool::size_type sz) + { + return new pooled_svm(pool, sz); } + // }}} +#endif +} +namespace { template void expose_memory_pool(Wrapper &wrapper) { @@ -304,6 +564,9 @@ namespace .DEF_SIMPLE_METHOD(alloc_size) .DEF_SIMPLE_METHOD(free_held) .DEF_SIMPLE_METHOD(stop_holding) + + // undoc for now + .def("_set_trace", &cls::set_trace) ; } } @@ -316,22 +579,24 @@ void pyopencl_expose_mempool(py::module &m) m.def("bitlog2", pyopencl::bitlog2); { - typedef cl_allocator_base cls; - py::class_ wrapper( - m, "_tools_AllocatorBase"/*, py::no_init */); + typedef pyopencl::buffer_allocator_base cls; + py::class_> wrapper(m, "AllocatorBase"); wrapper - .def("__call__", allocator_call) + .def("__call__", pyopencl::allocate_from_buffer_allocator, py::arg("size")) ; } { - typedef pyopencl::memory_pool cls; + typedef pyopencl::memory_pool cls; py::class_> wrapper( m, "_TestMemoryPool"); wrapper .def(py::init([](unsigned leading_bits_in_bin_id) - { return new cls(test_allocator(), leading_bits_in_bin_id); }), + { return new cls( + std::shared_ptr( + new pyopencl::test_allocator()), + leading_bits_in_bin_id); }), py::arg("leading_bits_in_bin_id")=4 ) .def("allocate", [](std::shared_ptr pool, cls::size_type sz) @@ -345,9 +610,9 @@ void pyopencl_expose_mempool(py::module &m) } { - typedef cl_deferred_allocator cls; - py::class_ wrapper( - m, "_tools_DeferredAllocator"); + typedef pyopencl::deferred_buffer_allocator cls; + py::class_> wrapper( + m, "DeferredAllocator"); wrapper .def(py::init< std::shared_ptr const &>()) @@ -359,9 +624,9 @@ void pyopencl_expose_mempool(py::module &m) } { - typedef cl_immediate_allocator cls; - py::class_ wrapper( - m, "_tools_ImmediateAllocator"); + typedef pyopencl::immediate_buffer_allocator cls; + py::class_> wrapper( + m, "ImmediateAllocator"); wrapper .def(py::init()) .def(py::init(), @@ -370,31 +635,88 @@ void pyopencl_expose_mempool(py::module &m) } { - typedef pyopencl::memory_pool cls; + typedef pyopencl::pooled_buffer cls; + py::class_(m, "PooledBuffer") + .def("release", &cls::free) + + .def("bind_to_queue", [](cls &self, pyopencl::command_queue &queue) { /* no-op */ }) + .def("unbind_from_queue", [](cls &self) { /* no-op */ }) + ; + } + + { + typedef pyopencl::memory_pool cls; - py::class_< - cls, /* boost::noncopyable, */ - std::shared_ptr> wrapper( m, "MemoryPool"); + py::class_> wrapper( m, "MemoryPool"); wrapper - .def(py::init(), + .def(py::init, unsigned>(), py::arg("allocator"), py::arg("leading_bits_in_bin_id")=4 ) - .def("allocate", device_pool_allocate) - .def("__call__", device_pool_allocate) - // undoc for now - .DEF_SIMPLE_METHOD(set_trace) + .def("allocate", pyopencl::allocate_from_buffer_pool, py::arg("size")) + .def("__call__", pyopencl::allocate_from_buffer_pool, py::arg("size")) ; expose_memory_pool(wrapper); } +#if PYOPENCL_CL_VERSION >= 0x2000 + { + typedef pyopencl::svm_allocator cls; + py::class_> wrapper(m, "SVMAllocator"); + wrapper + .def(py::init const &, cl_uint, cl_uint, pyopencl::command_queue *>(), + py::arg("context"), + py::kw_only(), + py::arg("alignment")=0, + py::arg("flags")=CL_MEM_READ_WRITE, + py::arg("queue").none(true)=nullptr + ) + .def("__call__", pyopencl::svm_allocator_call, py::arg("size")) + ; + } + { - typedef pooled_buffer cls; - py::class_( - m, "PooledBuffer"/* , py::no_init */) + typedef pyopencl::pooled_svm cls; + py::class_(m, "PooledSVM") .def("release", &cls::free) + .def("enqueue_release", &cls::free) + .def("__eq__", [](const cls &self, const cls &other) + { return self.svm_ptr() == other.svm_ptr(); }) + .def("__hash__", [](cls &self) { return (intptr_t) self.svm_ptr(); }) + .DEF_SIMPLE_METHOD(bind_to_queue) + .DEF_SIMPLE_METHOD(unbind_from_queue) + + // only for diagnostic/debugging/testing purposes! + .def_property_readonly("_queue", + [](cls const &self) -> py::object + { + cl_command_queue queue = self.queue(); + if (queue) + return py::cast(new pyopencl::command_queue(queue, true)); + else + return py::none(); + }) + ; + } + + { + typedef pyopencl::memory_pool cls; + + py::class_> wrapper( m, "SVMPool"); + wrapper + .def(py::init, unsigned>(), + py::arg("allocator"), + py::kw_only(), + py::arg("leading_bits_in_bin_id")=4 + ) + .def("__call__", pyopencl::allocate_from_svm_ppol, py::arg("size")) ; + + expose_memory_pool(wrapper); } + +#endif } + +// vim: foldmethod=marker diff --git a/test/test_array.py b/test/test_array.py index ffb0714c8..ffec2f4d2 100644 --- a/test/test_array.py +++ b/test/test_array.py @@ -2180,6 +2180,64 @@ def test_dtype_conversions(ctx_factory): # }}} +# {{{ test_svm_mem_pool_with_arrays + +@pytest.mark.parametrize("use_mempool", [False, True]) +def test_arrays_with_svm_allocators(ctx_factory, use_mempool): + context = ctx_factory() + queue = cl.CommandQueue(context) + queue2 = cl.CommandQueue(context) + + from pyopencl.characterize import has_coarse_grain_buffer_svm + has_cg_svm = has_coarse_grain_buffer_svm(queue.device) + + if not has_cg_svm: + pytest.skip("Need coarse-grained SVM support for this test.") + + alloc = cl_tools.SVMAllocator(context, queue=queue) + if use_mempool: + alloc = cl_tools.SVMPool(alloc) + + def alloc2(size): + allocation = alloc(size) + allocation.bind_to_queue(queue2) + return allocation + + a_dev = cl_array.arange(queue, 2000, dtype=np.float32, allocator=alloc) + b_dev = cl_array.to_device(queue, np.arange(2000), allocator=alloc) + 4000 + + assert a_dev.allocator is alloc + assert b_dev.allocator is alloc + + assert a_dev.data._queue == queue + assert b_dev.data._queue == queue + + a_dev2 = cl_array.arange(queue2, 2000, dtype=np.float32, allocator=alloc2) + b_dev2 = cl_array.to_device(queue2, np.arange(2000), allocator=alloc2) + 4000 + + assert a_dev2.allocator is alloc2 + assert b_dev2.allocator is alloc2 + + assert a_dev2.data._queue == queue2 + assert b_dev2.data._queue == queue2 + + np.testing.assert_allclose((a_dev+b_dev).get(), (a_dev2+b_dev2).get()) + + with pytest.warns(cl_array.InconsistentOpenCLQueueWarning): + a_dev2.with_queue(queue) + + # safe to let this proceed to deallocation, since we're not + # operating on the memory + + with pytest.warns(cl_array.InconsistentOpenCLQueueWarning): + cl_array.empty(queue2, 2000, np.float32, allocator=alloc) + + # safe to let this proceed to deallocation, since we're not + # operating on the memory + +# }}} + + if __name__ == "__main__": if len(sys.argv) > 1: exec(sys.argv[1]) @@ -2187,4 +2245,4 @@ def test_dtype_conversions(ctx_factory): from pytest import main main([__file__]) -# vim: filetype=pyopencl:fdm=marker +# vim: fdm=marker diff --git a/test/test_wrapper.py b/test/test_wrapper.py index 0ec3e1343..758b05c0a 100644 --- a/test/test_wrapper.py +++ b/test/test_wrapper.py @@ -59,6 +59,8 @@ def _xfail_if_pocl_gpu(device, what): "at least the Titan V, as of pocl 1.6, 2021-01-20") +# {{{ test_get_info + def test_get_info(ctx_factory): ctx = ctx_factory() device, = ctx.devices @@ -236,6 +238,10 @@ def do_test(cl_obj, info_cls, func=None, try_attr_form=True): do_test(img, cl.image_info, lambda info: img.get_image_info(info)) +# }}} + + +# {{{ test_int_ptr def test_int_ptr(ctx_factory): def do_test(obj): @@ -285,6 +291,10 @@ def do_test(obj): img = cl.Image(ctx, cl.mem_flags.READ_ONLY, img_format, (128, 256)) do_test(img) +# }}} + + +# {{{ test_invalid_kernel_names_cause_failures def test_invalid_kernel_names_cause_failures(ctx_factory): ctx = ctx_factory() @@ -308,6 +318,10 @@ def test_invalid_kernel_names_cause_failures(ctx_factory): else: raise +# }}} + + +# {{{ test_image_format_constructor def test_image_format_constructor(): # doesn't need image support to succeed @@ -319,6 +333,10 @@ def test_image_format_constructor(): if not cl._PYPY: assert not hasattr(iform, "__dict__") +# }}} + + +# {{{ test_device_topology_amd_constructor def test_device_topology_amd_constructor(): # doesn't need cl_amd_device_attribute_query support to succeed @@ -331,6 +349,10 @@ def test_device_topology_amd_constructor(): if not cl._PYPY: assert not hasattr(topol, "__dict__") +# }}} + + +# {{{ test_nonempty_supported_image_formats def test_nonempty_supported_image_formats(ctx_factory): context = ctx_factory() @@ -344,6 +366,10 @@ def test_nonempty_supported_image_formats(ctx_factory): from pytest import skip skip("images not supported on %s" % device.name) +# }}} + + +# {{{ test_that_python_args_fail def test_that_python_args_fail(ctx_factory): context = ctx_factory() @@ -379,6 +405,10 @@ def test_that_python_args_fail(ctx_factory): a_result = np.empty_like(a) cl.enqueue_copy(queue, a_buf, a_result).wait() +# }}} + + +# {{{ test_image_2d def test_image_2d(ctx_factory): context = ctx_factory() @@ -452,6 +482,10 @@ def test_image_2d(ctx_factory): else: assert good +# }}} + + +# {{{ test_image_3d def test_image_3d(ctx_factory): #test for image_from_array for 3d image of float2 @@ -530,6 +564,10 @@ def test_image_3d(ctx_factory): else: assert good +# }}} + + +# {{{ test_copy_buffer def test_copy_buffer(ctx_factory): context = ctx_factory() @@ -549,6 +587,10 @@ def test_copy_buffer(ctx_factory): assert la.norm(a - b) == 0 +# }}} + + +# {{{ test_mempool_* def test_mempool(ctx_factory): from pyopencl.tools import MemoryPool, ImmediateAllocator @@ -601,6 +643,10 @@ def test_mempool_32bit_issues(): for offs in range(-5, 5): pool.allocate(2**i + offs) +# }}} + + +# {{{ test_allocator @pytest.mark.parametrize("allocator_cls", [ImmediateAllocator, DeferredAllocator]) def test_allocator(ctx_factory, allocator_cls): @@ -618,6 +664,10 @@ def test_allocator(ctx_factory, allocator_cls): assert mem is not None assert mem2 is None +# }}} + + +# {{{ test_vector_args def test_vector_args(ctx_factory): context = ctx_factory() @@ -639,7 +689,10 @@ def test_vector_args(ctx_factory): assert (dest == x).all() +# }}} + +# {{{ test_header_dep_handling def test_header_dep_handling(ctx_factory): context = ctx_factory() @@ -657,6 +710,10 @@ def test_header_dep_handling(ctx_factory): cl.Program(context, kernel_src).build(["-I", dirname(__file__)]) cl.Program(context, kernel_src).build(["-I", dirname(__file__)]) +# }}} + + +# {{{ test_context_dep_memoize def test_context_dep_memoize(ctx_factory): context = ctx_factory() @@ -674,6 +731,10 @@ def do_something(ctx): assert counter[0] == 1 +# }}} + + +# {{{ test_can_build_and_run_binary def test_can_build_and_run_binary(ctx_factory): ctx = ctx_factory() @@ -698,6 +759,10 @@ def test_can_build_and_run_binary(ctx_factory): foo.simple(queue, (n,), (16,), a_dev.data, dest_dev.data) +# }}} + + +# {{{ test_enqueue_barrier_marker def test_enqueue_barrier_marker(ctx_factory): ctx = ctx_factory() @@ -716,6 +781,10 @@ def test_enqueue_barrier_marker(ctx_factory): evt2 = cl.enqueue_marker(queue, wait_for=[evt1]) cl.enqueue_barrier(queue, wait_for=[evt1, evt2]) +# }}} + + +# {{{ test_wait_for_events def test_wait_for_events(ctx_factory): ctx = ctx_factory() @@ -724,6 +793,10 @@ def test_wait_for_events(ctx_factory): evt2 = cl.enqueue_marker(queue) cl.wait_for_events([evt1, evt2]) +# }}} + + +# {{{ test_unload_compiler def test_unload_compiler(platform): if (platform._get_cl_version() < (1, 2) @@ -736,6 +809,10 @@ def test_unload_compiler(platform): skip("Intel proprietary driver does not support unloading compiler") cl.unload_platform_compiler(platform) +# }}} + + +# {{{ test_platform_get_devices def test_platform_get_devices(ctx_factory): ctx = ctx_factory() @@ -762,6 +839,10 @@ def test_platform_get_devices(ctx_factory): for dev in devs: assert dev.type & dev_type == dev_type +# }}} + + +# {{{ test_user_event def test_user_event(ctx_factory): ctx = ctx_factory() @@ -808,6 +889,10 @@ def event_waiter2(e, key): raise RuntimeError("cl.wait_for_events timeout on UserEvent") assert evt.command_execution_status == cl.command_execution_status.COMPLETE +# }}} + + +# {{{ test_buffer_get_host_array def test_buffer_get_host_array(ctx_factory): if cl._PYPY: @@ -843,6 +928,10 @@ def test_buffer_get_host_array(ctx_factory): except cl.LogicError: pass +# }}} + + +# {{{ test_program_valued_get_info def test_program_valued_get_info(ctx_factory): ctx = ctx_factory() @@ -860,6 +949,10 @@ def test_program_valued_get_info(ctx_factory): assert knl.program == prg knl.program.binaries[0] +# }}} + + +# {{{ test_event_set_callback def test_event_set_callback(ctx_factory): import sys @@ -921,6 +1014,10 @@ def cb(status): assert got_called +# }}} + + +# {{{ test_global_offset def test_global_offset(ctx_factory): context = ctx_factory() @@ -951,6 +1048,10 @@ def test_global_offset(ctx_factory): assert (a_2 == 2*a).all() +# }}} + + +# {{{ test_sub_buffers def test_sub_buffers(ctx_factory): ctx = ctx_factory() @@ -981,6 +1082,10 @@ def test_sub_buffers(ctx_factory): assert np.array_equal(a_sub, a_sub_ref) +# }}} + + +# {{{ test_spirv def test_spirv(ctx_factory): ctx = ctx_factory() @@ -1012,16 +1117,19 @@ def test_spirv(ctx_factory): assert la.norm((dest_dev - (a_dev+b_dev)).get()) < 1e-7 +# }}} + + +# {{{ test_coarse_grain_svm -def test_coarse_grain_svm(ctx_factory): +@pytest.mark.parametrize("use_opaque_style", [False, True]) +def test_coarse_grain_svm(ctx_factory, use_opaque_style): import sys is_pypy = "__pypy__" in sys.builtin_module_names ctx = ctx_factory() queue = cl.CommandQueue(ctx) - _xfail_if_pocl_gpu(queue.device, "SVM") - dev = ctx.devices[0] from pyopencl.characterize import has_coarse_grain_buffer_svm @@ -1035,16 +1143,32 @@ def test_coarse_grain_svm(ctx_factory): if ("AMD" in dev.platform.name and dev.type & cl.device_type.GPU): pytest.xfail("AMD GPU crashes on SVM unmap") + if (dev.platform.vendor == "The pocl project" + and dev.type & cl.device_type.GPU + and "k40" in dev.name.lower()): + pytest.xfail("Crashes on K40s via POCL-CUDA") + dtype = np.dtype(np.float32) n = 3000 - svm_ary = cl.SVM(cl.csvm_empty(ctx, (n,), np.float32, alignment=64)) - if not is_pypy: - # https://bitbucket.org/pypy/numpy/issues/52 - assert isinstance(svm_ary.mem.base, cl.SVMAllocation) + if use_opaque_style: + svm_ary = cl.SVMAllocation(ctx, n*dtype.itemsize, alignment=64, + flags=cl.svm_mem_flags.READ_WRITE) + else: + svm_ary = cl.SVM(cl.csvm_empty(ctx, (n,), dtype, alignment=64)) + if not is_pypy: + # https://bitbucket.org/pypy/numpy/issues/52 + assert isinstance(svm_ary.mem.base, cl.SVMAllocation) - cl.enqueue_svm_memfill(queue, svm_ary, np.zeros((), svm_ary.mem.dtype)) + cl.enqueue_svm_memfill(queue, svm_ary, np.zeros((), dtype)) with svm_ary.map_rw(queue) as ary: + if use_opaque_style: + ary = ary.view(dtype) + else: + assert ary is svm_ary.mem + + assert ary.nbytes == n * dtype.itemsize + ary.fill(17) orig_ary = ary.copy() @@ -1055,21 +1179,28 @@ def test_coarse_grain_svm(ctx_factory): } """).build() - prg.twice(queue, svm_ary.mem.shape, None, svm_ary) + prg.twice(queue, (n,), None, svm_ary) + + if dev.platform.vendor == "The pocl project" \ + and dev.type & cl.device_type.GPU: + # clCreateBuffer from SVM doesn't work yet on GPU pocl + prg.twice(queue, (n,), None, svm_ary) + else: + prg.twice(queue, (n,), None, svm_ary.as_buffer(ctx)) with svm_ary.map_ro(queue) as ary: - print(ary) - assert np.array_equal(orig_ary*2, ary) + if use_opaque_style: + ary = ary.view(dtype) + else: + assert ary is svm_ary.mem + + assert np.array_equal(orig_ary*4, ary) new_ary = np.empty_like(orig_ary) new_ary.fill(-1) - if ctx.devices[0].platform.name != "Portable Computing Language": - # "Blocking memcpy is unimplemented (clEnqueueSVMMemcpy.c:61)" - # in pocl up to and including 1.0rc1. - - cl.enqueue_copy(queue, new_ary, svm_ary) - assert np.array_equal(orig_ary*2, new_ary) + cl.enqueue_copy(queue, new_ary, svm_ary) + assert np.array_equal(orig_ary*4, new_ary) # {{{ https://github.com/inducer/pyopencl/issues/372 @@ -1095,6 +1226,10 @@ def test_coarse_grain_svm(ctx_factory): # }}} +# }}} + + +# {{{ test_fine_grain_svm def test_fine_grain_svm(ctx_factory): import sys @@ -1133,6 +1268,10 @@ def test_fine_grain_svm(ctx_factory): print(ary) assert np.array_equal(orig_ary*2, ary) +# }}} + + +# {{{ test_map_dtype @pytest.mark.parametrize("dtype", [ np.uint, @@ -1157,6 +1296,10 @@ def test_map_dtype(ctx_factory, dtype): print(array.dtype) assert array.dtype == dt +# }}} + + +# {{{ test_compile_link def test_compile_link(ctx_factory): ctx = ctx_factory() @@ -1194,6 +1337,10 @@ def test_compile_link(ctx_factory): z.experiment(queue, (128**2,), (128,)) queue.finish() +# }}} + + +# {{{ test_copy_buffer_rect def test_copy_buffer_rect(ctx_factory): ctx = ctx_factory() @@ -1209,6 +1356,10 @@ def test_copy_buffer_rect(ctx_factory): src_origin=(0, 0), dst_origin=(1, 1), region=arr1.shape[::-1]) +# }}} + + +# {{{ test_threaded_nanny_events def test_threaded_nanny_events(ctx_factory): # https://github.com/inducer/pyopencl/issues/296 @@ -1236,6 +1387,10 @@ def create_arrays_thread(n1=10, n2=20): t1.join() t2.join() +# }}} + + +# {{{ test_empty_ndrange @pytest.mark.parametrize("empty_shape", [(0,), (3, 0, 2)]) def test_empty_ndrange(ctx_factory, empty_shape): @@ -1256,6 +1411,10 @@ def test_empty_ndrange(ctx_factory, empty_shape): prg.add_two(queue, a.shape, None, a.data, allow_empty_ndrange=True) +# }}} + + +# {{{ test_command_queue_context_manager def test_command_queue_context_manager(ctx_factory): ctx = ctx_factory() @@ -1265,6 +1424,10 @@ def test_command_queue_context_manager(ctx_factory): with pytest.warns(cl.CommandQueueUsedAfterExit): q.flush() +# }}} + + +# {{{ test_capture_call def test_capture_call(ctx_factory): ctx = ctx_factory() @@ -1298,6 +1461,8 @@ def test_capture_call(ctx_factory): exec(compile(sio.getvalue(), "captured.py", "exec"), compile_dict) compile_dict["main"]() +# }}} + if __name__ == "__main__": # make sure that import failures get reported, instead of skipping the tests. @@ -1309,3 +1474,5 @@ def test_capture_call(ctx_factory): else: from pytest import main main([__file__]) + +# vim: foldmethod=marker