diff --git a/.test-conda-env-py3.yml b/.test-conda-env-py3.yml
index 5aca7fccc..6f268dff4 100644
--- a/.test-conda-env-py3.yml
+++ b/.test-conda-env-py3.yml
@@ -1,5 +1,8 @@
 name: test-conda-env
 channels:
+# For https://github.com/pocl/pocl/pull/1069
+# See https://github.com/conda-forge/pocl-feedstock/pull/80
+- conda-forge/label/pocl_dev
 - conda-forge
 - nodefaults
 
diff --git a/doc/index.rst b/doc/index.rst
index bc4af44cc..54c38c21e 100644
--- a/doc/index.rst
+++ b/doc/index.rst
@@ -110,11 +110,11 @@ Contents
     runtime_memory
     runtime_program
     runtime_gl
+    tools
     array
     types
     algorithm
     howto
-    tools
     misc
     🚀 Github <https://github.com/inducer/pyopencl>
     💾 Download Releases <https://pypi.org/project/pyopencl>
diff --git a/doc/misc.rst b/doc/misc.rst
index ff71ae7a4..0ebd67346 100644
--- a/doc/misc.rst
+++ b/doc/misc.rst
@@ -29,7 +29,6 @@ Then run::
 You can install these pieces of software in your user account and
 do not need root/administrator privileges.
 
-
 Enabling access to CPUs and GPUs via (Py)OpenCL
 ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
 
@@ -283,6 +282,13 @@ other software to be turned into the corresponding :mod:`pyopencl` objects.
 User-visible Changes
 ====================
 
+Version 2022.2
+--------------
+
+- Added :ref:`opaque-style SVM <opaque-svm>` and :class:`pyopencl.SVMPointer`.
+- Added :class:`pyopencl.tools.SVMPool`.
+- Added automatic queue-synchronized deallocation of SVM.
+
 Version 2020.3
 --------------
 .. note::
@@ -728,7 +734,9 @@ Funding
 Work on pytential was supported in part by
 
 * the US National Science Foundation under grant numbers DMS-1418961,
-  DMS-1654756, SHF-1911019, and OAC-1931577.
+  DMS-1654756, SHF-1911019, and OAC-1931577, and
+* the Department of Energy, National Nuclear Security Administration,
+  under Award Number DE-NA0003963.
 
 AK also gratefully acknowledges a hardware gift from Nvidia Corporation.
 
diff --git a/doc/runtime_const.rst b/doc/runtime_const.rst
index 864a641c7..b6f34ea60 100644
--- a/doc/runtime_const.rst
+++ b/doc/runtime_const.rst
@@ -6,6 +6,7 @@ OpenCL Runtime: Constants
 .. include:: constants.inc
 
 .. class:: NameVersion
+
     Describes the version of a specific feature.
 
     .. note::
@@ -19,6 +20,7 @@ OpenCL Runtime: Constants
     .. attribute:: name
 
 .. class:: DeviceTopologyAmd
+
     .. method:: __init__(bus, device, function)
     .. attribute:: type
     .. attribute:: bus
diff --git a/doc/runtime_memory.rst b/doc/runtime_memory.rst
index f4e01f266..f9c31a8bf 100644
--- a/doc/runtime_memory.rst
+++ b/doc/runtime_memory.rst
@@ -116,14 +116,109 @@ by both the host and the device. *Coarse-grain* SVM requires that
 buffers be mapped before being accessed on the host, *fine-grain* SVM
 does away with that requirement.
 
+.. warning::
+
+    Compared to :class:`Buffer`\ s, SVM brings with it a new concern: the
+    synchronization of memory deallocation. Unlike other objects in OpenCL,
+    SVM is represented by a plain (C-language) pointer and thus has no ability for
+    reference counting.
+
+    As a result, it is perfectly legal to allocate a :class:`Buffer`, enqueue an
+    operation on it, and release the buffer, without worrying about whether the
+    operation has completed. The OpenCL implementation will keep the buffer alive
+    until the operation has completed. This is *not* the case with SVM: Unless
+    otherwise specified, memory deallocation is performed immediately when
+    requested, and so SVM will be deallocated whenever the Python
+    garbage collector sees fit, even if the operation has not completed,
+    immediately leading to undefined behavior (i.e., typically, memory corruption and,
+    before too long, a crash).
+
+    Version 2022.2 of PyOpenCL offers substantially improved tools
+    for dealing with this. In particular, all means for allocating SVM
+    allow specifying a :class:`CommandQueue`, so that deallocation
+    is enqueued and performed after previously-enqueued operations
+    have completed.
+
 SVM requires OpenCL 2.0.
 
+.. _opaque-svm:
+
+Opaque and "Wrapped-:mod:`numpy`" Styles of Referencing SVM
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+When trying to pass SVM pointers to functionality in :mod:`pyopencl`,
+two styles are supported:
+
+- First, the opaque style. This style most closely resembles
+  :class:`Buffer`-based allocation available in OpenCL 1.x.
+  SVM pointers are held in opaque "handle" objects such as :class:`SVMAllocation`.
+
+- Second, the wrapped-:mod:`numpy` style. In this case, a :class:`numpy.ndarray`
+  (or another object implementing the  :c:func:`Python buffer protocol
+  <PyObject_GetBuffer>`) serves as the reference to an area of SVM.
+  This style permits using memory areas with :mod:`pyopencl`'s SVM
+  interfaces even if they were allocated outside of :mod:`pyopencl`.
+
+  Since passing a :class:`numpy.ndarray` (or another type of object obeying the
+  buffer interface) already has existing semantics in most settings in
+  :mod:`pyopencl` (such as when passing arguments to a kernel or calling
+  :func:`enqueue_copy`), there exists a wrapper object, :class:`SVM`, that may
+  be "wrapped around" these objects to mark them as SVM.
+
+The commonality between the two styles is that both ultimately implement
+the :class:`SVMPointer` interface, which :mod:`pyopencl` uses to obtain
+the actual SVM pointer.
+
+Note that it is easily possible to obtain a :class:`numpy.ndarray` view of SVM
+areas held in the opaque style, see :attr:`SVMPointer.buf`, permitting
+transitions from opaque to wrapped-:mod:`numpy` style. The opposite transition
+(from wrapped-:mod:`numpy` to opaque) is not necessarily straightforward,
+as it would require "fishing" the opaque SVM handle out of a chain of
+:attr:`numpy.ndarray.base` attributes (or similar, depending on
+the actual object serving as the main SVM reference).
+
+See :ref:`numpy-svm-helpers` for helper functions that ease setting up the
+wrapped-:mod:`numpy` structure.
+
+Wrapped-:mod:`numpy` SVM tends to be a good fit for fine-grain SVM because of
+the ease of direct host-side access, but the creation of the nested structure
+that makes this possible is associated with a certain amount of cost.
+
+By comparison, opaque SVM access tends to be a good fit for coarse-grain
+SVM, because direct host access is not possible without mapping the array
+anyway, and it has lower setup cost. It is of course entirely possible to use
+opaque SVM access with fine-grain SVM.
+
+.. versionchanged:: 2022.2
+
+   This version adds the opaque style of SVM access.
+
+Using SVM with Arrays
+^^^^^^^^^^^^^^^^^^^^^
+
+While all types of SVM can be used as the memory backing
+:class:`pyopencl.array.Array` objects, ensuring that new arrays returned
+by array operations (e.g. arithmetic) also use SVM is easiest to accomplish
+by passing an :class:`~pyopencl.tools.SVMAllocator` (or
+:class:`~pyopencl.tools.SVMPool`) as the *allocator* parameter in functions
+returning new arrays.
+
+SVM Pointers, Allocations, and Maps
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+.. autoclass:: SVMPointer
+
+.. autoclass:: SVMAllocation
+
 .. autoclass:: SVM
 
 .. autoclass:: SVMMap
 
-Allocating SVM
-^^^^^^^^^^^^^^
+
+.. _numpy-svm-helpers:
+
+Helper functions for :mod:`numpy`-based SVM allocation
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
 
 .. autofunction:: svm_empty
 .. autofunction:: svm_empty_like
@@ -140,11 +235,6 @@ Operations on SVM
 .. autofunction:: enqueue_svm_memfill
 .. autofunction:: enqueue_svm_migratemem
 
-SVM Allocation Holder
-^^^^^^^^^^^^^^^^^^^^^
-
-.. autoclass:: SVMAllocation
-
 Image
 -----
 
@@ -281,6 +371,8 @@ Transfers
 
 .. autofunction:: enqueue_copy(queue, dest, src, **kwargs)
 
+.. autofunction:: enqueue_fill(queue, dest, src, **kwargs)
+
 Mapping Memory into Host Address Space
 --------------------------------------
 
@@ -406,3 +498,11 @@ Pipes
 
         See :class:`pipe_info` for values of *param*.
 
+Type aliases
+------------
+
+.. currentmodule:: pyopencl._cl
+
+.. class:: Buffer
+
+   See :class:`pyopencl.Buffer`.
diff --git a/doc/tools.rst b/doc/tools.rst
index 7fdde084e..080d1c89a 100644
--- a/doc/tools.rst
+++ b/doc/tools.rst
@@ -1,203 +1,4 @@
 Built-in Utilities
 ==================
 
-.. module:: pyopencl.tools
-
-.. _memory-pools:
-
-Memory Pools
-------------
-
-The constructor :func:`pyopencl.Buffer` can consume a fairly large amount of
-processing time if it is invoked very frequently. For example, code based on
-:class:`pyopencl.array.Array` can easily run into this issue because a
-fresh memory area is allocated for each intermediate result. Memory pools are a
-remedy for this problem based on the observation that often many of the block
-allocations are of the same sizes as previously used ones.
-
-Then, instead of fully returning the memory to the system and incurring the
-associated reallocation overhead, the pool holds on to the memory and uses it
-to satisfy future allocations of similarly-sized blocks. The pool reacts
-appropriately to out-of-memory conditions as long as all memory allocations
-are made through it. Allocations performed from outside of the pool may run
-into spurious out-of-memory conditions due to the pool owning much or all of
-the available memory.
-
-Using :class:`pyopencl.array.Array` instances with a :class:`MemoryPool` is
-not complicated::
-
-    mem_pool = pyopencl.tools.MemoryPool(pyopencl.tools.ImmediateAllocator(queue))
-    a_dev = cl_array.arange(queue, 2000, dtype=np.float32, allocator=mem_pool)
-
-.. class:: PooledBuffer
-
-    An object representing a :class:`MemoryPool`-based allocation of
-    device memory.  Once this object is deleted, its associated device
-    memory is returned to the pool. This supports the same interface
-    as :class:`pyopencl.Buffer`.
-
-.. class:: AllocatorInterface
-
-   An interface implemented by various memory allocation functions
-   in :mod:`pyopencl`.
-
-    .. method:: __call__(size)
-
-        Allocate and return a :class:`pyopencl.Buffer` of the given *size*.
-
-.. class:: DeferredAllocator(context, mem_flags=pyopencl.mem_flags.READ_WRITE)
-
-    *mem_flags* takes its values from :class:`pyopencl.mem_flags` and corresponds
-    to the *flags* argument of :class:`pyopencl.Buffer`. DeferredAllocator
-    has the same semantics as regular OpenCL buffer allocation, i.e. it may
-    promise memory to be available that may (in any call to a buffer-using
-    CL function) turn out to not exist later on. (Allocations in CL are
-    bound to contexts, not devices, and memory availability depends on which
-    device the buffer is used with.)
-
-    Implements :class:`AllocatorInterface`.
-
-    .. versionchanged :: 2013.1
-
-        ``CLAllocator`` was deprecated and replaced
-        by :class:`DeferredAllocator`.
-
-    .. method:: __call__(size)
-
-        Allocate a :class:`pyopencl.Buffer` of the given *size*.
-
-        .. versionchanged :: 2020.2
-
-            The allocator will succeed even for allocations of size zero,
-            returning *None*.
-
-.. class:: ImmediateAllocator(queue, mem_flags=pyopencl.mem_flags.READ_WRITE)
-
-    *mem_flags* takes its values from :class:`pyopencl.mem_flags` and corresponds
-    to the *flags* argument of :class:`pyopencl.Buffer`.
-    :class:`ImmediateAllocator` will attempt to ensure at allocation time that
-    allocated memory is actually available. If no memory is available, an out-of-memory
-    error is reported at allocation time.
-
-    Implements :class:`AllocatorInterface`.
-
-    .. versionadded:: 2013.1
-
-    .. method:: __call__(size)
-
-        Allocate a :class:`pyopencl.Buffer` of the given *size*.
-
-        .. versionchanged :: 2020.2
-
-            The allocator will succeed even for allocations of size zero,
-            returning *None*.
-
-.. class:: MemoryPool(allocator[, leading_bits_in_bin_id])
-
-    A memory pool for OpenCL device memory. *allocator* must be an instance of
-    one of the above classes, and should be an :class:`ImmediateAllocator`.
-    The memory pool assumes that allocation failures are reported
-    by the allocator immediately, and not in the OpenCL-typical
-    deferred manner.
-
-    Implements :class:`AllocatorInterface`.
-
-    .. note::
-
-        The current implementation of the memory pool will retain allocated
-        memory after it is returned by the application and keep it in a bin
-        identified by the leading *leading_bits_in_bin_id* bits of the
-        allocation size. To ensure that allocations within each bin are
-        interchangeable, allocation sizes are rounded up to the largest size
-        that shares the leading bits of the requested allocation size.
-
-        The current default value of *leading_bits_in_bin_id* is
-        four, but this may change in future versions and is not
-        guaranteed.
-
-        *leading_bits_in_bin_id* must be passed by keyword,
-        and its role is purely advisory. It is not guaranteed
-        that future versions of the pool will use the
-        same allocation scheme and/or honor *leading_bits_in_bin_id*.
-
-    .. versionchanged:: 2019.1
-
-        Current bin allocation behavior documented, *leading_bits_in_bin_id*
-        added.
-
-    .. attribute:: held_blocks
-
-        The number of unused blocks being held by this pool.
-
-    .. attribute:: active_blocks
-
-        The number of blocks in active use that have been allocated
-        through this pool.
-
-    .. attribute:: managed_bytes
-
-        "Managed" memory is "active" and "held" memory.
-
-        .. versionadded: 2021.1.2
-
-    .. attribute:: active_bytes
-
-        "Active" bytes are bytes under the control of the application.
-        This may be smaller than the actual allocated size reflected
-        in :attr:`managed_bytes`.
-
-        .. versionadded: 2021.1.2
-
-    .. method:: allocate(size)
-
-        Return a :class:`PooledBuffer` of the given *size*.
-
-    .. method:: __call__(size)
-
-        Synonym for :meth:`allocate` to match the :class:`AllocatorInterface`.
-
-        .. versionadded: 2011.2
-
-    .. method:: free_held
-
-        Free all unused memory that the pool is currently holding.
-
-    .. method:: stop_holding
-
-        Instruct the memory to start immediately freeing memory returned
-        to it, instead of holding it for future allocations.
-        Implicitly calls :meth:`free_held`.
-        This is useful as a cleanup action when a memory pool falls out
-        of use.
-
-CL-Object-dependent Caching
----------------------------
-
-.. autofunction:: first_arg_dependent_memoize
-.. autofunction:: clear_first_arg_caches
-
-Testing
--------
-
-.. function:: pytest_generate_tests_for_pyopencl(metafunc)
-
-    Using the line::
-
-        from pyopencl.tools import pytest_generate_tests_for_pyopencl \
-                as pytest_generate_tests
-
-    in your `pytest <http://pytest.org>`_ test scripts allows you to use the
-    arguments *ctx_factory*, *device*, or *platform* in your test functions,
-    and they will automatically be run for each OpenCL device/platform in the
-    system, as appropriate.
-
-    The following two environment variables are also supported to control
-    device/platform choice::
-
-        PYOPENCL_TEST=0:0,1;intel=i5,i7
-
-Device Characterization
------------------------
-
-.. automodule:: pyopencl.characterize
-    :members:
+.. automodule:: pyopencl.tools
diff --git a/examples/demo_array_svm.py b/examples/demo_array_svm.py
new file mode 100644
index 000000000..724d32b63
--- /dev/null
+++ b/examples/demo_array_svm.py
@@ -0,0 +1,33 @@
+import pyopencl as cl
+import pyopencl.array as cl_array
+from pyopencl.tools import SVMAllocator, SVMPool
+import numpy as np
+
+n = 50000
+a = np.random.rand(n).astype(np.float32)
+b = np.random.rand(n).astype(np.float32)
+
+
+ctx = cl.create_some_context()
+queue = cl.CommandQueue(ctx)
+
+alloc = SVMAllocator(ctx, alignment=0, queue=queue)
+alloc = SVMPool(alloc)
+
+a_dev = cl_array.to_device(queue, a, allocator=alloc)
+b_dev = cl_array.to_device(queue, b, allocator=alloc)
+dest_dev = cl_array.empty_like(a_dev)
+
+prg = cl.Program(ctx, """
+    __kernel void sum(__global const float *a,
+    __global const float *b, __global float *c)
+    {
+      int gid = get_global_id(0);
+      c[gid] = a[gid] + b[gid];
+    }
+    """).build()
+
+knl = prg.sum
+knl(queue, a.shape, None, a_dev.data, b_dev.data, dest_dev.data)
+
+np.testing.assert_allclose(dest_dev.get(), (a_dev+b_dev).get())
diff --git a/pyopencl/__init__.py b/pyopencl/__init__.py
index eaf909633..ef96cb9bf 100644
--- a/pyopencl/__init__.py
+++ b/pyopencl/__init__.py
@@ -21,6 +21,8 @@
 """
 
 from sys import intern
+from warnings import warn
+from typing import Union, Any, Optional, Sequence
 
 from pyopencl.version import VERSION, VERSION_STATUS, VERSION_TEXT  # noqa
 
@@ -43,7 +45,6 @@
     import os
     from os.path import dirname, join, realpath
     if realpath(join(os.getcwd(), "pyopencl")) == realpath(dirname(__file__)):
-        from warnings import warn
         warn("It looks like you are importing PyOpenCL from "
                 "its source directory. This likely won't work.")
     raise
@@ -199,11 +200,9 @@
 
 if get_cl_header_version() >= (2, 0):
     from pyopencl._cl import (  # noqa
-        SVMAllocation,
+        SVMPointer,
         SVM,
-
-        # FIXME
-        #enqueue_svm_migratemem,
+        SVMAllocation,
         )
 
 if _cl.have_gl():
@@ -267,7 +266,6 @@ class CommandQueueUsedAfterExit(UserWarning):
 
 def compiler_output(text):
     import os
-    from warnings import warn
     if int(os.environ.get("PYOPENCL_COMPILER_OUTPUT", "0")):
         warn(text, CompilerWarning)
     else:
@@ -389,7 +387,6 @@ def enable_debugging(platform_or_context):
         import os
         os.environ["CPU_MAX_COMPUTE_UNITS"] = "1"
     else:
-        from warnings import warn
         warn("do not know how to enable debugging on '%s'"
                 % platform.name)
 
@@ -428,7 +425,6 @@ def _get_prg(self):
             return self._prg
         else:
             # "no program" can only happen in from-source case.
-            from warnings import warn
             warn("Pre-build attribute access defeats compiler caching.",
                     stacklevel=3)
 
@@ -662,7 +658,6 @@ def device_hashable_model_and_version_identifier(self):
         return ("v1", self.vendor, self.vendor_id, self.name, self.version)
 
     def device_persistent_unique_id(self):
-        from warnings import warn
         warn("Device.persistent_unique_id is deprecated. "
                 "Use Device.hashable_model_and_version_identifier instead.",
                 DeprecationWarning, stacklevel=2)
@@ -684,7 +679,6 @@ def device_persistent_unique_id(self):
 
     def context_init(self, devices, properties, dev_type, cache_dir=None):
         if cache_dir is not None:
-            from warnings import warn
             warn("The 'cache_dir' argument to the Context constructor "
                 "is deprecated and no longer has an effect. "
                 "It was removed because it only applied to the wrapper "
@@ -970,7 +964,6 @@ def image_init(self, context, flags, format, shape=None, pitches=None,
 
         if hostbuf is not None and not \
                 (flags & (mem_flags.USE_HOST_PTR | mem_flags.COPY_HOST_PTR)):
-            from warnings import warn
             warn("'hostbuf' was passed, but no memory flags to make use of it.")
 
         if hostbuf is None and pitches is not None:
@@ -1043,7 +1036,6 @@ def image_init(self, context, flags, format, shape=None, pitches=None,
 
     class _ImageInfoGetter:
         def __init__(self, event):
-            from warnings import warn
             warn("Image.image.attr is deprecated and will go away in 2021. "
                     "Use Image.attr directly, instead.")
 
@@ -1131,44 +1123,166 @@ def memory_map_exit(self, exc_type, exc_val, exc_tb):
 
     # }}}
 
-    # {{{ SVMAllocation
+    # {{{ SVMPointer
 
     if get_cl_header_version() >= (2, 0):
-        SVMAllocation.__doc__ = """An object whose lifetime is tied to an
-            allocation of shared virtual memory.
+        SVMPointer.__doc__ = """A base class for things that can be passed to
+            functions that allow an SVM pointer, e.g. kernel enqueues and memory
+            copies.
 
-            .. note::
+            Objects of this type cannot currently be directly created or
+            implemented in Python.  To obtain objects implementing this type,
+            consider its subtypes :class:`SVMAllocation` and :class:`SVM`.
 
-                Most likely, you will not want to use this directly, but rather
-                :func:`svm_empty` and related functions which allow access to this
-                functionality using a friendlier, more Pythonic interface.
 
-            .. versionadded:: 2016.2
+            .. property:: svm_ptr
 
-            .. automethod:: __init__(self, ctx, size, alignment, flags=None)
-            .. automethod:: release
-            .. automethod:: enqueue_release
+                Gives the SVM pointer as an :class:`int`.
+
+            .. property:: size
+
+                An :class:`int` denoting the size in bytes, or *None*, if the size
+                of the SVM pointed to is not known.
+
+                *Most* objects of this type (e.g. instances of
+                :class:`SVMAllocation` and :class:`SVM` know their size, so that,
+                for example :class:`enqueue_copy` will automatically copy an entire
+                :class:`SVMAllocation` when a size is not explicitly specified.
+
+            .. automethod:: map
+            .. automethod:: map_ro
+            .. automethod:: map_rw
+            .. automethod:: as_buffer
+            .. property:: buf
+
+                An opaque object implementing the :c:func:`Python buffer protocol
+                <PyObject_GetBuffer>`. It exposes the pointed-to memory as
+                a one-dimensional buffer of bytes, with the size matching
+                :attr:`size`.
+
+                No guarantee is provided that two references to this attribute
+                result in the same object.
             """
 
-    if get_cl_header_version() >= (2, 0):
-        svmallocation_old_init = SVMAllocation.__init__
+    def svmptr_map(self, queue: CommandQueue, *, flags: int, is_blocking: bool =
+                   True, wait_for: Optional[Sequence[Event]] = None,
+                   size: Optional[Event] = None) -> "SVMMap":
+        """
+        :arg is_blocking: If *False*, subsequent code must wait on
+            :attr:`SVMMap.event` in the returned object before accessing the
+            mapped memory.
+        :arg flags: a combination of :class:`pyopencl.map_flags`.
+        :arg size: The size of the map in bytes. If not provided, defaults to
+            :attr:`size`.
 
-    def svmallocation_init(self, ctx, size, alignment, flags, _interface=None):
+        |std-enqueue-blurb|
+        """
+        return SVMMap(self,
+                np.asarray(self.buf),
+                queue,
+                _cl._enqueue_svm_map(queue, is_blocking, flags, self, wait_for,
+                                    size=size))
+
+    def svmptr_map_ro(self, queue: CommandQueue, *, is_blocking: bool = True,
+                      wait_for: Optional[Sequence[Event]] = None,
+                      size: Optional[int] = None) -> "SVMMap":
+        """Like :meth:`map`, but with *flags* set for a read-only map.
+        """
+
+        return self.map(queue, flags=map_flags.READ,
+                is_blocking=is_blocking, wait_for=wait_for, size=size)
+
+    def svmptr_map_rw(self, queue: CommandQueue, *, is_blocking: bool = True,
+                      wait_for: Optional[Sequence[Event]] = None,
+                      size: Optional[int] = None) -> "SVMMap":
+        """Like :meth:`map`, but with *flags* set for a read-only map.
+        """
+
+        return self.map(queue, flags=map_flags.READ | map_flags.WRITE,
+                is_blocking=is_blocking, wait_for=wait_for, size=size)
+
+    def svmptr__enqueue_unmap(self, queue, wait_for=None):
+        return _cl._enqueue_svm_unmap(queue, self, wait_for)
+
+    def svmptr_as_buffer(self, ctx: Context, *, flags: Optional[int] = None,
+                         size: Optional[int] = None) -> Buffer:
         """
         :arg ctx: a :class:`Context`
-        :arg flags: some of :class:`svm_mem_flags`.
+        :arg flags: a combination of :class:`pyopencl.map_flags`, defaults to
+            read-write.
+        :arg size: The size of the map in bytes. If not provided, defaults to
+            :attr:`size`.
+        :returns: a :class:`Buffer` corresponding to *self*.
+
+        The memory referred to by this object must not be freed before
+        the returned :class:`Buffer` is released.
         """
-        svmallocation_old_init(self, ctx, size, alignment, flags)
 
-        # mem_flags.READ_ONLY applies to kernels, not the host
-        read_write = True
-        _interface["data"] = (
-                int(self._ptr_as_int()), not read_write)
+        if flags is None:
+            flags = mem_flags.READ_WRITE | mem_flags.USE_HOST_PTR
+
+        if size is None:
+            size = self.size
+
+        return Buffer(ctx, flags, size=size, hostbuf=self.buf)
+
+    if get_cl_header_version() >= (2, 0):
+        SVMPointer.map = svmptr_map
+        SVMPointer.map_ro = svmptr_map_ro
+        SVMPointer.map_rw = svmptr_map_rw
+        SVMPointer._enqueue_unmap = svmptr__enqueue_unmap
+        SVMPointer.as_buffer = svmptr_as_buffer
+
+    # }}}
 
-        self.__array_interface__ = _interface
+    # {{{ SVMAllocation
 
     if get_cl_header_version() >= (2, 0):
-        SVMAllocation.__init__ = svmallocation_init
+        SVMAllocation.__doc__ = """
+            Is a :class:`SVMPointer`.
+
+            .. versionadded:: 2016.2
+
+            .. automethod:: __init__
+
+                :arg flags: See :class:`svm_mem_flags`.
+                :arg queue: If not specified, the allocation will be freed
+                    eagerly, irrespective of whether pending/enqueued operations
+                    are still using this memory.
+
+                    If specified, deallocation of the memory will be enqueued
+                    with the given queue, and will only be performed
+                    after previously-enqueue operations in the queue have
+                    completed.
+
+                    It is an error to specify an out-of-order queue.
+
+                    .. warning::
+
+                        Not specifying a queue will typically lead to undesired
+                        behavior, including crashes and memory corruption.
+                        See the warning in :ref:`svm`.
+
+            .. automethod:: enqueue_release
+
+                Enqueue the release of this allocation into *queue*.
+                If *queue* is not specified, enqueue the deallocation
+                into the queue provided at allocation time or via
+                :class:`bind_to_queue`.
+
+            .. automethod:: bind_to_queue
+
+                Change the queue used for implicit enqueue of deallocation
+                to *queue*. Sufficient synchronization is ensured by
+                enqueuing a marker into the old queue and waiting on this
+                marker in the new queue.
+
+            .. automethod:: unbind_from_queue
+
+                Configure the allocation to no longer implicitly enqueue
+                memory allocation. If such a queue was previously provided,
+                :meth:`~CommandQueue.finish` is automatically called on it.
+            """
 
     # }}}
 
@@ -1179,23 +1293,14 @@ def svmallocation_init(self, ctx, size, alignment, flags, _interface=None):
             (such as a :class:`numpy.ndarray`) as referring to shared virtual
             memory.
 
+            Is a :class:`SVMPointer`, hence objects of this type may be passed
+            to kernel calls and :func:`enqueue_copy`, and all methods declared
+            there are also available there. Note that :meth:`map` differs
+            slightly from :meth:`SVMPointer.map`.
+
             Depending on the features of the OpenCL implementation, the following
             types of objects may be passed to/wrapped in this type:
 
-            *   coarse-grain shared memory as returned by (e.g.) :func:`csvm_empty`
-                for any implementation of OpenCL 2.0.
-
-                This is how coarse-grain SVM may be used from both host and device::
-
-                    svm_ary = cl.SVM(
-                        cl.csvm_empty(ctx, 1000, np.float32, alignment=64))
-                    assert isinstance(svm_ary.mem, np.ndarray)
-
-                    with svm_ary.map_rw(queue) as ary:
-                        ary.fill(17)  # use from host
-
-                    prg.twice(queue, svm_ary.mem.shape, None, svm_ary)
-
             *   fine-grain shared memory as returned by (e.g.) :func:`fsvm_empty`,
                 if the implementation supports fine-grained shared virtual memory.
                 This memory may directly be passed to a kernel::
@@ -1222,10 +1327,28 @@ def svmallocation_init(self, ctx, size, alignment, flags, _interface=None):
                     queue.finish() # synchronize
                     print(ary) # access from host
 
-            Objects of this type may be passed to kernel calls and
-            :func:`enqueue_copy`.  Coarse-grain shared-memory *must* be mapped
-            into host address space using :meth:`map` before being accessed
-            through the :mod:`numpy` interface.
+            *   coarse-grain shared memory as returned by (e.g.) :func:`csvm_empty`
+                for any implementation of OpenCL 2.0.
+
+                .. note::
+
+                    Applications making use of coarse-grain SVM may be better
+                    served by opaque-style SVM. See :ref:`opaque-svm`.
+
+                This is how coarse-grain SVM may be used from both host and device::
+
+                    svm_ary = cl.SVM(
+                        cl.csvm_empty(ctx, 1000, np.float32, alignment=64))
+                    assert isinstance(svm_ary.mem, np.ndarray)
+
+                    with svm_ary.map_rw(queue) as ary:
+                        ary.fill(17)  # use from host
+
+                    prg.twice(queue, svm_ary.mem.shape, None, svm_ary)
+
+            Coarse-grain shared-memory *must* be mapped into host address space
+            using :meth:`~SVMPointer.map` before being accessed through the
+            :mod:`numpy` interface.
 
             .. note::
 
@@ -1246,9 +1369,10 @@ def svmallocation_init(self, ctx, size, alignment, flags, _interface=None):
             .. automethod:: map
             .. automethod:: map_ro
             .. automethod:: map_rw
-            .. automethod:: as_buffer
             """
 
+    # }}}
+
     if get_cl_header_version() >= (2, 0):
         svm_old_init = SVM.__init__
 
@@ -1262,14 +1386,18 @@ def svm_map(self, queue, flags, is_blocking=True, wait_for=None):
         :arg is_blocking: If *False*, subsequent code must wait on
             :attr:`SVMMap.event` in the returned object before accessing the
             mapped memory.
-        :arg flags: a combination of :class:`pyopencl.map_flags`, defaults to
-            read-write.
+        :arg flags: a combination of :class:`pyopencl.map_flags`.
         :returns: an :class:`SVMMap` instance
 
+        This differs from the inherited :class:`SVMPointer.map` in that no size
+        can be specified, and that :attr:`mem` is the exact array produced
+        when the :class:`SVMMap` is used as a context manager.
+
         |std-enqueue-blurb|
         """
         return SVMMap(
                 self,
+                self.mem,
                 queue,
                 _cl._enqueue_svm_map(queue, is_blocking, flags, self, wait_for))
 
@@ -1288,29 +1416,12 @@ def svm_map_rw(self, queue, is_blocking=True, wait_for=None):
     def svm__enqueue_unmap(self, queue, wait_for=None):
         return _cl._enqueue_svm_unmap(queue, self, wait_for)
 
-    def svm_as_buffer(self, ctx, flags=None):
-        """
-        :arg ctx: a :class:`Context`
-        :arg flags: a combination of :class:`pyopencl.map_flags`, defaults to
-            read-write.
-        :returns: a :class:`Buffer` corresponding to *self*.
-
-        The memory referred to by this object must not be freed before
-        the returned :class:`Buffer` is released.
-        """
-
-        if flags is None:
-            flags = mem_flags.READ_WRITE
-
-        return Buffer(ctx, flags, size=self.mem.nbytes, hostbuf=self.mem)
-
     if get_cl_header_version() >= (2, 0):
         SVM.__init__ = svm_init
         SVM.map = svm_map
         SVM.map_ro = svm_map_ro
         SVM.map_rw = svm_map_rw
         SVM._enqueue_unmap = svm__enqueue_unmap
-        SVM.as_buffer = svm_as_buffer
 
     # }}}
 
@@ -1413,6 +1524,27 @@ def gl_object_get_gl_object(self):
 # }}}
 
 
+# {{{ _OverriddenArrayInterfaceSVMAllocation
+
+if get_cl_header_version() >= (2, 0):
+    class _OverriddenArrayInterfaceSVMAllocation(SVMAllocation):
+        def __init__(self, ctx, size, alignment, flags, *, _interface,
+                queue=None):
+            """
+            :arg ctx: a :class:`Context`
+            :arg flags: some of :class:`svm_mem_flags`.
+            """
+            super().__init__(ctx, size, alignment, flags, queue)
+
+            # mem_flags.READ_ONLY applies to kernels, not the host
+            read_write = True
+            _interface["data"] = (int(self.svm_ptr), not read_write)
+
+            self.__array_interface__ = _interface
+
+# }}}
+
+
 # {{{ create_some_context
 
 def create_some_context(interactive=None, answers=None):
@@ -1553,19 +1685,24 @@ def parse_device(choice):
 
 class SVMMap:
     """
-    .. attribute:: event
+    Returned by :func:`SVMPointer.map` and :func:`SVM.map`.
+    This class may also be used as a context manager in a ``with`` statement.
+    :meth:`release` will be called upon exit from the ``with`` region.
+    The value returned to the ``as`` part of the context manager is the
+    mapped Python object (e.g. a :mod:`numpy` array).
 
     .. versionadded:: 2016.2
 
+    .. property:: event
+
+        The :class:`Event` returned when mapping the memory.
+
     .. automethod:: release
 
-    This class may also be used as a context manager in a ``with`` statement.
-    :meth:`release` will be called upon exit from the ``with`` region.
-    The value returned to the ``as`` part of the context manager is the
-    mapped Python object (e.g. a :mod:`numpy` array).
     """
-    def __init__(self, svm, queue, event):
+    def __init__(self, svm, array, queue, event):
         self.svm = svm
+        self.array = array
         self.queue = queue
         self.event = event
 
@@ -1574,7 +1711,7 @@ def __del__(self):
             self.release()
 
     def __enter__(self):
-        return self.svm.mem
+        return self.array
 
     def __exit__(self, exc_type, exc_val, exc_tb):
         self.release()
@@ -1633,7 +1770,13 @@ def enqueue_copy(queue, dest, src, **kwargs):
     .. rubric :: Transfer :class:`Buffer` ↔ host
     .. ------------------------------------------------------------------------
 
-    :arg device_offset: offset in bytes (optional)
+    :arg src_offset: offset in bytes (optional)
+
+        May only be nonzero if applied on the device side.
+
+    :arg dst_offset: offset in bytes (optional)
+
+        May only be nonzero if applied on the device side.
 
     .. note::
 
@@ -1653,7 +1796,7 @@ def enqueue_copy(queue, dest, src, **kwargs):
         and to the minimum of the size of the source and target
         from 2013.1 on.
     :arg src_offset: (optional)
-    :arg dest_offset: (optional)
+    :arg dst_offset: (optional)
 
     .. ------------------------------------------------------------------------
     .. rubric :: Rectangular :class:`Buffer` ↔  host transfers (CL 1.1 and newer)
@@ -1719,7 +1862,7 @@ def enqueue_copy(queue, dest, src, **kwargs):
         three or shorter. (mandatory)
 
     .. ------------------------------------------------------------------------
-    .. rubric :: Transfer :class:`SVM`/host ↔ :class:`SVM`/host
+    .. rubric :: Transfer :class:`SVMPointer`/host ↔ :class:`SVMPointer`/host
     .. ------------------------------------------------------------------------
 
     :arg byte_count: (optional) If not specified, defaults to the
@@ -1736,25 +1879,61 @@ def enqueue_copy(queue, dest, src, **kwargs):
         if dest.type == mem_object_type.BUFFER:
             if isinstance(src, MemoryObjectHolder):
                 if src.type == mem_object_type.BUFFER:
+                    # {{{ buffer -> buffer
+
                     if "src_origin" in kwargs:
+                        # rectangular
                         return _cl._enqueue_copy_buffer_rect(
                                 queue, src, dest, **kwargs)
                     else:
-                        kwargs["dst_offset"] = kwargs.pop("dest_offset", 0)
+                        # linear
+                        dest_offset = kwargs.pop("dest_offset", None)
+                        if dest_offset is not None:
+                            if "dst_offset" in kwargs:
+                                raise TypeError("may not specify both 'dst_offset' "
+                                                "and 'dest_offset'")
+
+                            warn("The 'dest_offset' argument of enqueue_copy "
+                                 "is deprecated. Use 'dst_offset' instead. "
+                                 "'dest_offset' will stop working in 2023.x.",
+                                 DeprecationWarning, stacklevel=2)
+
+                            kwargs["dst_offset"] = dest_offset
+
                         return _cl._enqueue_copy_buffer(queue, src, dest, **kwargs)
+
+                    # }}}
                 elif src.type in [mem_object_type.IMAGE2D, mem_object_type.IMAGE3D]:
                     return _cl._enqueue_copy_image_to_buffer(
                             queue, src, dest, **kwargs)
                 else:
                     raise ValueError("invalid src mem object type")
             else:
-                # assume from-host
+                # {{{ host -> buffer
+
                 if "buffer_origin" in kwargs:
                     return _cl._enqueue_write_buffer_rect(queue, dest, src, **kwargs)
                 else:
+                    device_offset = kwargs.pop("device_offset", None)
+                    if device_offset is not None:
+                        if "dst_offset" in kwargs:
+                            raise TypeError("may not specify both 'device_offset' "
+                                            "and 'dst_offset'")
+
+                        warn("The 'device_offset' argument of enqueue_copy "
+                                "is deprecated. Use 'dst_offset' instead. "
+                                "'dst_offset' will stop working in 2023.x.",
+                                DeprecationWarning, stacklevel=2)
+
+                        kwargs["dst_offset"] = device_offset
+
                     return _cl._enqueue_write_buffer(queue, dest, src, **kwargs)
 
+                # }}}
+
         elif dest.type in [mem_object_type.IMAGE2D, mem_object_type.IMAGE3D]:
+            # {{{ ... -> image
+
             if isinstance(src, MemoryObjectHolder):
                 if src.type == mem_object_type.BUFFER:
                     return _cl._enqueue_copy_buffer_to_image(
@@ -1776,17 +1955,28 @@ def enqueue_copy(queue, dest, src, **kwargs):
 
                 return _cl._enqueue_write_image(
                         queue, dest, origin, region, src, **kwargs)
+
+            # }}}
         else:
             raise ValueError("invalid dest mem object type")
 
-    elif get_cl_header_version() >= (2, 0) and isinstance(dest, SVM):
-        # to SVM
-        if not isinstance(src, SVM):
+    elif get_cl_header_version() >= (2, 0) and isinstance(dest, SVMPointer):
+        # {{{ ... ->  SVM
+
+        if not isinstance(src, SVMPointer):
             src = SVM(src)
 
         is_blocking = kwargs.pop("is_blocking", True)
+
+        # These are NOT documented. They only support consistency with the
+        # Buffer-based API for the sake of the Array.
+        assert kwargs.pop("src_offset", 0) == 0
+        assert kwargs.pop("dst_offset", 0) == 0
+
         return _cl._enqueue_svm_memcpy(queue, is_blocking, dest, src, **kwargs)
 
+        # }}}
+
     else:
         # assume to-host
 
@@ -1795,7 +1985,21 @@ def enqueue_copy(queue, dest, src, **kwargs):
                 if "buffer_origin" in kwargs:
                     return _cl._enqueue_read_buffer_rect(queue, src, dest, **kwargs)
                 else:
+                    device_offset = kwargs.pop("device_offset", None)
+                    if device_offset is not None:
+                        if "src_offset" in kwargs:
+                            raise TypeError("may not specify both 'device_offset' "
+                                            "and 'src_offset'")
+
+                        warn("The 'device_offset' argument of enqueue_copy "
+                                "is deprecated. Use 'src_offset' instead. "
+                                "'dst_offset' will stop working in 2023.x.",
+                                DeprecationWarning, stacklevel=2)
+
+                        kwargs["src_offset"] = device_offset
+
                     return _cl._enqueue_read_buffer(queue, src, dest, **kwargs)
+
             elif src.type in [mem_object_type.IMAGE2D, mem_object_type.IMAGE3D]:
                 origin = kwargs.pop("origin")
                 region = kwargs.pop("region")
@@ -1810,12 +2014,20 @@ def enqueue_copy(queue, dest, src, **kwargs):
                         queue, src, origin, region, dest, **kwargs)
             else:
                 raise ValueError("invalid src mem object type")
-        elif isinstance(src, SVM):
-            # from svm
+        elif isinstance(src, SVMPointer):
+            # {{{ svm -> host
+
             # dest is not a SVM instance, otherwise we'd be in the branch above
+
+            # This is NOT documented. They only support consistency with the
+            # Buffer-based API for the sake of the Array.
+            assert kwargs.pop("src_offset", 0) == 0
+
             is_blocking = kwargs.pop("is_blocking", True)
             return _cl._enqueue_svm_memcpy(
                     queue, is_blocking, SVM(dest), src, **kwargs)
+
+            # }}}
         else:
             # assume from-host
             raise TypeError("enqueue_copy cannot perform host-to-host transfers")
@@ -1823,6 +2035,28 @@ def enqueue_copy(queue, dest, src, **kwargs):
 # }}}
 
 
+# {{{ enqueue_fill
+
+def enqueue_fill(queue: CommandQueue,
+        dest: "Union[MemoryObjectHolder, SVMPointer]",
+        pattern: Any, size: int, *, offset: int = 0,
+        wait_for: Optional[Sequence[Event]] = None) -> Event:
+    """
+    .. versionadded:: 2022.2
+    """
+    if isinstance(dest, MemoryObjectHolder):
+        return enqueue_fill_buffer(queue, dest, pattern, offset, size, wait_for)
+    elif isinstance(dest, SVMPointer):
+        if offset:
+            raise NotImplementedError("enqueue_fill with SVM does not yet support "
+                    "offsets")
+        return enqueue_svm_memfill(queue, dest, pattern, size, wait_for)
+    else:
+        raise TypeError(f"enqueue_fill does not know how to fill '{type(dest)}'")
+
+# }}}
+
+
 # {{{ image creation
 
 DTYPE_TO_CHANNEL_TYPE = {
@@ -1928,7 +2162,6 @@ def enqueue_barrier(queue, wait_for=None):
 
 def enqueue_fill_buffer(queue, mem, pattern, offset, size, wait_for=None):
     if not (queue._get_cl_version() >= (1, 2) and get_cl_header_version() >= (1, 2)):
-        from warnings import warn
         warn("The context for this queue does not declare OpenCL 1.2 support, so "
                 "the next thing you might see is a crash")
 
@@ -1945,7 +2178,7 @@ def enqueue_fill_buffer(queue, mem, pattern, offset, size, wait_for=None):
 def enqueue_svm_memfill(queue, dest, pattern, byte_count=None, wait_for=None):
     """Fill shared virtual memory with a pattern.
 
-    :arg dest: a Python buffer object, optionally wrapped in an :class:`SVM` object
+    :arg dest: a Python buffer object, or any implementation of :class:`SVMPointer`.
     :arg pattern: a Python buffer object (e.g. a :class:`numpy.ndarray` with the
         fill pattern to be used.
     :arg byte_count: The size of the memory to be fill. Defaults to the
@@ -1956,17 +2189,17 @@ def enqueue_svm_memfill(queue, dest, pattern, byte_count=None, wait_for=None):
     .. versionadded:: 2016.2
     """
 
-    if not isinstance(dest, SVM):
+    if not isinstance(dest, SVMPointer):
         dest = SVM(dest)
 
     return _cl._enqueue_svm_memfill(
-            queue, dest, pattern, byte_count=None, wait_for=None)
+            queue, dest, pattern, byte_count=byte_count, wait_for=wait_for)
 
 
 def enqueue_svm_migratemem(queue, svms, flags, wait_for=None):
     """
     :arg svms: a collection of Python buffer objects (e.g. :mod:`numpy`
-        arrays), optionally wrapped in :class:`SVM` objects.
+        arrays), or any implementation of :class:`SVMPointer`.
     :arg flags: a combination of :class:`mem_migration_flags`
 
     |std-enqueue-blurb|
@@ -1976,15 +2209,10 @@ def enqueue_svm_migratemem(queue, svms, flags, wait_for=None):
     This function requires OpenCL 2.1.
     """
 
-    return _cl._enqueue_svm_migratemem(
-            queue,
-            [svm.mem if isinstance(svm, SVM) else svm
-                for svm in svms],
-            flags,
-            wait_for)
+    return _cl._enqueue_svm_migratemem(queue, svms, flags, wait_for)
 
 
-def svm_empty(ctx, flags, shape, dtype, order="C", alignment=None):
+def svm_empty(ctx, flags, shape, dtype, order="C", alignment=None, queue=None):
     """Allocate an empty :class:`numpy.ndarray` of the given *shape*, *dtype*
     and *order*. (See :func:`numpy.empty` for the meaning of these arguments.)
     The array will be allocated in shared virtual memory belonging
@@ -2002,6 +2230,10 @@ def svm_empty(ctx, flags, shape, dtype, order="C", alignment=None):
     will likely want to wrap the returned array in an :class:`SVM` tag.
 
     .. versionadded:: 2016.2
+
+    .. versionchanged:: 2022.2
+
+        *queue* argument added.
     """
 
     dtype = np.dtype(dtype)
@@ -2048,7 +2280,9 @@ def svm_empty(ctx, flags, shape, dtype, order="C", alignment=None):
     if alignment is None:
         alignment = itemsize
 
-    svm_alloc = SVMAllocation(ctx, nbytes, alignment, flags, _interface=interface)
+    svm_alloc = _OverriddenArrayInterfaceSVMAllocation(
+            ctx, nbytes, alignment, flags, _interface=interface,
+            queue=queue)
     return np.asarray(svm_alloc)
 
 
diff --git a/pyopencl/array.py b/pyopencl/array.py
index 15ed2bbbf..699466aa4 100644
--- a/pyopencl/array.py
+++ b/pyopencl/array.py
@@ -31,6 +31,7 @@
 from dataclasses import dataclass, field
 from typing import Any, List, Optional
 from functools import reduce
+from warnings import warn
 
 import numpy as np
 import pyopencl.elementwise as elementwise
@@ -49,6 +50,11 @@
 
 SCALAR_CLASSES = (Number, np.bool_, bool)
 
+if cl.get_cl_header_version() >= (2, 0):
+    _SVMPointer_or_nothing = cl.SVMPointer
+else:
+    _SVMPointer_or_nothing = ()
+
 
 _COMMON_DTYPE_CACHE = {}
 
@@ -130,7 +136,6 @@ class InconsistentOpenCLQueueWarning(UserWarning):
 
 class VecLookupWarner:
     def __getattr__(self, name):
-        from warnings import warn
         warn("pyopencl.array.vec is deprecated. "
              "Please use pyopencl.cltypes for OpenCL vector and scalar types",
              DeprecationWarning, 2)
@@ -226,7 +231,6 @@ def kernel_runner(*args, **kwargs):
 
 class DefaultAllocator(cl.tools.DeferredAllocator):
     def __init__(self, *args, **kwargs):
-        from warnings import warn
         warn("pyopencl.array.DefaultAllocator is deprecated. "
                 "It will be continue to exist throughout the 2013.x "
                 "versions of PyOpenCL.",
@@ -255,6 +259,7 @@ class _copy_queue:  # noqa
 
 _ARRAY_GET_SIZES_CACHE = {}
 _BOOL_DTYPE = np.dtype(np.int8)
+_NOT_PRESENT = object()
 
 
 class Array:
@@ -591,6 +596,16 @@ def __init__(self, cq, shape, dtype, order="C", allocator=None,
         self.context = context
         self._flags = _flags
 
+        if __debug__:
+            if queue is not None and isinstance(
+                    self.base_data, _SVMPointer_or_nothing):
+                mem_queue = getattr(self.base_data, "_queue", _NOT_PRESENT)
+                if mem_queue is not _NOT_PRESENT and mem_queue != queue:
+                    warn("Array has different queue from backing SVM memory. "
+                         "This may lead to the array getting deallocated sooner "
+                         "than expected, potentially leading to crashes.",
+                         InconsistentOpenCLQueueWarning, stacklevel=2)
+
     @property
     def ndim(self):
         return len(self.shape)
@@ -714,7 +729,6 @@ def set(self, ary, queue=None, async_=None, **kwargs):
             raise RuntimeError("cannot set from non-contiguous array")
 
         if not _equal_strides(ary.strides, self.strides, self.shape):
-            from warnings import warn
             warn("Setting array from one with different "
                     "strides/storage order. This will cease to work "
                     "in 2013.x.",
@@ -722,8 +736,9 @@ def set(self, ary, queue=None, async_=None, **kwargs):
 
         if self.size:
             event1 = cl.enqueue_copy(queue or self.queue, self.base_data, ary,
-                    device_offset=self.offset,
+                    dst_offset=self.offset,
                     is_blocking=not async_)
+
             self.add_event(event1)
 
     def _get(self, queue=None, ary=None, async_=None, **kwargs):
@@ -756,7 +771,6 @@ def _get(self, queue=None, ary=None, async_=None, **kwargs):
                 raise TypeError("'ary' has non-matching type")
 
             if self.shape != ary.shape:
-                from warnings import warn
                 warn("get() between arrays of different shape is deprecated "
                         "and will be removed in PyCUDA 2017.x",
                         DeprecationWarning, stacklevel=2)
@@ -772,8 +786,9 @@ def _get(self, queue=None, ary=None, async_=None, **kwargs):
 
         if self.size:
             event1 = cl.enqueue_copy(queue, ary, self.base_data,
-                    device_offset=self.offset,
+                    src_offset=self.offset,
                     wait_for=self.events, is_blocking=not async_)
+
             self.add_event(event1)
         else:
             event1 = None
@@ -806,7 +821,6 @@ def get(self, queue=None, ary=None, async_=None, **kwargs):
         """
 
         if async_:
-            from warnings import warn
             warn("calling pyopencl.Array.get with `async_=True` is deprecated. "
                     "Please use pyopencl.Array.get_async for asynchronous "
                     "device-to-host transfers",
@@ -877,7 +891,6 @@ def __repr__(self):
         if result[:5] == "array":
             result = f"cl.{type(self).__name__}" + result[5:]
         else:
-            from warnings import warn
             warn(f"{type(result).__name__}.__repr__ was expected to return a "
                      f"string starting with 'array', got '{result[:10]!r}'")
 
@@ -1459,8 +1472,8 @@ def _zero_fill(self, queue=None, wait_for=None):
         # https://github.com/inducer/pyopencl/issues/395
         if cl_version_gtr_1_2 and not (on_nvidia and self.nbytes >= 2**31):
             self.add_event(
-                    cl.enqueue_fill_buffer(queue, self.base_data, np.int8(0),
-                        self.offset, self.nbytes, wait_for=wait_for))
+                    cl.enqueue_fill(queue, self.base_data, np.int8(0),
+                        self.nbytes, offset=self.offset, wait_for=wait_for))
         else:
             zero = np.zeros((), self.dtype)
             self.fill(zero, queue=queue)
@@ -2121,7 +2134,7 @@ def setitem(self, subscript, value, queue=None, wait_for=None):
             if subarray.shape == value.shape and subarray.strides == value.strides:
                 self.add_event(
                         cl.enqueue_copy(queue, subarray.base_data,
-                            value, device_offset=subarray.offset, wait_for=wait_for))
+                            value, dst_offset=subarray.offset, wait_for=wait_for))
                 return
             else:
                 value = to_device(queue, value, self.allocator)
@@ -2675,7 +2688,6 @@ def concatenate(arrays, axis=0, queue=None, allocator=None):
         import builtins
         if builtins.any(type(ary) != type(arrays[0])  # noqa: E721
                         for ary in arrays[1:]):
-            from warnings import warn
             warn("Elements of 'arrays' not of the same type, returning "
                  "an instance of the type of arrays[0]",
                  stacklevel=2)
@@ -2744,7 +2756,6 @@ def hstack(arrays, queue=None):
         import builtins
         if builtins.any(type(ary) != type(arrays[0])  # noqa: E721
                         for ary in arrays[1:]):
-            from warnings import warn
             warn("Elements of 'arrays' not of the same type, returning "
                  "an instance of the type of arrays[0]",
                  stacklevel=2)
@@ -2804,7 +2815,6 @@ def stack(arrays, axis=0, queue=None):
         import builtins
         if builtins.any(type(ary) != type(arrays[0])  # noqa: E721
                         for ary in arrays[1:]):
-            from warnings import warn
             warn("Elements of 'arrays' not of the same type, returning "
                  "an instance of the type of arrays[0]",
                  stacklevel=2)
diff --git a/pyopencl/tools.py b/pyopencl/tools.py
index 27adac75b..fb4a91e14 100644
--- a/pyopencl/tools.py
+++ b/pyopencl/tools.py
@@ -1,4 +1,92 @@
-"""Various helpful bits and pieces without much of a common theme."""
+r"""
+.. _memory-pools:
+
+Memory Pools
+------------
+
+Memory allocation (e.g. in the form of the :func:`pyopencl.Buffer` constructor)
+can be expensive if used frequently. For example, code based on
+:class:`pyopencl.array.Array` can easily run into this issue because a fresh
+memory area is allocated for each intermediate result.  Memory pools are a
+remedy for this problem based on the observation that often many of the block
+allocations are of the same sizes as previously used ones.
+
+Then, instead of fully returning the memory to the system and incurring the
+associated reallocation overhead, the pool holds on to the memory and uses it
+to satisfy future allocations of similarly-sized blocks. The pool reacts
+appropriately to out-of-memory conditions as long as all memory allocations
+are made through it. Allocations performed from outside of the pool may run
+into spurious out-of-memory conditions due to the pool owning much or all of
+the available memory.
+
+There are two flavors of allocators and memory pools:
+
+- :ref:`buf-mempool`
+- :ref:`svm-mempool`
+
+Using :class:`pyopencl.array.Array`\ s can be used with memory pools in a
+straightforward manner::
+
+    mem_pool = pyopencl.tools.MemoryPool(pyopencl.tools.ImmediateAllocator(queue))
+    a_dev = cl_array.arange(queue, 2000, dtype=np.float32, allocator=mem_pool)
+
+Likewise, SVM-based allocators are directly usable with
+:class:`pyopencl.array.Array`.
+
+.. _buf-mempool:
+
+:class:`~pyopencl.Buffer`-based Allocators and Memory Pools
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+.. autoclass:: PooledBuffer
+
+.. autoclass:: AllocatorBase
+
+.. autoclass:: DeferredAllocator
+
+.. autoclass:: ImmediateAllocator
+
+.. autoclass:: MemoryPool
+
+.. _svm-mempool:
+
+:ref:`SVM <svm>`-Based Allocators and Memory Pools
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+SVM functionality requires OpenCL 2.0.
+
+.. autoclass:: PooledSVM
+
+.. autoclass:: SVMAllocator
+
+.. autoclass:: SVMPool
+
+CL-Object-dependent Caching
+---------------------------
+
+.. autofunction:: first_arg_dependent_memoize
+.. autofunction:: clear_first_arg_caches
+
+Testing
+-------
+
+.. autofunction:: pytest_generate_tests_for_pyopencl
+
+Device Characterization
+-----------------------
+
+.. automodule:: pyopencl.characterize
+    :members:
+
+Type aliases
+------------
+
+.. currentmodule:: pyopencl._cl
+
+.. class:: AllocatorBase
+
+   See :class:`pyopencl.tools.AllocatorBase`.
+"""
 
 
 __copyright__ = "Copyright (C) 2010 Andreas Kloeckner"
@@ -33,7 +121,7 @@
 
 import numpy as np
 from pytools import memoize, memoize_method
-from pyopencl._cl import bitlog2  # noqa: F401
+from pyopencl._cl import bitlog2, get_cl_header_version  # noqa: F401
 from pytools.persistent_dict import KeyBuilder as KeyBuilderBase
 
 import re
@@ -59,10 +147,293 @@ def _register_types():
 # {{{ imported names
 
 from pyopencl._cl import (  # noqa
-        PooledBuffer as PooledBuffer,
-        _tools_DeferredAllocator as DeferredAllocator,
-        _tools_ImmediateAllocator as ImmediateAllocator,
-        MemoryPool as MemoryPool)
+        PooledBuffer, AllocatorBase, DeferredAllocator,
+        ImmediateAllocator, MemoryPool,
+        )
+
+
+if get_cl_header_version() >= (2, 0):
+    from pyopencl._cl import (  # noqa
+            SVMPool,
+            PooledSVM,
+            SVMAllocator,
+            )
+
+# }}}
+
+
+# {{{ monkeypatch docstrings into imported interfaces
+
+_MEMPOOL_IFACE_DOCS = """
+.. note::
+
+    The current implementation of the memory pool will retain allocated
+    memory after it is returned by the application and keep it in a bin
+    identified by the leading *leading_bits_in_bin_id* bits of the
+    allocation size. To ensure that allocations within each bin are
+    interchangeable, allocation sizes are rounded up to the largest size
+    that shares the leading bits of the requested allocation size.
+
+    The current default value of *leading_bits_in_bin_id* is
+    four, but this may change in future versions and is not
+    guaranteed.
+
+    *leading_bits_in_bin_id* must be passed by keyword,
+    and its role is purely advisory. It is not guaranteed
+    that future versions of the pool will use the
+    same allocation scheme and/or honor *leading_bits_in_bin_id*.
+
+.. attribute:: held_blocks
+
+    The number of unused blocks being held by this pool.
+
+.. attribute:: active_blocks
+
+    The number of blocks in active use that have been allocated
+    through this pool.
+
+.. attribute:: managed_bytes
+
+    "Managed" memory is "active" and "held" memory.
+
+    .. versionadded:: 2021.1.2
+
+.. attribute:: active_bytes
+
+    "Active" bytes are bytes under the control of the application.
+    This may be smaller than the actual allocated size reflected
+    in :attr:`managed_bytes`.
+
+    .. versionadded:: 2021.1.2
+
+
+.. method:: free_held
+
+    Free all unused memory that the pool is currently holding.
+
+.. method:: stop_holding
+
+    Instruct the memory to start immediately freeing memory returned
+    to it, instead of holding it for future allocations.
+    Implicitly calls :meth:`free_held`.
+    This is useful as a cleanup action when a memory pool falls out
+    of use.
+"""
+
+
+def _monkeypatch_docstrings():
+
+    PooledBuffer.__doc__ = """
+    An object representing a :class:`MemoryPool`-based allocation of
+    :class:`~pyopencl.Buffer`-style device memory.  Analogous to
+    :class:`~pyopencl.Buffer`, however once this object is deleted, its
+    associated device memory is returned to the pool.
+
+    Is a :class:`pyopencl.MemoryObject`.
+    """
+
+    AllocatorBase.__doc__ = """
+    An interface implemented by various memory allocation functions
+    in :mod:`pyopencl`.
+
+    .. automethod:: __call__
+
+        Allocate and return a :class:`pyopencl.Buffer` of the given *size*.
+    """
+
+    # {{{ DeferredAllocator
+
+    DeferredAllocator.__doc__ = """
+    *mem_flags* takes its values from :class:`pyopencl.mem_flags` and corresponds
+    to the *flags* argument of :class:`pyopencl.Buffer`. DeferredAllocator
+    has the same semantics as regular OpenCL buffer allocation, i.e. it may
+    promise memory to be available that may (in any call to a buffer-using
+    CL function) turn out to not exist later on. (Allocations in CL are
+    bound to contexts, not devices, and memory availability depends on which
+    device the buffer is used with.)
+
+    Implements :class:`AllocatorBase`.
+
+    .. versionchanged :: 2013.1
+
+        ``CLAllocator`` was deprecated and replaced
+        by :class:`DeferredAllocator`.
+
+    .. method::  __init__(context, mem_flags=pyopencl.mem_flags.READ_WRITE)
+
+    .. automethod:: __call__
+
+        Allocate a :class:`pyopencl.Buffer` of the given *size*.
+
+        .. versionchanged :: 2020.2
+
+            The allocator will succeed even for allocations of size zero,
+            returning *None*.
+    """
+
+    # }}}
+
+    # {{{ ImmediateAllocator
+
+    ImmediateAllocator.__doc__ = """
+    *mem_flags* takes its values from :class:`pyopencl.mem_flags` and corresponds
+    to the *flags* argument of :class:`pyopencl.Buffer`.
+    :class:`ImmediateAllocator` will attempt to ensure at allocation time that
+    allocated memory is actually available. If no memory is available, an
+    out-of-memory error is reported at allocation time.
+
+    Implements :class:`AllocatorBase`.
+
+    .. versionadded:: 2013.1
+
+    .. method:: __init__(queue, mem_flags=pyopencl.mem_flags.READ_WRITE)
+
+    .. automethod:: __call__
+
+        Allocate a :class:`pyopencl.Buffer` of the given *size*.
+
+        .. versionchanged :: 2020.2
+
+            The allocator will succeed even for allocations of size zero,
+            returning *None*.
+    """
+
+    # }}}
+
+    # {{{ MemoryPool
+
+    MemoryPool.__doc__ = """
+    A memory pool for OpenCL device memory in :class:`pyopencl.Buffer` form.
+    *allocator* must be an instance of one of the above classes, and should be
+    an :class:`ImmediateAllocator`.  The memory pool assumes that allocation
+    failures are reported by the allocator immediately, and not in the
+    OpenCL-typical deferred manner.
+
+    Implements :class:`AllocatorBase`.
+
+    .. versionchanged:: 2019.1
+
+        Current bin allocation behavior documented, *leading_bits_in_bin_id*
+        added.
+
+    .. automethod:: __init__
+
+    .. automethod:: allocate
+
+        Return a :class:`PooledBuffer` of the given *size*.
+
+    .. automethod:: __call__
+
+        Synonym for :meth:`allocate` to match :class:`AllocatorBase`.
+
+        .. versionadded:: 2011.2
+    """ + _MEMPOOL_IFACE_DOCS
+
+    # }}}
+
+
+_monkeypatch_docstrings()
+
+
+def _monkeypatch_svm_docstrings():
+    # {{{ PooledSVM
+
+    PooledSVM.__doc__ = """
+    An object representing a :class:`SVMPool`-based allocation of
+    :ref:`svm`.  Analogous to :class:`~pyopencl.SVMAllocation`, however once
+    this object is deleted, its associated device memory is returned to the
+    pool from which it came.
+
+    .. versionadded:: 2022.2
+
+    .. note::
+
+        If the :class:`SVMAllocator` for the :class:`SVMPool` that allocated an
+        object of this type is associated with an (in-order)
+        :class:`~pyopencl.CommandQueue`, sufficient synchronization is provided
+        to ensure operations enqueued before deallocation complete before
+        operations from a different use (possibly in a different queue) are
+        permitted to start. This applies when :class:`release` is called and
+        also when the object is freed automatically by the garbage collector.
+
+    Is a :class:`pyopencl.SVMPointer`.
+
+    Supports structural equality and hashing.
+
+    .. automethod:: release
+
+        Return the held memory to the pool. See the note about synchronization
+        behavior during deallocation above.
+
+    .. automethod:: enqueue_release
+
+        Synonymous to :meth;`release`, for consistency with
+        :class:`~pyopencl.SVMAllocation`. Note that, unlike
+        :meth:`pyopencl.SVMAllocation.enqueue_release`, specifying a queue
+        or events to be waited for is not supported.
+
+    .. automethod:: bind_to_queue
+
+        Analogous to :meth:`pyopencl.SVMAllocation.bind_to_queue`.
+
+    .. automethod:: unbind_from_queue
+
+        Analogous to :meth:`pyopencl.SVMAllocation.unbind_from_queue`.
+    """
+
+    # }}}
+
+    # {{{ SVMAllocator
+
+    SVMAllocator.__doc__ = """
+    .. versionadded:: 2022.2
+
+    .. automethod:: __init__
+
+        :arg flags: See :class:`~pyopencl.svm_mem_flags`.
+        :arg queue: If not specified, allocations will be freed
+            eagerly, irrespective of whether pending/enqueued operations
+            are still using the memory.
+
+            If specified, deallocation of memory will be enqueued
+            with the given queue, and will only be performed
+            after previously-enqueue operations in the queue have
+            completed.
+
+            It is an error to specify an out-of-order queue.
+
+            .. warning::
+
+                Not specifying a queue will typically lead to undesired
+                behavior, including crashes and memory corruption.
+                See the warning in :ref:`svm`.
+
+    .. automethod:: __call__
+
+        Return a :class:`~pyopencl.SVMAllocation` of the given *size*.
+    """
+
+    # }}}
+
+    # {{{ SVMPool
+
+    SVMPool.__doc__ = """
+    A memory pool for OpenCL device memory in :ref:`SVM <svm>` form.
+    *allocator* must be an instance of :class:`SVMAllocator`.
+
+    .. versionadded:: 2022.2
+
+    .. automethod:: __init__
+    .. automethod:: __call__
+
+        Return a :class:`PooledSVM` of the given *size*.
+    """ + _MEMPOOL_IFACE_DOCS
+
+    # }}}
+
+
+if get_cl_header_version() >= (2, 0):
+    _monkeypatch_svm_docstrings()
 
 # }}}
 
@@ -310,6 +681,22 @@ def idfn(val):
 
 
 def pytest_generate_tests_for_pyopencl(metafunc):
+    """Using the line::
+
+        from pyopencl.tools import pytest_generate_tests_for_pyopencl
+                as pytest_generate_tests
+
+    in your `pytest <http://pytest.org>`_ test scripts allows you to use the
+    arguments *ctx_factory*, *device*, or *platform* in your test functions,
+    and they will automatically be run for each OpenCL device/platform in the
+    system, as appropriate.
+
+    The following two environment variabls is also supported to control
+    device/platform choice::
+
+        PYOPENCL_TEST=0:0,1;intel=i5,i7
+    """
+
     arg_names = get_pyopencl_fixture_arg_names(metafunc)
     if not arg_names:
         return
@@ -605,7 +992,7 @@ def match_dtype_to_c_struct(device, name, dtype, context=None):
     the given *device* to ensure that :mod:`numpy` and C offsets and
     sizes match.)
 
-    .. versionadded: 2013.1
+    .. versionadded:: 2013.1
 
     This example explains the use of this function::
 
diff --git a/pyopencl/version.py b/pyopencl/version.py
index d4fcac4f0..41bb1ecc8 100644
--- a/pyopencl/version.py
+++ b/pyopencl/version.py
@@ -1,3 +1,3 @@
-VERSION = (2022, 1, 6)
+VERSION = (2022, 2)
 VERSION_STATUS = ""
 VERSION_TEXT = ".".join(str(x) for x in VERSION) + VERSION_STATUS
diff --git a/src/mempool.hpp b/src/mempool.hpp
index 44f0fd643..a0eca827e 100644
--- a/src/mempool.hpp
+++ b/src/mempool.hpp
@@ -102,7 +102,7 @@ namespace PYGPU_PACKAGE
       container_t m_container;
       typedef typename container_t::value_type bin_pair_t;
 
-      std::unique_ptr<Allocator> m_allocator;
+      std::shared_ptr<Allocator> m_allocator;
 
       // A held block is one that's been released by the application, but that
       // we are keeping around to dish out again.
@@ -125,8 +125,8 @@ namespace PYGPU_PACKAGE
       unsigned m_leading_bits_in_bin_id;
 
     public:
-      memory_pool(Allocator const &alloc=Allocator(), unsigned leading_bits_in_bin_id=4)
-        : m_allocator(alloc.copy()),
+      memory_pool(std::shared_ptr<Allocator> alloc, unsigned leading_bits_in_bin_id=4)
+        : m_allocator(alloc),
         m_held_blocks(0), m_active_blocks(0),
         m_managed_bytes(0), m_active_bytes(0),
         m_stop_holding(false),
@@ -233,7 +233,8 @@ namespace PYGPU_PACKAGE
             std::cout
               << "[pool] allocation of size " << size << " served from bin " << bin_nr
               << " which contained " << bin.size() << " entries" << std::endl;
-          return pop_block_from_bin(bin, size);
+          return m_allocator->hand_out_existing_block(
+              pop_block_from_bin(bin, size));
         }
 
          size_type alloc_sz = alloc_size(bin_nr);
@@ -256,7 +257,8 @@ namespace PYGPU_PACKAGE
 
         m_allocator->try_release_blocks();
         if (bin.size())
-          return pop_block_from_bin(bin, size);
+          return m_allocator->hand_out_existing_block(
+              pop_block_from_bin(bin, size));
 
         if (m_trace)
           std::cout << "[pool] allocation still OOM after GC" << std::endl;
@@ -282,7 +284,7 @@ namespace PYGPU_PACKAGE
             "failed to free memory for allocation");
       }
 
-      void free(pointer_type p, size_type size)
+      void free(pointer_type &&p, size_type size)
       {
         --m_active_blocks;
         m_active_bytes -= size;
@@ -291,7 +293,7 @@ namespace PYGPU_PACKAGE
         if (!m_stop_holding)
         {
           inc_held_blocks();
-          get_bin(bin_nr).push_back(p);
+          get_bin(bin_nr).push_back(std::move(p));
 
           if (m_trace)
             std::cout << "[pool] block of size " << size << " returned to bin "
@@ -300,7 +302,7 @@ namespace PYGPU_PACKAGE
         }
         else
         {
-          m_allocator->free(p);
+          m_allocator->free(std::move(p));
           m_managed_bytes -= alloc_size(bin_nr);
         }
       }
@@ -313,7 +315,7 @@ namespace PYGPU_PACKAGE
 
           while (bin.size())
           {
-            m_allocator->free(bin.back());
+            m_allocator->free(std::move(bin.back()));
             m_managed_bytes -= alloc_size(bin_pair.first);
             bin.pop_back();
 
@@ -353,7 +355,7 @@ namespace PYGPU_PACKAGE
 
           if (bin.size())
           {
-            m_allocator->free(bin.back());
+            m_allocator->free(std::move(bin.back()));
             m_managed_bytes -= alloc_size(bin_pair.first);
             bin.pop_back();
 
@@ -379,7 +381,7 @@ namespace PYGPU_PACKAGE
 
       pointer_type pop_block_from_bin(bin_t &bin, size_type size)
       {
-        pointer_type result = bin.back();
+        pointer_type result(std::move(bin.back()));
         bin.pop_back();
 
         dec_held_blocks();
@@ -399,7 +401,7 @@ namespace PYGPU_PACKAGE
       typedef typename Pool::pointer_type pointer_type;
       typedef typename Pool::size_type size_type;
 
-    private:
+    protected:
       PYGPU_SHARED_PTR<pool_type> m_pool;
 
       pointer_type m_ptr;
@@ -421,7 +423,7 @@ namespace PYGPU_PACKAGE
       {
         if (m_valid)
         {
-          m_pool->free(m_ptr, m_size);
+          m_pool->free(std::move(m_ptr), m_size);
           m_valid = false;
         }
         else
@@ -435,16 +437,8 @@ namespace PYGPU_PACKAGE
 #endif
               );
       }
-
-      pointer_type ptr() const
-      { return m_ptr; }
-
-      size_type size() const
-      { return m_size; }
   };
 }
 
 
-
-
 #endif
diff --git a/src/wrap_cl.hpp b/src/wrap_cl.hpp
index 98964056f..413b8452b 100644
--- a/src/wrap_cl.hpp
+++ b/src/wrap_cl.hpp
@@ -137,10 +137,8 @@
 // }}}
 
 
+// {{{ macros and typedefs for wrappers
 
-
-
-// {{{ tools
 #if PY_VERSION_HEX >= 0x02050000
   typedef Py_ssize_t PYOPENCL_BUFFER_SIZE_T;
 #else
@@ -229,8 +227,6 @@
     }
 
 
-
-
 #define PYOPENCL_RETRY_IF_MEM_ERROR(OPERATION) \
   { \
     bool failed_with_mem_error = false; \
@@ -260,8 +256,20 @@
     } \
   }
 
+
+#define PYOPENCL_GET_SVM_SIZE(NAME) \
+  size_t NAME##_size; \
+  bool NAME##_has_size = false; \
+  try \
+  { \
+    NAME##_size = NAME.size(); \
+    NAME##_has_size = true; \
+  } \
+  catch (size_not_available)  { }
+
 // }}}
 
+
 // {{{ tracing and error reporting
 #ifdef PYOPENCL_TRACE
   #define PYOPENCL_PRINT_CALL_TRACE(NAME) \
@@ -329,6 +337,7 @@
 
 // }}}
 
+
 // {{{ get_info helpers
 #define PYOPENCL_GET_OPAQUE_INFO(WHAT, FIRST_ARG, SECOND_ARG, CL_TYPE, TYPE) \
   { \
@@ -383,6 +392,7 @@
 
 // }}}
 
+
 // {{{ event helpers --------------------------------------------------------------
 #define PYOPENCL_PARSE_WAIT_FOR \
     cl_uint num_events_in_wait_list = 0; \
@@ -424,7 +434,9 @@
 
 // }}}
 
+
 // {{{ equality testing
+
 #define PYOPENCL_EQUALITY_TESTS(cls) \
     bool operator==(cls const &other) const \
     { return data() == other.data(); } \
@@ -432,8 +444,8 @@
     { return data() != other.data(); } \
     long hash() const \
     { return (long) (intptr_t) data(); }
-// }}}
 
+// }}}
 
 
 namespace pyopencl
@@ -496,6 +508,19 @@ namespace pyopencl
   // }}}
 
 
+  // {{{ utility functions
+
+  inline bool is_queue_out_of_order(cl_command_queue queue)
+  {
+      cl_command_queue_properties param_value;
+      PYOPENCL_CALL_GUARDED(clGetCommandQueueInfo,
+          (queue, CL_QUEUE_PROPERTIES, sizeof(param_value), &param_value, 0));
+      return param_value & CL_QUEUE_OUT_OF_ORDER_EXEC_MODE_ENABLE;
+  }
+
+  // }}}
+
+
   // {{{ buffer interface helper
 
 
@@ -1655,6 +1680,101 @@ namespace pyopencl
   // }}}
 
 
+  // {{{ command_queue_ref
+
+  // In contrast to command_queue, command_queue_ref is "nullable", i.e.
+  // it is a RAII *optional* reference to a command queue.
+
+  class command_queue_ref
+  {
+    private:
+      bool m_valid;
+      cl_command_queue m_queue;
+
+    public:
+      command_queue_ref()
+        : m_valid(false)
+      {}
+
+      command_queue_ref(cl_command_queue queue)
+        : m_valid(queue != nullptr), m_queue(queue)
+      {
+        // E.g. SVM allocations of size zero use a NULL queue. Tolerate that.
+        if (m_valid)
+          PYOPENCL_CALL_GUARDED(clRetainCommandQueue, (m_queue));
+      }
+
+      command_queue_ref(command_queue_ref &&src) noexcept
+        : m_valid(src.m_valid), m_queue(src.m_queue)
+      {
+        src.m_valid = false;
+      }
+
+      command_queue_ref(const command_queue_ref &src)
+      : m_valid(src.m_valid), m_queue(src.m_queue)
+      {
+        // Note that there isn't anything per se wrong with this
+        // copy constructor, the refcounting is just potentially
+        // expensive.
+        //
+        // All code in current use moves these, it does not copy them,
+        // so this should never get called.
+        //
+        // Unfortunately, we can't delete this copy constructor,
+        // because we would like to return these from functions.
+        // This makes at least gcc require copy constructors, even
+        // if those are never called due to NRVO.
+        std::cerr << "COPYING A COMMAND_QUEUE_REF." << std::endl;
+
+        if (m_valid)
+          PYOPENCL_CALL_GUARDED(clRetainCommandQueue, (m_queue));
+      }
+
+      command_queue_ref &operator=(const command_queue_ref &) = delete;
+
+      ~command_queue_ref()
+      {
+        reset();
+      }
+
+      bool is_valid() const
+      {
+        return m_valid;
+      }
+
+      cl_command_queue data() const
+      {
+        if (m_valid)
+          return m_queue;
+        else
+          throw error("command_queue_ref.data", CL_INVALID_VALUE,
+              "command_queue_ref is not valid");
+      }
+
+      void reset()
+      {
+        if (m_valid)
+          PYOPENCL_CALL_GUARDED_CLEANUP(clReleaseCommandQueue, (m_queue));
+        m_valid = false;
+      }
+
+      void set(cl_command_queue queue)
+      {
+        if (!queue)
+          throw error("command_queue_ref.set", CL_INVALID_VALUE,
+              "cannot set to NULL command queue");
+
+        if (m_valid)
+          PYOPENCL_CALL_GUARDED(clReleaseCommandQueue, (m_queue));
+        m_queue = queue;
+        PYOPENCL_CALL_GUARDED(clRetainCommandQueue, (m_queue));
+        m_valid = true;
+      }
+  };
+
+  // }}}
+
+
   // {{{ event/synchronization
 
   class event : noncopyable
@@ -2046,6 +2166,9 @@ namespace pyopencl
       }
 
       py::object get_info(cl_mem_info param_name) const;
+
+      virtual ~memory_object_holder()
+      { }
   };
 
 
@@ -2093,7 +2216,7 @@ namespace pyopencl
         m_valid = false;
       }
 
-      virtual ~memory_object()
+      ~memory_object()
       {
         if (m_valid)
           release();
@@ -2337,7 +2460,7 @@ namespace pyopencl
       command_queue &cq,
       memory_object_holder &mem,
       py::object buffer,
-      size_t device_offset,
+      size_t src_offset,
       py::object py_wait_for,
       bool is_blocking)
   {
@@ -2361,7 +2484,7 @@ namespace pyopencl
             queue,
             mem.data(),
             PYOPENCL_CAST_BOOL(is_blocking),
-            device_offset, len, buf,
+            src_offset, len, buf,
             PYOPENCL_WAITLIST_ARGS, &evt
             ))
       );
@@ -2376,7 +2499,7 @@ namespace pyopencl
       command_queue &cq,
       memory_object_holder &mem,
       py::object buffer,
-      size_t device_offset,
+      size_t dst_offset,
       py::object py_wait_for,
       bool is_blocking)
   {
@@ -2400,7 +2523,7 @@ namespace pyopencl
             queue,
             mem.data(),
             PYOPENCL_CAST_BOOL(is_blocking),
-            device_offset, len, buf,
+            dst_offset, len, buf,
             PYOPENCL_WAITLIST_ARGS, &evt
             ))
       );
@@ -3441,11 +3564,28 @@ namespace pyopencl
   // }}}
 
 
-  // {{{ svm
-
 #if PYOPENCL_CL_VERSION >= 0x2000
 
-  class svm_arg_wrapper
+  // {{{ svm pointer
+
+  class size_not_available { };
+
+  class svm_pointer
+  {
+    public:
+      virtual void *svm_ptr() const = 0;
+      // may throw size_not_available
+      virtual size_t size() const = 0;
+      virtual ~svm_pointer()
+      { }
+  };
+
+  // }}}
+
+
+  // {{{ svm_arg_wrapper
+
+  class svm_arg_wrapper : public svm_pointer
   {
     private:
       void *m_ptr;
@@ -3468,7 +3608,7 @@ namespace pyopencl
         m_size = ward->m_buf.len;
       }
 
-      void *ptr() const
+      void *svm_ptr() const
       {
         return m_ptr;
       }
@@ -3478,26 +3618,71 @@ namespace pyopencl
       }
   };
 
+  // }}}
+
+
+  // {{{ svm_allocation
 
-  class svm_allocation : noncopyable
+  class svm_allocation : public svm_pointer
   {
     private:
       std::shared_ptr<context> m_context;
       void *m_allocation;
+      size_t m_size;
+      command_queue_ref m_queue;
+      // FIXME Should maybe also allow keeping a list of events so that we can
+      // wait for users to finish in the case of out-of-order queues.
 
     public:
-      svm_allocation(std::shared_ptr<context> const &ctx, size_t size, cl_uint alignment, cl_svm_mem_flags flags)
-        : m_context(ctx)
+      svm_allocation(std::shared_ptr<context> const &ctx, size_t size, cl_uint alignment,
+          cl_svm_mem_flags flags, const command_queue *queue = nullptr)
+        : m_context(ctx), m_size(size)
       {
-        PYOPENCL_PRINT_CALL_TRACE("clSVMalloc");
-        m_allocation = clSVMAlloc(
-            ctx->data(),
-            flags, size, alignment);
+        if (queue)
+        {
+          m_queue.set(queue->data());
+          if (is_queue_out_of_order(m_queue.data()))
+            throw error("SVMAllocation.__init__", CL_INVALID_VALUE,
+                "supplying an out-of-order queue to SVMAllocation is invalid");
+        }
+
+        int try_count = 0;
+        while (try_count < 2)
+        {
+          PYOPENCL_PRINT_CALL_TRACE("clSVMalloc");
+          m_allocation = clSVMAlloc(
+              ctx->data(),
+              flags, size, alignment);
+          if (m_allocation)
+            return;
+
+          ++try_count;
+          run_python_gc();
+        }
 
         if (!m_allocation)
           throw pyopencl::error("clSVMAlloc", CL_OUT_OF_RESOURCES);
       }
 
+      svm_allocation(std::shared_ptr<context> const &ctx, void *allocation, size_t size,
+           const cl_command_queue queue)
+        : m_context(ctx), m_allocation(allocation), m_size(size)
+      {
+        if (queue)
+        {
+          if (is_queue_out_of_order(queue))
+          {
+            release();
+            throw error("SVMAllocation.__init__", CL_INVALID_VALUE,
+                "supplying an out-of-order queue to SVMAllocation is invalid");
+          }
+          m_queue.set(queue);
+        }
+      }
+
+      svm_allocation(const svm_allocation &) = delete;
+      svm_allocation &operator=(const svm_allocation &) = delete;
+
       ~svm_allocation()
       {
         if (m_allocation)
@@ -3510,36 +3695,62 @@ namespace pyopencl
           throw error("SVMAllocation.release", CL_INVALID_VALUE,
               "trying to double-unref svm allocation");
 
-        clSVMFree(m_context->data(), m_allocation);
+        if (m_queue.is_valid())
+        {
+          PYOPENCL_CALL_GUARDED_CLEANUP(clEnqueueSVMFree, (
+                m_queue.data(), 1, &m_allocation,
+                nullptr, nullptr,
+                0, nullptr, nullptr));
+          m_queue.reset();
+        }
+        else
+        {
+          PYOPENCL_PRINT_CALL_TRACE("clSVMFree");
+          clSVMFree(m_context->data(), m_allocation);
+        }
         m_allocation = nullptr;
       }
 
-      void enqueue_release(command_queue &queue, py::object py_wait_for)
+      event *enqueue_release(command_queue *queue, py::object py_wait_for)
       {
         PYOPENCL_PARSE_WAIT_FOR;
 
         if (!m_allocation)
-          throw error("SVMAllocation.release", CL_INVALID_VALUE,
-              "trying to double-unref svm allocation");
+          throw error("SVMAllocation.enqueue_release", CL_INVALID_VALUE,
+              "trying to enqueue_release on an already-freed allocation");
+
+        cl_command_queue use_queue;
+        if (queue)
+          use_queue = queue->data();
+        else
+        {
+          if (m_queue.is_valid())
+            use_queue = m_queue.data();
+          else
+            throw error("SVMAllocation.enqueue_release", CL_INVALID_VALUE,
+                "no implicit queue available, must be provided explicitly");
+        }
 
         cl_event evt;
 
         PYOPENCL_CALL_GUARDED_CLEANUP(clEnqueueSVMFree, (
-              queue.data(), 1, &m_allocation,
+              use_queue, 1, &m_allocation,
               nullptr, nullptr,
               PYOPENCL_WAITLIST_ARGS, &evt));
 
         m_allocation = nullptr;
+
+        PYOPENCL_RETURN_NEW_EVENT(evt);
       }
 
-      void *ptr() const
+      void *svm_ptr() const
       {
         return m_allocation;
       }
 
-      intptr_t ptr_as_int() const
+      size_t size() const
       {
-        return (intptr_t) m_allocation;
+        return m_size;
       }
 
       bool operator==(svm_allocation const &other) const
@@ -3551,22 +3762,108 @@ namespace pyopencl
       {
         return m_allocation != other.m_allocation;
       }
+
+      void bind_to_queue(command_queue const &queue)
+      {
+        if (is_queue_out_of_order(queue.data()))
+          throw error("SVMAllocation.bind_to_queue", CL_INVALID_VALUE,
+              "supplying an out-of-order queue to SVMAllocation is invalid");
+
+        if (m_queue.is_valid())
+        {
+          if (m_queue.data() != queue.data())
+          {
+            // make sure synchronization promises stay valid in new queue
+            cl_event evt;
+
+            PYOPENCL_CALL_GUARDED(clEnqueueMarker, (m_queue.data(), &evt));
+            PYOPENCL_CALL_GUARDED(clEnqueueMarkerWithWaitList,
+                (queue.data(), 1, &evt, nullptr));
+          }
+        }
+
+        m_queue.set(queue.data());
+      }
+
+      void unbind_from_queue()
+      {
+        if (m_queue.is_valid())
+          PYOPENCL_CALL_GUARDED_THREADED(clFinish, (m_queue.data()));
+
+        m_queue.reset();
+      }
+
+      // only use for testing/diagnostic/debugging purposes!
+      cl_command_queue queue() const
+      {
+        if (m_queue.is_valid())
+          return m_queue.data();
+        else
+          return nullptr;
+      }
   };
 
+  // }}}
+
+
+  // {{{ svm operations
 
   inline
   event *enqueue_svm_memcpy(
       command_queue &cq,
       cl_bool is_blocking,
-      svm_arg_wrapper &dst, svm_arg_wrapper &src,
-      py::object py_wait_for
+      svm_pointer &dst, svm_pointer &src,
+      py::object py_wait_for,
+      py::object byte_count_py
       )
   {
     PYOPENCL_PARSE_WAIT_FOR;
 
-    if (src.size() != dst.size())
+    // {{{ process size
+
+    PYOPENCL_GET_SVM_SIZE(src);
+    PYOPENCL_GET_SVM_SIZE(dst);
+
+    size_t size;
+    bool have_size = false;
+
+    if (src_has_size)
+    {
+      size = src_size;
+      have_size = true;
+    }
+    if (dst_has_size)
+    {
+      if (have_size)
+      {
+        if (!byte_count_py.is_none())
+          size = std::min(size, dst_size);
+        else if (size != dst_size)
+          throw error("_enqueue_svm_memcpy", CL_INVALID_VALUE,
+              "sizes of source and destination buffer do not match");
+      }
+      else
+      {
+        size = dst_size;
+        have_size = true;
+      }
+    }
+
+    if (!byte_count_py.is_none())
+    {
+      size_t byte_count = byte_count_py.cast<size_t>();
+      if (have_size && byte_count > size)
+        throw error("_enqueue_svm_memcpy", CL_INVALID_VALUE,
+            "specified byte_count larger than size of source or destination buffers");
+      size = byte_count;
+      have_size = true;
+    }
+
+    if (!have_size)
       throw error("_enqueue_svm_memcpy", CL_INVALID_VALUE,
-          "sizes of source and destination buffer do not match");
+          "size not passed and could not be determined");
+
+    // }}}
 
     cl_event evt;
     PYOPENCL_CALL_GUARDED(
@@ -3574,8 +3871,8 @@ namespace pyopencl
         (
           cq.data(),
           is_blocking,
-          dst.ptr(), src.ptr(),
-          dst.size(),
+          dst.svm_ptr(), src.svm_ptr(),
+          size,
           PYOPENCL_WAITLIST_ARGS,
           &evt
         ));
@@ -3587,7 +3884,7 @@ namespace pyopencl
   inline
   event *enqueue_svm_memfill(
       command_queue &cq,
-      svm_arg_wrapper &dst, py::object py_pattern,
+      svm_pointer &dst, py::object py_pattern,
       py::object byte_count,
       py::object py_wait_for
       )
@@ -3604,18 +3901,41 @@ namespace pyopencl
     pattern_ptr = pattern_ward->m_buf.buf;
     pattern_len = pattern_ward->m_buf.len;
 
-    size_t fill_size = dst.size();
+    // {{{ process size
+
+    PYOPENCL_GET_SVM_SIZE(dst);
+
+    size_t size;
+    bool have_size = false;
+    if (dst_has_size)
+    {
+      size = dst_size;
+      have_size = true;
+    }
     if (!byte_count.is_none())
-      fill_size = py::cast<size_t>(byte_count);
+    {
+      size_t user_size = py::cast<size_t>(byte_count);
+      if (have_size && user_size > size)
+        throw error("enqueue_svm_memfill", CL_INVALID_VALUE,
+            "byte_count too large for specified SVM buffer");
+    }
+
+    if (!have_size)
+    {
+      throw error("enqueue_svm_memfill", CL_INVALID_VALUE,
+          "byte_count not passed and could not be determined");
+    }
+
+    // }}}
 
     cl_event evt;
     PYOPENCL_CALL_GUARDED(
         clEnqueueSVMMemFill,
         (
           cq.data(),
-          dst.ptr(), pattern_ptr,
+          dst.svm_ptr(), pattern_ptr,
           pattern_len,
-          fill_size,
+          size,
           PYOPENCL_WAITLIST_ARGS,
           &evt
         ));
@@ -3629,12 +3949,40 @@ namespace pyopencl
       command_queue &cq,
       cl_bool is_blocking,
       cl_map_flags flags,
-      svm_arg_wrapper &svm,
-      py::object py_wait_for
+      svm_pointer &svm,
+      py::object py_wait_for,
+      py::object user_size_py
       )
   {
     PYOPENCL_PARSE_WAIT_FOR;
 
+    // {{{ process size
+
+    PYOPENCL_GET_SVM_SIZE(svm);
+
+    size_t size;
+    bool have_size = false;
+    if (svm_has_size)
+    {
+      size = svm_size;
+      have_size = true;
+    }
+    if (!user_size_py.is_none())
+    {
+      size_t user_size = py::cast<size_t>(user_size_py);
+      if (have_size && user_size > size)
+        throw error("enqueue_svm_memfill", CL_INVALID_VALUE,
+            "user-provided size too large for specified SVM buffer");
+    }
+
+    if (!have_size)
+    {
+      throw error("enqueue_svm_mem_map", CL_INVALID_VALUE,
+          "size not passed and could not be determined");
+    }
+
+    // }}}
+
     cl_event evt;
     PYOPENCL_CALL_GUARDED(
         clEnqueueSVMMap,
@@ -3642,7 +3990,7 @@ namespace pyopencl
           cq.data(),
           is_blocking,
           flags,
-          svm.ptr(), svm.size(),
+          svm.svm_ptr(), size,
           PYOPENCL_WAITLIST_ARGS,
           &evt
         ));
@@ -3654,7 +4002,7 @@ namespace pyopencl
   inline
   event *enqueue_svm_unmap(
       command_queue &cq,
-      svm_arg_wrapper &svm,
+      svm_pointer &svm,
       py::object py_wait_for
       )
   {
@@ -3665,7 +4013,7 @@ namespace pyopencl
         clEnqueueSVMUnmap,
         (
           cq.data(),
-          svm.ptr(),
+          svm.svm_ptr(),
           PYOPENCL_WAITLIST_ARGS,
           &evt
         ));
@@ -3691,9 +4039,9 @@ namespace pyopencl
 
     for (py::handle py_svm: svms)
     {
-      svm_arg_wrapper &svm(py::cast<svm_arg_wrapper &>(py_svm));
+      svm_pointer &svm(py::cast<svm_pointer &>(py_svm));
 
-      svm_pointers.push_back(svm.ptr());
+      svm_pointers.push_back(svm.svm_ptr());
       sizes.push_back(svm.size());
     }
 
@@ -4336,16 +4684,18 @@ namespace pyopencl
   {
     private:
       cl_kernel m_kernel;
+      bool m_set_arg_prefer_svm;
 
     public:
       kernel(cl_kernel knl, bool retain)
-        : m_kernel(knl)
+        : m_kernel(knl), m_set_arg_prefer_svm(false)
       {
         if (retain)
           PYOPENCL_CALL_GUARDED(clRetainKernel, (knl));
       }
 
       kernel(program const &prg, std::string const &kernel_name)
+        : m_set_arg_prefer_svm(false)
       {
         cl_int status_code;
 
@@ -4486,10 +4836,10 @@ namespace pyopencl
       }
 
 #if PYOPENCL_CL_VERSION >= 0x2000
-      void set_arg_svm(cl_uint arg_index, svm_arg_wrapper const &wrp)
+      void set_arg_svm(cl_uint arg_index, svm_pointer const &wrp)
       {
         PYOPENCL_CALL_GUARDED(clSetKernelArgSVMPointer,
-            (m_kernel, arg_index, wrp.ptr()));
+            (m_kernel, arg_index, wrp.svm_ptr()));
       }
 #endif
 
@@ -4501,21 +4851,47 @@ namespace pyopencl
           return;
         }
 
-        try
+        // It turns out that a taken 'catch' has a relatively high cost, so
+        // in deciding which of "mem object" and "svm" to try first, we use
+        // whatever we were given last time around.
+        if (m_set_arg_prefer_svm)
         {
-          set_arg_mem(arg_index, arg.cast<memory_object_holder &>());
-          return;
+#if PYOPENCL_CL_VERSION >= 0x2000
+          try
+          {
+            set_arg_svm(arg_index, arg.cast<svm_pointer const &>());
+            return;
+          }
+          catch (py::cast_error &) { }
+#endif
+
+          try
+          {
+            set_arg_mem(arg_index, arg.cast<memory_object_holder &>());
+            m_set_arg_prefer_svm = false;
+            return;
+          }
+          catch (py::cast_error &) { }
         }
-        catch (py::cast_error &) { }
+        else
+        {
+          try
+          {
+            set_arg_mem(arg_index, arg.cast<memory_object_holder &>());
+            return;
+          }
+          catch (py::cast_error &) { }
 
 #if PYOPENCL_CL_VERSION >= 0x2000
-        try
-        {
-          set_arg_svm(arg_index, arg.cast<svm_arg_wrapper const &>());
-          return;
-        }
-        catch (py::cast_error &) { }
+          try
+          {
+            set_arg_svm(arg_index, arg.cast<svm_pointer const &>());
+            m_set_arg_prefer_svm = true;
+            return;
+          }
+          catch (py::cast_error &) { }
 #endif
+        }
 
         try
         {
diff --git a/src/wrap_cl_part_1.cpp b/src/wrap_cl_part_1.cpp
index 8d62ef5d1..3b9f79eda 100644
--- a/src/wrap_cl_part_1.cpp
+++ b/src/wrap_cl_part_1.cpp
@@ -282,11 +282,12 @@ void pyopencl_expose_part_1(py::module &m)
   // {{{ transfers
 
   // {{{ byte-for-byte
+
   m.def("_enqueue_read_buffer", enqueue_read_buffer,
       py::arg("queue"),
       py::arg("mem"),
       py::arg("hostbuf"),
-      py::arg("device_offset")=0,
+      py::arg("src_offset")=0,
       py::arg("wait_for")=py::none(),
       py::arg("is_blocking")=true
       );
@@ -294,7 +295,7 @@ void pyopencl_expose_part_1(py::module &m)
       py::arg("queue"),
       py::arg("mem"),
       py::arg("hostbuf"),
-      py::arg("device_offset")=0,
+      py::arg("dst_offset")=0,
       py::arg("wait_for")=py::none(),
       py::arg("is_blocking")=true
       );
diff --git a/src/wrap_cl_part_2.cpp b/src/wrap_cl_part_2.cpp
index 0c9a0d1b1..80560bd76 100644
--- a/src/wrap_cl_part_2.cpp
+++ b/src/wrap_cl_part_2.cpp
@@ -24,6 +24,7 @@
 // OTHER DEALINGS IN THE SOFTWARE.
 
 
+#include <memory>
 #define NO_IMPORT_ARRAY
 #define PY_ARRAY_UNIQUE_SYMBOL pyopencl_ARRAY_API
 
@@ -64,6 +65,22 @@ namespace pyopencl {
   }
 
 #endif
+
+#if PYOPENCL_CL_VERSION >= 0x2000
+  class svm_pointer_as_buffer
+  {
+    private:
+      svm_pointer &m_ptr;
+
+    public:
+      svm_pointer_as_buffer(svm_pointer &ptr)
+        : m_ptr(ptr)
+      { }
+
+      svm_pointer &ptr() const
+      { return m_ptr; }
+  };
+#endif
 }
 
 
@@ -292,37 +309,130 @@ void pyopencl_expose_part_2(py::module &m)
 
   // }}}
 
-  // {{{ svm
+  // {{{ svm_pointer
 
 #if PYOPENCL_CL_VERSION >= 0x2000
+  {
+    typedef svm_pointer cls;
+    py::class_<cls>(m, "SVMPointer", py::dynamic_attr())
+      // For consistency, it may seem appropriate to use int_ptr here, but
+      // that would work on both buffers and SVM, and passing a buffer pointer to
+      // a kernel is going to lead to a bad time.
+      .def_property_readonly("svm_ptr",
+          [](cls &self) { return (intptr_t) self.svm_ptr(); })
+      .def_property_readonly("size", [](cls &self) -> py::object
+          {
+            try
+            {
+              return py::cast(self.size());
+            }
+            catch (size_not_available)
+            {
+              return py::none();
+            }
+          })
+      .def_property_readonly("buf", [](cls &self) -> svm_pointer_as_buffer * {
+            return new svm_pointer_as_buffer(self);
+          }, py::return_value_policy::reference_internal)
+      ;
+  }
+
+  {
+    typedef svm_pointer_as_buffer cls;
+    py::class_<cls>(m, "_SVMPointerAsBuffer", pybind11::buffer_protocol())
+      .def_buffer([](cls &self) -> pybind11::buffer_info
+          {
+            size_t size;
+            try
+            {
+              size = self.ptr().size();
+            }
+            catch (size_not_available)
+            {
+              throw pyopencl::error("SVMPointer buffer protocol", CL_INVALID_VALUE,
+                  "size of SVM is not known");
+            }
+            return pybind11::buffer_info(
+                // Pointer to buffer
+                self.ptr().svm_ptr(),
+                // Size of one scalar
+                sizeof(unsigned char),
+                // Python struct-style format descriptor
+                pybind11::format_descriptor<unsigned char>::format(),
+                // Number of dimensions
+                1,
+                // Buffer dimensions
+                { size },
+                // Strides (in bytes) for each index
+                { sizeof(unsigned char) }
+                );
+          })
+    ;
+  }
+
+  // }}}
+
+  // {{{ svm_arg_wrapper
+
   {
     typedef svm_arg_wrapper cls;
-    py::class_<cls>(m, "SVM", py::dynamic_attr())
+    py::class_<cls, svm_pointer>(m, "SVM", py::dynamic_attr())
       .def(py::init<py::object>())
       ;
   }
 
+  // }}}
+
+  // {{{ svm_allocation
+
   {
     typedef svm_allocation cls;
-    py::class_<cls>(m, "SVMAllocation", py::dynamic_attr())
-      .def(py::init<std::shared_ptr<context>, size_t, cl_uint, cl_svm_mem_flags>())
+    py::class_<cls, svm_pointer>(m, "SVMAllocation", py::dynamic_attr())
+      .def(py::init<std::shared_ptr<context>, size_t, cl_uint, cl_svm_mem_flags, const command_queue *>(),
+          py::arg("context"),
+          py::arg("size"),
+          py::arg("alignment"),
+          py::arg("flags"),
+          py::arg("queue").none(true)=py::none()
+          )
       .DEF_SIMPLE_METHOD(release)
       .def("enqueue_release", &cls::enqueue_release,
           ":returns: a :class:`pyopencl.Event`\n\n"
-          "|std-enqueue-blurb|")
-      .def("_ptr_as_int", &cls::ptr_as_int)
+          "|std-enqueue-blurb|",
+          py::arg("queue").none(true)=py::none(),
+          py::arg("wait_for").none(true)=py::none()
+          )
       .def(py::self == py::self)
       .def(py::self != py::self)
-      .def("__hash__", &cls::ptr_as_int)
+      .def("__hash__", [](cls &self) { return (intptr_t) self.svm_ptr(); })
+      .def("bind_to_queue", &cls::bind_to_queue,
+          py::arg("queue"))
+      .DEF_SIMPLE_METHOD(unbind_from_queue)
+
+      // only for diagnostic/debugging/testing purposes!
+      .def_property_readonly("_queue",
+          [](cls const &self) -> py::object
+          {
+            cl_command_queue queue = self.queue();
+            if (queue)
+              return py::cast(new command_queue(queue, true));
+            else
+              return py::none();
+          })
       ;
   }
 
+  // }}}
+
+  // {{{ svm operations
+
   m.def("_enqueue_svm_memcpy", enqueue_svm_memcpy,
       py::arg("queue"),
       py::arg("is_blocking"),
       py::arg("dst"),
       py::arg("src"),
-      py::arg("wait_for")=py::none()
+      py::arg("wait_for")=py::none(),
+      py::arg("byte_count")=py::none()
       );
 
   m.def("_enqueue_svm_memfill", enqueue_svm_memfill,
@@ -338,7 +448,8 @@ void pyopencl_expose_part_2(py::module &m)
       py::arg("is_blocking"),
       py::arg("flags"),
       py::arg("svm"),
-      py::arg("wait_for")=py::none()
+      py::arg("wait_for")=py::none(),
+      py::arg("size")=py::none()
       );
 
   m.def("_enqueue_svm_unmap", enqueue_svm_unmap,
diff --git a/src/wrap_mempool.cpp b/src/wrap_mempool.cpp
index f40832bbf..36f6b4106 100644
--- a/src/wrap_mempool.cpp
+++ b/src/wrap_mempool.cpp
@@ -40,44 +40,53 @@
 
 
 
-namespace
-{
+namespace pyopencl {
+  // {{{ test_allocator
+
   class test_allocator
   {
     public:
       typedef void *pointer_type;
       typedef size_t size_type;
 
-      virtual test_allocator *copy() const
+      bool is_deferred() const
       {
-        return new test_allocator();
+        return false;
       }
 
-      virtual bool is_deferred() const
+      pointer_type allocate(size_type s)
       {
-        return false;
+        return nullptr;
       }
-      virtual pointer_type allocate(size_type s)
+
+      pointer_type hand_out_existing_block(pointer_type &&p)
       {
-        return nullptr;
+        return p;
       }
 
-      void free(pointer_type p)
+      ~test_allocator()
+      { }
+
+      void free(pointer_type &&p)
       { }
 
       void try_release_blocks()
       { }
   };
 
+  // }}}
+
+
+  // {{{ buffer allocators
 
-  class cl_allocator_base
+  class buffer_allocator_base
   {
     protected:
       std::shared_ptr<pyopencl::context> m_context;
       cl_mem_flags m_flags;
 
     public:
-      cl_allocator_base(std::shared_ptr<pyopencl::context> const &ctx,
+      buffer_allocator_base(std::shared_ptr<pyopencl::context> const &ctx,
           cl_mem_flags flags=CL_MEM_READ_WRITE)
         : m_context(ctx), m_flags(flags)
       {
@@ -86,21 +95,25 @@ namespace
               "cannot specify USE_HOST_PTR or COPY_HOST_PTR flags");
       }
 
-      cl_allocator_base(cl_allocator_base const &src)
+      buffer_allocator_base(buffer_allocator_base const &src)
       : m_context(src.m_context), m_flags(src.m_flags)
       { }
 
-      virtual ~cl_allocator_base()
+      virtual ~buffer_allocator_base()
       { }
 
       typedef cl_mem pointer_type;
       typedef size_t size_type;
 
-      virtual cl_allocator_base *copy() const = 0;
       virtual bool is_deferred() const = 0;
       virtual pointer_type allocate(size_type s) = 0;
 
-      void free(pointer_type p)
+      pointer_type hand_out_existing_block(pointer_type &&p)
+      {
+        return p;
+      }
+
+      void free(pointer_type &&p)
       {
         PYOPENCL_CALL_GUARDED(clReleaseMemObject, (p));
       }
@@ -111,22 +124,18 @@ namespace
       }
   };
 
-  class cl_deferred_allocator : public cl_allocator_base
+
+  class deferred_buffer_allocator : public buffer_allocator_base
   {
     private:
-      typedef cl_allocator_base super;
+      typedef buffer_allocator_base super;
 
     public:
-      cl_deferred_allocator(std::shared_ptr<pyopencl::context> const &ctx,
+      deferred_buffer_allocator(std::shared_ptr<pyopencl::context> const &ctx,
           cl_mem_flags flags=CL_MEM_READ_WRITE)
         : super(ctx, flags)
       { }
 
-      cl_allocator_base *copy() const
-      {
-        return new cl_deferred_allocator(*this);
-      }
-
       bool is_deferred() const
       { return true; }
 
@@ -139,30 +148,24 @@ namespace
       }
   };
 
-  const unsigned zero = 0;
 
-  class cl_immediate_allocator : public cl_allocator_base
+  class immediate_buffer_allocator : public buffer_allocator_base
   {
     private:
-      typedef cl_allocator_base super;
+      typedef buffer_allocator_base super;
       pyopencl::command_queue m_queue;
 
     public:
-      cl_immediate_allocator(pyopencl::command_queue &queue,
+      immediate_buffer_allocator(pyopencl::command_queue &queue,
           cl_mem_flags flags=CL_MEM_READ_WRITE)
         : super(std::shared_ptr<pyopencl::context>(queue.get_context()), flags),
         m_queue(queue.data(), /*retain*/ true)
       { }
 
-      cl_immediate_allocator(cl_immediate_allocator const &src)
+      immediate_buffer_allocator(immediate_buffer_allocator const &src)
         : super(src), m_queue(src.m_queue)
       { }
 
-      cl_allocator_base *copy() const
-      {
-        return new cl_immediate_allocator(*this);
-      }
-
       bool is_deferred() const
       { return false; }
 
@@ -210,13 +213,47 @@ namespace
       }
   };
 
+  // }}}
 
 
+  // {{{ pooled_buffer
+
+  class pooled_buffer
+    : public pyopencl::pooled_allocation<pyopencl::memory_pool<buffer_allocator_base> >,
+    public pyopencl::memory_object_holder
+  {
+    private:
+      typedef
+        pyopencl::pooled_allocation<pyopencl::memory_pool<buffer_allocator_base> >
+        super;
+
+    public:
+      pooled_buffer(
+          std::shared_ptr<super::pool_type> p, super::size_type s)
+        : super(p, s)
+      { }
+
+      virtual ~pooled_buffer()
+      { }
+
+      const super::pointer_type data() const
+      { return m_ptr; }
+
+      size_t size() const
+      {
+        return m_size;
+      }
+  };
+
+  // }}}
+
+
+  // {{{ allocate_from_buffer_allocator
 
   inline
-  pyopencl::buffer *allocator_call(cl_allocator_base &alloc, size_t size)
+  buffer *allocate_from_buffer_allocator(buffer_allocator_base &alloc, size_t size)
   {
-    cl_mem mem;
+    cl_mem mem = nullptr;
     int try_count = 0;
     while (try_count < 2)
     {
@@ -256,41 +293,264 @@ namespace
     }
   }
 
+  // }}}
 
 
+  // {{{ allocate_from_buffer_pool
 
-  class pooled_buffer
-    : public pyopencl::pooled_allocation<pyopencl::memory_pool<cl_allocator_base> >,
-    public pyopencl::memory_object_holder
+  pooled_buffer *allocate_from_buffer_pool(
+      std::shared_ptr<memory_pool<buffer_allocator_base> > pool,
+      memory_pool<buffer_allocator_base>::size_type sz)
+  {
+    return new pooled_buffer(pool, sz);
+  }
+
+  // }}}
+
+
+#if PYOPENCL_CL_VERSION >= 0x2000
+
+  struct svm_held_pointer
+  {
+    void *ptr;
+    pyopencl::command_queue_ref queue;
+  };
+
+
+  // {{{ svm allocator
+
+  class svm_allocator
+  {
+    public:
+      typedef svm_held_pointer pointer_type;
+      typedef size_t size_type;
+
+    protected:
+      std::shared_ptr<pyopencl::context> m_context;
+      cl_uint m_alignment;
+      cl_svm_mem_flags m_flags;
+      pyopencl::command_queue_ref m_queue;
+
+    public:
+      svm_allocator(std::shared_ptr<pyopencl::context> const &ctx,
+          cl_uint alignment=0, cl_svm_mem_flags flags=CL_MEM_READ_WRITE,
+          pyopencl::command_queue *queue=nullptr)
+        : m_context(ctx), m_alignment(alignment), m_flags(flags)
+      {
+        if (queue)
+          m_queue.set(queue->data());
+      }
+
+      svm_allocator(svm_allocator const &src)
+      : m_context(src.m_context), m_alignment(src.m_alignment),
+      m_flags(src.m_flags)
+      { }
+
+      ~svm_allocator()
+      { }
+
+      bool is_deferred() const
+      {
+        // According to experiments with the Nvidia implementation (and based
+        // on my reading of the CL spec), clSVMalloc will return an error
+        // immedaitely upon being out of memory.  Therefore the
+        // immediate/deferred split on the buffer side is not needed here.
+        // -AK, 2022-09-07
+
+        return false;
+      }
+
+      std::shared_ptr<pyopencl::context> context() const
+      {
+        return m_context;
+      }
+
+      pointer_type allocate(size_type size)
+      {
+        if (size == 0)
+          return { nullptr, nullptr };
+
+        PYOPENCL_PRINT_CALL_TRACE("clSVMalloc");
+        return {
+          clSVMAlloc(m_context->data(), m_flags, size, m_alignment),
+          pyopencl::command_queue_ref(m_queue.is_valid() ? m_queue.data() : nullptr)
+        };
+      }
+
+      pointer_type hand_out_existing_block(pointer_type &&p)
+      {
+        if (m_queue.is_valid())
+        {
+          if (p.queue.is_valid())
+          {
+            if (p.queue.data() != m_queue.data())
+            {
+              // make sure synchronization promises stay valid in new queue
+              cl_event evt;
+
+              PYOPENCL_CALL_GUARDED(clEnqueueMarker, (p.queue.data(), &evt));
+              PYOPENCL_CALL_GUARDED(clEnqueueMarkerWithWaitList,
+                  (m_queue.data(), 1, &evt, nullptr));
+            }
+          }
+          p.queue.set(m_queue.data());
+        }
+        else
+        {
+          if (p.queue.is_valid())
+          {
+            PYOPENCL_CALL_GUARDED_THREADED(clFinish, (p.queue.data()));
+            p.queue.reset();
+          }
+        }
+
+        return std::move(p);
+      }
+
+      void free(pointer_type &&p)
+      {
+        if (p.queue.is_valid())
+        {
+          PYOPENCL_CALL_GUARDED_CLEANUP(clEnqueueSVMFree, (
+                p.queue.data(), 1, &p.ptr,
+                nullptr, nullptr,
+                0, nullptr, nullptr));
+          p.queue.reset();
+        }
+        else
+        {
+          PYOPENCL_PRINT_CALL_TRACE("clSVMFree");
+          clSVMFree(m_context->data(), p.ptr);
+        }
+      }
+
+      void try_release_blocks()
+      {
+        pyopencl::run_python_gc();
+      }
+  };
+
+  // }}}
+
+
+  // {{{ pooled_svm
+
+  class pooled_svm
+    : public pyopencl::pooled_allocation<pyopencl::memory_pool<svm_allocator>>,
+    public pyopencl::svm_pointer
   {
     private:
       typedef
-        pyopencl::pooled_allocation<pyopencl::memory_pool<cl_allocator_base> >
+        pyopencl::pooled_allocation<pyopencl::memory_pool<svm_allocator>>
         super;
 
     public:
-      pooled_buffer(
+      pooled_svm(
           std::shared_ptr<super::pool_type> p, super::size_type s)
         : super(p, s)
       { }
 
-      const super::pointer_type data() const
-      { return ptr(); }
+      virtual ~pooled_svm()
+      { }
+
+      void *svm_ptr() const
+      { return m_ptr.ptr; }
+
+      size_t size() const
+      { return m_size; }
+
+      void bind_to_queue(pyopencl::command_queue const &queue)
+      {
+        if (pyopencl::is_queue_out_of_order(queue.data()))
+          throw pyopencl::error("PooledSVM.bind_to_queue", CL_INVALID_VALUE,
+              "supplying an out-of-order queue to SVMAllocation is invalid");
+
+        if (m_ptr.queue.is_valid())
+        {
+          if (m_ptr.queue.data() != queue.data())
+          {
+            // make sure synchronization promises stay valid in new queue
+            cl_event evt;
+
+            PYOPENCL_CALL_GUARDED(clEnqueueMarker, (m_ptr.queue.data(), &evt));
+            PYOPENCL_CALL_GUARDED(clEnqueueMarkerWithWaitList,
+                (queue.data(), 1, &evt, nullptr));
+          }
+        }
+
+        m_ptr.queue.set(queue.data());
+      }
+
+      void unbind_from_queue()
+      {
+        if (m_ptr.queue.is_valid())
+          PYOPENCL_CALL_GUARDED_THREADED(clFinish, (m_ptr.queue.data()));
+
+        m_ptr.queue.reset();
+      }
+
+      // only use for testing/diagnostic/debugging purposes!
+      cl_command_queue queue() const
+      {
+        if (m_ptr.queue.is_valid())
+          return m_ptr.queue.data();
+        else
+          return nullptr;
+      }
   };
 
+  // }}}
 
 
+  // {{{ svm_allocator_call
 
-  pooled_buffer *device_pool_allocate(
-      std::shared_ptr<pyopencl::memory_pool<cl_allocator_base> > pool,
-      pyopencl::memory_pool<cl_allocator_base>::size_type sz)
+  inline
+  pyopencl::svm_allocation *svm_allocator_call(svm_allocator &alloc, size_t size)
   {
-    return new pooled_buffer(pool, sz);
+    int try_count = 0;
+    while (true)
+    {
+      try
+      {
+        svm_held_pointer mem(alloc.allocate(size));
+        if (mem.queue.is_valid())
+          return new pyopencl::svm_allocation(
+              alloc.context(), mem.ptr, size, mem.queue.data());
+        else
+          return new pyopencl::svm_allocation(
+              alloc.context(), mem.ptr, size, nullptr);
+      }
+      catch (pyopencl::error &e)
+      {
+        if (!e.is_out_of_memory())
+          throw;
+        if (++try_count == 2)
+          throw;
+      }
+
+      alloc.try_release_blocks();
+    }
+  }
+
+  // }}}
+
+
+  // {{{ allocate_from_svm_ppol
+
+  pooled_svm *allocate_from_svm_ppol(
+      std::shared_ptr<pyopencl::memory_pool<svm_allocator> > pool,
+      pyopencl::memory_pool<svm_allocator>::size_type sz)
+  {
+    return new pooled_svm(pool, sz);
   }
 
+  // }}}
 
+#endif
+}
 
 
+namespace {
   template<class Wrapper>
   void expose_memory_pool(Wrapper &wrapper)
   {
@@ -304,6 +564,9 @@ namespace
       .DEF_SIMPLE_METHOD(alloc_size)
       .DEF_SIMPLE_METHOD(free_held)
       .DEF_SIMPLE_METHOD(stop_holding)
+
+      // undoc for now
+      .def("_set_trace", &cls::set_trace)
       ;
   }
 }
@@ -316,22 +579,24 @@ void pyopencl_expose_mempool(py::module &m)
   m.def("bitlog2", pyopencl::bitlog2);
 
   {
-    typedef cl_allocator_base cls;
-    py::class_<cls /*, boost::noncopyable */> wrapper(
-        m, "_tools_AllocatorBase"/*, py::no_init */);
+    typedef pyopencl::buffer_allocator_base cls;
+    py::class_<cls, std::shared_ptr<cls>> wrapper(m, "AllocatorBase");
     wrapper
-      .def("__call__", allocator_call)
+      .def("__call__", pyopencl::allocate_from_buffer_allocator, py::arg("size"))
       ;
 
   }
 
   {
-    typedef pyopencl::memory_pool<test_allocator> cls;
+    typedef pyopencl::memory_pool<pyopencl::test_allocator> cls;
 
     py::class_<cls, std::shared_ptr<cls>> wrapper( m, "_TestMemoryPool");
     wrapper
       .def(py::init([](unsigned leading_bits_in_bin_id)
-            { return new cls(test_allocator(), leading_bits_in_bin_id); }),
+            { return new cls(
+                std::shared_ptr<pyopencl::test_allocator>(
+                  new pyopencl::test_allocator()),
+                leading_bits_in_bin_id); }),
           py::arg("leading_bits_in_bin_id")=4
           )
       .def("allocate", [](std::shared_ptr<cls> pool, cls::size_type sz)
@@ -345,9 +610,9 @@ void pyopencl_expose_mempool(py::module &m)
   }
 
   {
-    typedef cl_deferred_allocator cls;
-    py::class_<cls, cl_allocator_base> wrapper(
-        m, "_tools_DeferredAllocator");
+    typedef pyopencl::deferred_buffer_allocator cls;
+    py::class_<cls, pyopencl::buffer_allocator_base, std::shared_ptr<cls>> wrapper(
+        m, "DeferredAllocator");
     wrapper
       .def(py::init<
           std::shared_ptr<pyopencl::context> const &>())
@@ -359,9 +624,9 @@ void pyopencl_expose_mempool(py::module &m)
   }
 
   {
-    typedef cl_immediate_allocator cls;
-    py::class_<cls, cl_allocator_base> wrapper(
-        m, "_tools_ImmediateAllocator");
+    typedef pyopencl::immediate_buffer_allocator cls;
+    py::class_<cls, pyopencl::buffer_allocator_base, std::shared_ptr<cls>> wrapper(
+        m, "ImmediateAllocator");
     wrapper
       .def(py::init<pyopencl::command_queue &>())
       .def(py::init<pyopencl::command_queue &, cl_mem_flags>(),
@@ -370,31 +635,88 @@ void pyopencl_expose_mempool(py::module &m)
   }
 
   {
-    typedef pyopencl::memory_pool<cl_allocator_base> cls;
+    typedef pyopencl::pooled_buffer cls;
+    py::class_<cls, pyopencl::memory_object_holder>(m, "PooledBuffer")
+      .def("release", &cls::free)
+
+      .def("bind_to_queue", [](cls &self, pyopencl::command_queue &queue) { /* no-op */ })
+      .def("unbind_from_queue", [](cls &self) { /* no-op */ })
+      ;
+  }
+
+  {
+    typedef pyopencl::memory_pool<pyopencl::buffer_allocator_base> cls;
 
-    py::class_<
-      cls, /* boost::noncopyable, */
-      std::shared_ptr<cls>> wrapper( m, "MemoryPool");
+    py::class_<cls, std::shared_ptr<cls>> wrapper( m, "MemoryPool");
     wrapper
-      .def(py::init<cl_allocator_base const &, unsigned>(),
+      .def(py::init<std::shared_ptr<pyopencl::buffer_allocator_base>, unsigned>(),
           py::arg("allocator"),
           py::arg("leading_bits_in_bin_id")=4
           )
-      .def("allocate", device_pool_allocate)
-      .def("__call__", device_pool_allocate)
-      // undoc for now
-      .DEF_SIMPLE_METHOD(set_trace)
+      .def("allocate", pyopencl::allocate_from_buffer_pool, py::arg("size"))
+      .def("__call__", pyopencl::allocate_from_buffer_pool, py::arg("size"))
       ;
 
     expose_memory_pool(wrapper);
   }
 
+#if PYOPENCL_CL_VERSION >= 0x2000
+  {
+    typedef pyopencl::svm_allocator cls;
+    py::class_<cls, std::shared_ptr<cls>> wrapper(m, "SVMAllocator");
+    wrapper
+      .def(py::init<std::shared_ptr<pyopencl::context>  const &, cl_uint, cl_uint, pyopencl::command_queue *>(),
+          py::arg("context"),
+          py::kw_only(),
+          py::arg("alignment")=0,
+          py::arg("flags")=CL_MEM_READ_WRITE,
+          py::arg("queue").none(true)=nullptr
+          )
+      .def("__call__", pyopencl::svm_allocator_call, py::arg("size"))
+      ;
+  }
+
   {
-    typedef pooled_buffer cls;
-    py::class_<cls, /* boost::noncopyable, */
-      pyopencl::memory_object_holder>(
-          m, "PooledBuffer"/* , py::no_init */)
+    typedef pyopencl::pooled_svm cls;
+    py::class_<cls, pyopencl::svm_pointer>(m, "PooledSVM")
       .def("release", &cls::free)
+      .def("enqueue_release", &cls::free)
+      .def("__eq__", [](const cls &self, const cls &other)
+          { return self.svm_ptr() == other.svm_ptr(); })
+      .def("__hash__", [](cls &self) { return (intptr_t) self.svm_ptr(); })
+      .DEF_SIMPLE_METHOD(bind_to_queue)
+      .DEF_SIMPLE_METHOD(unbind_from_queue)
+
+      // only for diagnostic/debugging/testing purposes!
+      .def_property_readonly("_queue",
+          [](cls const &self) -> py::object
+          {
+            cl_command_queue queue = self.queue();
+            if (queue)
+              return py::cast(new pyopencl::command_queue(queue, true));
+            else
+              return py::none();
+          })
+      ;
+  }
+
+  {
+    typedef pyopencl::memory_pool<pyopencl::svm_allocator> cls;
+
+    py::class_<cls, std::shared_ptr<cls>> wrapper( m, "SVMPool");
+    wrapper
+      .def(py::init<std::shared_ptr<pyopencl::svm_allocator>, unsigned>(),
+          py::arg("allocator"),
+          py::kw_only(),
+          py::arg("leading_bits_in_bin_id")=4
+          )
+      .def("__call__", pyopencl::allocate_from_svm_ppol, py::arg("size"))
       ;
+
+    expose_memory_pool(wrapper);
   }
+
+#endif
 }
+
+// vim: foldmethod=marker
diff --git a/test/test_array.py b/test/test_array.py
index ffb0714c8..ffec2f4d2 100644
--- a/test/test_array.py
+++ b/test/test_array.py
@@ -2180,6 +2180,64 @@ def test_dtype_conversions(ctx_factory):
 # }}}
 
 
+# {{{ test_svm_mem_pool_with_arrays
+
+@pytest.mark.parametrize("use_mempool", [False, True])
+def test_arrays_with_svm_allocators(ctx_factory, use_mempool):
+    context = ctx_factory()
+    queue = cl.CommandQueue(context)
+    queue2 = cl.CommandQueue(context)
+
+    from pyopencl.characterize import has_coarse_grain_buffer_svm
+    has_cg_svm = has_coarse_grain_buffer_svm(queue.device)
+
+    if not has_cg_svm:
+        pytest.skip("Need coarse-grained SVM support for this test.")
+
+    alloc = cl_tools.SVMAllocator(context, queue=queue)
+    if use_mempool:
+        alloc = cl_tools.SVMPool(alloc)
+
+    def alloc2(size):
+        allocation = alloc(size)
+        allocation.bind_to_queue(queue2)
+        return allocation
+
+    a_dev = cl_array.arange(queue, 2000, dtype=np.float32, allocator=alloc)
+    b_dev = cl_array.to_device(queue, np.arange(2000), allocator=alloc) + 4000
+
+    assert a_dev.allocator is alloc
+    assert b_dev.allocator is alloc
+
+    assert a_dev.data._queue == queue
+    assert b_dev.data._queue == queue
+
+    a_dev2 = cl_array.arange(queue2, 2000, dtype=np.float32, allocator=alloc2)
+    b_dev2 = cl_array.to_device(queue2, np.arange(2000), allocator=alloc2) + 4000
+
+    assert a_dev2.allocator is alloc2
+    assert b_dev2.allocator is alloc2
+
+    assert a_dev2.data._queue == queue2
+    assert b_dev2.data._queue == queue2
+
+    np.testing.assert_allclose((a_dev+b_dev).get(), (a_dev2+b_dev2).get())
+
+    with pytest.warns(cl_array.InconsistentOpenCLQueueWarning):
+        a_dev2.with_queue(queue)
+
+        # safe to let this proceed to deallocation, since we're not
+        # operating on the memory
+
+    with pytest.warns(cl_array.InconsistentOpenCLQueueWarning):
+        cl_array.empty(queue2, 2000, np.float32, allocator=alloc)
+
+        # safe to let this proceed to deallocation, since we're not
+        # operating on the memory
+
+# }}}
+
+
 if __name__ == "__main__":
     if len(sys.argv) > 1:
         exec(sys.argv[1])
@@ -2187,4 +2245,4 @@ def test_dtype_conversions(ctx_factory):
         from pytest import main
         main([__file__])
 
-# vim: filetype=pyopencl:fdm=marker
+# vim: fdm=marker
diff --git a/test/test_wrapper.py b/test/test_wrapper.py
index 0ec3e1343..758b05c0a 100644
--- a/test/test_wrapper.py
+++ b/test/test_wrapper.py
@@ -59,6 +59,8 @@ def _xfail_if_pocl_gpu(device, what):
                 "at least the Titan V, as of pocl 1.6, 2021-01-20")
 
 
+# {{{ test_get_info
+
 def test_get_info(ctx_factory):
     ctx = ctx_factory()
     device, = ctx.devices
@@ -236,6 +238,10 @@ def do_test(cl_obj, info_cls, func=None, try_attr_form=True):
         do_test(img, cl.image_info,
                 lambda info: img.get_image_info(info))
 
+# }}}
+
+
+# {{{ test_int_ptr
 
 def test_int_ptr(ctx_factory):
     def do_test(obj):
@@ -285,6 +291,10 @@ def do_test(obj):
         img = cl.Image(ctx, cl.mem_flags.READ_ONLY, img_format, (128, 256))
         do_test(img)
 
+# }}}
+
+
+# {{{ test_invalid_kernel_names_cause_failures
 
 def test_invalid_kernel_names_cause_failures(ctx_factory):
     ctx = ctx_factory()
@@ -308,6 +318,10 @@ def test_invalid_kernel_names_cause_failures(ctx_factory):
         else:
             raise
 
+# }}}
+
+
+# {{{ test_image_format_constructor
 
 def test_image_format_constructor():
     # doesn't need image support to succeed
@@ -319,6 +333,10 @@ def test_image_format_constructor():
     if not cl._PYPY:
         assert not hasattr(iform, "__dict__")
 
+# }}}
+
+
+# {{{ test_device_topology_amd_constructor
 
 def test_device_topology_amd_constructor():
     # doesn't need cl_amd_device_attribute_query support to succeed
@@ -331,6 +349,10 @@ def test_device_topology_amd_constructor():
     if not cl._PYPY:
         assert not hasattr(topol, "__dict__")
 
+# }}}
+
+
+# {{{ test_nonempty_supported_image_formats
 
 def test_nonempty_supported_image_formats(ctx_factory):
     context = ctx_factory()
@@ -344,6 +366,10 @@ def test_nonempty_supported_image_formats(ctx_factory):
         from pytest import skip
         skip("images not supported on %s" % device.name)
 
+# }}}
+
+
+# {{{ test_that_python_args_fail
 
 def test_that_python_args_fail(ctx_factory):
     context = ctx_factory()
@@ -379,6 +405,10 @@ def test_that_python_args_fail(ctx_factory):
     a_result = np.empty_like(a)
     cl.enqueue_copy(queue, a_buf, a_result).wait()
 
+# }}}
+
+
+# {{{ test_image_2d
 
 def test_image_2d(ctx_factory):
     context = ctx_factory()
@@ -452,6 +482,10 @@ def test_image_2d(ctx_factory):
         else:
             assert good
 
+# }}}
+
+
+# {{{ test_image_3d
 
 def test_image_3d(ctx_factory):
     #test for image_from_array for 3d image of float2
@@ -530,6 +564,10 @@ def test_image_3d(ctx_factory):
         else:
             assert good
 
+# }}}
+
+
+# {{{ test_copy_buffer
 
 def test_copy_buffer(ctx_factory):
     context = ctx_factory()
@@ -549,6 +587,10 @@ def test_copy_buffer(ctx_factory):
 
     assert la.norm(a - b) == 0
 
+# }}}
+
+
+# {{{ test_mempool_*
 
 def test_mempool(ctx_factory):
     from pyopencl.tools import MemoryPool, ImmediateAllocator
@@ -601,6 +643,10 @@ def test_mempool_32bit_issues():
         for offs in range(-5, 5):
             pool.allocate(2**i + offs)
 
+# }}}
+
+
+# {{{ test_allocator
 
 @pytest.mark.parametrize("allocator_cls", [ImmediateAllocator, DeferredAllocator])
 def test_allocator(ctx_factory, allocator_cls):
@@ -618,6 +664,10 @@ def test_allocator(ctx_factory, allocator_cls):
     assert mem is not None
     assert mem2 is None
 
+# }}}
+
+
+# {{{ test_vector_args
 
 def test_vector_args(ctx_factory):
     context = ctx_factory()
@@ -639,7 +689,10 @@ def test_vector_args(ctx_factory):
 
     assert (dest == x).all()
 
+# }}}
 
+
+# {{{ test_header_dep_handling
 def test_header_dep_handling(ctx_factory):
     context = ctx_factory()
 
@@ -657,6 +710,10 @@ def test_header_dep_handling(ctx_factory):
     cl.Program(context, kernel_src).build(["-I", dirname(__file__)])
     cl.Program(context, kernel_src).build(["-I", dirname(__file__)])
 
+# }}}
+
+
+# {{{ test_context_dep_memoize
 
 def test_context_dep_memoize(ctx_factory):
     context = ctx_factory()
@@ -674,6 +731,10 @@ def do_something(ctx):
 
     assert counter[0] == 1
 
+# }}}
+
+
+# {{{ test_can_build_and_run_binary
 
 def test_can_build_and_run_binary(ctx_factory):
     ctx = ctx_factory()
@@ -698,6 +759,10 @@ def test_can_build_and_run_binary(ctx_factory):
 
     foo.simple(queue, (n,), (16,), a_dev.data, dest_dev.data)
 
+# }}}
+
+
+# {{{ test_enqueue_barrier_marker
 
 def test_enqueue_barrier_marker(ctx_factory):
     ctx = ctx_factory()
@@ -716,6 +781,10 @@ def test_enqueue_barrier_marker(ctx_factory):
     evt2 = cl.enqueue_marker(queue, wait_for=[evt1])
     cl.enqueue_barrier(queue, wait_for=[evt1, evt2])
 
+# }}}
+
+
+# {{{ test_wait_for_events
 
 def test_wait_for_events(ctx_factory):
     ctx = ctx_factory()
@@ -724,6 +793,10 @@ def test_wait_for_events(ctx_factory):
     evt2 = cl.enqueue_marker(queue)
     cl.wait_for_events([evt1, evt2])
 
+# }}}
+
+
+# {{{ test_unload_compiler
 
 def test_unload_compiler(platform):
     if (platform._get_cl_version() < (1, 2)
@@ -736,6 +809,10 @@ def test_unload_compiler(platform):
         skip("Intel proprietary driver does not support unloading compiler")
     cl.unload_platform_compiler(platform)
 
+# }}}
+
+
+# {{{ test_platform_get_devices
 
 def test_platform_get_devices(ctx_factory):
     ctx = ctx_factory()
@@ -762,6 +839,10 @@ def test_platform_get_devices(ctx_factory):
         for dev in devs:
             assert dev.type & dev_type == dev_type
 
+# }}}
+
+
+# {{{ test_user_event
 
 def test_user_event(ctx_factory):
     ctx = ctx_factory()
@@ -808,6 +889,10 @@ def event_waiter2(e, key):
         raise RuntimeError("cl.wait_for_events timeout on UserEvent")
     assert evt.command_execution_status == cl.command_execution_status.COMPLETE
 
+# }}}
+
+
+# {{{ test_buffer_get_host_array
 
 def test_buffer_get_host_array(ctx_factory):
     if cl._PYPY:
@@ -843,6 +928,10 @@ def test_buffer_get_host_array(ctx_factory):
     except cl.LogicError:
         pass
 
+# }}}
+
+
+# {{{ test_program_valued_get_info
 
 def test_program_valued_get_info(ctx_factory):
     ctx = ctx_factory()
@@ -860,6 +949,10 @@ def test_program_valued_get_info(ctx_factory):
     assert knl.program == prg
     knl.program.binaries[0]
 
+# }}}
+
+
+# {{{ test_event_set_callback
 
 def test_event_set_callback(ctx_factory):
     import sys
@@ -921,6 +1014,10 @@ def cb(status):
 
     assert got_called
 
+# }}}
+
+
+# {{{ test_global_offset
 
 def test_global_offset(ctx_factory):
     context = ctx_factory()
@@ -951,6 +1048,10 @@ def test_global_offset(ctx_factory):
 
     assert (a_2 == 2*a).all()
 
+# }}}
+
+
+# {{{ test_sub_buffers
 
 def test_sub_buffers(ctx_factory):
     ctx = ctx_factory()
@@ -981,6 +1082,10 @@ def test_sub_buffers(ctx_factory):
 
     assert np.array_equal(a_sub, a_sub_ref)
 
+# }}}
+
+
+# {{{ test_spirv
 
 def test_spirv(ctx_factory):
     ctx = ctx_factory()
@@ -1012,16 +1117,19 @@ def test_spirv(ctx_factory):
 
     assert la.norm((dest_dev - (a_dev+b_dev)).get()) < 1e-7
 
+# }}}
+
+
+# {{{ test_coarse_grain_svm
 
-def test_coarse_grain_svm(ctx_factory):
+@pytest.mark.parametrize("use_opaque_style", [False, True])
+def test_coarse_grain_svm(ctx_factory, use_opaque_style):
     import sys
     is_pypy = "__pypy__" in sys.builtin_module_names
 
     ctx = ctx_factory()
     queue = cl.CommandQueue(ctx)
 
-    _xfail_if_pocl_gpu(queue.device, "SVM")
-
     dev = ctx.devices[0]
 
     from pyopencl.characterize import has_coarse_grain_buffer_svm
@@ -1035,16 +1143,32 @@ def test_coarse_grain_svm(ctx_factory):
     if ("AMD" in dev.platform.name
             and dev.type & cl.device_type.GPU):
         pytest.xfail("AMD GPU crashes on SVM unmap")
+    if (dev.platform.vendor == "The pocl project"
+            and dev.type & cl.device_type.GPU
+            and "k40" in dev.name.lower()):
+        pytest.xfail("Crashes on K40s via POCL-CUDA")
 
+    dtype = np.dtype(np.float32)
     n = 3000
-    svm_ary = cl.SVM(cl.csvm_empty(ctx, (n,), np.float32, alignment=64))
-    if not is_pypy:
-        # https://bitbucket.org/pypy/numpy/issues/52
-        assert isinstance(svm_ary.mem.base, cl.SVMAllocation)
+    if use_opaque_style:
+        svm_ary = cl.SVMAllocation(ctx, n*dtype.itemsize, alignment=64,
+                                   flags=cl.svm_mem_flags.READ_WRITE)
+    else:
+        svm_ary = cl.SVM(cl.csvm_empty(ctx, (n,), dtype, alignment=64))
+        if not is_pypy:
+            # https://bitbucket.org/pypy/numpy/issues/52
+            assert isinstance(svm_ary.mem.base, cl.SVMAllocation)
 
-    cl.enqueue_svm_memfill(queue, svm_ary, np.zeros((), svm_ary.mem.dtype))
+    cl.enqueue_svm_memfill(queue, svm_ary, np.zeros((), dtype))
 
     with svm_ary.map_rw(queue) as ary:
+        if use_opaque_style:
+            ary = ary.view(dtype)
+        else:
+            assert ary is svm_ary.mem
+
+        assert ary.nbytes == n * dtype.itemsize
+
         ary.fill(17)
         orig_ary = ary.copy()
 
@@ -1055,21 +1179,28 @@ def test_coarse_grain_svm(ctx_factory):
         }
         """).build()
 
-    prg.twice(queue, svm_ary.mem.shape, None, svm_ary)
+    prg.twice(queue, (n,), None, svm_ary)
+
+    if dev.platform.vendor == "The pocl project" \
+            and dev.type & cl.device_type.GPU:
+        # clCreateBuffer from SVM doesn't work yet on GPU pocl
+        prg.twice(queue, (n,), None, svm_ary)
+    else:
+        prg.twice(queue, (n,), None, svm_ary.as_buffer(ctx))
 
     with svm_ary.map_ro(queue) as ary:
-        print(ary)
-        assert np.array_equal(orig_ary*2, ary)
+        if use_opaque_style:
+            ary = ary.view(dtype)
+        else:
+            assert ary is svm_ary.mem
+
+        assert np.array_equal(orig_ary*4, ary)
 
     new_ary = np.empty_like(orig_ary)
     new_ary.fill(-1)
 
-    if ctx.devices[0].platform.name != "Portable Computing Language":
-        # "Blocking memcpy is unimplemented (clEnqueueSVMMemcpy.c:61)"
-        # in pocl up to and including 1.0rc1.
-
-        cl.enqueue_copy(queue, new_ary, svm_ary)
-        assert np.array_equal(orig_ary*2, new_ary)
+    cl.enqueue_copy(queue, new_ary, svm_ary)
+    assert np.array_equal(orig_ary*4, new_ary)
 
     # {{{ https://github.com/inducer/pyopencl/issues/372
 
@@ -1095,6 +1226,10 @@ def test_coarse_grain_svm(ctx_factory):
 
     # }}}
 
+# }}}
+
+
+# {{{ test_fine_grain_svm
 
 def test_fine_grain_svm(ctx_factory):
     import sys
@@ -1133,6 +1268,10 @@ def test_fine_grain_svm(ctx_factory):
     print(ary)
     assert np.array_equal(orig_ary*2, ary)
 
+# }}}
+
+
+# {{{ test_map_dtype
 
 @pytest.mark.parametrize("dtype", [
     np.uint,
@@ -1157,6 +1296,10 @@ def test_map_dtype(ctx_factory, dtype):
         print(array.dtype)
         assert array.dtype == dt
 
+# }}}
+
+
+# {{{ test_compile_link
 
 def test_compile_link(ctx_factory):
     ctx = ctx_factory()
@@ -1194,6 +1337,10 @@ def test_compile_link(ctx_factory):
     z.experiment(queue, (128**2,), (128,))
     queue.finish()
 
+# }}}
+
+
+# {{{ test_copy_buffer_rect
 
 def test_copy_buffer_rect(ctx_factory):
     ctx = ctx_factory()
@@ -1209,6 +1356,10 @@ def test_copy_buffer_rect(ctx_factory):
             src_origin=(0, 0), dst_origin=(1, 1),
             region=arr1.shape[::-1])
 
+# }}}
+
+
+# {{{ test_threaded_nanny_events
 
 def test_threaded_nanny_events(ctx_factory):
     # https://github.com/inducer/pyopencl/issues/296
@@ -1236,6 +1387,10 @@ def create_arrays_thread(n1=10, n2=20):
     t1.join()
     t2.join()
 
+# }}}
+
+
+# {{{ test_empty_ndrange
 
 @pytest.mark.parametrize("empty_shape", [(0,), (3, 0, 2)])
 def test_empty_ndrange(ctx_factory, empty_shape):
@@ -1256,6 +1411,10 @@ def test_empty_ndrange(ctx_factory, empty_shape):
 
     prg.add_two(queue, a.shape, None, a.data, allow_empty_ndrange=True)
 
+# }}}
+
+
+# {{{ test_command_queue_context_manager
 
 def test_command_queue_context_manager(ctx_factory):
     ctx = ctx_factory()
@@ -1265,6 +1424,10 @@ def test_command_queue_context_manager(ctx_factory):
     with pytest.warns(cl.CommandQueueUsedAfterExit):
         q.flush()
 
+# }}}
+
+
+# {{{ test_capture_call
 
 def test_capture_call(ctx_factory):
     ctx = ctx_factory()
@@ -1298,6 +1461,8 @@ def test_capture_call(ctx_factory):
     exec(compile(sio.getvalue(), "captured.py", "exec"), compile_dict)
     compile_dict["main"]()
 
+# }}}
+
 
 if __name__ == "__main__":
     # make sure that import failures get reported, instead of skipping the tests.
@@ -1309,3 +1474,5 @@ def test_capture_call(ctx_factory):
     else:
         from pytest import main
         main([__file__])
+
+# vim: foldmethod=marker