From ab78e468fb688e42fe41afc71ae7a18f3e68c101 Mon Sep 17 00:00:00 2001
From: Andreas Kloeckner <inform@tiker.net>
Date: Fri, 1 Jul 2022 01:09:44 -0500
Subject: [PATCH 01/16] __init__: import warn once at the top

---
 pyopencl/__init__.py | 10 +---------
 1 file changed, 1 insertion(+), 9 deletions(-)

diff --git a/pyopencl/__init__.py b/pyopencl/__init__.py
index 6420e5afb..016ad4d82 100644
--- a/pyopencl/__init__.py
+++ b/pyopencl/__init__.py
@@ -21,6 +21,7 @@
 """
 
 from sys import intern
+from warnings import warn
 
 from pyopencl.version import VERSION, VERSION_STATUS, VERSION_TEXT  # noqa
 
@@ -43,7 +44,6 @@
     import os
     from os.path import dirname, join, realpath
     if realpath(join(os.getcwd(), "pyopencl")) == realpath(dirname(__file__)):
-        from warnings import warn
         warn("It looks like you are importing PyOpenCL from "
                 "its source directory. This likely won't work.")
     raise
@@ -267,7 +267,6 @@ class CommandQueueUsedAfterExit(UserWarning):
 
 def compiler_output(text):
     import os
-    from warnings import warn
     if int(os.environ.get("PYOPENCL_COMPILER_OUTPUT", "0")):
         warn(text, CompilerWarning)
     else:
@@ -389,7 +388,6 @@ def enable_debugging(platform_or_context):
         import os
         os.environ["CPU_MAX_COMPUTE_UNITS"] = "1"
     else:
-        from warnings import warn
         warn("do not know how to enable debugging on '%s'"
                 % platform.name)
 
@@ -428,7 +426,6 @@ def _get_prg(self):
             return self._prg
         else:
             # "no program" can only happen in from-source case.
-            from warnings import warn
             warn("Pre-build attribute access defeats compiler caching.",
                     stacklevel=3)
 
@@ -662,7 +659,6 @@ def device_hashable_model_and_version_identifier(self):
         return ("v1", self.vendor, self.vendor_id, self.name, self.version)
 
     def device_persistent_unique_id(self):
-        from warnings import warn
         warn("Device.persistent_unique_id is deprecated. "
                 "Use Device.hashable_model_and_version_identifier instead.",
                 DeprecationWarning, stacklevel=2)
@@ -684,7 +680,6 @@ def device_persistent_unique_id(self):
 
     def context_init(self, devices, properties, dev_type, cache_dir=None):
         if cache_dir is not None:
-            from warnings import warn
             warn("The 'cache_dir' argument to the Context constructor "
                 "is deprecated and no longer has an effect. "
                 "It was removed because it only applied to the wrapper "
@@ -970,7 +965,6 @@ def image_init(self, context, flags, format, shape=None, pitches=None,
 
         if hostbuf is not None and not \
                 (flags & (mem_flags.USE_HOST_PTR | mem_flags.COPY_HOST_PTR)):
-            from warnings import warn
             warn("'hostbuf' was passed, but no memory flags to make use of it.")
 
         if hostbuf is None and pitches is not None:
@@ -1043,7 +1037,6 @@ def image_init(self, context, flags, format, shape=None, pitches=None,
 
     class _ImageInfoGetter:
         def __init__(self, event):
-            from warnings import warn
             warn("Image.image.attr is deprecated and will go away in 2021. "
                     "Use Image.attr directly, instead.")
 
@@ -1927,7 +1920,6 @@ def enqueue_barrier(queue, wait_for=None):
 
 def enqueue_fill_buffer(queue, mem, pattern, offset, size, wait_for=None):
     if not (queue._get_cl_version() >= (1, 2) and get_cl_header_version() >= (1, 2)):
-        from warnings import warn
         warn("The context for this queue does not declare OpenCL 1.2 support, so "
                 "the next thing you might see is a crash")
 

From 30e0085adaf1246b1a40f3a1707b80a1d1216b81 Mon Sep 17 00:00:00 2001
From: Andreas Kloeckner <inform@tiker.net>
Date: Fri, 1 Jul 2022 01:14:58 -0500
Subject: [PATCH 02/16] Add is_queue_in_order

---
 src/wrap_cl.hpp | 13 +++++++++++++
 1 file changed, 13 insertions(+)

diff --git a/src/wrap_cl.hpp b/src/wrap_cl.hpp
index 98964056f..8de8cd0df 100644
--- a/src/wrap_cl.hpp
+++ b/src/wrap_cl.hpp
@@ -496,6 +496,19 @@ namespace pyopencl
   // }}}
 
 
+  // {{{ utility functions
+
+  inline bool is_queue_in_order(cl_command_queue queue)
+  {
+      cl_command_queue_properties param_value;
+      PYOPENCL_CALL_GUARDED(clGetCommandQueueInfo,
+          (queue, CL_QUEUE_PROPERTIES, sizeof(param_value), &param_value, 0));
+      return !(param_value & CL_QUEUE_OUT_OF_ORDER_EXEC_MODE_ENABLE);
+  }
+
+  // }}}
+
+
   // {{{ buffer interface helper
 
 

From bfcfcc9fae09529bdc3ff2b119572c8d778b4cf1 Mon Sep 17 00:00:00 2001
From: Andreas Kloeckner <inform@tiker.net>
Date: Fri, 1 Jul 2022 01:15:32 -0500
Subject: [PATCH 03/16] Add command_queue_ref

---
 src/wrap_cl.hpp | 70 +++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 70 insertions(+)

diff --git a/src/wrap_cl.hpp b/src/wrap_cl.hpp
index 8de8cd0df..8147cb183 100644
--- a/src/wrap_cl.hpp
+++ b/src/wrap_cl.hpp
@@ -1668,6 +1668,76 @@ namespace pyopencl
   // }}}
 
 
+  // {{{ command_queue_ref
+
+  // In contrast to command_queue, command_queue_ref is "nullable", i.e.
+  // it is a RAII *optional* reference to a command queue.
+
+  class command_queue_ref
+  {
+    private:
+      bool m_valid;
+      cl_command_queue m_queue;
+
+    public:
+      command_queue_ref()
+        : m_valid(false)
+      {}
+
+      command_queue_ref(cl_command_queue queue)
+        : m_valid(true), m_queue(queue)
+      {
+        PYOPENCL_CALL_GUARDED(clRetainCommandQueue, (m_queue));
+      }
+
+      command_queue_ref(command_queue_ref &&src)
+        : m_valid(src.m_valid), m_queue(src.m_queue)
+      {
+        src.m_valid = false;
+      }
+
+      command_queue_ref(const command_queue_ref &) = delete;
+      command_queue_ref &operator=(const command_queue_ref &) = delete;
+
+      ~command_queue_ref()
+      {
+        reset();
+      }
+
+      bool is_valid() const
+      {
+        return m_valid;
+      }
+
+      cl_command_queue data() const
+      {
+        if (m_valid)
+          return m_queue;
+        else
+          throw error("command_queue_ref.data", CL_INVALID_VALUE,
+              "command_queue_ref is not valid");
+      }
+
+      void reset()
+      {
+        if (m_valid)
+          PYOPENCL_CALL_GUARDED_CLEANUP(clReleaseCommandQueue, (m_queue));
+        m_valid = false;
+      }
+
+      void set(cl_command_queue queue)
+      {
+        if (m_valid)
+          PYOPENCL_CALL_GUARDED(clReleaseCommandQueue, (m_queue));
+        m_queue = queue;
+        PYOPENCL_CALL_GUARDED(clRetainCommandQueue, (m_queue));
+        m_valid = true;
+      }
+  };
+
+  // }}}
+
+
   // {{{ event/synchronization
 
   class event : noncopyable

From defe1850dd4a1358484998201cc47da666c23e61 Mon Sep 17 00:00:00 2001
From: Andreas Kloeckner <inform@tiker.net>
Date: Thu, 30 Jun 2022 23:18:16 -0500
Subject: [PATCH 04/16] Add, use enqueue_fill

---
 doc/runtime_memory.rst |  2 ++
 pyopencl/__init__.py   | 21 +++++++++++++++++++++
 pyopencl/array.py      |  4 ++--
 3 files changed, 25 insertions(+), 2 deletions(-)

diff --git a/doc/runtime_memory.rst b/doc/runtime_memory.rst
index f4e01f266..ffe6661e8 100644
--- a/doc/runtime_memory.rst
+++ b/doc/runtime_memory.rst
@@ -281,6 +281,8 @@ Transfers
 
 .. autofunction:: enqueue_copy(queue, dest, src, **kwargs)
 
+.. autofunction:: enqueue_fill(queue, dest, src, **kwargs)
+
 Mapping Memory into Host Address Space
 --------------------------------------
 
diff --git a/pyopencl/__init__.py b/pyopencl/__init__.py
index 016ad4d82..86c77f05b 100644
--- a/pyopencl/__init__.py
+++ b/pyopencl/__init__.py
@@ -22,6 +22,7 @@
 
 from sys import intern
 from warnings import warn
+from typing import Union, Any
 
 from pyopencl.version import VERSION, VERSION_STATUS, VERSION_TEXT  # noqa
 
@@ -1815,6 +1816,26 @@ def enqueue_copy(queue, dest, src, **kwargs):
 # }}}
 
 
+# {{{ enqueue_fill
+
+def enqueue_fill(queue: CommandQueue, dest: Union[MemoryObjectHolder, SVM],
+        pattern: Any, size: int, *, offset: int = 0, wait_for=None) -> Event:
+    """
+    .. versionadded:: 2022.2
+    """
+    if isinstance(dest, MemoryObjectHolder):
+        return enqueue_fill_buffer(queue, dest, pattern, offset, size, wait_for)
+    elif isinstance(dest, SVM):
+        if offset:
+            raise NotImplementedError("enqueue_fill with SVM does not yet support "
+                    "offsets")
+        return enqueue_svm_memfill(queue, dest, pattern, size, wait_for)
+    else:
+        raise TypeError(f"enqueue_fill does not know how to fill '{type(dest)}'")
+
+# }}}
+
+
 # {{{ image creation
 
 DTYPE_TO_CHANNEL_TYPE = {
diff --git a/pyopencl/array.py b/pyopencl/array.py
index 4e8b52a13..fe6162645 100644
--- a/pyopencl/array.py
+++ b/pyopencl/array.py
@@ -1458,8 +1458,8 @@ def _zero_fill(self, queue=None, wait_for=None):
         # https://github.com/inducer/pyopencl/issues/395
         if cl_version_gtr_1_2 and not (on_nvidia and self.nbytes >= 2**31):
             self.add_event(
-                    cl.enqueue_fill_buffer(queue, self.base_data, np.int8(0),
-                        self.offset, self.nbytes, wait_for=wait_for))
+                    cl.enqueue_fill(queue, self.base_data, np.int8(0),
+                        self.nbytes, offset=self.offset, wait_for=wait_for))
         else:
             zero = np.zeros((), self.dtype)
             self.fill(zero, queue=queue)

From 8c15afc5d431a6c50a6043fe180ea1fd307955dd Mon Sep 17 00:00:00 2001
From: Andreas Kloeckner <inform@tiker.net>
Date: Mon, 29 Mar 2021 00:21:04 -0500
Subject: [PATCH 05/16] Experiment with SVM backing arrays

---
 examples/demo_array_svm.py | 57 ++++++++++++++++++++++++++++++++++++++
 pyopencl/array.py          | 22 +++++++++++----
 2 files changed, 73 insertions(+), 6 deletions(-)
 create mode 100644 examples/demo_array_svm.py

diff --git a/examples/demo_array_svm.py b/examples/demo_array_svm.py
new file mode 100644
index 000000000..07454f1fe
--- /dev/null
+++ b/examples/demo_array_svm.py
@@ -0,0 +1,57 @@
+import pyopencl as cl
+import pyopencl.array as cl_array
+import numpy as np
+import numpy.linalg as la
+
+n = 5000000
+a = np.random.rand(n).astype(np.float32)
+b = np.random.rand(n).astype(np.float32)
+
+
+class SVMAllocator:
+    def __init__(self, ctx, flags, alignment):
+        self._context = ctx
+        self._flags = flags
+        self._alignment = alignment
+
+    def __call__(self, nbytes):
+        return cl.SVM(cl.svm_empty(
+                ctx, self._flags, (nbytes,), np.int8, "C", self._alignment))
+
+
+ctx = cl.create_some_context()
+queue = cl.CommandQueue(ctx)
+
+alloc = SVMAllocator(ctx,
+        cl.svm_mem_flags.READ_WRITE | cl.svm_mem_flags.SVM_FINE_GRAIN_BUFFER,
+        0)
+
+a_dev = cl_array.to_device(queue, a, allocator=alloc)
+print("A_DEV", a_dev.data.mem.nbytes, a_dev.data.mem.__array_interface__)
+b_dev = cl_array.to_device(queue, b, allocator=alloc)
+dest_dev = cl_array.empty_like(a_dev)
+print("DEST", dest_dev.data.mem.__array_interface__)
+
+prg = cl.Program(ctx, """
+    __kernel void sum(__global const float *a,
+    __global const float *b, __global float *c)
+    {
+      int gid = get_global_id(0);
+      c[gid] = a[gid] + b[gid];
+    }
+    """).build()
+
+knl = prg.sum  # Use this Kernel object for repeated calls
+knl(queue, a.shape, None, a_dev.data, b_dev.data, dest_dev.data)
+
+# PROBLEM: numpy frees the temporary out of (a_dev+b_dev) before
+# we're done with it
+diff = dest_dev - (a_dev+b_dev)
+if 0:
+    diff = diff.get()
+    np.set_printoptions(linewidth=400)
+    print(dest_dev)
+    print((a_dev+b_dev).get())
+    print(diff)
+    print(la.norm(diff))
+    print("A_DEV", a_dev.data.mem.__array_interface__)
diff --git a/pyopencl/array.py b/pyopencl/array.py
index fe6162645..c191d0ed4 100644
--- a/pyopencl/array.py
+++ b/pyopencl/array.py
@@ -720,9 +720,14 @@ def set(self, ary, queue=None, async_=None, **kwargs):
                     stacklevel=2)
 
         if self.size:
-            event1 = cl.enqueue_copy(queue or self.queue, self.base_data, ary,
-                    device_offset=self.offset,
-                    is_blocking=not async_)
+            if self.offset:
+                event1 = cl.enqueue_copy(queue or self.queue, self.base_data, ary,
+                        device_offset=self.offset,
+                        is_blocking=not async_)
+            else:
+                event1 = cl.enqueue_copy(queue or self.queue, self.base_data, ary,
+                        is_blocking=not async_)
+
             self.add_event(event1)
 
     def _get(self, queue=None, ary=None, async_=None, **kwargs):
@@ -770,9 +775,14 @@ def _get(self, queue=None, ary=None, async_=None, **kwargs):
                     "to associate one.")
 
         if self.size:
-            event1 = cl.enqueue_copy(queue, ary, self.base_data,
-                    device_offset=self.offset,
-                    wait_for=self.events, is_blocking=not async_)
+            if self.offset:
+                event1 = cl.enqueue_copy(queue, ary, self.base_data,
+                        device_offset=self.offset,
+                        wait_for=self.events, is_blocking=not async_)
+            else:
+                event1 = cl.enqueue_copy(queue, ary, self.base_data,
+                        wait_for=self.events, is_blocking=not async_)
+
             self.add_event(event1)
         else:
             event1 = None

From a286cc37d642f20e294a17a2fbdfdd26465ee5b0 Mon Sep 17 00:00:00 2001
From: Andreas Kloeckner <inform@tiker.net>
Date: Mon, 27 Jun 2022 01:12:54 -0500
Subject: [PATCH 06/16] Allow tying an svm_allocation to a queue

---
 pyopencl/__init__.py   | 14 +++++++++----
 src/wrap_cl.hpp        | 45 ++++++++++++++++++++++++++++++++++++++----
 src/wrap_cl_part_2.cpp | 10 +++++++++-
 3 files changed, 60 insertions(+), 9 deletions(-)

diff --git a/pyopencl/__init__.py b/pyopencl/__init__.py
index 86c77f05b..5b8d8b72e 100644
--- a/pyopencl/__init__.py
+++ b/pyopencl/__init__.py
@@ -1146,12 +1146,13 @@ def memory_map_exit(self, exc_type, exc_val, exc_tb):
     if get_cl_header_version() >= (2, 0):
         svmallocation_old_init = SVMAllocation.__init__
 
-    def svmallocation_init(self, ctx, size, alignment, flags, _interface=None):
+    def svmallocation_init(self, ctx, size, alignment, flags, _interface=None,
+            queue=None):
         """
         :arg ctx: a :class:`Context`
         :arg flags: some of :class:`svm_mem_flags`.
         """
-        svmallocation_old_init(self, ctx, size, alignment, flags)
+        svmallocation_old_init(self, ctx, size, alignment, flags, queue)
 
         # mem_flags.READ_ONLY applies to kernels, not the host
         read_write = True
@@ -1996,7 +1997,7 @@ def enqueue_svm_migratemem(queue, svms, flags, wait_for=None):
             wait_for)
 
 
-def svm_empty(ctx, flags, shape, dtype, order="C", alignment=None):
+def svm_empty(ctx, flags, shape, dtype, order="C", alignment=None, queue=None):
     """Allocate an empty :class:`numpy.ndarray` of the given *shape*, *dtype*
     and *order*. (See :func:`numpy.empty` for the meaning of these arguments.)
     The array will be allocated in shared virtual memory belonging
@@ -2014,6 +2015,10 @@ def svm_empty(ctx, flags, shape, dtype, order="C", alignment=None):
     will likely want to wrap the returned array in an :class:`SVM` tag.
 
     .. versionadded:: 2016.2
+
+    .. versionchanged:: 2022.2
+
+        *queue* argument added.
     """
 
     dtype = np.dtype(dtype)
@@ -2060,7 +2065,8 @@ def svm_empty(ctx, flags, shape, dtype, order="C", alignment=None):
     if alignment is None:
         alignment = itemsize
 
-    svm_alloc = SVMAllocation(ctx, nbytes, alignment, flags, _interface=interface)
+    svm_alloc = SVMAllocation(ctx, nbytes, alignment, flags, _interface=interface,
+            queue=queue)
     return np.asarray(svm_alloc)
 
 
diff --git a/src/wrap_cl.hpp b/src/wrap_cl.hpp
index 8147cb183..2582a5103 100644
--- a/src/wrap_cl.hpp
+++ b/src/wrap_cl.hpp
@@ -3562,16 +3562,28 @@ namespace pyopencl
   };
 
 
-  class svm_allocation : noncopyable
+  class svm_allocation
   {
     private:
       std::shared_ptr<context> m_context;
       void *m_allocation;
+      command_queue_ref m_queue;
+      // FIXME Should keep a list of events so that we can wait for users
+      // to finish in the case of out-of-order queues.
 
     public:
-      svm_allocation(std::shared_ptr<context> const &ctx, size_t size, cl_uint alignment, cl_svm_mem_flags flags)
+      svm_allocation(std::shared_ptr<context> const &ctx, size_t size, cl_uint alignment,
+          cl_svm_mem_flags flags, const command_queue *queue = nullptr)
         : m_context(ctx)
       {
+        if (queue)
+        {
+          m_queue.set(queue->data());
+          if (!is_queue_in_order(m_queue.data()))
+            throw error("SVMAllocation.__init__", CL_INVALID_VALUE,
+                "supplying an out-of-order queue to SVMAllocation is invalid");
+        }
+
         PYOPENCL_PRINT_CALL_TRACE("clSVMalloc");
         m_allocation = clSVMAlloc(
             ctx->data(),
@@ -3581,6 +3593,9 @@ namespace pyopencl
           throw pyopencl::error("clSVMAlloc", CL_OUT_OF_RESOURCES);
       }
 
+      svm_allocation(const svm_allocation &) = delete;
+      svm_allocation &operator=(const svm_allocation &) = delete;
+
       ~svm_allocation()
       {
         if (m_allocation)
@@ -3593,8 +3608,20 @@ namespace pyopencl
           throw error("SVMAllocation.release", CL_INVALID_VALUE,
               "trying to double-unref svm allocation");
 
-        clSVMFree(m_context->data(), m_allocation);
-        m_allocation = nullptr;
+        if (m_queue.is_valid())
+        {
+          PYOPENCL_CALL_GUARDED_CLEANUP(clEnqueueSVMFree, (
+                m_queue.data(), 1, &m_allocation,
+                nullptr, nullptr,
+                0, nullptr, nullptr));
+          m_queue.reset();
+        }
+        else
+        {
+          PYOPENCL_PRINT_CALL_TRACE("clSVMFree");
+          clSVMFree(m_context->data(), m_allocation);
+          m_allocation = nullptr;
+        }
       }
 
       void enqueue_release(command_queue &queue, py::object py_wait_for)
@@ -3634,6 +3661,16 @@ namespace pyopencl
       {
         return m_allocation != other.m_allocation;
       }
+
+      void bind_to_queue(command_queue const &queue)
+      {
+        m_queue.set(queue.data());
+      }
+
+      void unbind_from_queue()
+      {
+        m_queue.reset();
+      }
   };
 
 
diff --git a/src/wrap_cl_part_2.cpp b/src/wrap_cl_part_2.cpp
index 0c9a0d1b1..ace5e9777 100644
--- a/src/wrap_cl_part_2.cpp
+++ b/src/wrap_cl_part_2.cpp
@@ -305,7 +305,13 @@ void pyopencl_expose_part_2(py::module &m)
   {
     typedef svm_allocation cls;
     py::class_<cls>(m, "SVMAllocation", py::dynamic_attr())
-      .def(py::init<std::shared_ptr<context>, size_t, cl_uint, cl_svm_mem_flags>())
+      .def(py::init<std::shared_ptr<context>, size_t, cl_uint, cl_svm_mem_flags, const command_queue *>(),
+          py::arg("context"),
+          py::arg("size"),
+          py::arg("alignment"),
+          py::arg("flags"),
+          py::arg("queue").none(true)=py::none()
+          )
       .DEF_SIMPLE_METHOD(release)
       .def("enqueue_release", &cls::enqueue_release,
           ":returns: a :class:`pyopencl.Event`\n\n"
@@ -314,6 +320,8 @@ void pyopencl_expose_part_2(py::module &m)
       .def(py::self == py::self)
       .def(py::self != py::self)
       .def("__hash__", &cls::ptr_as_int)
+      .DEF_SIMPLE_METHOD(bind_to_queue)
+      .DEF_SIMPLE_METHOD(unbind_from_queue)
       ;
   }
 

From d893cd737ece8af456f25a573ed062f77876d5f5 Mon Sep 17 00:00:00 2001
From: Andreas Kloeckner <inform@tiker.net>
Date: Mon, 27 Jun 2022 01:13:52 -0500
Subject: [PATCH 07/16] Add fold markers in wrap_mempool

---
 src/wrap_mempool.cpp | 13 +++++++++++++
 1 file changed, 13 insertions(+)

diff --git a/src/wrap_mempool.cpp b/src/wrap_mempool.cpp
index 6b014ba5e..2738ccca2 100644
--- a/src/wrap_mempool.cpp
+++ b/src/wrap_mempool.cpp
@@ -70,6 +70,8 @@ namespace
   };
 
 
+  // {{{ cl allocators
+
   class cl_allocator_base
   {
     protected:
@@ -210,8 +212,10 @@ namespace
       }
   };
 
+  // }}}
 
 
+  // {{{ allocator_call
 
   inline
   pyopencl::buffer *allocator_call(cl_allocator_base &alloc, size_t size)
@@ -256,8 +260,10 @@ namespace
     }
   }
 
+  // }}}
 
 
+  // {{{ pooled_buffer
 
   class pooled_buffer
     : public pyopencl::pooled_allocation<pyopencl::memory_pool<cl_allocator_base> >,
@@ -278,8 +284,10 @@ namespace
       { return ptr(); }
   };
 
+  // }}}
 
 
+  // {{{{ device_pool_allocate
 
   pooled_buffer *device_pool_allocate(
       std::shared_ptr<pyopencl::memory_pool<cl_allocator_base> > pool,
@@ -288,6 +296,9 @@ namespace
     return new pooled_buffer(pool, sz);
   }
 
+  // }}}
+
+
 
 
 
@@ -398,3 +409,5 @@ void pyopencl_expose_mempool(py::module &m)
       ;
   }
 }
+
+// vim: foldmethod=marker

From 7f9e0822ea8e092fd831909ec36ea15e15ce38bc Mon Sep 17 00:00:00 2001
From: Andreas Kloeckner <inform@tiker.net>
Date: Mon, 27 Jun 2022 01:14:15 -0500
Subject: [PATCH 08/16] Draft svm_allocator for mempool

---
 src/wrap_mempool.cpp | 54 ++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 54 insertions(+)

diff --git a/src/wrap_mempool.cpp b/src/wrap_mempool.cpp
index 2738ccca2..35630b036 100644
--- a/src/wrap_mempool.cpp
+++ b/src/wrap_mempool.cpp
@@ -299,6 +299,60 @@ namespace
   // }}}
 
 
+  // {{{ svm allocator
+
+  // FIXME: Does this need deferred and immediate just like the buffer-level
+  // allocators? (I.e. can I tell whether I am out of memory just from allocations?)
+  class svm_allocator
+  {
+    protected:
+      std::shared_ptr<pyopencl::context> m_context;
+      cl_uint m_alignment;
+      cl_mem_flags m_flags;
+
+    public:
+      svm_allocator(std::shared_ptr<pyopencl::context> const &ctx,
+          cl_uint alignment, cl_mem_flags flags=CL_MEM_READ_WRITE)
+        : m_context(ctx), m_alignment(alignment), m_flags(flags)
+      {
+        if (flags & (CL_MEM_USE_HOST_PTR | CL_MEM_COPY_HOST_PTR))
+          throw pyopencl::error("Allocator", CL_INVALID_VALUE,
+              "cannot specify USE_HOST_PTR or COPY_HOST_PTR flags");
+      }
+
+      svm_allocator(svm_allocator const &src)
+      : m_context(src.m_context), m_alignment(src.m_alignment),
+      m_flags(src.m_flags)
+      { }
+
+      virtual ~svm_allocator()
+      { }
+
+      typedef void *pointer_type;
+      typedef size_t size_type;
+
+      pointer_type allocate(size_type size)
+      {
+        if (size == 0)
+          return nullptr;
+
+        PYOPENCL_PRINT_CALL_TRACE("clSVMalloc");
+        return clSVMAlloc(m_context->data(), m_flags, size, m_alignment);
+      }
+
+      void free(pointer_type p)
+      {
+        clSVMFree(m_context->data(), p);
+      }
+
+      void try_release_blocks()
+      {
+        pyopencl::run_python_gc();
+      }
+  };
+
+  // }}}
+
 
 
 

From 823e893faad822dc3eb842f31ac2b4d0597dae17 Mon Sep 17 00:00:00 2001
From: Andreas Kloeckner <inform@tiker.net>
Date: Fri, 1 Jul 2022 01:11:50 -0500
Subject: [PATCH 09/16] enqueue_copy for SVM: Accept no-op {src,dest}_offset,
 byte_count parameters

---
 pyopencl/__init__.py   | 3 +++
 src/wrap_cl_part_2.cpp | 1 +
 2 files changed, 4 insertions(+)

diff --git a/pyopencl/__init__.py b/pyopencl/__init__.py
index 5b8d8b72e..12ae3be04 100644
--- a/pyopencl/__init__.py
+++ b/pyopencl/__init__.py
@@ -1779,6 +1779,9 @@ def enqueue_copy(queue, dest, src, **kwargs):
             src = SVM(src)
 
         is_blocking = kwargs.pop("is_blocking", True)
+        assert kwargs.pop("src_offset", 0) == 0
+        assert kwargs.pop("dest_offset", 0) == 0
+        assert "byte_count" not in kwargs or kwargs.pop("byte_count") == src._size()
         return _cl._enqueue_svm_memcpy(queue, is_blocking, dest, src, **kwargs)
 
     else:
diff --git a/src/wrap_cl_part_2.cpp b/src/wrap_cl_part_2.cpp
index ace5e9777..a6f604ee7 100644
--- a/src/wrap_cl_part_2.cpp
+++ b/src/wrap_cl_part_2.cpp
@@ -299,6 +299,7 @@ void pyopencl_expose_part_2(py::module &m)
     typedef svm_arg_wrapper cls;
     py::class_<cls>(m, "SVM", py::dynamic_attr())
       .def(py::init<py::object>())
+      .def("_size", &cls::size)
       ;
   }
 

From b3b6345f99e5ab043fa59437c022cdbcfa1dfe53 Mon Sep 17 00:00:00 2001
From: Andreas Kloeckner <inform@tiker.net>
Date: Fri, 1 Jul 2022 01:17:38 -0500
Subject: [PATCH 10/16] Add SVMAllocator (in Python)

---
 pyopencl/tools.py | 19 +++++++++++++++++++
 1 file changed, 19 insertions(+)

diff --git a/pyopencl/tools.py b/pyopencl/tools.py
index 27adac75b..1b027f0ff 100644
--- a/pyopencl/tools.py
+++ b/pyopencl/tools.py
@@ -67,6 +67,25 @@ def _register_types():
 # }}}
 
 
+# {{{ svm allocator
+
+# FIXME: Replace me with C++
+class SVMAllocator:
+    def __init__(self, ctx, flags, *, alignment=0, queue=None):
+        self._context = ctx
+        self._flags = flags
+        self._alignment = alignment
+        self._queue = queue
+
+    def __call__(self, nbytes):
+        import pyopencl as cl
+        return cl.SVM(cl.svm_empty(
+                self._context, self._flags, (nbytes,), np.int8, "C", self._alignment,
+                self._queue))
+
+# }}}
+
+
 # {{{ first-arg caches
 
 _first_arg_dependent_caches = []

From 81f06d65e9aa40db827bc595f6547aa4c96d684d Mon Sep 17 00:00:00 2001
From: Andreas Kloeckner <inform@tiker.net>
Date: Fri, 1 Jul 2022 01:18:13 -0500
Subject: [PATCH 11/16] Jostle some section headers in wrap_cl.hpp

---
 src/wrap_cl.hpp | 11 +++++++----
 1 file changed, 7 insertions(+), 4 deletions(-)

diff --git a/src/wrap_cl.hpp b/src/wrap_cl.hpp
index 2582a5103..ec6089ccc 100644
--- a/src/wrap_cl.hpp
+++ b/src/wrap_cl.hpp
@@ -137,10 +137,8 @@
 // }}}
 
 
+// {{{ macros and typedefs for wrappers
 
-
-
-// {{{ tools
 #if PY_VERSION_HEX >= 0x02050000
   typedef Py_ssize_t PYOPENCL_BUFFER_SIZE_T;
 #else
@@ -262,6 +260,7 @@
 
 // }}}
 
+
 // {{{ tracing and error reporting
 #ifdef PYOPENCL_TRACE
   #define PYOPENCL_PRINT_CALL_TRACE(NAME) \
@@ -329,6 +328,7 @@
 
 // }}}
 
+
 // {{{ get_info helpers
 #define PYOPENCL_GET_OPAQUE_INFO(WHAT, FIRST_ARG, SECOND_ARG, CL_TYPE, TYPE) \
   { \
@@ -383,6 +383,7 @@
 
 // }}}
 
+
 // {{{ event helpers --------------------------------------------------------------
 #define PYOPENCL_PARSE_WAIT_FOR \
     cl_uint num_events_in_wait_list = 0; \
@@ -424,7 +425,9 @@
 
 // }}}
 
+
 // {{{ equality testing
+
 #define PYOPENCL_EQUALITY_TESTS(cls) \
     bool operator==(cls const &other) const \
     { return data() == other.data(); } \
@@ -432,8 +435,8 @@
     { return data() != other.data(); } \
     long hash() const \
     { return (long) (intptr_t) data(); }
-// }}}
 
+// }}}
 
 
 namespace pyopencl

From df4940c652e3cd43a886b8400bd3e0ff2bcbbae7 Mon Sep 17 00:00:00 2001
From: Andreas Kloeckner <inform@tiker.net>
Date: Sat, 2 Jul 2022 00:07:27 -0500
Subject: [PATCH 12/16] Add SVM._ptr_as_int()

---
 src/wrap_cl_part_2.cpp | 1 +
 1 file changed, 1 insertion(+)

diff --git a/src/wrap_cl_part_2.cpp b/src/wrap_cl_part_2.cpp
index a6f604ee7..a2b5cde17 100644
--- a/src/wrap_cl_part_2.cpp
+++ b/src/wrap_cl_part_2.cpp
@@ -299,6 +299,7 @@ void pyopencl_expose_part_2(py::module &m)
     typedef svm_arg_wrapper cls;
     py::class_<cls>(m, "SVM", py::dynamic_attr())
       .def(py::init<py::object>())
+      .def("_ptr_as_int", [](cls &self) { return (intptr_t) self.ptr(); })
       .def("_size", &cls::size)
       ;
   }

From 3abf89cc702fafc4a819866fea215d3c74253a20 Mon Sep 17 00:00:00 2001
From: Andreas Kloeckner <inform@tiker.net>
Date: Thu, 7 Jul 2022 17:52:07 +0200
Subject: [PATCH 13/16] Move from SVM(np.ndarray(SVMAllocation)) to bare
 SVMAllocation

---
 examples/demo_array_svm.py |  23 ++---
 pyopencl/__init__.py       |  60 +++++------
 pyopencl/tools.py          |   6 +-
 src/wrap_cl.hpp            | 200 ++++++++++++++++++++++++++++++++-----
 src/wrap_cl_part_2.cpp     |  33 ++++--
 5 files changed, 234 insertions(+), 88 deletions(-)

diff --git a/examples/demo_array_svm.py b/examples/demo_array_svm.py
index 07454f1fe..054e582c0 100644
--- a/examples/demo_array_svm.py
+++ b/examples/demo_array_svm.py
@@ -1,36 +1,24 @@
 import pyopencl as cl
 import pyopencl.array as cl_array
+from pyopencl.tools import SVMAllocator
 import numpy as np
 import numpy.linalg as la
 
-n = 5000000
+n = 500000
 a = np.random.rand(n).astype(np.float32)
 b = np.random.rand(n).astype(np.float32)
 
 
-class SVMAllocator:
-    def __init__(self, ctx, flags, alignment):
-        self._context = ctx
-        self._flags = flags
-        self._alignment = alignment
-
-    def __call__(self, nbytes):
-        return cl.SVM(cl.svm_empty(
-                ctx, self._flags, (nbytes,), np.int8, "C", self._alignment))
-
-
 ctx = cl.create_some_context()
 queue = cl.CommandQueue(ctx)
 
-alloc = SVMAllocator(ctx,
-        cl.svm_mem_flags.READ_WRITE | cl.svm_mem_flags.SVM_FINE_GRAIN_BUFFER,
-        0)
+alloc = SVMAllocator(ctx, cl.svm_mem_flags.READ_WRITE, queue=queue)
 
 a_dev = cl_array.to_device(queue, a, allocator=alloc)
-print("A_DEV", a_dev.data.mem.nbytes, a_dev.data.mem.__array_interface__)
+print("A_DEV", a_dev.data)
 b_dev = cl_array.to_device(queue, b, allocator=alloc)
 dest_dev = cl_array.empty_like(a_dev)
-print("DEST", dest_dev.data.mem.__array_interface__)
+print("DEST", dest_dev.data)
 
 prg = cl.Program(ctx, """
     __kernel void sum(__global const float *a,
@@ -47,6 +35,7 @@ def __call__(self, nbytes):
 # PROBLEM: numpy frees the temporary out of (a_dev+b_dev) before
 # we're done with it
 diff = dest_dev - (a_dev+b_dev)
+
 if 0:
     diff = diff.get()
     np.set_printoptions(linewidth=400)
diff --git a/pyopencl/__init__.py b/pyopencl/__init__.py
index 12ae3be04..22562d1f9 100644
--- a/pyopencl/__init__.py
+++ b/pyopencl/__init__.py
@@ -200,11 +200,9 @@
 
 if get_cl_header_version() >= (2, 0):
     from pyopencl._cl import (  # noqa
-        SVMAllocation,
+        SVMPointer,
         SVM,
-
-        # FIXME
-        #enqueue_svm_migratemem,
+        SVMAllocation,
         )
 
 if _cl.have_gl():
@@ -1144,25 +1142,19 @@ def memory_map_exit(self, exc_type, exc_val, exc_tb):
             """
 
     if get_cl_header_version() >= (2, 0):
-        svmallocation_old_init = SVMAllocation.__init__
-
-    def svmallocation_init(self, ctx, size, alignment, flags, _interface=None,
-            queue=None):
-        """
-        :arg ctx: a :class:`Context`
-        :arg flags: some of :class:`svm_mem_flags`.
-        """
-        svmallocation_old_init(self, ctx, size, alignment, flags, queue)
-
-        # mem_flags.READ_ONLY applies to kernels, not the host
-        read_write = True
-        _interface["data"] = (
-                int(self._ptr_as_int()), not read_write)
-
-        self.__array_interface__ = _interface
-
-    if get_cl_header_version() >= (2, 0):
-        SVMAllocation.__init__ = svmallocation_init
+        class _ArrayInterfaceSVMAllocation(SVMAllocation):
+            def __init__(self, ctx, size, alignment, flags, _interface=None,
+                    queue=None):
+                """
+                :arg ctx: a :class:`Context`
+                :arg flags: some of :class:`svm_mem_flags`.
+                """
+                super().__init__(ctx, size, alignment, flags, queue)
+
+                # mem_flags.READ_ONLY applies to kernels, not the host
+                read_write = True
+                _interface["data"] = (
+                        int(self._ptr_as_int()), not read_write)
 
     # }}}
 
@@ -1773,15 +1765,14 @@ def enqueue_copy(queue, dest, src, **kwargs):
         else:
             raise ValueError("invalid dest mem object type")
 
-    elif get_cl_header_version() >= (2, 0) and isinstance(dest, SVM):
+    elif get_cl_header_version() >= (2, 0) and isinstance(dest, SVMPointer):
         # to SVM
-        if not isinstance(src, SVM):
+        if not isinstance(src, SVMPointer):
             src = SVM(src)
 
         is_blocking = kwargs.pop("is_blocking", True)
         assert kwargs.pop("src_offset", 0) == 0
         assert kwargs.pop("dest_offset", 0) == 0
-        assert "byte_count" not in kwargs or kwargs.pop("byte_count") == src._size()
         return _cl._enqueue_svm_memcpy(queue, is_blocking, dest, src, **kwargs)
 
     else:
@@ -1807,7 +1798,7 @@ def enqueue_copy(queue, dest, src, **kwargs):
                         queue, src, origin, region, dest, **kwargs)
             else:
                 raise ValueError("invalid src mem object type")
-        elif isinstance(src, SVM):
+        elif isinstance(src, SVMPointer):
             # from svm
             # dest is not a SVM instance, otherwise we'd be in the branch above
             is_blocking = kwargs.pop("is_blocking", True)
@@ -1822,14 +1813,14 @@ def enqueue_copy(queue, dest, src, **kwargs):
 
 # {{{ enqueue_fill
 
-def enqueue_fill(queue: CommandQueue, dest: Union[MemoryObjectHolder, SVM],
+def enqueue_fill(queue: CommandQueue, dest: Union[MemoryObjectHolder, SVMPointer],
         pattern: Any, size: int, *, offset: int = 0, wait_for=None) -> Event:
     """
     .. versionadded:: 2022.2
     """
     if isinstance(dest, MemoryObjectHolder):
         return enqueue_fill_buffer(queue, dest, pattern, offset, size, wait_for)
-    elif isinstance(dest, SVM):
+    elif isinstance(dest, SVMPointer):
         if offset:
             raise NotImplementedError("enqueue_fill with SVM does not yet support "
                     "offsets")
@@ -1961,7 +1952,7 @@ def enqueue_fill_buffer(queue, mem, pattern, offset, size, wait_for=None):
 def enqueue_svm_memfill(queue, dest, pattern, byte_count=None, wait_for=None):
     """Fill shared virtual memory with a pattern.
 
-    :arg dest: a Python buffer object, optionally wrapped in an :class:`SVM` object
+    :arg dest: a Python buffer object, or any implementation of :class:`SVMPointer`.
     :arg pattern: a Python buffer object (e.g. a :class:`numpy.ndarray` with the
         fill pattern to be used.
     :arg byte_count: The size of the memory to be fill. Defaults to the
@@ -1972,8 +1963,8 @@ def enqueue_svm_memfill(queue, dest, pattern, byte_count=None, wait_for=None):
     .. versionadded:: 2016.2
     """
 
-    if not isinstance(dest, SVM):
-        dest = SVM(dest)
+    if not isinstance(dest, SVMPointer):
+        dest = SVMPointer(dest)
 
     return _cl._enqueue_svm_memfill(
             queue, dest, pattern, byte_count=None, wait_for=None)
@@ -1982,7 +1973,7 @@ def enqueue_svm_memfill(queue, dest, pattern, byte_count=None, wait_for=None):
 def enqueue_svm_migratemem(queue, svms, flags, wait_for=None):
     """
     :arg svms: a collection of Python buffer objects (e.g. :mod:`numpy`
-        arrays), optionally wrapped in :class:`SVM` objects.
+        arrays), or any implementation of :class:`SVMPointer`.
     :arg flags: a combination of :class:`mem_migration_flags`
 
     |std-enqueue-blurb|
@@ -2068,7 +2059,8 @@ def svm_empty(ctx, flags, shape, dtype, order="C", alignment=None, queue=None):
     if alignment is None:
         alignment = itemsize
 
-    svm_alloc = SVMAllocation(ctx, nbytes, alignment, flags, _interface=interface,
+    svm_alloc = _ArrayInterfaceSVMAllocation(
+            ctx, nbytes, alignment, flags, _interface=interface,
             queue=queue)
     return np.asarray(svm_alloc)
 
diff --git a/pyopencl/tools.py b/pyopencl/tools.py
index 1b027f0ff..09e4d0566 100644
--- a/pyopencl/tools.py
+++ b/pyopencl/tools.py
@@ -79,9 +79,9 @@ def __init__(self, ctx, flags, *, alignment=0, queue=None):
 
     def __call__(self, nbytes):
         import pyopencl as cl
-        return cl.SVM(cl.svm_empty(
-                self._context, self._flags, (nbytes,), np.int8, "C", self._alignment,
-                self._queue))
+        return cl.SVMAllocation(
+                self._context, nbytes, self._alignment, self._flags,
+                queue=self._queue)
 
 # }}}
 
diff --git a/src/wrap_cl.hpp b/src/wrap_cl.hpp
index ec6089ccc..76477f55f 100644
--- a/src/wrap_cl.hpp
+++ b/src/wrap_cl.hpp
@@ -227,8 +227,6 @@
     }
 
 
-
-
 #define PYOPENCL_RETRY_IF_MEM_ERROR(OPERATION) \
   { \
     bool failed_with_mem_error = false; \
@@ -258,6 +256,17 @@
     } \
   }
 
+
+#define PYOPENCL_GET_SVM_SIZE(NAME) \
+  size_t NAME##_size; \
+  bool NAME##_has_size = false; \
+  try \
+  { \
+    NAME##_size = NAME.size(); \
+    NAME##_has_size = true; \
+  } \
+  catch (size_not_available)  { }
+
 // }}}
 
 
@@ -501,12 +510,12 @@ namespace pyopencl
 
   // {{{ utility functions
 
-  inline bool is_queue_in_order(cl_command_queue queue)
+  inline bool is_queue_out_of_order(cl_command_queue queue)
   {
       cl_command_queue_properties param_value;
       PYOPENCL_CALL_GUARDED(clGetCommandQueueInfo,
           (queue, CL_QUEUE_PROPERTIES, sizeof(param_value), &param_value, 0));
-      return !(param_value & CL_QUEUE_OUT_OF_ORDER_EXEC_MODE_ENABLE);
+      return param_value & CL_QUEUE_OUT_OF_ORDER_EXEC_MODE_ENABLE;
   }
 
   // }}}
@@ -3527,11 +3536,26 @@ namespace pyopencl
   // }}}
 
 
-  // {{{ svm
-
 #if PYOPENCL_CL_VERSION >= 0x2000
 
-  class svm_arg_wrapper
+  // {{{ svm pointer
+
+  class size_not_available { };
+
+  class svm_pointer
+  {
+    public:
+      virtual void *ptr() const = 0;
+      // may throw size_not_available
+      virtual size_t size() const = 0;
+  };
+
+  // }}}
+
+
+  // {{{ svm_arg_wrapper
+
+  class svm_arg_wrapper : public svm_pointer
   {
     private:
       void *m_ptr;
@@ -3564,12 +3588,17 @@ namespace pyopencl
       }
   };
 
+  // }}}
+
 
-  class svm_allocation
+  // {{{ svm_allocation
+
+  class svm_allocation : public svm_pointer
   {
     private:
       std::shared_ptr<context> m_context;
       void *m_allocation;
+      size_t m_size;
       command_queue_ref m_queue;
       // FIXME Should keep a list of events so that we can wait for users
       // to finish in the case of out-of-order queues.
@@ -3577,12 +3606,12 @@ namespace pyopencl
     public:
       svm_allocation(std::shared_ptr<context> const &ctx, size_t size, cl_uint alignment,
           cl_svm_mem_flags flags, const command_queue *queue = nullptr)
-        : m_context(ctx)
+        : m_context(ctx), m_size(size)
       {
         if (queue)
         {
           m_queue.set(queue->data());
-          if (!is_queue_in_order(m_queue.data()))
+          if (is_queue_out_of_order(m_queue.data()))
             throw error("SVMAllocation.__init__", CL_INVALID_VALUE,
                 "supplying an out-of-order queue to SVMAllocation is invalid");
         }
@@ -3650,6 +3679,11 @@ namespace pyopencl
         return m_allocation;
       }
 
+      size_t size() const
+      {
+        return m_size;
+      }
+
       intptr_t ptr_as_int() const
       {
         return (intptr_t) m_allocation;
@@ -3667,29 +3701,92 @@ namespace pyopencl
 
       void bind_to_queue(command_queue const &queue)
       {
+        if (is_queue_out_of_order(m_queue.data()))
+          throw error("SVMAllocation.bind_to_queue", CL_INVALID_VALUE,
+              "supplying an out-of-order queue to SVMAllocation is invalid");
+
+        if (m_queue.is_valid())
+        {
+          // make sure synchronization promises stay valid in new queue
+          cl_event evt;
+
+          PYOPENCL_CALL_GUARDED(clEnqueueMarker, (m_queue.data(), &evt));
+          PYOPENCL_CALL_GUARDED(clEnqueueWaitForEvents, (queue.data(), 1, &evt));
+        }
+
         m_queue.set(queue.data());
       }
 
       void unbind_from_queue()
       {
+        // NOTE: This absolves the allocation from any synchronization promises
+        // made. Keeping those before calling this method is the responsibility
+        // of the user.
         m_queue.reset();
       }
   };
 
+  // }}}
+
+
+  // {{{ svm operations
 
   inline
   event *enqueue_svm_memcpy(
       command_queue &cq,
       cl_bool is_blocking,
-      svm_arg_wrapper &dst, svm_arg_wrapper &src,
-      py::object py_wait_for
+      svm_pointer &dst, svm_pointer &src,
+      py::object py_wait_for,
+      py::object byte_count_py
       )
   {
     PYOPENCL_PARSE_WAIT_FOR;
 
-    if (src.size() != dst.size())
+    // {{{ process size
+
+    PYOPENCL_GET_SVM_SIZE(src);
+    PYOPENCL_GET_SVM_SIZE(dst);
+
+    size_t size;
+    bool have_size = false;
+
+    if (src_has_size)
+    {
+      size = src_size;
+      have_size = true;
+    }
+    if (dst_has_size)
+    {
+      if (have_size)
+      {
+        if (!byte_count_py.is_none())
+          size = std::min(size, dst_size);
+        else if (size != dst_size)
+          throw error("_enqueue_svm_memcpy", CL_INVALID_VALUE,
+              "sizes of source and destination buffer do not match");
+      }
+      else
+      {
+        size = dst_size;
+        have_size = true;
+      }
+    }
+
+    if (!byte_count_py.is_none())
+    {
+      size_t byte_count = byte_count_py.cast<size_t>();
+      if (have_size && byte_count > size)
+        throw error("_enqueue_svm_memcpy", CL_INVALID_VALUE,
+            "specified byte_count larger than size of source or destination buffers");
+      size = byte_count;
+      have_size = true;
+    }
+
+    if (!have_size)
       throw error("_enqueue_svm_memcpy", CL_INVALID_VALUE,
-          "sizes of source and destination buffer do not match");
+          "size not passed and could not be determined");
+
+    // }}}
 
     cl_event evt;
     PYOPENCL_CALL_GUARDED(
@@ -3698,7 +3795,7 @@ namespace pyopencl
           cq.data(),
           is_blocking,
           dst.ptr(), src.ptr(),
-          dst.size(),
+          size,
           PYOPENCL_WAITLIST_ARGS,
           &evt
         ));
@@ -3710,7 +3807,7 @@ namespace pyopencl
   inline
   event *enqueue_svm_memfill(
       command_queue &cq,
-      svm_arg_wrapper &dst, py::object py_pattern,
+      svm_pointer &dst, py::object py_pattern,
       py::object byte_count,
       py::object py_wait_for
       )
@@ -3727,9 +3824,32 @@ namespace pyopencl
     pattern_ptr = pattern_ward->m_buf.buf;
     pattern_len = pattern_ward->m_buf.len;
 
-    size_t fill_size = dst.size();
+    // {{{ process size
+
+    PYOPENCL_GET_SVM_SIZE(dst);
+
+    size_t size;
+    bool have_size = false;
+    if (dst_has_size)
+    {
+      size = dst_size;
+      have_size = true;
+    }
     if (!byte_count.is_none())
-      fill_size = py::cast<size_t>(byte_count);
+    {
+      size_t user_size = py::cast<size_t>(byte_count);
+      if (have_size && user_size > size)
+        throw error("enqueue_svm_memfill", CL_INVALID_VALUE,
+            "byte_count too large for specified SVM buffer");
+    }
+
+    if (!have_size)
+    {
+      throw error("enqueue_svm_memfill", CL_INVALID_VALUE,
+          "byte_count not passed and could not be determined");
+    }
+
+    // }}}
 
     cl_event evt;
     PYOPENCL_CALL_GUARDED(
@@ -3738,7 +3858,7 @@ namespace pyopencl
           cq.data(),
           dst.ptr(), pattern_ptr,
           pattern_len,
-          fill_size,
+          size,
           PYOPENCL_WAITLIST_ARGS,
           &evt
         ));
@@ -3752,12 +3872,40 @@ namespace pyopencl
       command_queue &cq,
       cl_bool is_blocking,
       cl_map_flags flags,
-      svm_arg_wrapper &svm,
-      py::object py_wait_for
+      svm_pointer &svm,
+      py::object py_wait_for,
+      py::object user_size_py
       )
   {
     PYOPENCL_PARSE_WAIT_FOR;
 
+    // {{{ process size
+
+    PYOPENCL_GET_SVM_SIZE(svm);
+
+    size_t size;
+    bool have_size = false;
+    if (svm_has_size)
+    {
+      size = svm_size;
+      have_size = true;
+    }
+    if (!user_size_py.is_none())
+    {
+      size_t user_size = py::cast<size_t>(user_size_py);
+      if (have_size && user_size > size)
+        throw error("enqueue_svm_memfill", CL_INVALID_VALUE,
+            "user-provided size too large for specified SVM buffer");
+    }
+
+    if (!have_size)
+    {
+      throw error("enqueue_svm_mem_map", CL_INVALID_VALUE,
+          "size not passed and could not be determined");
+    }
+
+    // }}}
+
     cl_event evt;
     PYOPENCL_CALL_GUARDED(
         clEnqueueSVMMap,
@@ -3765,7 +3913,7 @@ namespace pyopencl
           cq.data(),
           is_blocking,
           flags,
-          svm.ptr(), svm.size(),
+          svm.ptr(), size,
           PYOPENCL_WAITLIST_ARGS,
           &evt
         ));
@@ -3777,7 +3925,7 @@ namespace pyopencl
   inline
   event *enqueue_svm_unmap(
       command_queue &cq,
-      svm_arg_wrapper &svm,
+      svm_pointer &svm,
       py::object py_wait_for
       )
   {
@@ -3814,7 +3962,7 @@ namespace pyopencl
 
     for (py::handle py_svm: svms)
     {
-      svm_arg_wrapper &svm(py::cast<svm_arg_wrapper &>(py_svm));
+      svm_pointer &svm(py::cast<svm_pointer &>(py_svm));
 
       svm_pointers.push_back(svm.ptr());
       sizes.push_back(svm.size());
@@ -4609,7 +4757,7 @@ namespace pyopencl
       }
 
 #if PYOPENCL_CL_VERSION >= 0x2000
-      void set_arg_svm(cl_uint arg_index, svm_arg_wrapper const &wrp)
+      void set_arg_svm(cl_uint arg_index, svm_pointer const &wrp)
       {
         PYOPENCL_CALL_GUARDED(clSetKernelArgSVMPointer,
             (m_kernel, arg_index, wrp.ptr()));
@@ -4634,7 +4782,7 @@ namespace pyopencl
 #if PYOPENCL_CL_VERSION >= 0x2000
         try
         {
-          set_arg_svm(arg_index, arg.cast<svm_arg_wrapper const &>());
+          set_arg_svm(arg_index, arg.cast<svm_pointer const &>());
           return;
         }
         catch (py::cast_error &) { }
diff --git a/src/wrap_cl_part_2.cpp b/src/wrap_cl_part_2.cpp
index a2b5cde17..453f34c38 100644
--- a/src/wrap_cl_part_2.cpp
+++ b/src/wrap_cl_part_2.cpp
@@ -295,18 +295,34 @@ void pyopencl_expose_part_2(py::module &m)
   // {{{ svm
 
 #if PYOPENCL_CL_VERSION >= 0x2000
+  {
+    typedef svm_pointer cls;
+    py::class_<cls>(m, "SVMPointer", py::dynamic_attr())
+      .def("_ptr_as_int", [](cls &self) { return (intptr_t) self.ptr(); })
+      .def("_size", [](cls &self) -> py::object
+          {
+            try
+            {
+              return py::cast(self.size());
+            }
+            catch (size_not_available)
+            {
+              return py::none();
+            }
+          })
+      ;
+  }
+
   {
     typedef svm_arg_wrapper cls;
-    py::class_<cls>(m, "SVM", py::dynamic_attr())
+    py::class_<cls, svm_pointer>(m, "SVM", py::dynamic_attr())
       .def(py::init<py::object>())
-      .def("_ptr_as_int", [](cls &self) { return (intptr_t) self.ptr(); })
-      .def("_size", &cls::size)
       ;
   }
 
   {
     typedef svm_allocation cls;
-    py::class_<cls>(m, "SVMAllocation", py::dynamic_attr())
+    py::class_<cls, svm_pointer>(m, "SVMAllocation", py::dynamic_attr())
       .def(py::init<std::shared_ptr<context>, size_t, cl_uint, cl_svm_mem_flags, const command_queue *>(),
           py::arg("context"),
           py::arg("size"),
@@ -318,10 +334,9 @@ void pyopencl_expose_part_2(py::module &m)
       .def("enqueue_release", &cls::enqueue_release,
           ":returns: a :class:`pyopencl.Event`\n\n"
           "|std-enqueue-blurb|")
-      .def("_ptr_as_int", &cls::ptr_as_int)
       .def(py::self == py::self)
       .def(py::self != py::self)
-      .def("__hash__", &cls::ptr_as_int)
+      .def("__hash__", [](cls &self) { return (intptr_t) self.ptr(); })
       .DEF_SIMPLE_METHOD(bind_to_queue)
       .DEF_SIMPLE_METHOD(unbind_from_queue)
       ;
@@ -332,7 +347,8 @@ void pyopencl_expose_part_2(py::module &m)
       py::arg("is_blocking"),
       py::arg("dst"),
       py::arg("src"),
-      py::arg("wait_for")=py::none()
+      py::arg("wait_for")=py::none(),
+      py::arg("byte_count")=py::none()
       );
 
   m.def("_enqueue_svm_memfill", enqueue_svm_memfill,
@@ -348,7 +364,8 @@ void pyopencl_expose_part_2(py::module &m)
       py::arg("is_blocking"),
       py::arg("flags"),
       py::arg("svm"),
-      py::arg("wait_for")=py::none()
+      py::arg("wait_for")=py::none(),
+      py::arg("size")=py::none()
       );
 
   m.def("_enqueue_svm_unmap", enqueue_svm_unmap,

From c34f7ef958dac5f915b2144c851a252b3e3cd7a6 Mon Sep 17 00:00:00 2001
From: Matthias Diener <mdiener@illinois.edu>
Date: Thu, 21 Jul 2022 22:08:05 -0500
Subject: [PATCH 14/16] initial SVMMemoryPool implementation

---
 pyopencl/tools.py    |   4 +-
 src/wrap_mempool.cpp | 114 +++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 117 insertions(+), 1 deletion(-)

diff --git a/pyopencl/tools.py b/pyopencl/tools.py
index 09e4d0566..9389bcb0a 100644
--- a/pyopencl/tools.py
+++ b/pyopencl/tools.py
@@ -62,7 +62,9 @@ def _register_types():
         PooledBuffer as PooledBuffer,
         _tools_DeferredAllocator as DeferredAllocator,
         _tools_ImmediateAllocator as ImmediateAllocator,
-        MemoryPool as MemoryPool)
+        _tools_SVMAllocator as SVMAllocator,
+        MemoryPool as MemoryPool,
+        SVMMemoryPool as SVMMemoryPool)
 
 # }}}
 
diff --git a/src/wrap_mempool.cpp b/src/wrap_mempool.cpp
index 35630b036..875c72f4a 100644
--- a/src/wrap_mempool.cpp
+++ b/src/wrap_mempool.cpp
@@ -349,11 +349,95 @@ namespace
       {
         pyopencl::run_python_gc();
       }
+
+      bool is_deferred() const { return false; }
+
+      svm_allocator *copy() const
+      {
+        return new svm_allocator(*this);
+      }
   };
 
   // }}}
 
 
+  // {{{ SVM mempool
+
+  inline
+  void *svm_allocator_call(svm_allocator &alloc, size_t size)
+  {
+    void * mem;
+    int try_count = 0;
+    while (try_count < 2)
+    {
+      try
+      {
+        mem = alloc.allocate(size);
+        break;
+      }
+      catch (pyopencl::error &e)
+      {
+        if (!e.is_out_of_memory())
+          throw;
+        if (++try_count == 2)
+          throw;
+      }
+
+      alloc.try_release_blocks();
+    }
+
+    if (!mem)
+    {
+      if (size == 0)
+        return nullptr;
+      else
+        throw pyopencl::error("Allocator", CL_INVALID_VALUE,
+            "svm allocator succeeded but returned NULL pointer");
+    }
+
+    return mem;
+  }
+
+  // }}}
+
+
+  // {{{ pooled_buffer
+
+  class svm_pooled_buffer
+    : public pyopencl::pooled_allocation<pyopencl::memory_pool<svm_allocator> > //,
+    // public pyopencl::memory_object_holder
+  {
+    private:
+      typedef
+        pyopencl::pooled_allocation<pyopencl::memory_pool<svm_allocator> >
+        super;
+
+    public:
+      svm_pooled_buffer(
+          std::shared_ptr<super::pool_type> p, super::size_type s)
+        : super(p, s)
+      { }
+
+      const super::pointer_type data() const
+      { return ptr(); }
+  };
+
+  // }}}
+
+
+  // {{{{ device_pool_allocate
+
+  svm_pooled_buffer *svm_device_pool_allocate(
+      std::shared_ptr<pyopencl::memory_pool<svm_allocator> > pool,
+      pyopencl::memory_pool<svm_allocator>::size_type sz)
+  {
+    return new svm_pooled_buffer(pool, sz);
+  }
+
+
+  // }}}
+
+
 
 
   template<class Wrapper>
@@ -454,6 +538,36 @@ void pyopencl_expose_mempool(py::module &m)
     expose_memory_pool(wrapper);
   }
 
+  {
+    typedef svm_allocator cls;
+    py::class_<cls> wrapper(
+        m, "_tools_SVMAllocator");
+    wrapper
+      .def(py::init<svm_allocator &>())
+      .def(py::init<std::shared_ptr<pyopencl::context> const &, cl_uint, cl_mem_flags>())
+      ;
+  }
+
+  {
+    typedef pyopencl::memory_pool<svm_allocator> cls;
+
+    py::class_<
+      cls, /* boost::noncopyable, */
+      std::shared_ptr<cls>> wrapper( m, "SVMMemoryPool");
+    wrapper
+      .def(py::init<svm_allocator const &, unsigned>(),
+          py::arg("allocator"),
+          py::arg("leading_bits_in_bin_id")=4
+          )
+      .def("allocate", svm_device_pool_allocate)
+      .def("__call__", svm_device_pool_allocate)
+      // undoc for now
+      .DEF_SIMPLE_METHOD(set_trace)
+      ;
+
+    expose_memory_pool(wrapper);
+  }
+
   {
     typedef pooled_buffer cls;
     py::class_<cls, /* boost::noncopyable, */

From da8b8dcb0d6fb9b129f0af9510ab2dcad45f5780 Mon Sep 17 00:00:00 2001
From: Matthias Diener <mdiener@illinois.edu>
Date: Thu, 21 Jul 2022 23:09:06 -0500
Subject: [PATCH 15/16] make SVMAllocator C++-only

---
 pyopencl/tools.py    | 26 +++++++++++++-------------
 src/wrap_mempool.cpp | 12 +++++++++---
 2 files changed, 22 insertions(+), 16 deletions(-)

diff --git a/pyopencl/tools.py b/pyopencl/tools.py
index 9389bcb0a..782b6af5f 100644
--- a/pyopencl/tools.py
+++ b/pyopencl/tools.py
@@ -71,19 +71,19 @@ def _register_types():
 
 # {{{ svm allocator
 
-# FIXME: Replace me with C++
-class SVMAllocator:
-    def __init__(self, ctx, flags, *, alignment=0, queue=None):
-        self._context = ctx
-        self._flags = flags
-        self._alignment = alignment
-        self._queue = queue
-
-    def __call__(self, nbytes):
-        import pyopencl as cl
-        return cl.SVMAllocation(
-                self._context, nbytes, self._alignment, self._flags,
-                queue=self._queue)
+# # FIXME: Replace me with C++
+# class SVMAllocator:
+#     def __init__(self, ctx, flags, *, alignment=0, queue=None):
+#         self._context = ctx
+#         self._flags = flags
+#         self._alignment = alignment
+#         self._queue = queue
+
+#     def __call__(self, nbytes):
+#         import pyopencl as cl
+#         return cl.SVMAllocation(
+#                 self._context, nbytes, self._alignment, self._flags,
+#                 queue=self._queue)
 
 # }}}
 
diff --git a/src/wrap_mempool.cpp b/src/wrap_mempool.cpp
index 875c72f4a..15691d612 100644
--- a/src/wrap_mempool.cpp
+++ b/src/wrap_mempool.cpp
@@ -307,13 +307,15 @@ namespace
   {
     protected:
       std::shared_ptr<pyopencl::context> m_context;
+      pyopencl::command_queue m_queue;
       cl_uint m_alignment;
       cl_mem_flags m_flags;
 
     public:
       svm_allocator(std::shared_ptr<pyopencl::context> const &ctx,
+          pyopencl::command_queue &queue,
           cl_uint alignment, cl_mem_flags flags=CL_MEM_READ_WRITE)
-        : m_context(ctx), m_alignment(alignment), m_flags(flags)
+        : m_context(ctx), m_queue(queue), m_alignment(alignment), m_flags(flags)
       {
         if (flags & (CL_MEM_USE_HOST_PTR | CL_MEM_COPY_HOST_PTR))
           throw pyopencl::error("Allocator", CL_INVALID_VALUE,
@@ -321,7 +323,7 @@ namespace
       }
 
       svm_allocator(svm_allocator const &src)
-      : m_context(src.m_context), m_alignment(src.m_alignment),
+      : m_context(src.m_context), m_queue(src.m_queue), m_alignment(src.m_alignment),
       m_flags(src.m_flags)
       { }
 
@@ -544,7 +546,11 @@ void pyopencl_expose_mempool(py::module &m)
         m, "_tools_SVMAllocator");
     wrapper
       .def(py::init<svm_allocator &>())
-      .def(py::init<std::shared_ptr<pyopencl::context> const &, cl_uint, cl_mem_flags>())
+      .def(py::init<std::shared_ptr<pyopencl::context> const &, pyopencl::command_queue &, cl_uint, cl_mem_flags>(),
+      py::arg("ctx"),
+      py::arg("queue"),
+      py::arg("alignment")=0,
+      py::arg("flags")=CL_MEM_READ_WRITE)
       ;
   }
 

From 437dbc36863944431877761921dc38a500223cf1 Mon Sep 17 00:00:00 2001
From: Matthias Diener <mdiener@illinois.edu>
Date: Thu, 21 Jul 2022 23:45:55 -0500
Subject: [PATCH 16/16] svm_pooled_allocation

---
 pyopencl/tools.py    |  1 +
 src/wrap_mempool.cpp | 23 ++++++++++++++++-------
 2 files changed, 17 insertions(+), 7 deletions(-)

diff --git a/pyopencl/tools.py b/pyopencl/tools.py
index 782b6af5f..0cc707cfe 100644
--- a/pyopencl/tools.py
+++ b/pyopencl/tools.py
@@ -60,6 +60,7 @@ def _register_types():
 
 from pyopencl._cl import (  # noqa
         PooledBuffer as PooledBuffer,
+        SVMPooledAllocation as SVMPooledAllocation,
         _tools_DeferredAllocator as DeferredAllocator,
         _tools_ImmediateAllocator as ImmediateAllocator,
         _tools_SVMAllocator as SVMAllocator,
diff --git a/src/wrap_mempool.cpp b/src/wrap_mempool.cpp
index 15691d612..85bb287be 100644
--- a/src/wrap_mempool.cpp
+++ b/src/wrap_mempool.cpp
@@ -305,7 +305,7 @@ namespace
   // allocators? (I.e. can I tell whether I am out of memory just from allocations?)
   class svm_allocator
   {
-    protected:
+    public:
       std::shared_ptr<pyopencl::context> m_context;
       pyopencl::command_queue m_queue;
       cl_uint m_alignment;
@@ -366,7 +366,7 @@ namespace
   // {{{ SVM mempool
 
   inline
-  void *svm_allocator_call(svm_allocator &alloc, size_t size)
+  pyopencl::svm_allocation *svm_allocator_call(svm_allocator &alloc, size_t size)
   {
     void * mem;
     int try_count = 0;
@@ -397,7 +397,8 @@ namespace
             "svm allocator succeeded but returned NULL pointer");
     }
 
-    return mem;
+
+    return new pyopencl::svm_allocation(alloc.m_context, size, alloc.m_alignment, alloc.m_flags, &alloc.m_queue);
   }
 
   // }}}
@@ -405,7 +406,7 @@ namespace
 
   // {{{ pooled_buffer
 
-  class svm_pooled_buffer
+  class svm_pooled_allocation
     : public pyopencl::pooled_allocation<pyopencl::memory_pool<svm_allocator> > //,
     // public pyopencl::memory_object_holder
   {
@@ -415,7 +416,7 @@ namespace
         super;
 
     public:
-      svm_pooled_buffer(
+      svm_pooled_allocation(
           std::shared_ptr<super::pool_type> p, super::size_type s)
         : super(p, s)
       { }
@@ -429,11 +430,11 @@ namespace
 
   // {{{{ device_pool_allocate
 
-  svm_pooled_buffer *svm_device_pool_allocate(
+  svm_pooled_allocation *svm_device_pool_allocate(
       std::shared_ptr<pyopencl::memory_pool<svm_allocator> > pool,
       pyopencl::memory_pool<svm_allocator>::size_type sz)
   {
-    return new svm_pooled_buffer(pool, sz);
+    return new svm_pooled_allocation(pool, sz);
   }
 
 
@@ -574,6 +575,14 @@ void pyopencl_expose_mempool(py::module &m)
     expose_memory_pool(wrapper);
   }
 
+  {
+    typedef svm_pooled_allocation cls;
+    py::class_<cls>(
+          m, "SVMPooledAllocation"/* , py::no_init */)
+      .def("release", &cls::free)
+      ;
+  }
+
   {
     typedef pooled_buffer cls;
     py::class_<cls, /* boost::noncopyable, */