diff --git a/examples/demo_array_svm.py b/examples/demo_array_svm.py
new file mode 100644
index 000000000..3ee013fe2
--- /dev/null
+++ b/examples/demo_array_svm.py
@@ -0,0 +1,68 @@
+import pyopencl as cl
+import pyopencl.array as cl_array
+import numpy as np
+import numpy.linalg as la
+
+a = np.random.rand(500).astype(np.float32)
+b = np.random.rand(500).astype(np.float32)
+
+
+class SVMAllocator:
+    """Allocator callable returning SVM-backed buffers of a requested size.
+
+    Instances are meant to be passed as the ``allocator`` argument of
+    :mod:`pyopencl.array` functions, as done further down in this demo.
+
+    :arg ctx: context in which the SVM memory is allocated.
+    :arg flags: ``svm_mem_flags`` forwarded to :func:`pyopencl.svm_empty`.
+    :arg alignment: alignment forwarded to :func:`pyopencl.svm_empty`.
+    """
+
+    def __init__(self, ctx, flags, alignment):
+        self._context = ctx
+        self._flags = flags
+        self._alignment = alignment
+
+    def __call__(self, nbytes):
+        """Return a :class:`pyopencl.SVM` wrapping *nbytes* bytes of SVM."""
+        # Allocate in the context stored at construction time, not in the
+        # module-level ``ctx``, so the allocator is usable with any context.
+        return cl.SVM(cl.svm_empty(
+                self._context, self._flags, (nbytes,), np.int8, "C",
+                self._alignment))
+
+
+ctx = cl.create_some_context()
+queue = cl.CommandQueue(ctx)
+
+alloc = SVMAllocator(ctx,
+        cl.svm_mem_flags.READ_WRITE | cl.svm_mem_flags.SVM_FINE_GRAIN_BUFFER,
+        0)
+
+a_dev = cl_array.to_device(queue, a, allocator=alloc)
+print("A_DEV", a_dev.data.mem.nbytes, a_dev.data.mem.__array_interface__)
+b_dev = cl_array.to_device(queue, b, allocator=alloc)
+dest_dev = cl_array.empty_like(a_dev)
+print("DEST", dest_dev.data.mem.__array_interface__)
+
+prg = cl.Program(ctx, """
+    __kernel void sum(__global const float *a,
+    __global const float *b, __global float *c)
+    {
+      int gid = get_global_id(0);
+      c[gid] = a[gid] + b[gid];
+    }
+    """).build()
+
+knl = prg.sum  # Use this Kernel object for repeated calls
+knl(queue, a.shape, None, a_dev.data, b_dev.data, dest_dev.data)
+
+# PROBLEM: numpy frees the temporary out of (a_dev+b_dev) before
+# we're done with it
+diff = (dest_dev - (a_dev+b_dev)).get()
+np.set_printoptions(linewidth=400)
+print(dest_dev)
+print((a_dev+b_dev).get())
+print(diff)
+print(la.norm(diff))
+print("A_DEV", a_dev.data.mem.__array_interface__)
diff --git a/pyopencl/__init__.py b/pyopencl/__init__.py
index ababfbb35..591f90916 100644
--- a/pyopencl/__init__.py
+++ b/pyopencl/__init__.py
@@ -1771,7 +1771,12 @@ def enqueue_copy(queue, dest, src, **kwargs):
             src = SVM(src)
 
         is_blocking = kwargs.pop("is_blocking", True)
-        return _cl._enqueue_svm_memcpy(queue, is_blocking, dest, src, **kwargs)
+
+        # FIXME: POCL workaround: enqueue non-blocking, then wait on the event
+        evt = _cl._enqueue_svm_memcpy(queue, False, dest, src, **kwargs)
+        if is_blocking:
+            evt.wait()
+        return evt
 
     else:
         # assume to-host
@@ -1800,8 +1805,13 @@ def enqueue_copy(queue, dest, src, **kwargs):
             # from svm
             # dest is not a SVM instance, otherwise we'd be in the branch above
             is_blocking = kwargs.pop("is_blocking", True)
-            return _cl._enqueue_svm_memcpy(
-                    queue, is_blocking, SVM(dest), src, **kwargs)
+
+            evt = _cl._enqueue_svm_memcpy(queue, False, SVM(dest), src, **kwargs)
+            # FIXME: POCL workaround: enqueued non-blocking above, wait here instead
+            if is_blocking:
+                evt.wait()
+            return evt
+
         else:
             # assume from-host
             raise TypeError("enqueue_copy cannot perform host-to-host transfers")
diff --git a/pyopencl/array.py b/pyopencl/array.py
index 874ae92c4..c4222ca34 100644
--- a/pyopencl/array.py
+++ b/pyopencl/array.py
@@ -670,9 +670,14 @@ def set(self, ary, queue=None, async_=None, **kwargs):
                     stacklevel=2)
 
         if self.size:
-            event1 = cl.enqueue_copy(queue or self.queue, self.base_data, ary,
-                    device_offset=self.offset,
-                    is_blocking=not async_)
+            if self.offset:
+                event1 = cl.enqueue_copy(queue or self.queue, self.base_data, ary,
+                        device_offset=self.offset,
+                        is_blocking=not async_)
+            else:
+                event1 = cl.enqueue_copy(queue or self.queue, self.base_data, ary,
+                        is_blocking=not async_)
+
             self.add_event(event1)
 
     def _get(self, queue=None, ary=None, async_=None, **kwargs):
@@ -720,9 +725,14 @@ def _get(self, queue=None, ary=None, async_=None, **kwargs):
                     "to associate one.")
 
         if self.size:
-            event1 = cl.enqueue_copy(queue, ary, self.base_data,
-                    device_offset=self.offset,
-                    wait_for=self.events, is_blocking=not async_)
+            if self.offset:
+                event1 = cl.enqueue_copy(queue, ary, self.base_data,
+                        device_offset=self.offset,
+                        wait_for=self.events, is_blocking=not async_)
+            else:
+                event1 = cl.enqueue_copy(queue, ary, self.base_data,
+                        wait_for=self.events, is_blocking=not async_)
+
             self.add_event(event1)
         else:
             event1 = None