diff --git a/doc/runtime_memory.rst b/doc/runtime_memory.rst index 3816abaa7785016179a45c24afb065f7c1fbac12..e3312973ddd85aeabd55b48728ab6df8c3c04f49 100644 --- a/doc/runtime_memory.rst +++ b/doc/runtime_memory.rst @@ -104,7 +104,6 @@ Buffer .. versionadded:: 2011.2 - .. _svm: Shared Virtual Memory (SVM) @@ -133,6 +132,14 @@ Allocating SVM .. autofunction:: fsvm_empty .. autofunction:: fsvm_empty_like +Operations on SVM +^^^^^^^^^^^^^^^^^ + +(See also :ref:`mem-transfer`.) + +.. autofunction:: enqueue_svm_memfill +.. autofunction:: enqueue_svm_migratemem + SVM Allocation Holder ^^^^^^^^^^^^^^^^^^^^^ @@ -267,6 +274,8 @@ Image .. versionadded:: 2011.2 +.. _mem-transfer: + Transfers --------- diff --git a/pyopencl/__init__.py b/pyopencl/__init__.py index b2e437fd1ce7bc3f56917cad1294989e9c14a0e6..015431e14e776df6ee820e2b701717ccf537efb2 100644 --- a/pyopencl/__init__.py +++ b/pyopencl/__init__.py @@ -152,6 +152,8 @@ from pyopencl.cffi_cl import ( # noqa enqueue_fill_image, _enqueue_copy_image_to_buffer, _enqueue_copy_buffer_to_image, + enqueue_svm_memfill, + enqueue_svm_migratemem, have_gl, _GLObject, @@ -854,6 +856,12 @@ def enqueue_copy(queue, dest, src, **kwargs): else: raise ValueError("invalid dest mem object type") + elif isinstance(dest, SVM): + # to SVM + if isinstance(src, SVM): + src = src.mem + + return _cl._enqueue_svm_memcpy(queue, dest.mem, src, **kwargs) else: # assume to-host @@ -877,6 +885,10 @@ def enqueue_copy(queue, dest, src, **kwargs): queue, src, origin, region, dest, **kwargs) else: raise ValueError("invalid src mem object type") + elif isinstance(src, SVM): + # from svm + # dest is not a SVM instance, otherwise we'd be in the branch above + return _cl._enqueue_svm_memcpy(queue, dest, src.mem, **kwargs) else: # assume from-host raise TypeError("enqueue_copy cannot perform host-to-host transfers") diff --git a/pyopencl/cffi_cl.py b/pyopencl/cffi_cl.py index b23fbae179142a6a54e83bc569213f872ef5b5d9..0a5f6abdde0855f4c1b1d53cbf06ed4ff60822bd 100644 
--- a/pyopencl/cffi_cl.py +++ b/pyopencl/cffi_cl.py @@ -1122,10 +1122,11 @@ class SVMAllocation(object): self.is_fine_grain = flags & svm_mem_flags.SVM_FINE_GRAIN_BUFFER if _interface is not None: - _interface["data"] = ( - int(_ffi.cast("intptr_t", self.ptr)), + read_write = ( flags & mem_flags.WRITE_ONLY != 0 or flags & mem_flags.READ_WRITE != 0) + _interface["data"] = ( + int(_ffi.cast("intptr_t", self.ptr)), not read_write) self.__array_interface__ = _interface def __del__(self): @@ -1159,13 +1160,7 @@ class SVMAllocation(object): # {{{ SVM -#TODO: -# doc example -# finish copy -# test -# fill -# test -# migrate +# TODO add clSetKernelExecInfo class SVM(_CLKernelArg): """Tags an object exhibiting the Python buffer interface (such as a @@ -1177,25 +1172,54 @@ class SVM(_CLKernelArg): * coarse-grain shared memory as returned by (e.g.) :func:`csvm_empty` for any implementation of OpenCL 2.0. + This is how coarse-grain SVM may be used from both host and device:: + + svm_ary = cl.SVM(cl.csvm_empty(ctx, 1000, np.float32, alignment=64)) + assert isinstance(svm_ary.mem, np.ndarray) + + with svm_ary.map_rw(queue) as ary: + ary.fill(17) # use from host + + prg.twice(queue, svm_ary.mem.shape, None, svm_ary) + * fine-grain shared memory as returned by (e.g.) :func:`fsvm_empty`, if the implementation supports fine-grained shared virtual memory. + This memory may directly be passed to a kernel:: + + ary = cl.fsvm_empty(ctx, 1000, np.float32) + assert isinstance(ary, np.ndarray) + + prg.twice(queue, ary.shape, None, cl.SVM(ary)) + queue.finish() # synchronize + print(ary) # access from host + + Observe how mapping (as needed in coarse-grain SVM) is no longer + necessary. * any :class:`numpy.ndarray` (or other Python object with a buffer interface) if the implementation supports fine-grained *system* shared virtual memory. 
+ This is how plain :mod:`numpy` arrays may directly be passed to a + kernel:: + + ary = np.zeros(1000, np.float32) + prg.twice(queue, ary.shape, None, cl.SVM(ary)) + queue.finish() # synchronize + print(ary) # access from host + Objects of this type may be passed to kernel calls and :func:`enqueue_copy`. Coarse-grain shared-memory *must* be mapped into host address space using :meth:`map` before being accessed through the :mod:`numpy` interface. .. note:: - This object merely serves as a 'tag' that changes the meaning + This object merely serves as a 'tag' that changes the behavior of functions to which it is passed. It has no special management relationship to the memory it tags. For example, it is permissible - to grab a :mod:`numpy.array` out of :attr:`SVM.memory` of one + to grab a :mod:`numpy.array` out of :attr:`SVM.mem` of one :class:`SVM` instance and use the array to construct another. - Neither of the tags needs to be kept alive. + Neither of the tags need to be kept alive. .. versionadded:: 2016.2 @@ -1205,13 +1229,15 @@ class SVM(_CLKernelArg): .. automethod:: __init__ .. automethod:: map + .. automethod:: map_ro + .. automethod:: map_rw .. 
automethod:: as_buffer """ def __init__(self, mem): self.mem = mem - def map(self, queue, is_blocking=True, flags=None, wait_for=None): + def map(self, queue, flags, is_blocking=True, wait_for=None): """ :arg is_blocking: If *False*, subsequent code must wait on :attr:`SVMMap.event` in the returned object before accessing the @@ -1222,11 +1248,9 @@ class SVM(_CLKernelArg): |std-enqueue-blurb| """ - if flags is None: - flags = map_flags.READ | map_flags.WRITE - - c_buf, size, _ = _c_buffer_from_obj(self.mem, writable=bool( - flags & (map_flags.WRITE | map_flags.INVALIDATE_REGION))) + writable = bool( + flags & (map_flags.WRITE | map_flags.WRITE_INVALIDATE_REGION)) + c_buf, size, _ = _c_buffer_from_obj(self.mem, writable=writable) ptr_event = _ffi.new('clobj_t*') c_wait_for, num_wait_for = _clobj_list(wait_for) @@ -1235,9 +1259,21 @@ class SVM(_CLKernelArg): c_buf, size, c_wait_for, num_wait_for)) - evt = Event._create(ptr_event[0]), SVMMap(self.mem) + evt = Event._create(ptr_event[0]) return SVMMap(self, queue, evt) + def map_ro(self, queue, is_blocking=True, wait_for=None): + """Like :meth:`map`, but with *flags* set for a read-only map.""" + + return self.map(queue, map_flags.READ, + is_blocking=is_blocking, wait_for=wait_for) + + def map_rw(self, queue, is_blocking=True, wait_for=None): + """Like :meth:`map`, but with *flags* set for a read-write map.""" + + return self.map(queue, map_flags.READ | map_flags.WRITE, + is_blocking=is_blocking, wait_for=wait_for) + def _enqueue_unmap(self, queue, wait_for=None): c_buf, _, _ = _c_buffer_from_obj(self.mem) @@ -1248,7 +1284,7 @@ class SVM(_CLKernelArg): c_buf, c_wait_for, num_wait_for)) - return Event._create(ptr_event[0]), SVMMap(self.mem) + return Event._create(ptr_event[0]) def as_buffer(self, ctx, flags=None): """ @@ -1266,6 +1302,93 @@ class SVM(_CLKernelArg): return Buffer(ctx, flags, size=self.mem.nbytes, hostbuf=self.mem) + +def _enqueue_svm_memcpy(queue, dst, src, size=None, + wait_for=None, is_blocking=True): +
dst_buf, dst_size, _ = _c_buffer_from_obj(dst, writable=True) + src_buf, src_size, _ = _c_buffer_from_obj(src, writable=False) + + if size is None: + size = min(dst_size, src_size) + + ptr_event = _ffi.new('clobj_t*') + c_wait_for, num_wait_for = _clobj_list(wait_for) + _handle_error(_lib.enqueue_svm_memcpy( + ptr_event, queue.ptr, bool(is_blocking), + dst_buf, src_buf, size, + c_wait_for, num_wait_for, + NannyEvent._handle((dst_buf, src_buf)))) + + return NannyEvent._create(ptr_event[0]) + + +def enqueue_svm_memfill(queue, dest, pattern, byte_count=None, wait_for=None): + """Fill shared virtual memory with a pattern. + + :arg dest: a Python buffer object, optionally wrapped in an :class:`SVM` object + :arg pattern: a Python buffer object (e.g. a :class:`numpy.ndarray`) with the + fill pattern to be used. + :arg byte_count: The size of the memory to be filled. Defaults to the + entirety of *dest*. + + |std-enqueue-blurb| + + .. versionadded:: 2016.2 + """ + + if isinstance(dest, SVM): + dest = dest.mem + + dst_buf, dst_size, _ = _c_buffer_from_obj(dest, writable=True) + pattern_buf, pattern_size, _ = _c_buffer_from_obj(pattern, writable=False) + + if byte_count is None: + byte_count = dst_size + + # pattern is copied, no need to nanny. + ptr_event = _ffi.new('clobj_t*') + c_wait_for, num_wait_for = _clobj_list(wait_for) + _handle_error(_lib.enqueue_svm_memfill( + ptr_event, queue.ptr, + dst_buf, pattern_buf, pattern_size, byte_count, + c_wait_for, num_wait_for)) + + return Event._create(ptr_event[0]) + + +def enqueue_svm_migratemem(queue, svms, flags, wait_for=None): + """ + :arg svms: a collection of Python buffer objects (e.g. :mod:`numpy` + arrays), optionally wrapped in :class:`SVM` objects. + :arg flags: a combination of :class:`mem_migration_flags` + + |std-enqueue-blurb| + + .. versionadded:: 2016.2 + + This function requires OpenCL 2.1
+ """ + + svm_pointers = _ffi.new('void *', len(svms)) + sizes = _ffi.new('size_t', len(svms)) + + for i, svm in enumerate(svms): + if isinstance(svm, SVM): + svm = svm.mem + + buf, size, _ = _c_buffer_from_obj(svm, writable=False) + svm_pointers[i] = buf + sizes[i] = size + + ptr_event = _ffi.new('clobj_t*') + c_wait_for, num_wait_for = _clobj_list(wait_for) + _handle_error(_lib.enqueue_svm_memfill( + ptr_event, queue.ptr, + len(svms), svm_pointers, sizes, flags, + c_wait_for, num_wait_for)) + + return Event._create(ptr_event[0]) + # }}} diff --git a/src/c_wrapper/svm.cpp b/src/c_wrapper/svm.cpp index 875c9271b197aaa889689922c2214c2ad1640938..8452ec99953e3806b0b890220f20c5c46d71a875 100644 --- a/src/c_wrapper/svm.cpp +++ b/src/c_wrapper/svm.cpp @@ -64,7 +64,7 @@ enqueue_svm_memcpy( clobj_t *evt, clobj_t _queue, cl_bool is_blocking, void *dst_ptr, const void *src_ptr, size_t size, - const clobj_t *_wait_for, uint32_t num_wait_for) + const clobj_t *_wait_for, uint32_t num_wait_for, void *pyobj) { #if PYOPENCL_CL_VERSION >= 0x2000 const auto wait_for = buf_from_class<event>(_wait_for, num_wait_for); @@ -74,7 +74,7 @@ enqueue_svm_memcpy( clEnqueueSVMMemcpy, queue, is_blocking, dst_ptr, src_ptr, size, - wait_for, event_out(evt)); + wait_for, nanny_event_out(evt, pyobj)); }); #else PYOPENCL_UNSUPPORTED_BEFORE(clEnqueueSVMMemcpy, "CL 2.0") diff --git a/src/c_wrapper/wrap_cl_core.h b/src/c_wrapper/wrap_cl_core.h index 7a4992f478514778dd082c1071116ed32810f13a..746c1b9d85659564a50fdbfdc1890d75d678073d 100644 --- a/src/c_wrapper/wrap_cl_core.h +++ b/src/c_wrapper/wrap_cl_core.h @@ -142,7 +142,8 @@ error* enqueue_svm_memcpy( clobj_t *evt, clobj_t _queue, cl_bool is_blocking, void *dst_ptr, const void *src_ptr, size_t size, - const clobj_t *_wait_for, uint32_t num_wait_for); + const clobj_t *_wait_for, uint32_t num_wait_for, + void *pyobj); error* enqueue_svm_memfill( clobj_t *evt, clobj_t _queue, void *svm_ptr, diff --git a/test/test_wrapper.py b/test/test_wrapper.py index 
43624afeef9ac5d8ca51bcbbd2d6df00ac92ad0b..f2b5b645be57c49c74b61c29e48cbfad008c9f05 100644 --- a/test/test_wrapper.py +++ b/test/test_wrapper.py @@ -934,28 +934,80 @@ def test_spirv(ctx_factory): def test_coarse_grain_svm(ctx_factory): ctx = ctx_factory() - # queue = cl.CommandQueue(ctx) + queue = cl.CommandQueue(ctx) if (ctx._get_cl_version() < (2, 0) or cl.get_cl_header_version() < (2, 0)): from pytest import skip skip("SVM only available in OpenCL 2.0 and higher") - svm_ary = cl.csvm_empty(ctx, (100, 100), np.float32, alignment=64) - assert isinstance(svm_ary.base, cl.SVMAllocation) + n = 3000 + svm_ary = cl.SVM(cl.csvm_empty(ctx, (n,), np.float32, alignment=64)) + assert isinstance(svm_ary.mem.base, cl.SVMAllocation) + + if ctx.devices[0].platform.name != "Portable Computing Language": + # pocl 0.13 has a bug misinterpreting the size parameter + cl.enqueue_svm_memfill(queue, svm_ary, np.zeros((), svm_ary.mem.dtype)) + + with svm_ary.map_rw(queue) as ary: + ary.fill(17) + orig_ary = ary.copy() + + prg = cl.Program(ctx, """ + __kernel void twice(__global float *a_g) + { + a_g[get_global_id(0)] *= 2; + } + """).build() + + prg.twice(queue, svm_ary.mem.shape, None, svm_ary) + + with svm_ary.map_ro(queue) as ary: + print(ary) + assert np.array_equal(orig_ary*2, ary) + + new_ary = np.empty_like(orig_ary) + new_ary.fill(-1) + + if ctx.devices[0].platform.name != "Portable Computing Language": + # "Blocking memcpy is unimplemented (clEnqueueSVMMemcpy.c:61)" + # in pocl 0.13. 
+ + cl.enqueue_copy(queue, new_ary, svm_ary) + assert np.array_equal(orig_ary*2, new_ary) def test_fine_grain_svm(ctx_factory): ctx = ctx_factory() - # queue = cl.CommandQueue(ctx) + queue = cl.CommandQueue(ctx) + from pytest import skip if (ctx._get_cl_version() < (2, 0) or cl.get_cl_header_version() < (2, 0)): - from pytest import skip skip("SVM only available in OpenCL 2.0 and higher") - svm_ary = cl.fsvm_empty(ctx, (100, 100), np.float32, alignment=64) - assert isinstance(svm_ary.base, cl.SVMAllocation) + if not (ctx.devices[0].svm_capabilities + & cl.device_svm_capabilities.FINE_GRAIN_BUFFER): + skip("device does not support fine-grain SVM") + + n = 3000 + ary = cl.fsvm_empty(ctx, n, np.float32, alignment=64) + assert isinstance(ary.base, cl.SVMAllocation) + + ary.fill(17) + orig_ary = ary.copy() + + prg = cl.Program(ctx, """ + __kernel void twice(__global float *a_g) + { + a_g[get_global_id(0)] *= 2; + } + """).build() + + prg.twice(queue, ary.shape, None, cl.SVM(ary)) + + print(ary) + assert np.array_equal(orig_ary*2, ary) if __name__ == "__main__":