From 786eb9cb15be6a5f4e71936b3a1718b9098b9c43 Mon Sep 17 00:00:00 2001 From: Graham Markall Date: Fri, 19 Feb 2021 12:58:27 +0000 Subject: [PATCH 1/8] Support the CUDA Array Interface This commit adds support for version 3 of the CUDA Array Interface (https://numba.readthedocs.io/en/latest/cuda/cuda_array_interface.html): - GPU Arrays export the interface so that they can be passed to other supporting libraries. - Function calls check for the presence of the interface and use the data pointer from it if found, so other libraries' arrays can be passed to PyCUDA-compiled kernels. Documentation, tests, and examples for the interface going in both directions are added. --- doc/source/array.rst | 6 ++++ doc/source/tutorial.rst | 29 +++++++++++++++ examples/cai_cupy_arrays.py | 33 +++++++++++++++++ examples/cai_numba.py | 40 +++++++++++++++++++++ pycuda/driver.py | 8 +++++ pycuda/gpuarray.py | 23 ++++++++++++ test/test_driver.py | 71 +++++++++++++++++++++++++++++++++++++ 7 files changed, 210 insertions(+) create mode 100644 examples/cai_cupy_arrays.py create mode 100644 examples/cai_numba.py diff --git a/doc/source/array.rst b/doc/source/array.rst index 3b12f04a..de1cc66e 100644 --- a/doc/source/array.rst +++ b/doc/source/array.rst @@ -80,6 +80,12 @@ The :class:`GPUArray` Array Class .. versionadded: 2011.1 + .. attribute :: __cuda_array_interface__ + + Return a `CUDA Array Interface + `_ + dict describing this array's data. + .. method :: __len__() Returns the size of the leading dimension of *self*. diff --git a/doc/source/tutorial.rst b/doc/source/tutorial.rst index ea6a804f..ba729875 100644 --- a/doc/source/tutorial.rst +++ b/doc/source/tutorial.rst @@ -191,6 +191,35 @@ only the second:: func(numpy.intp(do2_ptr), block = (32, 1, 1), grid=(1, 1)) print("doubled second only", array1, array2, "\n") +Interoperability With Other Libraries Using The CUDA Array Interface +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +Kernel calls can be passed arrays from other CUDA libraries that support the +`CUDA Array Interface +`_. For +example, to double a `CuPy `_ array:: + + import cupy as cp + + cupy_a = cp.random.randn(4, 4).astype(cp.float32) + func = mod.get_function("double_array") + func(cupy_a, block=(4, 4, 1), grid=(1, 1)) + +PyCUDA GPU Arrays implement the CUDA Array Interface, so they can be passed +into functions from other libraries that support it. For example, to double a +PyCUDA GPU Array using a `Numba `_ kernel:: + + from numba import cuda + + a_gpu = gpuarray.to_gpu(numpy.random.randn(4, 4).astype(numpy.float32)) + + @cuda.jit + def double(x): + i, j = cuda.grid(2) + x[i, j] *= 2 + + double[(4, 4), (1, 1)](a_gpu) + Where to go from here --------------------- diff --git a/examples/cai_cupy_arrays.py b/examples/cai_cupy_arrays.py new file mode 100644 index 00000000..2c524ba9 --- /dev/null +++ b/examples/cai_cupy_arrays.py @@ -0,0 +1,33 @@ +# Copyright 2008-2021 Andreas Kloeckner +# Copyright 2021 NVIDIA Corporation + +import pycuda.autoinit # noqa +from pycuda.compiler import SourceModule + +import cupy as cp + + +# Create a CuPy array (and a copy for comparison later) +cupy_a = cp.random.randn(4, 4).astype(cp.float32) +original = cupy_a.copy() + + +# Create a kernel +mod = SourceModule(""" + __global__ void doublify(float *a) + { + int idx = threadIdx.x + threadIdx.y*4; + a[idx] *= 2; + } + """) + +func = mod.get_function("doublify") + +# Invoke PyCUDA kernel on a CuPy array +func(cupy_a, block=(4, 4, 1), grid=(1, 1), shared=0) + +# Demonstrate that our CuPy array was modified in place by the PyCUDA kernel +print("original array:") +print(original) +print("doubled with kernel:") +print(cupy_a) diff --git a/examples/cai_numba.py b/examples/cai_numba.py new file mode 100644 index 00000000..05fa317f --- /dev/null +++ b/examples/cai_numba.py @@ -0,0 +1,40 @@ +# Copyright 2008-2021 Andreas Kloeckner +# Copyright 2021 NVIDIA Corporation + +from numba import cuda + +import pycuda.driver as pycuda +import pycuda.autoinit # noqa +import pycuda.gpuarray as gpuarray + +import numpy + + +# Create a PyCUDA gpuarray +a_gpu = gpuarray.to_gpu(numpy.random.randn(4, 4).astype(numpy.float32)) +print("original array:") +print(a_gpu) + +# Retain PyCUDA context as primary and make current so that Numba is happy +pyc_dev = pycuda.autoinit.device +pyc_ctx = pyc_dev.retain_primary_context() +pyc_ctx.push() + + +# A standard Numba kernel that doubles its input array +@cuda.jit +def double(x): + i, j = cuda.grid(2) + + if i < x.shape[0] and j < x.shape[1]: + x[i, j] *= 2 + + +# Call the Numba kernel on the PyCUDA gpuarray, using the CUDA Array Interface +# transparently +double[(4, 4), (1, 1)](a_gpu) +print("doubled with numba:") +print(a_gpu) + +# Pop context to allow PyCUDA to clean up +pyc_ctx.pop() diff --git a/pycuda/driver.py b/pycuda/driver.py index 6bfd097e..ac305129 100644 --- a/pycuda/driver.py +++ b/pycuda/driver.py @@ -1,3 +1,8 @@ +__copyright__ = """ +Copyright 2008-2021 Andreas Kloeckner +Copyright 2021 NVIDIA Corporation +""" + import os import numpy as np @@ -207,6 +212,9 @@ def _add_functionality(): elif isinstance(arg, np.void): arg_data.append(_my_bytes(_memoryview(arg))) format += "%ds" % arg.itemsize + elif hasattr(arg, '__cuda_array_interface__'): + arg_data.append(arg.__cuda_array_interface__['data'][0]) + format += "P" else: try: gpudata = np.uintp(arg.gpudata) diff --git a/pycuda/gpuarray.py b/pycuda/gpuarray.py index f5908a06..febfc330 100644 --- a/pycuda/gpuarray.py +++ b/pycuda/gpuarray.py @@ -1,3 +1,8 @@ +__copyright__ = """ +Copyright 2008-2021 Andreas Kloeckner +Copyright 2021 NVIDIA Corporation +""" + import numpy as np import pycuda.elementwise as elementwise from pytools import memoize, memoize_method @@ -252,6 +257,24 @@ class GPUArray: self._grid, self._block = splay(self.mem_size) + @property + def __cuda_array_interface__(self): + """Returns a CUDA Array Interface dictionary describing this array's + data.""" + if self.gpudata is not None: + ptr = int(self.gpudata) + else: + ptr = 0 + + return { + 'shape': self.shape, + 'strides': self.strides, + 'data': (ptr, False), + 'typestr': self.dtype.str, + 'stream': None, + 'version': 3 + } + @property def ndim(self): return len(self.shape) diff --git a/test/test_driver.py b/test/test_driver.py index c720cebf..7c9c18ac 100644 --- a/test/test_driver.py +++ b/test/test_driver.py @@ -1,3 +1,8 @@ +__copyright__ = """ +Copyright 2008-2021 Andreas Kloeckner +Copyright 2021 NVIDIA Corporation +""" + import numpy as np import numpy.linalg as la from pycuda.tools import mark_cuda_test, dtype_to_ctype @@ -144,6 +149,23 @@ class TestDriver: diff = (a_g * b_g).get() - a * b assert la.norm(diff) == 0 + @mark_cuda_test + def test_gpuarray_cai(self): + a = np.zeros(10, dtype=np.float32) + a_g = gpuarray.to_gpu(a) + cai = a_g.__cuda_array_interface__ + ptr = cai['data'][0] + masked = cai['data'][1] + + assert cai['shape'] == a.shape + assert cai['strides'] == a.strides + assert cai['typestr'] == a.dtype.str + assert isinstance(ptr, int) + assert ptr != 0 + assert not masked + assert cai['stream'] is None + assert cai['version'] == 3 + @mark_cuda_test def donottest_cublas_mixing(self): self.test_streamed_kernel() @@ -1054,6 +1076,55 @@ def test_pointer_holder_base(): print(ary.get()) +# A class to emulate an object from outside PyCUDA that implements the CUDA +# Array Interface +class CudaArrayInterfaceImpl: + def __init__(self, size, itemsize, dtype): + self._shape = (size,) + self._strides = (itemsize,) + self._typestr = dtype.str + self._ptr = drv.mem_alloc(size * itemsize) + + @property + def __cuda_array_interface__(self): + return { + 'shape': self._shape, + 'strides': self._strides, + 'typestr': self._typestr, + 'data': (int(self._ptr), False), + 'stream': None, + 'version': 3 + } + + @property + def ptr(self): + return self._ptr + + +def test_pass_cai_array(): + dtype = np.int32 + size = 1024 + np_array = np.arange(size, dtype=dtype) + cai_array = CudaArrayInterfaceImpl(size, np_array.itemsize, np_array.dtype) + + mod = SourceModule( + """ + __global__ void gpu_arange(int *x) + { + const int i = threadIdx.x; + x[i] = i; + } + """ + ) + + gpu_arange = mod.get_function("gpu_arange") + gpu_arange(cai_array, grid=(1,), block=(size, 1, 1)) + + host_array = np.empty_like(np_array) + drv.memcpy_dtoh(host_array, cai_array.ptr) + assert (host_array == np_array).all() + + def test_import_pyopencl_before_pycuda(): try: import pyopencl # noqa -- GitLab From 76afe4c508b7f966b0a9fbd3142265932f3fdf69 Mon Sep 17 00:00:00 2001 From: Graham Markall Date: Thu, 4 Mar 2021 10:43:37 +0000 Subject: [PATCH 2/8] Use appropriate quotes --- pycuda/compyte | 2 +- pycuda/driver.py | 4 ++-- pycuda/gpuarray.py | 12 ++++++------ test/test_driver.py | 26 +++++++++++++------------- 4 files changed, 22 insertions(+), 22 deletions(-) diff --git a/pycuda/compyte b/pycuda/compyte index 7533db88..d1f993da 160000 --- a/pycuda/compyte +++ b/pycuda/compyte @@ -1 +1 @@ -Subproject commit 7533db88124045924a47d7392eaf9a078670fc4d +Subproject commit d1f993daecc03947d9e6e3e60d2a5145ecbf3786 diff --git a/pycuda/driver.py b/pycuda/driver.py index ac305129..77657afe 100644 --- a/pycuda/driver.py +++ b/pycuda/driver.py @@ -212,8 +212,8 @@ def _add_functionality(): elif isinstance(arg, np.void): arg_data.append(_my_bytes(_memoryview(arg))) format += "%ds" % arg.itemsize - elif hasattr(arg, '__cuda_array_interface__'): - arg_data.append(arg.__cuda_array_interface__['data'][0]) + elif hasattr(arg, "__cuda_array_interface__"): + arg_data.append(arg.__cuda_array_interface__["data"][0]) format += "P" else: try: diff --git a/pycuda/gpuarray.py b/pycuda/gpuarray.py index febfc330..5d9ccaa5 100644 --- a/pycuda/gpuarray.py +++ b/pycuda/gpuarray.py @@ -267,12 +267,12 @@ class GPUArray: ptr = 0 return { - 'shape': self.shape, - 'strides': self.strides, - 'data': (ptr, False), - 'typestr': self.dtype.str, - 'stream': None, - 'version': 3 + "shape": self.shape, + "strides": self.strides, + "data": (ptr, False), + "typestr": self.dtype.str, + "stream": None, + "version": 3 } @property diff --git a/test/test_driver.py b/test/test_driver.py index 7c9c18ac..b022aa37 100644 --- a/test/test_driver.py +++ b/test/test_driver.py @@ -154,17 +154,17 @@ class TestDriver: a = np.zeros(10, dtype=np.float32) a_g = gpuarray.to_gpu(a) cai = a_g.__cuda_array_interface__ - ptr = cai['data'][0] - masked = cai['data'][1] + ptr = cai["data"][0] + masked = cai["data"][1] - assert cai['shape'] == a.shape - assert cai['strides'] == a.strides - assert cai['typestr'] == a.dtype.str + assert cai["shape"] == a.shape + assert cai["strides"] == a.strides + assert cai["typestr"] == a.dtype.str assert isinstance(ptr, int) assert ptr != 0 assert not masked - assert cai['stream'] is None - assert cai['version'] == 3 + assert cai["stream"] is None + assert cai["version"] == 3 @mark_cuda_test def donottest_cublas_mixing(self): @@ -1088,12 +1088,12 @@ class CudaArrayInterfaceImpl: @property def __cuda_array_interface__(self): return { - 'shape': self._shape, - 'strides': self._strides, - 'typestr': self._typestr, - 'data': (int(self._ptr), False), - 'stream': None, - 'version': 3 + "shape": self._shape, + "strides": self._strides, + "typestr": self._typestr, + "data": (int(self._ptr), False), + "stream": None, + "version": 3 } @property -- GitLab From 6ec4cebc607121ef866af9224381bd695de6c083 Mon Sep 17 00:00:00 2001 From: Graham Markall Date: Fri, 5 Mar 2021 11:51:49 +0000 Subject: [PATCH 3/8] Add reference to GPUArray class docs --- doc/source/tutorial.rst | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/doc/source/tutorial.rst b/doc/source/tutorial.rst index ba729875..681ee37d 100644 --- a/doc/source/tutorial.rst +++ b/doc/source/tutorial.rst @@ -205,9 +205,10 @@ example, to double a `CuPy `_ array:: func = mod.get_function("double_array") func(cupy_a, block=(4, 4, 1), grid=(1, 1)) -PyCUDA GPU Arrays implement the CUDA Array Interface, so they can be passed -into functions from other libraries that support it. For example, to double a -PyCUDA GPU Array using a `Numba `_ kernel:: +:class:`~pycuda.gpuarray.GPUArray` implements the CUDA Array Interface, so its +instances can be passed into functions from other libraries that support it. +For example, to double a PyCUDA GPU Array using a `Numba +`_ kernel:: from numba import cuda -- GitLab From b9ec2b0175a2293acea0862b74814f58ba4d1b3b Mon Sep 17 00:00:00 2001 From: Graham Markall Date: Tue, 23 Feb 2021 16:49:22 -0600 Subject: [PATCH 4/8] Revert accidental update of compyte submodule --- pycuda/compyte | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pycuda/compyte b/pycuda/compyte index d1f993da..7533db88 160000 --- a/pycuda/compyte +++ b/pycuda/compyte @@ -1 +1 @@ -Subproject commit d1f993daecc03947d9e6e3e60d2a5145ecbf3786 +Subproject commit 7533db88124045924a47d7392eaf9a078670fc4d -- GitLab From 45fe73c18c970874f6fe10f3a619234ac7433c00 Mon Sep 17 00:00:00 2001 From: Graham Markall Date: Fri, 5 Mar 2021 12:08:12 +0000 Subject: [PATCH 5/8] Add a comment explaining readonly flag of CAI --- pycuda/gpuarray.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/pycuda/gpuarray.py b/pycuda/gpuarray.py index 5d9ccaa5..373cf005 100644 --- a/pycuda/gpuarray.py +++ b/pycuda/gpuarray.py @@ -269,6 +269,8 @@ class GPUArray: return { "shape": self.shape, "strides": self.strides, + # data is a tuple: (ptr, readonly) - always export GPUArray + # instances as read-write "data": (ptr, False), "typestr": self.dtype.str, "stream": None, -- GitLab From f2a6e7c97e2e888e604c995b318c12053bd95a88 Mon Sep 17 00:00:00 2001 From: Graham Markall Date: Tue, 9 Mar 2021 14:27:59 +0000 Subject: [PATCH 6/8] Use getattr instead of hasattr for CUDA Array Interface --- pycuda/driver.py | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/pycuda/driver.py b/pycuda/driver.py index 77657afe..47d15b19 100644 --- a/pycuda/driver.py +++ b/pycuda/driver.py @@ -212,10 +212,13 @@ def _add_functionality(): elif isinstance(arg, np.void): arg_data.append(_my_bytes(_memoryview(arg))) format += "%ds" % arg.itemsize - elif hasattr(arg, "__cuda_array_interface__"): - arg_data.append(arg.__cuda_array_interface__["data"][0]) - format += "P" else: + cai = getattr(arg, "__cuda_array_interface__", None) + if cai: + arg_data.append(cai["data"][0]) + format += "P" + continue + try: gpudata = np.uintp(arg.gpudata) except AttributeError: -- GitLab From 08b53a7ec11fb8c37e41fb5e88468aa10583a428 Mon Sep 17 00:00:00 2001 From: Graham Markall Date: Tue, 9 Mar 2021 14:34:08 +0000 Subject: [PATCH 7/8] Add autoprimaryctx and use in CAI Numba example --- doc/source/util.rst | 6 ++++++ examples/cai_numba.py | 12 +++--------- pycuda/autoprimaryctx.py | 31 +++++++++++++++++++++++++++++++ 3 files changed, 40 insertions(+), 9 deletions(-) create mode 100644 pycuda/autoprimaryctx.py diff --git a/doc/source/util.rst b/doc/source/util.rst index 0cd85d69..44c9a56f 100644 --- a/doc/source/util.rst +++ b/doc/source/util.rst @@ -21,6 +21,12 @@ It uses :func:`pycuda.tools.make_default_context` to create a compute context. on :data:`device`. This context is created by calling :func:`pycuda.tools.make_default_context`. +.. module:: pycuda.autoprimaryctx + +The module :mod:`pycuda.autoprimaryctx` is similar to :mod:`pycuda.autoinit`, +except that it retains the device primary context instead of creating a new +context in :func:`pycuda.tools.make_default_context`. + Choice of Device ---------------- diff --git a/examples/cai_numba.py b/examples/cai_numba.py index 05fa317f..0a94ee48 100644 --- a/examples/cai_numba.py +++ b/examples/cai_numba.py @@ -4,7 +4,9 @@ from numba import cuda import pycuda.driver as pycuda -import pycuda.autoinit # noqa +# We use autoprimaryctx instead of autoinit because Numba can only operate on a +# primary context +import pycuda.autoprimaryctx # noqa import pycuda.gpuarray as gpuarray import numpy @@ -15,11 +17,6 @@ a_gpu = gpuarray.to_gpu(numpy.random.randn(4, 4).astype(numpy.float32)) print("original array:") print(a_gpu) -# Retain PyCUDA context as primary and make current so that Numba is happy -pyc_dev = pycuda.autoinit.device -pyc_ctx = pyc_dev.retain_primary_context() -pyc_ctx.push() - # A standard Numba kernel that doubles its input array @cuda.jit @@ -35,6 +32,3 @@ def double(x): double[(4, 4), (1, 1)](a_gpu) print("doubled with numba:") print(a_gpu) - -# Pop context to allow PyCUDA to clean up -pyc_ctx.pop() diff --git a/pycuda/autoprimaryctx.py b/pycuda/autoprimaryctx.py new file mode 100644 index 00000000..537c8610 --- /dev/null +++ b/pycuda/autoprimaryctx.py @@ -0,0 +1,31 @@ +import pycuda.driver as cuda +import atexit + +# Initialize CUDA +cuda.init() + +from pycuda.tools import make_default_context # noqa: E402 + + +def _retain_primary_context(dev): + context = dev.retain_primary_context() + context.push() + return context + + +global context +context = make_default_context(_retain_primary_context) +device = context.get_device() + + +def _finish_up(): + global context + context.pop() + context = None + + from pycuda.tools import clear_context_caches + + clear_context_caches() + + +atexit.register(_finish_up) -- GitLab From ccaefbb81440577d634715503a2a8cd1ffcff540 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Andreas=20Kl=C3=B6ckner?= Date: Wed, 10 Mar 2021 23:53:25 -0600 Subject: [PATCH 8/8] Doc tweaks for array interface support --- doc/source/tutorial.rst | 2 +- doc/source/util.rst | 3 ++- 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/doc/source/tutorial.rst b/doc/source/tutorial.rst index 681ee37d..2c401b88 100644 --- a/doc/source/tutorial.rst +++ b/doc/source/tutorial.rst @@ -196,7 +196,7 @@ Interoperability With Other Libraries Using The CUDA Array Interface Kernel calls can be passed arrays from other CUDA libraries that support the `CUDA Array Interface -`_. For +`__. For example, to double a `CuPy `_ array:: import cupy as cp diff --git a/doc/source/util.rst b/doc/source/util.rst index 44c9a56f..f83907d6 100644 --- a/doc/source/util.rst +++ b/doc/source/util.rst @@ -25,7 +25,8 @@ It uses :func:`pycuda.tools.make_default_context` to create a compute context. The module :mod:`pycuda.autoprimaryctx` is similar to :mod:`pycuda.autoinit`, except that it retains the device primary context instead of creating a new -context in :func:`pycuda.tools.make_default_context`. +context in :func:`pycuda.tools.make_default_context`. Notably, it also +has ``device`` and ``context`` attributes. Choice of Device ---------------- -- GitLab