From 95e75d2201de1e750b26cde6da6175bac19bea44 Mon Sep 17 00:00:00 2001 From: Kaushik Kulkarni Date: Sat, 8 Jan 2022 02:13:43 +0000 Subject: [PATCH 01/66] Rpow scalar --- pycuda/elementwise.py | 39 +++++++++++++++++++++++++++++++++++++++ pycuda/gpuarray.py | 21 +++++++++++++++++++++ test/test_gpuarray.py | 15 +++++++++++++++ 3 files changed, 75 insertions(+) diff --git a/pycuda/elementwise.py b/pycuda/elementwise.py index 258bae5c..607e9f9c 100644 --- a/pycuda/elementwise.py +++ b/pycuda/elementwise.py @@ -666,6 +666,45 @@ def get_pow_array_kernel(dtype_x, dtype_y, dtype_z): ) +@context_dependent_memoize +def get_rpow_kernel(dtype_x, dtype_y, dtype_z, is_base_array, is_exp_array): + """ + Returns the kernel for the operation: ``z = x ** y`` + """ + if np.float64 in [dtype_x, dtype_y]: + func = "pow" + else: + func = "powf" + + if not is_base_array and is_exp_array: + x_ctype = "%(tp_x)s x" + y_ctype = "%(tp_y)s *y" + func = "%s(x,y[i])" % func + + elif is_base_array and is_exp_array: + x_ctype = "%(tp_x)s *x" + y_ctype = "%(tp_y)s *y" + func = "%s(x[i],y[i])" % func + + elif is_base_array and not is_exp_array: + x_ctype = "%(tp_x)s *x" + y_ctype = "%(tp_y)s y" + func = "%s(x[i],y)" % func + + else: + raise AssertionError + + return get_elwise_kernel( + ("%(tp_z)s *z, " + x_ctype + ", "+y_ctype) + % { + "tp_x": dtype_to_ctype(dtype_x), + "tp_y": dtype_to_ctype(dtype_y), + "tp_z": dtype_to_ctype(dtype_z), + }, + "z[i] = %s" % func, + name="pow_method") + + @context_dependent_memoize def get_fmod_kernel(): return get_elwise_kernel( diff --git a/pycuda/gpuarray.py b/pycuda/gpuarray.py index a0bf84c4..5e7a5607 100644 --- a/pycuda/gpuarray.py +++ b/pycuda/gpuarray.py @@ -839,6 +839,27 @@ class GPUArray: """ return self._pow(other, new=False) + def __rpow__(self, other): + common_dtype = _get_common_dtype(self, other) + result = self._new_like_me(common_dtype) + + if not np.isscalar(other): + # Base is a gpuarray => do not cast. + base = other + else: + base = common_dtype.type(other) + + func = elementwise.get_rpow_kernel( + base.dtype, self.dtype, result.dtype, + is_base_array=not np.isscalar(other), is_exp_array=not np.isscalar(self)) + # Evaluates z = x ** y + func.prepared_async_call(self._grid, self._block, None, + result.gpudata, # z + base if np.isscalar(base) else base.gpudata, # x + self.gpudata, # y + self.mem_size) + return result + def reverse(self, stream=None): """Return this array in reversed order. The array is treated as one-dimensional. 
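Taken together, the ``elementwise.py`` and ``gpuarray.py`` hunks above let a scalar appear as the base of ``**`` with a ``GPUArray`` exponent, dispatching to the new ``__rpow__``/``get_rpow_kernel`` path. A minimal sketch of the resulting behavior (illustration only, not part of the patch; it assumes a CUDA context has been set up through ``pycuda.autoinit``)::

    import numpy as np
    import pycuda.autoinit  # noqa
    import pycuda.gpuarray as gpuarray

    a_gpu = gpuarray.to_gpu(np.random.rand(10))   # float64 array on the GPU
    b_gpu = 2.0 ** a_gpu                          # scalar base resolves to GPUArray.__rpow__
    np.testing.assert_allclose(b_gpu.get(), 2.0 ** a_gpu.get())
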
diff --git a/test/test_gpuarray.py b/test/test_gpuarray.py index dbf4b2f7..5234d8a2 100644 --- a/test/test_gpuarray.py +++ b/test/test_gpuarray.py @@ -40,6 +40,21 @@ class TestGPUArray: a_gpu = a_gpu.get() assert (np.abs(a ** 2 - a_gpu) < 1e-3).all() + @mark_cuda_test + def test_rpow_array(self): + scalar = np.random.rand() + a = abs(np.random.rand(10)) + a_gpu = gpuarray.to_gpu(a) + + result = (scalar ** a_gpu).get() + np.testing.assert_allclose(scalar ** a, result) + + result = (a_gpu ** a_gpu).get() + np.testing.assert_allclose(a ** a, result) + + result = (a_gpu ** scalar).get() + np.testing.assert_allclose(a ** scalar, result) + @mark_cuda_test def test_numpy_integer_shape(self): gpuarray.empty(np.int32(17), np.float32) -- GitLab From f8f1c8a6612ad14b55761fd93cee017e4391b5c7 Mon Sep 17 00:00:00 2001 From: mit kotak Date: Mon, 17 Jan 2022 12:56:42 -0600 Subject: [PATCH 02/66] Merged get_rpow_kernel into get_pow_array_kernel + Improved test_pow_array to catch order flips --- pycuda/elementwise.py | 24 +++--------------------- pycuda/gpuarray.py | 6 +++--- test/test_gpuarray.py | 14 ++++++++------ 3 files changed, 14 insertions(+), 30 deletions(-) diff --git a/pycuda/elementwise.py b/pycuda/elementwise.py index 607e9f9c..cbe92953 100644 --- a/pycuda/elementwise.py +++ b/pycuda/elementwise.py @@ -648,26 +648,7 @@ def get_pow_kernel(dtype): @context_dependent_memoize -def get_pow_array_kernel(dtype_x, dtype_y, dtype_z): - if np.float64 in [dtype_x, dtype_y]: - func = "pow" - else: - func = "powf" - - return get_elwise_kernel( - "%(tp_x)s *x, %(tp_y)s *y, %(tp_z)s *z" - % { - "tp_x": dtype_to_ctype(dtype_x), - "tp_y": dtype_to_ctype(dtype_y), - "tp_z": dtype_to_ctype(dtype_z), - }, - "z[i] = %s(x[i], y[i])" % func, - "pow_method", - ) - - -@context_dependent_memoize -def get_rpow_kernel(dtype_x, dtype_y, dtype_z, is_base_array, is_exp_array): +def get_pow_array_kernel(dtype_x, dtype_y, dtype_z, is_base_array, is_exp_array): """ Returns the kernel for the operation: ``z = x ** y`` """ @@ -702,7 +683,8 @@ def get_rpow_kernel(dtype_x, dtype_y, dtype_z, is_base_array, is_exp_array): "tp_z": dtype_to_ctype(dtype_z), }, "z[i] = %s" % func, - name="pow_method") + name="pow_method" + ) @context_dependent_memoize diff --git a/pycuda/gpuarray.py b/pycuda/gpuarray.py index 5e7a5607..ea06a792 100644 --- a/pycuda/gpuarray.py +++ b/pycuda/gpuarray.py @@ -780,16 +780,16 @@ class GPUArray: result = self func = elementwise.get_pow_array_kernel( - self.dtype, other.dtype, result.dtype + self.dtype, other.dtype, result.dtype, True, True ) func.prepared_async_call( self._grid, self._block, None, + result.gpudata, self.gpudata, other.gpudata, - result.gpudata, self.mem_size, ) @@ -849,7 +849,7 @@ class GPUArray: else: base = common_dtype.type(other) - func = elementwise.get_rpow_kernel( + func = elementwise.get_pow_array_kernel( base.dtype, self.dtype, result.dtype, is_base_array=not np.isscalar(other), is_exp_array=not np.isscalar(self)) # Evaluates z = x ** y diff --git a/test/test_gpuarray.py b/test/test_gpuarray.py index 5234d8a2..3e18e912 100644 --- a/test/test_gpuarray.py +++ b/test/test_gpuarray.py @@ -17,16 +17,18 @@ class TestGPUArray: def test_pow_array(self): a = np.array([1, 2, 3, 4, 5]).astype(np.float32) a_gpu = gpuarray.to_gpu(a) + b = np.array([1, 2, 3, 4, 5]).astype(np.float64) + b_gpu = gpuarray.to_gpu(b) - result = pow(a_gpu, a_gpu).get() - assert (np.abs(a ** a - result) < 1e-3).all() + result = pow(a_gpu, b_gpu).get() + assert (np.abs(a ** b - result) < 1e-3).all() - result = (a_gpu ** 
a_gpu).get() - assert (np.abs(pow(a, a) - result) < 1e-3).all() + result = (a_gpu ** b_gpu).get() + assert (np.abs(pow(a, b) - result) < 1e-3).all() - a_gpu **= a_gpu + a_gpu **= b_gpu a_gpu = a_gpu.get() - assert (np.abs(pow(a, a) - a_gpu) < 1e-3).all() + assert (np.abs(pow(a, b) - a_gpu) < 1e-3).all() @mark_cuda_test def test_pow_number(self): -- GitLab From b5c421a9ab696faf9930f08aac41e327e53c8639 Mon Sep 17 00:00:00 2001 From: mit kotak Date: Mon, 17 Jan 2022 13:03:46 -0600 Subject: [PATCH 03/66] Merge branch 'main' of https://gitlab.tiker.net/inducer/pycuda --- pycuda/compyte | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pycuda/compyte b/pycuda/compyte index 165b3aba..71bffa1a 160000 --- a/pycuda/compyte +++ b/pycuda/compyte @@ -1 +1 @@ -Subproject commit 165b3abae63bc39124a342ce1a539adbf6cd8a09 +Subproject commit 71bffa1ae64ed98b9d922c79a6f9cc7eb4fd642f -- GitLab From d38a58aaf1c8eb77a7860e02b680cce766b25a3b Mon Sep 17 00:00:00 2001 From: Andreas Kloeckner Date: Sun, 20 Dec 2020 13:18:58 -0600 Subject: [PATCH 04/66] Drop usage of six --- test/undistributed/elwise-perf.py | 5 ----- test/undistributed/measure_gpuarray_speed.py | 5 ----- test/undistributed/reduction-perf.py | 8 +------- 3 files changed, 1 insertion(+), 17 deletions(-) diff --git a/test/undistributed/elwise-perf.py b/test/undistributed/elwise-perf.py index dc2c2821..8562b99c 100644 --- a/test/undistributed/elwise-perf.py +++ b/test/undistributed/elwise-perf.py @@ -1,12 +1,7 @@ -from __future__ import absolute_import -from __future__ import print_function - -#! /usr/bin/env python import pycuda.driver as drv import pycuda.autoinit import numpy import numpy.linalg as la -from six.moves import range def main(): diff --git a/test/undistributed/measure_gpuarray_speed.py b/test/undistributed/measure_gpuarray_speed.py index 26a20f41..4f6f390e 100755 --- a/test/undistributed/measure_gpuarray_speed.py +++ b/test/undistributed/measure_gpuarray_speed.py @@ -1,13 +1,8 @@ -from __future__ import absolute_import -from __future__ import print_function - #! 
/usr/bin/env python import pycuda.driver as drv import pycuda.autoinit import numpy import numpy.linalg as la -from six.moves import range -from six.moves import zip def main(): diff --git a/test/undistributed/reduction-perf.py b/test/undistributed/reduction-perf.py index 842824cd..da4a0d03 100644 --- a/test/undistributed/reduction-perf.py +++ b/test/undistributed/reduction-perf.py @@ -1,11 +1,7 @@ -from __future__ import division -from __future__ import absolute_import -from __future__ import print_function import pycuda.autoinit import pycuda.gpuarray as gpuarray import pycuda.driver as cuda import numpy -from six.moves import range def main(): @@ -14,8 +10,6 @@ def main(): tbl = Table() tbl.add_row(("type", "size [MiB]", "time [ms]", "mem.bw [GB/s]")) - from random import shuffle - for dtype_out in [numpy.float32, numpy.float64]: for ex in range(15, 27): sz = 1 << ex @@ -28,7 +22,7 @@ def main(): assert sz == a_gpu.shape[0] assert len(a_gpu.shape) == 1 - from pycuda.reduction import get_sum_kernel, get_dot_kernel + from pycuda.reduction import get_dot_kernel krnl = get_dot_kernel(dtype_out, a_gpu.dtype) -- GitLab From 476cb84606c654661f311c4af00381d8e594f4e7 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Andreas=20Kl=C3=B6ckner?= Date: Mon, 4 Jan 2021 19:50:01 +0100 Subject: [PATCH 05/66] Shorten/standardize doc/conf --- doc/source/conf.py | 137 ++------------------------------------------- 1 file changed, 4 insertions(+), 133 deletions(-) diff --git a/doc/source/conf.py b/doc/source/conf.py index 8d54717f..2fb39ed0 100644 --- a/doc/source/conf.py +++ b/doc/source/conf.py @@ -1,26 +1,3 @@ -# PyCUDA documentation build configuration file, created by -# sphinx-quickstart on Fri Jun 13 00:51:19 2008. -# -# This file is execfile()d with the current directory set to its containing dir. -# -# The contents of this file are pickled, so don't put values in the namespace -# that aren't pickleable (module imports are okay, they're removed automatically). -# -# All configuration values have a default value; values that are commented out -# serve to show the default value. - -# import sys, os - -# If your extensions are in another directory, add it here. If the directory -# is relative to the documentation root, use os.path.abspath to make it -# absolute, like shown here. -# sys.path.append(os.path.abspath('some/directory')) - -# General configuration -# --------------------- - -# Add any Sphinx extension module names here, as strings. They can be extensions -# coming with Sphinx (named 'sphinx.ext.*') or your custom ones. extensions = [ "sphinx.ext.intersphinx", "sphinx.ext.mathjax", @@ -38,7 +15,7 @@ master_doc = "index" # General substitutions. project = "PyCUDA" -copyright = "2008, Andreas Kloeckner" +copyright = "2008-20, Andreas Kloeckner" # The default replacements for |version| and |release|, also used in various # other places throughout the built documents. @@ -55,30 +32,6 @@ version = ".".join(str(x) for x in ver_dic["VERSION"]) # The full version, including alpha/beta/rc tags. release = ver_dic["VERSION_TEXT"] -# There are two options for replacing |today|: either, you set today to some -# non-false value, then it is used: -# today = '' -# Else, today_fmt is used as the format for a strftime call. -today_fmt = "%B %d, %Y" - -# List of documents that shouldn't be included in the build. -# unused_docs = [] - -# List of directories, relative to source directories, that shouldn't be searched -# for source files. -# exclude_dirs = [] - -# If true, '()' will be appended to :func: etc. 
cross-reference text. -# add_function_parentheses = True - -# If true, the current module name will be prepended to all description -# unit titles (such as .. function::). -# add_module_names = True - -# If true, sectionauthor and moduleauthor directives will be shown in the -# output. They are ignored by default. -# show_authors = False - # The name of the Pygments (syntax highlighting) style to use. pygments_style = "sphinx" @@ -88,94 +41,12 @@ pygments_style = "sphinx" html_theme = "furo" -html_theme_options = { - } - -# The style sheet to use for HTML and HTML Help pages. A file of that name -# must exist either in Sphinx' static/ path, or in one of the custom paths -# given in html_static_path. -# html_style = 'default.css' - -# The name for this set of Sphinx documents. If None, it defaults to -# " v documentation". -# html_title = None - -# The name of an image file (within the static path) to place at the top of -# the sidebar. -# html_logo = None - -# Add any paths that contain custom static files (such as style sheets) here, -# relative to this directory. They are copied after the builtin static files, -# so a file named "default.css" will overwrite the builtin "default.css". -# html_static_path = ['_static'] - -# If not '', a 'Last updated on:' timestamp is inserted at every page bottom, -# using the given strftime format. -html_last_updated_fmt = "%b %d, %Y" - -# If true, SmartyPants will be used to convert quotes and dashes to -# typographically correct entities. -# html_use_smartypants = True - -# Custom sidebar templates, maps document names to template names. -# html_sidebars = {} - -# Additional templates that should be rendered to pages, maps page names to -# template names. -# html_additional_pages = {} - -# If false, no module index is generated. -# html_use_modindex = True - -# If true, the reST sources are included in the HTML build as _sources/. -# html_copy_source = True - -# If true, an OpenSearch description file will be output, and all pages will -# contain a tag referring to it. The value of this option must be the -# base URL from which the finished HTML is served. -# html_use_opensearch = '' - -# If nonempty, this is the file name suffix for HTML files (e.g. ".xhtml"). -# html_file_suffix = '' - -# Output file base name for HTML help builder. -htmlhelp_basename = "PyCudadoc" - - -# Options for LaTeX output -# ------------------------ - -# The paper size ('letter' or 'a4'). -# latex_paper_size = 'letter' - -# The font size ('10pt', '11pt' or '12pt'). -# latex_font_size = '10pt' - -# Grouping the document tree into LaTeX files. List of tuples -# (source start file, target name, title, author, document class [howto/manual]). -latex_documents = [ - ("index", "pycdua.tex", "PyCUDA Documentation", "Andreas Kloeckner", "manual"), -] - -# The name of an image file (relative to this directory) to place at the top of -# the title page. -# latex_logo = None - -# For "manual" documents, if this is true, then toplevel headings are parts, -# not chapters. -# latex_use_parts = False - -# Additional stuff for the LaTeX preamble. -# latex_preamble = '' - -# Documents to append as an appendix to all manuals. -# latex_appendices = [] - -# If false, no module index is generated. 
-# latex_use_modindex = True intersphinx_mapping = { "https://docs.python.org/3": None, "https://numpy.org/doc/stable/": None, "https://documen.tician.de/codepy/": None, } + +autoclass_content = "class" +autodoc_typehints = "description" -- GitLab From 04c54e2739b40e657725f2160282d67e6e6f9bda Mon Sep 17 00:00:00 2001 From: Andreas Kloeckner Date: Tue, 9 Feb 2021 18:20:38 -0600 Subject: [PATCH 06/66] Check contiguity of arrays in GPUArray.fill (closes gh-265) --- pycuda/gpuarray.py | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/pycuda/gpuarray.py b/pycuda/gpuarray.py index 97630e3b..a1a3f3f3 100644 --- a/pycuda/gpuarray.py +++ b/pycuda/gpuarray.py @@ -627,6 +627,10 @@ class GPUArray: def fill(self, value, stream=None): """fills the array with the specified value""" + if not self.flags.forc: + raise RuntimeError( + "only contiguous arrays may be used as arguments to this operation") + func = elementwise.get_fill_kernel(self.dtype) func.prepared_async_call( self._grid, self._block, stream, value, self.gpudata, self.mem_size @@ -650,8 +654,7 @@ class GPUArray: ): if not self.flags.forc: raise RuntimeError( - "only contiguous arrays may " "be used as arguments to this operation" - ) + "only contiguous arrays may be used as arguments to this operation") if self.dtype == np.float64 and allow_double_hack: if channels != 1: -- GitLab From dca0898b1afc667ce73444edb4a4725bf2a3bae4 Mon Sep 17 00:00:00 2001 From: Andreas Kloeckner Date: Tue, 9 Feb 2021 18:36:41 -0600 Subject: [PATCH 07/66] Fix test_zeros_like_etc to not use fill on non-contig arrays --- test/test_gpuarray.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/test/test_gpuarray.py b/test/test_gpuarray.py index fb6a20fc..fc7b6736 100644 --- a/test/test_gpuarray.py +++ b/test/test_gpuarray.py @@ -1192,6 +1192,9 @@ class TestGPUArray: contig = arr.flags.c_contiguous or arr.flags.f_contiguous + if not contig: + continue + # Output matches order of input. 
# Non-contiguous becomes C-contiguous new_z = func(arr, order="A") -- GitLab From a746dea70ee6bdb2fab7b01c66b49271cfda92c2 Mon Sep 17 00:00:00 2001 From: zzjjbb <31069326+zzjjbb@users.noreply.github.com> Date: Mon, 7 Dec 2020 19:41:45 -0500 Subject: [PATCH 08/66] add "out" parameter to GPUArray.conj(); add equivalent GPUArray.conjugate() method --- pycuda/elementwise.py | 5 +++-- pycuda/gpuarray.py | 11 ++++++++--- test/test_gpuarray.py | 4 ++++ 3 files changed, 15 insertions(+), 5 deletions(-) diff --git a/pycuda/elementwise.py b/pycuda/elementwise.py index 633edd78..258bae5c 100644 --- a/pycuda/elementwise.py +++ b/pycuda/elementwise.py @@ -606,11 +606,12 @@ def get_imag_kernel(dtype, real_dtype): @context_dependent_memoize -def get_conj_kernel(dtype): +def get_conj_kernel(dtype, conj_dtype): return get_elwise_kernel( - "%(tp)s *y, %(tp)s *z" + "%(tp)s *y, %(conj_tp)s *z" % { "tp": dtype_to_ctype(dtype), + "conj_tp": dtype_to_ctype(conj_dtype) }, "z[i] = pycuda::conj(y[i])", "conj", diff --git a/pycuda/gpuarray.py b/pycuda/gpuarray.py index a1a3f3f3..f5908a06 100644 --- a/pycuda/gpuarray.py +++ b/pycuda/gpuarray.py @@ -1141,7 +1141,7 @@ class GPUArray: else: return zeros_like(self) - def conj(self): + def conj(self, out=None): dtype = self.dtype if issubclass(self.dtype.type, np.complexfloating): if not self.flags.forc: @@ -1154,9 +1154,12 @@ class GPUArray: order = "F" else: order = "C" - result = self._new_like_me(order=order) + if out is None: + result = self._new_like_me(order=order) + else: + result = out - func = elementwise.get_conj_kernel(dtype) + func = elementwise.get_conj_kernel(dtype, result.dtype) func.prepared_async_call( self._grid, self._block, @@ -1170,6 +1173,8 @@ class GPUArray: else: return self + conjugate = conj + # }}} # {{{ rich comparisons diff --git a/test/test_gpuarray.py b/test/test_gpuarray.py index fc7b6736..d5d09251 100644 --- a/test/test_gpuarray.py +++ b/test/test_gpuarray.py @@ -732,6 +732,10 @@ class TestGPUArray: assert la.norm(z.get().real - z.real.get()) == 0 assert la.norm(z.get().imag - z.imag.get()) == 0 assert la.norm(z.get().conj() - z.conj().get()) == 0 + # verify conj with out parameter + z_out = z.astype(np.complex64) + assert z_out is z.conj(out=z_out) + assert la.norm(z.get().conj() - z_out.get()) < 1e-7 # verify contiguity is preserved for order in ["C", "F"]: -- GitLab From 684b2d3d0f2bab88ebb13fd59c3b6b77c59985f1 Mon Sep 17 00:00:00 2001 From: Jiabei Zhu Date: Wed, 10 Feb 2021 04:18:37 -0500 Subject: [PATCH 09/66] change doc for conj/conjugate --- doc/source/array.rst | 18 ++++++++++++++++-- 1 file changed, 16 insertions(+), 2 deletions(-) diff --git a/doc/source/array.rst b/doc/source/array.rst index 34efb1cf..3b12f04a 100644 --- a/doc/source/array.rst +++ b/doc/source/array.rst @@ -189,12 +189,26 @@ The :class:`GPUArray` Array Class .. versionadded: 0.94 - .. method :: conj() + .. method :: conj(out=None) - Return the complex conjugate of *self*, or *self* if it is real. + Return the complex conjugate of *self*, or *self* if it is real. If *out* + is not given, a newly allocated :class:`GPUArray` will returned. Use + *out=self* to get conjugate in-place. .. versionadded: 0.94 + .. versionchanged:: 2020.1.1 + + add *out* parameter + + + .. method :: conjugate(out=None) + + alias of :meth:`conj` + + .. versionadded:: 2020.1.1 + + .. method:: bind_to_texref(texref, allow_offset=False) Bind *self* to the :class:`pycuda.driver.TextureReference` *texref*. 
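A short sketch of the ``conj``/``conjugate`` behavior documented above (illustration only, assuming a CUDA context via ``pycuda.autoinit``; per the added docs, ``out=self`` computes the conjugate in place)::

    import numpy as np
    import pycuda.autoinit  # noqa
    import pycuda.gpuarray as gpuarray

    z = gpuarray.to_gpu(np.array([1 + 2j, 3 - 4j], dtype=np.complex128))
    zc = z.conjugate()   # alias of conj(); allocates and returns a new GPUArray
    z.conj(out=z)        # conjugate in place, reusing z's storage
    assert np.allclose(zc.get(), z.get())
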
-- GitLab From 6905d772f8f61bf70ddbd535e8b30cb38be462ca Mon Sep 17 00:00:00 2001 From: Bruce Merry Date: Sun, 14 Feb 2021 13:36:28 +0200 Subject: [PATCH 10/66] Add documentation memset_*_async. Closes #266. --- doc/source/driver.rst | 21 +++++++++++++++++++++ 1 file changed, 21 insertions(+) diff --git a/doc/source/driver.rst b/doc/source/driver.rst index 9f9e9bb6..8de1f2d1 100644 --- a/doc/source/driver.rst +++ b/doc/source/driver.rst @@ -1476,6 +1476,8 @@ Initializing Device Memory .. function:: memset_d16(dest, data, count) .. function:: memset_d32(dest, data, count) + Fill array with *data*. + .. note:: *count* is the number of elements, not bytes. @@ -1484,6 +1486,25 @@ Initializing Device Memory .. function:: memset_d2d16(dest, pitch, data, width, height) .. function:: memset_d2d32(dest, pitch, data, width, height) + Fill a two-dimensional array with *data*. + +.. function:: memset_d8_async(dest, data, count, stream=None) +.. function:: memset_d16_async(dest, data, count, stream=None) +.. function:: memset_d32_async(dest, data, count, stream=None) + + Fill array with *data* asynchronously, optionally serialized via *stream*. + + .. versionadded:: 2015.1 + +.. function:: memset_d2d8_async(dest, pitch, data, width, height, stream=None) +.. function:: memset_d2d16_async(dest, pitch, data, width, height, stream=None) +.. function:: memset_d2d32_async(dest, pitch, data, width, height, stream=None) + + Fill a two-dimensional array with *data* asynchronously, optionally + serialized via *stream*. + + .. versionadded:: 2015.1 + Unstructured Memory Transfers ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -- GitLab From 982f4dccff03adb63b6cc0948c486096dd4cfc33 Mon Sep 17 00:00:00 2001 From: Andreas Kloeckner Date: Tue, 23 Feb 2021 16:49:22 -0600 Subject: [PATCH 11/66] Upgrade compyte for sized bool --- pycuda/compyte | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pycuda/compyte b/pycuda/compyte index d1f993da..7533db88 160000 --- a/pycuda/compyte +++ b/pycuda/compyte @@ -1 +1 @@ -Subproject commit d1f993daecc03947d9e6e3e60d2a5145ecbf3786 +Subproject commit 7533db88124045924a47d7392eaf9a078670fc4d -- GitLab From e209c10f935051ef6781cb5713fa78d326988c8b Mon Sep 17 00:00:00 2001 From: Bruce Merry Date: Fri, 5 Mar 2021 09:18:46 +0200 Subject: [PATCH 12/66] Add a pyproject.toml to control build-time numpy version Build against the oldest C_API_VERSION available for the current Python version, to maximise compatibility of the resulting installation. Closes #271. --- pyproject.toml | 13 +++++++++++++ 1 file changed, 13 insertions(+) create mode 100644 pyproject.toml diff --git a/pyproject.toml b/pyproject.toml new file mode 100644 index 00000000..50864b6a --- /dev/null +++ b/pyproject.toml @@ -0,0 +1,13 @@ +[build-system] + +# For each Python version, build against the oldest numpy C_API_VERSION for +# which binary numpy wheels exist, and then the newest version of numpy +# implementing that C_API_VERSION. 
+requires = [ + "setuptools", + "wheel", + "numpy; python_version >= '3.10'", + "numpy==1.19.5; python_version >= '3.8' and python_version < '3.10'", + "numpy==1.15.4; python_version >= '3.7' and python_version < '3.8'", + "numpy==1.12.1; python_version < '3.7'", +] -- GitLab From 4943befa3ab5905519a1b079315a55d918e4a5e7 Mon Sep 17 00:00:00 2001 From: Andreas Kloeckner Date: Mon, 8 Mar 2021 00:37:48 -0600 Subject: [PATCH 13/66] CI/README: master->main --- .github/workflows/ci.yml | 4 ++-- .gitlab-ci.yml | 10 +++++----- 2 files changed, 7 insertions(+), 7 deletions(-) diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index a6295018..9cfa45a6 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -2,7 +2,7 @@ name: CI on: push: branches: - - master + - main pull_request: schedule: - cron: '17 3 * * 0' @@ -19,5 +19,5 @@ jobs: python-version: '3.x' - name: "Main Script" run: | - curl -L -O -k https://gitlab.tiker.net/inducer/ci-support/raw/master/prepare-and-run-flake8.sh + curl -L -O -k https://gitlab.tiker.net/inducer/ci-support/raw/main/prepare-and-run-flake8.sh . ./prepare-and-run-flake8.sh "$(basename $GITHUB_REPOSITORY)" test/*.py diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml index 174d08dc..4f4d93b8 100644 --- a/.gitlab-ci.yml +++ b/.gitlab-ci.yml @@ -3,7 +3,7 @@ Python 3 Titan X: - py_version=3 - EXTRA_INSTALL="numpy mako" - echo "CUDADRV_LIB_DIR = ['/usr/lib/x86_64-linux-gnu/nvidia/current']" > siteconf.py - - curl -L -O -k https://gitlab.tiker.net/inducer/ci-support/raw/master/build-and-test-py-project.sh + - curl -L -O -k https://gitlab.tiker.net/inducer/ci-support/raw/main/build-and-test-py-project.sh - "export LD_LIBRARY_PATH=/usr/local/cuda/lib64:$LD_LIBRARY_PATH" - ". ./build-and-test-py-project.sh" tags: @@ -17,7 +17,7 @@ Python 3 Titan V: - py_version=3 - EXTRA_INSTALL="numpy mako" - echo "CUDADRV_LIB_DIR = ['/usr/lib/x86_64-linux-gnu/nvidia/current']" > siteconf.py - - curl -L -O -k https://gitlab.tiker.net/inducer/ci-support/raw/master/build-and-test-py-project.sh + - curl -L -O -k https://gitlab.tiker.net/inducer/ci-support/raw/main/build-and-test-py-project.sh - "export LD_LIBRARY_PATH=/usr/local/cuda/lib64:$LD_LIBRARY_PATH" - ". ./build-and-test-py-project.sh" tags: @@ -31,7 +31,7 @@ Python 3 K40: - py_version=3 - EXTRA_INSTALL="numpy mako" - echo "CUDADRV_LIB_DIR = ['/usr/lib/x86_64-linux-gnu/nvidia/current']" > siteconf.py - - curl -L -O -k https://gitlab.tiker.net/inducer/ci-support/raw/master/build-and-test-py-project.sh + - curl -L -O -k https://gitlab.tiker.net/inducer/ci-support/raw/main/build-and-test-py-project.sh - "export LD_LIBRARY_PATH=/usr/local/cuda/lib64:$LD_LIBRARY_PATH" - ". ./build-and-test-py-project.sh" @@ -44,7 +44,7 @@ Python 3 K40: Documentation: script: | EXTRA_INSTALL="numpy mako" - curl -L -O -k https://gitlab.tiker.net/inducer/ci-support/raw/master/ci-support.sh + curl -L -O -k https://gitlab.tiker.net/inducer/ci-support/raw/main/ci-support.sh . ./ci-support.sh build_py_project_in_venv build_docs --no-check @@ -56,7 +56,7 @@ Documentation: Flake8: script: - - curl -L -O -k https://gitlab.tiker.net/inducer/ci-support/raw/master/prepare-and-run-flake8.sh + - curl -L -O -k https://gitlab.tiker.net/inducer/ci-support/raw/main/prepare-and-run-flake8.sh - . 
./prepare-and-run-flake8.sh "$CI_PROJECT_NAME" test/*.py tags: - python3 -- GitLab From a558f4bf7293230324fdfb9fc087de3d4811d3b0 Mon Sep 17 00:00:00 2001 From: Graham Markall Date: Fri, 19 Feb 2021 12:58:27 +0000 Subject: [PATCH 14/66] Support the CUDA Array Interface This commit adds support for version 3 of the CUDA Array Interface (https://numba.readthedocs.io/en/latest/cuda/cuda_array_interface.html): - GPU Arrays export the interface so that they can be passed to other supporting libraries. - Function calls check for the presence of the interface and use the data pointer from it if found, so other libraries' arrays can be passed to PyCUDA-compiled kernels. Documentation, tests, and examples for the interface going in both directions are added. --- doc/source/array.rst | 6 ++++ doc/source/tutorial.rst | 29 +++++++++++++++ examples/cai_cupy_arrays.py | 33 +++++++++++++++++ examples/cai_numba.py | 40 +++++++++++++++++++++ pycuda/driver.py | 8 +++++ pycuda/gpuarray.py | 23 ++++++++++++ test/test_driver.py | 71 +++++++++++++++++++++++++++++++++++++ 7 files changed, 210 insertions(+) create mode 100644 examples/cai_cupy_arrays.py create mode 100644 examples/cai_numba.py diff --git a/doc/source/array.rst b/doc/source/array.rst index 3b12f04a..de1cc66e 100644 --- a/doc/source/array.rst +++ b/doc/source/array.rst @@ -80,6 +80,12 @@ The :class:`GPUArray` Array Class .. versionadded: 2011.1 + .. attribute :: __cuda_array_interface__ + + Return a `CUDA Array Interface + `_ + dict describing this array's data. + .. method :: __len__() Returns the size of the leading dimension of *self*. diff --git a/doc/source/tutorial.rst b/doc/source/tutorial.rst index ea6a804f..ba729875 100644 --- a/doc/source/tutorial.rst +++ b/doc/source/tutorial.rst @@ -191,6 +191,35 @@ only the second:: func(numpy.intp(do2_ptr), block = (32, 1, 1), grid=(1, 1)) print("doubled second only", array1, array2, "\n") +Interoperability With Other Libraries Using The CUDA Array Interface +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +Kernel calls can be passed arrays from other CUDA libraries that support the +`CUDA Array Interface +`_. For +example, to double a `CuPy `_ array:: + + import cupy as cp + + cupy_a = cp.random.randn(4, 4).astype(cp.float32) + func = mod.get_function("double_array") + func(cupy_a, block=(4, 4, 1), grid=(1, 1)) + +PyCUDA GPU Arrays implement the CUDA Array Interface, so they can be passed +into functions from other libraries that support it. 
For example, to double a +PyCUDA GPU Array using a `Numba `_ kernel:: + + from numba import cuda + + a_gpu = gpuarray.to_gpu(numpy.random.randn(4, 4).astype(numpy.float32)) + + @cuda.jit + def double(x): + i, j = cuda.grid(2) + x[i, j] *= 2 + + double[(4, 4), (1, 1)](a_gpu) + Where to go from here --------------------- diff --git a/examples/cai_cupy_arrays.py b/examples/cai_cupy_arrays.py new file mode 100644 index 00000000..2c524ba9 --- /dev/null +++ b/examples/cai_cupy_arrays.py @@ -0,0 +1,33 @@ +# Copyright 2008-2021 Andreas Kloeckner +# Copyright 2021 NVIDIA Corporation + +import pycuda.autoinit # noqa +from pycuda.compiler import SourceModule + +import cupy as cp + + +# Create a CuPy array (and a copy for comparison later) +cupy_a = cp.random.randn(4, 4).astype(cp.float32) +original = cupy_a.copy() + + +# Create a kernel +mod = SourceModule(""" + __global__ void doublify(float *a) + { + int idx = threadIdx.x + threadIdx.y*4; + a[idx] *= 2; + } + """) + +func = mod.get_function("doublify") + +# Invoke PyCUDA kernel on a CuPy array +func(cupy_a, block=(4, 4, 1), grid=(1, 1), shared=0) + +# Demonstrate that our CuPy array was modified in place by the PyCUDA kernel +print("original array:") +print(original) +print("doubled with kernel:") +print(cupy_a) diff --git a/examples/cai_numba.py b/examples/cai_numba.py new file mode 100644 index 00000000..05fa317f --- /dev/null +++ b/examples/cai_numba.py @@ -0,0 +1,40 @@ +# Copyright 2008-2021 Andreas Kloeckner +# Copyright 2021 NVIDIA Corporation + +from numba import cuda + +import pycuda.driver as pycuda +import pycuda.autoinit # noqa +import pycuda.gpuarray as gpuarray + +import numpy + + +# Create a PyCUDA gpuarray +a_gpu = gpuarray.to_gpu(numpy.random.randn(4, 4).astype(numpy.float32)) +print("original array:") +print(a_gpu) + +# Retain PyCUDA context as primary and make current so that Numba is happy +pyc_dev = pycuda.autoinit.device +pyc_ctx = pyc_dev.retain_primary_context() +pyc_ctx.push() + + +# A standard Numba kernel that doubles its input array +@cuda.jit +def double(x): + i, j = cuda.grid(2) + + if i < x.shape[0] and j < x.shape[1]: + x[i, j] *= 2 + + +# Call the Numba kernel on the PyCUDA gpuarray, using the CUDA Array Interface +# transparently +double[(4, 4), (1, 1)](a_gpu) +print("doubled with numba:") +print(a_gpu) + +# Pop context to allow PyCUDA to clean up +pyc_ctx.pop() diff --git a/pycuda/driver.py b/pycuda/driver.py index 6bfd097e..ac305129 100644 --- a/pycuda/driver.py +++ b/pycuda/driver.py @@ -1,3 +1,8 @@ +__copyright__ = """ +Copyright 2008-2021 Andreas Kloeckner +Copyright 2021 NVIDIA Corporation +""" + import os import numpy as np @@ -207,6 +212,9 @@ def _add_functionality(): elif isinstance(arg, np.void): arg_data.append(_my_bytes(_memoryview(arg))) format += "%ds" % arg.itemsize + elif hasattr(arg, '__cuda_array_interface__'): + arg_data.append(arg.__cuda_array_interface__['data'][0]) + format += "P" else: try: gpudata = np.uintp(arg.gpudata) diff --git a/pycuda/gpuarray.py b/pycuda/gpuarray.py index f5908a06..febfc330 100644 --- a/pycuda/gpuarray.py +++ b/pycuda/gpuarray.py @@ -1,3 +1,8 @@ +__copyright__ = """ +Copyright 2008-2021 Andreas Kloeckner +Copyright 2021 NVIDIA Corporation +""" + import numpy as np import pycuda.elementwise as elementwise from pytools import memoize, memoize_method @@ -252,6 +257,24 @@ class GPUArray: self._grid, self._block = splay(self.mem_size) + @property + def __cuda_array_interface__(self): + """Returns a CUDA Array Interface dictionary describing this array's + data.""" + if 
self.gpudata is not None: + ptr = int(self.gpudata) + else: + ptr = 0 + + return { + 'shape': self.shape, + 'strides': self.strides, + 'data': (ptr, False), + 'typestr': self.dtype.str, + 'stream': None, + 'version': 3 + } + @property def ndim(self): return len(self.shape) diff --git a/test/test_driver.py b/test/test_driver.py index c720cebf..7c9c18ac 100644 --- a/test/test_driver.py +++ b/test/test_driver.py @@ -1,3 +1,8 @@ +__copyright__ = """ +Copyright 2008-2021 Andreas Kloeckner +Copyright 2021 NVIDIA Corporation +""" + import numpy as np import numpy.linalg as la from pycuda.tools import mark_cuda_test, dtype_to_ctype @@ -144,6 +149,23 @@ class TestDriver: diff = (a_g * b_g).get() - a * b assert la.norm(diff) == 0 + @mark_cuda_test + def test_gpuarray_cai(self): + a = np.zeros(10, dtype=np.float32) + a_g = gpuarray.to_gpu(a) + cai = a_g.__cuda_array_interface__ + ptr = cai['data'][0] + masked = cai['data'][1] + + assert cai['shape'] == a.shape + assert cai['strides'] == a.strides + assert cai['typestr'] == a.dtype.str + assert isinstance(ptr, int) + assert ptr != 0 + assert not masked + assert cai['stream'] is None + assert cai['version'] == 3 + @mark_cuda_test def donottest_cublas_mixing(self): self.test_streamed_kernel() @@ -1054,6 +1076,55 @@ def test_pointer_holder_base(): print(ary.get()) +# A class to emulate an object from outside PyCUDA that implements the CUDA +# Array Interface +class CudaArrayInterfaceImpl: + def __init__(self, size, itemsize, dtype): + self._shape = (size,) + self._strides = (itemsize,) + self._typestr = dtype.str + self._ptr = drv.mem_alloc(size * itemsize) + + @property + def __cuda_array_interface__(self): + return { + 'shape': self._shape, + 'strides': self._strides, + 'typestr': self._typestr, + 'data': (int(self._ptr), False), + 'stream': None, + 'version': 3 + } + + @property + def ptr(self): + return self._ptr + + +def test_pass_cai_array(): + dtype = np.int32 + size = 1024 + np_array = np.arange(size, dtype=dtype) + cai_array = CudaArrayInterfaceImpl(size, np_array.itemsize, np_array.dtype) + + mod = SourceModule( + """ + __global__ void gpu_arange(int *x) + { + const int i = threadIdx.x; + x[i] = i; + } + """ + ) + + gpu_arange = mod.get_function("gpu_arange") + gpu_arange(cai_array, grid=(1,), block=(size, 1, 1)) + + host_array = np.empty_like(np_array) + drv.memcpy_dtoh(host_array, cai_array.ptr) + assert (host_array == np_array).all() + + def test_import_pyopencl_before_pycuda(): try: import pyopencl # noqa -- GitLab From 305f887a76ff3656aaab5ed4418c79a00787a490 Mon Sep 17 00:00:00 2001 From: Graham Markall Date: Thu, 4 Mar 2021 10:43:37 +0000 Subject: [PATCH 15/66] Use appropriate quotes --- pycuda/compyte | 2 +- pycuda/driver.py | 4 ++-- pycuda/gpuarray.py | 12 ++++++------ test/test_driver.py | 26 +++++++++++++------------- 4 files changed, 22 insertions(+), 22 deletions(-) diff --git a/pycuda/compyte b/pycuda/compyte index 7533db88..d1f993da 160000 --- a/pycuda/compyte +++ b/pycuda/compyte @@ -1 +1 @@ -Subproject commit 7533db88124045924a47d7392eaf9a078670fc4d +Subproject commit d1f993daecc03947d9e6e3e60d2a5145ecbf3786 diff --git a/pycuda/driver.py b/pycuda/driver.py index ac305129..77657afe 100644 --- a/pycuda/driver.py +++ b/pycuda/driver.py @@ -212,8 +212,8 @@ def _add_functionality(): elif isinstance(arg, np.void): arg_data.append(_my_bytes(_memoryview(arg))) format += "%ds" % arg.itemsize - elif hasattr(arg, '__cuda_array_interface__'): - arg_data.append(arg.__cuda_array_interface__['data'][0]) + elif hasattr(arg, 
"__cuda_array_interface__"): + arg_data.append(arg.__cuda_array_interface__["data"][0]) format += "P" else: try: diff --git a/pycuda/gpuarray.py b/pycuda/gpuarray.py index febfc330..5d9ccaa5 100644 --- a/pycuda/gpuarray.py +++ b/pycuda/gpuarray.py @@ -267,12 +267,12 @@ class GPUArray: ptr = 0 return { - 'shape': self.shape, - 'strides': self.strides, - 'data': (ptr, False), - 'typestr': self.dtype.str, - 'stream': None, - 'version': 3 + "shape": self.shape, + "strides": self.strides, + "data": (ptr, False), + "typestr": self.dtype.str, + "stream": None, + "version": 3 } @property diff --git a/test/test_driver.py b/test/test_driver.py index 7c9c18ac..b022aa37 100644 --- a/test/test_driver.py +++ b/test/test_driver.py @@ -154,17 +154,17 @@ class TestDriver: a = np.zeros(10, dtype=np.float32) a_g = gpuarray.to_gpu(a) cai = a_g.__cuda_array_interface__ - ptr = cai['data'][0] - masked = cai['data'][1] + ptr = cai["data"][0] + masked = cai["data"][1] - assert cai['shape'] == a.shape - assert cai['strides'] == a.strides - assert cai['typestr'] == a.dtype.str + assert cai["shape"] == a.shape + assert cai["strides"] == a.strides + assert cai["typestr"] == a.dtype.str assert isinstance(ptr, int) assert ptr != 0 assert not masked - assert cai['stream'] is None - assert cai['version'] == 3 + assert cai["stream"] is None + assert cai["version"] == 3 @mark_cuda_test def donottest_cublas_mixing(self): @@ -1088,12 +1088,12 @@ class CudaArrayInterfaceImpl: @property def __cuda_array_interface__(self): return { - 'shape': self._shape, - 'strides': self._strides, - 'typestr': self._typestr, - 'data': (int(self._ptr), False), - 'stream': None, - 'version': 3 + "shape": self._shape, + "strides": self._strides, + "typestr": self._typestr, + "data": (int(self._ptr), False), + "stream": None, + "version": 3 } @property -- GitLab From e9c585ba70c3e2cc528ea218da5e5c5ab3f0d33b Mon Sep 17 00:00:00 2001 From: Graham Markall Date: Fri, 5 Mar 2021 11:51:49 +0000 Subject: [PATCH 16/66] Add reference to GPUArray class docs --- doc/source/tutorial.rst | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/doc/source/tutorial.rst b/doc/source/tutorial.rst index ba729875..681ee37d 100644 --- a/doc/source/tutorial.rst +++ b/doc/source/tutorial.rst @@ -205,9 +205,10 @@ example, to double a `CuPy `_ array:: func = mod.get_function("double_array") func(cupy_a, block=(4, 4, 1), grid=(1, 1)) -PyCUDA GPU Arrays implement the CUDA Array Interface, so they can be passed -into functions from other libraries that support it. For example, to double a -PyCUDA GPU Array using a `Numba `_ kernel:: +:class:`~pycuda.gpuarray.GPUArray` implements the CUDA Array Interface, so its +instances can be passed into functions from other libraries that support it. 
+For example, to double a PyCUDA GPU Array using a `Numba +`_ kernel:: from numba import cuda -- GitLab From 0273af2c5114a33863782ce41f53936f3730203d Mon Sep 17 00:00:00 2001 From: Graham Markall Date: Tue, 23 Feb 2021 16:49:22 -0600 Subject: [PATCH 17/66] Revert accidental update of compyte submodule --- pycuda/compyte | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pycuda/compyte b/pycuda/compyte index d1f993da..7533db88 160000 --- a/pycuda/compyte +++ b/pycuda/compyte @@ -1 +1 @@ -Subproject commit d1f993daecc03947d9e6e3e60d2a5145ecbf3786 +Subproject commit 7533db88124045924a47d7392eaf9a078670fc4d -- GitLab From ab5d0c37819c99000309670c2a7a9f9fdf8801ae Mon Sep 17 00:00:00 2001 From: Graham Markall Date: Fri, 5 Mar 2021 12:08:12 +0000 Subject: [PATCH 18/66] Add a comment explaining readonly flag of CAI --- pycuda/gpuarray.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/pycuda/gpuarray.py b/pycuda/gpuarray.py index 5d9ccaa5..373cf005 100644 --- a/pycuda/gpuarray.py +++ b/pycuda/gpuarray.py @@ -269,6 +269,8 @@ class GPUArray: return { "shape": self.shape, "strides": self.strides, + # data is a tuple: (ptr, readonly) - always export GPUArray + # instances as read-write "data": (ptr, False), "typestr": self.dtype.str, "stream": None, -- GitLab From b4843494be3d733b5e61c0b9d6d9652c00f17f06 Mon Sep 17 00:00:00 2001 From: Graham Markall Date: Tue, 9 Mar 2021 14:27:59 +0000 Subject: [PATCH 19/66] Use getattr instead of hasattr for CUDA Array Interface --- pycuda/driver.py | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/pycuda/driver.py b/pycuda/driver.py index 77657afe..47d15b19 100644 --- a/pycuda/driver.py +++ b/pycuda/driver.py @@ -212,10 +212,13 @@ def _add_functionality(): elif isinstance(arg, np.void): arg_data.append(_my_bytes(_memoryview(arg))) format += "%ds" % arg.itemsize - elif hasattr(arg, "__cuda_array_interface__"): - arg_data.append(arg.__cuda_array_interface__["data"][0]) - format += "P" else: + cai = getattr(arg, "__cuda_array_interface__", None) + if cai: + arg_data.append(cai["data"][0]) + format += "P" + continue + try: gpudata = np.uintp(arg.gpudata) except AttributeError: -- GitLab From 88c7bc371aa506207cc88d581eeaa21925a371cd Mon Sep 17 00:00:00 2001 From: Graham Markall Date: Tue, 9 Mar 2021 14:34:08 +0000 Subject: [PATCH 20/66] Add autoprimaryctx and use in CAI Numba example --- doc/source/util.rst | 6 ++++++ examples/cai_numba.py | 12 +++--------- pycuda/autoprimaryctx.py | 31 +++++++++++++++++++++++++++++++ 3 files changed, 40 insertions(+), 9 deletions(-) create mode 100644 pycuda/autoprimaryctx.py diff --git a/doc/source/util.rst b/doc/source/util.rst index 0cd85d69..44c9a56f 100644 --- a/doc/source/util.rst +++ b/doc/source/util.rst @@ -21,6 +21,12 @@ It uses :func:`pycuda.tools.make_default_context` to create a compute context. on :data:`device`. This context is created by calling :func:`pycuda.tools.make_default_context`. +.. module:: pycuda.autoprimaryctx + +The module :mod:`pycuda.autoprimaryctx` is similar to :mod:`pycuda.autoinit`, +except that it retains the device primary context instead of creating a new +context in :func:`pycuda.tools.make_default_context`. 
+ Choice of Device ---------------- diff --git a/examples/cai_numba.py b/examples/cai_numba.py index 05fa317f..0a94ee48 100644 --- a/examples/cai_numba.py +++ b/examples/cai_numba.py @@ -4,7 +4,9 @@ from numba import cuda import pycuda.driver as pycuda -import pycuda.autoinit # noqa +# We use autoprimaryctx instead of autoinit because Numba can only operate on a +# primary context +import pycuda.autoprimaryctx # noqa import pycuda.gpuarray as gpuarray import numpy @@ -15,11 +17,6 @@ a_gpu = gpuarray.to_gpu(numpy.random.randn(4, 4).astype(numpy.float32)) print("original array:") print(a_gpu) -# Retain PyCUDA context as primary and make current so that Numba is happy -pyc_dev = pycuda.autoinit.device -pyc_ctx = pyc_dev.retain_primary_context() -pyc_ctx.push() - # A standard Numba kernel that doubles its input array @cuda.jit @@ -35,6 +32,3 @@ def double(x): double[(4, 4), (1, 1)](a_gpu) print("doubled with numba:") print(a_gpu) - -# Pop context to allow PyCUDA to clean up -pyc_ctx.pop() diff --git a/pycuda/autoprimaryctx.py b/pycuda/autoprimaryctx.py new file mode 100644 index 00000000..537c8610 --- /dev/null +++ b/pycuda/autoprimaryctx.py @@ -0,0 +1,31 @@ +import pycuda.driver as cuda +import atexit + +# Initialize CUDA +cuda.init() + +from pycuda.tools import make_default_context # noqa: E402 + + +def _retain_primary_context(dev): + context = dev.retain_primary_context() + context.push() + return context + + +global context +context = make_default_context(_retain_primary_context) +device = context.get_device() + + +def _finish_up(): + global context + context.pop() + context = None + + from pycuda.tools import clear_context_caches + + clear_context_caches() + + +atexit.register(_finish_up) -- GitLab From 702af5e80bcdc163649d2a7f98b5bff73d0bc3df Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Andreas=20Kl=C3=B6ckner?= Date: Wed, 10 Mar 2021 23:53:25 -0600 Subject: [PATCH 21/66] Doc tweaks for array interface support --- doc/source/tutorial.rst | 2 +- doc/source/util.rst | 3 ++- 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/doc/source/tutorial.rst b/doc/source/tutorial.rst index 681ee37d..2c401b88 100644 --- a/doc/source/tutorial.rst +++ b/doc/source/tutorial.rst @@ -196,7 +196,7 @@ Interoperability With Other Libraries Using The CUDA Array Interface Kernel calls can be passed arrays from other CUDA libraries that support the `CUDA Array Interface -`_. For +`__. For example, to double a `CuPy `_ array:: import cupy as cp diff --git a/doc/source/util.rst b/doc/source/util.rst index 44c9a56f..f83907d6 100644 --- a/doc/source/util.rst +++ b/doc/source/util.rst @@ -25,7 +25,8 @@ It uses :func:`pycuda.tools.make_default_context` to create a compute context. The module :mod:`pycuda.autoprimaryctx` is similar to :mod:`pycuda.autoinit`, except that it retains the device primary context instead of creating a new -context in :func:`pycuda.tools.make_default_context`. +context in :func:`pycuda.tools.make_default_context`. Notably, it also +has ``device`` and ``context`` attributes. 
Choice of Device ---------------- -- GitLab From e0fd3f1ea954ab7fb305d61b1afcae6dd8a66b38 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Andreas=20Kl=C3=B6ckner?= Date: Wed, 24 Mar 2021 19:27:06 +0000 Subject: [PATCH 22/66] Remove py-version-specific Trove classifiers from setup.py --- setup.py | 4 ---- 1 file changed, 4 deletions(-) diff --git a/setup.py b/setup.py index 2a7acbe3..990a7830 100644 --- a/setup.py +++ b/setup.py @@ -212,10 +212,6 @@ def main(): "Programming Language :: C++", "Programming Language :: Python", "Programming Language :: Python :: 3", - "Programming Language :: Python :: 2.6", - "Programming Language :: Python :: 2.7", - "Programming Language :: Python :: 3.3", - "Programming Language :: Python :: 3.4", "Topic :: Scientific/Engineering", "Topic :: Scientific/Engineering :: Mathematics", "Topic :: Scientific/Engineering :: Physics", -- GitLab From a39ef021e0a6ab79e48aee86ed1779cb1d32f537 Mon Sep 17 00:00:00 2001 From: Andreas Kloeckner Date: Sat, 27 Mar 2021 12:26:44 -0500 Subject: [PATCH 23/66] Bump size parameters to size_t for a few memcpys --- src/cpp/cuda.hpp | 6 +++--- src/wrapper/wrap_cudadrv.cpp | 8 ++++---- 2 files changed, 7 insertions(+), 7 deletions(-) diff --git a/src/cpp/cuda.hpp b/src/cpp/cuda.hpp index e3f8ef24..21cb219c 100644 --- a/src/cpp/cuda.hpp +++ b/src/cpp/cuda.hpp @@ -1624,18 +1624,18 @@ namespace pycuda } inline - void memcpy_dtoa(array const &ary, unsigned int index, CUdeviceptr src, unsigned int len) + void memcpy_dtoa(array const &ary, unsigned int index, CUdeviceptr src, size_t len) { CUDAPP_CALL_GUARDED_THREADED(cuMemcpyDtoA, (ary.handle(), index, src, len)); } inline - void memcpy_atod(CUdeviceptr dst, array const &ary, unsigned int index, unsigned int len) + void memcpy_atod(CUdeviceptr dst, array const &ary, unsigned int index, size_t len) { CUDAPP_CALL_GUARDED_THREADED(cuMemcpyAtoD, (dst, ary.handle(), index, len)); } inline void memcpy_atoa( array const &dst, unsigned int dst_index, array const &src, unsigned int src_index, - unsigned int len) + size_t len) { CUDAPP_CALL_GUARDED_THREADED(cuMemcpyAtoA, (dst.handle(), dst_index, src.handle(), src_index, len)); } // }}} diff --git a/src/wrapper/wrap_cudadrv.cpp b/src/wrapper/wrap_cudadrv.cpp index 7c513a1a..c0fdd99d 100644 --- a/src/wrapper/wrap_cudadrv.cpp +++ b/src/wrapper/wrap_cudadrv.cpp @@ -268,7 +268,7 @@ namespace void py_memcpy_dtod(CUdeviceptr dest, CUdeviceptr src, - unsigned int byte_count) + size_t int byte_count) { CUDAPP_CALL_GUARDED_THREADED(cuMemcpyDtoD, (dest, src, byte_count)); } @@ -276,7 +276,7 @@ namespace #if CUDAPP_CUDA_VERSION >= 3000 void py_memcpy_dtod_async(CUdeviceptr dest, CUdeviceptr src, - unsigned int byte_count, py::object stream_py) + size_t byte_count, py::object stream_py) { PYCUDA_PARSE_STREAM_PY; @@ -287,7 +287,7 @@ namespace #if CUDAPP_CUDA_VERSION >= 4000 void py_memcpy_peer(CUdeviceptr dest, CUdeviceptr src, - unsigned int byte_count, + size_t byte_count, py::object dest_context_py, py::object src_context_py ) { @@ -307,7 +307,7 @@ namespace } void py_memcpy_peer_async(CUdeviceptr dest, CUdeviceptr src, - unsigned int byte_count, + size_t byte_count, py::object dest_context_py, py::object src_context_py, py::object stream_py) { -- GitLab From e1ec8dce352227bb0bc9cd48fe21f0e5ba38280c Mon Sep 17 00:00:00 2001 From: Andreas Kloeckner Date: Sat, 27 Mar 2021 12:40:18 -0500 Subject: [PATCH 24/66] Fix type typo in size_t transition --- src/wrapper/wrap_cudadrv.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git 
a/src/wrapper/wrap_cudadrv.cpp b/src/wrapper/wrap_cudadrv.cpp index c0fdd99d..0be5a0a2 100644 --- a/src/wrapper/wrap_cudadrv.cpp +++ b/src/wrapper/wrap_cudadrv.cpp @@ -268,7 +268,7 @@ namespace void py_memcpy_dtod(CUdeviceptr dest, CUdeviceptr src, - size_t int byte_count) + size_t byte_count) { CUDAPP_CALL_GUARDED_THREADED(cuMemcpyDtoD, (dest, src, byte_count)); } -- GitLab From 048ae7a2558040823136b3e0d470d5adfe8226fc Mon Sep 17 00:00:00 2001 From: Andreas Kloeckner Date: Sat, 27 Mar 2021 12:58:13 -0500 Subject: [PATCH 25/66] Add links to related software --- doc/source/index.rst | 13 +++++++++++++ 1 file changed, 13 insertions(+) diff --git a/doc/source/index.rst b/doc/source/index.rst index e5c0c45c..41ad641a 100644 --- a/doc/source/index.rst +++ b/doc/source/index.rst @@ -79,6 +79,19 @@ the scenes, a lot more interesting stuff is going on: Curious? Let's get started. +Other software that uses/enhances PyCUDA +======================================== + +This list is by definition incomplete! If you know of other software you +feel should be listed here, please submit a PR! + +* `pyvkfft `__ +* `scikit-cuda `__ +* `reikna `__ +* `reikna `__ +* `compyle `__ +* `pynufft `__ + Contents ========= -- GitLab From ab3f5fd6af16487f76b88661650f6e5d159389c8 Mon Sep 17 00:00:00 2001 From: Andreas Kloeckner Date: Wed, 31 Mar 2021 13:44:24 -0500 Subject: [PATCH 26/66] Update mempool to version from PyOpenCL (closes gh-282) --- doc/source/util.rst | 46 ++++++---- src/cpp/mempool.hpp | 186 ++++++++++++++++++++++++++++------------ src/wrapper/mempool.cpp | 24 +++--- 3 files changed, 173 insertions(+), 83 deletions(-) diff --git a/doc/source/util.rst b/doc/source/util.rst index f83907d6..c7998994 100644 --- a/doc/source/util.rst +++ b/doc/source/util.rst @@ -6,18 +6,18 @@ Automatic Initialization .. module:: pycuda.autoinit -The module :mod:`pycuda.autoinit`, when imported, automatically performs +The module :mod:`pycuda.autoinit`, when imported, automatically performs all the steps necessary to get CUDA ready for submission of compute kernels. It uses :func:`pycuda.tools.make_default_context` to create a compute context. .. data:: device An instance of :class:`pycuda.driver.Device` that was used for automatic - initialization. + initialization. .. data:: context - A default-constructed instance of :class:`pycuda.driver.Context` + A default-constructed instance of :class:`pycuda.driver.Context` on :data:`device`. This context is created by calling :func:`pycuda.tools.make_default_context`. @@ -74,7 +74,7 @@ Kernel Caching .. function:: context_dependent_memoize(func) - This decorator caches the result of the decorated function, *if* a + This decorator caches the result of the decorated function, *if* a subsequent occurs in the same :class:`pycuda.driver.Context`. This is useful for caching of kernels. @@ -98,7 +98,7 @@ Device Metadata and Occupancy ----------------------------- .. class:: DeviceData(dev=None) - + Gives access to more information on a device than is available through :meth:`pycuda.driver.Device.get_attribute`. If `dev` is `None`, it defaults to the device returned by :meth:`pycuda.driver.Context.get_device`. @@ -120,7 +120,7 @@ Device Metadata and Occupancy .. method:: align_bytes(word_size=4) - The distance between global memory base addresses that + The distance between global memory base addresses that allow accesses of word-size `word_size` bytes to get coalesced. .. method:: align(bytes, word_size=4) @@ -134,7 +134,7 @@ Device Metadata and Occupancy .. 
method:: align_dtype(elements, dtype_size) - Round up `elements` to the next alignment boundary + Round up `elements` to the next alignment boundary as given by :meth:`align_bytes`, where each element is assumed to be `dtype_size` bytes large. @@ -146,7 +146,7 @@ Device Metadata and Occupancy .. class:: OccupancyRecord(devdata, threads, shared_mem=0, registers=0) - Calculate occupancy for a given kernel workload characterized by + Calculate occupancy for a given kernel workload characterized by * thread count of `threads` * shared memory use of `shared_mem` bytes @@ -183,7 +183,7 @@ fresh memory area is allocated for each intermediate result. Memory pools are a remedy for this problem based on the observation that often many of the block allocations are of the same sizes as previously used ones. -Then, instead of fully returning the memory to the system and incurring the +Then, instead of fully returning the memory to the system and incurring the associated reallocation overhead, the pool holds on to the memory and uses it to satisfy future allocations of similarly-sized blocks. The pool reacts appropriately to out-of-memory conditions as long as all memory allocations @@ -198,8 +198,8 @@ Device-based Memory Pool An object representing a :class:`DeviceMemoryPool`-based allocation of linear device memory. Once this object is deleted, its associated device - memory is freed. - :class:`PooledDeviceAllocation` instances can be cast to :class:`int` + memory is freed. + :class:`PooledDeviceAllocation` instances can be cast to :class:`int` (and :class:`long`), yielding the starting address of the device memory allocated. @@ -213,7 +213,7 @@ Device-based Memory Pool .. class:: DeviceMemoryPool - A memory pool for linear device memory as allocated using + A memory pool for linear device memory as allocated using :func:`pycuda.driver.mem_alloc`. (see :ref:`mempool`) .. attribute:: held_blocks @@ -225,6 +225,20 @@ Device-based Memory Pool The number of blocks in active use that have been allocated through this pool. + .. attribute:: managed_bytes + + "Managed" memory is "active" and "held" memory. + + .. versionadded: 2021.1 + + .. attribute:: active_bytes + + "Active" bytes are bytes under the control of the application. + This may be smaller than the actual allocated size reflected + in :attr:`managed_bytes`. + + .. versionadded: 2021.1 + .. method:: allocate(size) Return a :class:`PooledDeviceAllocation` of *size* bytes. @@ -248,7 +262,7 @@ Memory Pool for pagelocked memory An object representing a :class:`PageLockedMemoryPool`-based allocation of linear device memory. Once this object is deleted, its associated device - memory is freed. + memory is freed. .. method:: free @@ -260,12 +274,12 @@ Memory Pool for pagelocked memory .. class:: PageLockedAllocator(flags=0) - Specifies the set of :class:`pycuda.driver.host_alloc_flags` used in its + Specifies the set of :class:`pycuda.driver.host_alloc_flags` used in its associated :class:`PageLockedMemoryPool`. .. class:: PageLockedMemoryPool(allocator=PageLockedAllocator()) - A memory pool for pagelocked host memory as allocated using + A memory pool for pagelocked host memory as allocated using :func:`pycuda.driver.pagelocked_empty`. (see :ref:`mempool`) .. attribute:: held_blocks @@ -279,7 +293,7 @@ Memory Pool for pagelocked memory .. 
method:: allocate(shape, dtype, order="C") - Return an uninitialized ("empty") :class:`numpy.ndarray` with the given + Return an uninitialized ("empty") :class:`numpy.ndarray` with the given *shape*, *dtype*, and *order*. This array will be backed by a :class:`PooledHostAllocation`, which can be found as the ``.base`` attribute of the array. diff --git a/src/cpp/mempool.hpp b/src/cpp/mempool.hpp index be88f13f..44f0fd64 100644 --- a/src/cpp/mempool.hpp +++ b/src/cpp/mempool.hpp @@ -1,24 +1,61 @@ // Abstract memory pool implementation - - +// +// Copyright (C) 2009-17 Andreas Kloeckner +// +// Permission is hereby granted, free of charge, to any person +// obtaining a copy of this software and associated documentation +// files (the "Software"), to deal in the Software without +// restriction, including without limitation the rights to use, +// copy, modify, merge, publish, distribute, sublicense, and/or sell +// copies of the Software, and to permit persons to whom the +// Software is furnished to do so, subject to the following +// conditions: +// +// The above copyright notice and this permission notice shall be +// included in all copies or substantial portions of the Software. +// +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +// EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES +// OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND +// NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT +// HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, +// WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +// FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR +// OTHER DEALINGS IN THE SOFTWARE. #ifndef _AFJDFJSDFSD_PYGPU_HEADER_SEEN_MEMPOOL_HPP #define _AFJDFJSDFSD_PYGPU_HEADER_SEEN_MEMPOOL_HPP - - -#include -#include -#include +#include +#include +#include +#include +#include +#include #include "bitlog.hpp" - - namespace PYGPU_PACKAGE { + // https://stackoverflow.com/a/44175911 + class mp_noncopyable { + public: + mp_noncopyable() = default; + ~mp_noncopyable() = default; + + private: + mp_noncopyable(const mp_noncopyable&) = delete; + mp_noncopyable& operator=(const mp_noncopyable&) = delete; + }; + +#ifdef PYGPU_PYCUDA +#define PYGPU_SHARED_PTR boost::shared_ptr +#else +#define PYGPU_SHARED_PTR std::shared_ptr +#endif + template inline T signed_left_shift(T x, signed shift_amount) { @@ -43,38 +80,57 @@ namespace PYGPU_PACKAGE +#define always_assert(cond) \ + do { \ + if (!(cond)) \ + throw std::logic_error("mem pool assertion violated: " #cond); \ + } while (false); + + template - class memory_pool + class memory_pool : mp_noncopyable { public: typedef typename Allocator::pointer_type pointer_type; typedef typename Allocator::size_type size_type; private: - typedef boost::uint32_t bin_nr_t; + typedef uint32_t bin_nr_t; typedef std::vector bin_t; - typedef boost::ptr_map container_t; + typedef std::map container_t; container_t m_container; typedef typename container_t::value_type bin_pair_t; - std::auto_ptr m_allocator; + std::unique_ptr m_allocator; // A held block is one that's been released by the application, but that // we are keeping around to dish out again. - unsigned m_held_blocks; + size_type m_held_blocks; // An active block is one that is in use by the application. - unsigned m_active_blocks; + size_type m_active_blocks; + + // "Managed" memory is "active" and "held" memory. + size_type m_managed_bytes; + + // "Active" bytes are bytes under the control of the application. 
+ // This may be smaller than the actual allocated size reflected + // in m_managed_bytes. + size_type m_active_bytes; bool m_stop_holding; int m_trace; + unsigned m_leading_bits_in_bin_id; + public: - memory_pool(Allocator const &alloc=Allocator()) + memory_pool(Allocator const &alloc=Allocator(), unsigned leading_bits_in_bin_id=4) : m_allocator(alloc.copy()), - m_held_blocks(0), m_active_blocks(0), m_stop_holding(false), - m_trace(false) + m_held_blocks(0), m_active_blocks(0), + m_managed_bytes(0), m_active_bytes(0), + m_stop_holding(false), + m_trace(false), m_leading_bits_in_bin_id(leading_bits_in_bin_id) { if (m_allocator->is_deferred()) { @@ -88,17 +144,21 @@ namespace PYGPU_PACKAGE virtual ~memory_pool() { free_held(); } - static const unsigned mantissa_bits = 2; - static const unsigned mantissa_mask = (1 << mantissa_bits) - 1; + private: + unsigned mantissa_mask() const + { + return (1 << m_leading_bits_in_bin_id) - 1; + } - static bin_nr_t bin_number(size_type size) + public: + bin_nr_t bin_number(size_type size) { signed l = bitlog2(size); - size_type shifted = signed_right_shift(size, l-signed(mantissa_bits)); - if (size && (shifted & (1 << mantissa_bits)) == 0) + size_type shifted = signed_right_shift(size, l-signed(m_leading_bits_in_bin_id)); + if (size && (shifted & (1 << m_leading_bits_in_bin_id)) == 0) throw std::runtime_error("memory_pool::bin_number: bitlog2 fault"); - size_type chopped = shifted & mantissa_mask; - return l << mantissa_bits | chopped; + size_type chopped = shifted & mantissa_mask(); + return l << m_leading_bits_in_bin_id | chopped; } void set_trace(bool flag) @@ -109,19 +169,19 @@ namespace PYGPU_PACKAGE --m_trace; } - static size_type alloc_size(bin_nr_t bin) + size_type alloc_size(bin_nr_t bin) { - bin_nr_t exponent = bin >> mantissa_bits; - bin_nr_t mantissa = bin & mantissa_mask; + bin_nr_t exponent = bin >> m_leading_bits_in_bin_id; + bin_nr_t mantissa = bin & mantissa_mask(); - size_type ones = signed_left_shift(1, - signed(exponent)-signed(mantissa_bits) + size_type ones = signed_left_shift((size_type) 1, + signed(exponent)-signed(m_leading_bits_in_bin_id) ); if (ones) ones -= 1; size_type head = signed_left_shift( - (1<second; } else - return *it->second; + return it->second; } void inc_held_blocks() @@ -176,14 +236,15 @@ namespace PYGPU_PACKAGE return pop_block_from_bin(bin, size); } - size_type alloc_sz = alloc_size(bin_nr); + size_type alloc_sz = alloc_size(bin_nr); - assert(bin_number(alloc_sz) == bin_nr); + always_assert(bin_number(alloc_sz) == bin_nr); + always_assert(alloc_sz >= size); if (m_trace) std::cout << "[pool] allocation of size " << size << " required new memory" << std::endl; - try { return get_from_allocator(alloc_sz); } + try { return get_from_allocator(alloc_sz, size); } catch (PYGPU_PACKAGE::error &e) { if (!e.is_out_of_memory()) @@ -202,7 +263,7 @@ namespace PYGPU_PACKAGE while (try_to_free_memory()) { - try { return get_from_allocator(alloc_sz); } + try { return get_from_allocator(alloc_sz, size); } catch (PYGPU_PACKAGE::error &e) { if (!e.is_out_of_memory()) @@ -224,6 +285,7 @@ namespace PYGPU_PACKAGE void free(pointer_type p, size_type size) { --m_active_blocks; + m_active_bytes -= size; bin_nr_t bin_nr = bin_number(size); if (!m_stop_holding) @@ -237,18 +299,22 @@ namespace PYGPU_PACKAGE << " entries" << std::endl; } else + { m_allocator->free(p); + m_managed_bytes -= alloc_size(bin_nr); + } } void free_held() { - BOOST_FOREACH(bin_pair_t bin_pair, m_container) + for (bin_pair_t &bin_pair: m_container) { - bin_t &bin = 
*bin_pair.second; + bin_t &bin = bin_pair.second; while (bin.size()) { m_allocator->free(bin.back()); + m_managed_bytes -= alloc_size(bin_pair.first); bin.pop_back(); dec_held_blocks(); @@ -264,23 +330,31 @@ namespace PYGPU_PACKAGE free_held(); } - unsigned active_blocks() + size_type active_blocks() const { return m_active_blocks; } - unsigned held_blocks() + size_type held_blocks() const { return m_held_blocks; } + size_type managed_bytes() const + { return m_managed_bytes; } + + size_type active_bytes() const + { return m_active_bytes; } + bool try_to_free_memory() { - BOOST_FOREACH(bin_pair_t bin_pair, - // free largest stuff first - std::make_pair(m_container.rbegin(), m_container.rend())) + // free largest stuff first + for (typename container_t::reverse_iterator it = m_container.rbegin(); + it != m_container.rend(); ++it) { - bin_t &bin = *bin_pair.second; + bin_pair_t &bin_pair = *it; + bin_t &bin = bin_pair.second; if (bin.size()) { m_allocator->free(bin.back()); + m_managed_bytes -= alloc_size(bin_pair.first); bin.pop_back(); dec_held_blocks(); @@ -293,10 +367,12 @@ namespace PYGPU_PACKAGE } private: - pointer_type get_from_allocator(size_type alloc_sz) + pointer_type get_from_allocator(size_type alloc_sz, size_type size) { pointer_type result = m_allocator->allocate(alloc_sz); ++m_active_blocks; + m_managed_bytes += alloc_sz; + m_active_bytes += size; return result; } @@ -308,17 +384,15 @@ namespace PYGPU_PACKAGE dec_held_blocks(); ++m_active_blocks; + m_active_bytes += size; return result; } }; - - - template - class pooled_allocation : public boost::noncopyable + class pooled_allocation : public mp_noncopyable { public: typedef Pool pool_type; @@ -326,14 +400,14 @@ namespace PYGPU_PACKAGE typedef typename Pool::size_type size_type; private: - boost::shared_ptr m_pool; + PYGPU_SHARED_PTR m_pool; pointer_type m_ptr; size_type m_size; bool m_valid; public: - pooled_allocation(boost::shared_ptr p, size_type size) + pooled_allocation(PYGPU_SHARED_PTR p, size_type size) : m_pool(p), m_ptr(p->allocate(size)), m_size(size), m_valid(true) { } @@ -352,7 +426,7 @@ namespace PYGPU_PACKAGE } else throw PYGPU_PACKAGE::error( - "pooled_device_allocation::free", + "pooled_device_allocation::free", #ifdef PYGPU_PYCUDA CUDA_ERROR_INVALID_HANDLE #endif diff --git a/src/wrapper/mempool.cpp b/src/wrapper/mempool.cpp index 918d3d0d..77a4a737 100644 --- a/src/wrapper/mempool.cpp +++ b/src/wrapper/mempool.cpp @@ -102,7 +102,7 @@ namespace template - class context_dependent_memory_pool : + class context_dependent_memory_pool : public pycuda::memory_pool, public pycuda::explicit_context_dependent { @@ -117,12 +117,12 @@ namespace - class pooled_device_allocation - : public pycuda::context_dependent, + class pooled_device_allocation + : public pycuda::context_dependent, public pycuda::pooled_allocation > - { + { private: - typedef + typedef pycuda::pooled_allocation > super; @@ -159,12 +159,12 @@ namespace } - - class pooled_host_allocation + + class pooled_host_allocation : public pycuda::pooled_allocation > { private: - typedef + typedef pycuda::pooled_allocation > super; @@ -193,7 +193,7 @@ namespace back_inserter(dims)); std::auto_ptr alloc( - new pooled_host_allocation( + new pooled_host_allocation( pool, tp_descr->elsize*pycuda::size_from_dims(dims.size(), &dims.front()))); NPY_ORDER order = PyArray_CORDER; @@ -228,6 +228,8 @@ namespace wrapper .add_property("held_blocks", &cl::held_blocks) .add_property("active_blocks", &cl::active_blocks) + .add_property("managed_bytes", &cl::managed_bytes) + 
.add_property("active_bytes", &cl::active_bytes) .DEF_SIMPLE_METHOD(bin_number) .DEF_SIMPLE_METHOD(alloc_size) .DEF_SIMPLE_METHOD(free_held) @@ -249,7 +251,7 @@ void pycuda_expose_tools() typedef context_dependent_memory_pool cl; py::class_< - cl, boost::noncopyable, + cl, boost::noncopyable, boost::shared_ptr > wrapper("DeviceMemoryPool"); wrapper .def("allocate", device_pool_allocate, @@ -269,7 +271,7 @@ void pycuda_expose_tools() typedef pycuda::memory_pool cl; py::class_< - cl, boost::noncopyable, + cl, boost::noncopyable, boost::shared_ptr > wrapper( "PageLockedMemoryPool", py::init >() -- GitLab From d339e830826608b72c2d7da7e2511cdcf6e1a47f Mon Sep 17 00:00:00 2001 From: Andreas Kloeckner Date: Wed, 31 Mar 2021 13:59:37 -0500 Subject: [PATCH 27/66] mempool: bin_number and alloc_size are no longer static --- src/wrapper/mempool.cpp | 2 -- test/test_driver.py | 7 ++++--- 2 files changed, 4 insertions(+), 5 deletions(-) diff --git a/src/wrapper/mempool.cpp b/src/wrapper/mempool.cpp index 77a4a737..66f43f7e 100644 --- a/src/wrapper/mempool.cpp +++ b/src/wrapper/mempool.cpp @@ -234,8 +234,6 @@ namespace .DEF_SIMPLE_METHOD(alloc_size) .DEF_SIMPLE_METHOD(free_held) .DEF_SIMPLE_METHOD(stop_holding) - .staticmethod("bin_number") - .staticmethod("alloc_size") ; } } diff --git a/test/test_driver.py b/test/test_driver.py index b022aa37..98f3c8aa 100644 --- a/test/test_driver.py +++ b/test/test_driver.py @@ -669,14 +669,15 @@ class TestDriver: def test_mempool_2(self): from pycuda.tools import DeviceMemoryPool from random import randrange + pool = DeviceMemoryPool() for i in range(2000): s = randrange(1 << 31) >> randrange(32) - bin_nr = DeviceMemoryPool.bin_number(s) - asize = DeviceMemoryPool.alloc_size(bin_nr) + bin_nr = pool.bin_number(s) + asize = pool.alloc_size(bin_nr) assert asize >= s, s - assert DeviceMemoryPool.bin_number(asize) == bin_nr, s + assert pool.bin_number(asize) == bin_nr, s assert asize < asize * (1 + 1 / 8) @mark_cuda_test -- GitLab From 3c8d3932638a45ab013cae58bac133f213466740 Mon Sep 17 00:00:00 2001 From: Andreas Kloeckner Date: Fri, 2 Apr 2021 16:49:37 -0500 Subject: [PATCH 28/66] Drop dependency on decorator module --- pycuda/tools.py | 44 ++++++++++++++++++++++++++------------------ setup.py | 1 - 2 files changed, 26 insertions(+), 19 deletions(-) diff --git a/pycuda/tools.py b/pycuda/tools.py index 1a2b50f4..05ac3c52 100644 --- a/pycuda/tools.py +++ b/pycuda/tools.py @@ -26,7 +26,6 @@ OTHER DEALINGS IN THE SOFTWARE. """ import pycuda.driver as cuda -from decorator import decorator import pycuda._driver as _drv import numpy as np @@ -451,25 +450,34 @@ def get_arg_type(c_arg): context_dependent_memoized_functions = [] -@decorator -def context_dependent_memoize(func, *args): - try: - ctx_dict = func._pycuda_ctx_dep_memoize_dic - except AttributeError: - # FIXME: This may keep contexts alive longer than desired. - # But I guess since the memory in them is freed, who cares. - ctx_dict = func._pycuda_ctx_dep_memoize_dic = {} +def context_dependent_memoize(func): + def wrapper(*args, **kwargs): + if kwargs: + cache_key = (args, frozenset(kwargs.items())) + else: + cache_key = (args,) - cur_ctx = cuda.Context.get_current() + try: + ctx_dict = func._pycuda_ctx_dep_memoize_dic + except AttributeError: + # FIXME: This may keep contexts alive longer than desired. + # But I guess since the memory in them is freed, who cares. 
+ ctx_dict = func._pycuda_ctx_dep_memoize_dic = {} - try: - return ctx_dict[cur_ctx][args] - except KeyError: - context_dependent_memoized_functions.append(func) - arg_dict = ctx_dict.setdefault(cur_ctx, {}) - result = func(*args) - arg_dict[args] = result - return result + cur_ctx = cuda.Context.get_current() + + try: + return ctx_dict[cur_ctx][cache_key] + except KeyError: + context_dependent_memoized_functions.append(func) + arg_dict = ctx_dict.setdefault(cur_ctx, {}) + result = func(*args, **kwargs) + arg_dict[cache_key] = result + return result + + from functools import update_wrapper + update_wrapper(wrapper, func) + return wrapper def clear_context_caches(): diff --git a/setup.py b/setup.py index 990a7830..44545e16 100644 --- a/setup.py +++ b/setup.py @@ -225,7 +225,6 @@ def main(): python_requires="~=3.6", install_requires=[ "pytools>=2011.2", - "decorator>=3.2.0", "appdirs>=1.4.0", "mako", ], -- GitLab From 787218f919740b0c432ed5bf598f9e30c2e834f7 Mon Sep 17 00:00:00 2001 From: Andreas Kloeckner Date: Sat, 3 Apr 2021 17:28:03 -0500 Subject: [PATCH 29/66] Tweak MANIFEST.in to ship wiki examples --- MANIFEST.in | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/MANIFEST.in b/MANIFEST.in index 5e91a36d..7d3675cd 100644 --- a/MANIFEST.in +++ b/MANIFEST.in @@ -5,8 +5,7 @@ include src/cpp/*.cpp include src/wrapper/*.hpp include src/wrapper/*.cpp include test/*.py -include examples/*.py -include examples/MORE* +recursive-include examples *.py include doc/source/*.rst include doc/Makefile -- GitLab From 2e9e7b8ce98131c1f1596d0c0aa3feb020056831 Mon Sep 17 00:00:00 2001 From: Andreas Kloeckner Date: Sat, 3 Apr 2021 17:28:11 -0500 Subject: [PATCH 30/66] Bump version to 2021.1 --- pycuda/__init__.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pycuda/__init__.py b/pycuda/__init__.py index ea2c3fd8..e4c47bc9 100644 --- a/pycuda/__init__.py +++ b/pycuda/__init__.py @@ -1,3 +1,3 @@ -VERSION = (2020, 1) +VERSION = (2021, 1) VERSION_STATUS = "" VERSION_TEXT = ".".join(str(x) for x in VERSION) + VERSION_STATUS -- GitLab From fd11bd8c5216ad48932afba1b032abe6ea0d1f27 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Andreas=20Kl=C3=B6ckner?= Date: Thu, 8 Apr 2021 12:15:16 -0500 Subject: [PATCH 31/66] Run flake8 in Github CI on target Py version --- .github/workflows/ci.yml | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 9cfa45a6..b90957af 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -16,7 +16,8 @@ jobs: - uses: actions/setup-python@v1 with: - python-version: '3.x' + # matches compat target in setup.py + python-version: '3.6' - name: "Main Script" run: | curl -L -O -k https://gitlab.tiker.net/inducer/ci-support/raw/main/prepare-and-run-flake8.sh -- GitLab From fa90450b71cfb41ffe6262331ba6badae80acdba Mon Sep 17 00:00:00 2001 From: Andreas Kloeckner Date: Thu, 8 Apr 2021 18:29:15 -0500 Subject: [PATCH 32/66] Add Gitlab autopush --- .github/workflows/autopush.yml | 21 +++++++++++++++++++++ 1 file changed, 21 insertions(+) create mode 100644 .github/workflows/autopush.yml diff --git a/.github/workflows/autopush.yml b/.github/workflows/autopush.yml new file mode 100644 index 00000000..f89b08ac --- /dev/null +++ b/.github/workflows/autopush.yml @@ -0,0 +1,21 @@ +name: Gitlab mirror +on: + push: + branches: + - main + +jobs: + autopush: + name: Automatic push to gitlab.tiker.net + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@v2 + - run: | + mkdir ~/.ssh && echo 
-e "Host gitlab.tiker.net\n\tStrictHostKeyChecking no\n" >> ~/.ssh/config + eval $(ssh-agent) && echo "$GITLAB_AUTOPUSH_KEY" | ssh-add - + git fetch --unshallow + git push "git@gitlab.tiker.net:inducer/$(basename $GITHUB_REPOSITORY).git" main + env: + GITLAB_AUTOPUSH_KEY: ${{ secrets.GITLAB_AUTOPUSH_KEY }} + +# vim: sw=4 -- GitLab From ee094655600357a46ee34a030086a2710cb68ed2 Mon Sep 17 00:00:00 2001 From: Andreas Kloeckner Date: Sat, 10 Apr 2021 15:43:03 -0500 Subject: [PATCH 33/66] Update compyte --- pycuda/compyte | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pycuda/compyte b/pycuda/compyte index 7533db88..71bffa1a 160000 --- a/pycuda/compyte +++ b/pycuda/compyte @@ -1 +1 @@ -Subproject commit 7533db88124045924a47d7392eaf9a078670fc4d +Subproject commit 71bffa1ae64ed98b9d922c79a6f9cc7eb4fd642f -- GitLab From f4c52bc0de08d117c7ab51a92a441763552575c2 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Andreas=20Kl=C3=B6ckner?= Date: Fri, 16 Apr 2021 14:25:57 -0500 Subject: [PATCH 34/66] Fix a py2-ism in the tutorial --- doc/source/tutorial.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/doc/source/tutorial.rst b/doc/source/tutorial.rst index 2c401b88..4cc909cf 100644 --- a/doc/source/tutorial.rst +++ b/doc/source/tutorial.rst @@ -20,7 +20,7 @@ Transferring Data The next step in most programs is to transfer data onto the device. In PyCuda, you will mostly transfer data from :mod:`numpy` arrays on the host. (But indeed, everything that satisfies the Python buffer -interface will work, even a :class:`str`.) Let's make a 4x4 array +interface will work, even :class:`bytes`.) Let's make a 4x4 array of random numbers:: import numpy -- GitLab From 33a8d259ea6e9560798ab902f37db8e9740c0a11 Mon Sep 17 00:00:00 2001 From: ByLamacq <52816027+ByLamacq@users.noreply.github.com> Date: Tue, 11 May 2021 22:14:05 +0200 Subject: [PATCH 35/66] Update CUdevice_attribute from cuda 6 to 11.3 --- src/wrapper/wrap_cudadrv.cpp | 30 ++++++++++++++++++++++++++++++ 1 file changed, 30 insertions(+) diff --git a/src/wrapper/wrap_cudadrv.cpp b/src/wrapper/wrap_cudadrv.cpp index 0be5a0a2..6d95edad 100644 --- a/src/wrapper/wrap_cudadrv.cpp +++ b/src/wrapper/wrap_cudadrv.cpp @@ -897,6 +897,36 @@ BOOST_PYTHON_MODULE(_driver) .value("MANAGED_MEMORY", CU_DEVICE_ATTRIBUTE_MANAGED_MEMORY) .value("MULTI_GPU_BOARD", CU_DEVICE_ATTRIBUTE_MULTI_GPU_BOARD) .value("MULTI_GPU_BOARD_GROUP_ID", CU_DEVICE_ATTRIBUTE_MULTI_GPU_BOARD_GROUP_ID) +#endif +#if CUDAPP_CUDA_VERSION >= 8000 + .value("HOST_NATIVE_ATOMIC_SUPPORTED", CU_DEVICE_ATTRIBUTE_HOST_NATIVE_ATOMIC_SUPPORTED) + .value("SINGLE_TO_DOUBLE_PRECISION_PERF_RATIO", CU_DEVICE_ATTRIBUTE_SINGLE_TO_DOUBLE_PRECISION_PERF_RATIO) + .value("PAGEABLE_MEMORY_ACCESS", CU_DEVICE_ATTRIBUTE_PAGEABLE_MEMORY_ACCESS) + .value("CONCURRENT_MANAGED_ACCESS", CU_DEVICE_ATTRIBUTE_CONCURRENT_MANAGED_ACCESS) + .value("COMPUTE_PREEMPTION_SUPPORTED", CU_DEVICE_ATTRIBUTE_COMPUTE_PREEMPTION_SUPPORTED) + .value("CAN_USE_HOST_POINTER_FOR_REGISTERED_MEM", CU_DEVICE_ATTRIBUTE_CAN_USE_HOST_POINTER_FOR_REGISTERED_MEM) +#endif +#if CUDAPP_CUDA_VERSION >= 9000 + .value("MAX_SHARED_MEMORY_PER_BLOCK_OPTIN", CU_DEVICE_ATTRIBUTE_MAX_SHARED_MEMORY_PER_BLOCK_OPTIN) +#endif +#if CUDAPP_CUDA_VERSION >= 9020 + .value("PAGEABLE_MEMORY_ACCESS_USES_HOST_PAGE_TABLES", CU_DEVICE_ATTRIBUTE_PAGEABLE_MEMORY_ACCESS_USES_HOST_PAGE_TABLES) + .value("DIRECT_MANAGED_MEM_ACCESS_FROM_HOST", CU_DEVICE_ATTRIBUTE_DIRECT_MANAGED_MEM_ACCESS_FROM_HOST) +#endif +#if CUDAPP_CUDA_VERSION >= 10020 + 
.value("HANDLE_TYPE_POSIX_FILE_DESCRIPTOR_SUPPORTED", CU_DEVICE_ATTRIBUTE_HANDLE_TYPE_POSIX_FILE_DESCRIPTOR_SUPPORTED) + .value("HANDLE_TYPE_WIN32_HANDLE_SUPPORTED", CU_DEVICE_ATTRIBUTE_HANDLE_TYPE_WIN32_HANDLE_SUPPORTED) + .value("HANDLE_TYPE_WIN32_KMT_HANDLE_SUPPORTED", CU_DEVICE_ATTRIBUTE_HANDLE_TYPE_WIN32_KMT_HANDLE_SUPPORTED) +#endif +#if CUDAPP_CUDA_VERSION >= 11000 + .value("MAX_PERSISTING_L2_CACHE_SIZE", CU_DEVICE_ATTRIBUTE_MAX_PERSISTING_L2_CACHE_SIZE) + .value("MAX_BLOCKS_PER_MULTIPROCESSOR", CU_DEVICE_ATTRIBUTE_MAX_BLOCKS_PER_MULTIPROCESSOR) + .value("GENERIC_COMPRESSION_SUPPORTED", CU_DEVICE_ATTRIBUTE_GENERIC_COMPRESSION_SUPPORTED) + .value("RESERVED_SHARED_MEMORY_PER_BLOCK", CU_DEVICE_ATTRIBUTE_RESERVED_SHARED_MEMORY_PER_BLOCK) +#endif +#if CUDAPP_CUDA_VERSION >= 11020 + .value("READ_ONLY_HOST_REGISTER_SUPPORTED", CU_DEVICE_ATTRIBUTE_READ_ONLY_HOST_REGISTER_SUPPORTED) + .value("MEMORY_POOLS_SUPPORTED", CU_DEVICE_ATTRIBUTE_MEMORY_POOLS_SUPPORTED) #endif ; #if CUDAPP_CUDA_VERSION >= 4000 -- GitLab From f783c4f20dbce6228292be9a56841974445863f5 Mon Sep 17 00:00:00 2001 From: ByLamacq <52816027+ByLamacq@users.noreply.github.com> Date: Wed, 12 May 2021 20:12:58 +0200 Subject: [PATCH 36/66] Update device_attribute from cuda 6 to 11.3 --- doc/source/driver.rst | 36 ++++++++++++++++++++++++++++++++++++ 1 file changed, 36 insertions(+) diff --git a/doc/source/driver.rst b/doc/source/driver.rst index 8de1f2d1..86fa2e1b 100644 --- a/doc/source/driver.rst +++ b/doc/source/driver.rst @@ -331,6 +331,42 @@ Constants CUDA 6.0 and above. .. versionadded:: 2014.1 + + .. attribute :: HOST_NATIVE_ATOMIC_SUPPORTED + SINGLE_TO_DOUBLE_PRECISION_PERF_RATIO + PAGEABLE_MEMORY_ACCESS + CONCURRENT_MANAGED_ACCESS + COMPUTE_PREEMPTION_SUPPORTED + CAN_USE_HOST_POINTER_FOR_REGISTERED_MEM + + CUDA 8.0 and above. + + .. attribute :: MAX_SHARED_MEMORY_PER_BLOCK_OPTIN + + CUDA 9.0 and above. + + .. attribute :: PAGEABLE_MEMORY_ACCESS_USES_HOST_PAGE_TABLES + DIRECT_MANAGED_MEM_ACCESS_FROM_HOST + + CUDA 9.2 and above. + + .. attribute :: HANDLE_TYPE_POSIX_FILE_DESCRIPTOR_SUPPORTED + HANDLE_TYPE_WIN32_HANDLE_SUPPORTED + HANDLE_TYPE_WIN32_KMT_HANDLE_SUPPORTED + + CUDA 10.2 and above. + + .. attribute :: MAX_PERSISTING_L2_CACHE_SIZE + MAX_BLOCKS_PER_MULTIPROCESSOR + GENERIC_COMPRESSION_SUPPORTED + RESERVED_SHARED_MEMORY_PER_BLOCK + + CUDA 11.0 and above. + + .. attribute :: READ_ONLY_HOST_REGISTER_SUPPORTED + MEMORY_POOLS_SUPPORTED + + CUDA 11.2 and above. .. class:: pointer_attribute -- GitLab From 02b4eac828de76200904bca4401d747692bd5f89 Mon Sep 17 00:00:00 2001 From: Maxim Belkin Date: Mon, 31 May 2021 13:05:06 -0500 Subject: [PATCH 37/66] doc/source/index.rst: remove duplicate reikna entry --- doc/source/index.rst | 1 - 1 file changed, 1 deletion(-) diff --git a/doc/source/index.rst b/doc/source/index.rst index 41ad641a..bb0d9820 100644 --- a/doc/source/index.rst +++ b/doc/source/index.rst @@ -88,7 +88,6 @@ feel should be listed here, please submit a PR! * `pyvkfft `__ * `scikit-cuda `__ * `reikna `__ -* `reikna `__ * `compyle `__ * `pynufft `__ -- GitLab From d2dd127eb1599230b6456d991cdbf0a3d5b6c9f9 Mon Sep 17 00:00:00 2001 From: Tim Gates Date: Wed, 9 Jun 2021 22:15:22 +1000 Subject: [PATCH 38/66] docs: fix a few simple typos There are small typos in: - aksetup_helper.py - examples/from-wiki/mandelbrot_interactive.py - pycuda/cuda/pycuda-helpers.hpp - test/test_gpuarray.py Fixes: - Should read `subtraction` rather than `substraction`. - Should read `support` rather than `supprt`. 
- Should read `right` rather than `rigth`. - Should read `preferred` rather than `prefered`. Closes #294 --- aksetup_helper.py | 2 +- examples/from-wiki/mandelbrot_interactive.py | 2 +- pycuda/cuda/pycuda-helpers.hpp | 2 +- test/test_gpuarray.py | 4 ++-- 4 files changed, 5 insertions(+), 5 deletions(-) diff --git a/aksetup_helper.py b/aksetup_helper.py index 91e9ac0c..6543215f 100644 --- a/aksetup_helper.py +++ b/aksetup_helper.py @@ -926,7 +926,7 @@ def has_flag(compiler, flagname): def cpp_flag(compiler): """Return the -std=c++[11/14] compiler flag. - The c++14 is prefered over c++11 (when it is available). + The c++14 is preferred over c++11 (when it is available). """ if has_flag(compiler, '-std=gnu++14'): return '-std=gnu++14' diff --git a/examples/from-wiki/mandelbrot_interactive.py b/examples/from-wiki/mandelbrot_interactive.py index 0780be20..66c3a011 100644 --- a/examples/from-wiki/mandelbrot_interactive.py +++ b/examples/from-wiki/mandelbrot_interactive.py @@ -11,7 +11,7 @@ # Point and click with the right buttom to magnify by a factor of 10 -# Click with the left button on the rigth side of the +# Click with the left button on the right side of the # image to randomly change the colormap # Click with right button on the right side of the image to set the default colormap diff --git a/pycuda/cuda/pycuda-helpers.hpp b/pycuda/cuda/pycuda-helpers.hpp index a543fc58..5f25092a 100644 --- a/pycuda/cuda/pycuda-helpers.hpp +++ b/pycuda/cuda/pycuda-helpers.hpp @@ -102,7 +102,7 @@ extern "C++" { return pycuda::complex(__hiloint2double(v.y, v.x), __hiloint2double(v.w, v.z)); } - // FP_Surfaces with complex supprt + // FP_Surfaces with complex support __device__ void fp_surf2DLayeredwrite(double var,surface surf, int i, int j, int layer, enum cudaSurfaceBoundaryMode mode) { diff --git a/test/test_gpuarray.py b/test/test_gpuarray.py index d5d09251..4b53aca7 100644 --- a/test/test_gpuarray.py +++ b/test/test_gpuarray.py @@ -154,7 +154,7 @@ class TestGPUArray: @mark_cuda_test def test_substract_array(self): - """Test the substraction of two arrays.""" + """Test the subtraction of two arrays.""" # test data a = np.array([1, 2, 3, 4, 5, 6, 7, 8, 9, 10]).astype(np.float32) b = np.array([10, 20, 30, 40, 50, 60, 70, 80, 90, 100]).astype(np.float32) @@ -170,7 +170,7 @@ class TestGPUArray: @mark_cuda_test def test_substract_scalar(self): - """Test the substraction of an array and a scalar.""" + """Test the subtraction of an array and a scalar.""" # test data a = np.array([1, 2, 3, 4, 5, 6, 7, 8, 9, 10]).astype(np.float32) -- GitLab From 3c4a4a6243e91bf3fcbd65a6e25c988ad14331c6 Mon Sep 17 00:00:00 2001 From: Andreas Kloeckner Date: Fri, 25 Jun 2021 13:48:21 -0500 Subject: [PATCH 39/66] Simplify, standardize sphinx setup --- .gitignore | 1 + doc/Makefile | 78 +++++++---------------------------- doc/{source => }/array.rst | 0 doc/conf.py | 25 +++++++++++ doc/{source => }/driver.rst | 0 doc/{source => }/gl.rst | 0 doc/{source => }/index.rst | 0 doc/{source => }/install.rst | 0 doc/{source => }/metaprog.rst | 0 doc/{source => }/misc.rst | 0 doc/source/conf.py | 52 ----------------------- doc/{source => }/tutorial.rst | 0 doc/upload-docs.sh | 2 +- doc/{source => }/util.rst | 0 14 files changed, 41 insertions(+), 117 deletions(-) rename doc/{source => }/array.rst (100%) create mode 100644 doc/conf.py rename doc/{source => }/driver.rst (100%) rename doc/{source => }/gl.rst (100%) rename doc/{source => }/index.rst (100%) rename doc/{source => }/install.rst (100%) rename doc/{source => }/metaprog.rst (100%) 
rename doc/{source => }/misc.rst (100%) delete mode 100644 doc/source/conf.py rename doc/{source => }/tutorial.rst (100%) rename doc/{source => }/util.rst (100%) diff --git a/.gitignore b/.gitignore index 4f1618a3..b73b75c2 100644 --- a/.gitignore +++ b/.gitignore @@ -8,6 +8,7 @@ *.pyc *.pyo build +_build *.prof siteconf.py doc/hedge-notes.pdf diff --git a/doc/Makefile b/doc/Makefile index e59ebe6c..747126b3 100644 --- a/doc/Makefile +++ b/doc/Makefile @@ -1,70 +1,20 @@ -# Makefile for Sphinx documentation +# Minimal makefile for Sphinx documentation # -# You can set these variables from the command line. -SPHINXOPTS = -SPHINXBUILD = python3 `which sphinx-build` -PAPER = - -# Internal variables. -PAPEROPT_a4 = -D latex_paper_size=a4 -PAPEROPT_letter = -D latex_paper_size=letter -ALLSPHINXOPTS = -d build/doctrees $(PAPEROPT_$(PAPER)) $(SPHINXOPTS) source - -.PHONY: help clean html web pickle htmlhelp latex changes linkcheck +# You can set these variables from the command line, and also +# from the environment for the first two. +SPHINXOPTS ?= -n +SPHINXBUILD ?= sphinx-build +SOURCEDIR = . +BUILDDIR = _build +# Put it first so that "make" without argument is like "make help". help: - @echo "Please use \`make ' where is one of" - @echo " html to make standalone HTML files" - @echo " pickle to make pickle files (usable by e.g. sphinx-web)" - @echo " htmlhelp to make HTML files and a HTML help project" - @echo " latex to make LaTeX files, you can set PAPER=a4 or PAPER=letter" - @echo " changes to make an overview over all changed/added/deprecated items" - @echo " linkcheck to check all external links for integrity" - -clean: - -rm -rf build/* - -html: - mkdir -p build/html build/doctrees source/.static - $(SPHINXBUILD) -b html $(ALLSPHINXOPTS) build/html - @echo - @echo "Build finished. The HTML pages are in build/html." - -pickle: - mkdir -p build/pickle build/doctrees - $(SPHINXBUILD) -b pickle $(ALLSPHINXOPTS) build/pickle - @echo - @echo "Build finished; now you can process the pickle files or run" - @echo " sphinx-web build/pickle" - @echo "to start the sphinx-web server." - -web: pickle - -htmlhelp: - mkdir -p build/htmlhelp build/doctrees - $(SPHINXBUILD) -b htmlhelp $(ALLSPHINXOPTS) build/htmlhelp - @echo - @echo "Build finished; now you can run HTML Help Workshop with the" \ - ".hhp project file in build/htmlhelp." - -latex: - mkdir -p build/latex build/doctrees - $(SPHINXBUILD) -b latex $(ALLSPHINXOPTS) build/latex - @echo - @echo "Build finished; the LaTeX files are in build/latex." - @echo "Run \`make all-pdf' or \`make all-ps' in that directory to" \ - "run these through (pdf)latex." + @$(SPHINXBUILD) -M help "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) -changes: - mkdir -p build/changes build/doctrees - $(SPHINXBUILD) -b changes $(ALLSPHINXOPTS) build/changes - @echo - @echo "The overview file is in build/changes." +.PHONY: help Makefile -linkcheck: - mkdir -p build/linkcheck build/doctrees - $(SPHINXBUILD) -b linkcheck $(ALLSPHINXOPTS) build/linkcheck - @echo - @echo "Link check complete; look for any errors in the above output " \ - "or in build/linkcheck/output.txt." +# Catch-all target: route all unknown targets to Sphinx using the new +# "make mode" option. $(O) is meant as a shortcut for $(SPHINXOPTS). 
+%: Makefile + @$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) diff --git a/doc/source/array.rst b/doc/array.rst similarity index 100% rename from doc/source/array.rst rename to doc/array.rst diff --git a/doc/conf.py b/doc/conf.py new file mode 100644 index 00000000..f2482ba2 --- /dev/null +++ b/doc/conf.py @@ -0,0 +1,25 @@ +from urllib.request import urlopen + +_conf_url = \ + "https://raw.githubusercontent.com/inducer/sphinxconfig/main/sphinxconfig.py" +with urlopen(_conf_url) as _inf: + exec(compile(_inf.read(), _conf_url, "exec"), globals()) + +copyright = "2008-21, Andreas Kloeckner" + +ver_dic = {} +exec( + compile( + open("../pycuda/__init__.py").read(), "../pycuda/__init__.py", "exec" + ), + ver_dic, +) +version = ".".join(str(x) for x in ver_dic["VERSION"]) +# The full version, including alpha/beta/rc tags. +release = ver_dic["VERSION_TEXT"] + +intersphinx_mapping = { + "https://docs.python.org/3": None, + "https://numpy.org/doc/stable/": None, + "https://documen.tician.de/codepy/": None, +} diff --git a/doc/source/driver.rst b/doc/driver.rst similarity index 100% rename from doc/source/driver.rst rename to doc/driver.rst diff --git a/doc/source/gl.rst b/doc/gl.rst similarity index 100% rename from doc/source/gl.rst rename to doc/gl.rst diff --git a/doc/source/index.rst b/doc/index.rst similarity index 100% rename from doc/source/index.rst rename to doc/index.rst diff --git a/doc/source/install.rst b/doc/install.rst similarity index 100% rename from doc/source/install.rst rename to doc/install.rst diff --git a/doc/source/metaprog.rst b/doc/metaprog.rst similarity index 100% rename from doc/source/metaprog.rst rename to doc/metaprog.rst diff --git a/doc/source/misc.rst b/doc/misc.rst similarity index 100% rename from doc/source/misc.rst rename to doc/misc.rst diff --git a/doc/source/conf.py b/doc/source/conf.py deleted file mode 100644 index 2fb39ed0..00000000 --- a/doc/source/conf.py +++ /dev/null @@ -1,52 +0,0 @@ -extensions = [ - "sphinx.ext.intersphinx", - "sphinx.ext.mathjax", - "sphinx_copybutton", -] - -# Add any paths that contain templates here, relative to this directory. -templates_path = ["_templates"] - -# The suffix of source filenames. -source_suffix = ".rst" - -# The master toctree document. -master_doc = "index" - -# General substitutions. -project = "PyCUDA" -copyright = "2008-20, Andreas Kloeckner" - -# The default replacements for |version| and |release|, also used in various -# other places throughout the built documents. -# -# The short X.Y version. -ver_dic = {} -exec( - compile( - open("../../pycuda/__init__.py").read(), "../../pycuda/__init__.py", "exec" - ), - ver_dic, -) -version = ".".join(str(x) for x in ver_dic["VERSION"]) -# The full version, including alpha/beta/rc tags. -release = ver_dic["VERSION_TEXT"] - -# The name of the Pygments (syntax highlighting) style to use. -pygments_style = "sphinx" - - -# Options for HTML output -# ----------------------- - -html_theme = "furo" - - -intersphinx_mapping = { - "https://docs.python.org/3": None, - "https://numpy.org/doc/stable/": None, - "https://documen.tician.de/codepy/": None, -} - -autoclass_content = "class" -autodoc_typehints = "description" diff --git a/doc/source/tutorial.rst b/doc/tutorial.rst similarity index 100% rename from doc/source/tutorial.rst rename to doc/tutorial.rst diff --git a/doc/upload-docs.sh b/doc/upload-docs.sh index d5bba855..5914600b 100755 --- a/doc/upload-docs.sh +++ b/doc/upload-docs.sh @@ -1,3 +1,3 @@ #! 
/bin/sh -rsync --verbose --archive --delete build/html/* doc-upload:doc/pycuda +rsync --verbose --archive --delete _build/html/* doc-upload:doc/pycuda diff --git a/doc/source/util.rst b/doc/util.rst similarity index 100% rename from doc/source/util.rst rename to doc/util.rst -- GitLab From 835406a7033c1705b79044c56c010cabbabc8acb Mon Sep 17 00:00:00 2001 From: Mit Kotak Date: Mon, 26 Jul 2021 04:16:58 +0000 Subject: [PATCH 40/66] Implemented gpuarray.(stack|concatenate) + docs --- doc/array.rst | 9 +++++ pycuda/gpuarray.py | 89 ++++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 98 insertions(+) diff --git a/doc/array.rst b/doc/array.rst index de1cc66e..23f22ce0 100644 --- a/doc/array.rst +++ b/doc/array.rst @@ -339,6 +339,15 @@ Constructing :class:`GPUArray` Instances Return the :class:`GPUArray` ``[a[indices[0]], ..., a[indices[n]]]``. For the moment, *a* must be a type that can be bound to a texture. +.. function:: concatenate(arrays, axis=0, allocator=None) + + Join a sequence of arrays along an existing axis. + + .. function:: stack(arrays, axis=0, allocator=None) + + Join a sequence of arrays along a new axis. + + Conditionals ^^^^^^^^^^^^ diff --git a/pycuda/gpuarray.py b/pycuda/gpuarray.py index 373cf005..e5d853be 100644 --- a/pycuda/gpuarray.py +++ b/pycuda/gpuarray.py @@ -1767,6 +1767,95 @@ def multi_put(arrays, dest_indices, dest_shape=None, out=None, stream=None): # {{{ shape manipulation +def concatenate(arrays, axis=0, allocator=None): + """ + Join a sequence of arrays along an existing axis. + :arg arrays: A sequnce of :class:`GPUArray`. + :arg axis: Index of the dimension of the new axis in the result array. + Can be -1, for the new axis to be last dimension. + :returns: :class:`GPUArray` + """ + # implementation is borrowed from pyopencl.array.concatenate() + # {{{ find properties of result array + + shape = None + + def shape_except_axis(ary: GPUArray): + return ary.shape[:axis] + ary.shape[axis+1:] + + for i_ary, ary in enumerate(arrays): + allocator = allocator or ary.allocator + + if shape is None: + # first array + shape = list(ary.shape) + + else: + if len(ary.shape) != len(shape): + raise ValueError("%d'th array has different number of axes " + "(should have %d, has %d)" + % (i_ary, len(ary.shape), len(shape))) + + if (ary.ndim != arrays[0].ndim + or shape_except_axis(ary) != shape_except_axis(arrays[0])): + raise ValueError("%d'th array has residual not matching " + "other arrays" % i_ary) + + shape[axis] += ary.shape[axis] + + # }}} + + shape = tuple(shape) + dtype = np.find_common_type([ary.dtype for ary in arrays], []) + result = empty(shape, dtype, allocator=allocator) + + full_slice = (slice(None),) * len(shape) + + base_idx = 0 + for ary in arrays: + my_len = ary.shape[axis] + result[full_slice[:axis] + (slice(base_idx, base_idx+my_len),) + full_slice[axis+1:]] = ary + base_idx += my_len + + return result + + + def stack(arrays, axis=0, allocator=None): + """ + Join a sequence of arrays along a new axis. + :arg arrays: A sequnce of :class:`GPUArray`. + :arg axis: Index of the dimension of the new axis in the result array. + Can be -1, for the new axis to be last dimension. 
+ :returns: :class:`GPUArray` + """ + # implementation is borrowed from pyopencl.array.stack() + allocator = allocator or arrays[0].allocator + + if not arrays: + raise ValueError("need at least one array to stack") + + input_shape = arrays[0].shape + input_ndim = arrays[0].ndim + axis = input_ndim if axis == -1 else axis + + if not all(ary.shape == input_shape for ary in arrays[1:]): + raise ValueError("arrays must have the same shape") + + if not (0 <= axis <= input_ndim): + raise ValueError("invalid axis") + + result_shape = input_shape[:axis] + (len(arrays),) + input_shape[axis:] + result = empty(shape=result_shape, + dtype=np.result_type(*(ary.dtype for ary in arrays)), + allocator=allocator, order="C" if axis == 0 else "F") + + for i, ary in enumerate(arrays): + + idx = (slice(None),)*axis + (i,) + (slice(None),)*(input_ndim-axis) + result[idx] = ary + + return result + def transpose(a, axes=None): """Permute the dimensions of an array. -- GitLab From 2fd2d8475f61f9d1446bbc03bebd9437de128624 Mon Sep 17 00:00:00 2001 From: Mit Kotak Date: Mon, 26 Jul 2021 04:20:51 +0000 Subject: [PATCH 41/66] Implemented test_gpuarray.test_(stack|concatenate) --- test/test_gpuarray.py | 47 +++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 47 insertions(+) diff --git a/test/test_gpuarray.py b/test/test_gpuarray.py index 4b53aca7..2b8b66b3 100644 --- a/test/test_gpuarray.py +++ b/test/test_gpuarray.py @@ -468,6 +468,53 @@ class TestGPUArray: a = gpuarray.arange(12, dtype=np.float32) assert (np.arange(12, dtype=np.float32) == a.get()).all() + @mark_cuda_test + def test_stack(self): + + orders = ["F", "C"] + input_dims_lst = [0, 1, 2] + + for order in orders: + for input_dims in input_dims_lst: + shape = (2, 2, 2)[:input_dims] + axis = -1 if order == "F" else 0 + + from numpy.random import default_rng + rng = default_rng() + x_in = rng.random(size=shape) + y_in = rng.random(size=shape) + x_in = x_in if order == "C" else np.asfortranarray(x_in) + y_in = y_in if order == "C" else np.asfortranarray(y_in) + + x_gpu = gpuarray.to_gpu(x_in) + y_gpu = gpuarray.to_gpu(y_in) + + numpy_stack = np.stack((x_in, y_in), axis=axis) + gpuarray_stack = gpuarray.stack((x_gpu, y_gpu), axis=axis) + + np.testing.assert_allclose(gpuarray_stack.get(), numpy_stack) + + assert gpuarray_stack.shape == numpy_stack.shape + + @mark_cuda_test + def test_concatenate(self): + + from pycuda.curandom import rand as curand + + a_dev = curand((5, 15, 20), dtype=np.float32) + b_dev = curand((4, 15, 20), dtype=np.float32) + c_dev = curand((3, 15, 20), dtype=np.float32) + a = a_dev.get() + b = b_dev.get() + c = c_dev.get() + + cat_dev = gpuarray.concatenate((a_dev, b_dev, c_dev)) + cat = np.concatenate((a, b, c)) + + np.testing.assert_allclose(cat, cat_dev.get()) + + assert cat.shape == cat_dev.shape + @mark_cuda_test def test_reverse(self): a = np.array([1, 2, 3, 4, 5, 6, 7, 8, 9, 10]).astype(np.float32) -- GitLab From 5664b3e421179a62305a1f2b194593f7331c9370 Mon Sep 17 00:00:00 2001 From: Mit Kotak Date: Mon, 26 Jul 2021 04:28:06 +0000 Subject: [PATCH 42/66] Implemented gpuarray.(stack|concatenate) + docs --- doc/array.rst | 2 +- pycuda/gpuarray.py | 128 ++++++++++++++++++++++----------------------- 2 files changed, 65 insertions(+), 65 deletions(-) diff --git a/doc/array.rst b/doc/array.rst index 23f22ce0..34c65693 100644 --- a/doc/array.rst +++ b/doc/array.rst @@ -343,7 +343,7 @@ Constructing :class:`GPUArray` Instances Join a sequence of arrays along an existing axis. - .. function:: stack(arrays, axis=0, allocator=None) +.. 
function:: stack(arrays, axis=0, allocator=None) Join a sequence of arrays along a new axis. diff --git a/pycuda/gpuarray.py b/pycuda/gpuarray.py index e5d853be..4d09c77a 100644 --- a/pycuda/gpuarray.py +++ b/pycuda/gpuarray.py @@ -1768,93 +1768,93 @@ def multi_put(arrays, dest_indices, dest_shape=None, out=None, stream=None): # {{{ shape manipulation def concatenate(arrays, axis=0, allocator=None): - """ - Join a sequence of arrays along an existing axis. - :arg arrays: A sequnce of :class:`GPUArray`. - :arg axis: Index of the dimension of the new axis in the result array. - Can be -1, for the new axis to be last dimension. - :returns: :class:`GPUArray` - """ - # implementation is borrowed from pyopencl.array.concatenate() - # {{{ find properties of result array + """ + Join a sequence of arrays along an existing axis. + :arg arrays: A sequnce of :class:`GPUArray`. + :arg axis: Index of the dimension of the new axis in the result array. + Can be -1, for the new axis to be last dimension. + :returns: :class:`GPUArray` + """ + # implementation is borrowed from pyopencl.array.concatenate() + # {{{ find properties of result array - shape = None + shape = None - def shape_except_axis(ary: GPUArray): - return ary.shape[:axis] + ary.shape[axis+1:] + def shape_except_axis(ary: GPUArray): + return ary.shape[:axis] + ary.shape[axis+1:] - for i_ary, ary in enumerate(arrays): - allocator = allocator or ary.allocator + for i_ary, ary in enumerate(arrays): + allocator = allocator or ary.allocator - if shape is None: - # first array - shape = list(ary.shape) + if shape is None: + # first array + shape = list(ary.shape) - else: - if len(ary.shape) != len(shape): - raise ValueError("%d'th array has different number of axes " - "(should have %d, has %d)" - % (i_ary, len(ary.shape), len(shape))) + else: + if len(ary.shape) != len(shape): + raise ValueError("%d'th array has different number of axes " + "(should have %d, has %d)" + % (i_ary, len(ary.shape), len(shape))) - if (ary.ndim != arrays[0].ndim - or shape_except_axis(ary) != shape_except_axis(arrays[0])): - raise ValueError("%d'th array has residual not matching " - "other arrays" % i_ary) + if (ary.ndim != arrays[0].ndim + or shape_except_axis(ary) != shape_except_axis(arrays[0])): + raise ValueError("%d'th array has residual not matching " + "other arrays" % i_ary) - shape[axis] += ary.shape[axis] + shape[axis] += ary.shape[axis] - # }}} + # }}} - shape = tuple(shape) - dtype = np.find_common_type([ary.dtype for ary in arrays], []) - result = empty(shape, dtype, allocator=allocator) + shape = tuple(shape) + dtype = np.find_common_type([ary.dtype for ary in arrays], []) + result = empty(shape, dtype, allocator=allocator) - full_slice = (slice(None),) * len(shape) + full_slice = (slice(None),) * len(shape) - base_idx = 0 - for ary in arrays: - my_len = ary.shape[axis] - result[full_slice[:axis] + (slice(base_idx, base_idx+my_len),) + full_slice[axis+1:]] = ary - base_idx += my_len + base_idx = 0 + for ary in arrays: + my_len = ary.shape[axis] + result[full_slice[:axis] + (slice(base_idx, base_idx+my_len),) + full_slice[axis+1:]] = ary + base_idx += my_len - return result + return result def stack(arrays, axis=0, allocator=None): - """ - Join a sequence of arrays along a new axis. - :arg arrays: A sequnce of :class:`GPUArray`. - :arg axis: Index of the dimension of the new axis in the result array. - Can be -1, for the new axis to be last dimension. 
- :returns: :class:`GPUArray` - """ - # implementation is borrowed from pyopencl.array.stack() - allocator = allocator or arrays[0].allocator + """ + Join a sequence of arrays along a new axis. + :arg arrays: A sequnce of :class:`GPUArray`. + :arg axis: Index of the dimension of the new axis in the result array. + Can be -1, for the new axis to be last dimension. + :returns: :class:`GPUArray` + """ + # implementation is borrowed from pyopencl.array.stack() + allocator = allocator or arrays[0].allocator - if not arrays: - raise ValueError("need at least one array to stack") + if not arrays: + raise ValueError("need at least one array to stack") - input_shape = arrays[0].shape - input_ndim = arrays[0].ndim - axis = input_ndim if axis == -1 else axis + input_shape = arrays[0].shape + input_ndim = arrays[0].ndim + axis = input_ndim if axis == -1 else axis - if not all(ary.shape == input_shape for ary in arrays[1:]): - raise ValueError("arrays must have the same shape") + if not all(ary.shape == input_shape for ary in arrays[1:]): + raise ValueError("arrays must have the same shape") - if not (0 <= axis <= input_ndim): - raise ValueError("invalid axis") + if not (0 <= axis <= input_ndim): + raise ValueError("invalid axis") - result_shape = input_shape[:axis] + (len(arrays),) + input_shape[axis:] - result = empty(shape=result_shape, - dtype=np.result_type(*(ary.dtype for ary in arrays)), - allocator=allocator, order="C" if axis == 0 else "F") + result_shape = input_shape[:axis] + (len(arrays),) + input_shape[axis:] + result = empty(shape=result_shape, + dtype=np.result_type(*(ary.dtype for ary in arrays)), + allocator=allocator, order="C" if axis == 0 else "F") - for i, ary in enumerate(arrays): + for i, ary in enumerate(arrays): - idx = (slice(None),)*axis + (i,) + (slice(None),)*(input_ndim-axis) - result[idx] = ary + idx = (slice(None),)*axis + (i,) + (slice(None),)*(input_ndim-axis) + result[idx] = ary - return result + return result def transpose(a, axes=None): -- GitLab From e5cc82e8e1fb0079e7b101097d1104523969506c Mon Sep 17 00:00:00 2001 From: Mit Kotak Date: Mon, 26 Jul 2021 04:44:09 +0000 Subject: [PATCH 43/66] Removed Indentation errors --- pycuda/gpuarray.py | 2 +- test/test_gpuarray.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/pycuda/gpuarray.py b/pycuda/gpuarray.py index 4d09c77a..dfd360eb 100644 --- a/pycuda/gpuarray.py +++ b/pycuda/gpuarray.py @@ -1779,7 +1779,7 @@ def concatenate(arrays, axis=0, allocator=None): # {{{ find properties of result array shape = None - + def shape_except_axis(ary: GPUArray): return ary.shape[:axis] + ary.shape[axis+1:] diff --git a/test/test_gpuarray.py b/test/test_gpuarray.py index 2b8b66b3..dbf4b2f7 100644 --- a/test/test_gpuarray.py +++ b/test/test_gpuarray.py @@ -499,7 +499,7 @@ class TestGPUArray: @mark_cuda_test def test_concatenate(self): - from pycuda.curandom import rand as curand + from pycuda.curandom import rand as curand a_dev = curand((5, 15, 20), dtype=np.float32) b_dev = curand((4, 15, 20), dtype=np.float32) -- GitLab From 6b06323cda97f0031f27fbae9574ed20655c1f56 Mon Sep 17 00:00:00 2001 From: Mit Kotak Date: Mon, 26 Jul 2021 04:54:05 +0000 Subject: [PATCH 44/66] Removed Indentation in stack() --- pycuda/gpuarray.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pycuda/gpuarray.py b/pycuda/gpuarray.py index dfd360eb..a0bf84c4 100644 --- a/pycuda/gpuarray.py +++ b/pycuda/gpuarray.py @@ -1779,7 +1779,7 @@ def concatenate(arrays, axis=0, allocator=None): # {{{ find properties of result 
array shape = None - + def shape_except_axis(ary: GPUArray): return ary.shape[:axis] + ary.shape[axis+1:] @@ -1820,7 +1820,7 @@ def concatenate(arrays, axis=0, allocator=None): return result - def stack(arrays, axis=0, allocator=None): +def stack(arrays, axis=0, allocator=None): """ Join a sequence of arrays along a new axis. :arg arrays: A sequnce of :class:`GPUArray`. -- GitLab From b8d756ba0cb1ca6aa82641ff783e9dd1de8956ac Mon Sep 17 00:00:00 2001 From: Kaushik Kulkarni Date: Sat, 8 Jan 2022 02:13:43 +0000 Subject: [PATCH 45/66] Rpow scalar --- pycuda/elementwise.py | 39 +++++++++++++++++++++++++++++++++++++++ pycuda/gpuarray.py | 21 +++++++++++++++++++++ test/test_gpuarray.py | 15 +++++++++++++++ 3 files changed, 75 insertions(+) diff --git a/pycuda/elementwise.py b/pycuda/elementwise.py index 258bae5c..607e9f9c 100644 --- a/pycuda/elementwise.py +++ b/pycuda/elementwise.py @@ -666,6 +666,45 @@ def get_pow_array_kernel(dtype_x, dtype_y, dtype_z): ) +@context_dependent_memoize +def get_rpow_kernel(dtype_x, dtype_y, dtype_z, is_base_array, is_exp_array): + """ + Returns the kernel for the operation: ``z = x ** y`` + """ + if np.float64 in [dtype_x, dtype_y]: + func = "pow" + else: + func = "powf" + + if not is_base_array and is_exp_array: + x_ctype = "%(tp_x)s x" + y_ctype = "%(tp_y)s *y" + func = "%s(x,y[i])" % func + + elif is_base_array and is_exp_array: + x_ctype = "%(tp_x)s *x" + y_ctype = "%(tp_y)s *y" + func = "%s(x[i],y[i])" % func + + elif is_base_array and not is_exp_array: + x_ctype = "%(tp_x)s *x" + y_ctype = "%(tp_y)s y" + func = "%s(x[i],y)" % func + + else: + raise AssertionError + + return get_elwise_kernel( + ("%(tp_z)s *z, " + x_ctype + ", "+y_ctype) + % { + "tp_x": dtype_to_ctype(dtype_x), + "tp_y": dtype_to_ctype(dtype_y), + "tp_z": dtype_to_ctype(dtype_z), + }, + "z[i] = %s" % func, + name="pow_method") + + @context_dependent_memoize def get_fmod_kernel(): return get_elwise_kernel( diff --git a/pycuda/gpuarray.py b/pycuda/gpuarray.py index a0bf84c4..5e7a5607 100644 --- a/pycuda/gpuarray.py +++ b/pycuda/gpuarray.py @@ -839,6 +839,27 @@ class GPUArray: """ return self._pow(other, new=False) + def __rpow__(self, other): + common_dtype = _get_common_dtype(self, other) + result = self._new_like_me(common_dtype) + + if not np.isscalar(other): + # Base is a gpuarray => do not cast. + base = other + else: + base = common_dtype.type(other) + + func = elementwise.get_rpow_kernel( + base.dtype, self.dtype, result.dtype, + is_base_array=not np.isscalar(other), is_exp_array=not np.isscalar(self)) + # Evaluates z = x ** y + func.prepared_async_call(self._grid, self._block, None, + result.gpudata, # z + base if np.isscalar(base) else base.gpudata, # x + self.gpudata, # y + self.mem_size) + return result + def reverse(self, stream=None): """Return this array in reversed order. The array is treated as one-dimensional. 
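
For reference, the behavior this "Rpow scalar" patch adds can be exercised from Python as in the following minimal sketch. It assumes the patch is applied, a CUDA-capable device, and that importing ``pycuda.autoinit`` succeeds; the array contents and the base value 2.0 are illustrative only:

    import numpy as np
    import pycuda.autoinit  # noqa: F401 -- establishes a context on the default device
    import pycuda.gpuarray as gpuarray

    # Exponents live on the GPU; the base is a host-side Python scalar.
    exponent = gpuarray.to_gpu(np.abs(np.random.rand(10)))

    # __rpow__ handles a scalar base with a GPUArray exponent: z = x ** y
    z = 2.0 ** exponent

    # Compare against the same computation done on the host.
    np.testing.assert_allclose(z.get(), 2.0 ** exponent.get())

This mirrors the ``test_rpow_array`` test added below, which checks the scalar-base, array-base, and scalar-exponent combinations against NumPy.
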
diff --git a/test/test_gpuarray.py b/test/test_gpuarray.py index dbf4b2f7..5234d8a2 100644 --- a/test/test_gpuarray.py +++ b/test/test_gpuarray.py @@ -40,6 +40,21 @@ class TestGPUArray: a_gpu = a_gpu.get() assert (np.abs(a ** 2 - a_gpu) < 1e-3).all() + @mark_cuda_test + def test_rpow_array(self): + scalar = np.random.rand() + a = abs(np.random.rand(10)) + a_gpu = gpuarray.to_gpu(a) + + result = (scalar ** a_gpu).get() + np.testing.assert_allclose(scalar ** a, result) + + result = (a_gpu ** a_gpu).get() + np.testing.assert_allclose(a ** a, result) + + result = (a_gpu ** scalar).get() + np.testing.assert_allclose(a ** scalar, result) + @mark_cuda_test def test_numpy_integer_shape(self): gpuarray.empty(np.int32(17), np.float32) -- GitLab From e0d744c668ff251238373b291c637554fe150190 Mon Sep 17 00:00:00 2001 From: mit kotak Date: Mon, 17 Jan 2022 12:56:42 -0600 Subject: [PATCH 46/66] Merged get_rpow_kernel into get_pow_array_kernel + Improved test_pow_array to catch order flips --- pycuda/elementwise.py | 24 +++--------------------- pycuda/gpuarray.py | 6 +++--- test/test_gpuarray.py | 14 ++++++++------ 3 files changed, 14 insertions(+), 30 deletions(-) diff --git a/pycuda/elementwise.py b/pycuda/elementwise.py index 607e9f9c..cbe92953 100644 --- a/pycuda/elementwise.py +++ b/pycuda/elementwise.py @@ -648,26 +648,7 @@ def get_pow_kernel(dtype): @context_dependent_memoize -def get_pow_array_kernel(dtype_x, dtype_y, dtype_z): - if np.float64 in [dtype_x, dtype_y]: - func = "pow" - else: - func = "powf" - - return get_elwise_kernel( - "%(tp_x)s *x, %(tp_y)s *y, %(tp_z)s *z" - % { - "tp_x": dtype_to_ctype(dtype_x), - "tp_y": dtype_to_ctype(dtype_y), - "tp_z": dtype_to_ctype(dtype_z), - }, - "z[i] = %s(x[i], y[i])" % func, - "pow_method", - ) - - -@context_dependent_memoize -def get_rpow_kernel(dtype_x, dtype_y, dtype_z, is_base_array, is_exp_array): +def get_pow_array_kernel(dtype_x, dtype_y, dtype_z, is_base_array, is_exp_array): """ Returns the kernel for the operation: ``z = x ** y`` """ @@ -702,7 +683,8 @@ def get_rpow_kernel(dtype_x, dtype_y, dtype_z, is_base_array, is_exp_array): "tp_z": dtype_to_ctype(dtype_z), }, "z[i] = %s" % func, - name="pow_method") + name="pow_method" + ) @context_dependent_memoize diff --git a/pycuda/gpuarray.py b/pycuda/gpuarray.py index 5e7a5607..ea06a792 100644 --- a/pycuda/gpuarray.py +++ b/pycuda/gpuarray.py @@ -780,16 +780,16 @@ class GPUArray: result = self func = elementwise.get_pow_array_kernel( - self.dtype, other.dtype, result.dtype + self.dtype, other.dtype, result.dtype, True, True ) func.prepared_async_call( self._grid, self._block, None, + result.gpudata, self.gpudata, other.gpudata, - result.gpudata, self.mem_size, ) @@ -849,7 +849,7 @@ class GPUArray: else: base = common_dtype.type(other) - func = elementwise.get_rpow_kernel( + func = elementwise.get_pow_array_kernel( base.dtype, self.dtype, result.dtype, is_base_array=not np.isscalar(other), is_exp_array=not np.isscalar(self)) # Evaluates z = x ** y diff --git a/test/test_gpuarray.py b/test/test_gpuarray.py index 5234d8a2..3e18e912 100644 --- a/test/test_gpuarray.py +++ b/test/test_gpuarray.py @@ -17,16 +17,18 @@ class TestGPUArray: def test_pow_array(self): a = np.array([1, 2, 3, 4, 5]).astype(np.float32) a_gpu = gpuarray.to_gpu(a) + b = np.array([1, 2, 3, 4, 5]).astype(np.float64) + b_gpu = gpuarray.to_gpu(b) - result = pow(a_gpu, a_gpu).get() - assert (np.abs(a ** a - result) < 1e-3).all() + result = pow(a_gpu, b_gpu).get() + assert (np.abs(a ** b - result) < 1e-3).all() - result = (a_gpu ** 
a_gpu).get() - assert (np.abs(pow(a, a) - result) < 1e-3).all() + result = (a_gpu ** b_gpu).get() + assert (np.abs(pow(a, b) - result) < 1e-3).all() - a_gpu **= a_gpu + a_gpu **= b_gpu a_gpu = a_gpu.get() - assert (np.abs(pow(a, a) - a_gpu) < 1e-3).all() + assert (np.abs(pow(a, b) - a_gpu) < 1e-3).all() @mark_cuda_test def test_pow_number(self): -- GitLab From 87d3436813cda134d8f492b99d0c5e4732f1a3be Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Andreas=20Kl=C3=B6ckner?= Date: Wed, 1 Dec 2021 11:17:51 -0600 Subject: [PATCH 47/66] Update docs to reflect that 3D grids are possible --- doc/driver.rst | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/doc/driver.rst b/doc/driver.rst index 86fa2e1b..6a690ed6 100644 --- a/doc/driver.rst +++ b/doc/driver.rst @@ -1807,8 +1807,8 @@ Code on the Device: Modules and Functions *arg1* through *argn* are the positional C arguments to the kernel. See :meth:`param_set` for details. See especially the warnings there. - *grid* specifies, as a 2-tuple, the number of thread blocks to launch, as a - two-dimensional grid. + *grid* specifies, as a tuple of up to three integer entries, the number + of thread blocks to launch, as a multi-dimensional grid. *stream*, if specified, is a :class:`Stream` instance serializing the copying of input arguments (if any), execution, and the copying of output arguments (again, if any). @@ -1853,6 +1853,10 @@ Code on the Device: Modules and Functions which can make it somewhat slow. For a kernel that is invoked often, this can be inconvenient. For a faster (but mildly less convenient) way of invoking kernels, see :meth:`prepare` and :meth:`prepared_call`. + + .. note:: + + *grid* with more than two dimensions requires CUDA 4.0 or newer. .. method:: param_set_texref(texref) -- GitLab From 384ebd2717c1887f87714af29dfec1cd86893652 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Andreas=20Kl=C3=B6ckner?= Date: Mon, 6 Dec 2021 23:22:19 -0600 Subject: [PATCH 48/66] Add issue templates --- .github/ISSUE_TEMPLATE/bug_report.md | 34 +++++++++++++++++++++++ .github/ISSUE_TEMPLATE/feature_request.md | 20 +++++++++++++ 2 files changed, 54 insertions(+) create mode 100644 .github/ISSUE_TEMPLATE/bug_report.md create mode 100644 .github/ISSUE_TEMPLATE/feature_request.md diff --git a/.github/ISSUE_TEMPLATE/bug_report.md b/.github/ISSUE_TEMPLATE/bug_report.md new file mode 100644 index 00000000..e1552cf6 --- /dev/null +++ b/.github/ISSUE_TEMPLATE/bug_report.md @@ -0,0 +1,34 @@ +--- +name: Bug report +about: Create a report to help us improve +title: '' +labels: bug +assignees: '' + +--- + +**Describe the bug** +A clear and concise description of what the bug is. + +**To Reproduce** +Steps to reproduce the behavior: +1. Go to '...' +2. Click on '....' +3. Scroll down to '....' +4. See error + +**Expected behavior** +A clear and concise description of what you expected to happen. + +**Screenshots** +If applicable, add screenshots to help explain your problem. + +**Desktop (please complete the following information):** + - OS: [e.g. iOS] + - CUDA version: [e.g. 11.1] + - CUDA driver version: [e.g. 470.86] + - PyCUDA version: [e.g 2021.1] + - Python version: [e.g. 3.10] + +**Additional context** +Add any other context about the problem here. 
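
As a companion to the driver documentation change above (the *grid* argument of a kernel launch may be a tuple of up to three entries), here is a minimal launch sketch. It assumes CUDA 4.0 or newer and a working ``pycuda.autoinit`` context; the kernel body and the block/grid sizes are illustrative only:

    import numpy as np
    import pycuda.autoinit  # noqa: F401
    import pycuda.driver as cuda
    from pycuda.compiler import SourceModule

    mod = SourceModule("""
    __global__ void fill(float *a)
    {
        // Linearize a three-dimensional grid of one-dimensional blocks.
        int bid = blockIdx.x + blockIdx.y * gridDim.x
                  + blockIdx.z * gridDim.x * gridDim.y;
        int idx = bid * blockDim.x + threadIdx.x;
        a[idx] = idx;
    }
    """)

    fill = mod.get_function("fill")
    a = np.zeros(4 * 4 * 2 * 32, dtype=np.float32)

    # *grid* may carry up to three entries; *block* remains a 3-tuple.
    fill(cuda.Out(a), block=(32, 1, 1), grid=(4, 4, 2))

    assert (a == np.arange(a.size, dtype=np.float32)).all()
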
diff --git a/.github/ISSUE_TEMPLATE/feature_request.md b/.github/ISSUE_TEMPLATE/feature_request.md new file mode 100644 index 00000000..11fc491e --- /dev/null +++ b/.github/ISSUE_TEMPLATE/feature_request.md @@ -0,0 +1,20 @@ +--- +name: Feature request +about: Suggest an idea for this project +title: '' +labels: enhancement +assignees: '' + +--- + +**Is your feature request related to a problem? Please describe.** +A clear and concise description of what the problem is. Ex. I'm always frustrated when [...] + +**Describe the solution you'd like** +A clear and concise description of what you want to happen. + +**Describe alternatives you've considered** +A clear and concise description of any alternative solutions or features you've considered. + +**Additional context** +Add any other context or screenshots about the feature request here. -- GitLab From efd4bab263173685fab4c2a8d27fcb97daadea13 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Andreas=20Kl=C3=B6ckner?= Date: Mon, 6 Dec 2021 23:23:08 -0600 Subject: [PATCH 49/66] Create config.yml --- .github/ISSUE_TEMPLATE/config.yml | 9 +++++++++ 1 file changed, 9 insertions(+) create mode 100644 .github/ISSUE_TEMPLATE/config.yml diff --git a/.github/ISSUE_TEMPLATE/config.yml b/.github/ISSUE_TEMPLATE/config.yml new file mode 100644 index 00000000..707bf62d --- /dev/null +++ b/.github/ISSUE_TEMPLATE/config.yml @@ -0,0 +1,9 @@ + +blank_issues_enabled: true +contact_links: + - name: ❓ Question + url: https://github.com/inducer/pycuda/discussions/categories/q-a + about: Ask and answer questions about PyCUDA on Discussions + - name: 🔧 Troubleshooting + url: https://github.com/inducer/pycuda/discussions/categories/troubleshooting + about: For troubleshooting help, see the Discussions -- GitLab From d6e22f144fe57ccb56f12e480b27fd6cea27986a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Andreas=20Kl=C3=B6ckner?= Date: Mon, 6 Dec 2021 23:24:19 -0600 Subject: [PATCH 50/66] Tweak bug report template --- .github/ISSUE_TEMPLATE/bug_report.md | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/.github/ISSUE_TEMPLATE/bug_report.md b/.github/ISSUE_TEMPLATE/bug_report.md index e1552cf6..2dcaf386 100644 --- a/.github/ISSUE_TEMPLATE/bug_report.md +++ b/.github/ISSUE_TEMPLATE/bug_report.md @@ -20,10 +20,7 @@ Steps to reproduce the behavior: **Expected behavior** A clear and concise description of what you expected to happen. -**Screenshots** -If applicable, add screenshots to help explain your problem. - -**Desktop (please complete the following information):** +**Environment (please complete the following information):** - OS: [e.g. iOS] - CUDA version: [e.g. 11.1] - CUDA driver version: [e.g. 
470.86] -- GitLab From 59fe0e357320d4b92bd4e960cc7d71771fb7faf0 Mon Sep 17 00:00:00 2001 From: Andreas Kloeckner Date: Tue, 11 Jan 2022 08:17:08 -0600 Subject: [PATCH 51/66] Fix Py2 syntax in tutorial (gh-340) --- doc/tutorial.rst | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/doc/tutorial.rst b/doc/tutorial.rst index 4cc909cf..834d8ea6 100644 --- a/doc/tutorial.rst +++ b/doc/tutorial.rst @@ -68,8 +68,8 @@ original *a*:: a_doubled = numpy.empty_like(a) cuda.memcpy_dtoh(a_doubled, a_gpu) - print a_doubled - print a + print(a_doubled) + print(a) This will print something like this:: @@ -127,8 +127,8 @@ achieved with much less writing:: a_gpu = gpuarray.to_gpu(numpy.random.randn(4,4).astype(numpy.float32)) a_doubled = (2*a_gpu).get() - print a_doubled - print a_gpu + print(a_doubled) + print(a_gpu) Advanced Topics --------------- -- GitLab From 0ca08699a1379d91522dcd4ad71c09ba239e8959 Mon Sep 17 00:00:00 2001 From: mit kotak Date: Fri, 21 Jan 2022 17:32:37 -0600 Subject: [PATCH 52/66] Slightly merged conditional branches in _pow --- pycuda/elementwise.py | 2 +- pycuda/gpuarray.py | 40 +++++++++++++++++++--------------------- 2 files changed, 20 insertions(+), 22 deletions(-) diff --git a/pycuda/elementwise.py b/pycuda/elementwise.py index cbe92953..5c6fb895 100644 --- a/pycuda/elementwise.py +++ b/pycuda/elementwise.py @@ -676,7 +676,7 @@ def get_pow_array_kernel(dtype_x, dtype_y, dtype_z, is_base_array, is_exp_array) raise AssertionError return get_elwise_kernel( - ("%(tp_z)s *z, " + x_ctype + ", "+y_ctype) + (x_ctype + ", " + y_ctype + ", " + "%(tp_z)s *z") % { "tp_x": dtype_to_ctype(dtype_x), "tp_y": dtype_to_ctype(dtype_y), diff --git a/pycuda/gpuarray.py b/pycuda/gpuarray.py index ea06a792..5b6635a0 100644 --- a/pycuda/gpuarray.py +++ b/pycuda/gpuarray.py @@ -783,17 +783,10 @@ class GPUArray: self.dtype, other.dtype, result.dtype, True, True ) - func.prepared_async_call( - self._grid, - self._block, - None, - result.gpudata, - self.gpudata, - other.gpudata, - self.mem_size, - ) + x_data = self.gpudata + y_data = other.gpudata + z_data = result.gpudata - return result else: if not self.flags.forc: raise RuntimeError( @@ -806,17 +799,22 @@ class GPUArray: else: result = self func = elementwise.get_pow_kernel(self.dtype) - func.prepared_async_call( - self._grid, - self._block, - None, - other, - self.gpudata, - result.gpudata, - self.mem_size, - ) - return result + x_data = other + y_data = self.gpudata + z_data = result.gpudata + + func.prepared_async_call( + self._grid, + self._block, + None, + x_data, + y_data, + z_data, + self.mem_size, + ) + + return result def __pow__(self, other): """pow function:: @@ -854,9 +852,9 @@ class GPUArray: is_base_array=not np.isscalar(other), is_exp_array=not np.isscalar(self)) # Evaluates z = x ** y func.prepared_async_call(self._grid, self._block, None, - result.gpudata, # z base if np.isscalar(base) else base.gpudata, # x self.gpudata, # y + result.gpudata, # z self.mem_size) return result -- GitLab From ea42e668dbf24388762b1117784eb700da6dfd24 Mon Sep 17 00:00:00 2001 From: Kaushik Kulkarni Date: Fri, 21 Jan 2022 21:10:33 -0600 Subject: [PATCH 53/66] roll back to main's compyte version --- pycuda/compyte | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pycuda/compyte b/pycuda/compyte index 71bffa1a..165b3aba 160000 --- a/pycuda/compyte +++ b/pycuda/compyte @@ -1 +1 @@ -Subproject commit 71bffa1ae64ed98b9d922c79a6f9cc7eb4fd642f +Subproject commit 165b3abae63bc39124a342ce1a539adbf6cd8a09 -- GitLab From 
157d1d3d19666c915b281b6c1ff9097c6cbc25d0 Mon Sep 17 00:00:00 2001 From: mit kotak Date: Fri, 4 Feb 2022 18:17:34 -0600 Subject: [PATCH 54/66] eliminated get_rpow_kernel + updated test tol --- aksetup_helper.py | 4 ++-- pycuda/gpuarray.py | 27 ++++++++++----------- test.py | 56 +++++++++++++++++++++++++++++++++++++++++++ test/test_gpuarray.py | 10 ++++---- 4 files changed, 75 insertions(+), 22 deletions(-) create mode 100644 test.py diff --git a/aksetup_helper.py b/aksetup_helper.py index 6543215f..342911e8 100644 --- a/aksetup_helper.py +++ b/aksetup_helper.py @@ -146,7 +146,7 @@ def get_config(schema=None, warn_about_no_config=True): return config -def hack_distutils(debug=False, fast_link=True, what_opt=3): +def hack_distutils(debug=True, fast_link=True, what_opt=0): # hack distutils.sysconfig to eliminate debug flags # stolen from mpi4py @@ -169,7 +169,7 @@ def hack_distutils(debug=False, fast_link=True, what_opt=3): if cflags: cflags = remove_prefixes(cflags.split(), bad_prefixes) if debug: - cflags.append("-g") + cflags.append("-g3") else: if what_opt is None: pass diff --git a/pycuda/gpuarray.py b/pycuda/gpuarray.py index 5b6635a0..cf1b62e4 100644 --- a/pycuda/gpuarray.py +++ b/pycuda/gpuarray.py @@ -779,14 +779,6 @@ class GPUArray: else: result = self - func = elementwise.get_pow_array_kernel( - self.dtype, other.dtype, result.dtype, True, True - ) - - x_data = self.gpudata - y_data = other.gpudata - z_data = result.gpudata - else: if not self.flags.forc: raise RuntimeError( @@ -798,19 +790,24 @@ class GPUArray: result = self._new_like_me() else: result = self - func = elementwise.get_pow_kernel(self.dtype) - x_data = other - y_data = self.gpudata - z_data = result.gpudata + func = elementwise.get_pow_array_kernel( + self.dtype, + _get_common_dtype(self,other).type(other).dtype + if np.isscalar(other) else + other.dtype, + result.dtype, + not np.isscalar(self), + not np.isscalar(other) + ) func.prepared_async_call( self._grid, self._block, None, - x_data, - y_data, - z_data, + self.gpudata, + other if np.isscalar(other) else other.gpudata, + result.gpudata, self.mem_size, ) diff --git a/test.py b/test.py new file mode 100644 index 00000000..f0da15da --- /dev/null +++ b/test.py @@ -0,0 +1,56 @@ +import pycuda.driver as cuda +import pycuda.autoinit +from pycuda.compiler import SourceModule +import pycuda.gpuarray as gpuarray +import numpy as np + +N = 222341 + +func_mod = SourceModule(""" +extern "C" { +__global__ void func(float *a, int N, float minval, int denom) +{ +int idx = threadIdx.x+threadIdx.y*32+blockIdx.x*blockDim.x; +if (idx < N) + a[idx] = (a[idx]-minval)/denom; +} +} +""", no_extern_c=1) + +func = func_mod.get_function('func') + + +values = np.random.randn(N) +number_of_blocks=N/1024 + +graph = cuda.Graph() +node1 = graph.add_empty_node() +node2 = graph.add_kernel_node([node1],func,(1024, 1, 1),(number_of_blocks+1,1,1),1) +# node1 = graph.add_empty_node() +# node2 = graph.add_empty_node([node1]) +# print(graph.get_dependent_nodes(node2)) +# print(graph.get_nodes()) +# print("Printing out all the nodes") +# print(graph.get_nodes()) +# print("Removing dependency between root node and child node") +# graph.remove_dependencies(node1,[node2]) +# print("Printing out all dependencies") +# print(graph.get_dependencies(node1)) +# print("Adding dependency between root node and child node") +# graph.add_dependencies(node1,[node2]) +# print("Printing out root nodes") +# print(graph.get_root_nodes()) + + +# graph2 = cuda.Graph() +# node3 = graph2.add_empty_node() +# node1 = 
graph.add_child_graph_node(graph2,[]) +# print(graph.get_child_graph(node1)) +# event = cuda.Event() +# node5 = graph.add_event_record_node(event,[]) +# event2 = cuda.Event() +# graph.set_record_node_event(node5,event2) + +# event3 = graph.get_event_from_event_record_node(node5) +# graph.debug_dot_print("test.dot") + diff --git a/test/test_gpuarray.py b/test/test_gpuarray.py index 3e18e912..3daeb932 100644 --- a/test/test_gpuarray.py +++ b/test/test_gpuarray.py @@ -21,14 +21,14 @@ class TestGPUArray: b_gpu = gpuarray.to_gpu(b) result = pow(a_gpu, b_gpu).get() - assert (np.abs(a ** b - result) < 1e-3).all() + np.testing.assert_allclose(a ** b, result, rtol=1e-6) result = (a_gpu ** b_gpu).get() - assert (np.abs(pow(a, b) - result) < 1e-3).all() + np.testing.assert_allclose(pow(a,b), result, rtol=1e-6) a_gpu **= b_gpu a_gpu = a_gpu.get() - assert (np.abs(pow(a, b) - a_gpu) < 1e-3).all() + np.testing.assert_allclose(pow(a,b), a_gpu, rtol=1e-6) @mark_cuda_test def test_pow_number(self): @@ -36,11 +36,11 @@ class TestGPUArray: a_gpu = gpuarray.to_gpu(a) result = pow(a_gpu, 2).get() - assert (np.abs(a ** 2 - result) < 1e-3).all() + np.testing.assert_allclose(a ** 2, result, rtol=1e-6) a_gpu **= 2 a_gpu = a_gpu.get() - assert (np.abs(a ** 2 - a_gpu) < 1e-3).all() + np.testing.assert_allclose(a ** 2, a_gpu, rtol=1e-6) @mark_cuda_test def test_rpow_array(self): -- GitLab From 3121b8ed78eb6eb9da53e2cd800388a98ff02570 Mon Sep 17 00:00:00 2001 From: mit kotak Date: Fri, 4 Feb 2022 18:30:58 -0600 Subject: [PATCH 55/66] retiring get_pow + resolved flake8 --- pycuda/elementwise.py | 18 ------------------ pycuda/gpuarray.py | 16 ++++++++-------- test/test_gpuarray.py | 4 ++-- 3 files changed, 10 insertions(+), 28 deletions(-) diff --git a/pycuda/elementwise.py b/pycuda/elementwise.py index 5c6fb895..06d1fcb1 100644 --- a/pycuda/elementwise.py +++ b/pycuda/elementwise.py @@ -629,24 +629,6 @@ def get_arange_kernel(dtype): "arange", ) - -@context_dependent_memoize -def get_pow_kernel(dtype): - if dtype == np.float32: - func = "powf" - else: - func = "pow" - - return get_elwise_kernel( - "%(tp)s value, %(tp)s *y, %(tp)s *z" - % { - "tp": dtype_to_ctype(dtype), - }, - "z[i] = %s(y[i], value)" % func, - "pow_method", - ) - - @context_dependent_memoize def get_pow_array_kernel(dtype_x, dtype_y, dtype_z, is_base_array, is_exp_array): """ diff --git a/pycuda/gpuarray.py b/pycuda/gpuarray.py index cf1b62e4..8d6d9fd0 100644 --- a/pycuda/gpuarray.py +++ b/pycuda/gpuarray.py @@ -792,14 +792,14 @@ class GPUArray: result = self func = elementwise.get_pow_array_kernel( - self.dtype, - _get_common_dtype(self,other).type(other).dtype - if np.isscalar(other) else - other.dtype, - result.dtype, - not np.isscalar(self), - not np.isscalar(other) - ) + self.dtype, + _get_common_dtype(self, other).type(other).dtype + if np.isscalar(other) else + other.dtype, + result.dtype, + not np.isscalar(self), + not np.isscalar(other) + ) func.prepared_async_call( self._grid, diff --git a/test/test_gpuarray.py b/test/test_gpuarray.py index 3daeb932..73ec3ade 100644 --- a/test/test_gpuarray.py +++ b/test/test_gpuarray.py @@ -24,11 +24,11 @@ class TestGPUArray: np.testing.assert_allclose(a ** b, result, rtol=1e-6) result = (a_gpu ** b_gpu).get() - np.testing.assert_allclose(pow(a,b), result, rtol=1e-6) + np.testing.assert_allclose(pow(a, b), result, rtol=1e-6) a_gpu **= b_gpu a_gpu = a_gpu.get() - np.testing.assert_allclose(pow(a,b), a_gpu, rtol=1e-6) + np.testing.assert_allclose(pow(a, b), a_gpu, rtol=1e-6) @mark_cuda_test def 
test_pow_number(self): -- GitLab From 38b1ee593c12cf0d356b4437fc0b8af626136547 Mon Sep 17 00:00:00 2001 From: mit kotak Date: Fri, 4 Feb 2022 18:36:38 -0600 Subject: [PATCH 56/66] Added space to elementwise.py to resolve flake8 --- pycuda/elementwise.py | 1 + 1 file changed, 1 insertion(+) diff --git a/pycuda/elementwise.py b/pycuda/elementwise.py index 06d1fcb1..1f428b89 100644 --- a/pycuda/elementwise.py +++ b/pycuda/elementwise.py @@ -629,6 +629,7 @@ def get_arange_kernel(dtype): "arange", ) + @context_dependent_memoize def get_pow_array_kernel(dtype_x, dtype_y, dtype_z, is_base_array, is_exp_array): """ -- GitLab From c1331fc1756016cc71b6429226398c7269d5117b Mon Sep 17 00:00:00 2001 From: Mit Kotak Date: Sun, 6 Feb 2022 20:26:39 +0000 Subject: [PATCH 57/66] Removed spurious diff --- aksetup_helper.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/aksetup_helper.py b/aksetup_helper.py index 342911e8..a0700fcf 100644 --- a/aksetup_helper.py +++ b/aksetup_helper.py @@ -146,7 +146,7 @@ def get_config(schema=None, warn_about_no_config=True): return config -def hack_distutils(debug=True, fast_link=True, what_opt=0): +def hack_distutils(debug=False, fast_link=True, what_opt=3): # hack distutils.sysconfig to eliminate debug flags # stolen from mpi4py -- GitLab From 1a9fd48d204199477b8cdbbd092c22b412ac7246 Mon Sep 17 00:00:00 2001 From: mit kotak Date: Sun, 6 Feb 2022 14:31:08 -0600 Subject: [PATCH 58/66] Casting issue resolved --- pycuda/elementwise.py | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/pycuda/elementwise.py b/pycuda/elementwise.py index 1f428b89..985fec75 100644 --- a/pycuda/elementwise.py +++ b/pycuda/elementwise.py @@ -635,10 +635,12 @@ def get_pow_array_kernel(dtype_x, dtype_y, dtype_z, is_base_array, is_exp_array) """ Returns the kernel for the operation: ``z = x ** y`` """ - if np.float64 in [dtype_x, dtype_y]: - func = "pow" - else: + if dtype_z == np.float32: func = "powf" + else: + # FIXME: Casting args to double-precision not + # ideal for all cases (ex. 
int args) + func = "pow" if not is_base_array and is_exp_array: x_ctype = "%(tp_x)s x" -- GitLab From 34a905e8546ba100a33d8435074d5bd30f8fee4a Mon Sep 17 00:00:00 2001 From: Mit Kotak Date: Sun, 6 Feb 2022 22:16:14 +0000 Subject: [PATCH 59/66] removing more spurious differences --- aksetup_helper.py | 2 +- test.py | 56 ----------------------------------------------- 2 files changed, 1 insertion(+), 57 deletions(-) delete mode 100644 test.py diff --git a/aksetup_helper.py b/aksetup_helper.py index a0700fcf..6543215f 100644 --- a/aksetup_helper.py +++ b/aksetup_helper.py @@ -169,7 +169,7 @@ def hack_distutils(debug=False, fast_link=True, what_opt=3): if cflags: cflags = remove_prefixes(cflags.split(), bad_prefixes) if debug: - cflags.append("-g3") + cflags.append("-g") else: if what_opt is None: pass diff --git a/test.py b/test.py deleted file mode 100644 index f0da15da..00000000 --- a/test.py +++ /dev/null @@ -1,56 +0,0 @@ -import pycuda.driver as cuda -import pycuda.autoinit -from pycuda.compiler import SourceModule -import pycuda.gpuarray as gpuarray -import numpy as np - -N = 222341 - -func_mod = SourceModule(""" -extern "C" { -__global__ void func(float *a, int N, float minval, int denom) -{ -int idx = threadIdx.x+threadIdx.y*32+blockIdx.x*blockDim.x; -if (idx < N) - a[idx] = (a[idx]-minval)/denom; -} -} -""", no_extern_c=1) - -func = func_mod.get_function('func') - - -values = np.random.randn(N) -number_of_blocks=N/1024 - -graph = cuda.Graph() -node1 = graph.add_empty_node() -node2 = graph.add_kernel_node([node1],func,(1024, 1, 1),(number_of_blocks+1,1,1),1) -# node1 = graph.add_empty_node() -# node2 = graph.add_empty_node([node1]) -# print(graph.get_dependent_nodes(node2)) -# print(graph.get_nodes()) -# print("Printing out all the nodes") -# print(graph.get_nodes()) -# print("Removing dependency between root node and child node") -# graph.remove_dependencies(node1,[node2]) -# print("Printing out all dependencies") -# print(graph.get_dependencies(node1)) -# print("Adding dependency between root node and child node") -# graph.add_dependencies(node1,[node2]) -# print("Printing out root nodes") -# print(graph.get_root_nodes()) - - -# graph2 = cuda.Graph() -# node3 = graph2.add_empty_node() -# node1 = graph.add_child_graph_node(graph2,[]) -# print(graph.get_child_graph(node1)) -# event = cuda.Event() -# node5 = graph.add_event_record_node(event,[]) -# event2 = cuda.Event() -# graph.set_record_node_event(node5,event2) - -# event3 = graph.get_event_from_event_record_node(node5) -# graph.debug_dot_print("test.dot") - -- GitLab From e85ea5c4297a967842111f6638163bf10d90319e Mon Sep 17 00:00:00 2001 From: Kaushik Kulkarni Date: Thu, 10 Feb 2022 00:05:12 -0600 Subject: [PATCH 60/66] GPUArray._pow: minor cleanup --- pycuda/gpuarray.py | 41 ++++++++++++++--------------------------- 1 file changed, 14 insertions(+), 27 deletions(-) diff --git a/pycuda/gpuarray.py b/pycuda/gpuarray.py index 8d6d9fd0..36a39dc3 100644 --- a/pycuda/gpuarray.py +++ b/pycuda/gpuarray.py @@ -764,42 +764,29 @@ class GPUArray: Do the pow operator. 
with new, the user can choose between ipow or just pow """ + common_dtype = _get_common_dtype(self, other) + if new: + result = self._new_like_me(common_dtype) + else: + result = self - if isinstance(other, GPUArray): - if not self.flags.forc or not other.flags.forc: - raise RuntimeError( - "only contiguous arrays may " - "be used as arguments to this operation" - ) - - assert self.shape == other.shape - - if new: - result = self._new_like_me(_get_common_dtype(self, other)) - else: - result = self + # {{{ sanity checks - else: - if not self.flags.forc: - raise RuntimeError( - "only contiguous arrays may " - "be used as arguments to this operation" - ) + if (not self.flags.forc) or (isinstance(other, GPUArray) + and not other.flags.forc): + raise RuntimeError("only contiguous arrays may " + "be used as arguments to this operation") + assert not isinstance(other, GPUArray) or other.shape == self.shape - if new: - result = self._new_like_me() - else: - result = self + # }}} func = elementwise.get_pow_array_kernel( self.dtype, - _get_common_dtype(self, other).type(other).dtype - if np.isscalar(other) else - other.dtype, + common_dtype if np.isscalar(other) else other.dtype, result.dtype, not np.isscalar(self), not np.isscalar(other) - ) + ) func.prepared_async_call( self._grid, -- GitLab From 33f8529967f350ab7acf2eac51d796101efc89f2 Mon Sep 17 00:00:00 2001 From: Kaushik Kulkarni Date: Thu, 10 Feb 2022 00:06:44 -0600 Subject: [PATCH 61/66] formatting: removes indent --- pycuda/elementwise.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pycuda/elementwise.py b/pycuda/elementwise.py index 985fec75..4e8601f0 100644 --- a/pycuda/elementwise.py +++ b/pycuda/elementwise.py @@ -669,7 +669,7 @@ def get_pow_array_kernel(dtype_x, dtype_y, dtype_z, is_base_array, is_exp_array) }, "z[i] = %s" % func, name="pow_method" - ) + ) @context_dependent_memoize -- GitLab From 6e4e8353ee6ea3116e6bbbea0d1f1a28493500bb Mon Sep 17 00:00:00 2001 From: mit kotak Date: Sat, 12 Mar 2022 22:26:36 -0600 Subject: [PATCH 62/66] Merge branch 'main' of https://github.com/inducer/pycuda --- test/InputOutput.cuh | 21 +++ test/TimingGPU.cuh | 31 ++++ test/Utilities.cuh | 193 ++++++++++++++++++++ test/test.py | 18 ++ test/test_add.cu | 412 +++++++++++++++++++++++++++++++++++++++++++ 5 files changed, 675 insertions(+) create mode 100644 test/InputOutput.cuh create mode 100644 test/TimingGPU.cuh create mode 100644 test/Utilities.cuh create mode 100644 test/test.py create mode 100644 test/test_add.cu diff --git a/test/InputOutput.cuh b/test/InputOutput.cuh new file mode 100644 index 00000000..5abb4eca --- /dev/null +++ b/test/InputOutput.cuh @@ -0,0 +1,21 @@ +#ifndef INPUTOUTPUT_CUH +#define INPUTOUTPUT_CUH + +template +void saveGPUrealtxt(const T *, const char *, const int); + +template +void saveCPUrealtxt(const T *, const char *, const int); + +template +void saveGPUcomplextxt(const T *, const char *, const int); + +void saveGPUcomplextxt(const double2 *, const char *, const int); + +template +T * loadCPUrealtxt(const char *, T * __restrict__, const int); + +template +T * loadGPUrealtxt(const char *, T * __restrict__, const int); + +#endif \ No newline at end of file diff --git a/test/TimingGPU.cuh b/test/TimingGPU.cuh new file mode 100644 index 00000000..408949df --- /dev/null +++ b/test/TimingGPU.cuh @@ -0,0 +1,31 @@ +#ifndef __TIMINGGPU_CUH__ +#define __TIMINGGPU_CUH__ + +/**************/ +/* TIMING GPU */ +/**************/ + +// Events are a part of CUDA API and provide a system independent way to measure 
execution times on CUDA devices with approximately 0.5 +// microsecond precision. + +struct PrivateTimingGPU; + +class TimingGPU +{ +private: + PrivateTimingGPU *privateTimingGPU; + +public: + + TimingGPU(); + + ~TimingGPU(); + + void StartCounter(); + void StartCounterFlags(); + + float GetCounter(); + +}; // TimingCPU class + +#endif \ No newline at end of file diff --git a/test/Utilities.cuh b/test/Utilities.cuh new file mode 100644 index 00000000..3b3623e6 --- /dev/null +++ b/test/Utilities.cuh @@ -0,0 +1,193 @@ +#ifndef UTILITIES_CUH +#define UTILITIES_CUH + +#include +#include +#include +#include +#include + +//#include + +//extern "C" int iDivUp(int, int); +__host__ __device__ int iDivUp(int, int); +extern "C" void gpuErrchk(cudaError_t); +extern "C" void cusolveSafeCall(cusolverStatus_t); +extern "C" void cublasSafeCall(cublasStatus_t); +extern "C" void cufftSafeCall(cufftResult err); +extern "C" void cusparseSafeCall(cusparseStatus_t err); +extern "C" void curandSafeCall(curandStatus_t err); + +template +void reverseArray(const T * __restrict__, T * __restrict__, const int, const T a = static_cast(1)); + +//template +//thrust::pair Cartesian2Polar(const T * __restrict__ d_x, const T * __restrict__ d_y, const int N, const T a = static_cast(1)); +// +//template +//thrust::pair h_Cartesian2Polar(const T * __restrict__ d_x, const T * __restrict__ d_y, const int N, const T a = static_cast(1)); + +template +T h_l2_norm(T *v1, T *v2, const int N); + +template +void linearCombination(const T * __restrict__, const T * __restrict__, T * __restrict__, const int, const int, const cublasHandle_t); + +void linearCombination(const float * __restrict__, const float * __restrict__, float * __restrict__, + const int, const int, const cublasHandle_t); + +void linearCombination(const double * __restrict__, const double * __restrict__, double * __restrict__, + const int, const int, const cublasHandle_t); + +template +void vectorAddConstant(T * __restrict__, const T, const int); + +template +void vectorMulConstant(T * __restrict__, const T, const int); + +template +void h_vectorMulConstant(T * __restrict__, const T, const int); + +template +__host__ __device__ T fma2(T, T, T); + +__device__ int modulo(int, int); + +__device__ double atomicAdd(double *, double); +__device__ float atomicMin(float *, float); + +double deg2rad(double); + +void cudaMemoryUsage(); + +/**************************/ +/* TEMPLATE SHARED MEMORY */ +/**************************/ +// --- Credit to the simpleTemplates CUDA sample +template +struct SharedMemory +{ + // Ensure that we won't compile any un-specialized types + __device__ T *getPointer() + { + extern __device__ void error(void); + error(); + return NULL; + } +}; + +// Following are the specializations for the following types. +// int, uint, char, uchar, short, ushort, long, ulong, bool, float, and double +// One could also specialize it for user-defined types. 
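+// Illustrative usage sketch (not part of the original sample): a kernel that is
+// templated on its element type can obtain its dynamically sized shared memory
+// through these specializations, instead of declaring
+// "extern __shared__ T sdata[]" directly, which the compiler rejects once the
+// same kernel is instantiated for more than one element type:
+//
+//     template <typename T>
+//     __global__ void copyViaShared(const T *in, T *out)
+//     {
+//         SharedMemory<T> shared;
+//         T *sdata = shared.getPointer();  // dynamic __shared__ buffer, sized by
+//                                          // the third <<<...>>> launch argument
+//         sdata[threadIdx.x] = in[threadIdx.x];
+//         __syncthreads();
+//         out[threadIdx.x] = sdata[threadIdx.x];
+//     }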
+ +template <> +struct SharedMemory +{ + __device__ int *getPointer() + { + extern __shared__ int s_int[]; + return s_int; + } +}; + +template <> +struct SharedMemory +{ + __device__ unsigned int *getPointer() + { + extern __shared__ unsigned int s_uint[]; + return s_uint; + } +}; + +template <> +struct SharedMemory +{ + __device__ char *getPointer() + { + extern __shared__ char s_char[]; + return s_char; + } +}; + +template <> +struct SharedMemory +{ + __device__ unsigned char *getPointer() + { + extern __shared__ unsigned char s_uchar[]; + return s_uchar; + } +}; + +template <> +struct SharedMemory +{ + __device__ short *getPointer() + { + extern __shared__ short s_short[]; + return s_short; + } +}; + +template <> +struct SharedMemory +{ + __device__ unsigned short *getPointer() + { + extern __shared__ unsigned short s_ushort[]; + return s_ushort; + } +}; + +template <> +struct SharedMemory +{ + __device__ long *getPointer() + { + extern __shared__ long s_long[]; + return s_long; + } +}; + +template <> +struct SharedMemory +{ + __device__ unsigned long *getPointer() + { + extern __shared__ unsigned long s_ulong[]; + return s_ulong; + } +}; + +template <> +struct SharedMemory +{ + __device__ bool *getPointer() + { + extern __shared__ bool s_bool[]; + return s_bool; + } +}; + +template <> +struct SharedMemory +{ + __device__ float *getPointer() + { + extern __shared__ float s_float[]; + return s_float; + } +}; + +template <> +struct SharedMemory +{ + __device__ double *getPointer() + { + extern __shared__ double s_double[]; + return s_double; + } +}; + +#endif \ No newline at end of file diff --git a/test/test.py b/test/test.py new file mode 100644 index 00000000..1a467be5 --- /dev/null +++ b/test/test.py @@ -0,0 +1,18 @@ +import pycuda.autoinit # noqa: F401 +import pycuda.driver as cuda + +import numpy as np + +g = cuda.Graph() + +shape = (32, 8) +a = cuda.pagelocked_zeros(shape, dtype=np.float32) +a[:] = np.random.randn(*shape) + +a_gpu = cuda.mem_alloc(a.nbytes) + +import pycuda +ctx = pycuda.tools.make_default_context() + +g.add_memcpy_node([],ctx,a,a_gpu,0,0,0) + diff --git a/test/test_add.cu b/test/test_add.cu new file mode 100644 index 00000000..35758911 --- /dev/null +++ b/test/test_add.cu @@ -0,0 +1,412 @@ +/* +* Author: Gregory Gutmann +* Simple demonstration of CUDA graphs using the vector add code from Visual Studio's default CUDA project as a starting point. 
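+* (Descriptive note, inferred from the code below: the graph path builds the work
+* graph explicitly with cudaGraphCreate, cudaGraphAddMemcpyNode and
+* cudaGraphAddKernelNode, instantiates it once via cudaGraphInstantiate, and then
+* replays it with cudaGraphLaunch + cudaStreamSynchronize inside the timing loop,
+* so per-iteration launch overhead is amortized relative to addWithCuda.)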
+*/ + +#include "cuda_runtime.h" +#include "device_launch_parameters.h" +#include "helper_cuda.h" +#include +#include +#include +#include + +#define LOOP_COUNT 100 +#define VERBOSE 0 + +typedef std::chrono::high_resolution_clock Clock; + +cudaError_t addWithCuda(int *c, const int *a, const int *b, unsigned int size, int loopCount, int64_t *tElapsed); +cudaError_t addWithCudaGraph(int *c, const int *a, const int *b, unsigned int size, int loopCount, int64_t * tElapsedGraph); + +int originalTest(); +int extendedTest(); + +__global__ void addKernel(int *c, const int *a, const int *b, int n) +{ + int gtid = blockIdx.x * blockDim.x + threadIdx.x; + c[gtid] = a[gtid] + b[gtid]; +} + +int main() +{ + if (originalTest()) { + fprintf(stderr, "originalTest failed!"); + return 1; + } + + if (extendedTest()) { + fprintf(stderr, "extendedTest failed!"); + return 1; + } + + // Print warning, assumes code ran successfully + printf("\nWARNING: If loop count is low these timings may be skewed by GPU warmup time\n"); + + return 0; +} + +int originalTest() { + const int arraySize = 5; + + const int a[arraySize] = { 1, 2, 3, 4, 5 }; + const int b[arraySize] = { 10, 20, 30, 40, 50 }; + int c[arraySize] = { 0 }; + + int64_t tElapsed = 0; + int64_t tElapsedGraph = 0; + + // Add vectors in parallel. + printf("\nRunning addWithCuda: arraySize = %d\n", arraySize); + + cudaError_t cudaStatus = addWithCuda(c, a, b, arraySize, LOOP_COUNT, &tElapsed); + if (cudaStatus != cudaSuccess) { + fprintf(stderr, "addWithCuda failed!"); + return 1; + } + + if (VERBOSE) + printf("{1,2,3,4,5} + {10,20,30,40,50} = {%d,%d,%d,%d,%d}\n", + c[0], c[1], c[2], c[3], c[4]); + + // Add vectors in parallel with graph. + printf("\nRunning addWithCudaGraph: arraySize = %d\n", arraySize); + + cudaStatus = addWithCudaGraph(c, a, b, arraySize, LOOP_COUNT, &tElapsedGraph); + if (cudaStatus != cudaSuccess) { + fprintf(stderr, "addWithCudaGraph failed!"); + return 1; + } + + if (VERBOSE) + printf("{1,2,3,4,5} + {10,20,30,40,50} = {%d,%d,%d,%d,%d}\n", + c[0], c[1], c[2], c[3], c[4]); + + printf("\n\n", + (tElapsed > tElapsedGraph) ? "reduced" : "increased", + abs(tElapsed - tElapsedGraph), tElapsed / (double)tElapsedGraph); + + printf("\n----------------------------------------------------------------------------\n"); + + // cudaDeviceReset must be called before exiting in order for profiling and + // tracing tools such as Nsight and Visual Profiler to show complete traces. + cudaStatus = cudaDeviceReset(); + if (cudaStatus != cudaSuccess) { + fprintf(stderr, "cudaDeviceReset failed!"); + return 1; + } + + return 0; +} + +int extendedTest() { + const int arraySize = 1 << 18; + + int *a; + int *b; + int *c; + + int64_t tElapsed = 0; + int64_t tElapsedGraph = 0; + + // Greater Host <-> Device memory copy performance (normally use sparingly) + cudaMallocHost((void**)&a, arraySize * sizeof(int)); + cudaMallocHost((void**)&b, arraySize * sizeof(int)); + cudaMallocHost((void**)&c, arraySize * sizeof(int)); + + int i = 0; +#pragma omp parallel for + for (i = 0; i < arraySize; ++i) { + a[i] = rand() % 20; + b[i] = rand() % 20; + c[i] = 0; + } + + // Add vectors in parallel. 
+ printf("\nRunning addWithCuda: arraySize = %d\n", arraySize); + + cudaError_t cudaStatus = addWithCuda(c, a, b, arraySize, LOOP_COUNT, &tElapsed); + if (cudaStatus != cudaSuccess) { + fprintf(stderr, "addWithCuda failed!"); + return 1; + } + + if (VERBOSE) + printf("{%d,%d,%d,%d,%d,...} + {%d,%d,%d,%d,%d,...} = {%d,%d,%d,%d,%d,...}\n", + a[0], a[1], a[2], a[3], a[4], + b[0], b[1], b[2], b[3], b[4], + c[0], c[1], c[2], c[3], c[4]); + + // Add vectors in parallel with graph. + printf("\nRunning addWithCudaGraph: arraySize = %d\n", arraySize); + + cudaStatus = addWithCudaGraph(c, a, b, arraySize, LOOP_COUNT, &tElapsedGraph); + if (cudaStatus != cudaSuccess) { + fprintf(stderr, "addWithCudaGraph failed!"); + return 1; + } + + if (VERBOSE) + printf("{%d,%d,%d,%d,%d,...} + {%d,%d,%d,%d,%d,...} = {%d,%d,%d,%d,%d,...}\n", + a[0], a[1], a[2], a[3], a[4], + b[0], b[1], b[2], b[3], b[4], + c[0], c[1], c[2], c[3], c[4]); + + printf("\n\n", + (tElapsed > tElapsedGraph) ? "reduced" : "increased", + abs(tElapsed - tElapsedGraph), tElapsed / (double)tElapsedGraph); + + // Free CUDA host memory + cudaFree(a); + cudaFree(b); + cudaFree(c); + + printf("\n----------------------------------------------------------------------------\n"); + + // cudaDeviceReset must be called before exiting in order for profiling and + // tracing tools such as Nsight and Visual Profiler to show complete traces. + cudaStatus = cudaDeviceReset(); + if (cudaStatus != cudaSuccess) { + fprintf(stderr, "cudaDeviceReset failed!"); + return 1; + } + + return 0; +} + +// Helper function for using CUDA to add vectors in parallel. +cudaError_t addWithCuda(int *c, const int *a, const int *b, unsigned int size, int loopCount, int64_t* tElapsed) +{ + int *dev_a = 0; + int *dev_b = 0; + int *dev_c = 0; + cudaError_t cudaStatus; + + int threads = 256; + int blocks = (size + threads - 1) / threads; + + // Pre-declare timers for reducing warnings related to the goto statements + std::chrono::system_clock::time_point t1; + std::chrono::system_clock::time_point t2; + int64_t us_elapsed = 0; + + // Choose which GPU to run on, change this on a multi-GPU system. Then allocate GPU memory. + { + cudaStatus = cudaSetDevice(0); + if (cudaStatus != cudaSuccess) { + fprintf(stderr, "cudaSetDevice failed! Do you have a CUDA-capable GPU installed?"); + } + + cudaStatus = cudaMalloc((void**)&dev_c, size * sizeof(int)); + if (cudaStatus != cudaSuccess) { + fprintf(stderr, "cudaMalloc failed!"); + } + + cudaStatus = cudaMalloc((void**)&dev_a, size * sizeof(int)); + if (cudaStatus != cudaSuccess) { + fprintf(stderr, "cudaMalloc failed!"); + } + + cudaStatus = cudaMalloc((void**)&dev_b, size * sizeof(int)); + if (cudaStatus != cudaSuccess) { + fprintf(stderr, "cudaMalloc failed!"); + } + } + + t1 = Clock::now(); + for (int i = 0; i < loopCount; ++i) { + // Copy input vectors from host memory to GPU buffers. + cudaStatus = cudaMemcpy(dev_a, a, size * sizeof(int), cudaMemcpyHostToDevice); + if (cudaStatus != cudaSuccess) { + fprintf(stderr, "cudaMemcpy failed!"); + goto Error; + } + + cudaStatus = cudaMemcpy(dev_b, b, size * sizeof(int), cudaMemcpyHostToDevice); + if (cudaStatus != cudaSuccess) { + fprintf(stderr, "cudaMemcpy failed!"); + goto Error; + } + + // Launch a kernel on the GPU with one thread for each element. 
+ addKernel << > > (dev_c, dev_a, dev_b, size); + + // Check for any errors launching the kernel + cudaStatus = cudaGetLastError(); + if (cudaStatus != cudaSuccess) { + fprintf(stderr, "addKernel launch failed: %s\n", cudaGetErrorString(cudaStatus)); + goto Error; + } + + // cudaDeviceSynchronize waits for the kernel to finish, and returns + // any errors encountered during the launch. + // NOTE: Below in the graph implementation this sync is included via graph dependencies + cudaStatus = cudaDeviceSynchronize(); + if (cudaStatus != cudaSuccess) { + fprintf(stderr, "cudaDeviceSynchronize returned error code %d after launching addKernel!\n", cudaStatus); + goto Error; + } + + // Copy output vector from GPU buffer to host memory. + cudaStatus = cudaMemcpy(c, dev_c, size * sizeof(int), cudaMemcpyDeviceToHost); + if (cudaStatus != cudaSuccess) { + fprintf(stderr, "cudaMemcpy failed!"); + goto Error; + } + } + t2 = Clock::now(); + us_elapsed = (int64_t)(t2 - t1).count() / 1000; + printf("Looped %d time(s) in %lld microseconds\n", loopCount, us_elapsed); + *tElapsed = us_elapsed; + +Error: + cudaFree(dev_c); + cudaFree(dev_a); + cudaFree(dev_b); + + return cudaStatus; +} + +cudaError_t addWithCudaGraph(int* c, const int* a, const int* b, unsigned int size, int loopCount, int64_t* tElapsedGraph) +{ + // Original + int* dev_a = 0; + int* dev_b = 0; + int* dev_c = 0; + cudaError_t cudaStatus; + + int threads = 256; + int blocks = (size + threads - 1) / threads; + + // For Graph + cudaStream_t streamForGraph; + cudaGraph_t graph; + std::vector nodeDependencies; + cudaGraphNode_t memcpyNode, kernelNode; + cudaKernelNodeParams kernelNodeParams = { 0 }; + cudaMemcpy3DParms memcpyParams = { 0 }; + + // Choose which GPU to run on, change this on a multi-GPU system. Then allocate GPU memory. + { + cudaStatus = cudaSetDevice(0); + if (cudaStatus != cudaSuccess) { + fprintf(stderr, "cudaSetDevice failed! 
Do you have a CUDA-capable GPU installed?"); + } + + cudaStatus = cudaMalloc((void**)&dev_c, size * sizeof(int)); + if (cudaStatus != cudaSuccess) { + fprintf(stderr, "cudaMalloc failed!"); + } + + cudaStatus = cudaMalloc((void**)&dev_a, size * sizeof(int)); + if (cudaStatus != cudaSuccess) { + fprintf(stderr, "cudaMalloc failed!"); + } + + cudaStatus = cudaMalloc((void**)&dev_b, size * sizeof(int)); + if (cudaStatus != cudaSuccess) { + fprintf(stderr, "cudaMalloc failed!"); + } + } + + // Start of Graph Creation + + checkCudaErrors(cudaGraphCreate(&graph, 0)); + checkCudaErrors(cudaStreamCreateWithFlags(&streamForGraph, cudaStreamNonBlocking)); + + // Add memcpy nodes for copying input vectors from host memory to GPU buffers + + memset(&memcpyParams, 0, sizeof(memcpyParams)); + + memcpyParams.srcArray = NULL; + memcpyParams.srcPos = make_cudaPos(0, 0, 0); + memcpyParams.srcPtr = make_cudaPitchedPtr((void*)a, size * sizeof(int), size, 1); + memcpyParams.dstArray = NULL; + memcpyParams.dstPos = make_cudaPos(0, 0, 0); + memcpyParams.dstPtr = make_cudaPitchedPtr(dev_a, size * sizeof(float), size, 1); + memcpyParams.extent = make_cudaExtent(size * sizeof(float), 1, 1); + memcpyParams.kind = cudaMemcpyHostToDevice; + + checkCudaErrors(cudaGraphAddMemcpyNode(&memcpyNode, graph, NULL, 0, &memcpyParams)); + nodeDependencies.push_back(memcpyNode); + + memset(&memcpyParams, 0, sizeof(memcpyParams)); + + memcpyParams.srcArray = NULL; + memcpyParams.srcPos = make_cudaPos(0, 0, 0); + memcpyParams.srcPtr = make_cudaPitchedPtr((void*)b, size * sizeof(int), size, 1); + memcpyParams.dstArray = NULL; + memcpyParams.dstPos = make_cudaPos(0, 0, 0); + memcpyParams.dstPtr = make_cudaPitchedPtr(dev_b, size * sizeof(float), size, 1); + memcpyParams.extent = make_cudaExtent(size * sizeof(float), 1, 1); + memcpyParams.kind = cudaMemcpyHostToDevice; + + checkCudaErrors(cudaGraphAddMemcpyNode(&memcpyNode, graph, NULL, 0, &memcpyParams)); + nodeDependencies.push_back(memcpyNode); + + // Add a kernel node for launching a kernel on the GPU + + memset(&kernelNodeParams, 0, sizeof(kernelNodeParams)); + + kernelNodeParams.func = (void*)addKernel; + kernelNodeParams.gridDim = dim3(blocks, 1, 1); + kernelNodeParams.blockDim = dim3(threads, 1, 1); + kernelNodeParams.sharedMemBytes = 0; + void* kernelArgs[4] = { (void*)&dev_c, (void*)&dev_a, (void*)&dev_b, &size }; + kernelNodeParams.kernelParams = kernelArgs; + kernelNodeParams.extra = NULL; + + checkCudaErrors(cudaGraphAddKernelNode(&kernelNode, graph, nodeDependencies.data(), nodeDependencies.size(), &kernelNodeParams)); + nodeDependencies.clear(); + nodeDependencies.push_back(kernelNode); + + // Add memcpy node for copying output vector from GPU buffers to host memory + + memset(&memcpyParams, 0, sizeof(memcpyParams)); + + memcpyParams.srcArray = NULL; + memcpyParams.srcPos = make_cudaPos(0, 0, 0); + memcpyParams.srcPtr = make_cudaPitchedPtr(dev_c, size * sizeof(int), size, 1); + memcpyParams.dstArray = NULL; + memcpyParams.dstPos = make_cudaPos(0, 0, 0); + memcpyParams.dstPtr = make_cudaPitchedPtr(c, size * sizeof(int), size, 1); + memcpyParams.extent = make_cudaExtent(size * sizeof(int), 1, 1); + memcpyParams.kind = cudaMemcpyDeviceToHost; + checkCudaErrors(cudaGraphAddMemcpyNode(&memcpyNode, graph, nodeDependencies.data(), nodeDependencies.size(), &memcpyParams)); + + if (VERBOSE) { + cudaGraphNode_t* nodes = NULL; + size_t numNodes = 0; + checkCudaErrors(cudaGraphGetNodes(graph, nodes, &numNodes)); + printf("Num of nodes in the graph created manually = %zu\n", 
numNodes); + } + + // Create an executable graph from a graph + + cudaGraphExec_t graphExec; + checkCudaErrors(cudaGraphInstantiate(&graphExec, graph, NULL, NULL, 0)); + + // Run the graph + + auto t1 = Clock::now(); + for (int i = 0; i < loopCount; ++i) { + checkCudaErrors(cudaGraphLaunch(graphExec, streamForGraph)); + checkCudaErrors(cudaStreamSynchronize(streamForGraph)); + } + auto t2 = Clock::now(); + int64_t us_elapsed = (int64_t)(t2 - t1).count() / 1000; + printf("Looped %d time(s) in %lld microseconds\n", loopCount, us_elapsed); + *tElapsedGraph = us_elapsed; + + // Clean up + + checkCudaErrors(cudaGraphExecDestroy(graphExec)); + checkCudaErrors(cudaGraphDestroy(graph)); + checkCudaErrors(cudaStreamDestroy(streamForGraph)); + + cudaFree(dev_c); + cudaFree(dev_a); + cudaFree(dev_b); + + return cudaStatus; +} \ No newline at end of file -- GitLab From 0bb45c006dcfa31ad3bc7eb284b6c6b8168e6d24 Mon Sep 17 00:00:00 2001 From: mit kotak Date: Fri, 18 Mar 2022 19:18:47 -0500 Subject: [PATCH 63/66] removed extra tests --- test/InputOutput.cuh | 21 --- test/TimingGPU.cuh | 31 ---- test/Utilities.cuh | 193 -------------------- test/test.py | 18 -- test/test_add.cu | 412 ------------------------------------------- 5 files changed, 675 deletions(-) delete mode 100644 test/InputOutput.cuh delete mode 100644 test/TimingGPU.cuh delete mode 100644 test/Utilities.cuh delete mode 100644 test/test.py delete mode 100644 test/test_add.cu diff --git a/test/InputOutput.cuh b/test/InputOutput.cuh deleted file mode 100644 index 5abb4eca..00000000 --- a/test/InputOutput.cuh +++ /dev/null @@ -1,21 +0,0 @@ -#ifndef INPUTOUTPUT_CUH -#define INPUTOUTPUT_CUH - -template -void saveGPUrealtxt(const T *, const char *, const int); - -template -void saveCPUrealtxt(const T *, const char *, const int); - -template -void saveGPUcomplextxt(const T *, const char *, const int); - -void saveGPUcomplextxt(const double2 *, const char *, const int); - -template -T * loadCPUrealtxt(const char *, T * __restrict__, const int); - -template -T * loadGPUrealtxt(const char *, T * __restrict__, const int); - -#endif \ No newline at end of file diff --git a/test/TimingGPU.cuh b/test/TimingGPU.cuh deleted file mode 100644 index 408949df..00000000 --- a/test/TimingGPU.cuh +++ /dev/null @@ -1,31 +0,0 @@ -#ifndef __TIMINGGPU_CUH__ -#define __TIMINGGPU_CUH__ - -/**************/ -/* TIMING GPU */ -/**************/ - -// Events are a part of CUDA API and provide a system independent way to measure execution times on CUDA devices with approximately 0.5 -// microsecond precision. 
- -struct PrivateTimingGPU; - -class TimingGPU -{ -private: - PrivateTimingGPU *privateTimingGPU; - -public: - - TimingGPU(); - - ~TimingGPU(); - - void StartCounter(); - void StartCounterFlags(); - - float GetCounter(); - -}; // TimingCPU class - -#endif \ No newline at end of file diff --git a/test/Utilities.cuh b/test/Utilities.cuh deleted file mode 100644 index 3b3623e6..00000000 --- a/test/Utilities.cuh +++ /dev/null @@ -1,193 +0,0 @@ -#ifndef UTILITIES_CUH -#define UTILITIES_CUH - -#include -#include -#include -#include -#include - -//#include - -//extern "C" int iDivUp(int, int); -__host__ __device__ int iDivUp(int, int); -extern "C" void gpuErrchk(cudaError_t); -extern "C" void cusolveSafeCall(cusolverStatus_t); -extern "C" void cublasSafeCall(cublasStatus_t); -extern "C" void cufftSafeCall(cufftResult err); -extern "C" void cusparseSafeCall(cusparseStatus_t err); -extern "C" void curandSafeCall(curandStatus_t err); - -template -void reverseArray(const T * __restrict__, T * __restrict__, const int, const T a = static_cast(1)); - -//template -//thrust::pair Cartesian2Polar(const T * __restrict__ d_x, const T * __restrict__ d_y, const int N, const T a = static_cast(1)); -// -//template -//thrust::pair h_Cartesian2Polar(const T * __restrict__ d_x, const T * __restrict__ d_y, const int N, const T a = static_cast(1)); - -template -T h_l2_norm(T *v1, T *v2, const int N); - -template -void linearCombination(const T * __restrict__, const T * __restrict__, T * __restrict__, const int, const int, const cublasHandle_t); - -void linearCombination(const float * __restrict__, const float * __restrict__, float * __restrict__, - const int, const int, const cublasHandle_t); - -void linearCombination(const double * __restrict__, const double * __restrict__, double * __restrict__, - const int, const int, const cublasHandle_t); - -template -void vectorAddConstant(T * __restrict__, const T, const int); - -template -void vectorMulConstant(T * __restrict__, const T, const int); - -template -void h_vectorMulConstant(T * __restrict__, const T, const int); - -template -__host__ __device__ T fma2(T, T, T); - -__device__ int modulo(int, int); - -__device__ double atomicAdd(double *, double); -__device__ float atomicMin(float *, float); - -double deg2rad(double); - -void cudaMemoryUsage(); - -/**************************/ -/* TEMPLATE SHARED MEMORY */ -/**************************/ -// --- Credit to the simpleTemplates CUDA sample -template -struct SharedMemory -{ - // Ensure that we won't compile any un-specialized types - __device__ T *getPointer() - { - extern __device__ void error(void); - error(); - return NULL; - } -}; - -// Following are the specializations for the following types. -// int, uint, char, uchar, short, ushort, long, ulong, bool, float, and double -// One could also specialize it for user-defined types. 
- -template <> -struct SharedMemory -{ - __device__ int *getPointer() - { - extern __shared__ int s_int[]; - return s_int; - } -}; - -template <> -struct SharedMemory -{ - __device__ unsigned int *getPointer() - { - extern __shared__ unsigned int s_uint[]; - return s_uint; - } -}; - -template <> -struct SharedMemory -{ - __device__ char *getPointer() - { - extern __shared__ char s_char[]; - return s_char; - } -}; - -template <> -struct SharedMemory -{ - __device__ unsigned char *getPointer() - { - extern __shared__ unsigned char s_uchar[]; - return s_uchar; - } -}; - -template <> -struct SharedMemory -{ - __device__ short *getPointer() - { - extern __shared__ short s_short[]; - return s_short; - } -}; - -template <> -struct SharedMemory -{ - __device__ unsigned short *getPointer() - { - extern __shared__ unsigned short s_ushort[]; - return s_ushort; - } -}; - -template <> -struct SharedMemory -{ - __device__ long *getPointer() - { - extern __shared__ long s_long[]; - return s_long; - } -}; - -template <> -struct SharedMemory -{ - __device__ unsigned long *getPointer() - { - extern __shared__ unsigned long s_ulong[]; - return s_ulong; - } -}; - -template <> -struct SharedMemory -{ - __device__ bool *getPointer() - { - extern __shared__ bool s_bool[]; - return s_bool; - } -}; - -template <> -struct SharedMemory -{ - __device__ float *getPointer() - { - extern __shared__ float s_float[]; - return s_float; - } -}; - -template <> -struct SharedMemory -{ - __device__ double *getPointer() - { - extern __shared__ double s_double[]; - return s_double; - } -}; - -#endif \ No newline at end of file diff --git a/test/test.py b/test/test.py deleted file mode 100644 index 1a467be5..00000000 --- a/test/test.py +++ /dev/null @@ -1,18 +0,0 @@ -import pycuda.autoinit # noqa: F401 -import pycuda.driver as cuda - -import numpy as np - -g = cuda.Graph() - -shape = (32, 8) -a = cuda.pagelocked_zeros(shape, dtype=np.float32) -a[:] = np.random.randn(*shape) - -a_gpu = cuda.mem_alloc(a.nbytes) - -import pycuda -ctx = pycuda.tools.make_default_context() - -g.add_memcpy_node([],ctx,a,a_gpu,0,0,0) - diff --git a/test/test_add.cu b/test/test_add.cu deleted file mode 100644 index 35758911..00000000 --- a/test/test_add.cu +++ /dev/null @@ -1,412 +0,0 @@ -/* -* Author: Gregory Gutmann -* Simple demonstration of CUDA graphs using the vector add code from Visual Studio's default CUDA project as a starting point. 
-*/ - -#include "cuda_runtime.h" -#include "device_launch_parameters.h" -#include "helper_cuda.h" -#include -#include -#include -#include - -#define LOOP_COUNT 100 -#define VERBOSE 0 - -typedef std::chrono::high_resolution_clock Clock; - -cudaError_t addWithCuda(int *c, const int *a, const int *b, unsigned int size, int loopCount, int64_t *tElapsed); -cudaError_t addWithCudaGraph(int *c, const int *a, const int *b, unsigned int size, int loopCount, int64_t * tElapsedGraph); - -int originalTest(); -int extendedTest(); - -__global__ void addKernel(int *c, const int *a, const int *b, int n) -{ - int gtid = blockIdx.x * blockDim.x + threadIdx.x; - c[gtid] = a[gtid] + b[gtid]; -} - -int main() -{ - if (originalTest()) { - fprintf(stderr, "originalTest failed!"); - return 1; - } - - if (extendedTest()) { - fprintf(stderr, "extendedTest failed!"); - return 1; - } - - // Print warning, assumes code ran successfully - printf("\nWARNING: If loop count is low these timings may be skewed by GPU warmup time\n"); - - return 0; -} - -int originalTest() { - const int arraySize = 5; - - const int a[arraySize] = { 1, 2, 3, 4, 5 }; - const int b[arraySize] = { 10, 20, 30, 40, 50 }; - int c[arraySize] = { 0 }; - - int64_t tElapsed = 0; - int64_t tElapsedGraph = 0; - - // Add vectors in parallel. - printf("\nRunning addWithCuda: arraySize = %d\n", arraySize); - - cudaError_t cudaStatus = addWithCuda(c, a, b, arraySize, LOOP_COUNT, &tElapsed); - if (cudaStatus != cudaSuccess) { - fprintf(stderr, "addWithCuda failed!"); - return 1; - } - - if (VERBOSE) - printf("{1,2,3,4,5} + {10,20,30,40,50} = {%d,%d,%d,%d,%d}\n", - c[0], c[1], c[2], c[3], c[4]); - - // Add vectors in parallel with graph. - printf("\nRunning addWithCudaGraph: arraySize = %d\n", arraySize); - - cudaStatus = addWithCudaGraph(c, a, b, arraySize, LOOP_COUNT, &tElapsedGraph); - if (cudaStatus != cudaSuccess) { - fprintf(stderr, "addWithCudaGraph failed!"); - return 1; - } - - if (VERBOSE) - printf("{1,2,3,4,5} + {10,20,30,40,50} = {%d,%d,%d,%d,%d}\n", - c[0], c[1], c[2], c[3], c[4]); - - printf("\n\n", - (tElapsed > tElapsedGraph) ? "reduced" : "increased", - abs(tElapsed - tElapsedGraph), tElapsed / (double)tElapsedGraph); - - printf("\n----------------------------------------------------------------------------\n"); - - // cudaDeviceReset must be called before exiting in order for profiling and - // tracing tools such as Nsight and Visual Profiler to show complete traces. - cudaStatus = cudaDeviceReset(); - if (cudaStatus != cudaSuccess) { - fprintf(stderr, "cudaDeviceReset failed!"); - return 1; - } - - return 0; -} - -int extendedTest() { - const int arraySize = 1 << 18; - - int *a; - int *b; - int *c; - - int64_t tElapsed = 0; - int64_t tElapsedGraph = 0; - - // Greater Host <-> Device memory copy performance (normally use sparingly) - cudaMallocHost((void**)&a, arraySize * sizeof(int)); - cudaMallocHost((void**)&b, arraySize * sizeof(int)); - cudaMallocHost((void**)&c, arraySize * sizeof(int)); - - int i = 0; -#pragma omp parallel for - for (i = 0; i < arraySize; ++i) { - a[i] = rand() % 20; - b[i] = rand() % 20; - c[i] = 0; - } - - // Add vectors in parallel. 
- printf("\nRunning addWithCuda: arraySize = %d\n", arraySize); - - cudaError_t cudaStatus = addWithCuda(c, a, b, arraySize, LOOP_COUNT, &tElapsed); - if (cudaStatus != cudaSuccess) { - fprintf(stderr, "addWithCuda failed!"); - return 1; - } - - if (VERBOSE) - printf("{%d,%d,%d,%d,%d,...} + {%d,%d,%d,%d,%d,...} = {%d,%d,%d,%d,%d,...}\n", - a[0], a[1], a[2], a[3], a[4], - b[0], b[1], b[2], b[3], b[4], - c[0], c[1], c[2], c[3], c[4]); - - // Add vectors in parallel with graph. - printf("\nRunning addWithCudaGraph: arraySize = %d\n", arraySize); - - cudaStatus = addWithCudaGraph(c, a, b, arraySize, LOOP_COUNT, &tElapsedGraph); - if (cudaStatus != cudaSuccess) { - fprintf(stderr, "addWithCudaGraph failed!"); - return 1; - } - - if (VERBOSE) - printf("{%d,%d,%d,%d,%d,...} + {%d,%d,%d,%d,%d,...} = {%d,%d,%d,%d,%d,...}\n", - a[0], a[1], a[2], a[3], a[4], - b[0], b[1], b[2], b[3], b[4], - c[0], c[1], c[2], c[3], c[4]); - - printf("\n\n", - (tElapsed > tElapsedGraph) ? "reduced" : "increased", - abs(tElapsed - tElapsedGraph), tElapsed / (double)tElapsedGraph); - - // Free CUDA host memory - cudaFree(a); - cudaFree(b); - cudaFree(c); - - printf("\n----------------------------------------------------------------------------\n"); - - // cudaDeviceReset must be called before exiting in order for profiling and - // tracing tools such as Nsight and Visual Profiler to show complete traces. - cudaStatus = cudaDeviceReset(); - if (cudaStatus != cudaSuccess) { - fprintf(stderr, "cudaDeviceReset failed!"); - return 1; - } - - return 0; -} - -// Helper function for using CUDA to add vectors in parallel. -cudaError_t addWithCuda(int *c, const int *a, const int *b, unsigned int size, int loopCount, int64_t* tElapsed) -{ - int *dev_a = 0; - int *dev_b = 0; - int *dev_c = 0; - cudaError_t cudaStatus; - - int threads = 256; - int blocks = (size + threads - 1) / threads; - - // Pre-declare timers for reducing warnings related to the goto statements - std::chrono::system_clock::time_point t1; - std::chrono::system_clock::time_point t2; - int64_t us_elapsed = 0; - - // Choose which GPU to run on, change this on a multi-GPU system. Then allocate GPU memory. - { - cudaStatus = cudaSetDevice(0); - if (cudaStatus != cudaSuccess) { - fprintf(stderr, "cudaSetDevice failed! Do you have a CUDA-capable GPU installed?"); - } - - cudaStatus = cudaMalloc((void**)&dev_c, size * sizeof(int)); - if (cudaStatus != cudaSuccess) { - fprintf(stderr, "cudaMalloc failed!"); - } - - cudaStatus = cudaMalloc((void**)&dev_a, size * sizeof(int)); - if (cudaStatus != cudaSuccess) { - fprintf(stderr, "cudaMalloc failed!"); - } - - cudaStatus = cudaMalloc((void**)&dev_b, size * sizeof(int)); - if (cudaStatus != cudaSuccess) { - fprintf(stderr, "cudaMalloc failed!"); - } - } - - t1 = Clock::now(); - for (int i = 0; i < loopCount; ++i) { - // Copy input vectors from host memory to GPU buffers. - cudaStatus = cudaMemcpy(dev_a, a, size * sizeof(int), cudaMemcpyHostToDevice); - if (cudaStatus != cudaSuccess) { - fprintf(stderr, "cudaMemcpy failed!"); - goto Error; - } - - cudaStatus = cudaMemcpy(dev_b, b, size * sizeof(int), cudaMemcpyHostToDevice); - if (cudaStatus != cudaSuccess) { - fprintf(stderr, "cudaMemcpy failed!"); - goto Error; - } - - // Launch a kernel on the GPU with one thread for each element. 
- addKernel << > > (dev_c, dev_a, dev_b, size); - - // Check for any errors launching the kernel - cudaStatus = cudaGetLastError(); - if (cudaStatus != cudaSuccess) { - fprintf(stderr, "addKernel launch failed: %s\n", cudaGetErrorString(cudaStatus)); - goto Error; - } - - // cudaDeviceSynchronize waits for the kernel to finish, and returns - // any errors encountered during the launch. - // NOTE: Below in the graph implementation this sync is included via graph dependencies - cudaStatus = cudaDeviceSynchronize(); - if (cudaStatus != cudaSuccess) { - fprintf(stderr, "cudaDeviceSynchronize returned error code %d after launching addKernel!\n", cudaStatus); - goto Error; - } - - // Copy output vector from GPU buffer to host memory. - cudaStatus = cudaMemcpy(c, dev_c, size * sizeof(int), cudaMemcpyDeviceToHost); - if (cudaStatus != cudaSuccess) { - fprintf(stderr, "cudaMemcpy failed!"); - goto Error; - } - } - t2 = Clock::now(); - us_elapsed = (int64_t)(t2 - t1).count() / 1000; - printf("Looped %d time(s) in %lld microseconds\n", loopCount, us_elapsed); - *tElapsed = us_elapsed; - -Error: - cudaFree(dev_c); - cudaFree(dev_a); - cudaFree(dev_b); - - return cudaStatus; -} - -cudaError_t addWithCudaGraph(int* c, const int* a, const int* b, unsigned int size, int loopCount, int64_t* tElapsedGraph) -{ - // Original - int* dev_a = 0; - int* dev_b = 0; - int* dev_c = 0; - cudaError_t cudaStatus; - - int threads = 256; - int blocks = (size + threads - 1) / threads; - - // For Graph - cudaStream_t streamForGraph; - cudaGraph_t graph; - std::vector nodeDependencies; - cudaGraphNode_t memcpyNode, kernelNode; - cudaKernelNodeParams kernelNodeParams = { 0 }; - cudaMemcpy3DParms memcpyParams = { 0 }; - - // Choose which GPU to run on, change this on a multi-GPU system. Then allocate GPU memory. - { - cudaStatus = cudaSetDevice(0); - if (cudaStatus != cudaSuccess) { - fprintf(stderr, "cudaSetDevice failed! 
Do you have a CUDA-capable GPU installed?"); - } - - cudaStatus = cudaMalloc((void**)&dev_c, size * sizeof(int)); - if (cudaStatus != cudaSuccess) { - fprintf(stderr, "cudaMalloc failed!"); - } - - cudaStatus = cudaMalloc((void**)&dev_a, size * sizeof(int)); - if (cudaStatus != cudaSuccess) { - fprintf(stderr, "cudaMalloc failed!"); - } - - cudaStatus = cudaMalloc((void**)&dev_b, size * sizeof(int)); - if (cudaStatus != cudaSuccess) { - fprintf(stderr, "cudaMalloc failed!"); - } - } - - // Start of Graph Creation - - checkCudaErrors(cudaGraphCreate(&graph, 0)); - checkCudaErrors(cudaStreamCreateWithFlags(&streamForGraph, cudaStreamNonBlocking)); - - // Add memcpy nodes for copying input vectors from host memory to GPU buffers - - memset(&memcpyParams, 0, sizeof(memcpyParams)); - - memcpyParams.srcArray = NULL; - memcpyParams.srcPos = make_cudaPos(0, 0, 0); - memcpyParams.srcPtr = make_cudaPitchedPtr((void*)a, size * sizeof(int), size, 1); - memcpyParams.dstArray = NULL; - memcpyParams.dstPos = make_cudaPos(0, 0, 0); - memcpyParams.dstPtr = make_cudaPitchedPtr(dev_a, size * sizeof(float), size, 1); - memcpyParams.extent = make_cudaExtent(size * sizeof(float), 1, 1); - memcpyParams.kind = cudaMemcpyHostToDevice; - - checkCudaErrors(cudaGraphAddMemcpyNode(&memcpyNode, graph, NULL, 0, &memcpyParams)); - nodeDependencies.push_back(memcpyNode); - - memset(&memcpyParams, 0, sizeof(memcpyParams)); - - memcpyParams.srcArray = NULL; - memcpyParams.srcPos = make_cudaPos(0, 0, 0); - memcpyParams.srcPtr = make_cudaPitchedPtr((void*)b, size * sizeof(int), size, 1); - memcpyParams.dstArray = NULL; - memcpyParams.dstPos = make_cudaPos(0, 0, 0); - memcpyParams.dstPtr = make_cudaPitchedPtr(dev_b, size * sizeof(float), size, 1); - memcpyParams.extent = make_cudaExtent(size * sizeof(float), 1, 1); - memcpyParams.kind = cudaMemcpyHostToDevice; - - checkCudaErrors(cudaGraphAddMemcpyNode(&memcpyNode, graph, NULL, 0, &memcpyParams)); - nodeDependencies.push_back(memcpyNode); - - // Add a kernel node for launching a kernel on the GPU - - memset(&kernelNodeParams, 0, sizeof(kernelNodeParams)); - - kernelNodeParams.func = (void*)addKernel; - kernelNodeParams.gridDim = dim3(blocks, 1, 1); - kernelNodeParams.blockDim = dim3(threads, 1, 1); - kernelNodeParams.sharedMemBytes = 0; - void* kernelArgs[4] = { (void*)&dev_c, (void*)&dev_a, (void*)&dev_b, &size }; - kernelNodeParams.kernelParams = kernelArgs; - kernelNodeParams.extra = NULL; - - checkCudaErrors(cudaGraphAddKernelNode(&kernelNode, graph, nodeDependencies.data(), nodeDependencies.size(), &kernelNodeParams)); - nodeDependencies.clear(); - nodeDependencies.push_back(kernelNode); - - // Add memcpy node for copying output vector from GPU buffers to host memory - - memset(&memcpyParams, 0, sizeof(memcpyParams)); - - memcpyParams.srcArray = NULL; - memcpyParams.srcPos = make_cudaPos(0, 0, 0); - memcpyParams.srcPtr = make_cudaPitchedPtr(dev_c, size * sizeof(int), size, 1); - memcpyParams.dstArray = NULL; - memcpyParams.dstPos = make_cudaPos(0, 0, 0); - memcpyParams.dstPtr = make_cudaPitchedPtr(c, size * sizeof(int), size, 1); - memcpyParams.extent = make_cudaExtent(size * sizeof(int), 1, 1); - memcpyParams.kind = cudaMemcpyDeviceToHost; - checkCudaErrors(cudaGraphAddMemcpyNode(&memcpyNode, graph, nodeDependencies.data(), nodeDependencies.size(), &memcpyParams)); - - if (VERBOSE) { - cudaGraphNode_t* nodes = NULL; - size_t numNodes = 0; - checkCudaErrors(cudaGraphGetNodes(graph, nodes, &numNodes)); - printf("Num of nodes in the graph created manually = %zu\n", 
numNodes); - } - - // Create an executable graph from a graph - - cudaGraphExec_t graphExec; - checkCudaErrors(cudaGraphInstantiate(&graphExec, graph, NULL, NULL, 0)); - - // Run the graph - - auto t1 = Clock::now(); - for (int i = 0; i < loopCount; ++i) { - checkCudaErrors(cudaGraphLaunch(graphExec, streamForGraph)); - checkCudaErrors(cudaStreamSynchronize(streamForGraph)); - } - auto t2 = Clock::now(); - int64_t us_elapsed = (int64_t)(t2 - t1).count() / 1000; - printf("Looped %d time(s) in %lld microseconds\n", loopCount, us_elapsed); - *tElapsedGraph = us_elapsed; - - // Clean up - - checkCudaErrors(cudaGraphExecDestroy(graphExec)); - checkCudaErrors(cudaGraphDestroy(graph)); - checkCudaErrors(cudaStreamDestroy(streamForGraph)); - - cudaFree(dev_c); - cudaFree(dev_a); - cudaFree(dev_b); - - return cudaStatus; -} \ No newline at end of file -- GitLab From 786f8050af5ff1a5b5301a5c82ddee4c3721299b Mon Sep 17 00:00:00 2001 From: Mit Kotak Date: Sat, 11 Jun 2022 19:04:49 -0500 Subject: [PATCH 64/66] added np.float64 as default value for zeros --- pycuda/gpuarray.py | 2 +- test/test_gpuarray.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/pycuda/gpuarray.py b/pycuda/gpuarray.py index 36a39dc3..be6582a3 100644 --- a/pycuda/gpuarray.py +++ b/pycuda/gpuarray.py @@ -1240,7 +1240,7 @@ def to_gpu_async(ary, allocator=drv.mem_alloc, stream=None): empty = GPUArray -def zeros(shape, dtype, allocator=drv.mem_alloc, order="C"): +def zeros(shape, dtype=np.float64, allocator=drv.mem_alloc, order="C"): """Returns an array of the given shape and dtype filled with 0's.""" result = GPUArray(shape, dtype, allocator, order=order) zero = np.zeros((), dtype) diff --git a/test/test_gpuarray.py b/test/test_gpuarray.py index 73ec3ade..530698b9 100644 --- a/test/test_gpuarray.py +++ b/test/test_gpuarray.py @@ -461,7 +461,7 @@ class TestGPUArray: ] ): - a_gpu = gpuarray.zeros((50000,), dtype=np.float32) + a_gpu = gpuarray.zeros((50000,)) a_cpu = np.zeros(a_gpu.shape, a_gpu.dtype) a_cpu[slc] = 7 -- GitLab From 1367f583ef05c87ecb4fcc172f9fc6286baaa3c4 Mon Sep 17 00:00:00 2001 From: Mit Kotak Date: Sat, 11 Jun 2022 19:39:24 -0500 Subject: [PATCH 65/66] enabled simple broadcasting for addition --- pycuda/elementwise.py | 11 +++++++++-- pycuda/gpuarray.py | 34 ++++++++++++++++++++++++++++------ test/test_gpuarray.py | 12 ++++++++++++ 3 files changed, 49 insertions(+), 8 deletions(-) diff --git a/pycuda/elementwise.py b/pycuda/elementwise.py index 4e8601f0..1c83a7ce 100644 --- a/pycuda/elementwise.py +++ b/pycuda/elementwise.py @@ -464,7 +464,14 @@ def get_linear_combination_kernel(summand_descriptors, dtype_z): @context_dependent_memoize -def get_axpbyz_kernel(dtype_x, dtype_y, dtype_z): +def get_axpbyz_kernel(dtype_x, dtype_y, dtype_z, + x_is_scalar=False, y_is_scalar=False): + out_t = dtype_to_ctype(dtype_z) + x = "x[0]" if x_is_scalar else "x[i]" + ax = f"a*(({out_t}) {x})" + y = "y[0]" if y_is_scalar else "y[i]" + by = f"b*(({out_t}) {y})" + result = f"{ax} + {by}" return get_elwise_kernel( "%(tp_x)s a, %(tp_x)s *x, %(tp_y)s b, %(tp_y)s *y, %(tp_z)s *z" % { @@ -472,7 +479,7 @@ def get_axpbyz_kernel(dtype_x, dtype_y, dtype_z): "tp_y": dtype_to_ctype(dtype_y), "tp_z": dtype_to_ctype(dtype_z), }, - "z[i] = a*x[i] + b*y[i]", + "z[i] = %s" % result, "axpbyz", ) diff --git a/pycuda/gpuarray.py b/pycuda/gpuarray.py index be6582a3..9ea807e6 100644 --- a/pycuda/gpuarray.py +++ b/pycuda/gpuarray.py @@ -25,6 +25,18 @@ def _get_common_dtype(obj1, obj2): return _get_common_dtype_base(obj1, obj2, 
has_double_support()) +def _get_broadcasted_binary_op_result(obj1, obj2, + dtype_getter=_get_common_dtype): + + if obj1.shape == obj2.shape: + return obj1._new_like_me(dtype_getter(obj1, obj2)) + elif obj1.shape == () or obj1.shape == (1,): + return obj2._new_like_me(dtype_getter(obj1, obj2)) + elif obj2.shape == () or obj2.shape == (1,): + return obj1._new_like_me(dtype_getter(obj1, obj2)) + else: + raise NotImplementedError("Broadcasting binary operator with shapes:" + f" {obj1.shape}, {obj2.shape}.") # {{{ vector types @@ -391,13 +403,22 @@ class GPUArray: def _axpbyz(self, selffac, other, otherfac, out, add_timer=None, stream=None): """Compute ``out = selffac * self + otherfac*other``, where `other` is a vector..""" - assert self.shape == other.shape if not self.flags.forc or not other.flags.forc: raise RuntimeError( "only contiguous arrays may " "be used as arguments to this operation" ) - - func = elementwise.get_axpbyz_kernel(self.dtype, other.dtype, out.dtype) + self_shape = self.shape + other_shape = other.shape + out_shape = out.shape + assert ((self_shape == other_shape == out_shape) + or ((self_shape == () or self_shape == (1,)) and other_shape == out_shape) + or ((other_shape == () or other_shape == (1,)) and self_shape == out_shape)) + if (self.size == 0) or (other.size == 0): + return out + func = elementwise.get_axpbyz_kernel( + self.dtype, other.dtype, out.dtype, + x_is_scalar=(self_shape == () or self_shape == (1,)), + y_is_scalar=(other_shape == () or other_shape == (1,))) if add_timer is not None: add_timer( @@ -537,16 +558,17 @@ class GPUArray: if isinstance(other, GPUArray): # add another vector - result = self._new_like_me(_get_common_dtype(self, other)) + result = _get_broadcasted_binary_op_result(self, other) return self._axpbyz(1, other, 1, result) - else: + elif np.isscalar(other): # add a scalar if other == 0: return self.copy() else: result = self._new_like_me(_get_common_dtype(self, other)) return self._axpbz(1, other, result) - + else: + return NotImplemented __radd__ = __add__ def __sub__(self, other): diff --git a/test/test_gpuarray.py b/test/test_gpuarray.py index 530698b9..1725d73d 100644 --- a/test/test_gpuarray.py +++ b/test/test_gpuarray.py @@ -133,9 +133,21 @@ class TestGPUArray: a = np.array([1, 2, 3, 4, 5, 6, 7, 8, 9, 10]).astype(np.float32) a_gpu = gpuarray.to_gpu(a) + b = np.array([1]).astype(np.float32) + b_gpu = gpuarray.to_gpu(b) + a_empty = np.array([]).astype(np.float32) + a_empty_gpu = gpuarray.to_gpu(a_empty) a_added = (a_gpu + a_gpu).get() + a_added_scalar = (a_gpu + 1).get() + a_empty_pl_b = (a_empty_gpu + b_gpu).get() + a_gpu_pl_b_gpu = (a_gpu + b_gpu).get() + a_added_empty = (a_empty_gpu + a_empty_gpu).get() assert (a + a == a_added).all() + assert (a + 1 == a_added_scalar).all() + assert (a_empty + a_empty == a_added_empty).all() + assert (a_empty + b == a_empty_pl_b).all() + assert (a + b == a_gpu_pl_b_gpu).all() @mark_cuda_test def test_iaddition_array(self): -- GitLab From 4b60a872552dd3753698703f206f2ab5ba3dc14c Mon Sep 17 00:00:00 2001 From: Mit Kotak Date: Sat, 11 Jun 2022 19:47:53 -0500 Subject: [PATCH 66/66] added reverse broadcast test --- test/test_gpuarray.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/test/test_gpuarray.py b/test/test_gpuarray.py index 1725d73d..1581cbbd 100644 --- a/test/test_gpuarray.py +++ b/test/test_gpuarray.py @@ -139,14 +139,18 @@ class TestGPUArray: a_empty_gpu = gpuarray.to_gpu(a_empty) a_added = (a_gpu + a_gpu).get() a_added_scalar = (a_gpu + 1).get() + scalar_added_a = (1 + a_gpu).get() 
a_empty_pl_b = (a_empty_gpu + b_gpu).get() + b_pl_a_empty = (b_gpu + a_empty_gpu).get() a_gpu_pl_b_gpu = (a_gpu + b_gpu).get() a_added_empty = (a_empty_gpu + a_empty_gpu).get() assert (a + a == a_added).all() assert (a + 1 == a_added_scalar).all() + assert (1 + a == scalar_added_a).all() assert (a_empty + a_empty == a_added_empty).all() assert (a_empty + b == a_empty_pl_b).all() + assert (b + a_empty == b_pl_a_empty).all() assert (a + b == a_gpu_pl_b_gpu).all() @mark_cuda_test -- GitLab
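Usage sketch (illustrative, not taken from any of the patches above): the snippet
below exercises the behaviour the last three commits introduce -- gpuarray.zeros()
falling back to np.float64 when no dtype is passed, and elementwise addition
broadcasting a shape-(1,) GPUArray or a Python scalar against a longer array. It
assumes these patches are applied, a double-precision-capable device, and a context
created via pycuda.autoinit; the variable names are made up for the example.

    import numpy as np
    import pycuda.autoinit  # noqa: F401 -- creates a CUDA context
    import pycuda.gpuarray as gpuarray

    # zeros() now defaults to np.float64 when no dtype is given (PATCH 64/66).
    z = gpuarray.zeros((4,))
    assert z.dtype == np.float64

    a_gpu = gpuarray.to_gpu(np.arange(10, dtype=np.float32))
    b_gpu = gpuarray.to_gpu(np.array([1], dtype=np.float32))  # shape (1,)

    # A shape-(1,) operand broadcasts against the longer array (PATCH 65/66) ...
    np.testing.assert_allclose((a_gpu + b_gpu).get(), a_gpu.get() + 1)
    # ... and Python scalars keep working on either side, since __radd__ = __add__
    # (the reverse cases are what PATCH 66/66 adds tests for).
    np.testing.assert_allclose((a_gpu + 1).get(), a_gpu.get() + 1)
    np.testing.assert_allclose((1 + a_gpu).get(), a_gpu.get() + 1)

Note on the design choice: a shape-() or shape-(1,) GPUArray operand is still handed
to the kernel as a device pointer; get_axpbyz_kernel only switches the generated
indexing from x[i]/y[i] to x[0]/y[0], so no separate scalar-broadcast kernel is
needed, while Python scalars continue to take the existing _axpbz path.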