From 95e75d2201de1e750b26cde6da6175bac19bea44 Mon Sep 17 00:00:00 2001 From: Kaushik Kulkarni Date: Sat, 8 Jan 2022 02:13:43 +0000 Subject: [PATCH 01/61] Rpow scalar --- pycuda/elementwise.py | 39 +++++++++++++++++++++++++++++++++++++++ pycuda/gpuarray.py | 21 +++++++++++++++++++++ test/test_gpuarray.py | 15 +++++++++++++++ 3 files changed, 75 insertions(+) diff --git a/pycuda/elementwise.py b/pycuda/elementwise.py index 258bae5c..607e9f9c 100644 --- a/pycuda/elementwise.py +++ b/pycuda/elementwise.py @@ -666,6 +666,45 @@ def get_pow_array_kernel(dtype_x, dtype_y, dtype_z): ) +@context_dependent_memoize +def get_rpow_kernel(dtype_x, dtype_y, dtype_z, is_base_array, is_exp_array): + """ + Returns the kernel for the operation: ``z = x ** y`` + """ + if np.float64 in [dtype_x, dtype_y]: + func = "pow" + else: + func = "powf" + + if not is_base_array and is_exp_array: + x_ctype = "%(tp_x)s x" + y_ctype = "%(tp_y)s *y" + func = "%s(x,y[i])" % func + + elif is_base_array and is_exp_array: + x_ctype = "%(tp_x)s *x" + y_ctype = "%(tp_y)s *y" + func = "%s(x[i],y[i])" % func + + elif is_base_array and not is_exp_array: + x_ctype = "%(tp_x)s *x" + y_ctype = "%(tp_y)s y" + func = "%s(x[i],y)" % func + + else: + raise AssertionError + + return get_elwise_kernel( + ("%(tp_z)s *z, " + x_ctype + ", "+y_ctype) + % { + "tp_x": dtype_to_ctype(dtype_x), + "tp_y": dtype_to_ctype(dtype_y), + "tp_z": dtype_to_ctype(dtype_z), + }, + "z[i] = %s" % func, + name="pow_method") + + @context_dependent_memoize def get_fmod_kernel(): return get_elwise_kernel( diff --git a/pycuda/gpuarray.py b/pycuda/gpuarray.py index a0bf84c4..5e7a5607 100644 --- a/pycuda/gpuarray.py +++ b/pycuda/gpuarray.py @@ -839,6 +839,27 @@ class GPUArray: """ return self._pow(other, new=False) + def __rpow__(self, other): + common_dtype = _get_common_dtype(self, other) + result = self._new_like_me(common_dtype) + + if not np.isscalar(other): + # Base is a gpuarray => do not cast. + base = other + else: + base = common_dtype.type(other) + + func = elementwise.get_rpow_kernel( + base.dtype, self.dtype, result.dtype, + is_base_array=not np.isscalar(other), is_exp_array=not np.isscalar(self)) + # Evaluates z = x ** y + func.prepared_async_call(self._grid, self._block, None, + result.gpudata, # z + base if np.isscalar(base) else base.gpudata, # x + self.gpudata, # y + self.mem_size) + return result + def reverse(self, stream=None): """Return this array in reversed order. The array is treated as one-dimensional. 
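For illustration, a minimal sketch of what the ``__rpow__`` hunk above enables (assuming ``pycuda.autoinit`` and a CUDA-capable device; the array values and tolerance are arbitrary)::

    import numpy as np
    import pycuda.autoinit  # noqa: F401 -- creates a CUDA context on import
    import pycuda.gpuarray as gpuarray

    a = np.random.rand(10).astype(np.float32)
    a_gpu = gpuarray.to_gpu(a)

    # A host scalar as the base now dispatches to GPUArray.__rpow__, which
    # launches the elementwise z = x ** y kernel with a scalar base.
    b_gpu = 2.0 ** a_gpu
    assert np.allclose(b_gpu.get(), 2.0 ** a, rtol=1e-5)
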
diff --git a/test/test_gpuarray.py b/test/test_gpuarray.py index dbf4b2f7..5234d8a2 100644 --- a/test/test_gpuarray.py +++ b/test/test_gpuarray.py @@ -40,6 +40,21 @@ class TestGPUArray: a_gpu = a_gpu.get() assert (np.abs(a ** 2 - a_gpu) < 1e-3).all() + @mark_cuda_test + def test_rpow_array(self): + scalar = np.random.rand() + a = abs(np.random.rand(10)) + a_gpu = gpuarray.to_gpu(a) + + result = (scalar ** a_gpu).get() + np.testing.assert_allclose(scalar ** a, result) + + result = (a_gpu ** a_gpu).get() + np.testing.assert_allclose(a ** a, result) + + result = (a_gpu ** scalar).get() + np.testing.assert_allclose(a ** scalar, result) + @mark_cuda_test def test_numpy_integer_shape(self): gpuarray.empty(np.int32(17), np.float32) -- GitLab From f8f1c8a6612ad14b55761fd93cee017e4391b5c7 Mon Sep 17 00:00:00 2001 From: mit kotak Date: Mon, 17 Jan 2022 12:56:42 -0600 Subject: [PATCH 02/61] Merged get_rpow_kernel into get_pow_array_kernel + Improved test_pow_array to catch order flips --- pycuda/elementwise.py | 24 +++--------------------- pycuda/gpuarray.py | 6 +++--- test/test_gpuarray.py | 14 ++++++++------ 3 files changed, 14 insertions(+), 30 deletions(-) diff --git a/pycuda/elementwise.py b/pycuda/elementwise.py index 607e9f9c..cbe92953 100644 --- a/pycuda/elementwise.py +++ b/pycuda/elementwise.py @@ -648,26 +648,7 @@ def get_pow_kernel(dtype): @context_dependent_memoize -def get_pow_array_kernel(dtype_x, dtype_y, dtype_z): - if np.float64 in [dtype_x, dtype_y]: - func = "pow" - else: - func = "powf" - - return get_elwise_kernel( - "%(tp_x)s *x, %(tp_y)s *y, %(tp_z)s *z" - % { - "tp_x": dtype_to_ctype(dtype_x), - "tp_y": dtype_to_ctype(dtype_y), - "tp_z": dtype_to_ctype(dtype_z), - }, - "z[i] = %s(x[i], y[i])" % func, - "pow_method", - ) - - -@context_dependent_memoize -def get_rpow_kernel(dtype_x, dtype_y, dtype_z, is_base_array, is_exp_array): +def get_pow_array_kernel(dtype_x, dtype_y, dtype_z, is_base_array, is_exp_array): """ Returns the kernel for the operation: ``z = x ** y`` """ @@ -702,7 +683,8 @@ def get_rpow_kernel(dtype_x, dtype_y, dtype_z, is_base_array, is_exp_array): "tp_z": dtype_to_ctype(dtype_z), }, "z[i] = %s" % func, - name="pow_method") + name="pow_method" + ) @context_dependent_memoize diff --git a/pycuda/gpuarray.py b/pycuda/gpuarray.py index 5e7a5607..ea06a792 100644 --- a/pycuda/gpuarray.py +++ b/pycuda/gpuarray.py @@ -780,16 +780,16 @@ class GPUArray: result = self func = elementwise.get_pow_array_kernel( - self.dtype, other.dtype, result.dtype + self.dtype, other.dtype, result.dtype, True, True ) func.prepared_async_call( self._grid, self._block, None, + result.gpudata, self.gpudata, other.gpudata, - result.gpudata, self.mem_size, ) @@ -849,7 +849,7 @@ class GPUArray: else: base = common_dtype.type(other) - func = elementwise.get_rpow_kernel( + func = elementwise.get_pow_array_kernel( base.dtype, self.dtype, result.dtype, is_base_array=not np.isscalar(other), is_exp_array=not np.isscalar(self)) # Evaluates z = x ** y diff --git a/test/test_gpuarray.py b/test/test_gpuarray.py index 5234d8a2..3e18e912 100644 --- a/test/test_gpuarray.py +++ b/test/test_gpuarray.py @@ -17,16 +17,18 @@ class TestGPUArray: def test_pow_array(self): a = np.array([1, 2, 3, 4, 5]).astype(np.float32) a_gpu = gpuarray.to_gpu(a) + b = np.array([1, 2, 3, 4, 5]).astype(np.float64) + b_gpu = gpuarray.to_gpu(b) - result = pow(a_gpu, a_gpu).get() - assert (np.abs(a ** a - result) < 1e-3).all() + result = pow(a_gpu, b_gpu).get() + assert (np.abs(a ** b - result) < 1e-3).all() - result = (a_gpu ** 
a_gpu).get() - assert (np.abs(pow(a, a) - result) < 1e-3).all() + result = (a_gpu ** b_gpu).get() + assert (np.abs(pow(a, b) - result) < 1e-3).all() - a_gpu **= a_gpu + a_gpu **= b_gpu a_gpu = a_gpu.get() - assert (np.abs(pow(a, a) - a_gpu) < 1e-3).all() + assert (np.abs(pow(a, b) - a_gpu) < 1e-3).all() @mark_cuda_test def test_pow_number(self): -- GitLab From b5c421a9ab696faf9930f08aac41e327e53c8639 Mon Sep 17 00:00:00 2001 From: mit kotak Date: Mon, 17 Jan 2022 13:03:46 -0600 Subject: [PATCH 03/61] Merge branch 'main' of https://gitlab.tiker.net/inducer/pycuda --- pycuda/compyte | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pycuda/compyte b/pycuda/compyte index 165b3aba..71bffa1a 160000 --- a/pycuda/compyte +++ b/pycuda/compyte @@ -1 +1 @@ -Subproject commit 165b3abae63bc39124a342ce1a539adbf6cd8a09 +Subproject commit 71bffa1ae64ed98b9d922c79a6f9cc7eb4fd642f -- GitLab From d38a58aaf1c8eb77a7860e02b680cce766b25a3b Mon Sep 17 00:00:00 2001 From: Andreas Kloeckner Date: Sun, 20 Dec 2020 13:18:58 -0600 Subject: [PATCH 04/61] Drop usage of six --- test/undistributed/elwise-perf.py | 5 ----- test/undistributed/measure_gpuarray_speed.py | 5 ----- test/undistributed/reduction-perf.py | 8 +------- 3 files changed, 1 insertion(+), 17 deletions(-) diff --git a/test/undistributed/elwise-perf.py b/test/undistributed/elwise-perf.py index dc2c2821..8562b99c 100644 --- a/test/undistributed/elwise-perf.py +++ b/test/undistributed/elwise-perf.py @@ -1,12 +1,7 @@ -from __future__ import absolute_import -from __future__ import print_function - -#! /usr/bin/env python import pycuda.driver as drv import pycuda.autoinit import numpy import numpy.linalg as la -from six.moves import range def main(): diff --git a/test/undistributed/measure_gpuarray_speed.py b/test/undistributed/measure_gpuarray_speed.py index 26a20f41..4f6f390e 100755 --- a/test/undistributed/measure_gpuarray_speed.py +++ b/test/undistributed/measure_gpuarray_speed.py @@ -1,13 +1,8 @@ -from __future__ import absolute_import -from __future__ import print_function - #! 
/usr/bin/env python import pycuda.driver as drv import pycuda.autoinit import numpy import numpy.linalg as la -from six.moves import range -from six.moves import zip def main(): diff --git a/test/undistributed/reduction-perf.py b/test/undistributed/reduction-perf.py index 842824cd..da4a0d03 100644 --- a/test/undistributed/reduction-perf.py +++ b/test/undistributed/reduction-perf.py @@ -1,11 +1,7 @@ -from __future__ import division -from __future__ import absolute_import -from __future__ import print_function import pycuda.autoinit import pycuda.gpuarray as gpuarray import pycuda.driver as cuda import numpy -from six.moves import range def main(): @@ -14,8 +10,6 @@ def main(): tbl = Table() tbl.add_row(("type", "size [MiB]", "time [ms]", "mem.bw [GB/s]")) - from random import shuffle - for dtype_out in [numpy.float32, numpy.float64]: for ex in range(15, 27): sz = 1 << ex @@ -28,7 +22,7 @@ def main(): assert sz == a_gpu.shape[0] assert len(a_gpu.shape) == 1 - from pycuda.reduction import get_sum_kernel, get_dot_kernel + from pycuda.reduction import get_dot_kernel krnl = get_dot_kernel(dtype_out, a_gpu.dtype) -- GitLab From 476cb84606c654661f311c4af00381d8e594f4e7 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Andreas=20Kl=C3=B6ckner?= Date: Mon, 4 Jan 2021 19:50:01 +0100 Subject: [PATCH 05/61] Shorten/standardize doc/conf --- doc/source/conf.py | 137 ++------------------------------------------- 1 file changed, 4 insertions(+), 133 deletions(-) diff --git a/doc/source/conf.py b/doc/source/conf.py index 8d54717f..2fb39ed0 100644 --- a/doc/source/conf.py +++ b/doc/source/conf.py @@ -1,26 +1,3 @@ -# PyCUDA documentation build configuration file, created by -# sphinx-quickstart on Fri Jun 13 00:51:19 2008. -# -# This file is execfile()d with the current directory set to its containing dir. -# -# The contents of this file are pickled, so don't put values in the namespace -# that aren't pickleable (module imports are okay, they're removed automatically). -# -# All configuration values have a default value; values that are commented out -# serve to show the default value. - -# import sys, os - -# If your extensions are in another directory, add it here. If the directory -# is relative to the documentation root, use os.path.abspath to make it -# absolute, like shown here. -# sys.path.append(os.path.abspath('some/directory')) - -# General configuration -# --------------------- - -# Add any Sphinx extension module names here, as strings. They can be extensions -# coming with Sphinx (named 'sphinx.ext.*') or your custom ones. extensions = [ "sphinx.ext.intersphinx", "sphinx.ext.mathjax", @@ -38,7 +15,7 @@ master_doc = "index" # General substitutions. project = "PyCUDA" -copyright = "2008, Andreas Kloeckner" +copyright = "2008-20, Andreas Kloeckner" # The default replacements for |version| and |release|, also used in various # other places throughout the built documents. @@ -55,30 +32,6 @@ version = ".".join(str(x) for x in ver_dic["VERSION"]) # The full version, including alpha/beta/rc tags. release = ver_dic["VERSION_TEXT"] -# There are two options for replacing |today|: either, you set today to some -# non-false value, then it is used: -# today = '' -# Else, today_fmt is used as the format for a strftime call. -today_fmt = "%B %d, %Y" - -# List of documents that shouldn't be included in the build. -# unused_docs = [] - -# List of directories, relative to source directories, that shouldn't be searched -# for source files. -# exclude_dirs = [] - -# If true, '()' will be appended to :func: etc. 
cross-reference text. -# add_function_parentheses = True - -# If true, the current module name will be prepended to all description -# unit titles (such as .. function::). -# add_module_names = True - -# If true, sectionauthor and moduleauthor directives will be shown in the -# output. They are ignored by default. -# show_authors = False - # The name of the Pygments (syntax highlighting) style to use. pygments_style = "sphinx" @@ -88,94 +41,12 @@ pygments_style = "sphinx" html_theme = "furo" -html_theme_options = { - } - -# The style sheet to use for HTML and HTML Help pages. A file of that name -# must exist either in Sphinx' static/ path, or in one of the custom paths -# given in html_static_path. -# html_style = 'default.css' - -# The name for this set of Sphinx documents. If None, it defaults to -# " v documentation". -# html_title = None - -# The name of an image file (within the static path) to place at the top of -# the sidebar. -# html_logo = None - -# Add any paths that contain custom static files (such as style sheets) here, -# relative to this directory. They are copied after the builtin static files, -# so a file named "default.css" will overwrite the builtin "default.css". -# html_static_path = ['_static'] - -# If not '', a 'Last updated on:' timestamp is inserted at every page bottom, -# using the given strftime format. -html_last_updated_fmt = "%b %d, %Y" - -# If true, SmartyPants will be used to convert quotes and dashes to -# typographically correct entities. -# html_use_smartypants = True - -# Custom sidebar templates, maps document names to template names. -# html_sidebars = {} - -# Additional templates that should be rendered to pages, maps page names to -# template names. -# html_additional_pages = {} - -# If false, no module index is generated. -# html_use_modindex = True - -# If true, the reST sources are included in the HTML build as _sources/. -# html_copy_source = True - -# If true, an OpenSearch description file will be output, and all pages will -# contain a tag referring to it. The value of this option must be the -# base URL from which the finished HTML is served. -# html_use_opensearch = '' - -# If nonempty, this is the file name suffix for HTML files (e.g. ".xhtml"). -# html_file_suffix = '' - -# Output file base name for HTML help builder. -htmlhelp_basename = "PyCudadoc" - - -# Options for LaTeX output -# ------------------------ - -# The paper size ('letter' or 'a4'). -# latex_paper_size = 'letter' - -# The font size ('10pt', '11pt' or '12pt'). -# latex_font_size = '10pt' - -# Grouping the document tree into LaTeX files. List of tuples -# (source start file, target name, title, author, document class [howto/manual]). -latex_documents = [ - ("index", "pycdua.tex", "PyCUDA Documentation", "Andreas Kloeckner", "manual"), -] - -# The name of an image file (relative to this directory) to place at the top of -# the title page. -# latex_logo = None - -# For "manual" documents, if this is true, then toplevel headings are parts, -# not chapters. -# latex_use_parts = False - -# Additional stuff for the LaTeX preamble. -# latex_preamble = '' - -# Documents to append as an appendix to all manuals. -# latex_appendices = [] - -# If false, no module index is generated. 
-# latex_use_modindex = True intersphinx_mapping = { "https://docs.python.org/3": None, "https://numpy.org/doc/stable/": None, "https://documen.tician.de/codepy/": None, } + +autoclass_content = "class" +autodoc_typehints = "description" -- GitLab From 04c54e2739b40e657725f2160282d67e6e6f9bda Mon Sep 17 00:00:00 2001 From: Andreas Kloeckner Date: Tue, 9 Feb 2021 18:20:38 -0600 Subject: [PATCH 06/61] Check contiguity of arrays in GPUArray.fill (closes gh-265) --- pycuda/gpuarray.py | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/pycuda/gpuarray.py b/pycuda/gpuarray.py index 97630e3b..a1a3f3f3 100644 --- a/pycuda/gpuarray.py +++ b/pycuda/gpuarray.py @@ -627,6 +627,10 @@ class GPUArray: def fill(self, value, stream=None): """fills the array with the specified value""" + if not self.flags.forc: + raise RuntimeError( + "only contiguous arrays may be used as arguments to this operation") + func = elementwise.get_fill_kernel(self.dtype) func.prepared_async_call( self._grid, self._block, stream, value, self.gpudata, self.mem_size @@ -650,8 +654,7 @@ class GPUArray: ): if not self.flags.forc: raise RuntimeError( - "only contiguous arrays may " "be used as arguments to this operation" - ) + "only contiguous arrays may be used as arguments to this operation") if self.dtype == np.float64 and allow_double_hack: if channels != 1: -- GitLab From dca0898b1afc667ce73444edb4a4725bf2a3bae4 Mon Sep 17 00:00:00 2001 From: Andreas Kloeckner Date: Tue, 9 Feb 2021 18:36:41 -0600 Subject: [PATCH 07/61] Fix test_zeros_like_etc to not use fill on non-contig arrays --- test/test_gpuarray.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/test/test_gpuarray.py b/test/test_gpuarray.py index fb6a20fc..fc7b6736 100644 --- a/test/test_gpuarray.py +++ b/test/test_gpuarray.py @@ -1192,6 +1192,9 @@ class TestGPUArray: contig = arr.flags.c_contiguous or arr.flags.f_contiguous + if not contig: + continue + # Output matches order of input. 
# Non-contiguous becomes C-contiguous new_z = func(arr, order="A") -- GitLab From a746dea70ee6bdb2fab7b01c66b49271cfda92c2 Mon Sep 17 00:00:00 2001 From: zzjjbb <31069326+zzjjbb@users.noreply.github.com> Date: Mon, 7 Dec 2020 19:41:45 -0500 Subject: [PATCH 08/61] add "out" parameter to GPUArray.conj(); add equivalent GPUArray.conjugate() method --- pycuda/elementwise.py | 5 +++-- pycuda/gpuarray.py | 11 ++++++++--- test/test_gpuarray.py | 4 ++++ 3 files changed, 15 insertions(+), 5 deletions(-) diff --git a/pycuda/elementwise.py b/pycuda/elementwise.py index 633edd78..258bae5c 100644 --- a/pycuda/elementwise.py +++ b/pycuda/elementwise.py @@ -606,11 +606,12 @@ def get_imag_kernel(dtype, real_dtype): @context_dependent_memoize -def get_conj_kernel(dtype): +def get_conj_kernel(dtype, conj_dtype): return get_elwise_kernel( - "%(tp)s *y, %(tp)s *z" + "%(tp)s *y, %(conj_tp)s *z" % { "tp": dtype_to_ctype(dtype), + "conj_tp": dtype_to_ctype(conj_dtype) }, "z[i] = pycuda::conj(y[i])", "conj", diff --git a/pycuda/gpuarray.py b/pycuda/gpuarray.py index a1a3f3f3..f5908a06 100644 --- a/pycuda/gpuarray.py +++ b/pycuda/gpuarray.py @@ -1141,7 +1141,7 @@ class GPUArray: else: return zeros_like(self) - def conj(self): + def conj(self, out=None): dtype = self.dtype if issubclass(self.dtype.type, np.complexfloating): if not self.flags.forc: @@ -1154,9 +1154,12 @@ class GPUArray: order = "F" else: order = "C" - result = self._new_like_me(order=order) + if out is None: + result = self._new_like_me(order=order) + else: + result = out - func = elementwise.get_conj_kernel(dtype) + func = elementwise.get_conj_kernel(dtype, result.dtype) func.prepared_async_call( self._grid, self._block, @@ -1170,6 +1173,8 @@ class GPUArray: else: return self + conjugate = conj + # }}} # {{{ rich comparisons diff --git a/test/test_gpuarray.py b/test/test_gpuarray.py index fc7b6736..d5d09251 100644 --- a/test/test_gpuarray.py +++ b/test/test_gpuarray.py @@ -732,6 +732,10 @@ class TestGPUArray: assert la.norm(z.get().real - z.real.get()) == 0 assert la.norm(z.get().imag - z.imag.get()) == 0 assert la.norm(z.get().conj() - z.conj().get()) == 0 + # verify conj with out parameter + z_out = z.astype(np.complex64) + assert z_out is z.conj(out=z_out) + assert la.norm(z.get().conj() - z_out.get()) < 1e-7 # verify contiguity is preserved for order in ["C", "F"]: -- GitLab From 684b2d3d0f2bab88ebb13fd59c3b6b77c59985f1 Mon Sep 17 00:00:00 2001 From: Jiabei Zhu Date: Wed, 10 Feb 2021 04:18:37 -0500 Subject: [PATCH 09/61] change doc for conj/conjugate --- doc/source/array.rst | 18 ++++++++++++++++-- 1 file changed, 16 insertions(+), 2 deletions(-) diff --git a/doc/source/array.rst b/doc/source/array.rst index 34efb1cf..3b12f04a 100644 --- a/doc/source/array.rst +++ b/doc/source/array.rst @@ -189,12 +189,26 @@ The :class:`GPUArray` Array Class .. versionadded: 0.94 - .. method :: conj() + .. method :: conj(out=None) - Return the complex conjugate of *self*, or *self* if it is real. + Return the complex conjugate of *self*, or *self* if it is real. If *out* + is not given, a newly allocated :class:`GPUArray` will returned. Use + *out=self* to get conjugate in-place. .. versionadded: 0.94 + .. versionchanged:: 2020.1.1 + + add *out* parameter + + + .. method :: conjugate(out=None) + + alias of :meth:`conj` + + .. versionadded:: 2020.1.1 + + .. method:: bind_to_texref(texref, allow_offset=False) Bind *self* to the :class:`pycuda.driver.TextureReference` *texref*. 
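A short sketch of the *out* parameter documented above (assumes ``pycuda.autoinit`` and a complex-valued input; sizes are arbitrary)::

    import numpy as np
    import pycuda.autoinit  # noqa: F401
    import pycuda.gpuarray as gpuarray

    z = gpuarray.to_gpu(
        (np.random.randn(16) + 1j * np.random.randn(16)).astype(np.complex64))

    z_new = z.conj()       # allocates a fresh GPUArray for the result
    z.conjugate(out=z)     # alias of conj(); conjugates z in place

    assert np.allclose(z_new.get(), z.get())
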
-- GitLab From 6905d772f8f61bf70ddbd535e8b30cb38be462ca Mon Sep 17 00:00:00 2001 From: Bruce Merry Date: Sun, 14 Feb 2021 13:36:28 +0200 Subject: [PATCH 10/61] Add documentation memset_*_async. Closes #266. --- doc/source/driver.rst | 21 +++++++++++++++++++++ 1 file changed, 21 insertions(+) diff --git a/doc/source/driver.rst b/doc/source/driver.rst index 9f9e9bb6..8de1f2d1 100644 --- a/doc/source/driver.rst +++ b/doc/source/driver.rst @@ -1476,6 +1476,8 @@ Initializing Device Memory .. function:: memset_d16(dest, data, count) .. function:: memset_d32(dest, data, count) + Fill array with *data*. + .. note:: *count* is the number of elements, not bytes. @@ -1484,6 +1486,25 @@ Initializing Device Memory .. function:: memset_d2d16(dest, pitch, data, width, height) .. function:: memset_d2d32(dest, pitch, data, width, height) + Fill a two-dimensional array with *data*. + +.. function:: memset_d8_async(dest, data, count, stream=None) +.. function:: memset_d16_async(dest, data, count, stream=None) +.. function:: memset_d32_async(dest, data, count, stream=None) + + Fill array with *data* asynchronously, optionally serialized via *stream*. + + .. versionadded:: 2015.1 + +.. function:: memset_d2d8_async(dest, pitch, data, width, height, stream=None) +.. function:: memset_d2d16_async(dest, pitch, data, width, height, stream=None) +.. function:: memset_d2d32_async(dest, pitch, data, width, height, stream=None) + + Fill a two-dimensional array with *data* asynchronously, optionally + serialized via *stream*. + + .. versionadded:: 2015.1 + Unstructured Memory Transfers ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -- GitLab From 982f4dccff03adb63b6cc0948c486096dd4cfc33 Mon Sep 17 00:00:00 2001 From: Andreas Kloeckner Date: Tue, 23 Feb 2021 16:49:22 -0600 Subject: [PATCH 11/61] Upgrade compyte for sized bool --- pycuda/compyte | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pycuda/compyte b/pycuda/compyte index d1f993da..7533db88 160000 --- a/pycuda/compyte +++ b/pycuda/compyte @@ -1 +1 @@ -Subproject commit d1f993daecc03947d9e6e3e60d2a5145ecbf3786 +Subproject commit 7533db88124045924a47d7392eaf9a078670fc4d -- GitLab From e209c10f935051ef6781cb5713fa78d326988c8b Mon Sep 17 00:00:00 2001 From: Bruce Merry Date: Fri, 5 Mar 2021 09:18:46 +0200 Subject: [PATCH 12/61] Add a pyproject.toml to control build-time numpy version Build against the oldest C_API_VERSION available for the current Python version, to maximise compatibility of the resulting installation. Closes #271. --- pyproject.toml | 13 +++++++++++++ 1 file changed, 13 insertions(+) create mode 100644 pyproject.toml diff --git a/pyproject.toml b/pyproject.toml new file mode 100644 index 00000000..50864b6a --- /dev/null +++ b/pyproject.toml @@ -0,0 +1,13 @@ +[build-system] + +# For each Python version, build against the oldest numpy C_API_VERSION for +# which binary numpy wheels exist, and then the newest version of numpy +# implementing that C_API_VERSION. 
+requires = [ + "setuptools", + "wheel", + "numpy; python_version >= '3.10'", + "numpy==1.19.5; python_version >= '3.8' and python_version < '3.10'", + "numpy==1.15.4; python_version >= '3.7' and python_version < '3.8'", + "numpy==1.12.1; python_version < '3.7'", +] -- GitLab From 4943befa3ab5905519a1b079315a55d918e4a5e7 Mon Sep 17 00:00:00 2001 From: Andreas Kloeckner Date: Mon, 8 Mar 2021 00:37:48 -0600 Subject: [PATCH 13/61] CI/README: master->main --- .github/workflows/ci.yml | 4 ++-- .gitlab-ci.yml | 10 +++++----- 2 files changed, 7 insertions(+), 7 deletions(-) diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index a6295018..9cfa45a6 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -2,7 +2,7 @@ name: CI on: push: branches: - - master + - main pull_request: schedule: - cron: '17 3 * * 0' @@ -19,5 +19,5 @@ jobs: python-version: '3.x' - name: "Main Script" run: | - curl -L -O -k https://gitlab.tiker.net/inducer/ci-support/raw/master/prepare-and-run-flake8.sh + curl -L -O -k https://gitlab.tiker.net/inducer/ci-support/raw/main/prepare-and-run-flake8.sh . ./prepare-and-run-flake8.sh "$(basename $GITHUB_REPOSITORY)" test/*.py diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml index 174d08dc..4f4d93b8 100644 --- a/.gitlab-ci.yml +++ b/.gitlab-ci.yml @@ -3,7 +3,7 @@ Python 3 Titan X: - py_version=3 - EXTRA_INSTALL="numpy mako" - echo "CUDADRV_LIB_DIR = ['/usr/lib/x86_64-linux-gnu/nvidia/current']" > siteconf.py - - curl -L -O -k https://gitlab.tiker.net/inducer/ci-support/raw/master/build-and-test-py-project.sh + - curl -L -O -k https://gitlab.tiker.net/inducer/ci-support/raw/main/build-and-test-py-project.sh - "export LD_LIBRARY_PATH=/usr/local/cuda/lib64:$LD_LIBRARY_PATH" - ". ./build-and-test-py-project.sh" tags: @@ -17,7 +17,7 @@ Python 3 Titan V: - py_version=3 - EXTRA_INSTALL="numpy mako" - echo "CUDADRV_LIB_DIR = ['/usr/lib/x86_64-linux-gnu/nvidia/current']" > siteconf.py - - curl -L -O -k https://gitlab.tiker.net/inducer/ci-support/raw/master/build-and-test-py-project.sh + - curl -L -O -k https://gitlab.tiker.net/inducer/ci-support/raw/main/build-and-test-py-project.sh - "export LD_LIBRARY_PATH=/usr/local/cuda/lib64:$LD_LIBRARY_PATH" - ". ./build-and-test-py-project.sh" tags: @@ -31,7 +31,7 @@ Python 3 K40: - py_version=3 - EXTRA_INSTALL="numpy mako" - echo "CUDADRV_LIB_DIR = ['/usr/lib/x86_64-linux-gnu/nvidia/current']" > siteconf.py - - curl -L -O -k https://gitlab.tiker.net/inducer/ci-support/raw/master/build-and-test-py-project.sh + - curl -L -O -k https://gitlab.tiker.net/inducer/ci-support/raw/main/build-and-test-py-project.sh - "export LD_LIBRARY_PATH=/usr/local/cuda/lib64:$LD_LIBRARY_PATH" - ". ./build-and-test-py-project.sh" @@ -44,7 +44,7 @@ Python 3 K40: Documentation: script: | EXTRA_INSTALL="numpy mako" - curl -L -O -k https://gitlab.tiker.net/inducer/ci-support/raw/master/ci-support.sh + curl -L -O -k https://gitlab.tiker.net/inducer/ci-support/raw/main/ci-support.sh . ./ci-support.sh build_py_project_in_venv build_docs --no-check @@ -56,7 +56,7 @@ Documentation: Flake8: script: - - curl -L -O -k https://gitlab.tiker.net/inducer/ci-support/raw/master/prepare-and-run-flake8.sh + - curl -L -O -k https://gitlab.tiker.net/inducer/ci-support/raw/main/prepare-and-run-flake8.sh - . 
./prepare-and-run-flake8.sh "$CI_PROJECT_NAME" test/*.py tags: - python3 -- GitLab From a558f4bf7293230324fdfb9fc087de3d4811d3b0 Mon Sep 17 00:00:00 2001 From: Graham Markall Date: Fri, 19 Feb 2021 12:58:27 +0000 Subject: [PATCH 14/61] Support the CUDA Array Interface This commit adds support for version 3 of the CUDA Array Interface (https://numba.readthedocs.io/en/latest/cuda/cuda_array_interface.html): - GPU Arrays export the interface so that they can be passed to other supporting libraries. - Function calls check for the presence of the interface and use the data pointer from it if found, so other libraries' arrays can be passed to PyCUDA-compiled kernels. Documentation, tests, and examples for the interface going in both directions are added. --- doc/source/array.rst | 6 ++++ doc/source/tutorial.rst | 29 +++++++++++++++ examples/cai_cupy_arrays.py | 33 +++++++++++++++++ examples/cai_numba.py | 40 +++++++++++++++++++++ pycuda/driver.py | 8 +++++ pycuda/gpuarray.py | 23 ++++++++++++ test/test_driver.py | 71 +++++++++++++++++++++++++++++++++++++ 7 files changed, 210 insertions(+) create mode 100644 examples/cai_cupy_arrays.py create mode 100644 examples/cai_numba.py diff --git a/doc/source/array.rst b/doc/source/array.rst index 3b12f04a..de1cc66e 100644 --- a/doc/source/array.rst +++ b/doc/source/array.rst @@ -80,6 +80,12 @@ The :class:`GPUArray` Array Class .. versionadded: 2011.1 + .. attribute :: __cuda_array_interface__ + + Return a `CUDA Array Interface + `_ + dict describing this array's data. + .. method :: __len__() Returns the size of the leading dimension of *self*. diff --git a/doc/source/tutorial.rst b/doc/source/tutorial.rst index ea6a804f..ba729875 100644 --- a/doc/source/tutorial.rst +++ b/doc/source/tutorial.rst @@ -191,6 +191,35 @@ only the second:: func(numpy.intp(do2_ptr), block = (32, 1, 1), grid=(1, 1)) print("doubled second only", array1, array2, "\n") +Interoperability With Other Libraries Using The CUDA Array Interface +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +Kernel calls can be passed arrays from other CUDA libraries that support the +`CUDA Array Interface +`_. For +example, to double a `CuPy `_ array:: + + import cupy as cp + + cupy_a = cp.random.randn(4, 4).astype(cp.float32) + func = mod.get_function("double_array") + func(cupy_a, block=(4, 4, 1), grid=(1, 1)) + +PyCUDA GPU Arrays implement the CUDA Array Interface, so they can be passed +into functions from other libraries that support it. 
For example, to double a +PyCUDA GPU Array using a `Numba `_ kernel:: + + from numba import cuda + + a_gpu = gpuarray.to_gpu(numpy.random.randn(4, 4).astype(numpy.float32)) + + @cuda.jit + def double(x): + i, j = cuda.grid(2) + x[i, j] *= 2 + + double[(4, 4), (1, 1)](a_gpu) + Where to go from here --------------------- diff --git a/examples/cai_cupy_arrays.py b/examples/cai_cupy_arrays.py new file mode 100644 index 00000000..2c524ba9 --- /dev/null +++ b/examples/cai_cupy_arrays.py @@ -0,0 +1,33 @@ +# Copyright 2008-2021 Andreas Kloeckner +# Copyright 2021 NVIDIA Corporation + +import pycuda.autoinit # noqa +from pycuda.compiler import SourceModule + +import cupy as cp + + +# Create a CuPy array (and a copy for comparison later) +cupy_a = cp.random.randn(4, 4).astype(cp.float32) +original = cupy_a.copy() + + +# Create a kernel +mod = SourceModule(""" + __global__ void doublify(float *a) + { + int idx = threadIdx.x + threadIdx.y*4; + a[idx] *= 2; + } + """) + +func = mod.get_function("doublify") + +# Invoke PyCUDA kernel on a CuPy array +func(cupy_a, block=(4, 4, 1), grid=(1, 1), shared=0) + +# Demonstrate that our CuPy array was modified in place by the PyCUDA kernel +print("original array:") +print(original) +print("doubled with kernel:") +print(cupy_a) diff --git a/examples/cai_numba.py b/examples/cai_numba.py new file mode 100644 index 00000000..05fa317f --- /dev/null +++ b/examples/cai_numba.py @@ -0,0 +1,40 @@ +# Copyright 2008-2021 Andreas Kloeckner +# Copyright 2021 NVIDIA Corporation + +from numba import cuda + +import pycuda.driver as pycuda +import pycuda.autoinit # noqa +import pycuda.gpuarray as gpuarray + +import numpy + + +# Create a PyCUDA gpuarray +a_gpu = gpuarray.to_gpu(numpy.random.randn(4, 4).astype(numpy.float32)) +print("original array:") +print(a_gpu) + +# Retain PyCUDA context as primary and make current so that Numba is happy +pyc_dev = pycuda.autoinit.device +pyc_ctx = pyc_dev.retain_primary_context() +pyc_ctx.push() + + +# A standard Numba kernel that doubles its input array +@cuda.jit +def double(x): + i, j = cuda.grid(2) + + if i < x.shape[0] and j < x.shape[1]: + x[i, j] *= 2 + + +# Call the Numba kernel on the PyCUDA gpuarray, using the CUDA Array Interface +# transparently +double[(4, 4), (1, 1)](a_gpu) +print("doubled with numba:") +print(a_gpu) + +# Pop context to allow PyCUDA to clean up +pyc_ctx.pop() diff --git a/pycuda/driver.py b/pycuda/driver.py index 6bfd097e..ac305129 100644 --- a/pycuda/driver.py +++ b/pycuda/driver.py @@ -1,3 +1,8 @@ +__copyright__ = """ +Copyright 2008-2021 Andreas Kloeckner +Copyright 2021 NVIDIA Corporation +""" + import os import numpy as np @@ -207,6 +212,9 @@ def _add_functionality(): elif isinstance(arg, np.void): arg_data.append(_my_bytes(_memoryview(arg))) format += "%ds" % arg.itemsize + elif hasattr(arg, '__cuda_array_interface__'): + arg_data.append(arg.__cuda_array_interface__['data'][0]) + format += "P" else: try: gpudata = np.uintp(arg.gpudata) diff --git a/pycuda/gpuarray.py b/pycuda/gpuarray.py index f5908a06..febfc330 100644 --- a/pycuda/gpuarray.py +++ b/pycuda/gpuarray.py @@ -1,3 +1,8 @@ +__copyright__ = """ +Copyright 2008-2021 Andreas Kloeckner +Copyright 2021 NVIDIA Corporation +""" + import numpy as np import pycuda.elementwise as elementwise from pytools import memoize, memoize_method @@ -252,6 +257,24 @@ class GPUArray: self._grid, self._block = splay(self.mem_size) + @property + def __cuda_array_interface__(self): + """Returns a CUDA Array Interface dictionary describing this array's + data.""" + if 
self.gpudata is not None: + ptr = int(self.gpudata) + else: + ptr = 0 + + return { + 'shape': self.shape, + 'strides': self.strides, + 'data': (ptr, False), + 'typestr': self.dtype.str, + 'stream': None, + 'version': 3 + } + @property def ndim(self): return len(self.shape) diff --git a/test/test_driver.py b/test/test_driver.py index c720cebf..7c9c18ac 100644 --- a/test/test_driver.py +++ b/test/test_driver.py @@ -1,3 +1,8 @@ +__copyright__ = """ +Copyright 2008-2021 Andreas Kloeckner +Copyright 2021 NVIDIA Corporation +""" + import numpy as np import numpy.linalg as la from pycuda.tools import mark_cuda_test, dtype_to_ctype @@ -144,6 +149,23 @@ class TestDriver: diff = (a_g * b_g).get() - a * b assert la.norm(diff) == 0 + @mark_cuda_test + def test_gpuarray_cai(self): + a = np.zeros(10, dtype=np.float32) + a_g = gpuarray.to_gpu(a) + cai = a_g.__cuda_array_interface__ + ptr = cai['data'][0] + masked = cai['data'][1] + + assert cai['shape'] == a.shape + assert cai['strides'] == a.strides + assert cai['typestr'] == a.dtype.str + assert isinstance(ptr, int) + assert ptr != 0 + assert not masked + assert cai['stream'] is None + assert cai['version'] == 3 + @mark_cuda_test def donottest_cublas_mixing(self): self.test_streamed_kernel() @@ -1054,6 +1076,55 @@ def test_pointer_holder_base(): print(ary.get()) +# A class to emulate an object from outside PyCUDA that implements the CUDA +# Array Interface +class CudaArrayInterfaceImpl: + def __init__(self, size, itemsize, dtype): + self._shape = (size,) + self._strides = (itemsize,) + self._typestr = dtype.str + self._ptr = drv.mem_alloc(size * itemsize) + + @property + def __cuda_array_interface__(self): + return { + 'shape': self._shape, + 'strides': self._strides, + 'typestr': self._typestr, + 'data': (int(self._ptr), False), + 'stream': None, + 'version': 3 + } + + @property + def ptr(self): + return self._ptr + + +def test_pass_cai_array(): + dtype = np.int32 + size = 1024 + np_array = np.arange(size, dtype=dtype) + cai_array = CudaArrayInterfaceImpl(size, np_array.itemsize, np_array.dtype) + + mod = SourceModule( + """ + __global__ void gpu_arange(int *x) + { + const int i = threadIdx.x; + x[i] = i; + } + """ + ) + + gpu_arange = mod.get_function("gpu_arange") + gpu_arange(cai_array, grid=(1,), block=(size, 1, 1)) + + host_array = np.empty_like(np_array) + drv.memcpy_dtoh(host_array, cai_array.ptr) + assert (host_array == np_array).all() + + def test_import_pyopencl_before_pycuda(): try: import pyopencl # noqa -- GitLab From 305f887a76ff3656aaab5ed4418c79a00787a490 Mon Sep 17 00:00:00 2001 From: Graham Markall Date: Thu, 4 Mar 2021 10:43:37 +0000 Subject: [PATCH 15/61] Use appropriate quotes --- pycuda/compyte | 2 +- pycuda/driver.py | 4 ++-- pycuda/gpuarray.py | 12 ++++++------ test/test_driver.py | 26 +++++++++++++------------- 4 files changed, 22 insertions(+), 22 deletions(-) diff --git a/pycuda/compyte b/pycuda/compyte index 7533db88..d1f993da 160000 --- a/pycuda/compyte +++ b/pycuda/compyte @@ -1 +1 @@ -Subproject commit 7533db88124045924a47d7392eaf9a078670fc4d +Subproject commit d1f993daecc03947d9e6e3e60d2a5145ecbf3786 diff --git a/pycuda/driver.py b/pycuda/driver.py index ac305129..77657afe 100644 --- a/pycuda/driver.py +++ b/pycuda/driver.py @@ -212,8 +212,8 @@ def _add_functionality(): elif isinstance(arg, np.void): arg_data.append(_my_bytes(_memoryview(arg))) format += "%ds" % arg.itemsize - elif hasattr(arg, '__cuda_array_interface__'): - arg_data.append(arg.__cuda_array_interface__['data'][0]) + elif hasattr(arg, 
"__cuda_array_interface__"): + arg_data.append(arg.__cuda_array_interface__["data"][0]) format += "P" else: try: diff --git a/pycuda/gpuarray.py b/pycuda/gpuarray.py index febfc330..5d9ccaa5 100644 --- a/pycuda/gpuarray.py +++ b/pycuda/gpuarray.py @@ -267,12 +267,12 @@ class GPUArray: ptr = 0 return { - 'shape': self.shape, - 'strides': self.strides, - 'data': (ptr, False), - 'typestr': self.dtype.str, - 'stream': None, - 'version': 3 + "shape": self.shape, + "strides": self.strides, + "data": (ptr, False), + "typestr": self.dtype.str, + "stream": None, + "version": 3 } @property diff --git a/test/test_driver.py b/test/test_driver.py index 7c9c18ac..b022aa37 100644 --- a/test/test_driver.py +++ b/test/test_driver.py @@ -154,17 +154,17 @@ class TestDriver: a = np.zeros(10, dtype=np.float32) a_g = gpuarray.to_gpu(a) cai = a_g.__cuda_array_interface__ - ptr = cai['data'][0] - masked = cai['data'][1] + ptr = cai["data"][0] + masked = cai["data"][1] - assert cai['shape'] == a.shape - assert cai['strides'] == a.strides - assert cai['typestr'] == a.dtype.str + assert cai["shape"] == a.shape + assert cai["strides"] == a.strides + assert cai["typestr"] == a.dtype.str assert isinstance(ptr, int) assert ptr != 0 assert not masked - assert cai['stream'] is None - assert cai['version'] == 3 + assert cai["stream"] is None + assert cai["version"] == 3 @mark_cuda_test def donottest_cublas_mixing(self): @@ -1088,12 +1088,12 @@ class CudaArrayInterfaceImpl: @property def __cuda_array_interface__(self): return { - 'shape': self._shape, - 'strides': self._strides, - 'typestr': self._typestr, - 'data': (int(self._ptr), False), - 'stream': None, - 'version': 3 + "shape": self._shape, + "strides": self._strides, + "typestr": self._typestr, + "data": (int(self._ptr), False), + "stream": None, + "version": 3 } @property -- GitLab From e9c585ba70c3e2cc528ea218da5e5c5ab3f0d33b Mon Sep 17 00:00:00 2001 From: Graham Markall Date: Fri, 5 Mar 2021 11:51:49 +0000 Subject: [PATCH 16/61] Add reference to GPUArray class docs --- doc/source/tutorial.rst | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/doc/source/tutorial.rst b/doc/source/tutorial.rst index ba729875..681ee37d 100644 --- a/doc/source/tutorial.rst +++ b/doc/source/tutorial.rst @@ -205,9 +205,10 @@ example, to double a `CuPy `_ array:: func = mod.get_function("double_array") func(cupy_a, block=(4, 4, 1), grid=(1, 1)) -PyCUDA GPU Arrays implement the CUDA Array Interface, so they can be passed -into functions from other libraries that support it. For example, to double a -PyCUDA GPU Array using a `Numba `_ kernel:: +:class:`~pycuda.gpuarray.GPUArray` implements the CUDA Array Interface, so its +instances can be passed into functions from other libraries that support it. 
+For example, to double a PyCUDA GPU Array using a `Numba +`_ kernel:: from numba import cuda -- GitLab From 0273af2c5114a33863782ce41f53936f3730203d Mon Sep 17 00:00:00 2001 From: Graham Markall Date: Tue, 23 Feb 2021 16:49:22 -0600 Subject: [PATCH 17/61] Revert accidental update of compyte submodule --- pycuda/compyte | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pycuda/compyte b/pycuda/compyte index d1f993da..7533db88 160000 --- a/pycuda/compyte +++ b/pycuda/compyte @@ -1 +1 @@ -Subproject commit d1f993daecc03947d9e6e3e60d2a5145ecbf3786 +Subproject commit 7533db88124045924a47d7392eaf9a078670fc4d -- GitLab From ab5d0c37819c99000309670c2a7a9f9fdf8801ae Mon Sep 17 00:00:00 2001 From: Graham Markall Date: Fri, 5 Mar 2021 12:08:12 +0000 Subject: [PATCH 18/61] Add a comment explaining readonly flag of CAI --- pycuda/gpuarray.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/pycuda/gpuarray.py b/pycuda/gpuarray.py index 5d9ccaa5..373cf005 100644 --- a/pycuda/gpuarray.py +++ b/pycuda/gpuarray.py @@ -269,6 +269,8 @@ class GPUArray: return { "shape": self.shape, "strides": self.strides, + # data is a tuple: (ptr, readonly) - always export GPUArray + # instances as read-write "data": (ptr, False), "typestr": self.dtype.str, "stream": None, -- GitLab From b4843494be3d733b5e61c0b9d6d9652c00f17f06 Mon Sep 17 00:00:00 2001 From: Graham Markall Date: Tue, 9 Mar 2021 14:27:59 +0000 Subject: [PATCH 19/61] Use getattr instead of hasattr for CUDA Array Interface --- pycuda/driver.py | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/pycuda/driver.py b/pycuda/driver.py index 77657afe..47d15b19 100644 --- a/pycuda/driver.py +++ b/pycuda/driver.py @@ -212,10 +212,13 @@ def _add_functionality(): elif isinstance(arg, np.void): arg_data.append(_my_bytes(_memoryview(arg))) format += "%ds" % arg.itemsize - elif hasattr(arg, "__cuda_array_interface__"): - arg_data.append(arg.__cuda_array_interface__["data"][0]) - format += "P" else: + cai = getattr(arg, "__cuda_array_interface__", None) + if cai: + arg_data.append(cai["data"][0]) + format += "P" + continue + try: gpudata = np.uintp(arg.gpudata) except AttributeError: -- GitLab From 88c7bc371aa506207cc88d581eeaa21925a371cd Mon Sep 17 00:00:00 2001 From: Graham Markall Date: Tue, 9 Mar 2021 14:34:08 +0000 Subject: [PATCH 20/61] Add autoprimaryctx and use in CAI Numba example --- doc/source/util.rst | 6 ++++++ examples/cai_numba.py | 12 +++--------- pycuda/autoprimaryctx.py | 31 +++++++++++++++++++++++++++++++ 3 files changed, 40 insertions(+), 9 deletions(-) create mode 100644 pycuda/autoprimaryctx.py diff --git a/doc/source/util.rst b/doc/source/util.rst index 0cd85d69..44c9a56f 100644 --- a/doc/source/util.rst +++ b/doc/source/util.rst @@ -21,6 +21,12 @@ It uses :func:`pycuda.tools.make_default_context` to create a compute context. on :data:`device`. This context is created by calling :func:`pycuda.tools.make_default_context`. +.. module:: pycuda.autoprimaryctx + +The module :mod:`pycuda.autoprimaryctx` is similar to :mod:`pycuda.autoinit`, +except that it retains the device primary context instead of creating a new +context in :func:`pycuda.tools.make_default_context`. 
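A minimal usage sketch of the module described above (the primary context is what libraries such as Numba expect to operate on, as the example below shows)::

    import numpy as np
    # Drop-in replacement for pycuda.autoinit when the device's *primary*
    # context must be shared with other libraries (e.g. Numba).
    import pycuda.autoprimaryctx
    import pycuda.gpuarray as gpuarray

    a_gpu = gpuarray.to_gpu(np.arange(8, dtype=np.float32))
    print(pycuda.autoprimaryctx.device.name())  # device backing the context
    print((2 * a_gpu).get())
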
+ Choice of Device ---------------- diff --git a/examples/cai_numba.py b/examples/cai_numba.py index 05fa317f..0a94ee48 100644 --- a/examples/cai_numba.py +++ b/examples/cai_numba.py @@ -4,7 +4,9 @@ from numba import cuda import pycuda.driver as pycuda -import pycuda.autoinit # noqa +# We use autoprimaryctx instead of autoinit because Numba can only operate on a +# primary context +import pycuda.autoprimaryctx # noqa import pycuda.gpuarray as gpuarray import numpy @@ -15,11 +17,6 @@ a_gpu = gpuarray.to_gpu(numpy.random.randn(4, 4).astype(numpy.float32)) print("original array:") print(a_gpu) -# Retain PyCUDA context as primary and make current so that Numba is happy -pyc_dev = pycuda.autoinit.device -pyc_ctx = pyc_dev.retain_primary_context() -pyc_ctx.push() - # A standard Numba kernel that doubles its input array @cuda.jit @@ -35,6 +32,3 @@ def double(x): double[(4, 4), (1, 1)](a_gpu) print("doubled with numba:") print(a_gpu) - -# Pop context to allow PyCUDA to clean up -pyc_ctx.pop() diff --git a/pycuda/autoprimaryctx.py b/pycuda/autoprimaryctx.py new file mode 100644 index 00000000..537c8610 --- /dev/null +++ b/pycuda/autoprimaryctx.py @@ -0,0 +1,31 @@ +import pycuda.driver as cuda +import atexit + +# Initialize CUDA +cuda.init() + +from pycuda.tools import make_default_context # noqa: E402 + + +def _retain_primary_context(dev): + context = dev.retain_primary_context() + context.push() + return context + + +global context +context = make_default_context(_retain_primary_context) +device = context.get_device() + + +def _finish_up(): + global context + context.pop() + context = None + + from pycuda.tools import clear_context_caches + + clear_context_caches() + + +atexit.register(_finish_up) -- GitLab From 702af5e80bcdc163649d2a7f98b5bff73d0bc3df Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Andreas=20Kl=C3=B6ckner?= Date: Wed, 10 Mar 2021 23:53:25 -0600 Subject: [PATCH 21/61] Doc tweaks for array interface support --- doc/source/tutorial.rst | 2 +- doc/source/util.rst | 3 ++- 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/doc/source/tutorial.rst b/doc/source/tutorial.rst index 681ee37d..2c401b88 100644 --- a/doc/source/tutorial.rst +++ b/doc/source/tutorial.rst @@ -196,7 +196,7 @@ Interoperability With Other Libraries Using The CUDA Array Interface Kernel calls can be passed arrays from other CUDA libraries that support the `CUDA Array Interface -`_. For +`__. For example, to double a `CuPy `_ array:: import cupy as cp diff --git a/doc/source/util.rst b/doc/source/util.rst index 44c9a56f..f83907d6 100644 --- a/doc/source/util.rst +++ b/doc/source/util.rst @@ -25,7 +25,8 @@ It uses :func:`pycuda.tools.make_default_context` to create a compute context. The module :mod:`pycuda.autoprimaryctx` is similar to :mod:`pycuda.autoinit`, except that it retains the device primary context instead of creating a new -context in :func:`pycuda.tools.make_default_context`. +context in :func:`pycuda.tools.make_default_context`. Notably, it also +has ``device`` and ``context`` attributes. 
Choice of Device ---------------- -- GitLab From e0fd3f1ea954ab7fb305d61b1afcae6dd8a66b38 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Andreas=20Kl=C3=B6ckner?= Date: Wed, 24 Mar 2021 19:27:06 +0000 Subject: [PATCH 22/61] Remove py-version-specific Trove classifiers from setup.py --- setup.py | 4 ---- 1 file changed, 4 deletions(-) diff --git a/setup.py b/setup.py index 2a7acbe3..990a7830 100644 --- a/setup.py +++ b/setup.py @@ -212,10 +212,6 @@ def main(): "Programming Language :: C++", "Programming Language :: Python", "Programming Language :: Python :: 3", - "Programming Language :: Python :: 2.6", - "Programming Language :: Python :: 2.7", - "Programming Language :: Python :: 3.3", - "Programming Language :: Python :: 3.4", "Topic :: Scientific/Engineering", "Topic :: Scientific/Engineering :: Mathematics", "Topic :: Scientific/Engineering :: Physics", -- GitLab From a39ef021e0a6ab79e48aee86ed1779cb1d32f537 Mon Sep 17 00:00:00 2001 From: Andreas Kloeckner Date: Sat, 27 Mar 2021 12:26:44 -0500 Subject: [PATCH 23/61] Bump size parameters to size_t for a few memcpys --- src/cpp/cuda.hpp | 6 +++--- src/wrapper/wrap_cudadrv.cpp | 8 ++++---- 2 files changed, 7 insertions(+), 7 deletions(-) diff --git a/src/cpp/cuda.hpp b/src/cpp/cuda.hpp index e3f8ef24..21cb219c 100644 --- a/src/cpp/cuda.hpp +++ b/src/cpp/cuda.hpp @@ -1624,18 +1624,18 @@ namespace pycuda } inline - void memcpy_dtoa(array const &ary, unsigned int index, CUdeviceptr src, unsigned int len) + void memcpy_dtoa(array const &ary, unsigned int index, CUdeviceptr src, size_t len) { CUDAPP_CALL_GUARDED_THREADED(cuMemcpyDtoA, (ary.handle(), index, src, len)); } inline - void memcpy_atod(CUdeviceptr dst, array const &ary, unsigned int index, unsigned int len) + void memcpy_atod(CUdeviceptr dst, array const &ary, unsigned int index, size_t len) { CUDAPP_CALL_GUARDED_THREADED(cuMemcpyAtoD, (dst, ary.handle(), index, len)); } inline void memcpy_atoa( array const &dst, unsigned int dst_index, array const &src, unsigned int src_index, - unsigned int len) + size_t len) { CUDAPP_CALL_GUARDED_THREADED(cuMemcpyAtoA, (dst.handle(), dst_index, src.handle(), src_index, len)); } // }}} diff --git a/src/wrapper/wrap_cudadrv.cpp b/src/wrapper/wrap_cudadrv.cpp index 7c513a1a..c0fdd99d 100644 --- a/src/wrapper/wrap_cudadrv.cpp +++ b/src/wrapper/wrap_cudadrv.cpp @@ -268,7 +268,7 @@ namespace void py_memcpy_dtod(CUdeviceptr dest, CUdeviceptr src, - unsigned int byte_count) + size_t int byte_count) { CUDAPP_CALL_GUARDED_THREADED(cuMemcpyDtoD, (dest, src, byte_count)); } @@ -276,7 +276,7 @@ namespace #if CUDAPP_CUDA_VERSION >= 3000 void py_memcpy_dtod_async(CUdeviceptr dest, CUdeviceptr src, - unsigned int byte_count, py::object stream_py) + size_t byte_count, py::object stream_py) { PYCUDA_PARSE_STREAM_PY; @@ -287,7 +287,7 @@ namespace #if CUDAPP_CUDA_VERSION >= 4000 void py_memcpy_peer(CUdeviceptr dest, CUdeviceptr src, - unsigned int byte_count, + size_t byte_count, py::object dest_context_py, py::object src_context_py ) { @@ -307,7 +307,7 @@ namespace } void py_memcpy_peer_async(CUdeviceptr dest, CUdeviceptr src, - unsigned int byte_count, + size_t byte_count, py::object dest_context_py, py::object src_context_py, py::object stream_py) { -- GitLab From e1ec8dce352227bb0bc9cd48fe21f0e5ba38280c Mon Sep 17 00:00:00 2001 From: Andreas Kloeckner Date: Sat, 27 Mar 2021 12:40:18 -0500 Subject: [PATCH 24/61] Fix type typo in size_t transition --- src/wrapper/wrap_cudadrv.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git 
a/src/wrapper/wrap_cudadrv.cpp b/src/wrapper/wrap_cudadrv.cpp index c0fdd99d..0be5a0a2 100644 --- a/src/wrapper/wrap_cudadrv.cpp +++ b/src/wrapper/wrap_cudadrv.cpp @@ -268,7 +268,7 @@ namespace void py_memcpy_dtod(CUdeviceptr dest, CUdeviceptr src, - size_t int byte_count) + size_t byte_count) { CUDAPP_CALL_GUARDED_THREADED(cuMemcpyDtoD, (dest, src, byte_count)); } -- GitLab From 048ae7a2558040823136b3e0d470d5adfe8226fc Mon Sep 17 00:00:00 2001 From: Andreas Kloeckner Date: Sat, 27 Mar 2021 12:58:13 -0500 Subject: [PATCH 25/61] Add links to related software --- doc/source/index.rst | 13 +++++++++++++ 1 file changed, 13 insertions(+) diff --git a/doc/source/index.rst b/doc/source/index.rst index e5c0c45c..41ad641a 100644 --- a/doc/source/index.rst +++ b/doc/source/index.rst @@ -79,6 +79,19 @@ the scenes, a lot more interesting stuff is going on: Curious? Let's get started. +Other software that uses/enhances PyCUDA +======================================== + +This list is by definition incomplete! If you know of other software you +feel should be listed here, please submit a PR! + +* `pyvkfft `__ +* `scikit-cuda `__ +* `reikna `__ +* `reikna `__ +* `compyle `__ +* `pynufft `__ + Contents ========= -- GitLab From ab3f5fd6af16487f76b88661650f6e5d159389c8 Mon Sep 17 00:00:00 2001 From: Andreas Kloeckner Date: Wed, 31 Mar 2021 13:44:24 -0500 Subject: [PATCH 26/61] Update mempool to version from PyOpenCL (closes gh-282) --- doc/source/util.rst | 46 ++++++---- src/cpp/mempool.hpp | 186 ++++++++++++++++++++++++++++------------ src/wrapper/mempool.cpp | 24 +++--- 3 files changed, 173 insertions(+), 83 deletions(-) diff --git a/doc/source/util.rst b/doc/source/util.rst index f83907d6..c7998994 100644 --- a/doc/source/util.rst +++ b/doc/source/util.rst @@ -6,18 +6,18 @@ Automatic Initialization .. module:: pycuda.autoinit -The module :mod:`pycuda.autoinit`, when imported, automatically performs +The module :mod:`pycuda.autoinit`, when imported, automatically performs all the steps necessary to get CUDA ready for submission of compute kernels. It uses :func:`pycuda.tools.make_default_context` to create a compute context. .. data:: device An instance of :class:`pycuda.driver.Device` that was used for automatic - initialization. + initialization. .. data:: context - A default-constructed instance of :class:`pycuda.driver.Context` + A default-constructed instance of :class:`pycuda.driver.Context` on :data:`device`. This context is created by calling :func:`pycuda.tools.make_default_context`. @@ -74,7 +74,7 @@ Kernel Caching .. function:: context_dependent_memoize(func) - This decorator caches the result of the decorated function, *if* a + This decorator caches the result of the decorated function, *if* a subsequent occurs in the same :class:`pycuda.driver.Context`. This is useful for caching of kernels. @@ -98,7 +98,7 @@ Device Metadata and Occupancy ----------------------------- .. class:: DeviceData(dev=None) - + Gives access to more information on a device than is available through :meth:`pycuda.driver.Device.get_attribute`. If `dev` is `None`, it defaults to the device returned by :meth:`pycuda.driver.Context.get_device`. @@ -120,7 +120,7 @@ Device Metadata and Occupancy .. method:: align_bytes(word_size=4) - The distance between global memory base addresses that + The distance between global memory base addresses that allow accesses of word-size `word_size` bytes to get coalesced. .. method:: align(bytes, word_size=4) @@ -134,7 +134,7 @@ Device Metadata and Occupancy .. 
method:: align_dtype(elements, dtype_size) - Round up `elements` to the next alignment boundary + Round up `elements` to the next alignment boundary as given by :meth:`align_bytes`, where each element is assumed to be `dtype_size` bytes large. @@ -146,7 +146,7 @@ Device Metadata and Occupancy .. class:: OccupancyRecord(devdata, threads, shared_mem=0, registers=0) - Calculate occupancy for a given kernel workload characterized by + Calculate occupancy for a given kernel workload characterized by * thread count of `threads` * shared memory use of `shared_mem` bytes @@ -183,7 +183,7 @@ fresh memory area is allocated for each intermediate result. Memory pools are a remedy for this problem based on the observation that often many of the block allocations are of the same sizes as previously used ones. -Then, instead of fully returning the memory to the system and incurring the +Then, instead of fully returning the memory to the system and incurring the associated reallocation overhead, the pool holds on to the memory and uses it to satisfy future allocations of similarly-sized blocks. The pool reacts appropriately to out-of-memory conditions as long as all memory allocations @@ -198,8 +198,8 @@ Device-based Memory Pool An object representing a :class:`DeviceMemoryPool`-based allocation of linear device memory. Once this object is deleted, its associated device - memory is freed. - :class:`PooledDeviceAllocation` instances can be cast to :class:`int` + memory is freed. + :class:`PooledDeviceAllocation` instances can be cast to :class:`int` (and :class:`long`), yielding the starting address of the device memory allocated. @@ -213,7 +213,7 @@ Device-based Memory Pool .. class:: DeviceMemoryPool - A memory pool for linear device memory as allocated using + A memory pool for linear device memory as allocated using :func:`pycuda.driver.mem_alloc`. (see :ref:`mempool`) .. attribute:: held_blocks @@ -225,6 +225,20 @@ Device-based Memory Pool The number of blocks in active use that have been allocated through this pool. + .. attribute:: managed_bytes + + "Managed" memory is "active" and "held" memory. + + .. versionadded: 2021.1 + + .. attribute:: active_bytes + + "Active" bytes are bytes under the control of the application. + This may be smaller than the actual allocated size reflected + in :attr:`managed_bytes`. + + .. versionadded: 2021.1 + .. method:: allocate(size) Return a :class:`PooledDeviceAllocation` of *size* bytes. @@ -248,7 +262,7 @@ Memory Pool for pagelocked memory An object representing a :class:`PageLockedMemoryPool`-based allocation of linear device memory. Once this object is deleted, its associated device - memory is freed. + memory is freed. .. method:: free @@ -260,12 +274,12 @@ Memory Pool for pagelocked memory .. class:: PageLockedAllocator(flags=0) - Specifies the set of :class:`pycuda.driver.host_alloc_flags` used in its + Specifies the set of :class:`pycuda.driver.host_alloc_flags` used in its associated :class:`PageLockedMemoryPool`. .. class:: PageLockedMemoryPool(allocator=PageLockedAllocator()) - A memory pool for pagelocked host memory as allocated using + A memory pool for pagelocked host memory as allocated using :func:`pycuda.driver.pagelocked_empty`. (see :ref:`mempool`) .. attribute:: held_blocks @@ -279,7 +293,7 @@ Memory Pool for pagelocked memory .. 
method:: allocate(shape, dtype, order="C") - Return an uninitialized ("empty") :class:`numpy.ndarray` with the given + Return an uninitialized ("empty") :class:`numpy.ndarray` with the given *shape*, *dtype*, and *order*. This array will be backed by a :class:`PooledHostAllocation`, which can be found as the ``.base`` attribute of the array. diff --git a/src/cpp/mempool.hpp b/src/cpp/mempool.hpp index be88f13f..44f0fd64 100644 --- a/src/cpp/mempool.hpp +++ b/src/cpp/mempool.hpp @@ -1,24 +1,61 @@ // Abstract memory pool implementation - - +// +// Copyright (C) 2009-17 Andreas Kloeckner +// +// Permission is hereby granted, free of charge, to any person +// obtaining a copy of this software and associated documentation +// files (the "Software"), to deal in the Software without +// restriction, including without limitation the rights to use, +// copy, modify, merge, publish, distribute, sublicense, and/or sell +// copies of the Software, and to permit persons to whom the +// Software is furnished to do so, subject to the following +// conditions: +// +// The above copyright notice and this permission notice shall be +// included in all copies or substantial portions of the Software. +// +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +// EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES +// OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND +// NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT +// HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, +// WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +// FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR +// OTHER DEALINGS IN THE SOFTWARE. #ifndef _AFJDFJSDFSD_PYGPU_HEADER_SEEN_MEMPOOL_HPP #define _AFJDFJSDFSD_PYGPU_HEADER_SEEN_MEMPOOL_HPP - - -#include -#include -#include +#include +#include +#include +#include +#include +#include #include "bitlog.hpp" - - namespace PYGPU_PACKAGE { + // https://stackoverflow.com/a/44175911 + class mp_noncopyable { + public: + mp_noncopyable() = default; + ~mp_noncopyable() = default; + + private: + mp_noncopyable(const mp_noncopyable&) = delete; + mp_noncopyable& operator=(const mp_noncopyable&) = delete; + }; + +#ifdef PYGPU_PYCUDA +#define PYGPU_SHARED_PTR boost::shared_ptr +#else +#define PYGPU_SHARED_PTR std::shared_ptr +#endif + template inline T signed_left_shift(T x, signed shift_amount) { @@ -43,38 +80,57 @@ namespace PYGPU_PACKAGE +#define always_assert(cond) \ + do { \ + if (!(cond)) \ + throw std::logic_error("mem pool assertion violated: " #cond); \ + } while (false); + + template - class memory_pool + class memory_pool : mp_noncopyable { public: typedef typename Allocator::pointer_type pointer_type; typedef typename Allocator::size_type size_type; private: - typedef boost::uint32_t bin_nr_t; + typedef uint32_t bin_nr_t; typedef std::vector bin_t; - typedef boost::ptr_map container_t; + typedef std::map container_t; container_t m_container; typedef typename container_t::value_type bin_pair_t; - std::auto_ptr m_allocator; + std::unique_ptr m_allocator; // A held block is one that's been released by the application, but that // we are keeping around to dish out again. - unsigned m_held_blocks; + size_type m_held_blocks; // An active block is one that is in use by the application. - unsigned m_active_blocks; + size_type m_active_blocks; + + // "Managed" memory is "active" and "held" memory. + size_type m_managed_bytes; + + // "Active" bytes are bytes under the control of the application. 
+ // This may be smaller than the actual allocated size reflected + // in m_managed_bytes. + size_type m_active_bytes; bool m_stop_holding; int m_trace; + unsigned m_leading_bits_in_bin_id; + public: - memory_pool(Allocator const &alloc=Allocator()) + memory_pool(Allocator const &alloc=Allocator(), unsigned leading_bits_in_bin_id=4) : m_allocator(alloc.copy()), - m_held_blocks(0), m_active_blocks(0), m_stop_holding(false), - m_trace(false) + m_held_blocks(0), m_active_blocks(0), + m_managed_bytes(0), m_active_bytes(0), + m_stop_holding(false), + m_trace(false), m_leading_bits_in_bin_id(leading_bits_in_bin_id) { if (m_allocator->is_deferred()) { @@ -88,17 +144,21 @@ namespace PYGPU_PACKAGE virtual ~memory_pool() { free_held(); } - static const unsigned mantissa_bits = 2; - static const unsigned mantissa_mask = (1 << mantissa_bits) - 1; + private: + unsigned mantissa_mask() const + { + return (1 << m_leading_bits_in_bin_id) - 1; + } - static bin_nr_t bin_number(size_type size) + public: + bin_nr_t bin_number(size_type size) { signed l = bitlog2(size); - size_type shifted = signed_right_shift(size, l-signed(mantissa_bits)); - if (size && (shifted & (1 << mantissa_bits)) == 0) + size_type shifted = signed_right_shift(size, l-signed(m_leading_bits_in_bin_id)); + if (size && (shifted & (1 << m_leading_bits_in_bin_id)) == 0) throw std::runtime_error("memory_pool::bin_number: bitlog2 fault"); - size_type chopped = shifted & mantissa_mask; - return l << mantissa_bits | chopped; + size_type chopped = shifted & mantissa_mask(); + return l << m_leading_bits_in_bin_id | chopped; } void set_trace(bool flag) @@ -109,19 +169,19 @@ namespace PYGPU_PACKAGE --m_trace; } - static size_type alloc_size(bin_nr_t bin) + size_type alloc_size(bin_nr_t bin) { - bin_nr_t exponent = bin >> mantissa_bits; - bin_nr_t mantissa = bin & mantissa_mask; + bin_nr_t exponent = bin >> m_leading_bits_in_bin_id; + bin_nr_t mantissa = bin & mantissa_mask(); - size_type ones = signed_left_shift(1, - signed(exponent)-signed(mantissa_bits) + size_type ones = signed_left_shift((size_type) 1, + signed(exponent)-signed(m_leading_bits_in_bin_id) ); if (ones) ones -= 1; size_type head = signed_left_shift( - (1<second; } else - return *it->second; + return it->second; } void inc_held_blocks() @@ -176,14 +236,15 @@ namespace PYGPU_PACKAGE return pop_block_from_bin(bin, size); } - size_type alloc_sz = alloc_size(bin_nr); + size_type alloc_sz = alloc_size(bin_nr); - assert(bin_number(alloc_sz) == bin_nr); + always_assert(bin_number(alloc_sz) == bin_nr); + always_assert(alloc_sz >= size); if (m_trace) std::cout << "[pool] allocation of size " << size << " required new memory" << std::endl; - try { return get_from_allocator(alloc_sz); } + try { return get_from_allocator(alloc_sz, size); } catch (PYGPU_PACKAGE::error &e) { if (!e.is_out_of_memory()) @@ -202,7 +263,7 @@ namespace PYGPU_PACKAGE while (try_to_free_memory()) { - try { return get_from_allocator(alloc_sz); } + try { return get_from_allocator(alloc_sz, size); } catch (PYGPU_PACKAGE::error &e) { if (!e.is_out_of_memory()) @@ -224,6 +285,7 @@ namespace PYGPU_PACKAGE void free(pointer_type p, size_type size) { --m_active_blocks; + m_active_bytes -= size; bin_nr_t bin_nr = bin_number(size); if (!m_stop_holding) @@ -237,18 +299,22 @@ namespace PYGPU_PACKAGE << " entries" << std::endl; } else + { m_allocator->free(p); + m_managed_bytes -= alloc_size(bin_nr); + } } void free_held() { - BOOST_FOREACH(bin_pair_t bin_pair, m_container) + for (bin_pair_t &bin_pair: m_container) { - bin_t &bin = 
*bin_pair.second; + bin_t &bin = bin_pair.second; while (bin.size()) { m_allocator->free(bin.back()); + m_managed_bytes -= alloc_size(bin_pair.first); bin.pop_back(); dec_held_blocks(); @@ -264,23 +330,31 @@ namespace PYGPU_PACKAGE free_held(); } - unsigned active_blocks() + size_type active_blocks() const { return m_active_blocks; } - unsigned held_blocks() + size_type held_blocks() const { return m_held_blocks; } + size_type managed_bytes() const + { return m_managed_bytes; } + + size_type active_bytes() const + { return m_active_bytes; } + bool try_to_free_memory() { - BOOST_FOREACH(bin_pair_t bin_pair, - // free largest stuff first - std::make_pair(m_container.rbegin(), m_container.rend())) + // free largest stuff first + for (typename container_t::reverse_iterator it = m_container.rbegin(); + it != m_container.rend(); ++it) { - bin_t &bin = *bin_pair.second; + bin_pair_t &bin_pair = *it; + bin_t &bin = bin_pair.second; if (bin.size()) { m_allocator->free(bin.back()); + m_managed_bytes -= alloc_size(bin_pair.first); bin.pop_back(); dec_held_blocks(); @@ -293,10 +367,12 @@ namespace PYGPU_PACKAGE } private: - pointer_type get_from_allocator(size_type alloc_sz) + pointer_type get_from_allocator(size_type alloc_sz, size_type size) { pointer_type result = m_allocator->allocate(alloc_sz); ++m_active_blocks; + m_managed_bytes += alloc_sz; + m_active_bytes += size; return result; } @@ -308,17 +384,15 @@ namespace PYGPU_PACKAGE dec_held_blocks(); ++m_active_blocks; + m_active_bytes += size; return result; } }; - - - template - class pooled_allocation : public boost::noncopyable + class pooled_allocation : public mp_noncopyable { public: typedef Pool pool_type; @@ -326,14 +400,14 @@ namespace PYGPU_PACKAGE typedef typename Pool::size_type size_type; private: - boost::shared_ptr m_pool; + PYGPU_SHARED_PTR m_pool; pointer_type m_ptr; size_type m_size; bool m_valid; public: - pooled_allocation(boost::shared_ptr p, size_type size) + pooled_allocation(PYGPU_SHARED_PTR p, size_type size) : m_pool(p), m_ptr(p->allocate(size)), m_size(size), m_valid(true) { } @@ -352,7 +426,7 @@ namespace PYGPU_PACKAGE } else throw PYGPU_PACKAGE::error( - "pooled_device_allocation::free", + "pooled_device_allocation::free", #ifdef PYGPU_PYCUDA CUDA_ERROR_INVALID_HANDLE #endif diff --git a/src/wrapper/mempool.cpp b/src/wrapper/mempool.cpp index 918d3d0d..77a4a737 100644 --- a/src/wrapper/mempool.cpp +++ b/src/wrapper/mempool.cpp @@ -102,7 +102,7 @@ namespace template - class context_dependent_memory_pool : + class context_dependent_memory_pool : public pycuda::memory_pool, public pycuda::explicit_context_dependent { @@ -117,12 +117,12 @@ namespace - class pooled_device_allocation - : public pycuda::context_dependent, + class pooled_device_allocation + : public pycuda::context_dependent, public pycuda::pooled_allocation > - { + { private: - typedef + typedef pycuda::pooled_allocation > super; @@ -159,12 +159,12 @@ namespace } - - class pooled_host_allocation + + class pooled_host_allocation : public pycuda::pooled_allocation > { private: - typedef + typedef pycuda::pooled_allocation > super; @@ -193,7 +193,7 @@ namespace back_inserter(dims)); std::auto_ptr alloc( - new pooled_host_allocation( + new pooled_host_allocation( pool, tp_descr->elsize*pycuda::size_from_dims(dims.size(), &dims.front()))); NPY_ORDER order = PyArray_CORDER; @@ -228,6 +228,8 @@ namespace wrapper .add_property("held_blocks", &cl::held_blocks) .add_property("active_blocks", &cl::active_blocks) + .add_property("managed_bytes", &cl::managed_bytes) + 
.add_property("active_bytes", &cl::active_bytes) .DEF_SIMPLE_METHOD(bin_number) .DEF_SIMPLE_METHOD(alloc_size) .DEF_SIMPLE_METHOD(free_held) @@ -249,7 +251,7 @@ void pycuda_expose_tools() typedef context_dependent_memory_pool cl; py::class_< - cl, boost::noncopyable, + cl, boost::noncopyable, boost::shared_ptr > wrapper("DeviceMemoryPool"); wrapper .def("allocate", device_pool_allocate, @@ -269,7 +271,7 @@ void pycuda_expose_tools() typedef pycuda::memory_pool cl; py::class_< - cl, boost::noncopyable, + cl, boost::noncopyable, boost::shared_ptr > wrapper( "PageLockedMemoryPool", py::init >() -- GitLab From d339e830826608b72c2d7da7e2511cdcf6e1a47f Mon Sep 17 00:00:00 2001 From: Andreas Kloeckner Date: Wed, 31 Mar 2021 13:59:37 -0500 Subject: [PATCH 27/61] mempool: bin_number and alloc_size are no longer static --- src/wrapper/mempool.cpp | 2 -- test/test_driver.py | 7 ++++--- 2 files changed, 4 insertions(+), 5 deletions(-) diff --git a/src/wrapper/mempool.cpp b/src/wrapper/mempool.cpp index 77a4a737..66f43f7e 100644 --- a/src/wrapper/mempool.cpp +++ b/src/wrapper/mempool.cpp @@ -234,8 +234,6 @@ namespace .DEF_SIMPLE_METHOD(alloc_size) .DEF_SIMPLE_METHOD(free_held) .DEF_SIMPLE_METHOD(stop_holding) - .staticmethod("bin_number") - .staticmethod("alloc_size") ; } } diff --git a/test/test_driver.py b/test/test_driver.py index b022aa37..98f3c8aa 100644 --- a/test/test_driver.py +++ b/test/test_driver.py @@ -669,14 +669,15 @@ class TestDriver: def test_mempool_2(self): from pycuda.tools import DeviceMemoryPool from random import randrange + pool = DeviceMemoryPool() for i in range(2000): s = randrange(1 << 31) >> randrange(32) - bin_nr = DeviceMemoryPool.bin_number(s) - asize = DeviceMemoryPool.alloc_size(bin_nr) + bin_nr = pool.bin_number(s) + asize = pool.alloc_size(bin_nr) assert asize >= s, s - assert DeviceMemoryPool.bin_number(asize) == bin_nr, s + assert pool.bin_number(asize) == bin_nr, s assert asize < asize * (1 + 1 / 8) @mark_cuda_test -- GitLab From 3c8d3932638a45ab013cae58bac133f213466740 Mon Sep 17 00:00:00 2001 From: Andreas Kloeckner Date: Fri, 2 Apr 2021 16:49:37 -0500 Subject: [PATCH 28/61] Drop dependency on decorator module --- pycuda/tools.py | 44 ++++++++++++++++++++++++++------------------ setup.py | 1 - 2 files changed, 26 insertions(+), 19 deletions(-) diff --git a/pycuda/tools.py b/pycuda/tools.py index 1a2b50f4..05ac3c52 100644 --- a/pycuda/tools.py +++ b/pycuda/tools.py @@ -26,7 +26,6 @@ OTHER DEALINGS IN THE SOFTWARE. """ import pycuda.driver as cuda -from decorator import decorator import pycuda._driver as _drv import numpy as np @@ -451,25 +450,34 @@ def get_arg_type(c_arg): context_dependent_memoized_functions = [] -@decorator -def context_dependent_memoize(func, *args): - try: - ctx_dict = func._pycuda_ctx_dep_memoize_dic - except AttributeError: - # FIXME: This may keep contexts alive longer than desired. - # But I guess since the memory in them is freed, who cares. - ctx_dict = func._pycuda_ctx_dep_memoize_dic = {} +def context_dependent_memoize(func): + def wrapper(*args, **kwargs): + if kwargs: + cache_key = (args, frozenset(kwargs.items())) + else: + cache_key = (args,) - cur_ctx = cuda.Context.get_current() + try: + ctx_dict = func._pycuda_ctx_dep_memoize_dic + except AttributeError: + # FIXME: This may keep contexts alive longer than desired. + # But I guess since the memory in them is freed, who cares. 
+ ctx_dict = func._pycuda_ctx_dep_memoize_dic = {} - try: - return ctx_dict[cur_ctx][args] - except KeyError: - context_dependent_memoized_functions.append(func) - arg_dict = ctx_dict.setdefault(cur_ctx, {}) - result = func(*args) - arg_dict[args] = result - return result + cur_ctx = cuda.Context.get_current() + + try: + return ctx_dict[cur_ctx][cache_key] + except KeyError: + context_dependent_memoized_functions.append(func) + arg_dict = ctx_dict.setdefault(cur_ctx, {}) + result = func(*args, **kwargs) + arg_dict[cache_key] = result + return result + + from functools import update_wrapper + update_wrapper(wrapper, func) + return wrapper def clear_context_caches(): diff --git a/setup.py b/setup.py index 990a7830..44545e16 100644 --- a/setup.py +++ b/setup.py @@ -225,7 +225,6 @@ def main(): python_requires="~=3.6", install_requires=[ "pytools>=2011.2", - "decorator>=3.2.0", "appdirs>=1.4.0", "mako", ], -- GitLab From 787218f919740b0c432ed5bf598f9e30c2e834f7 Mon Sep 17 00:00:00 2001 From: Andreas Kloeckner Date: Sat, 3 Apr 2021 17:28:03 -0500 Subject: [PATCH 29/61] Tweak MANIFEST.in to ship wiki examples --- MANIFEST.in | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/MANIFEST.in b/MANIFEST.in index 5e91a36d..7d3675cd 100644 --- a/MANIFEST.in +++ b/MANIFEST.in @@ -5,8 +5,7 @@ include src/cpp/*.cpp include src/wrapper/*.hpp include src/wrapper/*.cpp include test/*.py -include examples/*.py -include examples/MORE* +recursive-include examples *.py include doc/source/*.rst include doc/Makefile -- GitLab From 2e9e7b8ce98131c1f1596d0c0aa3feb020056831 Mon Sep 17 00:00:00 2001 From: Andreas Kloeckner Date: Sat, 3 Apr 2021 17:28:11 -0500 Subject: [PATCH 30/61] Bump version to 2021.1 --- pycuda/__init__.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pycuda/__init__.py b/pycuda/__init__.py index ea2c3fd8..e4c47bc9 100644 --- a/pycuda/__init__.py +++ b/pycuda/__init__.py @@ -1,3 +1,3 @@ -VERSION = (2020, 1) +VERSION = (2021, 1) VERSION_STATUS = "" VERSION_TEXT = ".".join(str(x) for x in VERSION) + VERSION_STATUS -- GitLab From fd11bd8c5216ad48932afba1b032abe6ea0d1f27 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Andreas=20Kl=C3=B6ckner?= Date: Thu, 8 Apr 2021 12:15:16 -0500 Subject: [PATCH 31/61] Run flake8 in Github CI on target Py version --- .github/workflows/ci.yml | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 9cfa45a6..b90957af 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -16,7 +16,8 @@ jobs: - uses: actions/setup-python@v1 with: - python-version: '3.x' + # matches compat target in setup.py + python-version: '3.6' - name: "Main Script" run: | curl -L -O -k https://gitlab.tiker.net/inducer/ci-support/raw/main/prepare-and-run-flake8.sh -- GitLab From fa90450b71cfb41ffe6262331ba6badae80acdba Mon Sep 17 00:00:00 2001 From: Andreas Kloeckner Date: Thu, 8 Apr 2021 18:29:15 -0500 Subject: [PATCH 32/61] Add Gitlab autopush --- .github/workflows/autopush.yml | 21 +++++++++++++++++++++ 1 file changed, 21 insertions(+) create mode 100644 .github/workflows/autopush.yml diff --git a/.github/workflows/autopush.yml b/.github/workflows/autopush.yml new file mode 100644 index 00000000..f89b08ac --- /dev/null +++ b/.github/workflows/autopush.yml @@ -0,0 +1,21 @@ +name: Gitlab mirror +on: + push: + branches: + - main + +jobs: + autopush: + name: Automatic push to gitlab.tiker.net + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@v2 + - run: | + mkdir ~/.ssh && echo 
-e "Host gitlab.tiker.net\n\tStrictHostKeyChecking no\n" >> ~/.ssh/config + eval $(ssh-agent) && echo "$GITLAB_AUTOPUSH_KEY" | ssh-add - + git fetch --unshallow + git push "git@gitlab.tiker.net:inducer/$(basename $GITHUB_REPOSITORY).git" main + env: + GITLAB_AUTOPUSH_KEY: ${{ secrets.GITLAB_AUTOPUSH_KEY }} + +# vim: sw=4 -- GitLab From ee094655600357a46ee34a030086a2710cb68ed2 Mon Sep 17 00:00:00 2001 From: Andreas Kloeckner Date: Sat, 10 Apr 2021 15:43:03 -0500 Subject: [PATCH 33/61] Update compyte --- pycuda/compyte | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pycuda/compyte b/pycuda/compyte index 7533db88..71bffa1a 160000 --- a/pycuda/compyte +++ b/pycuda/compyte @@ -1 +1 @@ -Subproject commit 7533db88124045924a47d7392eaf9a078670fc4d +Subproject commit 71bffa1ae64ed98b9d922c79a6f9cc7eb4fd642f -- GitLab From f4c52bc0de08d117c7ab51a92a441763552575c2 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Andreas=20Kl=C3=B6ckner?= Date: Fri, 16 Apr 2021 14:25:57 -0500 Subject: [PATCH 34/61] Fix a py2-ism in the tutorial --- doc/source/tutorial.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/doc/source/tutorial.rst b/doc/source/tutorial.rst index 2c401b88..4cc909cf 100644 --- a/doc/source/tutorial.rst +++ b/doc/source/tutorial.rst @@ -20,7 +20,7 @@ Transferring Data The next step in most programs is to transfer data onto the device. In PyCuda, you will mostly transfer data from :mod:`numpy` arrays on the host. (But indeed, everything that satisfies the Python buffer -interface will work, even a :class:`str`.) Let's make a 4x4 array +interface will work, even :class:`bytes`.) Let's make a 4x4 array of random numbers:: import numpy -- GitLab From 33a8d259ea6e9560798ab902f37db8e9740c0a11 Mon Sep 17 00:00:00 2001 From: ByLamacq <52816027+ByLamacq@users.noreply.github.com> Date: Tue, 11 May 2021 22:14:05 +0200 Subject: [PATCH 35/61] Update CUdevice_attribute from cuda 6 to 11.3 --- src/wrapper/wrap_cudadrv.cpp | 30 ++++++++++++++++++++++++++++++ 1 file changed, 30 insertions(+) diff --git a/src/wrapper/wrap_cudadrv.cpp b/src/wrapper/wrap_cudadrv.cpp index 0be5a0a2..6d95edad 100644 --- a/src/wrapper/wrap_cudadrv.cpp +++ b/src/wrapper/wrap_cudadrv.cpp @@ -897,6 +897,36 @@ BOOST_PYTHON_MODULE(_driver) .value("MANAGED_MEMORY", CU_DEVICE_ATTRIBUTE_MANAGED_MEMORY) .value("MULTI_GPU_BOARD", CU_DEVICE_ATTRIBUTE_MULTI_GPU_BOARD) .value("MULTI_GPU_BOARD_GROUP_ID", CU_DEVICE_ATTRIBUTE_MULTI_GPU_BOARD_GROUP_ID) +#endif +#if CUDAPP_CUDA_VERSION >= 8000 + .value("HOST_NATIVE_ATOMIC_SUPPORTED", CU_DEVICE_ATTRIBUTE_HOST_NATIVE_ATOMIC_SUPPORTED) + .value("SINGLE_TO_DOUBLE_PRECISION_PERF_RATIO", CU_DEVICE_ATTRIBUTE_SINGLE_TO_DOUBLE_PRECISION_PERF_RATIO) + .value("PAGEABLE_MEMORY_ACCESS", CU_DEVICE_ATTRIBUTE_PAGEABLE_MEMORY_ACCESS) + .value("CONCURRENT_MANAGED_ACCESS", CU_DEVICE_ATTRIBUTE_CONCURRENT_MANAGED_ACCESS) + .value("COMPUTE_PREEMPTION_SUPPORTED", CU_DEVICE_ATTRIBUTE_COMPUTE_PREEMPTION_SUPPORTED) + .value("CAN_USE_HOST_POINTER_FOR_REGISTERED_MEM", CU_DEVICE_ATTRIBUTE_CAN_USE_HOST_POINTER_FOR_REGISTERED_MEM) +#endif +#if CUDAPP_CUDA_VERSION >= 9000 + .value("MAX_SHARED_MEMORY_PER_BLOCK_OPTIN", CU_DEVICE_ATTRIBUTE_MAX_SHARED_MEMORY_PER_BLOCK_OPTIN) +#endif +#if CUDAPP_CUDA_VERSION >= 9020 + .value("PAGEABLE_MEMORY_ACCESS_USES_HOST_PAGE_TABLES", CU_DEVICE_ATTRIBUTE_PAGEABLE_MEMORY_ACCESS_USES_HOST_PAGE_TABLES) + .value("DIRECT_MANAGED_MEM_ACCESS_FROM_HOST", CU_DEVICE_ATTRIBUTE_DIRECT_MANAGED_MEM_ACCESS_FROM_HOST) +#endif +#if CUDAPP_CUDA_VERSION >= 10020 + 
.value("HANDLE_TYPE_POSIX_FILE_DESCRIPTOR_SUPPORTED", CU_DEVICE_ATTRIBUTE_HANDLE_TYPE_POSIX_FILE_DESCRIPTOR_SUPPORTED) + .value("HANDLE_TYPE_WIN32_HANDLE_SUPPORTED", CU_DEVICE_ATTRIBUTE_HANDLE_TYPE_WIN32_HANDLE_SUPPORTED) + .value("HANDLE_TYPE_WIN32_KMT_HANDLE_SUPPORTED", CU_DEVICE_ATTRIBUTE_HANDLE_TYPE_WIN32_KMT_HANDLE_SUPPORTED) +#endif +#if CUDAPP_CUDA_VERSION >= 11000 + .value("MAX_PERSISTING_L2_CACHE_SIZE", CU_DEVICE_ATTRIBUTE_MAX_PERSISTING_L2_CACHE_SIZE) + .value("MAX_BLOCKS_PER_MULTIPROCESSOR", CU_DEVICE_ATTRIBUTE_MAX_BLOCKS_PER_MULTIPROCESSOR) + .value("GENERIC_COMPRESSION_SUPPORTED", CU_DEVICE_ATTRIBUTE_GENERIC_COMPRESSION_SUPPORTED) + .value("RESERVED_SHARED_MEMORY_PER_BLOCK", CU_DEVICE_ATTRIBUTE_RESERVED_SHARED_MEMORY_PER_BLOCK) +#endif +#if CUDAPP_CUDA_VERSION >= 11020 + .value("READ_ONLY_HOST_REGISTER_SUPPORTED", CU_DEVICE_ATTRIBUTE_READ_ONLY_HOST_REGISTER_SUPPORTED) + .value("MEMORY_POOLS_SUPPORTED", CU_DEVICE_ATTRIBUTE_MEMORY_POOLS_SUPPORTED) #endif ; #if CUDAPP_CUDA_VERSION >= 4000 -- GitLab From f783c4f20dbce6228292be9a56841974445863f5 Mon Sep 17 00:00:00 2001 From: ByLamacq <52816027+ByLamacq@users.noreply.github.com> Date: Wed, 12 May 2021 20:12:58 +0200 Subject: [PATCH 36/61] Update device_attribute from cuda 6 to 11.3 --- doc/source/driver.rst | 36 ++++++++++++++++++++++++++++++++++++ 1 file changed, 36 insertions(+) diff --git a/doc/source/driver.rst b/doc/source/driver.rst index 8de1f2d1..86fa2e1b 100644 --- a/doc/source/driver.rst +++ b/doc/source/driver.rst @@ -331,6 +331,42 @@ Constants CUDA 6.0 and above. .. versionadded:: 2014.1 + + .. attribute :: HOST_NATIVE_ATOMIC_SUPPORTED + SINGLE_TO_DOUBLE_PRECISION_PERF_RATIO + PAGEABLE_MEMORY_ACCESS + CONCURRENT_MANAGED_ACCESS + COMPUTE_PREEMPTION_SUPPORTED + CAN_USE_HOST_POINTER_FOR_REGISTERED_MEM + + CUDA 8.0 and above. + + .. attribute :: MAX_SHARED_MEMORY_PER_BLOCK_OPTIN + + CUDA 9.0 and above. + + .. attribute :: PAGEABLE_MEMORY_ACCESS_USES_HOST_PAGE_TABLES + DIRECT_MANAGED_MEM_ACCESS_FROM_HOST + + CUDA 9.2 and above. + + .. attribute :: HANDLE_TYPE_POSIX_FILE_DESCRIPTOR_SUPPORTED + HANDLE_TYPE_WIN32_HANDLE_SUPPORTED + HANDLE_TYPE_WIN32_KMT_HANDLE_SUPPORTED + + CUDA 10.2 and above. + + .. attribute :: MAX_PERSISTING_L2_CACHE_SIZE + MAX_BLOCKS_PER_MULTIPROCESSOR + GENERIC_COMPRESSION_SUPPORTED + RESERVED_SHARED_MEMORY_PER_BLOCK + + CUDA 11.0 and above. + + .. attribute :: READ_ONLY_HOST_REGISTER_SUPPORTED + MEMORY_POOLS_SUPPORTED + + CUDA 11.2 and above. .. class:: pointer_attribute -- GitLab From 02b4eac828de76200904bca4401d747692bd5f89 Mon Sep 17 00:00:00 2001 From: Maxim Belkin Date: Mon, 31 May 2021 13:05:06 -0500 Subject: [PATCH 37/61] doc/source/index.rst: remove duplicate reikna entry --- doc/source/index.rst | 1 - 1 file changed, 1 deletion(-) diff --git a/doc/source/index.rst b/doc/source/index.rst index 41ad641a..bb0d9820 100644 --- a/doc/source/index.rst +++ b/doc/source/index.rst @@ -88,7 +88,6 @@ feel should be listed here, please submit a PR! * `pyvkfft `__ * `scikit-cuda `__ * `reikna `__ -* `reikna `__ * `compyle `__ * `pynufft `__ -- GitLab From d2dd127eb1599230b6456d991cdbf0a3d5b6c9f9 Mon Sep 17 00:00:00 2001 From: Tim Gates Date: Wed, 9 Jun 2021 22:15:22 +1000 Subject: [PATCH 38/61] docs: fix a few simple typos There are small typos in: - aksetup_helper.py - examples/from-wiki/mandelbrot_interactive.py - pycuda/cuda/pycuda-helpers.hpp - test/test_gpuarray.py Fixes: - Should read `subtraction` rather than `substraction`. - Should read `support` rather than `supprt`. 
- Should read `right` rather than `rigth`. - Should read `preferred` rather than `prefered`. Closes #294 --- aksetup_helper.py | 2 +- examples/from-wiki/mandelbrot_interactive.py | 2 +- pycuda/cuda/pycuda-helpers.hpp | 2 +- test/test_gpuarray.py | 4 ++-- 4 files changed, 5 insertions(+), 5 deletions(-) diff --git a/aksetup_helper.py b/aksetup_helper.py index 91e9ac0c..6543215f 100644 --- a/aksetup_helper.py +++ b/aksetup_helper.py @@ -926,7 +926,7 @@ def has_flag(compiler, flagname): def cpp_flag(compiler): """Return the -std=c++[11/14] compiler flag. - The c++14 is prefered over c++11 (when it is available). + The c++14 is preferred over c++11 (when it is available). """ if has_flag(compiler, '-std=gnu++14'): return '-std=gnu++14' diff --git a/examples/from-wiki/mandelbrot_interactive.py b/examples/from-wiki/mandelbrot_interactive.py index 0780be20..66c3a011 100644 --- a/examples/from-wiki/mandelbrot_interactive.py +++ b/examples/from-wiki/mandelbrot_interactive.py @@ -11,7 +11,7 @@ # Point and click with the right buttom to magnify by a factor of 10 -# Click with the left button on the rigth side of the +# Click with the left button on the right side of the # image to randomly change the colormap # Click with right button on the right side of the image to set the default colormap diff --git a/pycuda/cuda/pycuda-helpers.hpp b/pycuda/cuda/pycuda-helpers.hpp index a543fc58..5f25092a 100644 --- a/pycuda/cuda/pycuda-helpers.hpp +++ b/pycuda/cuda/pycuda-helpers.hpp @@ -102,7 +102,7 @@ extern "C++" { return pycuda::complex(__hiloint2double(v.y, v.x), __hiloint2double(v.w, v.z)); } - // FP_Surfaces with complex supprt + // FP_Surfaces with complex support __device__ void fp_surf2DLayeredwrite(double var,surface surf, int i, int j, int layer, enum cudaSurfaceBoundaryMode mode) { diff --git a/test/test_gpuarray.py b/test/test_gpuarray.py index d5d09251..4b53aca7 100644 --- a/test/test_gpuarray.py +++ b/test/test_gpuarray.py @@ -154,7 +154,7 @@ class TestGPUArray: @mark_cuda_test def test_substract_array(self): - """Test the substraction of two arrays.""" + """Test the subtraction of two arrays.""" # test data a = np.array([1, 2, 3, 4, 5, 6, 7, 8, 9, 10]).astype(np.float32) b = np.array([10, 20, 30, 40, 50, 60, 70, 80, 90, 100]).astype(np.float32) @@ -170,7 +170,7 @@ class TestGPUArray: @mark_cuda_test def test_substract_scalar(self): - """Test the substraction of an array and a scalar.""" + """Test the subtraction of an array and a scalar.""" # test data a = np.array([1, 2, 3, 4, 5, 6, 7, 8, 9, 10]).astype(np.float32) -- GitLab From 3c4a4a6243e91bf3fcbd65a6e25c988ad14331c6 Mon Sep 17 00:00:00 2001 From: Andreas Kloeckner Date: Fri, 25 Jun 2021 13:48:21 -0500 Subject: [PATCH 39/61] Simplify, standardize sphinx setup --- .gitignore | 1 + doc/Makefile | 78 +++++++---------------------------- doc/{source => }/array.rst | 0 doc/conf.py | 25 +++++++++++ doc/{source => }/driver.rst | 0 doc/{source => }/gl.rst | 0 doc/{source => }/index.rst | 0 doc/{source => }/install.rst | 0 doc/{source => }/metaprog.rst | 0 doc/{source => }/misc.rst | 0 doc/source/conf.py | 52 ----------------------- doc/{source => }/tutorial.rst | 0 doc/upload-docs.sh | 2 +- doc/{source => }/util.rst | 0 14 files changed, 41 insertions(+), 117 deletions(-) rename doc/{source => }/array.rst (100%) create mode 100644 doc/conf.py rename doc/{source => }/driver.rst (100%) rename doc/{source => }/gl.rst (100%) rename doc/{source => }/index.rst (100%) rename doc/{source => }/install.rst (100%) rename doc/{source => }/metaprog.rst (100%) 
rename doc/{source => }/misc.rst (100%) delete mode 100644 doc/source/conf.py rename doc/{source => }/tutorial.rst (100%) rename doc/{source => }/util.rst (100%) diff --git a/.gitignore b/.gitignore index 4f1618a3..b73b75c2 100644 --- a/.gitignore +++ b/.gitignore @@ -8,6 +8,7 @@ *.pyc *.pyo build +_build *.prof siteconf.py doc/hedge-notes.pdf diff --git a/doc/Makefile b/doc/Makefile index e59ebe6c..747126b3 100644 --- a/doc/Makefile +++ b/doc/Makefile @@ -1,70 +1,20 @@ -# Makefile for Sphinx documentation +# Minimal makefile for Sphinx documentation # -# You can set these variables from the command line. -SPHINXOPTS = -SPHINXBUILD = python3 `which sphinx-build` -PAPER = - -# Internal variables. -PAPEROPT_a4 = -D latex_paper_size=a4 -PAPEROPT_letter = -D latex_paper_size=letter -ALLSPHINXOPTS = -d build/doctrees $(PAPEROPT_$(PAPER)) $(SPHINXOPTS) source - -.PHONY: help clean html web pickle htmlhelp latex changes linkcheck +# You can set these variables from the command line, and also +# from the environment for the first two. +SPHINXOPTS ?= -n +SPHINXBUILD ?= sphinx-build +SOURCEDIR = . +BUILDDIR = _build +# Put it first so that "make" without argument is like "make help". help: - @echo "Please use \`make ' where is one of" - @echo " html to make standalone HTML files" - @echo " pickle to make pickle files (usable by e.g. sphinx-web)" - @echo " htmlhelp to make HTML files and a HTML help project" - @echo " latex to make LaTeX files, you can set PAPER=a4 or PAPER=letter" - @echo " changes to make an overview over all changed/added/deprecated items" - @echo " linkcheck to check all external links for integrity" - -clean: - -rm -rf build/* - -html: - mkdir -p build/html build/doctrees source/.static - $(SPHINXBUILD) -b html $(ALLSPHINXOPTS) build/html - @echo - @echo "Build finished. The HTML pages are in build/html." - -pickle: - mkdir -p build/pickle build/doctrees - $(SPHINXBUILD) -b pickle $(ALLSPHINXOPTS) build/pickle - @echo - @echo "Build finished; now you can process the pickle files or run" - @echo " sphinx-web build/pickle" - @echo "to start the sphinx-web server." - -web: pickle - -htmlhelp: - mkdir -p build/htmlhelp build/doctrees - $(SPHINXBUILD) -b htmlhelp $(ALLSPHINXOPTS) build/htmlhelp - @echo - @echo "Build finished; now you can run HTML Help Workshop with the" \ - ".hhp project file in build/htmlhelp." - -latex: - mkdir -p build/latex build/doctrees - $(SPHINXBUILD) -b latex $(ALLSPHINXOPTS) build/latex - @echo - @echo "Build finished; the LaTeX files are in build/latex." - @echo "Run \`make all-pdf' or \`make all-ps' in that directory to" \ - "run these through (pdf)latex." + @$(SPHINXBUILD) -M help "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) -changes: - mkdir -p build/changes build/doctrees - $(SPHINXBUILD) -b changes $(ALLSPHINXOPTS) build/changes - @echo - @echo "The overview file is in build/changes." +.PHONY: help Makefile -linkcheck: - mkdir -p build/linkcheck build/doctrees - $(SPHINXBUILD) -b linkcheck $(ALLSPHINXOPTS) build/linkcheck - @echo - @echo "Link check complete; look for any errors in the above output " \ - "or in build/linkcheck/output.txt." +# Catch-all target: route all unknown targets to Sphinx using the new +# "make mode" option. $(O) is meant as a shortcut for $(SPHINXOPTS). 
+%: Makefile + @$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) diff --git a/doc/source/array.rst b/doc/array.rst similarity index 100% rename from doc/source/array.rst rename to doc/array.rst diff --git a/doc/conf.py b/doc/conf.py new file mode 100644 index 00000000..f2482ba2 --- /dev/null +++ b/doc/conf.py @@ -0,0 +1,25 @@ +from urllib.request import urlopen + +_conf_url = \ + "https://raw.githubusercontent.com/inducer/sphinxconfig/main/sphinxconfig.py" +with urlopen(_conf_url) as _inf: + exec(compile(_inf.read(), _conf_url, "exec"), globals()) + +copyright = "2008-21, Andreas Kloeckner" + +ver_dic = {} +exec( + compile( + open("../pycuda/__init__.py").read(), "../pycuda/__init__.py", "exec" + ), + ver_dic, +) +version = ".".join(str(x) for x in ver_dic["VERSION"]) +# The full version, including alpha/beta/rc tags. +release = ver_dic["VERSION_TEXT"] + +intersphinx_mapping = { + "https://docs.python.org/3": None, + "https://numpy.org/doc/stable/": None, + "https://documen.tician.de/codepy/": None, +} diff --git a/doc/source/driver.rst b/doc/driver.rst similarity index 100% rename from doc/source/driver.rst rename to doc/driver.rst diff --git a/doc/source/gl.rst b/doc/gl.rst similarity index 100% rename from doc/source/gl.rst rename to doc/gl.rst diff --git a/doc/source/index.rst b/doc/index.rst similarity index 100% rename from doc/source/index.rst rename to doc/index.rst diff --git a/doc/source/install.rst b/doc/install.rst similarity index 100% rename from doc/source/install.rst rename to doc/install.rst diff --git a/doc/source/metaprog.rst b/doc/metaprog.rst similarity index 100% rename from doc/source/metaprog.rst rename to doc/metaprog.rst diff --git a/doc/source/misc.rst b/doc/misc.rst similarity index 100% rename from doc/source/misc.rst rename to doc/misc.rst diff --git a/doc/source/conf.py b/doc/source/conf.py deleted file mode 100644 index 2fb39ed0..00000000 --- a/doc/source/conf.py +++ /dev/null @@ -1,52 +0,0 @@ -extensions = [ - "sphinx.ext.intersphinx", - "sphinx.ext.mathjax", - "sphinx_copybutton", -] - -# Add any paths that contain templates here, relative to this directory. -templates_path = ["_templates"] - -# The suffix of source filenames. -source_suffix = ".rst" - -# The master toctree document. -master_doc = "index" - -# General substitutions. -project = "PyCUDA" -copyright = "2008-20, Andreas Kloeckner" - -# The default replacements for |version| and |release|, also used in various -# other places throughout the built documents. -# -# The short X.Y version. -ver_dic = {} -exec( - compile( - open("../../pycuda/__init__.py").read(), "../../pycuda/__init__.py", "exec" - ), - ver_dic, -) -version = ".".join(str(x) for x in ver_dic["VERSION"]) -# The full version, including alpha/beta/rc tags. -release = ver_dic["VERSION_TEXT"] - -# The name of the Pygments (syntax highlighting) style to use. -pygments_style = "sphinx" - - -# Options for HTML output -# ----------------------- - -html_theme = "furo" - - -intersphinx_mapping = { - "https://docs.python.org/3": None, - "https://numpy.org/doc/stable/": None, - "https://documen.tician.de/codepy/": None, -} - -autoclass_content = "class" -autodoc_typehints = "description" diff --git a/doc/source/tutorial.rst b/doc/tutorial.rst similarity index 100% rename from doc/source/tutorial.rst rename to doc/tutorial.rst diff --git a/doc/upload-docs.sh b/doc/upload-docs.sh index d5bba855..5914600b 100755 --- a/doc/upload-docs.sh +++ b/doc/upload-docs.sh @@ -1,3 +1,3 @@ #! 
/bin/sh -rsync --verbose --archive --delete build/html/* doc-upload:doc/pycuda +rsync --verbose --archive --delete _build/html/* doc-upload:doc/pycuda diff --git a/doc/source/util.rst b/doc/util.rst similarity index 100% rename from doc/source/util.rst rename to doc/util.rst -- GitLab From 835406a7033c1705b79044c56c010cabbabc8acb Mon Sep 17 00:00:00 2001 From: Mit Kotak Date: Mon, 26 Jul 2021 04:16:58 +0000 Subject: [PATCH 40/61] Implemented gpuarray.(stack|concatenate) + docs --- doc/array.rst | 9 +++++ pycuda/gpuarray.py | 89 ++++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 98 insertions(+) diff --git a/doc/array.rst b/doc/array.rst index de1cc66e..23f22ce0 100644 --- a/doc/array.rst +++ b/doc/array.rst @@ -339,6 +339,15 @@ Constructing :class:`GPUArray` Instances Return the :class:`GPUArray` ``[a[indices[0]], ..., a[indices[n]]]``. For the moment, *a* must be a type that can be bound to a texture. +.. function:: concatenate(arrays, axis=0, allocator=None) + + Join a sequence of arrays along an existing axis. + + .. function:: stack(arrays, axis=0, allocator=None) + + Join a sequence of arrays along a new axis. + + Conditionals ^^^^^^^^^^^^ diff --git a/pycuda/gpuarray.py b/pycuda/gpuarray.py index 373cf005..e5d853be 100644 --- a/pycuda/gpuarray.py +++ b/pycuda/gpuarray.py @@ -1767,6 +1767,95 @@ def multi_put(arrays, dest_indices, dest_shape=None, out=None, stream=None): # {{{ shape manipulation +def concatenate(arrays, axis=0, allocator=None): + """ + Join a sequence of arrays along an existing axis. + :arg arrays: A sequnce of :class:`GPUArray`. + :arg axis: Index of the dimension of the new axis in the result array. + Can be -1, for the new axis to be last dimension. + :returns: :class:`GPUArray` + """ + # implementation is borrowed from pyopencl.array.concatenate() + # {{{ find properties of result array + + shape = None + + def shape_except_axis(ary: GPUArray): + return ary.shape[:axis] + ary.shape[axis+1:] + + for i_ary, ary in enumerate(arrays): + allocator = allocator or ary.allocator + + if shape is None: + # first array + shape = list(ary.shape) + + else: + if len(ary.shape) != len(shape): + raise ValueError("%d'th array has different number of axes " + "(should have %d, has %d)" + % (i_ary, len(ary.shape), len(shape))) + + if (ary.ndim != arrays[0].ndim + or shape_except_axis(ary) != shape_except_axis(arrays[0])): + raise ValueError("%d'th array has residual not matching " + "other arrays" % i_ary) + + shape[axis] += ary.shape[axis] + + # }}} + + shape = tuple(shape) + dtype = np.find_common_type([ary.dtype for ary in arrays], []) + result = empty(shape, dtype, allocator=allocator) + + full_slice = (slice(None),) * len(shape) + + base_idx = 0 + for ary in arrays: + my_len = ary.shape[axis] + result[full_slice[:axis] + (slice(base_idx, base_idx+my_len),) + full_slice[axis+1:]] = ary + base_idx += my_len + + return result + + + def stack(arrays, axis=0, allocator=None): + """ + Join a sequence of arrays along a new axis. + :arg arrays: A sequnce of :class:`GPUArray`. + :arg axis: Index of the dimension of the new axis in the result array. + Can be -1, for the new axis to be last dimension. 
+ :returns: :class:`GPUArray` + """ + # implementation is borrowed from pyopencl.array.stack() + allocator = allocator or arrays[0].allocator + + if not arrays: + raise ValueError("need at least one array to stack") + + input_shape = arrays[0].shape + input_ndim = arrays[0].ndim + axis = input_ndim if axis == -1 else axis + + if not all(ary.shape == input_shape for ary in arrays[1:]): + raise ValueError("arrays must have the same shape") + + if not (0 <= axis <= input_ndim): + raise ValueError("invalid axis") + + result_shape = input_shape[:axis] + (len(arrays),) + input_shape[axis:] + result = empty(shape=result_shape, + dtype=np.result_type(*(ary.dtype for ary in arrays)), + allocator=allocator, order="C" if axis == 0 else "F") + + for i, ary in enumerate(arrays): + + idx = (slice(None),)*axis + (i,) + (slice(None),)*(input_ndim-axis) + result[idx] = ary + + return result + def transpose(a, axes=None): """Permute the dimensions of an array. -- GitLab From 2fd2d8475f61f9d1446bbc03bebd9437de128624 Mon Sep 17 00:00:00 2001 From: Mit Kotak Date: Mon, 26 Jul 2021 04:20:51 +0000 Subject: [PATCH 41/61] Implemented test_gpuarray.test_(stack|concatenate) --- test/test_gpuarray.py | 47 +++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 47 insertions(+) diff --git a/test/test_gpuarray.py b/test/test_gpuarray.py index 4b53aca7..2b8b66b3 100644 --- a/test/test_gpuarray.py +++ b/test/test_gpuarray.py @@ -468,6 +468,53 @@ class TestGPUArray: a = gpuarray.arange(12, dtype=np.float32) assert (np.arange(12, dtype=np.float32) == a.get()).all() + @mark_cuda_test + def test_stack(self): + + orders = ["F", "C"] + input_dims_lst = [0, 1, 2] + + for order in orders: + for input_dims in input_dims_lst: + shape = (2, 2, 2)[:input_dims] + axis = -1 if order == "F" else 0 + + from numpy.random import default_rng + rng = default_rng() + x_in = rng.random(size=shape) + y_in = rng.random(size=shape) + x_in = x_in if order == "C" else np.asfortranarray(x_in) + y_in = y_in if order == "C" else np.asfortranarray(y_in) + + x_gpu = gpuarray.to_gpu(x_in) + y_gpu = gpuarray.to_gpu(y_in) + + numpy_stack = np.stack((x_in, y_in), axis=axis) + gpuarray_stack = gpuarray.stack((x_gpu, y_gpu), axis=axis) + + np.testing.assert_allclose(gpuarray_stack.get(), numpy_stack) + + assert gpuarray_stack.shape == numpy_stack.shape + + @mark_cuda_test + def test_concatenate(self): + + from pycuda.curandom import rand as curand + + a_dev = curand((5, 15, 20), dtype=np.float32) + b_dev = curand((4, 15, 20), dtype=np.float32) + c_dev = curand((3, 15, 20), dtype=np.float32) + a = a_dev.get() + b = b_dev.get() + c = c_dev.get() + + cat_dev = gpuarray.concatenate((a_dev, b_dev, c_dev)) + cat = np.concatenate((a, b, c)) + + np.testing.assert_allclose(cat, cat_dev.get()) + + assert cat.shape == cat_dev.shape + @mark_cuda_test def test_reverse(self): a = np.array([1, 2, 3, 4, 5, 6, 7, 8, 9, 10]).astype(np.float32) -- GitLab From 5664b3e421179a62305a1f2b194593f7331c9370 Mon Sep 17 00:00:00 2001 From: Mit Kotak Date: Mon, 26 Jul 2021 04:28:06 +0000 Subject: [PATCH 42/61] Implemented gpuarray.(stack|concatenate) + docs --- doc/array.rst | 2 +- pycuda/gpuarray.py | 128 ++++++++++++++++++++++----------------------- 2 files changed, 65 insertions(+), 65 deletions(-) diff --git a/doc/array.rst b/doc/array.rst index 23f22ce0..34c65693 100644 --- a/doc/array.rst +++ b/doc/array.rst @@ -343,7 +343,7 @@ Constructing :class:`GPUArray` Instances Join a sequence of arrays along an existing axis. - .. function:: stack(arrays, axis=0, allocator=None) +.. 
function:: stack(arrays, axis=0, allocator=None) Join a sequence of arrays along a new axis. diff --git a/pycuda/gpuarray.py b/pycuda/gpuarray.py index e5d853be..4d09c77a 100644 --- a/pycuda/gpuarray.py +++ b/pycuda/gpuarray.py @@ -1768,93 +1768,93 @@ def multi_put(arrays, dest_indices, dest_shape=None, out=None, stream=None): # {{{ shape manipulation def concatenate(arrays, axis=0, allocator=None): - """ - Join a sequence of arrays along an existing axis. - :arg arrays: A sequnce of :class:`GPUArray`. - :arg axis: Index of the dimension of the new axis in the result array. - Can be -1, for the new axis to be last dimension. - :returns: :class:`GPUArray` - """ - # implementation is borrowed from pyopencl.array.concatenate() - # {{{ find properties of result array + """ + Join a sequence of arrays along an existing axis. + :arg arrays: A sequnce of :class:`GPUArray`. + :arg axis: Index of the dimension of the new axis in the result array. + Can be -1, for the new axis to be last dimension. + :returns: :class:`GPUArray` + """ + # implementation is borrowed from pyopencl.array.concatenate() + # {{{ find properties of result array - shape = None + shape = None - def shape_except_axis(ary: GPUArray): - return ary.shape[:axis] + ary.shape[axis+1:] + def shape_except_axis(ary: GPUArray): + return ary.shape[:axis] + ary.shape[axis+1:] - for i_ary, ary in enumerate(arrays): - allocator = allocator or ary.allocator + for i_ary, ary in enumerate(arrays): + allocator = allocator or ary.allocator - if shape is None: - # first array - shape = list(ary.shape) + if shape is None: + # first array + shape = list(ary.shape) - else: - if len(ary.shape) != len(shape): - raise ValueError("%d'th array has different number of axes " - "(should have %d, has %d)" - % (i_ary, len(ary.shape), len(shape))) + else: + if len(ary.shape) != len(shape): + raise ValueError("%d'th array has different number of axes " + "(should have %d, has %d)" + % (i_ary, len(ary.shape), len(shape))) - if (ary.ndim != arrays[0].ndim - or shape_except_axis(ary) != shape_except_axis(arrays[0])): - raise ValueError("%d'th array has residual not matching " - "other arrays" % i_ary) + if (ary.ndim != arrays[0].ndim + or shape_except_axis(ary) != shape_except_axis(arrays[0])): + raise ValueError("%d'th array has residual not matching " + "other arrays" % i_ary) - shape[axis] += ary.shape[axis] + shape[axis] += ary.shape[axis] - # }}} + # }}} - shape = tuple(shape) - dtype = np.find_common_type([ary.dtype for ary in arrays], []) - result = empty(shape, dtype, allocator=allocator) + shape = tuple(shape) + dtype = np.find_common_type([ary.dtype for ary in arrays], []) + result = empty(shape, dtype, allocator=allocator) - full_slice = (slice(None),) * len(shape) + full_slice = (slice(None),) * len(shape) - base_idx = 0 - for ary in arrays: - my_len = ary.shape[axis] - result[full_slice[:axis] + (slice(base_idx, base_idx+my_len),) + full_slice[axis+1:]] = ary - base_idx += my_len + base_idx = 0 + for ary in arrays: + my_len = ary.shape[axis] + result[full_slice[:axis] + (slice(base_idx, base_idx+my_len),) + full_slice[axis+1:]] = ary + base_idx += my_len - return result + return result def stack(arrays, axis=0, allocator=None): - """ - Join a sequence of arrays along a new axis. - :arg arrays: A sequnce of :class:`GPUArray`. - :arg axis: Index of the dimension of the new axis in the result array. - Can be -1, for the new axis to be last dimension. 
- :returns: :class:`GPUArray` - """ - # implementation is borrowed from pyopencl.array.stack() - allocator = allocator or arrays[0].allocator + """ + Join a sequence of arrays along a new axis. + :arg arrays: A sequnce of :class:`GPUArray`. + :arg axis: Index of the dimension of the new axis in the result array. + Can be -1, for the new axis to be last dimension. + :returns: :class:`GPUArray` + """ + # implementation is borrowed from pyopencl.array.stack() + allocator = allocator or arrays[0].allocator - if not arrays: - raise ValueError("need at least one array to stack") + if not arrays: + raise ValueError("need at least one array to stack") - input_shape = arrays[0].shape - input_ndim = arrays[0].ndim - axis = input_ndim if axis == -1 else axis + input_shape = arrays[0].shape + input_ndim = arrays[0].ndim + axis = input_ndim if axis == -1 else axis - if not all(ary.shape == input_shape for ary in arrays[1:]): - raise ValueError("arrays must have the same shape") + if not all(ary.shape == input_shape for ary in arrays[1:]): + raise ValueError("arrays must have the same shape") - if not (0 <= axis <= input_ndim): - raise ValueError("invalid axis") + if not (0 <= axis <= input_ndim): + raise ValueError("invalid axis") - result_shape = input_shape[:axis] + (len(arrays),) + input_shape[axis:] - result = empty(shape=result_shape, - dtype=np.result_type(*(ary.dtype for ary in arrays)), - allocator=allocator, order="C" if axis == 0 else "F") + result_shape = input_shape[:axis] + (len(arrays),) + input_shape[axis:] + result = empty(shape=result_shape, + dtype=np.result_type(*(ary.dtype for ary in arrays)), + allocator=allocator, order="C" if axis == 0 else "F") - for i, ary in enumerate(arrays): + for i, ary in enumerate(arrays): - idx = (slice(None),)*axis + (i,) + (slice(None),)*(input_ndim-axis) - result[idx] = ary + idx = (slice(None),)*axis + (i,) + (slice(None),)*(input_ndim-axis) + result[idx] = ary - return result + return result def transpose(a, axes=None): -- GitLab From e5cc82e8e1fb0079e7b101097d1104523969506c Mon Sep 17 00:00:00 2001 From: Mit Kotak Date: Mon, 26 Jul 2021 04:44:09 +0000 Subject: [PATCH 43/61] Removed Indentation errors --- pycuda/gpuarray.py | 2 +- test/test_gpuarray.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/pycuda/gpuarray.py b/pycuda/gpuarray.py index 4d09c77a..dfd360eb 100644 --- a/pycuda/gpuarray.py +++ b/pycuda/gpuarray.py @@ -1779,7 +1779,7 @@ def concatenate(arrays, axis=0, allocator=None): # {{{ find properties of result array shape = None - + def shape_except_axis(ary: GPUArray): return ary.shape[:axis] + ary.shape[axis+1:] diff --git a/test/test_gpuarray.py b/test/test_gpuarray.py index 2b8b66b3..dbf4b2f7 100644 --- a/test/test_gpuarray.py +++ b/test/test_gpuarray.py @@ -499,7 +499,7 @@ class TestGPUArray: @mark_cuda_test def test_concatenate(self): - from pycuda.curandom import rand as curand + from pycuda.curandom import rand as curand a_dev = curand((5, 15, 20), dtype=np.float32) b_dev = curand((4, 15, 20), dtype=np.float32) -- GitLab From 6b06323cda97f0031f27fbae9574ed20655c1f56 Mon Sep 17 00:00:00 2001 From: Mit Kotak Date: Mon, 26 Jul 2021 04:54:05 +0000 Subject: [PATCH 44/61] Removed Indentation in stack() --- pycuda/gpuarray.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pycuda/gpuarray.py b/pycuda/gpuarray.py index dfd360eb..a0bf84c4 100644 --- a/pycuda/gpuarray.py +++ b/pycuda/gpuarray.py @@ -1779,7 +1779,7 @@ def concatenate(arrays, axis=0, allocator=None): # {{{ find properties of result 
array shape = None - + def shape_except_axis(ary: GPUArray): return ary.shape[:axis] + ary.shape[axis+1:] @@ -1820,7 +1820,7 @@ def concatenate(arrays, axis=0, allocator=None): return result - def stack(arrays, axis=0, allocator=None): +def stack(arrays, axis=0, allocator=None): """ Join a sequence of arrays along a new axis. :arg arrays: A sequnce of :class:`GPUArray`. -- GitLab From b8d756ba0cb1ca6aa82641ff783e9dd1de8956ac Mon Sep 17 00:00:00 2001 From: Kaushik Kulkarni Date: Sat, 8 Jan 2022 02:13:43 +0000 Subject: [PATCH 45/61] Rpow scalar --- pycuda/elementwise.py | 39 +++++++++++++++++++++++++++++++++++++++ pycuda/gpuarray.py | 21 +++++++++++++++++++++ test/test_gpuarray.py | 15 +++++++++++++++ 3 files changed, 75 insertions(+) diff --git a/pycuda/elementwise.py b/pycuda/elementwise.py index 258bae5c..607e9f9c 100644 --- a/pycuda/elementwise.py +++ b/pycuda/elementwise.py @@ -666,6 +666,45 @@ def get_pow_array_kernel(dtype_x, dtype_y, dtype_z): ) +@context_dependent_memoize +def get_rpow_kernel(dtype_x, dtype_y, dtype_z, is_base_array, is_exp_array): + """ + Returns the kernel for the operation: ``z = x ** y`` + """ + if np.float64 in [dtype_x, dtype_y]: + func = "pow" + else: + func = "powf" + + if not is_base_array and is_exp_array: + x_ctype = "%(tp_x)s x" + y_ctype = "%(tp_y)s *y" + func = "%s(x,y[i])" % func + + elif is_base_array and is_exp_array: + x_ctype = "%(tp_x)s *x" + y_ctype = "%(tp_y)s *y" + func = "%s(x[i],y[i])" % func + + elif is_base_array and not is_exp_array: + x_ctype = "%(tp_x)s *x" + y_ctype = "%(tp_y)s y" + func = "%s(x[i],y)" % func + + else: + raise AssertionError + + return get_elwise_kernel( + ("%(tp_z)s *z, " + x_ctype + ", "+y_ctype) + % { + "tp_x": dtype_to_ctype(dtype_x), + "tp_y": dtype_to_ctype(dtype_y), + "tp_z": dtype_to_ctype(dtype_z), + }, + "z[i] = %s" % func, + name="pow_method") + + @context_dependent_memoize def get_fmod_kernel(): return get_elwise_kernel( diff --git a/pycuda/gpuarray.py b/pycuda/gpuarray.py index a0bf84c4..5e7a5607 100644 --- a/pycuda/gpuarray.py +++ b/pycuda/gpuarray.py @@ -839,6 +839,27 @@ class GPUArray: """ return self._pow(other, new=False) + def __rpow__(self, other): + common_dtype = _get_common_dtype(self, other) + result = self._new_like_me(common_dtype) + + if not np.isscalar(other): + # Base is a gpuarray => do not cast. + base = other + else: + base = common_dtype.type(other) + + func = elementwise.get_rpow_kernel( + base.dtype, self.dtype, result.dtype, + is_base_array=not np.isscalar(other), is_exp_array=not np.isscalar(self)) + # Evaluates z = x ** y + func.prepared_async_call(self._grid, self._block, None, + result.gpudata, # z + base if np.isscalar(base) else base.gpudata, # x + self.gpudata, # y + self.mem_size) + return result + def reverse(self, stream=None): """Return this array in reversed order. The array is treated as one-dimensional. 
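With the ``__rpow__`` addition above, a scalar base raised to a ``GPUArray`` exponent should now dispatch to the new kernel. A minimal sketch of the intended usage (an illustration, not part of the patch), assuming a CUDA context has been set up, e.g. via ``pycuda.autoinit``:

    import numpy as np
    import pycuda.autoinit  # noqa: F401 -- creates a context on the default device
    import pycuda.gpuarray as gpuarray

    # Exponents live on the GPU; the scalar base is broadcast by GPUArray.__rpow__.
    exponents = gpuarray.to_gpu(np.array([1.0, 2.0, 3.0]))
    result = 2.0 ** exponents
    print(result.get())  # expected: [2. 4. 8.]
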
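Similarly, the ``concatenate`` and ``stack`` helpers introduced a few patches earlier mirror their numpy counterparts for ``GPUArray`` inputs. A short usage sketch under the same ``pycuda.autoinit`` assumption:

    import numpy as np
    import pycuda.autoinit  # noqa: F401
    import pycuda.gpuarray as gpuarray

    a = gpuarray.to_gpu(np.random.rand(3, 4).astype(np.float32))
    b = gpuarray.to_gpu(np.random.rand(3, 4).astype(np.float32))

    cat = gpuarray.concatenate((a, b), axis=0)  # join along existing axis -> shape (6, 4)
    stk = gpuarray.stack((a, b), axis=0)        # new leading axis -> shape (2, 3, 4)

    assert cat.shape == (6, 4)
    assert stk.shape == (2, 3, 4)
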
diff --git a/test/test_gpuarray.py b/test/test_gpuarray.py index dbf4b2f7..5234d8a2 100644 --- a/test/test_gpuarray.py +++ b/test/test_gpuarray.py @@ -40,6 +40,21 @@ class TestGPUArray: a_gpu = a_gpu.get() assert (np.abs(a ** 2 - a_gpu) < 1e-3).all() + @mark_cuda_test + def test_rpow_array(self): + scalar = np.random.rand() + a = abs(np.random.rand(10)) + a_gpu = gpuarray.to_gpu(a) + + result = (scalar ** a_gpu).get() + np.testing.assert_allclose(scalar ** a, result) + + result = (a_gpu ** a_gpu).get() + np.testing.assert_allclose(a ** a, result) + + result = (a_gpu ** scalar).get() + np.testing.assert_allclose(a ** scalar, result) + @mark_cuda_test def test_numpy_integer_shape(self): gpuarray.empty(np.int32(17), np.float32) -- GitLab From e0d744c668ff251238373b291c637554fe150190 Mon Sep 17 00:00:00 2001 From: mit kotak Date: Mon, 17 Jan 2022 12:56:42 -0600 Subject: [PATCH 46/61] Merged get_rpow_kernel into get_pow_array_kernel + Improved test_pow_array to catch order flips --- pycuda/elementwise.py | 24 +++--------------------- pycuda/gpuarray.py | 6 +++--- test/test_gpuarray.py | 14 ++++++++------ 3 files changed, 14 insertions(+), 30 deletions(-) diff --git a/pycuda/elementwise.py b/pycuda/elementwise.py index 607e9f9c..cbe92953 100644 --- a/pycuda/elementwise.py +++ b/pycuda/elementwise.py @@ -648,26 +648,7 @@ def get_pow_kernel(dtype): @context_dependent_memoize -def get_pow_array_kernel(dtype_x, dtype_y, dtype_z): - if np.float64 in [dtype_x, dtype_y]: - func = "pow" - else: - func = "powf" - - return get_elwise_kernel( - "%(tp_x)s *x, %(tp_y)s *y, %(tp_z)s *z" - % { - "tp_x": dtype_to_ctype(dtype_x), - "tp_y": dtype_to_ctype(dtype_y), - "tp_z": dtype_to_ctype(dtype_z), - }, - "z[i] = %s(x[i], y[i])" % func, - "pow_method", - ) - - -@context_dependent_memoize -def get_rpow_kernel(dtype_x, dtype_y, dtype_z, is_base_array, is_exp_array): +def get_pow_array_kernel(dtype_x, dtype_y, dtype_z, is_base_array, is_exp_array): """ Returns the kernel for the operation: ``z = x ** y`` """ @@ -702,7 +683,8 @@ def get_rpow_kernel(dtype_x, dtype_y, dtype_z, is_base_array, is_exp_array): "tp_z": dtype_to_ctype(dtype_z), }, "z[i] = %s" % func, - name="pow_method") + name="pow_method" + ) @context_dependent_memoize diff --git a/pycuda/gpuarray.py b/pycuda/gpuarray.py index 5e7a5607..ea06a792 100644 --- a/pycuda/gpuarray.py +++ b/pycuda/gpuarray.py @@ -780,16 +780,16 @@ class GPUArray: result = self func = elementwise.get_pow_array_kernel( - self.dtype, other.dtype, result.dtype + self.dtype, other.dtype, result.dtype, True, True ) func.prepared_async_call( self._grid, self._block, None, + result.gpudata, self.gpudata, other.gpudata, - result.gpudata, self.mem_size, ) @@ -849,7 +849,7 @@ class GPUArray: else: base = common_dtype.type(other) - func = elementwise.get_rpow_kernel( + func = elementwise.get_pow_array_kernel( base.dtype, self.dtype, result.dtype, is_base_array=not np.isscalar(other), is_exp_array=not np.isscalar(self)) # Evaluates z = x ** y diff --git a/test/test_gpuarray.py b/test/test_gpuarray.py index 5234d8a2..3e18e912 100644 --- a/test/test_gpuarray.py +++ b/test/test_gpuarray.py @@ -17,16 +17,18 @@ class TestGPUArray: def test_pow_array(self): a = np.array([1, 2, 3, 4, 5]).astype(np.float32) a_gpu = gpuarray.to_gpu(a) + b = np.array([1, 2, 3, 4, 5]).astype(np.float64) + b_gpu = gpuarray.to_gpu(b) - result = pow(a_gpu, a_gpu).get() - assert (np.abs(a ** a - result) < 1e-3).all() + result = pow(a_gpu, b_gpu).get() + assert (np.abs(a ** b - result) < 1e-3).all() - result = (a_gpu ** 
a_gpu).get() - assert (np.abs(pow(a, a) - result) < 1e-3).all() + result = (a_gpu ** b_gpu).get() + assert (np.abs(pow(a, b) - result) < 1e-3).all() - a_gpu **= a_gpu + a_gpu **= b_gpu a_gpu = a_gpu.get() - assert (np.abs(pow(a, a) - a_gpu) < 1e-3).all() + assert (np.abs(pow(a, b) - a_gpu) < 1e-3).all() @mark_cuda_test def test_pow_number(self): -- GitLab From 87d3436813cda134d8f492b99d0c5e4732f1a3be Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Andreas=20Kl=C3=B6ckner?= Date: Wed, 1 Dec 2021 11:17:51 -0600 Subject: [PATCH 47/61] Update docs to reflect that 3D grids are possible --- doc/driver.rst | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/doc/driver.rst b/doc/driver.rst index 86fa2e1b..6a690ed6 100644 --- a/doc/driver.rst +++ b/doc/driver.rst @@ -1807,8 +1807,8 @@ Code on the Device: Modules and Functions *arg1* through *argn* are the positional C arguments to the kernel. See :meth:`param_set` for details. See especially the warnings there. - *grid* specifies, as a 2-tuple, the number of thread blocks to launch, as a - two-dimensional grid. + *grid* specifies, as a tuple of up to three integer entries, the number + of thread blocks to launch, as a multi-dimensional grid. *stream*, if specified, is a :class:`Stream` instance serializing the copying of input arguments (if any), execution, and the copying of output arguments (again, if any). @@ -1853,6 +1853,10 @@ Code on the Device: Modules and Functions which can make it somewhat slow. For a kernel that is invoked often, this can be inconvenient. For a faster (but mildly less convenient) way of invoking kernels, see :meth:`prepare` and :meth:`prepared_call`. + + .. note:: + + *grid* with more than two dimensions requires CUDA 4.0 or newer. .. method:: param_set_texref(texref) -- GitLab From 384ebd2717c1887f87714af29dfec1cd86893652 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Andreas=20Kl=C3=B6ckner?= Date: Mon, 6 Dec 2021 23:22:19 -0600 Subject: [PATCH 48/61] Add issue templates --- .github/ISSUE_TEMPLATE/bug_report.md | 34 +++++++++++++++++++++++ .github/ISSUE_TEMPLATE/feature_request.md | 20 +++++++++++++ 2 files changed, 54 insertions(+) create mode 100644 .github/ISSUE_TEMPLATE/bug_report.md create mode 100644 .github/ISSUE_TEMPLATE/feature_request.md diff --git a/.github/ISSUE_TEMPLATE/bug_report.md b/.github/ISSUE_TEMPLATE/bug_report.md new file mode 100644 index 00000000..e1552cf6 --- /dev/null +++ b/.github/ISSUE_TEMPLATE/bug_report.md @@ -0,0 +1,34 @@ +--- +name: Bug report +about: Create a report to help us improve +title: '' +labels: bug +assignees: '' + +--- + +**Describe the bug** +A clear and concise description of what the bug is. + +**To Reproduce** +Steps to reproduce the behavior: +1. Go to '...' +2. Click on '....' +3. Scroll down to '....' +4. See error + +**Expected behavior** +A clear and concise description of what you expected to happen. + +**Screenshots** +If applicable, add screenshots to help explain your problem. + +**Desktop (please complete the following information):** + - OS: [e.g. iOS] + - CUDA version: [e.g. 11.1] + - CUDA driver version: [e.g. 470.86] + - PyCUDA version: [e.g 2021.1] + - Python version: [e.g. 3.10] + +**Additional context** +Add any other context about the problem here. 
diff --git a/.github/ISSUE_TEMPLATE/feature_request.md b/.github/ISSUE_TEMPLATE/feature_request.md new file mode 100644 index 00000000..11fc491e --- /dev/null +++ b/.github/ISSUE_TEMPLATE/feature_request.md @@ -0,0 +1,20 @@ +--- +name: Feature request +about: Suggest an idea for this project +title: '' +labels: enhancement +assignees: '' + +--- + +**Is your feature request related to a problem? Please describe.** +A clear and concise description of what the problem is. Ex. I'm always frustrated when [...] + +**Describe the solution you'd like** +A clear and concise description of what you want to happen. + +**Describe alternatives you've considered** +A clear and concise description of any alternative solutions or features you've considered. + +**Additional context** +Add any other context or screenshots about the feature request here. -- GitLab From efd4bab263173685fab4c2a8d27fcb97daadea13 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Andreas=20Kl=C3=B6ckner?= Date: Mon, 6 Dec 2021 23:23:08 -0600 Subject: [PATCH 49/61] Create config.yml --- .github/ISSUE_TEMPLATE/config.yml | 9 +++++++++ 1 file changed, 9 insertions(+) create mode 100644 .github/ISSUE_TEMPLATE/config.yml diff --git a/.github/ISSUE_TEMPLATE/config.yml b/.github/ISSUE_TEMPLATE/config.yml new file mode 100644 index 00000000..707bf62d --- /dev/null +++ b/.github/ISSUE_TEMPLATE/config.yml @@ -0,0 +1,9 @@ + +blank_issues_enabled: true +contact_links: + - name: ❓ Question + url: https://github.com/inducer/pycuda/discussions/categories/q-a + about: Ask and answer questions about PyCUDA on Discussions + - name: 🔧 Troubleshooting + url: https://github.com/inducer/pycuda/discussions/categories/troubleshooting + about: For troubleshooting help, see the Discussions -- GitLab From d6e22f144fe57ccb56f12e480b27fd6cea27986a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Andreas=20Kl=C3=B6ckner?= Date: Mon, 6 Dec 2021 23:24:19 -0600 Subject: [PATCH 50/61] Tweak bug report template --- .github/ISSUE_TEMPLATE/bug_report.md | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/.github/ISSUE_TEMPLATE/bug_report.md b/.github/ISSUE_TEMPLATE/bug_report.md index e1552cf6..2dcaf386 100644 --- a/.github/ISSUE_TEMPLATE/bug_report.md +++ b/.github/ISSUE_TEMPLATE/bug_report.md @@ -20,10 +20,7 @@ Steps to reproduce the behavior: **Expected behavior** A clear and concise description of what you expected to happen. -**Screenshots** -If applicable, add screenshots to help explain your problem. - -**Desktop (please complete the following information):** +**Environment (please complete the following information):** - OS: [e.g. iOS] - CUDA version: [e.g. 11.1] - CUDA driver version: [e.g. 
470.86] -- GitLab From 59fe0e357320d4b92bd4e960cc7d71771fb7faf0 Mon Sep 17 00:00:00 2001 From: Andreas Kloeckner Date: Tue, 11 Jan 2022 08:17:08 -0600 Subject: [PATCH 51/61] Fix Py2 syntax in tutorial (gh-340) --- doc/tutorial.rst | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/doc/tutorial.rst b/doc/tutorial.rst index 4cc909cf..834d8ea6 100644 --- a/doc/tutorial.rst +++ b/doc/tutorial.rst @@ -68,8 +68,8 @@ original *a*:: a_doubled = numpy.empty_like(a) cuda.memcpy_dtoh(a_doubled, a_gpu) - print a_doubled - print a + print(a_doubled) + print(a) This will print something like this:: @@ -127,8 +127,8 @@ achieved with much less writing:: a_gpu = gpuarray.to_gpu(numpy.random.randn(4,4).astype(numpy.float32)) a_doubled = (2*a_gpu).get() - print a_doubled - print a_gpu + print(a_doubled) + print(a_gpu) Advanced Topics --------------- -- GitLab From 0ca08699a1379d91522dcd4ad71c09ba239e8959 Mon Sep 17 00:00:00 2001 From: mit kotak Date: Fri, 21 Jan 2022 17:32:37 -0600 Subject: [PATCH 52/61] Slightly merged conditional branches in _pow --- pycuda/elementwise.py | 2 +- pycuda/gpuarray.py | 40 +++++++++++++++++++--------------------- 2 files changed, 20 insertions(+), 22 deletions(-) diff --git a/pycuda/elementwise.py b/pycuda/elementwise.py index cbe92953..5c6fb895 100644 --- a/pycuda/elementwise.py +++ b/pycuda/elementwise.py @@ -676,7 +676,7 @@ def get_pow_array_kernel(dtype_x, dtype_y, dtype_z, is_base_array, is_exp_array) raise AssertionError return get_elwise_kernel( - ("%(tp_z)s *z, " + x_ctype + ", "+y_ctype) + (x_ctype + ", " + y_ctype + ", " + "%(tp_z)s *z") % { "tp_x": dtype_to_ctype(dtype_x), "tp_y": dtype_to_ctype(dtype_y), diff --git a/pycuda/gpuarray.py b/pycuda/gpuarray.py index ea06a792..5b6635a0 100644 --- a/pycuda/gpuarray.py +++ b/pycuda/gpuarray.py @@ -783,17 +783,10 @@ class GPUArray: self.dtype, other.dtype, result.dtype, True, True ) - func.prepared_async_call( - self._grid, - self._block, - None, - result.gpudata, - self.gpudata, - other.gpudata, - self.mem_size, - ) + x_data = self.gpudata + y_data = other.gpudata + z_data = result.gpudata - return result else: if not self.flags.forc: raise RuntimeError( @@ -806,17 +799,22 @@ class GPUArray: else: result = self func = elementwise.get_pow_kernel(self.dtype) - func.prepared_async_call( - self._grid, - self._block, - None, - other, - self.gpudata, - result.gpudata, - self.mem_size, - ) - return result + x_data = other + y_data = self.gpudata + z_data = result.gpudata + + func.prepared_async_call( + self._grid, + self._block, + None, + x_data, + y_data, + z_data, + self.mem_size, + ) + + return result def __pow__(self, other): """pow function:: @@ -854,9 +852,9 @@ class GPUArray: is_base_array=not np.isscalar(other), is_exp_array=not np.isscalar(self)) # Evaluates z = x ** y func.prepared_async_call(self._grid, self._block, None, - result.gpudata, # z base if np.isscalar(base) else base.gpudata, # x self.gpudata, # y + result.gpudata, # z self.mem_size) return result -- GitLab From ea42e668dbf24388762b1117784eb700da6dfd24 Mon Sep 17 00:00:00 2001 From: Kaushik Kulkarni Date: Fri, 21 Jan 2022 21:10:33 -0600 Subject: [PATCH 53/61] roll back to main's compyte version --- pycuda/compyte | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pycuda/compyte b/pycuda/compyte index 71bffa1a..165b3aba 160000 --- a/pycuda/compyte +++ b/pycuda/compyte @@ -1 +1 @@ -Subproject commit 71bffa1ae64ed98b9d922c79a6f9cc7eb4fd642f +Subproject commit 165b3abae63bc39124a342ce1a539adbf6cd8a09 -- GitLab From 
157d1d3d19666c915b281b6c1ff9097c6cbc25d0 Mon Sep 17 00:00:00 2001 From: mit kotak Date: Fri, 4 Feb 2022 18:17:34 -0600 Subject: [PATCH 54/61] eliminated get_rpow_kernel + updated test tol --- aksetup_helper.py | 4 ++-- pycuda/gpuarray.py | 27 ++++++++++----------- test.py | 56 +++++++++++++++++++++++++++++++++++++++++++ test/test_gpuarray.py | 10 ++++---- 4 files changed, 75 insertions(+), 22 deletions(-) create mode 100644 test.py diff --git a/aksetup_helper.py b/aksetup_helper.py index 6543215f..342911e8 100644 --- a/aksetup_helper.py +++ b/aksetup_helper.py @@ -146,7 +146,7 @@ def get_config(schema=None, warn_about_no_config=True): return config -def hack_distutils(debug=False, fast_link=True, what_opt=3): +def hack_distutils(debug=True, fast_link=True, what_opt=0): # hack distutils.sysconfig to eliminate debug flags # stolen from mpi4py @@ -169,7 +169,7 @@ def hack_distutils(debug=False, fast_link=True, what_opt=3): if cflags: cflags = remove_prefixes(cflags.split(), bad_prefixes) if debug: - cflags.append("-g") + cflags.append("-g3") else: if what_opt is None: pass diff --git a/pycuda/gpuarray.py b/pycuda/gpuarray.py index 5b6635a0..cf1b62e4 100644 --- a/pycuda/gpuarray.py +++ b/pycuda/gpuarray.py @@ -779,14 +779,6 @@ class GPUArray: else: result = self - func = elementwise.get_pow_array_kernel( - self.dtype, other.dtype, result.dtype, True, True - ) - - x_data = self.gpudata - y_data = other.gpudata - z_data = result.gpudata - else: if not self.flags.forc: raise RuntimeError( @@ -798,19 +790,24 @@ class GPUArray: result = self._new_like_me() else: result = self - func = elementwise.get_pow_kernel(self.dtype) - x_data = other - y_data = self.gpudata - z_data = result.gpudata + func = elementwise.get_pow_array_kernel( + self.dtype, + _get_common_dtype(self,other).type(other).dtype + if np.isscalar(other) else + other.dtype, + result.dtype, + not np.isscalar(self), + not np.isscalar(other) + ) func.prepared_async_call( self._grid, self._block, None, - x_data, - y_data, - z_data, + self.gpudata, + other if np.isscalar(other) else other.gpudata, + result.gpudata, self.mem_size, ) diff --git a/test.py b/test.py new file mode 100644 index 00000000..f0da15da --- /dev/null +++ b/test.py @@ -0,0 +1,56 @@ +import pycuda.driver as cuda +import pycuda.autoinit +from pycuda.compiler import SourceModule +import pycuda.gpuarray as gpuarray +import numpy as np + +N = 222341 + +func_mod = SourceModule(""" +extern "C" { +__global__ void func(float *a, int N, float minval, int denom) +{ +int idx = threadIdx.x+threadIdx.y*32+blockIdx.x*blockDim.x; +if (idx < N) + a[idx] = (a[idx]-minval)/denom; +} +} +""", no_extern_c=1) + +func = func_mod.get_function('func') + + +values = np.random.randn(N) +number_of_blocks=N/1024 + +graph = cuda.Graph() +node1 = graph.add_empty_node() +node2 = graph.add_kernel_node([node1],func,(1024, 1, 1),(number_of_blocks+1,1,1),1) +# node1 = graph.add_empty_node() +# node2 = graph.add_empty_node([node1]) +# print(graph.get_dependent_nodes(node2)) +# print(graph.get_nodes()) +# print("Printing out all the nodes") +# print(graph.get_nodes()) +# print("Removing dependency between root node and child node") +# graph.remove_dependencies(node1,[node2]) +# print("Printing out all dependencies") +# print(graph.get_dependencies(node1)) +# print("Adding dependency between root node and child node") +# graph.add_dependencies(node1,[node2]) +# print("Printing out root nodes") +# print(graph.get_root_nodes()) + + +# graph2 = cuda.Graph() +# node3 = graph2.add_empty_node() +# node1 = 
graph.add_child_graph_node(graph2,[]) +# print(graph.get_child_graph(node1)) +# event = cuda.Event() +# node5 = graph.add_event_record_node(event,[]) +# event2 = cuda.Event() +# graph.set_record_node_event(node5,event2) + +# event3 = graph.get_event_from_event_record_node(node5) +# graph.debug_dot_print("test.dot") + diff --git a/test/test_gpuarray.py b/test/test_gpuarray.py index 3e18e912..3daeb932 100644 --- a/test/test_gpuarray.py +++ b/test/test_gpuarray.py @@ -21,14 +21,14 @@ class TestGPUArray: b_gpu = gpuarray.to_gpu(b) result = pow(a_gpu, b_gpu).get() - assert (np.abs(a ** b - result) < 1e-3).all() + np.testing.assert_allclose(a ** b, result, rtol=1e-6) result = (a_gpu ** b_gpu).get() - assert (np.abs(pow(a, b) - result) < 1e-3).all() + np.testing.assert_allclose(pow(a,b), result, rtol=1e-6) a_gpu **= b_gpu a_gpu = a_gpu.get() - assert (np.abs(pow(a, b) - a_gpu) < 1e-3).all() + np.testing.assert_allclose(pow(a,b), a_gpu, rtol=1e-6) @mark_cuda_test def test_pow_number(self): @@ -36,11 +36,11 @@ class TestGPUArray: a_gpu = gpuarray.to_gpu(a) result = pow(a_gpu, 2).get() - assert (np.abs(a ** 2 - result) < 1e-3).all() + np.testing.assert_allclose(a ** 2, result, rtol=1e-6) a_gpu **= 2 a_gpu = a_gpu.get() - assert (np.abs(a ** 2 - a_gpu) < 1e-3).all() + np.testing.assert_allclose(a ** 2, a_gpu, rtol=1e-6) @mark_cuda_test def test_rpow_array(self): -- GitLab From 3121b8ed78eb6eb9da53e2cd800388a98ff02570 Mon Sep 17 00:00:00 2001 From: mit kotak Date: Fri, 4 Feb 2022 18:30:58 -0600 Subject: [PATCH 55/61] retiring get_pow + resolved flake8 --- pycuda/elementwise.py | 18 ------------------ pycuda/gpuarray.py | 16 ++++++++-------- test/test_gpuarray.py | 4 ++-- 3 files changed, 10 insertions(+), 28 deletions(-) diff --git a/pycuda/elementwise.py b/pycuda/elementwise.py index 5c6fb895..06d1fcb1 100644 --- a/pycuda/elementwise.py +++ b/pycuda/elementwise.py @@ -629,24 +629,6 @@ def get_arange_kernel(dtype): "arange", ) - -@context_dependent_memoize -def get_pow_kernel(dtype): - if dtype == np.float32: - func = "powf" - else: - func = "pow" - - return get_elwise_kernel( - "%(tp)s value, %(tp)s *y, %(tp)s *z" - % { - "tp": dtype_to_ctype(dtype), - }, - "z[i] = %s(y[i], value)" % func, - "pow_method", - ) - - @context_dependent_memoize def get_pow_array_kernel(dtype_x, dtype_y, dtype_z, is_base_array, is_exp_array): """ diff --git a/pycuda/gpuarray.py b/pycuda/gpuarray.py index cf1b62e4..8d6d9fd0 100644 --- a/pycuda/gpuarray.py +++ b/pycuda/gpuarray.py @@ -792,14 +792,14 @@ class GPUArray: result = self func = elementwise.get_pow_array_kernel( - self.dtype, - _get_common_dtype(self,other).type(other).dtype - if np.isscalar(other) else - other.dtype, - result.dtype, - not np.isscalar(self), - not np.isscalar(other) - ) + self.dtype, + _get_common_dtype(self, other).type(other).dtype + if np.isscalar(other) else + other.dtype, + result.dtype, + not np.isscalar(self), + not np.isscalar(other) + ) func.prepared_async_call( self._grid, diff --git a/test/test_gpuarray.py b/test/test_gpuarray.py index 3daeb932..73ec3ade 100644 --- a/test/test_gpuarray.py +++ b/test/test_gpuarray.py @@ -24,11 +24,11 @@ class TestGPUArray: np.testing.assert_allclose(a ** b, result, rtol=1e-6) result = (a_gpu ** b_gpu).get() - np.testing.assert_allclose(pow(a,b), result, rtol=1e-6) + np.testing.assert_allclose(pow(a, b), result, rtol=1e-6) a_gpu **= b_gpu a_gpu = a_gpu.get() - np.testing.assert_allclose(pow(a,b), a_gpu, rtol=1e-6) + np.testing.assert_allclose(pow(a, b), a_gpu, rtol=1e-6) @mark_cuda_test def 
test_pow_number(self): -- GitLab From 38b1ee593c12cf0d356b4437fc0b8af626136547 Mon Sep 17 00:00:00 2001 From: mit kotak Date: Fri, 4 Feb 2022 18:36:38 -0600 Subject: [PATCH 56/61] Added space to elementwise.py to resolve flake8 --- pycuda/elementwise.py | 1 + 1 file changed, 1 insertion(+) diff --git a/pycuda/elementwise.py b/pycuda/elementwise.py index 06d1fcb1..1f428b89 100644 --- a/pycuda/elementwise.py +++ b/pycuda/elementwise.py @@ -629,6 +629,7 @@ def get_arange_kernel(dtype): "arange", ) + @context_dependent_memoize def get_pow_array_kernel(dtype_x, dtype_y, dtype_z, is_base_array, is_exp_array): """ -- GitLab From c1331fc1756016cc71b6429226398c7269d5117b Mon Sep 17 00:00:00 2001 From: Mit Kotak Date: Sun, 6 Feb 2022 20:26:39 +0000 Subject: [PATCH 57/61] Removed spurious diff --- aksetup_helper.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/aksetup_helper.py b/aksetup_helper.py index 342911e8..a0700fcf 100644 --- a/aksetup_helper.py +++ b/aksetup_helper.py @@ -146,7 +146,7 @@ def get_config(schema=None, warn_about_no_config=True): return config -def hack_distutils(debug=True, fast_link=True, what_opt=0): +def hack_distutils(debug=False, fast_link=True, what_opt=3): # hack distutils.sysconfig to eliminate debug flags # stolen from mpi4py -- GitLab From 1a9fd48d204199477b8cdbbd092c22b412ac7246 Mon Sep 17 00:00:00 2001 From: mit kotak Date: Sun, 6 Feb 2022 14:31:08 -0600 Subject: [PATCH 58/61] Casting issue resolved --- pycuda/elementwise.py | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/pycuda/elementwise.py b/pycuda/elementwise.py index 1f428b89..985fec75 100644 --- a/pycuda/elementwise.py +++ b/pycuda/elementwise.py @@ -635,10 +635,12 @@ def get_pow_array_kernel(dtype_x, dtype_y, dtype_z, is_base_array, is_exp_array) """ Returns the kernel for the operation: ``z = x ** y`` """ - if np.float64 in [dtype_x, dtype_y]: - func = "pow" - else: + if dtype_z == np.float32: func = "powf" + else: + # FIXME: Casting args to double-precision not + # ideal for all cases (ex. 
int args) + func = "pow" if not is_base_array and is_exp_array: x_ctype = "%(tp_x)s x" -- GitLab From 34a905e8546ba100a33d8435074d5bd30f8fee4a Mon Sep 17 00:00:00 2001 From: Mit Kotak Date: Sun, 6 Feb 2022 22:16:14 +0000 Subject: [PATCH 59/61] removing more spurious differences --- aksetup_helper.py | 2 +- test.py | 56 ----------------------------------------------- 2 files changed, 1 insertion(+), 57 deletions(-) delete mode 100644 test.py diff --git a/aksetup_helper.py b/aksetup_helper.py index a0700fcf..6543215f 100644 --- a/aksetup_helper.py +++ b/aksetup_helper.py @@ -169,7 +169,7 @@ def hack_distutils(debug=False, fast_link=True, what_opt=3): if cflags: cflags = remove_prefixes(cflags.split(), bad_prefixes) if debug: - cflags.append("-g3") + cflags.append("-g") else: if what_opt is None: pass diff --git a/test.py b/test.py deleted file mode 100644 index f0da15da..00000000 --- a/test.py +++ /dev/null @@ -1,56 +0,0 @@ -import pycuda.driver as cuda -import pycuda.autoinit -from pycuda.compiler import SourceModule -import pycuda.gpuarray as gpuarray -import numpy as np - -N = 222341 - -func_mod = SourceModule(""" -extern "C" { -__global__ void func(float *a, int N, float minval, int denom) -{ -int idx = threadIdx.x+threadIdx.y*32+blockIdx.x*blockDim.x; -if (idx < N) - a[idx] = (a[idx]-minval)/denom; -} -} -""", no_extern_c=1) - -func = func_mod.get_function('func') - - -values = np.random.randn(N) -number_of_blocks=N/1024 - -graph = cuda.Graph() -node1 = graph.add_empty_node() -node2 = graph.add_kernel_node([node1],func,(1024, 1, 1),(number_of_blocks+1,1,1),1) -# node1 = graph.add_empty_node() -# node2 = graph.add_empty_node([node1]) -# print(graph.get_dependent_nodes(node2)) -# print(graph.get_nodes()) -# print("Printing out all the nodes") -# print(graph.get_nodes()) -# print("Removing dependency between root node and child node") -# graph.remove_dependencies(node1,[node2]) -# print("Printing out all dependencies") -# print(graph.get_dependencies(node1)) -# print("Adding dependency between root node and child node") -# graph.add_dependencies(node1,[node2]) -# print("Printing out root nodes") -# print(graph.get_root_nodes()) - - -# graph2 = cuda.Graph() -# node3 = graph2.add_empty_node() -# node1 = graph.add_child_graph_node(graph2,[]) -# print(graph.get_child_graph(node1)) -# event = cuda.Event() -# node5 = graph.add_event_record_node(event,[]) -# event2 = cuda.Event() -# graph.set_record_node_event(node5,event2) - -# event3 = graph.get_event_from_event_record_node(node5) -# graph.debug_dot_print("test.dot") - -- GitLab From e85ea5c4297a967842111f6638163bf10d90319e Mon Sep 17 00:00:00 2001 From: Kaushik Kulkarni Date: Thu, 10 Feb 2022 00:05:12 -0600 Subject: [PATCH 60/61] GPUArray._pow: minor cleanup --- pycuda/gpuarray.py | 41 ++++++++++++++--------------------------- 1 file changed, 14 insertions(+), 27 deletions(-) diff --git a/pycuda/gpuarray.py b/pycuda/gpuarray.py index 8d6d9fd0..36a39dc3 100644 --- a/pycuda/gpuarray.py +++ b/pycuda/gpuarray.py @@ -764,42 +764,29 @@ class GPUArray: Do the pow operator. 
with new, the user can choose between ipow or just pow """ + common_dtype = _get_common_dtype(self, other) + if new: + result = self._new_like_me(common_dtype) + else: + result = self - if isinstance(other, GPUArray): - if not self.flags.forc or not other.flags.forc: - raise RuntimeError( - "only contiguous arrays may " - "be used as arguments to this operation" - ) - - assert self.shape == other.shape - - if new: - result = self._new_like_me(_get_common_dtype(self, other)) - else: - result = self + # {{{ sanity checks - else: - if not self.flags.forc: - raise RuntimeError( - "only contiguous arrays may " - "be used as arguments to this operation" - ) + if (not self.flags.forc) or (isinstance(other, GPUArray) + and not other.flags.forc): + raise RuntimeError("only contiguous arrays may " + "be used as arguments to this operation") + assert not isinstance(other, GPUArray) or other.shape == self.shape - if new: - result = self._new_like_me() - else: - result = self + # }}} func = elementwise.get_pow_array_kernel( self.dtype, - _get_common_dtype(self, other).type(other).dtype - if np.isscalar(other) else - other.dtype, + common_dtype if np.isscalar(other) else other.dtype, result.dtype, not np.isscalar(self), not np.isscalar(other) - ) + ) func.prepared_async_call( self._grid, -- GitLab From 33f8529967f350ab7acf2eac51d796101efc89f2 Mon Sep 17 00:00:00 2001 From: Kaushik Kulkarni Date: Thu, 10 Feb 2022 00:06:44 -0600 Subject: [PATCH 61/61] formatting: removes indent --- pycuda/elementwise.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pycuda/elementwise.py b/pycuda/elementwise.py index 985fec75..4e8601f0 100644 --- a/pycuda/elementwise.py +++ b/pycuda/elementwise.py @@ -669,7 +669,7 @@ def get_pow_array_kernel(dtype_x, dtype_y, dtype_z, is_base_array, is_exp_array) }, "z[i] = %s" % func, name="pow_method" - ) + ) @context_dependent_memoize -- GitLab
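As a closing illustration of the behaviour this series converges on (``__pow__``, ``__ipow__``, and the newly added ``__rpow__`` all dispatching to ``get_pow_array_kernel``), a short usage sketch follows. It is not part of the patches themselves; the data and tolerances mirror the tests added above.

    import numpy as np
    import pycuda.autoinit  # noqa: F401  (creates a context)
    import pycuda.gpuarray as gpuarray

    a = np.array([1, 2, 3, 4, 5], dtype=np.float32)
    b = np.array([1, 2, 3, 4, 5], dtype=np.float64)
    a_gpu = gpuarray.to_gpu(a)
    b_gpu = gpuarray.to_gpu(b)

    # array ** array (mixed dtypes promote to the common dtype)
    np.testing.assert_allclose((a_gpu ** b_gpu).get(), a ** b, rtol=1e-6)

    # array ** scalar
    np.testing.assert_allclose((a_gpu ** 2).get(), a ** 2, rtol=1e-6)

    # scalar ** array, i.e. __rpow__, introduced at the start of this series
    np.testing.assert_allclose((2.0 ** a_gpu).get(), 2.0 ** a, rtol=1e-6)

    # in-place power
    a_gpu **= b_gpu
    np.testing.assert_allclose(a_gpu.get(), a ** b, rtol=1e-6)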