From 8ae62aaa2f707fc55ff0e9190bb0ec44990ff29e Mon Sep 17 00:00:00 2001
From: Vincent Favre-Nicolin <favre@esrf.fr>
Date: Sun, 14 Jul 2024 21:12:46 +0200
Subject: [PATCH 1/8] Compatibility with numpy 2

---
 pycuda/gpuarray.py             |  2 +-
 pyproject.toml                 |  6 ++----
 src/wrapper/_pvt_struct_v3.cpp |  4 ++--
 src/wrapper/mempool.cpp        | 15 ++++++++-------
 src/wrapper/wrap_cudadrv.cpp   | 22 ++++++++++++++--------
 test/test_gpuarray.py          |  2 +-
 6 files changed, 28 insertions(+), 23 deletions(-)

diff --git a/pycuda/gpuarray.py b/pycuda/gpuarray.py
index 4f739ad6..b4399042 100644
--- a/pycuda/gpuarray.py
+++ b/pycuda/gpuarray.py
@@ -1883,7 +1883,7 @@ def concatenate(arrays, axis=0, allocator=None):
     # }}}
 
     shape = tuple(shape)
-    dtype = np.find_common_type([ary.dtype for ary in arrays], [])
+    dtype = np.result_type(*(ary.dtype for ary in arrays))
     result = empty(shape, dtype, allocator=allocator)
 
     full_slice = (slice(None),) * len(shape)
diff --git a/pyproject.toml b/pyproject.toml
index 2bc8218a..98caf09e 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -1,10 +1,8 @@
 [build-system]
 
-# For each Python version, build against the oldest numpy C_API_VERSION for
-# which binary numpy wheels exist, and then the newest version of numpy
-# implementing that C_API_VERSION.
+# See https://github.com/scipy/oldest-supported-numpy deprecation notice
 requires = [
     "setuptools",
     "wheel",
-    "oldest-supported-numpy",
+    "numpy>=2.0",
 ]
diff --git a/src/wrapper/_pvt_struct_v3.cpp b/src/wrapper/_pvt_struct_v3.cpp
index 2b10cc04..ab488b31 100644
--- a/src/wrapper/_pvt_struct_v3.cpp
+++ b/src/wrapper/_pvt_struct_v3.cpp
@@ -743,7 +743,7 @@ np_complex_float(char *p, PyObject *v, const formatdef *f)
                 NPY_CFLOAT);
         if (!v_cast)
             return -1;
-        memcpy(p, PyArray_DATA(v_cast), PyArray_NBYTES(v_cast));
+        memcpy(p, PyArray_DATA((PyArrayObject*)v_cast), PyArray_NBYTES((PyArrayObject*)v_cast));
         Py_DECREF(v_cast);
     }
     else {
@@ -773,7 +773,7 @@ np_complex_double(char *p, PyObject *v, const formatdef *f)
                 NPY_CDOUBLE);
         if (!v_cast)
             return -1;
-        memcpy(p, PyArray_DATA(v_cast), PyArray_NBYTES(v_cast));
+        memcpy(p, PyArray_DATA((PyArrayObject*)v_cast), PyArray_NBYTES((PyArrayObject*)v_cast));
         Py_DECREF(v_cast);
     }
     else {
diff --git a/src/wrapper/mempool.cpp b/src/wrapper/mempool.cpp
index d889d516..07c00de9 100644
--- a/src/wrapper/mempool.cpp
+++ b/src/wrapper/mempool.cpp
@@ -8,8 +8,9 @@
 #include <mempool.hpp>
 #include <boost/python/stl_iterator.hpp>
 
-
-
+#if NPY_ABI_VERSION < 0x02000000
+  #define PyDataType_ELSIZE(descr) ((descr)->elsize)
+#endif
 
 namespace py = boost::python;
 
@@ -194,15 +195,15 @@ namespace
 
     std::unique_ptr<pooled_host_allocation> alloc(
         new pooled_host_allocation(
-          pool, tp_descr->elsize*pycuda::size_from_dims(dims.size(), &dims.front())));
+          pool, PyDataType_ELSIZE(tp_descr)*pycuda::size_from_dims(dims.size(), &dims.front())));
 
-    NPY_ORDER order = PyArray_CORDER;
+    NPY_ORDER order = NPY_CORDER;
     PyArray_OrderConverter(order_py.ptr(), &order);
 
     int flags = 0;
-    if (order == PyArray_FORTRANORDER)
+    if (order == NPY_FORTRANORDER)
       flags |= NPY_FARRAY;
-    else if (order == PyArray_CORDER)
+    else if (order == NPY_CORDER)
       flags |= NPY_CARRAY;
     else
       throw std::runtime_error("unrecognized order specifier");
@@ -213,7 +214,7 @@ namespace
         alloc->ptr(), flags, /*obj*/NULL));
 
     py::handle<> alloc_py(handle_from_new_ptr(alloc.release()));
-    PyArray_BASE(result.get()) = alloc_py.get();
+    PyArray_SetBaseObject((PyArrayObject*)result.get(), alloc_py.get());
     Py_INCREF(alloc_py.get());
 
     return result;
diff --git a/src/wrapper/wrap_cudadrv.cpp b/src/wrapper/wrap_cudadrv.cpp
index 3758689b..5d53271d 100644
--- a/src/wrapper/wrap_cudadrv.cpp
+++ b/src/wrapper/wrap_cudadrv.cpp
@@ -10,6 +10,9 @@
 #include "wrap_helpers.hpp"
 #include <boost/python/stl_iterator.hpp>
 
+#if NPY_ABI_VERSION < 0x02000000
+  #define PyDataType_ELSIZE(descr) ((descr)->elsize)
+#endif
 
 
@@ -573,17 +576,17 @@ namespace
 
     std::unique_ptr<Allocation> alloc(
         new Allocation(
-          tp_descr->elsize*pycuda::size_from_dims(dims.size(), &dims.front()),
+          PyDataType_ELSIZE(tp_descr)*pycuda::size_from_dims(dims.size(), &dims.front()),
           par1)
         );
 
-    NPY_ORDER order = PyArray_CORDER;
+    NPY_ORDER order = NPY_CORDER;
     PyArray_OrderConverter(order_py.ptr(), &order);
 
     int ary_flags = 0;
-    if (order == PyArray_FORTRANORDER)
+    if (order == NPY_FORTRANORDER)
       ary_flags |= NPY_FARRAY;
-    else if (order == PyArray_CORDER)
+    else if (order == NPY_CORDER)
       ary_flags |= NPY_CARRAY;
     else
       throw pycuda::error("numpy_empty", CUDA_ERROR_INVALID_VALUE,
@@ -595,7 +598,7 @@ namespace
         alloc->data(), ary_flags, /*obj*/NULL));
 
     py::handle<> alloc_py(handle_from_new_ptr(alloc.release()));
-    PyArray_BASE(result.get()) = alloc_py.get();
+    PyArray_SetBaseObject((PyArrayObject*)result.get(), alloc_py.get());
     Py_INCREF(alloc_py.get());
 
     return result;
@@ -608,13 +611,15 @@ namespace
       throw pycuda::error("register_host_memory", CUDA_ERROR_INVALID_VALUE,
           "ary argument is not a numpy array");
 
-    if (!PyArray_ISCONTIGUOUS(ary.ptr()))
+    if (!PyArray_ISCONTIGUOUS((PyArrayObject*)ary.ptr()))
       throw pycuda::error("register_host_memory", CUDA_ERROR_INVALID_VALUE,
           "ary argument is not contiguous");
 
     std::unique_ptr<registered_host_memory> regmem(
         new registered_host_memory(
-          PyArray_DATA(ary.ptr()), PyArray_NBYTES(ary.ptr()), flags, ary));
+          PyArray_DATA((PyArrayObject*)ary.ptr()),
+          PyArray_NBYTES((PyArrayObject*)ary.ptr()),
+          flags, ary));
 
     PyObject *new_array_ptr = PyArray_FromInterface(ary.ptr());
     if (new_array_ptr == Py_NotImplemented)
@@ -624,7 +629,8 @@ namespace
     py::handle<> result(new_array_ptr);
 
     py::handle<> regmem_py(handle_from_new_ptr(regmem.release()));
-    PyArray_BASE(result.get()) = regmem_py.get();
+    // ValueError: Cannot set the NumPy array 'base' dependency more than once
+    PyArray_SetBaseObject((PyArrayObject*)result.get(), regmem_py.get());
     Py_INCREF(regmem_py.get());
 
     return result;
diff --git a/test/test_gpuarray.py b/test/test_gpuarray.py
index 629a2630..2fa72157 100644
--- a/test/test_gpuarray.py
+++ b/test/test_gpuarray.py
@@ -1376,7 +1376,7 @@ class TestGPUArray:
         for i in range(10):
             red(a_gpu[i], out=max_gpu[i])
 
-        assert np.alltrue(a.max(axis=1) == max_gpu.get())
+        assert np.all(a.max(axis=1) == max_gpu.get())
 
     def test_sum_allocator(self):
         # FIXME
-- 
GitLab


From f820b84f31cdbc5ae16ce467607774dd45882f1e Mon Sep 17 00:00:00 2001
From: Vincent Favre-Nicolin <favre@esrf.fr>
Date: Sun, 14 Jul 2024 21:19:17 +0200
Subject: [PATCH 2/8] Update compyte version for numpy 2

---
 pycuda/compyte | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/pycuda/compyte b/pycuda/compyte
index d4549d4c..f6d87a19 160000
--- a/pycuda/compyte
+++ b/pycuda/compyte
@@ -1 +1 @@
-Subproject commit d4549d4c711513e2cc098d3f5d4e918eac53ee7a
+Subproject commit f6d87a19ec50c86654845fee7fc6876154087ff1
-- 
GitLab


From ed783b3c8ca8da712f06fed872e7fdccdf344e3b Mon Sep 17 00:00:00 2001
From: Vincent Favre-Nicolin <favre@esrf.fr>
Date: Mon, 15 Jul 2024 12:15:18 +0200
Subject: [PATCH 3/8] Make sure array shape and strides are int and not
 np.intNN types which could overflow when using numpy 2

---
 pycuda/gpuarray.py | 8 +++++++-
 1 file changed, 7 insertions(+), 1 deletion(-)

diff --git a/pycuda/gpuarray.py b/pycuda/gpuarray.py
index b4399042..b2d3ac37 100644
--- a/pycuda/gpuarray.py
+++ b/pycuda/gpuarray.py
@@ -242,6 +242,10 @@ class GPUArray:
             # bombs if s is a Python integer
             s = s.item()
 
+        # Make sure shape is made of int and not e.g. np.int32 as these can overflow
+        # e.g. in __getitem__() when adding the new_offset...
+        shape = tuple(int(v) for v in shape)
+
         if strides is None:
             if order == "F":
                 strides = _f_contiguous_strides(dtype.itemsize, shape)
@@ -255,7 +259,9 @@ class GPUArray:
 
             strides = tuple(strides)
 
-        self.shape = tuple(shape)
+        strides = tuple(int(v) for v in strides)
+
+        self.shape = shape
         self.dtype = dtype
         self.strides = strides
         self.mem_size = self.size = s
-- 
GitLab


From bf012c532c49d03b509e29a3585661b1696c472f Mon Sep 17 00:00:00 2001
From: Andreas Kloeckner <inform@tiker.net>
Date: Mon, 15 Jul 2024 13:42:22 -0500
Subject: [PATCH 4/8] Fix compyte git submodule URL

---
 .gitmodules | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.gitmodules b/.gitmodules
index 5ca5d423..14599ac3 100644
--- a/.gitmodules
+++ b/.gitmodules
@@ -3,4 +3,4 @@
 	url = https://github.com/inducer/bpl-subset
 [submodule "pycuda/compyte"]
 	path = pycuda/compyte
-	url = https://github.com/inducer/compyte
+	url = https://github.com/inducer/compyte.git
-- 
GitLab


From 0b905ece74d53c301a08e839e966964975274e0d Mon Sep 17 00:00:00 2001
From: Andreas Kloeckner <inform@tiker.net>
Date: Mon, 15 Jul 2024 13:43:37 -0500
Subject: [PATCH 5/8] Update compyte

---
 pycuda/compyte | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/pycuda/compyte b/pycuda/compyte
index f6d87a19..955160ac 160000
--- a/pycuda/compyte
+++ b/pycuda/compyte
@@ -1 +1 @@
-Subproject commit f6d87a19ec50c86654845fee7fc6876154087ff1
+Subproject commit 955160ac2f504dabcd8641471a56146fa1afe35d
-- 
GitLab


From 1b760e4b8c5c1d8a8ea59c69708e83c474f0eecc Mon Sep 17 00:00:00 2001
From: Andreas Kloeckner <inform@tiker.net>
Date: Tue, 23 Jul 2024 14:44:48 -0500
Subject: [PATCH 6/8] Skip texture tests on CUDA 12+

---
 test/test_driver.py   | 33 +++++++++++++++++++++++++++++++++
 test/test_gpuarray.py |  6 ++++++
 2 files changed, 39 insertions(+)

diff --git a/test/test_driver.py b/test/test_driver.py
index 9deae3be..be526971 100644
--- a/test/test_driver.py
+++ b/test/test_driver.py
@@ -181,6 +181,9 @@ class TestDriver:
 
     @mark_cuda_test
     def test_2d_texture(self):
+        if drv.get_driver_version() // 1000 >= 12:
+            pytest.skip("texture references were removed in CUDA 12")
+
         mod = SourceModule(
             """
         texture<float, 2, cudaReadModeElementType> mtx_tex;
@@ -208,6 +211,9 @@ class TestDriver:
 
     @mark_cuda_test
     def test_multiple_2d_textures(self):
+        if drv.get_driver_version() // 1000 >= 12:
+            pytest.skip("texture references were removed in CUDA 12")
+
         mod = SourceModule(
             """
         texture<float, 2, cudaReadModeElementType> mtx_tex;
@@ -242,6 +248,9 @@ class TestDriver:
 
     @mark_cuda_test
     def test_multichannel_2d_texture(self):
+        if drv.get_driver_version() // 1000 >= 12:
+            pytest.skip("texture references were removed in CUDA 12")
+
         mod = SourceModule(
             """
         #define CHANNELS 4
@@ -280,6 +289,9 @@ class TestDriver:
 
     @mark_cuda_test
     def test_multichannel_linear_texture(self):
+        if drv.get_driver_version() // 1000 >= 12:
+            pytest.skip("texture references were removed in CUDA 12")
+
         mod = SourceModule(
             """
         #define CHANNELS 4
@@ -315,6 +327,9 @@ class TestDriver:
 
     @mark_cuda_test
     def test_2d_fp_textures(self):
+        if drv.get_driver_version() // 1000 >= 12:
+            pytest.skip("texture references were removed in CUDA 12")
+
         orden = "F"
         npoints = 32
 
@@ -369,6 +384,9 @@ class TestDriver:
 
     @mark_cuda_test
     def test_2d_fp_textures_layered(self):
+        if drv.get_driver_version() // 1000 >= 12:
+            pytest.skip("texture references were removed in CUDA 12")
+
         orden = "F"
         npoints = 32
 
@@ -423,6 +441,9 @@ class TestDriver:
 
     @mark_cuda_test
     def test_3d_fp_textures(self):
+        if drv.get_driver_version() // 1000 >= 12:
+            pytest.skip("texture references were removed in CUDA 12")
+
         orden = "C"
         npoints = 32
 
@@ -477,6 +498,9 @@ class TestDriver:
 
     @mark_cuda_test
     def test_3d_fp_surfaces(self):
+        if drv.get_driver_version() // 1000 >= 12:
+            pytest.skip("surface references were removed in CUDA 12")
+
         orden = "C"
         npoints = 32
 
@@ -556,6 +580,9 @@ class TestDriver:
 
     @mark_cuda_test
     def test_2d_fp_surfaces(self):
+        if drv.get_driver_version() // 1000 >= 12:
+            pytest.skip("surface references were removed in CUDA 12")
+
         orden = "C"
         npoints = 32
 
@@ -734,6 +761,9 @@ class TestDriver:
 
     @mark_cuda_test
     def test_3d_texture(self):
+        if drv.get_driver_version() // 1000 >= 12:
+            pytest.skip("texture references were removed in CUDA 12")
+
         # adapted from code by Nicolas Pinto
         w = 2
         h = 4
@@ -842,6 +872,9 @@ class TestDriver:
 
     @mark_cuda_test
     def test_fp_textures(self):
+        if drv.get_driver_version() // 1000 >= 12:
+            pytest.skip("texture references were removed in CUDA 12")
+
         if drv.Context.get_device().compute_capability() < (1, 3):
             return
 
diff --git a/test/test_gpuarray.py b/test/test_gpuarray.py
index 2fa72157..0bc37eb4 100644
--- a/test/test_gpuarray.py
+++ b/test/test_gpuarray.py
@@ -696,6 +696,9 @@ class TestGPUArray:
             assert la.norm(a_cpu - a_gpu.get()) == 0, i
 
     def test_take(self):
+        if drv.get_driver_version() // 1000 >= 12:
+            pytest.skip("texture references were removed in CUDA 12")
+
         idx = gpuarray.arange(0, 10000, 2, dtype=np.uint32)
         for dtype in [np.float32, np.complex64]:
             a = gpuarray.arange(0, 600000, dtype=np.uint32).astype(dtype)
@@ -1055,6 +1058,9 @@ class TestGPUArray:
         assert la.norm(min_a_b_gpu.get() - np.minimum(a, b)) == 0
 
     def test_take_put(self):
+        if drv.get_driver_version() // 1000 >= 12:
+            pytest.skip("texture references were removed in CUDA 12")
+
         for n in [5, 17, 333]:
             one_field_size = 8
             buf_gpu = gpuarray.zeros(n * one_field_size, dtype=np.float32)
-- 
GitLab


From 276ec2a88f5d97bd0418468dc66eac6db7ecec42 Mon Sep 17 00:00:00 2001
From: Andreas Kloeckner <inform@tiker.net>
Date: Tue, 23 Jul 2024 15:01:12 -0500
Subject: [PATCH 7/8] xfail test_register_host_memory

---
 test/test_driver.py | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/test/test_driver.py b/test/test_driver.py
index be526971..d167aed2 100644
--- a/test/test_driver.py
+++ b/test/test_driver.py
@@ -948,6 +948,8 @@ class TestDriver:
 
     @mark_cuda_test
     def test_register_host_memory(self):
+        pytest.xfail("known issue: must fix array creation")
+
         if drv.get_version() < (4,):
             from py.test import skip
 
-- 
GitLab


From e99dbe3a084b32c44fbfab3ffd1be4c65a4f0e21 Mon Sep 17 00:00:00 2001
From: Andreas Kloeckner <inform@tiker.net>
Date: Tue, 23 Jul 2024 15:02:02 -0500
Subject: [PATCH 8/8] Ifdef out pycuda helpers header on CUDA 12

---
 pycuda/cuda/pycuda-helpers.hpp | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/pycuda/cuda/pycuda-helpers.hpp b/pycuda/cuda/pycuda-helpers.hpp
index 5f25092a..f1f7dec2 100644
--- a/pycuda/cuda/pycuda-helpers.hpp
+++ b/pycuda/cuda/pycuda-helpers.hpp
@@ -12,6 +12,7 @@ extern "C++" {
   typedef uint2 fp_tex_cfloat;
   typedef int4 fp_tex_cdouble;
 
+#if __CUDACC_VER_MAJOR__ < 12
    template <enum cudaTextureReadMode read_mode>
   __device__ pycuda::complex<float> fp_tex1Dfetch(texture<fp_tex_cfloat, 1, read_mode> tex, int i)
   {
@@ -244,6 +245,7 @@ extern "C++" {
   PYCUDA_GENERATE_FP_TEX_FUNCS(unsigned short int)
   PYCUDA_GENERATE_FP_TEX_FUNCS(char)
   PYCUDA_GENERATE_FP_TEX_FUNCS(unsigned char)
+#endif
 }
 
 #endif
-- 
GitLab