From 1dcaf262964fda21e362a10d064663df169d50a2 Mon Sep 17 00:00:00 2001
From: Kaushik Kulkarni
Date: Fri, 26 Aug 2022 16:30:21 -0500
Subject: [PATCH 1/2] handle real casting logic for complex arith ops

---
 pycuda/elementwise.py | 52 ++++++++++++++++++++++++++++++++++++++++---
 1 file changed, 49 insertions(+), 3 deletions(-)

diff --git a/pycuda/elementwise.py b/pycuda/elementwise.py
index 7d633011..5486d097 100644
--- a/pycuda/elementwise.py
+++ b/pycuda/elementwise.py
@@ -27,6 +27,7 @@ OTHER DEALINGS IN THE SOFTWARE.
 
 from pycuda.tools import context_dependent_memoize
+from typing import Any
 import numpy as np
 from pycuda.tools import dtype_to_ctype, VectorArg, ScalarArg
 from pytools import memoize_method
 
@@ -462,6 +463,11 @@ def get_linear_combination_kernel(summand_descriptors, dtype_z):
     return func, tex_src
 
 
+def _get_real_dtype(dtype: np.dtype[Any]) -> np.dtype[Any]:
+    assert dtype.kind == "c"
+    return np.empty(0, dtype).real.dtype
+
+
 @context_dependent_memoize
 def get_axpbyz_kernel(dtype_x, dtype_y, dtype_z,
                       x_is_scalar=False, y_is_scalar=False):
@@ -472,10 +478,29 @@ def get_axpbyz_kernel(dtype_x, dtype_y, dtype_z,
     :arg y_is_scalar: A :class:`bool` which is *True* only if `y` is a scalar :class:`gpuarray`.
     """
     out_t = dtype_to_ctype(dtype_z)
+
+    # {{{ cast real scalars in context of complex scalar arithmetic
+
+    if dtype_z.kind == "c" and dtype_x.kind != "c":
+        dtype_z_real = _get_real_dtype(dtype_z)
+        x_t = dtype_to_ctype(dtype_z_real)
+    else:
+        x_t = out_t
+
+    if dtype_z.kind == "c" and dtype_y.kind != "c":
+        dtype_z_real = _get_real_dtype(dtype_z)
+        y_t = dtype_to_ctype(dtype_z_real)
+    else:
+        y_t = out_t
+
+    # }}}
+
     x = "x[0]" if x_is_scalar else "x[i]"
-    ax = f"a*(({out_t}) {x})"
+    a = f"({x_t}) a"
+    ax = f"{a}*(({x_t}) {x})"
     y = "y[0]" if y_is_scalar else "y[i]"
-    by = f"b*(({out_t}) {y})"
+    b = f"({y_t}) b"
+    by = f"({b})*(({y_t}) {y})"
     result = f"{ax} + {by}"
     return get_elwise_kernel(
             "%(tp_x)s a, %(tp_x)s *x, %(tp_y)s b, %(tp_y)s *y, %(tp_z)s *z"
@@ -508,8 +533,29 @@ def get_binary_op_kernel(dtype_x, dtype_y, dtype_z, operator,
     :arg x_is_scalar: A :class:`bool` which is *True* only if `x` is a scalar :class:`gpuarray`.
     :arg y_is_scalar: A :class:`bool` which is *True* only if `y` is a scalar :class:`gpuarray`.
     """
+
+    out_t = dtype_to_ctype(dtype_z)
+
+    # {{{ cast real scalars in context of complex scalar arithmetic
+
+    if dtype_z.kind == "c" and dtype_x.kind != "c":
+        dtype_z_real = _get_real_dtype(dtype_z)
+        x_t = dtype_to_ctype(dtype_z_real)
+    else:
+        x_t = out_t
+
+    if dtype_z.kind == "c" and dtype_y.kind != "c":
+        dtype_z_real = _get_real_dtype(dtype_z)
+        y_t = dtype_to_ctype(dtype_z_real)
+    else:
+        y_t = out_t
+
+    # }}}
+
     x = "x[0]" if x_is_scalar else "x[i]"
+    x = f"({x_t}) {x}"
     y = "y[0]" if y_is_scalar else "y[i]"
+    y = f"({y_t}) {y}"
     result = f"{x} {operator} {y}"
     return get_elwise_kernel(
             "%(tp_x)s *x, %(tp_y)s *y, %(tp_z)s *z"
@@ -518,7 +564,7 @@
             "tp_x": dtype_to_ctype(dtype_x),
             "tp_y": dtype_to_ctype(dtype_y),
             "tp_z": dtype_to_ctype(dtype_z),
         },
-        f"z[i] = {result}",
+        f"z[i] = ({out_t}) {result}",
         "multiply",
    )
--
GitLab
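
Note on the patch above (illustration only, not part of the series): instead of
casting a real operand to the complex output type `out_t`, the patch routes it
through the complex type's real dtype. The helper `_get_real_dtype` derives that
dtype from numpy itself; the following snippet, runnable without a GPU, shows
what it returns:

import numpy as np

def _get_real_dtype(dtype):
    # mirrors the patched helper: dtype of the real component of a complex dtype
    assert dtype.kind == "c"
    return np.empty(0, dtype).real.dtype

assert _get_real_dtype(np.dtype(np.complex64)) == np.float32    # ctype "float"
assert _get_real_dtype(np.dtype(np.complex128)) == np.float64   # ctype "double"

So for dtype_z = complex64 and a real-typed x, the generated kernel casts x to
"float" (the real dtype) rather than to the complex out_t.
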
""" + + out_t = dtype_to_ctype(dtype_z) + + # {{{ cast real scalars in context of complex scalar arithmetic + + if dtype_z.kind == "c" and dtype_x.kind != "c": + dtype_z_real = _get_real_dtype(dtype_z) + x_t = dtype_to_ctype(dtype_z_real) + else: + x_t = out_t + + if dtype_z.kind == "c" and dtype_y.kind != "c": + dtype_z_real = _get_real_dtype(dtype_z) + y_t = dtype_to_ctype(dtype_z_real) + else: + y_t = out_t + + # }}} + x = "x[0]" if x_is_scalar else "x[i]" + x = f"({x_t}) {x}" y = "y[0]" if y_is_scalar else "y[i]" + y = f"({y_t}) {y}" result = f"{x} {operator} {y}" return get_elwise_kernel( "%(tp_x)s *x, %(tp_y)s *y, %(tp_z)s *z" @@ -518,7 +564,7 @@ def get_binary_op_kernel(dtype_x, dtype_y, dtype_z, operator, "tp_y": dtype_to_ctype(dtype_y), "tp_z": dtype_to_ctype(dtype_z), }, - f"z[i] = {result}", + f"z[i] = ({out_t}) {result}", "multiply", ) -- GitLab From f9c821bef0c8f6ea2dd5945600b8f485463dace3 Mon Sep 17 00:00:00 2001 From: Kaushik Kulkarni Date: Fri, 26 Aug 2022 17:17:41 -0500 Subject: [PATCH 2/2] test unequal dtype arith ops (fixes gh-372) --- test/test_gpuarray.py | 45 +++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 45 insertions(+) diff --git a/test/test_gpuarray.py b/test/test_gpuarray.py index 7091fee9..6f83f585 100644 --- a/test/test_gpuarray.py +++ b/test/test_gpuarray.py @@ -11,6 +11,7 @@ import pycuda.gpuarray as gpuarray import pycuda.driver as drv from pycuda.compiler import SourceModule import pytest +import operator @pytest.fixture(autouse=True) @@ -18,6 +19,23 @@ def init_cuda_context(): yield from init_cuda_context_fixture() +def get_random_array(rng, shape, dtype): + dtype = np.dtype(dtype) + + if dtype.kind == "f": + return rng.random(shape, dtype) + elif dtype.kind in "il": + return rng.integers(-42, 42, shape, dtype) + elif dtype.kind in "u": + return rng.integers(0, 42, shape, dtype) + elif dtype.kind == "c": + real_dtype = np.empty(0, dtype).real.dtype + return (dtype.type(1j) * get_random_array(rng, shape, real_dtype) + + get_random_array(rng, shape, real_dtype)) + else: + raise NotImplementedError(f"dtype = {dtype}") + + @pytest.mark.cuda class TestGPUArray: def test_pow_array(self): @@ -1434,6 +1452,33 @@ class TestGPUArray: np.testing.assert_allclose(cumath.log10(x_cu).get(), np.log10(x_np), rtol=rtol) + @pytest.mark.parametrize("ldtype", [np.int32, np.int64, + np.float32, np.float64, + np.complex64, np.complex128]) + @pytest.mark.parametrize("rdtype", [np.int32, np.int64, + np.float32, np.float64, + np.complex64, np.complex128]) + @pytest.mark.parametrize("op", [operator.add, operator.sub, operator.mul, + operator.truediv]) + def test_binary_ops_with_unequal_dtypes(self, ldtype, rdtype, op): + # See https://github.com/inducer/pycuda/issues/372 + if op == operator.truediv and {ldtype, rdtype} <= {np.int32, np.int64}: + pytest.xfail("Enable after" + " gitlab.tiker.net/inducer/pycuda/-/merge_requests/66" + "is merged.") + + rng = np.random.default_rng(0) + lop_np = get_random_array(rng, (10, 4), ldtype) + rop_np = get_random_array(rng, (10, 4), rdtype) + + expected_result = op(lop_np, rop_np) + result = op(gpuarray.to_gpu(lop_np), gpuarray.to_gpu(rop_np)).get() + + assert result.dtype == expected_result.dtype + assert result.shape == expected_result.shape + np.testing.assert_allclose(expected_result, result, + rtol=5e-5) + if __name__ == "__main__": # make sure that import failures get reported, instead of skipping the tests. -- GitLab