Skip to content
Commits on Source (6)
Subproject commit 3702fb119804dddde5eaf4a254822b891a947104
Subproject commit 381e9410e2b380ec44920ca10b4252506ab507d0
......@@ -38,6 +38,7 @@ def _add_cuda_libdir_to_dll_path():
nvcc_path = _search_on_path(["nvcc.exe"])
if nvcc_path is not None:
os.add_dll_directory(dirname(nvcc_path))
return
from warnings import warn
......
......@@ -27,6 +27,7 @@ OTHER DEALINGS IN THE SOFTWARE.
from pycuda.tools import context_dependent_memoize
from typing import Any
import numpy as np
from pycuda.tools import dtype_to_ctype, VectorArg, ScalarArg
from pytools import memoize_method
......@@ -462,6 +463,11 @@ def get_linear_combination_kernel(summand_descriptors, dtype_z):
return func, tex_src
def _get_real_dtype(dtype: "np.dtype[Any]") -> "np.dtype[Any]":
assert dtype.kind == "c"
return np.empty(0, dtype).real.dtype
@context_dependent_memoize
def get_axpbyz_kernel(dtype_x, dtype_y, dtype_z,
x_is_scalar=False, y_is_scalar=False):
......@@ -472,10 +478,29 @@ def get_axpbyz_kernel(dtype_x, dtype_y, dtype_z,
:arg y_is_scalar: A :class:`bool` which is *True* only if `y` is a scalar :class:`gpuarray`.
"""
out_t = dtype_to_ctype(dtype_z)
# {{{ cast real scalars in context of complex scalar arithmetic
if dtype_z.kind == "c" and dtype_x.kind != "c":
dtype_z_real = _get_real_dtype(dtype_z)
x_t = dtype_to_ctype(dtype_z_real)
else:
x_t = out_t
if dtype_z.kind == "c" and dtype_y.kind != "c":
dtype_z_real = _get_real_dtype(dtype_z)
y_t = dtype_to_ctype(dtype_z_real)
else:
y_t = out_t
# }}}
x = "x[0]" if x_is_scalar else "x[i]"
ax = f"a*(({out_t}) {x})"
a = f"({x_t}) a"
ax = f"{a}*(({x_t}) {x})"
y = "y[0]" if y_is_scalar else "y[i]"
by = f"b*(({out_t}) {y})"
b = f"({y_t}) b"
by = f"({b})*(({y_t}) {y})"
result = f"{ax} + {by}"
return get_elwise_kernel(
"%(tp_x)s a, %(tp_x)s *x, %(tp_y)s b, %(tp_y)s *y, %(tp_z)s *z"
......@@ -508,8 +533,29 @@ def get_binary_op_kernel(dtype_x, dtype_y, dtype_z, operator,
:arg x_is_scalar: A :class:`bool` which is *True* only if `x` is a scalar :class:`gpuarray`.
:arg y_is_scalar: A :class:`bool` which is *True* only if `y` is a scalar :class:`gpuarray`.
"""
out_t = dtype_to_ctype(dtype_z)
# {{{ cast real scalars in context of complex scalar arithmetic
if dtype_z.kind == "c" and dtype_x.kind != "c":
dtype_z_real = _get_real_dtype(dtype_z)
x_t = dtype_to_ctype(dtype_z_real)
else:
x_t = out_t
if dtype_z.kind == "c" and dtype_y.kind != "c":
dtype_z_real = _get_real_dtype(dtype_z)
y_t = dtype_to_ctype(dtype_z_real)
else:
y_t = out_t
# }}}
x = "x[0]" if x_is_scalar else "x[i]"
x = f"({x_t}) {x}"
y = "y[0]" if y_is_scalar else "y[i]"
y = f"({y_t}) {y}"
result = f"{x} {operator} {y}"
return get_elwise_kernel(
"%(tp_x)s *x, %(tp_y)s *y, %(tp_z)s *z"
......@@ -518,7 +564,7 @@ def get_binary_op_kernel(dtype_x, dtype_y, dtype_z, operator,
"tp_y": dtype_to_ctype(dtype_y),
"tp_z": dtype_to_ctype(dtype_z),
},
f"z[i] = {result}",
f"z[i] = ({out_t}) {result}",
"multiply",
)
......
......@@ -11,6 +11,7 @@ import pycuda.gpuarray as gpuarray
import pycuda.driver as drv
from pycuda.compiler import SourceModule
import pytest
import operator
@pytest.fixture(autouse=True)
......@@ -18,6 +19,23 @@ def init_cuda_context():
yield from init_cuda_context_fixture()
def get_random_array(rng, shape, dtype):
    """Draw a random test array of the given *shape* and *dtype* from *rng*.

    Floats are uniform in ``[0, 1)``, signed ints in ``[-42, 42)``,
    unsigned ints in ``[0, 42)``; complex values combine two draws of
    the matching real dtype.
    """
    dtype = np.dtype(dtype)
    kind = dtype.kind
    if kind == "f":
        return rng.random(shape, dtype)
    if kind in "il":
        return rng.integers(-42, 42, shape, dtype)
    if kind in "u":
        return rng.integers(0, 42, shape, dtype)
    if kind == "c":
        # Derive the component dtype, then draw imaginary part first and
        # real part second (same rng call order as before).
        real_dtype = np.empty(0, dtype).real.dtype
        imag_part = dtype.type(1j) * get_random_array(rng, shape, real_dtype)
        return imag_part + get_random_array(rng, shape, real_dtype)
    raise NotImplementedError(f"dtype = {dtype}")
@pytest.mark.cuda
class TestGPUArray:
def test_pow_array(self):
......@@ -1434,6 +1452,33 @@ class TestGPUArray:
np.testing.assert_allclose(cumath.log10(x_cu).get(), np.log10(x_np),
rtol=rtol)
@pytest.mark.parametrize("ldtype", [np.int32, np.int64,
                                    np.float32, np.float64,
                                    np.complex64, np.complex128])
@pytest.mark.parametrize("rdtype", [np.int32, np.int64,
                                    np.float32, np.float64,
                                    np.complex64, np.complex128])
@pytest.mark.parametrize("op", [operator.add, operator.sub, operator.mul,
                                operator.truediv])
def test_binary_ops_with_unequal_dtypes(self, ldtype, rdtype, op):
    """Binary ops on gpuarrays of mixed dtypes must match numpy's
    result dtype (type promotion), shape, and values.

    See https://github.com/inducer/pycuda/issues/372
    """
    if op == operator.truediv and {ldtype, rdtype} <= {np.int32, np.int64}:
        # Fixed: the adjacent string literals previously concatenated to
        # "...merge_requests/66is merged." (missing space before "is").
        pytest.xfail("Enable after"
                     " gitlab.tiker.net/inducer/pycuda/-/merge_requests/66"
                     " is merged.")

    rng = np.random.default_rng(0)
    lop_np = get_random_array(rng, (10, 4), ldtype)
    rop_np = get_random_array(rng, (10, 4), rdtype)

    expected_result = op(lop_np, rop_np)
    result = op(gpuarray.to_gpu(lop_np), gpuarray.to_gpu(rop_np)).get()

    # dtype equality checks that the GPU kernel follows numpy promotion.
    assert result.dtype == expected_result.dtype
    assert result.shape == expected_result.shape
    # Loose rtol accommodates single-precision (float32/complex64) paths.
    np.testing.assert_allclose(expected_result, result,
                               rtol=5e-5)
if __name__ == "__main__":
# make sure that import failures get reported, instead of skipping the tests.
......