diff --git a/pyopencl/array.py b/pyopencl/array.py
index b06dd70679651fe14fcc1c195c154cee67e3cc3c..bf58c965c69d69820a405de7a90eb7cd1e74bc87 100644
--- a/pyopencl/array.py
+++ b/pyopencl/array.py
@@ -82,6 +82,10 @@ except Exception:
         return False
 
 
+class InconsistentOpenCLQueueWarning(UserWarning):
+    """Issued when an implicit queue differs from the queue of an argument."""
+
+
 class VecLookupWarner:
     def __getattr__(self, name):
         from warnings import warn
@@ -144,7 +148,11 @@ def elwise_kernel_runner(kernel_getter):
 
     def kernel_runner(*args, **kwargs):
         repr_ary = args[0]
-        queue = kwargs.pop("queue", None) or repr_ary.queue
+        queue = kwargs.pop("queue", None)
+        implicit_queue = queue is None
+        if implicit_queue:
+            queue = repr_ary.queue
+
         wait_for = kwargs.pop("wait_for", None)
 
         # wait_for must be a copy, because we modify it in-place below
@@ -171,6 +179,16 @@ def elwise_kernel_runner(kernel_getter):
                 actual_args.append(arg.base_data)
                 actual_args.append(arg.offset)
                 wait_for.extend(arg.events)
+
+                if (implicit_queue
+                        and arg.queue is not None
+                        and arg.queue != queue):
+                    from warnings import warn
+                    # 'queue' was not passed in; it fell back to args[0].queue.
+                    warn("Implicit queue in elementwise operation does not match "
+                            "queue of a provided argument. This will become an "
+                            "error in 2021.",
+                            category=InconsistentOpenCLQueueWarning)
             else:
                 actual_args.append(arg)
         actual_args.append(repr_ary.size)
@@ -1008,7 +1026,7 @@ class Array:
             result.add_event(
                     self._axpbyz(result,
                         self.dtype.type(1), self,
-                        other.dtype.type(-1), other))
+                        result.dtype.type(-1), other))
 
             return result
         else:
@@ -1031,7 +1049,7 @@ class Array:
         # other must be a scalar
         result = self._new_like_me(common_dtype)
         result.add_event(
-                self._axpbz(result, self.dtype.type(-1), self,
+                self._axpbz(result, result.dtype.type(-1), self,
                     common_dtype.type(other)))
         return result
 
diff --git a/pyopencl/elementwise.py b/pyopencl/elementwise.py
index 357aa2bbf17477713905d040376ec199a518f877..df364eda3c883d378c1e9d25136d8f59f5763f9d 100644
--- a/pyopencl/elementwise.py
+++ b/pyopencl/elementwise.py
@@ -503,36 +503,36 @@ def real_dtype(dtype):
 
 @context_dependent_memoize
 def get_axpbyz_kernel(context, dtype_x, dtype_y, dtype_z):
-    ax = "a*x[i]"
-    by = "b*y[i]"
+    result_t = dtype_to_ctype(dtype_z)
 
     x_is_complex = dtype_x.kind == "c"
     y_is_complex = dtype_y.kind == "c"
 
-    if x_is_complex:
-        ax = "%s_mul(a, x[i])" % complex_dtype_to_name(dtype_x)
-
-    if y_is_complex:
-        by = "%s_mul(b, y[i])" % complex_dtype_to_name(dtype_y)
+    if dtype_z.kind == "c":
+        # a and b will always be complex here.
+        z_ct = complex_dtype_to_name(dtype_z)
 
-    if x_is_complex and not y_is_complex:
-        by = "{}_fromreal({})".format(complex_dtype_to_name(dtype_x), by)
+        if x_is_complex:
+            ax = f"{z_ct}_mul(a, {z_ct}_cast(x[i]))"
+        else:
+            ax = f"{z_ct}_mulr(a, x[i])"
 
-    if not x_is_complex and y_is_complex:
-        ax = "{}_fromreal({})".format(complex_dtype_to_name(dtype_y), ax)
+        if y_is_complex:
+            by = f"{z_ct}_mul(b, {z_ct}_cast(y[i]))"
+        else:
+            by = f"{z_ct}_mulr(b, y[i])"
 
-    if x_is_complex or y_is_complex:
-        result = (
-                "{root}_add({root}_cast({ax}), {root}_cast({by}))"
-                .format(
-                    ax=ax,
-                    by=by,
-                    root=complex_dtype_to_name(dtype_z)))
+        result = f"{z_ct}_add({ax}, {by})"
     else:
+        # real-only
+
+        ax = f"a*(({result_t}) x[i])"
+        by = f"b*(({result_t}) y[i])"
+
         result = f"{ax} + {by}"
 
     return get_elwise_kernel(context,
-            "{tp_z} *z, {tp_x} a, {tp_x} *x, {tp_y} b, {tp_y} *y".format(
+            "{tp_z} *z, {tp_z} a, {tp_x} *x, {tp_z} b, {tp_y} *y".format(
                 tp_x=dtype_to_ctype(dtype_x),
                 tp_y=dtype_to_ctype(dtype_y),
                 tp_z=dtype_to_ctype(dtype_z),
diff --git a/pytest.ini b/pytest.ini
new file mode 100644
index 0000000000000000000000000000000000000000..f2a2f6894081711b89214e24c18a5104f99db607
--- /dev/null
+++ b/pytest.ini
@@ -0,0 +1,3 @@
+[pytest]
+markers=
+        bitonic: tests involving bitonic sort
diff --git a/test/test_array.py b/test/test_array.py
index 39f8fd74e572c49e19d1614d1c352fb625f5553b..d17772375a64f9d236568bef72005637e95d181a 100644
--- a/test/test_array.py
+++ b/test/test_array.py
@@ -426,12 +426,20 @@ def test_addition_scalar(ctx_factory):
     assert (7 + a == a_added).all()
 
 
-def test_substract_array(ctx_factory):
+@pytest.mark.parametrize(("dtype_a", "dtype_b"),
+        [
+            (np.float32, np.float32),
+            (np.float32, np.int32),
+            (np.int32, np.int32),
+            (np.int64, np.int32),
+            (np.int64, np.uint32),
+            ])
+def test_subtract_array(ctx_factory, dtype_a, dtype_b):
     """Test the substraction of two arrays."""
     #test data
-    a = np.array([1, 2, 3, 4, 5, 6, 7, 8, 9, 10]).astype(np.float32)
+    a = np.array([1, 2, 3, 4, 5, 6, 7, 8, 9, 10]).astype(dtype_a)
     b = np.array([10, 20, 30, 40, 50,
-                  60, 70, 80, 90, 100]).astype(np.float32)
+                  60, 70, 80, 90, 100]).astype(dtype_b)
 
     context = ctx_factory()
     queue = cl.CommandQueue(context)