diff --git a/pyopencl/reduction.py b/pyopencl/reduction.py
index 25acbc9f9b4b55985621cd3925bd8729c2a6e88e..9e1d04c81b66e7a7d86169c266fad1235b08a82c 100644
--- a/pyopencl/reduction.py
+++ b/pyopencl/reduction.py
@@ -393,35 +393,54 @@ def get_sum_kernel(ctx, dtype_out, dtype_in):
 
 @context_dependent_memoize
 def get_dot_kernel(ctx, dtype_out, dtype_a=None, dtype_b=None):
-    if dtype_out is None:
-        dtype_out = dtype_a
-
     if dtype_b is None:
         if dtype_a is None:
             dtype_b = dtype_out
         else:
             dtype_b = dtype_a
 
-    if dtype_a is None:
-        dtype_a = dtype_out
+    if dtype_out is None:
+        from pyopencl.compyte.array import get_common_dtype
+        from pyopencl.characterize import has_double_support
+        dtype_out = get_common_dtype(
+                dtype_a.type(0), dtype_b.type(0), has_double_support(ctx.devices[0]))
+
+    a_real_dtype = dtype_a.type(0).real.dtype
+    b_real_dtype = dtype_b.type(0).real.dtype
+    out_real_dtype = dtype_out.type(0).real.dtype
 
     a_is_complex = dtype_a.kind == "c"
     b_is_complex = dtype_b.kind == "c"
     out_is_complex = dtype_out.kind == "c"
 
-    if out_is_complex:
+    from pyopencl.elementwise import complex_dtype_to_name
+
+    if a_is_complex and b_is_complex:
         a = "a[i]"
         b = "b[i]"
-        from pyopencl.elementwise import complex_dtype_to_name
-        if a_is_complex and dtype_a != dtype_out:
+        if dtype_a != dtype_out:
             a = "%s_cast(%s)" % (complex_dtype_to_name(dtype_out), a)
-        if b_is_complex and dtype_b != dtype_out:
+        if dtype_b != dtype_out:
             b = "%s_cast(%s)" % (complex_dtype_to_name(dtype_out), b)
 
         map_expr = "%s_mul(%s, %s)" % (
                 complex_dtype_to_name(dtype_out), a, b)
     else:
-        map_expr = "a[i]*b[i]"
+        a = "a[i]"
+        b = "b[i]"
+
+        if out_is_complex:
+            if a_is_complex and dtype_a != dtype_out:
+                a = "%s_cast(%s)" % (complex_dtype_to_name(dtype_out), a)
+            if b_is_complex and dtype_b != dtype_out:
+                b = "%s_cast(%s)" % (complex_dtype_to_name(dtype_out), b)
+
+            if not a_is_complex and a_real_dtype != out_real_dtype:
+                a = "(%s) (%s)" % (dtype_to_ctype(out_real_dtype), a)
+            if not b_is_complex and b_real_dtype != out_real_dtype:
+                b = "(%s) (%s)" % (dtype_to_ctype(out_real_dtype), b)
+
+        map_expr = "%s*%s" % (a, b)
 
     return ReductionKernel(ctx, dtype_out, neutral="0",
             reduce_expr="a+b", map_expr=map_expr,
diff --git a/test/test_array.py b/test/test_array.py
index 187e0ea39aed6fd80c1b26c0ab57486f635aa544..cbf5c8727dbbc2354eb814a84b98dbd496309968 100644
--- a/test/test_array.py
+++ b/test/test_array.py
@@ -607,17 +607,23 @@ def test_dot(ctx_factory):
     context = ctx_factory()
     queue = cl.CommandQueue(context)
 
-    for dtype in [np.float32, np.complex64]:
-        a_gpu = general_clrand(queue, (200000,), dtype)
-        a = a_gpu.get()
-        b_gpu = general_clrand(queue, (200000,), dtype)
-        b = b_gpu.get()
+    dtypes = [np.float32, np.complex64]
+    if has_double_support(context.devices[0]):
+        dtypes.extend([np.float64, np.complex128])
+
+    for a_dtype in dtypes:
+        for b_dtype in dtypes:
+            print a_dtype, b_dtype
+            a_gpu = general_clrand(queue, (200000,), a_dtype)
+            a = a_gpu.get()
+            b_gpu = general_clrand(queue, (200000,), b_dtype)
+            b = b_gpu.get()
 
-        dot_ab = np.dot(a, b)
+            dot_ab = np.dot(a, b)
 
-        dot_ab_gpu = cl_array.dot(a_gpu, b_gpu).get()
+            dot_ab_gpu = cl_array.dot(a_gpu, b_gpu).get()
 
-        assert abs(dot_ab_gpu - dot_ab) / abs(dot_ab) < 1e-4
+            assert abs(dot_ab_gpu - dot_ab) / abs(dot_ab) < 1e-4
 
 
 if False: