diff --git a/contrib/fortran-to-opencl/translate.py b/contrib/fortran-to-opencl/translate.py
index 66b1273c47067f94c2c1250d6cc93b7700678ff6..00397bc153de52c4cf3e772516417ca809a2a83f 100644
--- a/contrib/fortran-to-opencl/translate.py
+++ b/contrib/fortran-to-opencl/translate.py
@@ -309,6 +309,17 @@ class TypeInferenceMapper(CombineMapper):
             else:
                 raise RuntimeError("unexpected complex type")
 
+        elif name in ["imag", "real", "abs", "dble"]:
+            arg, = expr.parameters
+            base_dtype = self.rec(arg)
+
+            if base_dtype == np.complex128:
+                return np.dtype(np.float64)
+            elif base_dtype == np.complex64:
+                return np.dtype(np.float32)
+            else:
+                return base_dtype
+
         else:
             return CombineMapper.map_call(self, expr)
 
@@ -340,12 +351,23 @@ class ComplexCCodeMapper(CCodeMapperBase):
             complexes = [child for child in expr.children
                     if 'c' == self.infer_type(child).kind]
 
-            from pymbolic.mapper.stringifier import PREC_SUM
+            from pymbolic.mapper.stringifier import PREC_SUM, PREC_NONE
             real_sum = self.join_rec(" + ", reals, PREC_SUM)
-            complex_sum = self.join_rec(" + ", complexes, PREC_SUM)
+
+            if len(complexes) == 1:
+                myprec = PREC_SUM
+            else:
+                myprec = PREC_NONE
+
+            complex_sum = self.rec(complexes[0], myprec)
+            for child in complexes[1:]:
+                complex_sum = "%s_add(%s, %s)" % (
+                        tgt_name, complex_sum,
+                        self.rec(child, PREC_NONE))
 
             if real_sum:
-                result = "%s_fromreal(%s) + %s" % (tgt_name, real_sum, complex_sum)
+                result = "%s_add(%s_fromreal(%s), %s)" % (
+                        tgt_name, tgt_name, real_sum, complex_sum)
             else:
                 result = complex_sum
 
@@ -380,8 +402,7 @@ class ComplexCCodeMapper(CCodeMapperBase):
                         self.rec(child, PREC_NONE))
 
             if real_prd:
-                # elementwise semantics are correct
-                result = "%s * %s" % (real_prd, complex_prd)
+                result = "%s_rmul(%s, %s)" % (tgt_name, real_prd, complex_prd)
             else:
                 result = complex_prd
 
@@ -397,8 +418,10 @@ class ComplexCCodeMapper(CCodeMapperBase):
         if not (n_complex or d_complex):
             return CCodeMapperBase.map_quotient(self, expr, enclosing_prec)
         elif n_complex and not d_complex:
-            # elementwise semnatics are correct
-            return CCodeMapperBase.map_quotient(self, expr, enclosing_prec)
+            return "%s_divider(%s, %s)" % (
+                    self.complex_type_name(tgt_dtype),
+                    self.rec(expr.numerator, PREC_NONE),
+                    self.rec(expr.denominator, PREC_NONE))
         elif not n_complex and d_complex:
             return "%s_rdivide(%s, %s)" % (
                     self.complex_type_name(tgt_dtype),
@@ -478,21 +501,19 @@ class CCodeMapper(ComplexCCodeMapper):
         from pymbolic.mapper.stringifier import PREC_NONE
 
         tgt_dtype = self.infer_type(expr)
+        arg_dtypes = [self.infer_type(par) for par in expr.parameters]
 
         name = expr.function.name
         if 'f' == tgt_dtype.kind and name == "abs":
             name = "fabs"
 
-        if 'c' == tgt_dtype.kind:
+        elif 'c' == tgt_dtype.kind:
             if name in ["conjg", "dconjg"]:
                 name = "conj"
 
             if name[:2] == "cd" and name[2:] in ["log", "exp", "sqrt"]:
                 name = name[2:]
 
-            if name == "aimag":
-                name = "imag"
-
             if name == "dble":
                 name = "real"
 
@@ -500,6 +521,22 @@ class CCodeMapper(ComplexCCodeMapper):
                     self.complex_type_name(tgt_dtype),
                     name)
 
+        elif name in ["aimag", "real", "imag"] and tgt_dtype.kind == "f":
+            arg_dtype, = arg_dtypes
+
+            if name == "aimag":
+                name = "imag"
+
+            name = "%s_%s" % (
+                    self.complex_type_name(arg_dtype),
+                    name)
+
+        elif 'c' == tgt_dtype.kind and name == "abs":
+            arg_dtype, = arg_dtypes
+
+            name = "%s_abs" % (
+                    self.complex_type_name(arg_dtype))
+
         return self.format("%s(%s)",
                 name,
                 self.join_rec(", ", expr.parameters, PREC_NONE))
@@ -1370,6 +1407,13 @@ def f2cl_files(source_file, target_file, **kwargs):
 
 
 if __name__ == "__main__":
+    import logging
+    console = logging.StreamHandler()
+    console.setLevel(logging.DEBUG)
+    formatter = logging.Formatter('%(name)-12s: %(levelname)-8s %(message)s')
+    console.setFormatter(formatter)
+    logging.getLogger('fparser').addHandler(console)
+
     from cgen.opencl import CLConstant
 
     if 0:
diff --git a/pyopencl/array.py b/pyopencl/array.py
index 8d29b6c1d74db82d4230cb3f513a41e820bad7a8..bd2ee902ea179e4ec39db06884ac14ecd3f84d1f 100644
--- a/pyopencl/array.py
+++ b/pyopencl/array.py
@@ -60,7 +60,7 @@ except:
 
 # {{{ vector types
 
-class vec:
+class vec:  # noqa
     pass
 
 
@@ -271,7 +271,7 @@ class ArrayHasOffsetError(ValueError):
         ValueError.__init__(self, val)
 
 
-class _copy_queue:
+class _copy_queue:  # noqa
     pass
 
 
diff --git a/pyopencl/cl/pyopencl-bessel-j-complex.cl b/pyopencl/cl/pyopencl-bessel-j-complex.cl
index 404758718a6a2ccd4f5279bc73ca4f62764d88b0..0a5492aa6fc774d43c93ccf79f5597d6d79ddb7a 100644
--- a/pyopencl/cl/pyopencl-bessel-j-complex.cl
+++ b/pyopencl/cl/pyopencl-bessel-j-complex.cl
@@ -54,7 +54,8 @@ void bessel_j_complex(int v, cdouble_t z, cdouble_t *j_v, cdouble_t *j_vp1)
 
   cdouble_t z_half, mz_half2, mz_half_2k, z_half_v, z_half_vp1;
 
-  cdouble_t ima = {0.0e0, 1.0e0};
+  cdouble_t ima = cdouble_new(0, 1);
+  cdouble_t neg_ima = cdouble_new(0, -1);
 
   cdouble_t zinv, ztmp;
   cdouble_t j_nm1, j_n, j_np1;
@@ -87,7 +88,7 @@ void bessel_j_complex(int v, cdouble_t z, cdouble_t *j_v, cdouble_t *j_vp1)
   {
     z_half = cdouble_divider(z,2.0);
 
-    mz_half2 = (-1)*cdouble_mul(z_half,z_half);
+    mz_half2 = cdouble_neg(cdouble_mul(z_half, z_half));
 
     z_half_v = cdouble_powr(z_half, v);
     z_half_vp1 = cdouble_mul(z_half_v, z_half);
@@ -105,15 +106,18 @@ void bessel_j_complex(int v, cdouble_t z, cdouble_t *j_v, cdouble_t *j_vp1)
     k_factorial_inv = 1.0;
 
     // compute the power series of bessel j function
-    mz_half_2k = (cdouble_t)(1.0,0);
+    mz_half_2k = cdouble_new(1.0, 0);
 
-    *j_v = 0;
-    *j_vp1 = 0;
+    *j_v = cdouble_new(0, 0);
+    *j_vp1 = cdouble_new(0, 0);
 
     for ( k = 0; k < kmax; k++ )
     {
-      *j_v = *j_v + cdouble_mulr(mz_half_2k, kv_factorial_inv*k_factorial_inv);
-      *j_vp1 = *j_vp1 + cdouble_mulr(mz_half_2k, kvp1_factorial_inv*k_factorial_inv);
+      *j_v = cdouble_add(
+          *j_v,
+          cdouble_mulr(mz_half_2k, kv_factorial_inv*k_factorial_inv));
+      *j_vp1 = cdouble_add(*j_vp1,
+          cdouble_mulr(mz_half_2k, kvp1_factorial_inv*k_factorial_inv));
 
       mz_half_2k = cdouble_mul(mz_half_2k, mz_half2);
       k_factorial_inv /= (k+1);
@@ -131,8 +135,8 @@ void bessel_j_complex(int v, cdouble_t z, cdouble_t *j_v, cdouble_t *j_vp1)
 
   // {{{ use recurrence for large z
 
-  j_nm1 = 0;
-  j_n = 1;
+  j_nm1 = cdouble_new(0, 0);
+  j_n = cdouble_new(1, 0);
 
   n = v;
 
@@ -140,7 +144,9 @@ void bessel_j_complex(int v, cdouble_t z, cdouble_t *j_v, cdouble_t *j_vp1)
 
   while (true)
   {
-    j_np1 = cdouble_mul((2*n*zinv),j_n) - j_nm1;
+    j_np1 = cdouble_sub(
+        cdouble_mul(cdouble_rmul(2*n, zinv), j_n),
+        j_nm1);
 
     n += 1;
     j_nm1 = j_n;
@@ -148,8 +154,8 @@ void bessel_j_complex(int v, cdouble_t z, cdouble_t *j_v, cdouble_t *j_vp1)
 
     if (n > nmax)
     {
-      *j_v = nan(0x8e55e1u);
-      *j_vp1 = nan(0x8e55e1u);
+      *j_v = cdouble_new(nan(0x8e55e1u), 0);
+      *j_vp1 = cdouble_new(nan(0x8e55e1u), 0);
       return;
     }
 
@@ -162,20 +168,16 @@ void bessel_j_complex(int v, cdouble_t z, cdouble_t *j_v, cdouble_t *j_vp1)
   // Record the number of times of the missed rescalings
   // for j_v and j_vp1.
 
-
-  unscaled_j_np1 = 0;
-  unscaled_j_n = 1;
+  unscaled_j_np1 = cdouble_new(0, 0);
+  unscaled_j_n = cdouble_new(1, 0);
 
   // Use normalization condition http://dlmf.nist.gov/10.12#E5
-  psi = 0;
+  psi = cdouble_new(0, 0);
 
   if (cdouble_imag(z) <= 0)
-  {
     zmul = ima;
-  } else
-  {
-    zmul = (-1)*ima;
-  }
+  else
+    zmul = neg_ima;
 
   zsn = cdouble_powr(zmul, n%4);
 
@@ -186,13 +188,15 @@ void bessel_j_complex(int v, cdouble_t z, cdouble_t *j_v, cdouble_t *j_vp1)
 
   while (n > 0)
   {
-    ztmp = cdouble_mul(2*n*zinv, unscaled_j_n) - unscaled_j_np1;
+    ztmp = cdouble_sub(
+        cdouble_mul(cdouble_rmul(2*n, zinv), unscaled_j_n),
+        unscaled_j_np1);
     dd = pow(cdouble_real(ztmp), 2) + pow(cdouble_imag(ztmp), 2);
 
     unscaled_j_nm1 = ztmp;
 
 
-    psi = psi + cdouble_mul(unscaled_j_n, zsn);
+    psi = cdouble_add(psi, cdouble_mul(unscaled_j_n, zsn));
     zsn = cdouble_mul(zsn, zmulinv);
 
     n -= 1;
@@ -215,14 +219,14 @@ void bessel_j_complex(int v, cdouble_t z, cdouble_t *j_v, cdouble_t *j_vp1)
 
   }
 
-  psi = 2*psi + unscaled_j_n;
+  psi = cdouble_add(cdouble_rmul(2, psi), unscaled_j_n);
 
   if ( cdouble_imag(z) <= 0 )
   {
     scaling = cdouble_divide( cdouble_exp( cdouble_mul(ima,z) ), psi);
   } else
   {
-    scaling = cdouble_divide( cdouble_exp( cdouble_mul(-1*ima,z) ), psi);
+    scaling = cdouble_divide( cdouble_exp( cdouble_mul(neg_ima,z) ), psi);
   }
   vscaling = pow(upbound_inv, vscale);
   vp1scaling = pow(upbound_inv, vp1scale);
diff --git a/pyopencl/cl/pyopencl-complex.h b/pyopencl/cl/pyopencl-complex.h
index 244029862f9015f3fc3e76d67b0b3a9336bbde28..976bf81a62dcef2511cb20a4d01220542fca23be 100644
--- a/pyopencl/cl/pyopencl-complex.h
+++ b/pyopencl/cl/pyopencl-complex.h
@@ -32,55 +32,78 @@
 
 #define PYOPENCL_DECLARE_COMPLEX_TYPE_INT(REAL_TP, REAL_3LTR, TPROOT, TP) \
   \
-  REAL_TP TPROOT##_real(TP a) { return a.x; } \
-  REAL_TP TPROOT##_imag(TP a) { return a.y; } \
-  REAL_TP TPROOT##_abs(TP a) { return hypot(a.x, a.y); } \
+  REAL_TP TPROOT##_real(TP a) { return a.real; } \
+  REAL_TP TPROOT##_imag(TP a) { return a.imag; } \
+  REAL_TP TPROOT##_abs(TP a) { return hypot(a.real, a.imag); } \
   \
-  TP TPROOT##_fromreal(REAL_TP a) { return (TP)(a, 0); } \
-  TP TPROOT##_new(REAL_TP a, REAL_TP b) { return (TP)(a, b); } \
-  TP TPROOT##_conj(TP a) { return (TP)(a.x, -a.y); } \
+  TP TPROOT##_new(REAL_TP real, REAL_TP imag) \
+  { \
+    TP result; \
+    result.real = real; \
+    result.imag = imag; \
+    return result; \
+  } \
+  \
+  TP TPROOT##_fromreal(REAL_TP real) \
+  { \
+    TP result; \
+    result.real = real; \
+    result.imag = 0; \
+    return result; \
+  } \
+  \
+  \
+  TP TPROOT##_neg(TP a) { return TPROOT##_new(-a.real, -a.imag); } \
+  TP TPROOT##_conj(TP a) { return TPROOT##_new(a.real, -a.imag); } \
   \
   TP TPROOT##_add(TP a, TP b) \
   { \
-    return a+b; \
+    return TPROOT##_new(a.real + b.real, a.imag + b.imag); \
+    ; \
   } \
   TP TPROOT##_addr(TP a, REAL_TP b) \
   { \
-    return (TP)(b+a.x, a.y); \
+    return TPROOT##_new(b+a.real, a.imag); \
   } \
   TP TPROOT##_radd(REAL_TP a, TP b) \
   { \
-    return (TP)(a+b.x, b.y); \
+    return TPROOT##_new(a+b.real, b.imag); \
+  } \
+  \
+  TP TPROOT##_sub(TP a, TP b) \
+  { \
+    return TPROOT##_new(a.real - b.real, a.imag - b.imag); \
+    ; \
   } \
   \
   TP TPROOT##_mul(TP a, TP b) \
   { \
-    return (TP)( \
-        a.x*b.x - a.y*b.y, \
-        a.x*b.y + a.y*b.x); \
+    return TPROOT##_new( \
+        a.real*b.real - a.imag*b.imag, \
+        a.real*b.imag + a.imag*b.real); \
   } \
   \
   TP TPROOT##_mulr(TP a, REAL_TP b) \
   { \
-    return a*b; \
+    return TPROOT##_new(a.real*b, a.imag*b); \
   } \
   \
   TP TPROOT##_rmul(REAL_TP a, TP b) \
   { \
-    return a*b; \
+    return TPROOT##_new(a*b.real, a*b.imag); \
   } \
   \
   TP TPROOT##_rdivide(REAL_TP z1, TP z2) \
   { \
-    if (fabs(z2.x) <= fabs(z2.y)) { \
-      REAL_TP ratio = z2.x / z2.y; \
-      REAL_TP denom = z2.y * (1 + ratio * ratio); \
-      return (TP)((z1 * ratio) / denom, - z1 / denom); \
+    if (fabs(z2.real) <= fabs(z2.imag)) { \
+      REAL_TP ratio = z2.real / z2.imag; \
+      REAL_TP denom = z2.imag * (1 + ratio * ratio); \
+      return TPROOT##_new((z1 * ratio) / denom, - z1 / denom); \
     } \
     else { \
-      REAL_TP ratio = z2.y / z2.x; \
-      REAL_TP denom = z2.x * (1 + ratio * ratio); \
-      return (TP)(z1 / denom, - (z1 * ratio) / denom); \
+      REAL_TP ratio = z2.imag / z2.real; \
+      REAL_TP denom = z2.real * (1 + ratio * ratio); \
+      return TPROOT##_new(z1 / denom, - (z1 * ratio) / denom); \
     } \
   } \
   \
@@ -88,170 +111,174 @@
   { \
     REAL_TP ratio, denom, a, b, c, d; \
     \
-    if (fabs(z2.x) <= fabs(z2.y)) { \
-      ratio = z2.x / z2.y; \
-      denom = z2.y; \
-      a = z1.y; \
-      b = z1.x; \
-      c = -z1.x; \
-      d = z1.y; \
+    if (fabs(z2.real) <= fabs(z2.imag)) { \
+      ratio = z2.real / z2.imag; \
+      denom = z2.imag; \
+      a = z1.imag; \
+      b = z1.real; \
+      c = -z1.real; \
+      d = z1.imag; \
     } \
     else { \
-      ratio = z2.y / z2.x; \
-      denom = z2.x; \
-      a = z1.x; \
-      b = z1.y; \
-      c = z1.y; \
-      d = -z1.x; \
+      ratio = z2.imag / z2.real; \
+      denom = z2.real; \
+      a = z1.real; \
+      b = z1.imag; \
+      c = z1.imag; \
+      d = -z1.real; \
     } \
     denom *= (1 + ratio * ratio); \
-    return (TP)( \
+    return TPROOT##_new( \
        (a + b * ratio) / denom, \
        (c + d * ratio) / denom); \
   } \
   \
   TP TPROOT##_divider(TP a, REAL_TP b) \
   { \
-    return a/b; \
+    return TPROOT##_new(a.real/b, a.imag/b); \
   } \
   \
   TP TPROOT##_pow(TP a, TP b) \
   { \
-    REAL_TP logr = log(hypot(a.x, a.y)); \
-    REAL_TP logi = atan2(a.y, a.x); \
-    REAL_TP x = exp(logr * b.x - logi * b.y); \
-    REAL_TP y = logr * b.y + logi * b.x; \
+    REAL_TP logr = log(hypot(a.real, a.imag)); \
+    REAL_TP logi = atan2(a.imag, a.real); \
+    REAL_TP x = exp(logr * b.real - logi * b.imag); \
+    REAL_TP y = logr * b.imag + logi * b.real; \
     \
     REAL_TP cosy; \
     REAL_TP siny = sincos(y, &cosy); \
-    return (TP) (x*cosy, x*siny); \
+    return TPROOT##_new(x*cosy, x*siny); \
   } \
   \
   TP TPROOT##_powr(TP a, REAL_TP b) \
   { \
-    REAL_TP logr = log(hypot(a.x, a.y)); \
-    REAL_TP logi = atan2(a.y, a.x); \
+    REAL_TP logr = log(hypot(a.real, a.imag)); \
+    REAL_TP logi = atan2(a.imag, a.real); \
     REAL_TP x = exp(logr * b); \
     REAL_TP y = logi * b; \
     \
     REAL_TP cosy; \
     REAL_TP siny = sincos(y, &cosy); \
     \
-    return (TP)(x * cosy, x*siny); \
+    return TPROOT##_new(x * cosy, x*siny); \
   } \
   \
   TP TPROOT##_rpow(REAL_TP a, TP b) \
   { \
     REAL_TP logr = log(a); \
-    REAL_TP x = exp(logr * b.x); \
-    REAL_TP y = logr * b.y; \
+    REAL_TP x = exp(logr * b.real); \
+    REAL_TP y = logr * b.imag; \
     \
     REAL_TP cosy; \
     REAL_TP siny = sincos(y, &cosy); \
-    return (TP) (x * cosy, x * siny); \
+    return TPROOT##_new(x * cosy, x * siny); \
   } \
   \
   TP TPROOT##_sqrt(TP a) \
   { \
-    REAL_TP re = a.x; \
-    REAL_TP im = a.y; \
+    REAL_TP re = a.real; \
+    REAL_TP im = a.imag; \
     REAL_TP mag = hypot(re, im); \
     TP result; \
     \
     if (mag == 0.f) { \
-      result.x = result.y = 0.f; \
+      result.real = result.imag = 0.f; \
     } else if (re > 0.f) { \
-      result.x = sqrt(0.5f * (mag + re)); \
-      result.y = im/result.x/2.f; \
+      result.real = sqrt(0.5f * (mag + re)); \
+      result.imag = im/result.real/2.f; \
     } else { \
-      result.y = sqrt(0.5f * (mag - re)); \
+      result.imag = sqrt(0.5f * (mag - re)); \
       if (im < 0.f) \
-        result.y = - result.y; \
-      result.x = im/result.y/2.f; \
+        result.imag = - result.imag; \
+      result.real = im/result.imag/2.f; \
     } \
     return result; \
   } \
   \
   TP TPROOT##_exp(TP a) \
   { \
-    REAL_TP expr = exp(a.x); \
+    REAL_TP expr = exp(a.real); \
     REAL_TP cosi; \
-    REAL_TP sini = sincos(a.y, &cosi); \
-    return (TP)(expr * cosi, expr * sini); \
+    REAL_TP sini = sincos(a.imag, &cosi); \
+    return TPROOT##_new(expr * cosi, expr * sini); \
   } \
   \
   TP TPROOT##_log(TP a) \
-  { return (TP)(log(hypot(a.x, a.y)), atan2(a.y, a.x)); } \
+  { return TPROOT##_new(log(hypot(a.real, a.imag)), atan2(a.imag, a.real)); } \
   \
   TP TPROOT##_sin(TP a) \
   { \
     REAL_TP cosr; \
-    REAL_TP sinr = sincos(a.x, &cosr); \
-    return (TP)(sinr*cosh(a.y), cosr*sinh(a.y)); \
+    REAL_TP sinr = sincos(a.real, &cosr); \
+    return TPROOT##_new(sinr*cosh(a.imag), cosr*sinh(a.imag)); \
   } \
   \
   TP TPROOT##_cos(TP a) \
   { \
     REAL_TP cosr; \
-    REAL_TP sinr = sincos(a.x, &cosr); \
-    return (TP)(cosr*cosh(a.y), -sinr*sinh(a.y)); \
+    REAL_TP sinr = sincos(a.real, &cosr); \
+    return TPROOT##_new(cosr*cosh(a.imag), -sinr*sinh(a.imag)); \
   } \
   \
   TP TPROOT##_tan(TP a) \
   { \
-    REAL_TP re2 = 2.f * a.x; \
-    REAL_TP im2 = 2.f * a.y; \
+    REAL_TP re2 = 2.f * a.real; \
+    REAL_TP im2 = 2.f * a.imag; \
     \
     const REAL_TP limit = log(REAL_3LTR##_MAX); \
     \
     if (fabs(im2) > limit) \
-      return (TP)(0.f, (im2 > 0 ? 1.f : -1.f)); \
+      return TPROOT##_new(0.f, (im2 > 0 ? 1.f : -1.f)); \
     else \
     { \
       REAL_TP den = cos(re2) + cosh(im2); \
-      return (TP) (sin(re2) / den, sinh(im2) / den); \
+      return TPROOT##_new(sin(re2) / den, sinh(im2) / den); \
     } \
   } \
   \
   TP TPROOT##_sinh(TP a) \
   { \
     REAL_TP cosi; \
-    REAL_TP sini = sincos(a.y, &cosi); \
-    return (TP)(sinh(a.x)*cosi, cosh(a.x)*sini); \
+    REAL_TP sini = sincos(a.imag, &cosi); \
+    return TPROOT##_new(sinh(a.real)*cosi, cosh(a.real)*sini); \
   } \
   \
   TP TPROOT##_cosh(TP a) \
   { \
     REAL_TP cosi; \
-    REAL_TP sini = sincos(a.y, &cosi); \
-    return (TP)(cosh(a.x)*cosi, sinh(a.x)*sini); \
+    REAL_TP sini = sincos(a.imag, &cosi); \
+    return TPROOT##_new(cosh(a.real)*cosi, sinh(a.real)*sini); \
   } \
   \
   TP TPROOT##_tanh(TP a) \
   { \
-    REAL_TP re2 = 2.f * a.x; \
-    REAL_TP im2 = 2.f * a.y; \
+    REAL_TP re2 = 2.f * a.real; \
+    REAL_TP im2 = 2.f * a.imag; \
     \
     const REAL_TP limit = log(REAL_3LTR##_MAX); \
     \
     if (fabs(re2) > limit) \
-      return (TP)((re2 > 0 ? 1.f : -1.f), 0.f); \
+      return TPROOT##_new((re2 > 0 ? 1.f : -1.f), 0.f); \
     else \
     { \
       REAL_TP den = cosh(re2) + cos(im2); \
-      return (TP) (sinh(re2) / den, sin(im2) / den); \
+      return TPROOT##_new(sinh(re2) / den, sin(im2) / den); \
     } \
   } \
 
 #define PYOPENCL_DECLARE_COMPLEX_TYPE(BASE, BASE_3LTR) \
-  typedef BASE##2 c##BASE##_t; \
+  typedef union \
+  { \
+    struct { BASE x, y; }; \
+    struct { BASE real, imag; }; \
+  } c##BASE##_t; \
   \
   PYOPENCL_DECLARE_COMPLEX_TYPE_INT(BASE, BASE_3LTR, c##BASE, c##BASE##_t)
 
 PYOPENCL_DECLARE_COMPLEX_TYPE(float, FLT);
-#define cfloat_cast(a) ((cfloat_t) ((a).x, (a).y))
+#define cfloat_cast(a) cfloat_new((a).real, (a).imag)
 
 #ifdef PYOPENCL_DEFINE_CDOUBLE
 PYOPENCL_DECLARE_COMPLEX_TYPE(double, DBL);
-#define cdouble_cast(a) ((cdouble_t) ((a).x, (a).y))
+#define cdouble_cast(a) cdouble_new((a).real, (a).imag)
 #endif
diff --git a/pyopencl/cl/pyopencl-hankel-complex.cl b/pyopencl/cl/pyopencl-hankel-complex.cl
index d5a0c53d3c04fb6ceacb8ac5fca5205ea7c3e59b..0c5917a61cf9ce4bca4bdb2795e3e0d38d92ee5a 100644
--- a/pyopencl/cl/pyopencl-hankel-complex.cl
+++ b/pyopencl/cl/pyopencl-hankel-complex.cl
@@ -103,17 +103,17 @@ void hankel_01_complex(cdouble_t z, cdouble_t *h0, cdouble_t *h1, int ifexpon)
   ;
 
   zu = cdouble_conj(z);
-  zr = (- 1) * zu;
+  zr = cdouble_rmul(- 1, zu);
   hank103u(zu, & ier, & h0u, & h1u, ifexpon);
   hank103r(zr, & ier, & h0r, & h1r, ifexpon);
   if (ifexpon == 1)
     goto label_3000;
 
-  subt = cdouble_real(cdouble_abs(cdouble_imag(zu)));
-  cd = cdouble_exp(cdouble_fromreal((- 1) * subt) + cdouble_mul(ima, zu));
+  subt = fabs(cdouble_imag(zu));
+  cd = cdouble_exp(cdouble_add(cdouble_fromreal((- 1) * subt), cdouble_mul(ima, zu)));
   h0u = cdouble_mul(h0u, cd);
   h1u = cdouble_mul(h1u, cd);
-  cd = cdouble_exp(cdouble_fromreal((- 1) * subt) + cdouble_mul(ima, zr));
+  cd = cdouble_exp(cdouble_add(cdouble_fromreal((- 1) * subt), cdouble_mul(ima, zr)));
   h0r = cdouble_mul(h0r, cd);
   h1r = cdouble_mul(h1r, cd);
   label_3000:
@@ -121,27 +121,27 @@ void hankel_01_complex(cdouble_t z, cdouble_t *h0, cdouble_t *h1, int ifexpon)
 
   half_ = 1;
   half_ = half_ / 2;
-  y0 = cdouble_divide(half_ * (h0u + h0r), ima);
-  fj0 = ((- 1) * half_) * (h0u + ((- 1) * h0r));
-  y1 = cdouble_divide(((- 1) * half_) * (h1u + ((- 1) * h1r)), ima);
-  fj1 = half_ * (h1u + h1r);
-  z2 = (- 1) * cdouble_conj(z);
+  y0 = cdouble_divide(cdouble_rmul(half_, cdouble_add(h0u, h0r)), ima);
+  fj0 = cdouble_rmul((- 1) * half_, cdouble_add(h0u, cdouble_rmul(- 1, h0r)));
+  y1 = cdouble_divide(cdouble_rmul((- 1) * half_, cdouble_add(h1u, cdouble_rmul(- 1, h1r))), ima);
+  fj1 = cdouble_rmul(half_, cdouble_add(h1u, h1r));
+  z2 = cdouble_rmul(- 1, cdouble_conj(z));
   cclog = cdouble_log(z2);
-  ser2 = y0 + ((- 1) * cdouble_mul((2 * fj0) / pi, cclog));
-  ser3 = y1 + ((- 1) * cdouble_mul((2 * fj1) / pi, cclog));
+  ser2 = cdouble_add(y0, cdouble_rmul(- 1, cdouble_mul(cdouble_divider(cdouble_rmul(2, fj0), pi), cclog)));
+  ser3 = cdouble_add(y1, cdouble_rmul(- 1, cdouble_mul(cdouble_divider(cdouble_rmul(2, fj1), pi), cclog)));
   fj0 = cdouble_conj(fj0);
-  fj1 = (- 1) * cdouble_conj(fj1);
+  fj1 = cdouble_rmul(- 1, cdouble_conj(fj1));
   ser2 = cdouble_conj(ser2);
-  ser3 = (- 1) * cdouble_conj(ser3);
+  ser3 = cdouble_rmul(- 1, cdouble_conj(ser3));
   cclog = cdouble_log(z);
-  y0 = ser2 + cdouble_mul((2 * fj0) / pi, cclog);
-  y1 = ser3 + cdouble_mul((2 * fj1) / pi, cclog);
-  * h0 = fj0 + cdouble_mul(ima, y0);
-  * h1 = fj1 + cdouble_mul(ima, y1);
+  y0 = cdouble_add(ser2, cdouble_mul(cdouble_divider(cdouble_rmul(2, fj0), pi), cclog));
+  y1 = cdouble_add(ser3, cdouble_mul(cdouble_divider(cdouble_rmul(2, fj1), pi), cclog));
+  * h0 = cdouble_add(fj0, cdouble_mul(ima, y0));
+  * h1 = cdouble_add(fj1, cdouble_mul(ima, y1));
   if (ifexpon == 1)
     return;
 
-  cd = cdouble_exp(cdouble_fromreal(subt) + ((- 1) * cdouble_mul(ima, z)));
+  cd = cdouble_exp(cdouble_add(cdouble_fromreal(subt), cdouble_rmul(- 1, cdouble_mul(ima, z))));
   * h0 = cdouble_mul(* h0, cd);
   * h1 = cdouble_mul(* h1, cd);
 }
@@ -158,6 +158,7 @@ void hank103u(cdouble_t z, int *ier, cdouble_t *h0, cdouble_t *h1, int ifexpon)
 
   cdouble_t ccex;
   cdouble_t cd;
+  double com;
   double d;
   double done;
   cdouble_t ima = {0.0e0, 1.0e0};
@@ -167,6 +168,7 @@ void hank103u(cdouble_t z, int *ier, cdouble_t *h0, cdouble_t *h1, int ifexpon)
   double thresh3;
   cdouble_t zzz9;
   * ier = 0;
+  com = cdouble_real(z);
   if (cdouble_imag(z) >= 0)
     goto label_1200;
 
@@ -234,7 +236,7 @@ void hank103p(__constant cdouble_t *p, int m, cdouble_t z, cdouble_t *f)
   * f = p[m - 1];
   for (i = m + (- 1); i >= 1; i += - 1)
   {
-    * f = cdouble_mul(* f, z) + p[i - 1];
+    * f = cdouble_add(cdouble_mul(* f, z), p[i - 1]);
     label_1200:
     ;
 
@@ -275,10 +277,10 @@ void hank103a(cdouble_t z, cdouble_t *h0, cdouble_t *h1, int ifexpon)
   qq1 = cdouble_fromreal(hank103a_q1[m - 1]);
   for (i = m + (- 1); i >= 1; i += - 1)
   {
-    pp = cdouble_fromreal(hank103a_p[i - 1]) + cdouble_mul(pp, zinv22);
-    pp1 = cdouble_fromreal(hank103a_p1[i - 1]) + cdouble_mul(pp1, zinv22);
-    qq = cdouble_fromreal(hank103a_q[i - 1]) + cdouble_mul(qq, zinv22);
-    qq1 = cdouble_fromreal(hank103a_q1[i - 1]) + cdouble_mul(qq1, zinv22);
+    pp = cdouble_add(cdouble_fromreal(hank103a_p[i - 1]), cdouble_mul(pp, zinv22));
+    pp1 = cdouble_add(cdouble_fromreal(hank103a_p1[i - 1]), cdouble_mul(pp1, zinv22));
+    qq = cdouble_add(cdouble_fromreal(hank103a_q[i - 1]), cdouble_mul(qq, zinv22));
+    qq1 = cdouble_add(cdouble_fromreal(hank103a_q1[i - 1]), cdouble_mul(qq1, zinv22));
     label_1600:
     ;
 
@@ -290,11 +292,11 @@ void hank103a(cdouble_t z, cdouble_t *h0, cdouble_t *h1, int ifexpon)
   if (ifexpon == 1)
     cccexp = cdouble_exp(cdouble_mul(ima, z));
 
-  cdd = cdouble_sqrt((2 / pi) * zinv);
-  * h0 = pp + cdouble_mul(ima, qq);
+  cdd = cdouble_sqrt(cdouble_rmul(2 / pi, zinv));
+  * h0 = cdouble_add(pp, cdouble_mul(ima, qq));
   * h0 = cdouble_mul(cdouble_mul(cdouble_mul(cdd, cdumb), cccexp), * h0);
-  * h1 = pp1 + cdouble_mul(ima, qq1);
-  * h1 = (- 1) * cdouble_mul(cdouble_mul(cdouble_mul(cdouble_mul(cdd, cccexp), cdumb), * h1), ima);
+  * h1 = cdouble_add(pp1, cdouble_mul(ima, qq1));
+  * h1 = cdouble_rmul(- 1, cdouble_mul(cdouble_mul(cdouble_mul(cdouble_mul(cdd, cccexp), cdumb), * h1), ima));
 }
 
 __constant double hank103l_cj0[] = {0.1000000000000000e+01, (- 1) * 0.2500000000000000e+00, 0.1562500000000000e-01, (- 1) * 0.4340277777777778e-03, 0.6781684027777778e-05, (- 1) * 0.6781684027777778e-07, 0.4709502797067901e-09, (- 1) * 0.2402807549524439e-11, 0.9385966990329841e-14, (- 1) * 0.2896903392077112e-16, 0.7242258480192779e-19, (- 1) * 0.1496334396734045e-21, 0.2597802772107717e-24, (- 1) * 0.3842903509035085e-27, 0.4901662639075363e-30, (- 1) * 0.5446291821194848e-33};
@@ -329,29 +331,29 @@ void hank103l(cdouble_t z, cdouble_t *h0, cdouble_t *h1, int ifexpon)
   cd = cdouble_fromreal(1);
   for (i = 1; i <= m; i += 1)
   {
-    fj0 = fj0 + (hank103l_cj0[i - 1] * cd);
-    fj1 = fj1 + (hank103l_cj1[i - 1] * cd);
-    y1 = y1 + (hank103l_ser2der[i - 1] * cd);
+    fj0 = cdouble_add(fj0, cdouble_rmul(hank103l_cj0[i - 1], cd));
+    fj1 = cdouble_add(fj1, cdouble_rmul(hank103l_cj1[i - 1], cd));
+    y1 = cdouble_add(y1, cdouble_rmul(hank103l_ser2der[i - 1], cd));
     cd = cdouble_mul(cd, z2);
-    y0 = y0 + (hank103l_ser2[i - 1] * cd);
+    y0 = cdouble_add(y0, cdouble_rmul(hank103l_ser2[i - 1], cd));
     label_1800:
     ;
 
   }
 
-  fj1 = (- 1) * cdouble_mul(fj1, z);
-  cdddlog = cdouble_fromreal(gamma) + cdouble_log(z / two);
-  y0 = cdouble_mul(cdddlog, fj0) + y0;
-  y0 = (two / pi) * y0;
+  fj1 = cdouble_rmul(- 1, cdouble_mul(fj1, z));
+  cdddlog = cdouble_add(cdouble_fromreal(gamma), cdouble_log(cdouble_divider(z, two)));
+  y0 = cdouble_add(cdouble_mul(cdddlog, fj0), y0);
+  y0 = cdouble_rmul(two / pi, y0);
   y1 = cdouble_mul(y1, z);
-  y1 = (((- 1) * cdouble_mul(cdddlog, fj1)) + cdouble_divide(fj0, z)) + y1;
-  y1 = (((- 1) * two) * y1) / pi;
-  * h0 = fj0 + cdouble_mul(ima, y0);
-  * h1 = fj1 + cdouble_mul(ima, y1);
+  y1 = cdouble_add(cdouble_add(cdouble_rmul(- 1, cdouble_mul(cdddlog, fj1)), cdouble_divide(fj0, z)), y1);
+  y1 = cdouble_divider(cdouble_rmul((- 1) * two, y1), pi);
+  * h0 = cdouble_add(fj0, cdouble_mul(ima, y0));
+  * h1 = cdouble_add(fj1, cdouble_mul(ima, y1));
   if (ifexpon == 1)
     return;
 
-  cd = cdouble_exp((- 1) * cdouble_mul(ima, z));
+  cd = cdouble_exp(cdouble_rmul(- 1, cdouble_mul(ima, z)));
   * h0 = cdouble_mul(* h0, cd);
   * h1 = cdouble_mul(* h1, cd);
 }
@@ -439,3 +441,4 @@ void hank103r(cdouble_t z, int *ier, cdouble_t *h0, cdouble_t *h1, int ifexpon)
 
   hank103a(z, & (* h0), & (* h1), ifexpon);
 }
+
diff --git a/pyopencl/elementwise.py b/pyopencl/elementwise.py
index 285c3e93fab1177363349915f43d3a2bb58a8578..47fe3d9585b1aebf08cde69624066e5088a081ae 100644
--- a/pyopencl/elementwise.py
+++ b/pyopencl/elementwise.py
@@ -525,7 +525,6 @@ def get_axpbyz_kernel(context, dtype_x, dtype_y, dtype_z):
 
     x_is_complex = dtype_x.kind == "c"
     y_is_complex = dtype_y.kind == "c"
-    z_is_complex = dtype_z.kind == "c"
 
     if x_is_complex:
         ax = "%s_mul(a, x[i])" % complex_dtype_to_name(dtype_x)
@@ -539,9 +538,15 @@ def get_axpbyz_kernel(context, dtype_x, dtype_y, dtype_z):
     if not x_is_complex and y_is_complex:
         ax = "%s_fromreal(%s)" % (complex_dtype_to_name(dtype_y), ax)
 
-    result = "%s + %s" % (ax, by)
-    if z_is_complex:
-        result = "%s_cast(%s)" % (complex_dtype_to_name(dtype_z), result)
+    if x_is_complex or y_is_complex:
+        result = (
+                "{root}_add({root}_cast({ax}), {root}_cast({by}))"
+                .format(
+                    ax=ax,
+                    by=by,
+                    root=complex_dtype_to_name(dtype_z)))
+    else:
+        result = "%s + %s" % (ax, by)
 
     return get_elwise_kernel(context,
             "%(tp_z)s *z, %(tp_x)s a, %(tp_x)s *x, %(tp_y)s b, %(tp_y)s *y" % {
@@ -562,24 +567,27 @@ def get_axpbz_kernel(context, dtype_a, dtype_x, dtype_b, dtype_z):
     z_is_complex = dtype_z.kind == "c"
 
     ax = "a*x[i]"
-    if a_is_complex and x_is_complex:
+    if x_is_complex:
         a = "a"
         x = "x[i]"
 
-        if dtype_a != dtype_z:
-            a = "%s_cast(%s)" % (complex_dtype_to_name(dtype_z), a)
         if dtype_x != dtype_z:
             x = "%s_cast(%s)" % (complex_dtype_to_name(dtype_z), x)
 
-        ax = "%s_mul(%s, %s)" % (complex_dtype_to_name(dtype_z), a, x)
+        if a_is_complex:
+            if dtype_a != dtype_z:
+                a = "%s_cast(%s)" % (complex_dtype_to_name(dtype_z), a)
 
-    # The following two are workarounds for Apple on OS X 10.8.
-    # They're not really necessary.
+            ax = "%s_mul(%s, %s)" % (complex_dtype_to_name(dtype_z), a, x)
+        else:
+            ax = "%s_rmul(%s, %s)" % (complex_dtype_to_name(dtype_z), a, x)
+    elif a_is_complex:
+        a = "a"
+        x = "x[i]"
 
-    elif a_is_complex and not x_is_complex:
-        ax = "a*((%s) x[i])" % dtype_to_ctype(real_dtype(dtype_a))
-    elif not a_is_complex and x_is_complex:
-        ax = "((%s) a)*x[i]" % dtype_to_ctype(real_dtype(dtype_x))
+        if dtype_a != dtype_z:
+            a = "%s_cast(%s)" % (complex_dtype_to_name(dtype_z), a)
+        ax = "%s_mulr(%s, %s)" % (complex_dtype_to_name(dtype_z), a, x)
 
     b = "b"
     if z_is_complex and not b_is_complex:
@@ -592,6 +600,14 @@ def get_axpbz_kernel(context, dtype_a, dtype_x, dtype_b, dtype_z):
         ax = "%s_cast(%s)" % (complex_dtype_to_name(dtype_z), ax)
         b = "%s_cast(%s)" % (complex_dtype_to_name(dtype_z), b)
 
+    if a_is_complex or x_is_complex or b_is_complex:
+        expr = "{root}_add({ax}, {b})".format(
+                ax=ax,
+                b=b,
+                root=complex_dtype_to_name(dtype_z))
+    else:
+        expr = "%s + %s" % (ax, b)
+
     return get_elwise_kernel(context,
             "%(tp_z)s *z, %(tp_a)s a, %(tp_x)s *x,%(tp_b)s b" % {
                 "tp_a": dtype_to_ctype(dtype_a),
@@ -599,7 +615,7 @@ def get_axpbz_kernel(context, dtype_a, dtype_x, dtype_b, dtype_z):
                 "tp_b": dtype_to_ctype(dtype_b),
                 "tp_z": dtype_to_ctype(dtype_z),
                 },
-            "z[i] = %s + %s" % (ax, b),
+            "z[i] = " + expr,
             name="axpb")
 
 
@@ -607,7 +623,6 @@ def get_axpbz_kernel(context, dtype_a, dtype_x, dtype_b, dtype_z):
 def get_multiply_kernel(context, dtype_x, dtype_y, dtype_z):
     x_is_complex = dtype_x.kind == "c"
     y_is_complex = dtype_y.kind == "c"
-    z_is_complex = dtype_z.kind == "c"
 
     x = "x[i]"
     y = "y[i]"
@@ -619,13 +634,13 @@ def get_multiply_kernel(context, dtype_x, dtype_y, dtype_z):
 
     if x_is_complex and y_is_complex:
         xy = "%s_mul(%s, %s)" % (complex_dtype_to_name(dtype_z), x, y)
-
+    elif x_is_complex and not y_is_complex:
+        xy = "%s_mulr(%s, %s)" % (complex_dtype_to_name(dtype_z), x, y)
+    elif not x_is_complex and y_is_complex:
+        xy = "%s_rmul(%s, %s)" % (complex_dtype_to_name(dtype_z), x, y)
     else:
         xy = "%s * %s" % (x, y)
 
-    if z_is_complex:
-        xy = "%s_cast(%s)" % (complex_dtype_to_name(dtype_z), xy)
-
     return get_elwise_kernel(context,
             "%(tp_z)s *z, %(tp_x)s *x, %(tp_y)s *y" % {
                 "tp_x": dtype_to_ctype(dtype_x),
@@ -654,8 +669,9 @@ def get_divide_kernel(context, dtype_x, dtype_y, dtype_z):
     if x_is_complex and y_is_complex:
         xoy = "%s_divide(%s, %s)" % (complex_dtype_to_name(dtype_z), x, y)
     elif not x_is_complex and y_is_complex:
-
         xoy = "%s_rdivide(%s, %s)" % (complex_dtype_to_name(dtype_z), x, y)
+    elif x_is_complex and not y_is_complex:
+        xoy = "%s_divider(%s, %s)" % (complex_dtype_to_name(dtype_z), x, y)
     else:
         xoy = "%s / %s" % (x, y)
 
@@ -691,7 +707,9 @@ def get_rdivide_elwise_kernel(context, dtype_x, dtype_y, dtype_z):
     if x_is_complex and y_is_complex:
         yox = "%s_divide(%s, %s)" % (complex_dtype_to_name(dtype_z), y, x)
     elif not y_is_complex and x_is_complex:
-        yox = "%s_rdivide(%s, %s)" % (complex_dtype_to_name(dtype_x), y, x)
+        yox = "%s_rdivide(%s, %s)" % (complex_dtype_to_name(dtype_z), y, x)
+    elif y_is_complex and not x_is_complex:
+        yox = "%s_divider(%s, %s)" % (complex_dtype_to_name(dtype_z), y, x)
     else:
         yox = "%s / %s" % (y, x)
 
@@ -728,16 +746,18 @@ def get_reverse_kernel(context, dtype):
 @context_dependent_memoize
 def get_arange_kernel(context, dtype):
     if dtype.kind == "c":
-        i = "%s_fromreal(i)" % complex_dtype_to_name(dtype)
+        expr = (
+                "{root}_add(start, {root}_rmul(i, step))"
+                .format(root=complex_dtype_to_name(dtype)))
     else:
-        i = "(%s) i" % dtype_to_ctype(dtype)
+        expr = "start + ((%s) i)*step" % dtype_to_ctype(dtype)
 
     return get_elwise_kernel(context, [
         VectorArg(dtype, "z", with_offset=True),
         ScalarArg(dtype, "start"),
         ScalarArg(dtype, "step"),
         ],
-        "z[i] = start + %s*step" % i,
+        "z[i] = " + expr,
         name="arange")
 
 
diff --git a/pyopencl/reduction.py b/pyopencl/reduction.py
index ba7c6dc77f69337c8e535284a782e31eb9b613a4..c1a02573e85f750cd20f52217261784c26e99e34 100644
--- a/pyopencl/reduction.py
+++ b/pyopencl/reduction.py
@@ -63,7 +63,8 @@ KERNEL = """//CL//
       __global out_type *out__base, long out__offset, ${arguments},
       unsigned int seq_count, unsigned int n)
     {
-        __global out_type *out = (__global out_type *) ((__global char *) out__base + out__offset);
+        __global out_type *out = (__global out_type *) (
+            (__global char *) out__base + out__offset);
         ${arg_prep}
 
         __local out_type ldata[GROUP_SIZE];
@@ -325,8 +326,8 @@ class ReductionKernel:
                 "that have at least one vector argument"
 
     def __call__(self, *args, **kwargs):
-        MAX_GROUP_COUNT = 1024
-        SMALL_SEQ_COUNT = 4
+        MAX_GROUP_COUNT = 1024  # noqa
+        SMALL_SEQ_COUNT = 4  # noqa
 
         from pyopencl.array import empty
 
@@ -392,7 +393,8 @@ class ReductionKernel:
                     use_queue,
                     (group_count*stage_inf.group_size,),
                     (stage_inf.group_size,),
-                    *([result.base_data, result.offset] + invocation_args + [seq_count, sz]),
+                    *([result.base_data, result.offset]
+                        + invocation_args + [seq_count, sz]),
                     **dict(wait_for=wait_for))
             wait_for = [last_evt]
 
@@ -473,7 +475,15 @@ def get_sum_kernel(ctx, dtype_out, dtype_in):
     if dtype_out is None:
         dtype_out = dtype_in
 
-    return ReductionKernel(ctx, dtype_out, "0", "a+b",
+    reduce_expr = "a+b"
+    neutral_expr = "0"
+    if dtype_out.kind == "c":
+        from pyopencl.elementwise import complex_dtype_to_name
+        dtname = complex_dtype_to_name(dtype_out)
+        reduce_expr = "%s_add(a, b)" % dtname
+        neutral_expr = "%s_new(0, 0)" % dtname
+
+    return ReductionKernel(ctx, dtype_out, neutral_expr, reduce_expr,
             arguments="const %(tp)s *in"
             % {"tp": dtype_to_ctype(dtype_in)})
 
@@ -492,49 +502,30 @@ def _get_dot_expr(dtype_out, dtype_a, dtype_b, conjugate_first,
                 dtype_a.type(0), dtype_b.type(0),
                 has_double_support)
 
-    a_real_dtype = dtype_a.type(0).real.dtype
-    b_real_dtype = dtype_b.type(0).real.dtype
-    out_real_dtype = dtype_out.type(0).real.dtype
-
     a_is_complex = dtype_a.kind == "c"
     b_is_complex = dtype_b.kind == "c"
-    out_is_complex = dtype_out.kind == "c"
 
     from pyopencl.elementwise import complex_dtype_to_name
 
-    if a_is_complex and b_is_complex:
-        a = "a[%s]" % index_expr
-        b = "b[%s]" % index_expr
-        if dtype_a != dtype_out:
-            a = "%s_cast(%s)" % (complex_dtype_to_name(dtype_out), a)
-        if dtype_b != dtype_out:
-            b = "%s_cast(%s)" % (complex_dtype_to_name(dtype_out), b)
-
-        if conjugate_first and a_is_complex:
-            a = "%s_conj(%s)" % (
-                    complex_dtype_to_name(dtype_out), a)
-
-        map_expr = "%s_mul(%s, %s)" % (
-                complex_dtype_to_name(dtype_out), a, b)
-    else:
-        a = "a[%s]" % index_expr
-        b = "b[%s]" % index_expr
-
-        if out_is_complex:
-            if a_is_complex and dtype_a != dtype_out:
-                a = "%s_cast(%s)" % (complex_dtype_to_name(dtype_out), a)
-            if b_is_complex and dtype_b != dtype_out:
-                b = "%s_cast(%s)" % (complex_dtype_to_name(dtype_out), b)
+    a = "a[%s]" % index_expr
+    b = "b[%s]" % index_expr
 
-            if not a_is_complex and a_real_dtype != out_real_dtype:
-                a = "(%s) (%s)" % (dtype_to_ctype(out_real_dtype), a)
-            if not b_is_complex and b_real_dtype != out_real_dtype:
-                b = "(%s) (%s)" % (dtype_to_ctype(out_real_dtype), b)
+    if a_is_complex and (dtype_a != dtype_out):
+        a = "%s_cast(%s)" % (complex_dtype_to_name(dtype_out), a)
+    if b_is_complex and (dtype_b != dtype_out):
+        b = "%s_cast(%s)" % (complex_dtype_to_name(dtype_out), b)
 
-        if conjugate_first and a_is_complex:
-            a = "%s_conj(%s)" % (
-                    complex_dtype_to_name(dtype_out), a)
+    if a_is_complex and conjugate_first and a_is_complex:
+        a = "%s_conj(%s)" % (
+                complex_dtype_to_name(dtype_out), a)
 
+    if a_is_complex and not b_is_complex:
+        map_expr = "%s_mulr(%s, %s)" % (complex_dtype_to_name(dtype_out), a, b)
+    elif not a_is_complex and b_is_complex:
+        map_expr = "%s_rmul(%s, %s)" % (complex_dtype_to_name(dtype_out), a, b)
+    elif a_is_complex and b_is_complex:
+        map_expr = "%s_mul(%s, %s)" % (complex_dtype_to_name(dtype_out), a, b)
+    else:
         map_expr = "%s*%s" % (a, b)
 
     return map_expr, dtype_out, dtype_b
@@ -548,14 +539,22 @@ def get_dot_kernel(ctx, dtype_out, dtype_a=None, dtype_b=None,
             dtype_out, dtype_a, dtype_b, conjugate_first,
             has_double_support=has_double_support(ctx.devices[0]))
 
-    return ReductionKernel(ctx, dtype_out, neutral="0",
-            reduce_expr="a+b", map_expr=map_expr,
-            arguments=
-            "const %(tp_a)s *a, "
-            "const %(tp_b)s *b" % {
-                "tp_a": dtype_to_ctype(dtype_a),
-                "tp_b": dtype_to_ctype(dtype_b),
-                })
+    reduce_expr = "a+b"
+    neutral_expr = "0"
+    if dtype_out.kind == "c":
+        from pyopencl.elementwise import complex_dtype_to_name
+        dtname = complex_dtype_to_name(dtype_out)
+        reduce_expr = "%s_add(a, b)" % dtname
+        neutral_expr = "%s_new(0, 0)" % dtname
+
+    return ReductionKernel(ctx, dtype_out, neutral=neutral_expr,
+            reduce_expr=reduce_expr, map_expr=map_expr,
+            arguments=(
+                "const %(tp_a)s *a, "
+                "const %(tp_b)s *b" % {
+                    "tp_a": dtype_to_ctype(dtype_a),
+                    "tp_b": dtype_to_ctype(dtype_b),
+                    }))
 
 
 @context_dependent_memoize
@@ -570,14 +569,14 @@ def get_subset_dot_kernel(ctx, dtype_out, dtype_subset, dtype_a=None, dtype_b=No
     # important: lookup_tbl must be first--it controls the length
     return ReductionKernel(ctx, dtype_out, neutral="0",
             reduce_expr="a+b", map_expr=map_expr,
-            arguments=
-            "const %(tp_lut)s *lookup_tbl, "
-            "const %(tp_a)s *a, "
-            "const %(tp_b)s *b" % {
-            "tp_lut": dtype_to_ctype(dtype_subset),
-            "tp_a": dtype_to_ctype(dtype_a),
-            "tp_b": dtype_to_ctype(dtype_b),
-            })
+            arguments=(
+                "const %(tp_lut)s *lookup_tbl, "
+                "const %(tp_a)s *a, "
+                "const %(tp_b)s *b" % {
+                    "tp_lut": dtype_to_ctype(dtype_subset),
+                    "tp_a": dtype_to_ctype(dtype_a),
+                    "tp_b": dtype_to_ctype(dtype_b),
+                    }))
 
 
 def get_minmax_neutral(what, dtype):
@@ -628,12 +627,13 @@ def get_subset_minmax_kernel(ctx, what, dtype, dtype_subset):
             neutral=get_minmax_neutral(what, dtype),
             reduce_expr="%(reduce_expr)s" % {"reduce_expr": reduce_expr},
             map_expr="in[lookup_tbl[i]]",
-            arguments=
-            "const %(tp_lut)s *lookup_tbl, "
-            "const %(tp)s *in" % {
-            "tp": dtype_to_ctype(dtype),
-            "tp_lut": dtype_to_ctype(dtype_subset),
-            }, preamble="#define MY_INFINITY (1./0)")
+            arguments=(
+                "const %(tp_lut)s *lookup_tbl, "
+                "const %(tp)s *in" % {
+                    "tp": dtype_to_ctype(dtype),
+                    "tp_lut": dtype_to_ctype(dtype_subset),
+                    }),
+            preamble="#define MY_INFINITY (1./0)")
 
 # }}}