diff --git a/contrib/fortran-to-opencl/translate.py b/contrib/fortran-to-opencl/translate.py index 66b1273c47067f94c2c1250d6cc93b7700678ff6..00397bc153de52c4cf3e772516417ca809a2a83f 100644 --- a/contrib/fortran-to-opencl/translate.py +++ b/contrib/fortran-to-opencl/translate.py @@ -309,6 +309,17 @@ class TypeInferenceMapper(CombineMapper): else: raise RuntimeError("unexpected complex type") + elif name in ["imag", "real", "abs", "dble"]: + arg, = expr.parameters + base_dtype = self.rec(arg) + + if base_dtype == np.complex128: + return np.dtype(np.float64) + elif base_dtype == np.complex64: + return np.dtype(np.float32) + else: + return base_dtype + else: return CombineMapper.map_call(self, expr) @@ -340,12 +351,23 @@ class ComplexCCodeMapper(CCodeMapperBase): complexes = [child for child in expr.children if 'c' == self.infer_type(child).kind] - from pymbolic.mapper.stringifier import PREC_SUM + from pymbolic.mapper.stringifier import PREC_SUM, PREC_NONE real_sum = self.join_rec(" + ", reals, PREC_SUM) - complex_sum = self.join_rec(" + ", complexes, PREC_SUM) + + if len(complexes) == 1: + myprec = PREC_SUM + else: + myprec = PREC_NONE + + complex_sum = self.rec(complexes[0], myprec) + for child in complexes[1:]: + complex_sum = "%s_add(%s, %s)" % ( + tgt_name, complex_sum, + self.rec(child, PREC_NONE)) if real_sum: - result = "%s_fromreal(%s) + %s" % (tgt_name, real_sum, complex_sum) + result = "%s_add(%s_fromreal(%s), %s)" % ( + tgt_name, tgt_name, real_sum, complex_sum) else: result = complex_sum @@ -380,8 +402,7 @@ class ComplexCCodeMapper(CCodeMapperBase): self.rec(child, PREC_NONE)) if real_prd: - # elementwise semantics are correct - result = "%s * %s" % (real_prd, complex_prd) + result = "%s_rmul(%s, %s)" % (tgt_name, real_prd, complex_prd) else: result = complex_prd @@ -397,8 +418,10 @@ class ComplexCCodeMapper(CCodeMapperBase): if not (n_complex or d_complex): return CCodeMapperBase.map_quotient(self, expr, enclosing_prec) elif n_complex and not d_complex: - # elementwise semnatics are correct - return CCodeMapperBase.map_quotient(self, expr, enclosing_prec) + return "%s_divider(%s, %s)" % ( + self.complex_type_name(tgt_dtype), + self.rec(expr.numerator, PREC_NONE), + self.rec(expr.denominator, PREC_NONE)) elif not n_complex and d_complex: return "%s_rdivide(%s, %s)" % ( self.complex_type_name(tgt_dtype), @@ -478,21 +501,19 @@ class CCodeMapper(ComplexCCodeMapper): from pymbolic.mapper.stringifier import PREC_NONE tgt_dtype = self.infer_type(expr) + arg_dtypes = [self.infer_type(par) for par in expr.parameters] name = expr.function.name if 'f' == tgt_dtype.kind and name == "abs": name = "fabs" - if 'c' == tgt_dtype.kind: + elif 'c' == tgt_dtype.kind: if name in ["conjg", "dconjg"]: name = "conj" if name[:2] == "cd" and name[2:] in ["log", "exp", "sqrt"]: name = name[2:] - if name == "aimag": - name = "imag" - if name == "dble": name = "real" @@ -500,6 +521,22 @@ class CCodeMapper(ComplexCCodeMapper): self.complex_type_name(tgt_dtype), name) + elif name in ["aimag", "real", "imag"] and tgt_dtype.kind == "f": + arg_dtype, = arg_dtypes + + if name == "aimag": + name = "imag" + + name = "%s_%s" % ( + self.complex_type_name(arg_dtype), + name) + + elif 'c' == tgt_dtype.kind and name == "abs": + arg_dtype, = arg_dtypes + + name = "%s_abs" % ( + self.complex_type_name(arg_dtype)) + return self.format("%s(%s)", name, self.join_rec(", ", expr.parameters, PREC_NONE)) @@ -1370,6 +1407,13 @@ def f2cl_files(source_file, target_file, **kwargs): if __name__ == "__main__": + import logging + console = logging.StreamHandler() + console.setLevel(logging.DEBUG) + formatter = logging.Formatter('%(name)-12s: %(levelname)-8s %(message)s') + console.setFormatter(formatter) + logging.getLogger('fparser').addHandler(console) + from cgen.opencl import CLConstant if 0: diff --git a/pyopencl/array.py b/pyopencl/array.py index 8d29b6c1d74db82d4230cb3f513a41e820bad7a8..bd2ee902ea179e4ec39db06884ac14ecd3f84d1f 100644 --- a/pyopencl/array.py +++ b/pyopencl/array.py @@ -60,7 +60,7 @@ except: # {{{ vector types -class vec: +class vec: # noqa pass @@ -271,7 +271,7 @@ class ArrayHasOffsetError(ValueError): ValueError.__init__(self, val) -class _copy_queue: +class _copy_queue: # noqa pass diff --git a/pyopencl/cl/pyopencl-bessel-j-complex.cl b/pyopencl/cl/pyopencl-bessel-j-complex.cl index 404758718a6a2ccd4f5279bc73ca4f62764d88b0..0a5492aa6fc774d43c93ccf79f5597d6d79ddb7a 100644 --- a/pyopencl/cl/pyopencl-bessel-j-complex.cl +++ b/pyopencl/cl/pyopencl-bessel-j-complex.cl @@ -54,7 +54,8 @@ void bessel_j_complex(int v, cdouble_t z, cdouble_t *j_v, cdouble_t *j_vp1) cdouble_t z_half, mz_half2, mz_half_2k, z_half_v, z_half_vp1; - cdouble_t ima = {0.0e0, 1.0e0}; + cdouble_t ima = cdouble_new(0, 1); + cdouble_t neg_ima = cdouble_new(0, -1); cdouble_t zinv, ztmp; cdouble_t j_nm1, j_n, j_np1; @@ -87,7 +88,7 @@ void bessel_j_complex(int v, cdouble_t z, cdouble_t *j_v, cdouble_t *j_vp1) { z_half = cdouble_divider(z,2.0); - mz_half2 = (-1)*cdouble_mul(z_half,z_half); + mz_half2 = cdouble_neg(cdouble_mul(z_half, z_half)); z_half_v = cdouble_powr(z_half, v); z_half_vp1 = cdouble_mul(z_half_v, z_half); @@ -105,15 +106,18 @@ void bessel_j_complex(int v, cdouble_t z, cdouble_t *j_v, cdouble_t *j_vp1) k_factorial_inv = 1.0; // compute the power series of bessel j function - mz_half_2k = (cdouble_t)(1.0,0); + mz_half_2k = cdouble_new(1.0, 0); - *j_v = 0; - *j_vp1 = 0; + *j_v = cdouble_new(0, 0); + *j_vp1 = cdouble_new(0, 0); for ( k = 0; k < kmax; k++ ) { - *j_v = *j_v + cdouble_mulr(mz_half_2k, kv_factorial_inv*k_factorial_inv); - *j_vp1 = *j_vp1 + cdouble_mulr(mz_half_2k, kvp1_factorial_inv*k_factorial_inv); + *j_v = cdouble_add( + *j_v, + cdouble_mulr(mz_half_2k, kv_factorial_inv*k_factorial_inv)); + *j_vp1 = cdouble_add(*j_vp1, + cdouble_mulr(mz_half_2k, kvp1_factorial_inv*k_factorial_inv)); mz_half_2k = cdouble_mul(mz_half_2k, mz_half2); k_factorial_inv /= (k+1); @@ -131,8 +135,8 @@ void bessel_j_complex(int v, cdouble_t z, cdouble_t *j_v, cdouble_t *j_vp1) // {{{ use recurrence for large z - j_nm1 = 0; - j_n = 1; + j_nm1 = cdouble_new(0, 0); + j_n = cdouble_new(1, 0); n = v; @@ -140,7 +144,9 @@ void bessel_j_complex(int v, cdouble_t z, cdouble_t *j_v, cdouble_t *j_vp1) while (true) { - j_np1 = cdouble_mul((2*n*zinv),j_n) - j_nm1; + j_np1 = cdouble_sub( + cdouble_mul(cdouble_rmul(2*n, zinv), j_n), + j_nm1); n += 1; j_nm1 = j_n; @@ -148,8 +154,8 @@ void bessel_j_complex(int v, cdouble_t z, cdouble_t *j_v, cdouble_t *j_vp1) if (n > nmax) { - *j_v = nan(0x8e55e1u); - *j_vp1 = nan(0x8e55e1u); + *j_v = cdouble_new(nan(0x8e55e1u), 0); + *j_vp1 = cdouble_new(nan(0x8e55e1u), 0); return; } @@ -162,20 +168,16 @@ void bessel_j_complex(int v, cdouble_t z, cdouble_t *j_v, cdouble_t *j_vp1) // Record the number of times of the missed rescalings // for j_v and j_vp1. - - unscaled_j_np1 = 0; - unscaled_j_n = 1; + unscaled_j_np1 = cdouble_new(0, 0); + unscaled_j_n = cdouble_new(1, 0); // Use normalization condition http://dlmf.nist.gov/10.12#E5 - psi = 0; + psi = cdouble_new(0, 0); if (cdouble_imag(z) <= 0) - { zmul = ima; - } else - { - zmul = (-1)*ima; - } + else + zmul = neg_ima; zsn = cdouble_powr(zmul, n%4); @@ -186,13 +188,15 @@ void bessel_j_complex(int v, cdouble_t z, cdouble_t *j_v, cdouble_t *j_vp1) while (n > 0) { - ztmp = cdouble_mul(2*n*zinv, unscaled_j_n) - unscaled_j_np1; + ztmp = cdouble_sub( + cdouble_mul(cdouble_rmul(2*n, zinv), unscaled_j_n), + unscaled_j_np1); dd = pow(cdouble_real(ztmp), 2) + pow(cdouble_imag(ztmp), 2); unscaled_j_nm1 = ztmp; - psi = psi + cdouble_mul(unscaled_j_n, zsn); + psi = cdouble_add(psi, cdouble_mul(unscaled_j_n, zsn)); zsn = cdouble_mul(zsn, zmulinv); n -= 1; @@ -215,14 +219,14 @@ void bessel_j_complex(int v, cdouble_t z, cdouble_t *j_v, cdouble_t *j_vp1) } - psi = 2*psi + unscaled_j_n; + psi = cdouble_add(cdouble_rmul(2, psi), unscaled_j_n); if ( cdouble_imag(z) <= 0 ) { scaling = cdouble_divide( cdouble_exp( cdouble_mul(ima,z) ), psi); } else { - scaling = cdouble_divide( cdouble_exp( cdouble_mul(-1*ima,z) ), psi); + scaling = cdouble_divide( cdouble_exp( cdouble_mul(neg_ima,z) ), psi); } vscaling = pow(upbound_inv, vscale); vp1scaling = pow(upbound_inv, vp1scale); diff --git a/pyopencl/cl/pyopencl-complex.h b/pyopencl/cl/pyopencl-complex.h index 244029862f9015f3fc3e76d67b0b3a9336bbde28..976bf81a62dcef2511cb20a4d01220542fca23be 100644 --- a/pyopencl/cl/pyopencl-complex.h +++ b/pyopencl/cl/pyopencl-complex.h @@ -32,55 +32,78 @@ #define PYOPENCL_DECLARE_COMPLEX_TYPE_INT(REAL_TP, REAL_3LTR, TPROOT, TP) \ \ - REAL_TP TPROOT##_real(TP a) { return a.x; } \ - REAL_TP TPROOT##_imag(TP a) { return a.y; } \ - REAL_TP TPROOT##_abs(TP a) { return hypot(a.x, a.y); } \ + REAL_TP TPROOT##_real(TP a) { return a.real; } \ + REAL_TP TPROOT##_imag(TP a) { return a.imag; } \ + REAL_TP TPROOT##_abs(TP a) { return hypot(a.real, a.imag); } \ \ - TP TPROOT##_fromreal(REAL_TP a) { return (TP)(a, 0); } \ - TP TPROOT##_new(REAL_TP a, REAL_TP b) { return (TP)(a, b); } \ - TP TPROOT##_conj(TP a) { return (TP)(a.x, -a.y); } \ + TP TPROOT##_new(REAL_TP real, REAL_TP imag) \ + { \ + TP result; \ + result.real = real; \ + result.imag = imag; \ + return result; \ + } \ + \ + TP TPROOT##_fromreal(REAL_TP real) \ + { \ + TP result; \ + result.real = real; \ + result.imag = 0; \ + return result; \ + } \ + \ + \ + TP TPROOT##_neg(TP a) { return TPROOT##_new(-a.real, -a.imag); } \ + TP TPROOT##_conj(TP a) { return TPROOT##_new(a.real, -a.imag); } \ \ TP TPROOT##_add(TP a, TP b) \ { \ - return a+b; \ + return TPROOT##_new(a.real + b.real, a.imag + b.imag); \ + ; \ } \ TP TPROOT##_addr(TP a, REAL_TP b) \ { \ - return (TP)(b+a.x, a.y); \ + return TPROOT##_new(b+a.real, a.imag); \ } \ TP TPROOT##_radd(REAL_TP a, TP b) \ { \ - return (TP)(a+b.x, b.y); \ + return TPROOT##_new(a+b.real, b.imag); \ + } \ + \ + TP TPROOT##_sub(TP a, TP b) \ + { \ + return TPROOT##_new(a.real - b.real, a.imag - b.imag); \ + ; \ } \ \ TP TPROOT##_mul(TP a, TP b) \ { \ - return (TP)( \ - a.x*b.x - a.y*b.y, \ - a.x*b.y + a.y*b.x); \ + return TPROOT##_new( \ + a.real*b.real - a.imag*b.imag, \ + a.real*b.imag + a.imag*b.real); \ } \ \ TP TPROOT##_mulr(TP a, REAL_TP b) \ { \ - return a*b; \ + return TPROOT##_new(a.real*b, a.imag*b); \ } \ \ TP TPROOT##_rmul(REAL_TP a, TP b) \ { \ - return a*b; \ + return TPROOT##_new(a*b.real, a*b.imag); \ } \ \ TP TPROOT##_rdivide(REAL_TP z1, TP z2) \ { \ - if (fabs(z2.x) <= fabs(z2.y)) { \ - REAL_TP ratio = z2.x / z2.y; \ - REAL_TP denom = z2.y * (1 + ratio * ratio); \ - return (TP)((z1 * ratio) / denom, - z1 / denom); \ + if (fabs(z2.real) <= fabs(z2.imag)) { \ + REAL_TP ratio = z2.real / z2.imag; \ + REAL_TP denom = z2.imag * (1 + ratio * ratio); \ + return TPROOT##_new((z1 * ratio) / denom, - z1 / denom); \ } \ else { \ - REAL_TP ratio = z2.y / z2.x; \ - REAL_TP denom = z2.x * (1 + ratio * ratio); \ - return (TP)(z1 / denom, - (z1 * ratio) / denom); \ + REAL_TP ratio = z2.imag / z2.real; \ + REAL_TP denom = z2.real * (1 + ratio * ratio); \ + return TPROOT##_new(z1 / denom, - (z1 * ratio) / denom); \ } \ } \ \ @@ -88,170 +111,174 @@ { \ REAL_TP ratio, denom, a, b, c, d; \ \ - if (fabs(z2.x) <= fabs(z2.y)) { \ - ratio = z2.x / z2.y; \ - denom = z2.y; \ - a = z1.y; \ - b = z1.x; \ - c = -z1.x; \ - d = z1.y; \ + if (fabs(z2.real) <= fabs(z2.imag)) { \ + ratio = z2.real / z2.imag; \ + denom = z2.imag; \ + a = z1.imag; \ + b = z1.real; \ + c = -z1.real; \ + d = z1.imag; \ } \ else { \ - ratio = z2.y / z2.x; \ - denom = z2.x; \ - a = z1.x; \ - b = z1.y; \ - c = z1.y; \ - d = -z1.x; \ + ratio = z2.imag / z2.real; \ + denom = z2.real; \ + a = z1.real; \ + b = z1.imag; \ + c = z1.imag; \ + d = -z1.real; \ } \ denom *= (1 + ratio * ratio); \ - return (TP)( \ + return TPROOT##_new( \ (a + b * ratio) / denom, \ (c + d * ratio) / denom); \ } \ \ TP TPROOT##_divider(TP a, REAL_TP b) \ { \ - return a/b; \ + return TPROOT##_new(a.real/b, a.imag/b); \ } \ \ TP TPROOT##_pow(TP a, TP b) \ { \ - REAL_TP logr = log(hypot(a.x, a.y)); \ - REAL_TP logi = atan2(a.y, a.x); \ - REAL_TP x = exp(logr * b.x - logi * b.y); \ - REAL_TP y = logr * b.y + logi * b.x; \ + REAL_TP logr = log(hypot(a.real, a.imag)); \ + REAL_TP logi = atan2(a.imag, a.real); \ + REAL_TP x = exp(logr * b.real - logi * b.imag); \ + REAL_TP y = logr * b.imag + logi * b.real; \ \ REAL_TP cosy; \ REAL_TP siny = sincos(y, &cosy); \ - return (TP) (x*cosy, x*siny); \ + return TPROOT##_new(x*cosy, x*siny); \ } \ \ TP TPROOT##_powr(TP a, REAL_TP b) \ { \ - REAL_TP logr = log(hypot(a.x, a.y)); \ - REAL_TP logi = atan2(a.y, a.x); \ + REAL_TP logr = log(hypot(a.real, a.imag)); \ + REAL_TP logi = atan2(a.imag, a.real); \ REAL_TP x = exp(logr * b); \ REAL_TP y = logi * b; \ \ REAL_TP cosy; \ REAL_TP siny = sincos(y, &cosy); \ \ - return (TP)(x * cosy, x*siny); \ + return TPROOT##_new(x * cosy, x*siny); \ } \ \ TP TPROOT##_rpow(REAL_TP a, TP b) \ { \ REAL_TP logr = log(a); \ - REAL_TP x = exp(logr * b.x); \ - REAL_TP y = logr * b.y; \ + REAL_TP x = exp(logr * b.real); \ + REAL_TP y = logr * b.imag; \ \ REAL_TP cosy; \ REAL_TP siny = sincos(y, &cosy); \ - return (TP) (x * cosy, x * siny); \ + return TPROOT##_new(x * cosy, x * siny); \ } \ \ TP TPROOT##_sqrt(TP a) \ { \ - REAL_TP re = a.x; \ - REAL_TP im = a.y; \ + REAL_TP re = a.real; \ + REAL_TP im = a.imag; \ REAL_TP mag = hypot(re, im); \ TP result; \ \ if (mag == 0.f) { \ - result.x = result.y = 0.f; \ + result.real = result.imag = 0.f; \ } else if (re > 0.f) { \ - result.x = sqrt(0.5f * (mag + re)); \ - result.y = im/result.x/2.f; \ + result.real = sqrt(0.5f * (mag + re)); \ + result.imag = im/result.real/2.f; \ } else { \ - result.y = sqrt(0.5f * (mag - re)); \ + result.imag = sqrt(0.5f * (mag - re)); \ if (im < 0.f) \ - result.y = - result.y; \ - result.x = im/result.y/2.f; \ + result.imag = - result.imag; \ + result.real = im/result.imag/2.f; \ } \ return result; \ } \ \ TP TPROOT##_exp(TP a) \ { \ - REAL_TP expr = exp(a.x); \ + REAL_TP expr = exp(a.real); \ REAL_TP cosi; \ - REAL_TP sini = sincos(a.y, &cosi); \ - return (TP)(expr * cosi, expr * sini); \ + REAL_TP sini = sincos(a.imag, &cosi); \ + return TPROOT##_new(expr * cosi, expr * sini); \ } \ \ TP TPROOT##_log(TP a) \ - { return (TP)(log(hypot(a.x, a.y)), atan2(a.y, a.x)); } \ + { return TPROOT##_new(log(hypot(a.real, a.imag)), atan2(a.imag, a.real)); } \ \ TP TPROOT##_sin(TP a) \ { \ REAL_TP cosr; \ - REAL_TP sinr = sincos(a.x, &cosr); \ - return (TP)(sinr*cosh(a.y), cosr*sinh(a.y)); \ + REAL_TP sinr = sincos(a.real, &cosr); \ + return TPROOT##_new(sinr*cosh(a.imag), cosr*sinh(a.imag)); \ } \ \ TP TPROOT##_cos(TP a) \ { \ REAL_TP cosr; \ - REAL_TP sinr = sincos(a.x, &cosr); \ - return (TP)(cosr*cosh(a.y), -sinr*sinh(a.y)); \ + REAL_TP sinr = sincos(a.real, &cosr); \ + return TPROOT##_new(cosr*cosh(a.imag), -sinr*sinh(a.imag)); \ } \ \ TP TPROOT##_tan(TP a) \ { \ - REAL_TP re2 = 2.f * a.x; \ - REAL_TP im2 = 2.f * a.y; \ + REAL_TP re2 = 2.f * a.real; \ + REAL_TP im2 = 2.f * a.imag; \ \ const REAL_TP limit = log(REAL_3LTR##_MAX); \ \ if (fabs(im2) > limit) \ - return (TP)(0.f, (im2 > 0 ? 1.f : -1.f)); \ + return TPROOT##_new(0.f, (im2 > 0 ? 1.f : -1.f)); \ else \ { \ REAL_TP den = cos(re2) + cosh(im2); \ - return (TP) (sin(re2) / den, sinh(im2) / den); \ + return TPROOT##_new(sin(re2) / den, sinh(im2) / den); \ } \ } \ \ TP TPROOT##_sinh(TP a) \ { \ REAL_TP cosi; \ - REAL_TP sini = sincos(a.y, &cosi); \ - return (TP)(sinh(a.x)*cosi, cosh(a.x)*sini); \ + REAL_TP sini = sincos(a.imag, &cosi); \ + return TPROOT##_new(sinh(a.real)*cosi, cosh(a.real)*sini); \ } \ \ TP TPROOT##_cosh(TP a) \ { \ REAL_TP cosi; \ - REAL_TP sini = sincos(a.y, &cosi); \ - return (TP)(cosh(a.x)*cosi, sinh(a.x)*sini); \ + REAL_TP sini = sincos(a.imag, &cosi); \ + return TPROOT##_new(cosh(a.real)*cosi, sinh(a.real)*sini); \ } \ \ TP TPROOT##_tanh(TP a) \ { \ - REAL_TP re2 = 2.f * a.x; \ - REAL_TP im2 = 2.f * a.y; \ + REAL_TP re2 = 2.f * a.real; \ + REAL_TP im2 = 2.f * a.imag; \ \ const REAL_TP limit = log(REAL_3LTR##_MAX); \ \ if (fabs(re2) > limit) \ - return (TP)((re2 > 0 ? 1.f : -1.f), 0.f); \ + return TPROOT##_new((re2 > 0 ? 1.f : -1.f), 0.f); \ else \ { \ REAL_TP den = cosh(re2) + cos(im2); \ - return (TP) (sinh(re2) / den, sin(im2) / den); \ + return TPROOT##_new(sinh(re2) / den, sin(im2) / den); \ } \ } \ #define PYOPENCL_DECLARE_COMPLEX_TYPE(BASE, BASE_3LTR) \ - typedef BASE##2 c##BASE##_t; \ + typedef union \ + { \ + struct { BASE x, y; }; \ + struct { BASE real, imag; }; \ + } c##BASE##_t; \ \ PYOPENCL_DECLARE_COMPLEX_TYPE_INT(BASE, BASE_3LTR, c##BASE, c##BASE##_t) PYOPENCL_DECLARE_COMPLEX_TYPE(float, FLT); -#define cfloat_cast(a) ((cfloat_t) ((a).x, (a).y)) +#define cfloat_cast(a) cfloat_new((a).real, (a).imag) #ifdef PYOPENCL_DEFINE_CDOUBLE PYOPENCL_DECLARE_COMPLEX_TYPE(double, DBL); -#define cdouble_cast(a) ((cdouble_t) ((a).x, (a).y)) +#define cdouble_cast(a) cdouble_new((a).real, (a).imag) #endif diff --git a/pyopencl/cl/pyopencl-hankel-complex.cl b/pyopencl/cl/pyopencl-hankel-complex.cl index d5a0c53d3c04fb6ceacb8ac5fca5205ea7c3e59b..0c5917a61cf9ce4bca4bdb2795e3e0d38d92ee5a 100644 --- a/pyopencl/cl/pyopencl-hankel-complex.cl +++ b/pyopencl/cl/pyopencl-hankel-complex.cl @@ -103,17 +103,17 @@ void hankel_01_complex(cdouble_t z, cdouble_t *h0, cdouble_t *h1, int ifexpon) ; zu = cdouble_conj(z); - zr = (- 1) * zu; + zr = cdouble_rmul(- 1, zu); hank103u(zu, & ier, & h0u, & h1u, ifexpon); hank103r(zr, & ier, & h0r, & h1r, ifexpon); if (ifexpon == 1) goto label_3000; - subt = cdouble_real(cdouble_abs(cdouble_imag(zu))); - cd = cdouble_exp(cdouble_fromreal((- 1) * subt) + cdouble_mul(ima, zu)); + subt = fabs(cdouble_imag(zu)); + cd = cdouble_exp(cdouble_add(cdouble_fromreal((- 1) * subt), cdouble_mul(ima, zu))); h0u = cdouble_mul(h0u, cd); h1u = cdouble_mul(h1u, cd); - cd = cdouble_exp(cdouble_fromreal((- 1) * subt) + cdouble_mul(ima, zr)); + cd = cdouble_exp(cdouble_add(cdouble_fromreal((- 1) * subt), cdouble_mul(ima, zr))); h0r = cdouble_mul(h0r, cd); h1r = cdouble_mul(h1r, cd); label_3000: @@ -121,27 +121,27 @@ void hankel_01_complex(cdouble_t z, cdouble_t *h0, cdouble_t *h1, int ifexpon) half_ = 1; half_ = half_ / 2; - y0 = cdouble_divide(half_ * (h0u + h0r), ima); - fj0 = ((- 1) * half_) * (h0u + ((- 1) * h0r)); - y1 = cdouble_divide(((- 1) * half_) * (h1u + ((- 1) * h1r)), ima); - fj1 = half_ * (h1u + h1r); - z2 = (- 1) * cdouble_conj(z); + y0 = cdouble_divide(cdouble_rmul(half_, cdouble_add(h0u, h0r)), ima); + fj0 = cdouble_rmul((- 1) * half_, cdouble_add(h0u, cdouble_rmul(- 1, h0r))); + y1 = cdouble_divide(cdouble_rmul((- 1) * half_, cdouble_add(h1u, cdouble_rmul(- 1, h1r))), ima); + fj1 = cdouble_rmul(half_, cdouble_add(h1u, h1r)); + z2 = cdouble_rmul(- 1, cdouble_conj(z)); cclog = cdouble_log(z2); - ser2 = y0 + ((- 1) * cdouble_mul((2 * fj0) / pi, cclog)); - ser3 = y1 + ((- 1) * cdouble_mul((2 * fj1) / pi, cclog)); + ser2 = cdouble_add(y0, cdouble_rmul(- 1, cdouble_mul(cdouble_divider(cdouble_rmul(2, fj0), pi), cclog))); + ser3 = cdouble_add(y1, cdouble_rmul(- 1, cdouble_mul(cdouble_divider(cdouble_rmul(2, fj1), pi), cclog))); fj0 = cdouble_conj(fj0); - fj1 = (- 1) * cdouble_conj(fj1); + fj1 = cdouble_rmul(- 1, cdouble_conj(fj1)); ser2 = cdouble_conj(ser2); - ser3 = (- 1) * cdouble_conj(ser3); + ser3 = cdouble_rmul(- 1, cdouble_conj(ser3)); cclog = cdouble_log(z); - y0 = ser2 + cdouble_mul((2 * fj0) / pi, cclog); - y1 = ser3 + cdouble_mul((2 * fj1) / pi, cclog); - * h0 = fj0 + cdouble_mul(ima, y0); - * h1 = fj1 + cdouble_mul(ima, y1); + y0 = cdouble_add(ser2, cdouble_mul(cdouble_divider(cdouble_rmul(2, fj0), pi), cclog)); + y1 = cdouble_add(ser3, cdouble_mul(cdouble_divider(cdouble_rmul(2, fj1), pi), cclog)); + * h0 = cdouble_add(fj0, cdouble_mul(ima, y0)); + * h1 = cdouble_add(fj1, cdouble_mul(ima, y1)); if (ifexpon == 1) return; - cd = cdouble_exp(cdouble_fromreal(subt) + ((- 1) * cdouble_mul(ima, z))); + cd = cdouble_exp(cdouble_add(cdouble_fromreal(subt), cdouble_rmul(- 1, cdouble_mul(ima, z)))); * h0 = cdouble_mul(* h0, cd); * h1 = cdouble_mul(* h1, cd); } @@ -158,6 +158,7 @@ void hank103u(cdouble_t z, int *ier, cdouble_t *h0, cdouble_t *h1, int ifexpon) cdouble_t ccex; cdouble_t cd; + double com; double d; double done; cdouble_t ima = {0.0e0, 1.0e0}; @@ -167,6 +168,7 @@ void hank103u(cdouble_t z, int *ier, cdouble_t *h0, cdouble_t *h1, int ifexpon) double thresh3; cdouble_t zzz9; * ier = 0; + com = cdouble_real(z); if (cdouble_imag(z) >= 0) goto label_1200; @@ -234,7 +236,7 @@ void hank103p(__constant cdouble_t *p, int m, cdouble_t z, cdouble_t *f) * f = p[m - 1]; for (i = m + (- 1); i >= 1; i += - 1) { - * f = cdouble_mul(* f, z) + p[i - 1]; + * f = cdouble_add(cdouble_mul(* f, z), p[i - 1]); label_1200: ; @@ -275,10 +277,10 @@ void hank103a(cdouble_t z, cdouble_t *h0, cdouble_t *h1, int ifexpon) qq1 = cdouble_fromreal(hank103a_q1[m - 1]); for (i = m + (- 1); i >= 1; i += - 1) { - pp = cdouble_fromreal(hank103a_p[i - 1]) + cdouble_mul(pp, zinv22); - pp1 = cdouble_fromreal(hank103a_p1[i - 1]) + cdouble_mul(pp1, zinv22); - qq = cdouble_fromreal(hank103a_q[i - 1]) + cdouble_mul(qq, zinv22); - qq1 = cdouble_fromreal(hank103a_q1[i - 1]) + cdouble_mul(qq1, zinv22); + pp = cdouble_add(cdouble_fromreal(hank103a_p[i - 1]), cdouble_mul(pp, zinv22)); + pp1 = cdouble_add(cdouble_fromreal(hank103a_p1[i - 1]), cdouble_mul(pp1, zinv22)); + qq = cdouble_add(cdouble_fromreal(hank103a_q[i - 1]), cdouble_mul(qq, zinv22)); + qq1 = cdouble_add(cdouble_fromreal(hank103a_q1[i - 1]), cdouble_mul(qq1, zinv22)); label_1600: ; @@ -290,11 +292,11 @@ void hank103a(cdouble_t z, cdouble_t *h0, cdouble_t *h1, int ifexpon) if (ifexpon == 1) cccexp = cdouble_exp(cdouble_mul(ima, z)); - cdd = cdouble_sqrt((2 / pi) * zinv); - * h0 = pp + cdouble_mul(ima, qq); + cdd = cdouble_sqrt(cdouble_rmul(2 / pi, zinv)); + * h0 = cdouble_add(pp, cdouble_mul(ima, qq)); * h0 = cdouble_mul(cdouble_mul(cdouble_mul(cdd, cdumb), cccexp), * h0); - * h1 = pp1 + cdouble_mul(ima, qq1); - * h1 = (- 1) * cdouble_mul(cdouble_mul(cdouble_mul(cdouble_mul(cdd, cccexp), cdumb), * h1), ima); + * h1 = cdouble_add(pp1, cdouble_mul(ima, qq1)); + * h1 = cdouble_rmul(- 1, cdouble_mul(cdouble_mul(cdouble_mul(cdouble_mul(cdd, cccexp), cdumb), * h1), ima)); } __constant double hank103l_cj0[] = {0.1000000000000000e+01, (- 1) * 0.2500000000000000e+00, 0.1562500000000000e-01, (- 1) * 0.4340277777777778e-03, 0.6781684027777778e-05, (- 1) * 0.6781684027777778e-07, 0.4709502797067901e-09, (- 1) * 0.2402807549524439e-11, 0.9385966990329841e-14, (- 1) * 0.2896903392077112e-16, 0.7242258480192779e-19, (- 1) * 0.1496334396734045e-21, 0.2597802772107717e-24, (- 1) * 0.3842903509035085e-27, 0.4901662639075363e-30, (- 1) * 0.5446291821194848e-33}; @@ -329,29 +331,29 @@ void hank103l(cdouble_t z, cdouble_t *h0, cdouble_t *h1, int ifexpon) cd = cdouble_fromreal(1); for (i = 1; i <= m; i += 1) { - fj0 = fj0 + (hank103l_cj0[i - 1] * cd); - fj1 = fj1 + (hank103l_cj1[i - 1] * cd); - y1 = y1 + (hank103l_ser2der[i - 1] * cd); + fj0 = cdouble_add(fj0, cdouble_rmul(hank103l_cj0[i - 1], cd)); + fj1 = cdouble_add(fj1, cdouble_rmul(hank103l_cj1[i - 1], cd)); + y1 = cdouble_add(y1, cdouble_rmul(hank103l_ser2der[i - 1], cd)); cd = cdouble_mul(cd, z2); - y0 = y0 + (hank103l_ser2[i - 1] * cd); + y0 = cdouble_add(y0, cdouble_rmul(hank103l_ser2[i - 1], cd)); label_1800: ; } - fj1 = (- 1) * cdouble_mul(fj1, z); - cdddlog = cdouble_fromreal(gamma) + cdouble_log(z / two); - y0 = cdouble_mul(cdddlog, fj0) + y0; - y0 = (two / pi) * y0; + fj1 = cdouble_rmul(- 1, cdouble_mul(fj1, z)); + cdddlog = cdouble_add(cdouble_fromreal(gamma), cdouble_log(cdouble_divider(z, two))); + y0 = cdouble_add(cdouble_mul(cdddlog, fj0), y0); + y0 = cdouble_rmul(two / pi, y0); y1 = cdouble_mul(y1, z); - y1 = (((- 1) * cdouble_mul(cdddlog, fj1)) + cdouble_divide(fj0, z)) + y1; - y1 = (((- 1) * two) * y1) / pi; - * h0 = fj0 + cdouble_mul(ima, y0); - * h1 = fj1 + cdouble_mul(ima, y1); + y1 = cdouble_add(cdouble_add(cdouble_rmul(- 1, cdouble_mul(cdddlog, fj1)), cdouble_divide(fj0, z)), y1); + y1 = cdouble_divider(cdouble_rmul((- 1) * two, y1), pi); + * h0 = cdouble_add(fj0, cdouble_mul(ima, y0)); + * h1 = cdouble_add(fj1, cdouble_mul(ima, y1)); if (ifexpon == 1) return; - cd = cdouble_exp((- 1) * cdouble_mul(ima, z)); + cd = cdouble_exp(cdouble_rmul(- 1, cdouble_mul(ima, z))); * h0 = cdouble_mul(* h0, cd); * h1 = cdouble_mul(* h1, cd); } @@ -439,3 +441,4 @@ void hank103r(cdouble_t z, int *ier, cdouble_t *h0, cdouble_t *h1, int ifexpon) hank103a(z, & (* h0), & (* h1), ifexpon); } + diff --git a/pyopencl/elementwise.py b/pyopencl/elementwise.py index 285c3e93fab1177363349915f43d3a2bb58a8578..47fe3d9585b1aebf08cde69624066e5088a081ae 100644 --- a/pyopencl/elementwise.py +++ b/pyopencl/elementwise.py @@ -525,7 +525,6 @@ def get_axpbyz_kernel(context, dtype_x, dtype_y, dtype_z): x_is_complex = dtype_x.kind == "c" y_is_complex = dtype_y.kind == "c" - z_is_complex = dtype_z.kind == "c" if x_is_complex: ax = "%s_mul(a, x[i])" % complex_dtype_to_name(dtype_x) @@ -539,9 +538,15 @@ def get_axpbyz_kernel(context, dtype_x, dtype_y, dtype_z): if not x_is_complex and y_is_complex: ax = "%s_fromreal(%s)" % (complex_dtype_to_name(dtype_y), ax) - result = "%s + %s" % (ax, by) - if z_is_complex: - result = "%s_cast(%s)" % (complex_dtype_to_name(dtype_z), result) + if x_is_complex or y_is_complex: + result = ( + "{root}_add({root}_cast({ax}), {root}_cast({by}))" + .format( + ax=ax, + by=by, + root=complex_dtype_to_name(dtype_z))) + else: + result = "%s + %s" % (ax, by) return get_elwise_kernel(context, "%(tp_z)s *z, %(tp_x)s a, %(tp_x)s *x, %(tp_y)s b, %(tp_y)s *y" % { @@ -562,24 +567,27 @@ def get_axpbz_kernel(context, dtype_a, dtype_x, dtype_b, dtype_z): z_is_complex = dtype_z.kind == "c" ax = "a*x[i]" - if a_is_complex and x_is_complex: + if x_is_complex: a = "a" x = "x[i]" - if dtype_a != dtype_z: - a = "%s_cast(%s)" % (complex_dtype_to_name(dtype_z), a) if dtype_x != dtype_z: x = "%s_cast(%s)" % (complex_dtype_to_name(dtype_z), x) - ax = "%s_mul(%s, %s)" % (complex_dtype_to_name(dtype_z), a, x) + if a_is_complex: + if dtype_a != dtype_z: + a = "%s_cast(%s)" % (complex_dtype_to_name(dtype_z), a) - # The following two are workarounds for Apple on OS X 10.8. - # They're not really necessary. + ax = "%s_mul(%s, %s)" % (complex_dtype_to_name(dtype_z), a, x) + else: + ax = "%s_rmul(%s, %s)" % (complex_dtype_to_name(dtype_z), a, x) + elif a_is_complex: + a = "a" + x = "x[i]" - elif a_is_complex and not x_is_complex: - ax = "a*((%s) x[i])" % dtype_to_ctype(real_dtype(dtype_a)) - elif not a_is_complex and x_is_complex: - ax = "((%s) a)*x[i]" % dtype_to_ctype(real_dtype(dtype_x)) + if dtype_a != dtype_z: + a = "%s_cast(%s)" % (complex_dtype_to_name(dtype_z), a) + ax = "%s_mulr(%s, %s)" % (complex_dtype_to_name(dtype_z), a, x) b = "b" if z_is_complex and not b_is_complex: @@ -592,6 +600,14 @@ def get_axpbz_kernel(context, dtype_a, dtype_x, dtype_b, dtype_z): ax = "%s_cast(%s)" % (complex_dtype_to_name(dtype_z), ax) b = "%s_cast(%s)" % (complex_dtype_to_name(dtype_z), b) + if a_is_complex or x_is_complex or b_is_complex: + expr = "{root}_add({ax}, {b})".format( + ax=ax, + b=b, + root=complex_dtype_to_name(dtype_z)) + else: + expr = "%s + %s" % (ax, b) + return get_elwise_kernel(context, "%(tp_z)s *z, %(tp_a)s a, %(tp_x)s *x,%(tp_b)s b" % { "tp_a": dtype_to_ctype(dtype_a), @@ -599,7 +615,7 @@ def get_axpbz_kernel(context, dtype_a, dtype_x, dtype_b, dtype_z): "tp_b": dtype_to_ctype(dtype_b), "tp_z": dtype_to_ctype(dtype_z), }, - "z[i] = %s + %s" % (ax, b), + "z[i] = " + expr, name="axpb") @@ -607,7 +623,6 @@ def get_axpbz_kernel(context, dtype_a, dtype_x, dtype_b, dtype_z): def get_multiply_kernel(context, dtype_x, dtype_y, dtype_z): x_is_complex = dtype_x.kind == "c" y_is_complex = dtype_y.kind == "c" - z_is_complex = dtype_z.kind == "c" x = "x[i]" y = "y[i]" @@ -619,13 +634,13 @@ def get_multiply_kernel(context, dtype_x, dtype_y, dtype_z): if x_is_complex and y_is_complex: xy = "%s_mul(%s, %s)" % (complex_dtype_to_name(dtype_z), x, y) - + elif x_is_complex and not y_is_complex: + xy = "%s_mulr(%s, %s)" % (complex_dtype_to_name(dtype_z), x, y) + elif not x_is_complex and y_is_complex: + xy = "%s_rmul(%s, %s)" % (complex_dtype_to_name(dtype_z), x, y) else: xy = "%s * %s" % (x, y) - if z_is_complex: - xy = "%s_cast(%s)" % (complex_dtype_to_name(dtype_z), xy) - return get_elwise_kernel(context, "%(tp_z)s *z, %(tp_x)s *x, %(tp_y)s *y" % { "tp_x": dtype_to_ctype(dtype_x), @@ -654,8 +669,9 @@ def get_divide_kernel(context, dtype_x, dtype_y, dtype_z): if x_is_complex and y_is_complex: xoy = "%s_divide(%s, %s)" % (complex_dtype_to_name(dtype_z), x, y) elif not x_is_complex and y_is_complex: - xoy = "%s_rdivide(%s, %s)" % (complex_dtype_to_name(dtype_z), x, y) + elif x_is_complex and not y_is_complex: + xoy = "%s_divider(%s, %s)" % (complex_dtype_to_name(dtype_z), x, y) else: xoy = "%s / %s" % (x, y) @@ -691,7 +707,9 @@ def get_rdivide_elwise_kernel(context, dtype_x, dtype_y, dtype_z): if x_is_complex and y_is_complex: yox = "%s_divide(%s, %s)" % (complex_dtype_to_name(dtype_z), y, x) elif not y_is_complex and x_is_complex: - yox = "%s_rdivide(%s, %s)" % (complex_dtype_to_name(dtype_x), y, x) + yox = "%s_rdivide(%s, %s)" % (complex_dtype_to_name(dtype_z), y, x) + elif y_is_complex and not x_is_complex: + yox = "%s_divider(%s, %s)" % (complex_dtype_to_name(dtype_z), y, x) else: yox = "%s / %s" % (y, x) @@ -728,16 +746,18 @@ def get_reverse_kernel(context, dtype): @context_dependent_memoize def get_arange_kernel(context, dtype): if dtype.kind == "c": - i = "%s_fromreal(i)" % complex_dtype_to_name(dtype) + expr = ( + "{root}_add(start, {root}_rmul(i, step))" + .format(root=complex_dtype_to_name(dtype))) else: - i = "(%s) i" % dtype_to_ctype(dtype) + expr = "start + ((%s) i)*step" % dtype_to_ctype(dtype) return get_elwise_kernel(context, [ VectorArg(dtype, "z", with_offset=True), ScalarArg(dtype, "start"), ScalarArg(dtype, "step"), ], - "z[i] = start + %s*step" % i, + "z[i] = " + expr, name="arange") diff --git a/pyopencl/reduction.py b/pyopencl/reduction.py index ba7c6dc77f69337c8e535284a782e31eb9b613a4..c1a02573e85f750cd20f52217261784c26e99e34 100644 --- a/pyopencl/reduction.py +++ b/pyopencl/reduction.py @@ -63,7 +63,8 @@ KERNEL = """//CL// __global out_type *out__base, long out__offset, ${arguments}, unsigned int seq_count, unsigned int n) { - __global out_type *out = (__global out_type *) ((__global char *) out__base + out__offset); + __global out_type *out = (__global out_type *) ( + (__global char *) out__base + out__offset); ${arg_prep} __local out_type ldata[GROUP_SIZE]; @@ -325,8 +326,8 @@ class ReductionKernel: "that have at least one vector argument" def __call__(self, *args, **kwargs): - MAX_GROUP_COUNT = 1024 - SMALL_SEQ_COUNT = 4 + MAX_GROUP_COUNT = 1024 # noqa + SMALL_SEQ_COUNT = 4 # noqa from pyopencl.array import empty @@ -392,7 +393,8 @@ class ReductionKernel: use_queue, (group_count*stage_inf.group_size,), (stage_inf.group_size,), - *([result.base_data, result.offset] + invocation_args + [seq_count, sz]), + *([result.base_data, result.offset] + + invocation_args + [seq_count, sz]), **dict(wait_for=wait_for)) wait_for = [last_evt] @@ -473,7 +475,15 @@ def get_sum_kernel(ctx, dtype_out, dtype_in): if dtype_out is None: dtype_out = dtype_in - return ReductionKernel(ctx, dtype_out, "0", "a+b", + reduce_expr = "a+b" + neutral_expr = "0" + if dtype_out.kind == "c": + from pyopencl.elementwise import complex_dtype_to_name + dtname = complex_dtype_to_name(dtype_out) + reduce_expr = "%s_add(a, b)" % dtname + neutral_expr = "%s_new(0, 0)" % dtname + + return ReductionKernel(ctx, dtype_out, neutral_expr, reduce_expr, arguments="const %(tp)s *in" % {"tp": dtype_to_ctype(dtype_in)}) @@ -492,49 +502,30 @@ def _get_dot_expr(dtype_out, dtype_a, dtype_b, conjugate_first, dtype_a.type(0), dtype_b.type(0), has_double_support) - a_real_dtype = dtype_a.type(0).real.dtype - b_real_dtype = dtype_b.type(0).real.dtype - out_real_dtype = dtype_out.type(0).real.dtype - a_is_complex = dtype_a.kind == "c" b_is_complex = dtype_b.kind == "c" - out_is_complex = dtype_out.kind == "c" from pyopencl.elementwise import complex_dtype_to_name - if a_is_complex and b_is_complex: - a = "a[%s]" % index_expr - b = "b[%s]" % index_expr - if dtype_a != dtype_out: - a = "%s_cast(%s)" % (complex_dtype_to_name(dtype_out), a) - if dtype_b != dtype_out: - b = "%s_cast(%s)" % (complex_dtype_to_name(dtype_out), b) - - if conjugate_first and a_is_complex: - a = "%s_conj(%s)" % ( - complex_dtype_to_name(dtype_out), a) - - map_expr = "%s_mul(%s, %s)" % ( - complex_dtype_to_name(dtype_out), a, b) - else: - a = "a[%s]" % index_expr - b = "b[%s]" % index_expr - - if out_is_complex: - if a_is_complex and dtype_a != dtype_out: - a = "%s_cast(%s)" % (complex_dtype_to_name(dtype_out), a) - if b_is_complex and dtype_b != dtype_out: - b = "%s_cast(%s)" % (complex_dtype_to_name(dtype_out), b) + a = "a[%s]" % index_expr + b = "b[%s]" % index_expr - if not a_is_complex and a_real_dtype != out_real_dtype: - a = "(%s) (%s)" % (dtype_to_ctype(out_real_dtype), a) - if not b_is_complex and b_real_dtype != out_real_dtype: - b = "(%s) (%s)" % (dtype_to_ctype(out_real_dtype), b) + if a_is_complex and (dtype_a != dtype_out): + a = "%s_cast(%s)" % (complex_dtype_to_name(dtype_out), a) + if b_is_complex and (dtype_b != dtype_out): + b = "%s_cast(%s)" % (complex_dtype_to_name(dtype_out), b) - if conjugate_first and a_is_complex: - a = "%s_conj(%s)" % ( - complex_dtype_to_name(dtype_out), a) + if a_is_complex and conjugate_first and a_is_complex: + a = "%s_conj(%s)" % ( + complex_dtype_to_name(dtype_out), a) + if a_is_complex and not b_is_complex: + map_expr = "%s_mulr(%s, %s)" % (complex_dtype_to_name(dtype_out), a, b) + elif not a_is_complex and b_is_complex: + map_expr = "%s_rmul(%s, %s)" % (complex_dtype_to_name(dtype_out), a, b) + elif a_is_complex and b_is_complex: + map_expr = "%s_mul(%s, %s)" % (complex_dtype_to_name(dtype_out), a, b) + else: map_expr = "%s*%s" % (a, b) return map_expr, dtype_out, dtype_b @@ -548,14 +539,22 @@ def get_dot_kernel(ctx, dtype_out, dtype_a=None, dtype_b=None, dtype_out, dtype_a, dtype_b, conjugate_first, has_double_support=has_double_support(ctx.devices[0])) - return ReductionKernel(ctx, dtype_out, neutral="0", - reduce_expr="a+b", map_expr=map_expr, - arguments= - "const %(tp_a)s *a, " - "const %(tp_b)s *b" % { - "tp_a": dtype_to_ctype(dtype_a), - "tp_b": dtype_to_ctype(dtype_b), - }) + reduce_expr = "a+b" + neutral_expr = "0" + if dtype_out.kind == "c": + from pyopencl.elementwise import complex_dtype_to_name + dtname = complex_dtype_to_name(dtype_out) + reduce_expr = "%s_add(a, b)" % dtname + neutral_expr = "%s_new(0, 0)" % dtname + + return ReductionKernel(ctx, dtype_out, neutral=neutral_expr, + reduce_expr=reduce_expr, map_expr=map_expr, + arguments=( + "const %(tp_a)s *a, " + "const %(tp_b)s *b" % { + "tp_a": dtype_to_ctype(dtype_a), + "tp_b": dtype_to_ctype(dtype_b), + })) @context_dependent_memoize @@ -570,14 +569,14 @@ def get_subset_dot_kernel(ctx, dtype_out, dtype_subset, dtype_a=None, dtype_b=No # important: lookup_tbl must be first--it controls the length return ReductionKernel(ctx, dtype_out, neutral="0", reduce_expr="a+b", map_expr=map_expr, - arguments= - "const %(tp_lut)s *lookup_tbl, " - "const %(tp_a)s *a, " - "const %(tp_b)s *b" % { - "tp_lut": dtype_to_ctype(dtype_subset), - "tp_a": dtype_to_ctype(dtype_a), - "tp_b": dtype_to_ctype(dtype_b), - }) + arguments=( + "const %(tp_lut)s *lookup_tbl, " + "const %(tp_a)s *a, " + "const %(tp_b)s *b" % { + "tp_lut": dtype_to_ctype(dtype_subset), + "tp_a": dtype_to_ctype(dtype_a), + "tp_b": dtype_to_ctype(dtype_b), + })) def get_minmax_neutral(what, dtype): @@ -628,12 +627,13 @@ def get_subset_minmax_kernel(ctx, what, dtype, dtype_subset): neutral=get_minmax_neutral(what, dtype), reduce_expr="%(reduce_expr)s" % {"reduce_expr": reduce_expr}, map_expr="in[lookup_tbl[i]]", - arguments= - "const %(tp_lut)s *lookup_tbl, " - "const %(tp)s *in" % { - "tp": dtype_to_ctype(dtype), - "tp_lut": dtype_to_ctype(dtype_subset), - }, preamble="#define MY_INFINITY (1./0)") + arguments=( + "const %(tp_lut)s *lookup_tbl, " + "const %(tp)s *in" % { + "tp": dtype_to_ctype(dtype), + "tp_lut": dtype_to_ctype(dtype_subset), + }), + preamble="#define MY_INFINITY (1./0)") # }}}