diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml new file mode 100644 index 0000000000000000000000000000000000000000..e387cbc3bcd66ae9bd56b8273890353e49cd6917 --- /dev/null +++ b/.gitlab-ci.yml @@ -0,0 +1,48 @@ +Python 2.7 AMD CPU: + script: + - py_version=2.7 + - cl_dev=amd_pu + - EXTRA_INSTALL="numpy mako" + - curl -L -O -k https://gitlab.tiker.net/inducer/ci-support/raw/master/build-and-test-py-project.sh + - ". ./build-and-test-py-project.sh" + tags: + - python2.7 + - amd-cl-cpu + except: + - tags +Python 3.4 AMD CPU: + script: + - py_version=3.4 + - cl_dev=amd_pu + - EXTRA_INSTALL="numpy mako" + - curl -L -O -k https://gitlab.tiker.net/inducer/ci-support/raw/master/build-and-test-py-project.sh + - ". ./build-and-test-py-project.sh" + tags: + - python3.4 + - amd-cl-cpu + except: + - tags +Python 2.6 AMD CPU: + script: + - py_version=2.6 + - cl_dev=amd_pu + - EXTRA_INSTALL="numpy mako" + - curl -L -O -k https://gitlab.tiker.net/inducer/ci-support/raw/master/build-and-test-py-project.sh + - ". ./build-and-test-py-project.sh" + tags: + - python2.6 + - amd-cl-cpu + except: + - tags +Python 2.7 K20: + script: + - py_version=2.7 + - cl_dev=nvi_k20 + - EXTRA_INSTALL="numpy mako" + - curl -L -O -k https://gitlab.tiker.net/inducer/ci-support/raw/master/build-and-test-py-project.sh + - ". ./build-and-test-py-project.sh" + tags: + - python2.7 + - cuda + except: + - tags diff --git a/contrib/fortran-to-opencl/setup.cfg b/contrib/fortran-to-opencl/setup.cfg new file mode 100644 index 0000000000000000000000000000000000000000..d3f13a0e64b79c00a957cb1369e335e0b8a00d76 --- /dev/null +++ b/contrib/fortran-to-opencl/setup.cfg @@ -0,0 +1,3 @@ +[flake8] +ignore = E126,E127,E128,E123,E226,E241,E242,E265,N802 +max-line-length=85 diff --git a/contrib/fortran-to-opencl/translate.py b/contrib/fortran-to-opencl/translate.py index d0355423e10b274ecfc658ed3f1b7c722586bb82..4c3486f4d52c135aacd2a915c90089fd71e8acb2 100644 --- a/contrib/fortran-to-opencl/translate.py +++ b/contrib/fortran-to-opencl/translate.py @@ -27,7 +27,7 @@ import numpy as np import re from pymbolic.parser import Parser as ExpressionParserBase from pymbolic.mapper import CombineMapper -import pymbolic.primitives +import pymbolic.primitives as p from pymbolic.mapper.c_code import CCodeMapper as CCodeMapperBase from warnings import warn @@ -43,6 +43,15 @@ class TranslationError(RuntimeError): pass +def complex_type_name(dtype): + if dtype == np.complex64: + return "cfloat" + if dtype == np.complex128: + return "cdouble" + else: + raise RuntimeError + + # {{{ AST components def dtype_to_ctype(dtype): @@ -99,7 +108,7 @@ _and = intern("and") _or = intern("or") -class TypedLiteral(pymbolic.primitives.Leaf): +class TypedLiteral(p.Leaf): def __init__(self, value, dtype): self.value = value self.dtype = np.dtype(dtype) @@ -110,6 +119,18 @@ class TypedLiteral(pymbolic.primitives.Leaf): mapper_method = intern("map_literal") +def simplify_typed_literal(expr): + if (isinstance(expr, p.Product) + and len(expr.children) == 2 + and isinstance(expr.children[1], TypedLiteral) + and p.is_constant(expr.children[0]) + and expr.children[0] == -1): + tl = expr.children[1] + return TypedLiteral("-"+tl.value, tl.dtype) + else: + return expr + + class FortranExpressionParser(ExpressionParserBase): # FIXME double/single prec literals @@ -134,7 +155,6 @@ class FortranExpressionParser(ExpressionParserBase): def parse_terminal(self, pstate): scope = self.tree_walker.scope_stack[-1] - from pymbolic.primitives import Subscript, Call, Variable from pymbolic.parser import ( _identifier, _openpar, _closepar, _float) @@ -164,17 +184,17 @@ class FortranExpressionParser(ExpressionParserBase): # not a subscript scope.use_name(name) - return Variable(name) + return p.Variable(name) - left_exp = Variable(name) + left_exp = p.Variable(name) pstate.advance() pstate.expect_not_end() if scope.is_known(name): - cls = Subscript + cls = p.Subscript else: - cls = Call + cls = p.Call if pstate.next_tag is _closepar: pstate.advance() @@ -219,14 +239,14 @@ class FortranExpressionParser(ExpressionParserBase): _PREC_CALL, _PREC_COMPARISON, _openpar, _PREC_LOGICAL_OR, _PREC_LOGICAL_AND) from pymbolic.primitives import ( - ComparisonOperator, LogicalAnd, LogicalOr) + Comparison, LogicalAnd, LogicalOr) next_tag = pstate.next_tag() if next_tag is _openpar and _PREC_CALL > min_precedence: raise TranslationError("parenthesis operator only works on names") elif next_tag in self.COMP_MAP and _PREC_COMPARISON > min_precedence: pstate.advance() - left_exp = ComparisonOperator( + left_exp = Comparison( left_exp, self.COMP_MAP[next_tag], self.parse_expression(pstate, _PREC_COMPARISON)) @@ -250,7 +270,10 @@ class FortranExpressionParser(ExpressionParserBase): assert len(left_exp) == 2 r, i = left_exp - dtype = (r.dtype.type(0) + i.dtype.type(0)) + r = simplify_typed_literal(r) + i = simplify_typed_literal(i) + + dtype = (r.dtype.type(0) + i.dtype.type(0)).dtype if dtype == np.float32: dtype = np.complex64 else: @@ -295,6 +318,17 @@ class TypeInferenceMapper(CombineMapper): else: raise RuntimeError("unexpected complex type") + elif name in ["imag", "real", "abs", "dble"]: + arg, = expr.parameters + base_dtype = self.rec(arg) + + if base_dtype == np.complex128: + return np.dtype(np.float64) + elif base_dtype == np.complex64: + return np.dtype(np.float32) + else: + return base_dtype + else: return CombineMapper.map_call(self, expr) @@ -304,14 +338,6 @@ class ComplexCCodeMapper(CCodeMapperBase): CCodeMapperBase.__init__(self) self.infer_type = infer_type - def complex_type_name(self, dtype): - if dtype == np.complex64: - return "cfloat" - if dtype == np.complex128: - return "cdouble" - else: - raise RuntimeError - def map_sum(self, expr, enclosing_prec): tgt_dtype = self.infer_type(expr) is_complex = tgt_dtype.kind == 'c' @@ -319,19 +345,30 @@ class ComplexCCodeMapper(CCodeMapperBase): if not is_complex: return CCodeMapperBase.map_sum(self, expr, enclosing_prec) else: - tgt_name = self.complex_type_name(tgt_dtype) + tgt_name = complex_type_name(tgt_dtype) reals = [child for child in expr.children if 'c' != self.infer_type(child).kind] complexes = [child for child in expr.children if 'c' == self.infer_type(child).kind] - from pymbolic.mapper.stringifier import PREC_SUM + from pymbolic.mapper.stringifier import PREC_SUM, PREC_NONE real_sum = self.join_rec(" + ", reals, PREC_SUM) - complex_sum = self.join_rec(" + ", complexes, PREC_SUM) + + if len(complexes) == 1: + myprec = PREC_SUM + else: + myprec = PREC_NONE + + complex_sum = self.rec(complexes[0], myprec) + for child in complexes[1:]: + complex_sum = "%s_add(%s, %s)" % ( + tgt_name, complex_sum, + self.rec(child, PREC_NONE)) if real_sum: - result = "%s_fromreal(%s) + %s" % (tgt_name, real_sum, complex_sum) + result = "%s_add(%s_fromreal(%s), %s)" % ( + tgt_name, tgt_name, real_sum, complex_sum) else: result = complex_sum @@ -344,7 +381,7 @@ class ComplexCCodeMapper(CCodeMapperBase): if not is_complex: return CCodeMapperBase.map_product(self, expr, enclosing_prec) else: - tgt_name = self.complex_type_name(tgt_dtype) + tgt_name = complex_type_name(tgt_dtype) reals = [child for child in expr.children if 'c' != self.infer_type(child).kind] @@ -366,8 +403,7 @@ class ComplexCCodeMapper(CCodeMapperBase): self.rec(child, PREC_NONE)) if real_prd: - # elementwise semantics are correct - result = "%s * %s" % (real_prd, complex_prd) + result = "%s_rmul(%s, %s)" % (tgt_name, real_prd, complex_prd) else: result = complex_prd @@ -383,16 +419,18 @@ class ComplexCCodeMapper(CCodeMapperBase): if not (n_complex or d_complex): return CCodeMapperBase.map_quotient(self, expr, enclosing_prec) elif n_complex and not d_complex: - # elementwise semnatics are correct - return CCodeMapperBase.map_quotient(self, expr, enclosing_prec) + return "%s_divider(%s, %s)" % ( + complex_type_name(tgt_dtype), + self.rec(expr.numerator, PREC_NONE), + self.rec(expr.denominator, PREC_NONE)) elif not n_complex and d_complex: return "%s_rdivide(%s, %s)" % ( - self.complex_type_name(tgt_dtype), + complex_type_name(tgt_dtype), self.rec(expr.numerator, PREC_NONE), self.rec(expr.denominator, PREC_NONE)) else: return "%s_divide(%s, %s)" % ( - self.complex_type_name(tgt_dtype), + complex_type_name(tgt_dtype), self.rec(expr.numerator, PREC_NONE), self.rec(expr.denominator, PREC_NONE)) @@ -419,12 +457,12 @@ class ComplexCCodeMapper(CCodeMapperBase): if b_complex and not e_complex: return "%s_powr(%s, %s)" % ( - self.complex_type_name(tgt_dtype), + complex_type_name(tgt_dtype), self.rec(expr.base, PREC_NONE), self.rec(expr.exponent, PREC_NONE)) else: return "%s_pow(%s, %s)" % ( - self.complex_type_name(tgt_dtype), + complex_type_name(tgt_dtype), self.rec(expr.base, PREC_NONE), self.rec(expr.exponent, PREC_NONE)) @@ -464,28 +502,42 @@ class CCodeMapper(ComplexCCodeMapper): from pymbolic.mapper.stringifier import PREC_NONE tgt_dtype = self.infer_type(expr) + arg_dtypes = [self.infer_type(par) for par in expr.parameters] name = expr.function.name if 'f' == tgt_dtype.kind and name == "abs": name = "fabs" - if 'c' == tgt_dtype.kind: + elif 'c' == tgt_dtype.kind: if name in ["conjg", "dconjg"]: name = "conj" if name[:2] == "cd" and name[2:] in ["log", "exp", "sqrt"]: name = name[2:] - if name == "aimag": - name = "imag" - if name == "dble": name = "real" name = "%s_%s" % ( - self.complex_type_name(tgt_dtype), + complex_type_name(tgt_dtype), name) + elif name in ["aimag", "real", "imag"] and tgt_dtype.kind == "f": + arg_dtype, = arg_dtypes + + if name == "aimag": + name = "imag" + + name = "%s_%s" % ( + complex_type_name(arg_dtype), + name) + + elif 'c' == tgt_dtype.kind and name == "abs": + arg_dtype, = arg_dtypes + + name = "%s_abs" % ( + complex_type_name(arg_dtype)) + return self.format("%s(%s)", name, self.join_rec(", ", expr.parameters, PREC_NONE)) @@ -512,7 +564,10 @@ class CCodeMapper(ComplexCCodeMapper): from pymbolic.mapper.stringifier import PREC_NONE if expr.dtype.kind == "c": r, i = expr.value - return "{ %s, %s }" % (self.rec(r, PREC_NONE), self.rec(i, PREC_NONE)) + return "%s_new(%s, %s)" % ( + complex_type_name(expr.dtype), + self.rec(r, PREC_NONE), + self.rec(i, PREC_NONE)) else: return expr.value @@ -758,10 +813,9 @@ class ArgumentAnalayzer(FTreeWalkerBase): lhs = self.parse_expr(node.variable) - from pymbolic.primitives import Subscript, Call - if isinstance(lhs, Subscript): + if isinstance(lhs, p.Subscript): lhs_name = lhs.aggregate.name - elif isinstance(lhs, Call): + elif isinstance(lhs, p.Call): # in absence of dim info, subscripts get parsed as calls lhs_name = lhs.function.name else: @@ -797,11 +851,10 @@ class ArgumentAnalayzer(FTreeWalkerBase): def map_Call(self, node): scope = self.scope_stack[-1] - from pymbolic.primitives import Subscript, Variable for i, arg_str in enumerate(node.items): arg = self.parse_expr(arg_str) - if isinstance(arg, (Variable, Subscript)): - if isinstance(arg, Subscript): + if isinstance(arg, (p.Variable, p.Subscript)): + if isinstance(arg, p.Subscript): arg_name = arg.aggregate.name else: arg_name = arg.name @@ -926,9 +979,9 @@ class F2CLTranslator(FTreeWalkerBase): if shape is not None: result.append(cgen.Statement( "%s %s[nitemsof(%s)]" - % ( - dtype_to_ctype(scope.get_type(name)), - name, name))) + % ( + dtype_to_ctype(scope.get_type(name)), + name, name))) else: result.append(self.get_declarator(name)) @@ -1358,6 +1411,13 @@ def f2cl_files(source_file, target_file, **kwargs): if __name__ == "__main__": + import logging + console = logging.StreamHandler() + console.setLevel(logging.DEBUG) + formatter = logging.Formatter('%(name)-12s: %(levelname)-8s %(message)s') + console.setFormatter(formatter) + logging.getLogger('fparser').addHandler(console) + from cgen.opencl import CLConstant if 0: diff --git a/doc/misc.rst b/doc/misc.rst index 950e27da8412d84cec520687ae9d65e0aa58d819..31b954e3fe8c637b15f8d8ab6fa5853b40ed583e 100644 --- a/doc/misc.rst +++ b/doc/misc.rst @@ -110,13 +110,22 @@ other software to be turned into the corresponding :mod:`pyopencl` objects. User-visible Changes ==================== -Version 2014.1 +Version 2015.2 -------------- .. note:: This version is currently under development. You can get snapshots from PyOpenCL's `git repository <https://github.com/pyopencl/pyopencl>`_ +Version 2015.1 +-------------- + +* Support for new-style buffer protocol +* Numerous fixes + +Version 2014.1 +-------------- + * :ref:`ipython-integration` * Bug fixes diff --git a/doc/tools.rst b/doc/tools.rst index e85b4e9a39ac8dca6be4b8a4a6831ab49bf8852c..9c647b1992c6337cc58d73ec24d13e4ecb6a18af 100644 --- a/doc/tools.rst +++ b/doc/tools.rst @@ -26,7 +26,7 @@ the available memory. Using :class:`pyopencl.array.Array` instances with a :class:`MemoryPool` is not complicated:: - mem_pool = cl_tools.MemoryPool(cl_tools.ImmediateAllocator(queue)) + mem_pool = pyopencl.tools.MemoryPool(pyopencl.tools.ImmediateAllocator(queue)) a_dev = cl_array.arange(queue, 2000, dtype=np.float32, allocator=mem_pool) .. class:: PooledBuffer diff --git a/pyopencl/algorithm.py b/pyopencl/algorithm.py index 387e31a4d0a455368ca45bef287220b5f9ead72e..0fb9c611a93f43610298c66533a80e9572fe44fc 100644 --- a/pyopencl/algorithm.py +++ b/pyopencl/algorithm.py @@ -392,6 +392,9 @@ RADIX_SORT_OUTPUT_STMT_TPL = Template(r"""//CL// # {{{ driver +# import hoisted here to be used as a default argument in the constructor +from pyopencl.scan import GenericScanKernel + class RadixSort(object): """Provides a general `radix sort <https://en.wikipedia.org/wiki/Radix_sort>`_ @@ -401,7 +404,7 @@ class RadixSort(object): """ def __init__(self, context, arguments, key_expr, sort_arg_names, bits_at_a_time=2, index_dtype=np.int32, key_dtype=np.uint32, - options=[]): + scan_kernel=GenericScanKernel, options=[]): """ :arg arguments: A string of comma-separated C argument declarations. If *arguments* is specified, then *input_expr* must also be @@ -469,8 +472,7 @@ class RadixSort(object): scan_preamble = preamble \ + RADIX_SORT_SCAN_PREAMBLE_TPL.render(**codegen_args) - from pyopencl.scan import GenericScanKernel - self.scan_kernel = GenericScanKernel( + self.scan_kernel = scan_kernel( context, scan_dtype, arguments=scan_arguments, input_expr="scan_t_from_value(%s, base_bit, i)" % key_expr, @@ -821,8 +823,8 @@ class ListOfListsBuilder: from pyopencl.tools import VectorArg, OtherArg kernel_list_args = [ VectorArg(index_dtype, "plb_%s_count" % name) - for name, dtype in self.list_names_and_dtypes - if name not in self.count_sharing] + for name, dtype in self.list_names_and_dtypes + if name not in self.count_sharing] user_list_args = [] for name, dtype in self.list_names_and_dtypes: diff --git a/pyopencl/array.py b/pyopencl/array.py index 8d29b6c1d74db82d4230cb3f513a41e820bad7a8..bd2ee902ea179e4ec39db06884ac14ecd3f84d1f 100644 --- a/pyopencl/array.py +++ b/pyopencl/array.py @@ -60,7 +60,7 @@ except: # {{{ vector types -class vec: +class vec: # noqa pass @@ -271,7 +271,7 @@ class ArrayHasOffsetError(ValueError): ValueError.__init__(self, val) -class _copy_queue: +class _copy_queue: # noqa pass diff --git a/pyopencl/characterize/__init__.py b/pyopencl/characterize/__init__.py index 0a6aed652b9b3a67613e7be349d422f9fb6faa01..26e5b994bc2a265e1e6805d79b59e688ee7f31a8 100644 --- a/pyopencl/characterize/__init__.py +++ b/pyopencl/characterize/__init__.py @@ -313,19 +313,6 @@ def get_simd_group_size(dev, type_size): if dev.type & cl.device_type.CPU: # implicit assumption: Impl. will vectorize - - if type_size == 1: - return dev.preferred_vector_width_char - elif type_size == 2: - return dev.preferred_vector_width_short - elif type_size == 4: - return dev.preferred_vector_width_float - elif type_size == 8: - return dev.preferred_vector_width_double - else: - from warnings import warn - warn("unexpected dtype size in get_simd_group on CPU device, " - "guessing group width 1") - return 1 + return 1 return None diff --git a/pyopencl/cl/pyopencl-bessel-j-complex.cl b/pyopencl/cl/pyopencl-bessel-j-complex.cl new file mode 100644 index 0000000000000000000000000000000000000000..99592828adf23f5bcf12a6a18b2c2d14941f3c82 --- /dev/null +++ b/pyopencl/cl/pyopencl-bessel-j-complex.cl @@ -0,0 +1,240 @@ +/* +Evaluate Bessel J function J_v(z) and J_{v+1}(z) with v a nonnegative integer +and z anywhere in the complex plane. + +Copyright (C) Vladimir Rokhlin +Copyright (C) 2010-2012 Leslie Greengard and Zydrunas Gimbutas +Copyright (C) 2015 Shidong Jiang, Andreas Kloeckner + +Manually translated from +https://github.com/zgimbutas/fmmlib2d/blob/master/src/cdjseval2d.f + +Originally licensed under GPL, permission to license under MIT granted via email +by Vladimir Rokhlin on May 25, 2015 and by Zydrunas Gimbutas on May 17, 2015. + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in +all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +THE SOFTWARE. + +*/ + +void bessel_j_complex(int v, cdouble_t z, cdouble_t *j_v, cdouble_t *j_vp1) +{ + int n; + int nmax = 10000; + + int k; + int kmax=8; + + int vscale, vp1scale; + double vscaling, vp1scaling; + + const double small = 2e-1; + const double median = 1.0e0; + + const double upbound = 1e40; + const double upbound_inv = 1e-40; + + double dd; + double k_factorial_inv, kv_factorial_inv, kvp1_factorial_inv; + + cdouble_t z_half, mz_half2, mz_half_2k, z_half_v, z_half_vp1; + + cdouble_t ima = cdouble_new(0, 1); + cdouble_t neg_ima = cdouble_new(0, -1); + + cdouble_t zinv, ztmp; + cdouble_t j_nm1, j_n, j_np1; + + cdouble_t psi, zsn, zmul, zmulinv; + cdouble_t unscaled_j_n, unscaled_j_nm1, unscaled_j_np1; + cdouble_t unscaled_j_v, unscaled_j_vp1; + cdouble_t scaling; + + // assert( v >= 0 ); + +#if 0 + if (cdouble_abs(z) < tiny) + { + if (v == 0) + { + *j_v = cdouble_new(1, 0); + *j_vp1 = cdouble_new(0, 0); + } else + { + *j_v = cdouble_new(0, 0); + *j_vp1 = cdouble_new(0, 0); + } + return; + } +#endif + + // {{{ power series for (small z) or (large v and median z) + if ( (cdouble_abs(z) < small) || ( (v>12) && (cdouble_abs(z) < median))) + { + z_half = cdouble_divider(z,2.0); + + mz_half2 = cdouble_neg(cdouble_mul(z_half, z_half)); + + z_half_v = cdouble_powr(z_half, v); + z_half_vp1 = cdouble_mul(z_half_v, z_half); + + + // compute 1/v! + kv_factorial_inv = 1.0; + for ( k = 1; k <= v; k++) + { + kv_factorial_inv /= k; + } + + kvp1_factorial_inv = kv_factorial_inv / (v+1); + + k_factorial_inv = 1.0; + + // compute the power series of bessel j function + mz_half_2k = cdouble_new(1.0, 0); + + *j_v = cdouble_new(0, 0); + *j_vp1 = cdouble_new(0, 0); + + for ( k = 0; k < kmax; k++ ) + { + *j_v = cdouble_add( + *j_v, + cdouble_mulr(mz_half_2k, kv_factorial_inv*k_factorial_inv)); + *j_vp1 = cdouble_add(*j_vp1, + cdouble_mulr(mz_half_2k, kvp1_factorial_inv*k_factorial_inv)); + + mz_half_2k = cdouble_mul(mz_half_2k, mz_half2); + k_factorial_inv /= (k+1); + kv_factorial_inv /= (k+v+1); + kvp1_factorial_inv /= (k+v+2); + } + + *j_v = cdouble_mul(*j_v, z_half_v ); + *j_vp1 = cdouble_mul(*j_vp1, z_half_vp1 ); + + return; + } + + // }}} + + // {{{ use recurrence for large z + + j_nm1 = cdouble_new(0, 0); + j_n = cdouble_new(1, 0); + + n = v; + + zinv = cdouble_rdivide(1,z); + + while (true) + { + j_np1 = cdouble_sub( + cdouble_mul(cdouble_rmul(2*n, zinv), j_n), + j_nm1); + + n += 1; + j_nm1 = j_n; + j_n = j_np1; + + if (n > nmax) + { + *j_v = cdouble_new(nan(0x8e55e1u), 0); + *j_vp1 = cdouble_new(nan(0x8e55e1u), 0); + return; + } + + dd = pow((cdouble_real(j_n)), 2)+pow(cdouble_imag(j_n),2); + if (dd > upbound) + break; + } + + // downward recursion, account for rescalings + // Record the number of times of the missed rescalings + // for j_v and j_vp1. + + unscaled_j_np1 = cdouble_new(0, 0); + unscaled_j_n = cdouble_new(1, 0); + + // Use normalization condition http://dlmf.nist.gov/10.12#E5 + psi = cdouble_new(0, 0); + + if (cdouble_imag(z) <= 0) + zmul = ima; + else + zmul = neg_ima; + + zsn = cdouble_powr(zmul, n%4); + + zmulinv = cdouble_rdivide(1, zmul); + + vscale = 0; + vp1scale = 0; + + while (n > 0) + { + ztmp = cdouble_sub( + cdouble_mul(cdouble_rmul(2*n, zinv), unscaled_j_n), + unscaled_j_np1); + dd = pow(cdouble_real(ztmp), 2) + pow(cdouble_imag(ztmp), 2); + + unscaled_j_nm1 = ztmp; + + + psi = cdouble_add(psi, cdouble_mul(unscaled_j_n, zsn)); + zsn = cdouble_mul(zsn, zmulinv); + + n -= 1; + unscaled_j_np1 = unscaled_j_n; + unscaled_j_n = unscaled_j_nm1; + + if (dd > upbound) + { + unscaled_j_np1 = cdouble_rmul(upbound_inv, unscaled_j_np1); + unscaled_j_n = cdouble_rmul(upbound_inv, unscaled_j_n); + psi = cdouble_rmul(upbound_inv,psi); + if (n < v) vscale++; + if (n < v+1) vp1scale++; + } + + if (n == v) + unscaled_j_v = unscaled_j_n; + if (n == v+1) + unscaled_j_vp1 = unscaled_j_n; + + } + + psi = cdouble_add(cdouble_rmul(2, psi), unscaled_j_n); + + if ( cdouble_imag(z) <= 0 ) + { + scaling = cdouble_divide( cdouble_exp( cdouble_mul(ima,z) ), psi); + } else + { + scaling = cdouble_divide( cdouble_exp( cdouble_mul(neg_ima,z) ), psi); + } + vscaling = pow(upbound_inv, vscale); + vp1scaling = pow(upbound_inv, vp1scale); + + *j_v = cdouble_mul(unscaled_j_v, cdouble_mulr(scaling, vscaling)); + *j_vp1 = cdouble_mul(unscaled_j_vp1, cdouble_mulr(scaling,vp1scaling)); + + // }}} +} + +// vim: fdm=marker diff --git a/pyopencl/cl/pyopencl-complex.h b/pyopencl/cl/pyopencl-complex.h index 67ae2cfd0c97275291013d8a2daae5f0689085fd..976bf81a62dcef2511cb20a4d01220542fca23be 100644 --- a/pyopencl/cl/pyopencl-complex.h +++ b/pyopencl/cl/pyopencl-complex.h @@ -28,57 +28,82 @@ // multiplication (float2*float1) is defined for these types, // but do not match the rules of complex arithmetic. +#pragma once + #define PYOPENCL_DECLARE_COMPLEX_TYPE_INT(REAL_TP, REAL_3LTR, TPROOT, TP) \ \ - REAL_TP TPROOT##_real(TP a) { return a.x; } \ - REAL_TP TPROOT##_imag(TP a) { return a.y; } \ - REAL_TP TPROOT##_abs(TP a) { return hypot(a.x, a.y); } \ + REAL_TP TPROOT##_real(TP a) { return a.real; } \ + REAL_TP TPROOT##_imag(TP a) { return a.imag; } \ + REAL_TP TPROOT##_abs(TP a) { return hypot(a.real, a.imag); } \ + \ + TP TPROOT##_new(REAL_TP real, REAL_TP imag) \ + { \ + TP result; \ + result.real = real; \ + result.imag = imag; \ + return result; \ + } \ + \ + TP TPROOT##_fromreal(REAL_TP real) \ + { \ + TP result; \ + result.real = real; \ + result.imag = 0; \ + return result; \ + } \ + \ \ - TP TPROOT##_fromreal(REAL_TP a) { return (TP)(a, 0); } \ - TP TPROOT##_new(REAL_TP a, REAL_TP b) { return (TP)(a, b); } \ - TP TPROOT##_conj(TP a) { return (TP)(a.x, -a.y); } \ + TP TPROOT##_neg(TP a) { return TPROOT##_new(-a.real, -a.imag); } \ + TP TPROOT##_conj(TP a) { return TPROOT##_new(a.real, -a.imag); } \ \ TP TPROOT##_add(TP a, TP b) \ { \ - return a+b; \ + return TPROOT##_new(a.real + b.real, a.imag + b.imag); \ + ; \ } \ TP TPROOT##_addr(TP a, REAL_TP b) \ { \ - return (TP)(b+a.x, a.y); \ + return TPROOT##_new(b+a.real, a.imag); \ } \ TP TPROOT##_radd(REAL_TP a, TP b) \ { \ - return (TP)(a+b.x, b.y); \ + return TPROOT##_new(a+b.real, b.imag); \ + } \ + \ + TP TPROOT##_sub(TP a, TP b) \ + { \ + return TPROOT##_new(a.real - b.real, a.imag - b.imag); \ + ; \ } \ \ TP TPROOT##_mul(TP a, TP b) \ { \ - return (TP)( \ - a.x*b.x - a.y*b.y, \ - a.x*b.y + a.y*b.x); \ + return TPROOT##_new( \ + a.real*b.real - a.imag*b.imag, \ + a.real*b.imag + a.imag*b.real); \ } \ \ TP TPROOT##_mulr(TP a, REAL_TP b) \ { \ - return a*b; \ + return TPROOT##_new(a.real*b, a.imag*b); \ } \ \ TP TPROOT##_rmul(REAL_TP a, TP b) \ { \ - return a*b; \ + return TPROOT##_new(a*b.real, a*b.imag); \ } \ \ TP TPROOT##_rdivide(REAL_TP z1, TP z2) \ { \ - if (fabs(z2.x) <= fabs(z2.y)) { \ - REAL_TP ratio = z2.x / z2.y; \ - REAL_TP denom = z2.y * (1 + ratio * ratio); \ - return (TP)((z1 * ratio) / denom, - z1 / denom); \ + if (fabs(z2.real) <= fabs(z2.imag)) { \ + REAL_TP ratio = z2.real / z2.imag; \ + REAL_TP denom = z2.imag * (1 + ratio * ratio); \ + return TPROOT##_new((z1 * ratio) / denom, - z1 / denom); \ } \ else { \ - REAL_TP ratio = z2.y / z2.x; \ - REAL_TP denom = z2.x * (1 + ratio * ratio); \ - return (TP)(z1 / denom, - (z1 * ratio) / denom); \ + REAL_TP ratio = z2.imag / z2.real; \ + REAL_TP denom = z2.real * (1 + ratio * ratio); \ + return TPROOT##_new(z1 / denom, - (z1 * ratio) / denom); \ } \ } \ \ @@ -86,170 +111,174 @@ { \ REAL_TP ratio, denom, a, b, c, d; \ \ - if (fabs(z2.x) <= fabs(z2.y)) { \ - ratio = z2.x / z2.y; \ - denom = z2.y; \ - a = z1.y; \ - b = z1.x; \ - c = -z1.x; \ - d = z1.y; \ + if (fabs(z2.real) <= fabs(z2.imag)) { \ + ratio = z2.real / z2.imag; \ + denom = z2.imag; \ + a = z1.imag; \ + b = z1.real; \ + c = -z1.real; \ + d = z1.imag; \ } \ else { \ - ratio = z2.y / z2.x; \ - denom = z2.x; \ - a = z1.x; \ - b = z1.y; \ - c = z1.y; \ - d = -z1.x; \ + ratio = z2.imag / z2.real; \ + denom = z2.real; \ + a = z1.real; \ + b = z1.imag; \ + c = z1.imag; \ + d = -z1.real; \ } \ denom *= (1 + ratio * ratio); \ - return (TP)( \ + return TPROOT##_new( \ (a + b * ratio) / denom, \ (c + d * ratio) / denom); \ } \ \ TP TPROOT##_divider(TP a, REAL_TP b) \ { \ - return a/b; \ + return TPROOT##_new(a.real/b, a.imag/b); \ } \ \ TP TPROOT##_pow(TP a, TP b) \ { \ - REAL_TP logr = log(hypot(a.x, a.y)); \ - REAL_TP logi = atan2(a.y, a.x); \ - REAL_TP x = exp(logr * b.x - logi * b.y); \ - REAL_TP y = logr * b.y + logi * b.x; \ + REAL_TP logr = log(hypot(a.real, a.imag)); \ + REAL_TP logi = atan2(a.imag, a.real); \ + REAL_TP x = exp(logr * b.real - logi * b.imag); \ + REAL_TP y = logr * b.imag + logi * b.real; \ \ REAL_TP cosy; \ REAL_TP siny = sincos(y, &cosy); \ - return (TP) (x*cosy, x*siny); \ + return TPROOT##_new(x*cosy, x*siny); \ } \ \ TP TPROOT##_powr(TP a, REAL_TP b) \ { \ - REAL_TP logr = log(hypot(a.x, a.y)); \ - REAL_TP logi = atan2(a.y, a.x); \ + REAL_TP logr = log(hypot(a.real, a.imag)); \ + REAL_TP logi = atan2(a.imag, a.real); \ REAL_TP x = exp(logr * b); \ REAL_TP y = logi * b; \ \ REAL_TP cosy; \ REAL_TP siny = sincos(y, &cosy); \ \ - return (TP)(x * cosy, x*siny); \ + return TPROOT##_new(x * cosy, x*siny); \ } \ \ TP TPROOT##_rpow(REAL_TP a, TP b) \ { \ REAL_TP logr = log(a); \ - REAL_TP x = exp(logr * b.x); \ - REAL_TP y = logr * b.y; \ + REAL_TP x = exp(logr * b.real); \ + REAL_TP y = logr * b.imag; \ \ REAL_TP cosy; \ REAL_TP siny = sincos(y, &cosy); \ - return (TP) (x * cosy, x * siny); \ + return TPROOT##_new(x * cosy, x * siny); \ } \ \ TP TPROOT##_sqrt(TP a) \ { \ - REAL_TP re = a.x; \ - REAL_TP im = a.y; \ + REAL_TP re = a.real; \ + REAL_TP im = a.imag; \ REAL_TP mag = hypot(re, im); \ TP result; \ \ if (mag == 0.f) { \ - result.x = result.y = 0.f; \ + result.real = result.imag = 0.f; \ } else if (re > 0.f) { \ - result.x = sqrt(0.5f * (mag + re)); \ - result.y = im/result.x/2.f; \ + result.real = sqrt(0.5f * (mag + re)); \ + result.imag = im/result.real/2.f; \ } else { \ - result.y = sqrt(0.5f * (mag - re)); \ + result.imag = sqrt(0.5f * (mag - re)); \ if (im < 0.f) \ - result.y = - result.y; \ - result.x = im/result.y/2.f; \ + result.imag = - result.imag; \ + result.real = im/result.imag/2.f; \ } \ return result; \ } \ \ TP TPROOT##_exp(TP a) \ { \ - REAL_TP expr = exp(a.x); \ + REAL_TP expr = exp(a.real); \ REAL_TP cosi; \ - REAL_TP sini = sincos(a.y, &cosi); \ - return (TP)(expr * cosi, expr * sini); \ + REAL_TP sini = sincos(a.imag, &cosi); \ + return TPROOT##_new(expr * cosi, expr * sini); \ } \ \ TP TPROOT##_log(TP a) \ - { return (TP)(log(hypot(a.x, a.y)), atan2(a.y, a.x)); } \ + { return TPROOT##_new(log(hypot(a.real, a.imag)), atan2(a.imag, a.real)); } \ \ TP TPROOT##_sin(TP a) \ { \ REAL_TP cosr; \ - REAL_TP sinr = sincos(a.x, &cosr); \ - return (TP)(sinr*cosh(a.y), cosr*sinh(a.y)); \ + REAL_TP sinr = sincos(a.real, &cosr); \ + return TPROOT##_new(sinr*cosh(a.imag), cosr*sinh(a.imag)); \ } \ \ TP TPROOT##_cos(TP a) \ { \ REAL_TP cosr; \ - REAL_TP sinr = sincos(a.x, &cosr); \ - return (TP)(cosr*cosh(a.y), -sinr*sinh(a.y)); \ + REAL_TP sinr = sincos(a.real, &cosr); \ + return TPROOT##_new(cosr*cosh(a.imag), -sinr*sinh(a.imag)); \ } \ \ TP TPROOT##_tan(TP a) \ { \ - REAL_TP re2 = 2.f * a.x; \ - REAL_TP im2 = 2.f * a.y; \ + REAL_TP re2 = 2.f * a.real; \ + REAL_TP im2 = 2.f * a.imag; \ \ const REAL_TP limit = log(REAL_3LTR##_MAX); \ \ if (fabs(im2) > limit) \ - return (TP)(0.f, (im2 > 0 ? 1.f : -1.f)); \ + return TPROOT##_new(0.f, (im2 > 0 ? 1.f : -1.f)); \ else \ { \ REAL_TP den = cos(re2) + cosh(im2); \ - return (TP) (sin(re2) / den, sinh(im2) / den); \ + return TPROOT##_new(sin(re2) / den, sinh(im2) / den); \ } \ } \ \ TP TPROOT##_sinh(TP a) \ { \ REAL_TP cosi; \ - REAL_TP sini = sincos(a.y, &cosi); \ - return (TP)(sinh(a.x)*cosi, cosh(a.x)*sini); \ + REAL_TP sini = sincos(a.imag, &cosi); \ + return TPROOT##_new(sinh(a.real)*cosi, cosh(a.real)*sini); \ } \ \ TP TPROOT##_cosh(TP a) \ { \ REAL_TP cosi; \ - REAL_TP sini = sincos(a.y, &cosi); \ - return (TP)(cosh(a.x)*cosi, sinh(a.x)*sini); \ + REAL_TP sini = sincos(a.imag, &cosi); \ + return TPROOT##_new(cosh(a.real)*cosi, sinh(a.real)*sini); \ } \ \ TP TPROOT##_tanh(TP a) \ { \ - REAL_TP re2 = 2.f * a.x; \ - REAL_TP im2 = 2.f * a.y; \ + REAL_TP re2 = 2.f * a.real; \ + REAL_TP im2 = 2.f * a.imag; \ \ const REAL_TP limit = log(REAL_3LTR##_MAX); \ \ if (fabs(re2) > limit) \ - return (TP)((re2 > 0 ? 1.f : -1.f), 0.f); \ + return TPROOT##_new((re2 > 0 ? 1.f : -1.f), 0.f); \ else \ { \ REAL_TP den = cosh(re2) + cos(im2); \ - return (TP) (sinh(re2) / den, sin(im2) / den); \ + return TPROOT##_new(sinh(re2) / den, sin(im2) / den); \ } \ } \ #define PYOPENCL_DECLARE_COMPLEX_TYPE(BASE, BASE_3LTR) \ - typedef BASE##2 c##BASE##_t; \ + typedef union \ + { \ + struct { BASE x, y; }; \ + struct { BASE real, imag; }; \ + } c##BASE##_t; \ \ PYOPENCL_DECLARE_COMPLEX_TYPE_INT(BASE, BASE_3LTR, c##BASE, c##BASE##_t) PYOPENCL_DECLARE_COMPLEX_TYPE(float, FLT); -#define cfloat_cast(a) ((cfloat_t) ((a).x, (a).y)) +#define cfloat_cast(a) cfloat_new((a).real, (a).imag) #ifdef PYOPENCL_DEFINE_CDOUBLE PYOPENCL_DECLARE_COMPLEX_TYPE(double, DBL); -#define cdouble_cast(a) ((cdouble_t) ((a).x, (a).y)) +#define cdouble_cast(a) cdouble_new((a).real, (a).imag) #endif diff --git a/pyopencl/cl/pyopencl-hankel-complex.cl b/pyopencl/cl/pyopencl-hankel-complex.cl new file mode 100644 index 0000000000000000000000000000000000000000..3115b8de42ae1ccaf608d89f4685365657123a98 --- /dev/null +++ b/pyopencl/cl/pyopencl-hankel-complex.cl @@ -0,0 +1,444 @@ +/* +Evaluate Hankel function of first kind of order 0 and 1 for argument z +anywhere in the complex plane. + +Copyright (C) Vladimir Rokhlin +Copyright (C) 2010-2012 Leslie Greengard and Zydrunas Gimbutas +Copyright (C) 2015 Andreas Kloeckner + +Auto-translated from +https://github.com/zgimbutas/fmmlib2d/blob/master/src/hank103.f +using +https://github.com/pyopencl/pyopencl/tree/master/contrib/fortran-to-opencl + +Originally licensed under GPL, permission to license under MIT granted via email +by Vladimir Rokhlin on May 25, 2015 and by Zydrunas Gimbutas on May 17, 2015. + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in +all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +THE SOFTWARE. +*/ + +void hank103(cdouble_t z, cdouble_t *h0, cdouble_t *h1, int ifexpon); +void hank103u(cdouble_t z, int *ier, cdouble_t *h0, cdouble_t *h1, int ifexpon); +void hank103p(__constant cdouble_t *p, int m, cdouble_t z, cdouble_t *f); +void hank103a(cdouble_t z, cdouble_t *h0, cdouble_t *h1, int ifexpon); +void hank103l(cdouble_t z, cdouble_t *h0, cdouble_t *h1, int ifexpon); +void hank103r(cdouble_t z, int *ier, cdouble_t *h0, cdouble_t *h1, int ifexpon); + +/* + * this subroutine evaluates the hankel functions H_0^1, H_1^1 + * for an arbitrary user-specified complex number z. The user + * also has the option of evaluating the functions h0, h1 + * scaled by the (complex) coefficient e^{-i \cdot z}. This + * subroutine is a modification of the subroutine hank102 + * (see), different from the latter by having the parameter + * ifexpon. Please note that the subroutine hank102 is in + * turn a slightly accelerated version of the old hank101 + * (see). The principal claim to fame of all three is that + * they are valid on the whole complex plane, and are + * reasonably accurate (14-digit relative accuracy) and + * reasonably fast. Also, please note that all three have not + * been carefully tested in the third quadrant (both x and y + * negative); some sort of numerical trouble is possible + * (though has not been observed) for LARGE z in the third + * quadrant. + * + * ifexpon = 1 will cause the subroutine to evaluate the Hankel functions + * honestly + * ifexpon = 0 will cause the subroutine to scale the Hankel functions + * by e^{-i \cdot z}. + */ + +void hankel_01_complex(cdouble_t z, cdouble_t *h0, cdouble_t *h1, int ifexpon) +{ + cdouble_t cclog; + cdouble_t cd; + cdouble_t fj0; + cdouble_t fj1; + cdouble_t h0r; + cdouble_t h0u; + cdouble_t h1r; + cdouble_t h1u; + double half_; + int ier; + cdouble_t ima = cdouble_new(0.0e0, 1.0e0); + double pi = 0.31415926535897932e+01; + cdouble_t ser2; + cdouble_t ser3; + double subt; + cdouble_t y0; + cdouble_t y1; + cdouble_t z2; + cdouble_t zr; + cdouble_t zu; + if (cdouble_imag(z) < 0) + goto label_1400; + + hank103u(z, & ier, & (* h0), & (* h1), ifexpon); + return; + label_1400: + ; + + if (cdouble_real(z) < 0) + goto label_2000; + + hank103r(z, & ier, & (* h0), & (* h1), ifexpon); + return; + label_2000: + ; + + zu = cdouble_conj(z); + zr = cdouble_rmul(- 1, zu); + hank103u(zu, & ier, & h0u, & h1u, ifexpon); + hank103r(zr, & ier, & h0r, & h1r, ifexpon); + if (ifexpon == 1) + goto label_3000; + + subt = fabs(cdouble_imag(zu)); + cd = cdouble_exp(cdouble_add(cdouble_fromreal((- 1) * subt), cdouble_mul(ima, zu))); + h0u = cdouble_mul(h0u, cd); + h1u = cdouble_mul(h1u, cd); + cd = cdouble_exp(cdouble_add(cdouble_fromreal((- 1) * subt), cdouble_mul(ima, zr))); + h0r = cdouble_mul(h0r, cd); + h1r = cdouble_mul(h1r, cd); + label_3000: + ; + + half_ = 1; + half_ = half_ / 2; + y0 = cdouble_divide(cdouble_rmul(half_, cdouble_add(h0u, h0r)), ima); + fj0 = cdouble_rmul((- 1) * half_, cdouble_add(h0u, cdouble_rmul(- 1, h0r))); + y1 = cdouble_divide(cdouble_rmul((- 1) * half_, cdouble_add(h1u, cdouble_rmul(- 1, h1r))), ima); + fj1 = cdouble_rmul(half_, cdouble_add(h1u, h1r)); + z2 = cdouble_rmul(- 1, cdouble_conj(z)); + cclog = cdouble_log(z2); + ser2 = cdouble_add(y0, cdouble_rmul(- 1, cdouble_mul(cdouble_divider(cdouble_rmul(2, fj0), pi), cclog))); + ser3 = cdouble_add(y1, cdouble_rmul(- 1, cdouble_mul(cdouble_divider(cdouble_rmul(2, fj1), pi), cclog))); + fj0 = cdouble_conj(fj0); + fj1 = cdouble_rmul(- 1, cdouble_conj(fj1)); + ser2 = cdouble_conj(ser2); + ser3 = cdouble_rmul(- 1, cdouble_conj(ser3)); + cclog = cdouble_log(z); + y0 = cdouble_add(ser2, cdouble_mul(cdouble_divider(cdouble_rmul(2, fj0), pi), cclog)); + y1 = cdouble_add(ser3, cdouble_mul(cdouble_divider(cdouble_rmul(2, fj1), pi), cclog)); + * h0 = cdouble_add(fj0, cdouble_mul(ima, y0)); + * h1 = cdouble_add(fj1, cdouble_mul(ima, y1)); + if (ifexpon == 1) + return; + + cd = cdouble_exp(cdouble_add(cdouble_fromreal(subt), cdouble_rmul(- 1, cdouble_mul(ima, z)))); + * h0 = cdouble_mul(* h0, cd); + * h1 = cdouble_mul(* h1, cd); +} + +__constant double hank103u_c0p1[] = {(- 1) * 0.6619836118357782e-12, (- 1) * 0.6619836118612709e-12, (- 1) * 0.7307514264754200e-21, 0.3928160926261892e-10, 0.5712712520172854e-09, (- 1) * 0.5712712519967086e-09, (- 1) * 0.1083820384008718e-07, (- 1) * 0.1894529309455499e-18, 0.7528123700585197e-07, 0.7528123700841491e-07, 0.1356544045548053e-16, (- 1) * 0.8147940452202855e-06, (- 1) * 0.3568198575016769e-05, 0.3568198574899888e-05, 0.2592083111345422e-04, 0.4209074870019400e-15, (- 1) * 0.7935843289157352e-04, (- 1) * 0.7935843289415642e-04, (- 1) * 0.6848330800445365e-14, 0.4136028298630129e-03, 0.9210433149997867e-03, (- 1) * 0.9210433149680665e-03, (- 1) * 0.3495306809056563e-02, (- 1) * 0.6469844672213905e-13, 0.5573890502766937e-02, 0.5573890503000873e-02, 0.3767341857978150e-12, (- 1) * 0.1439178509436339e-01, (- 1) * 0.1342403524448708e-01, 0.1342403524340215e-01, 0.8733016209933828e-02, 0.1400653553627576e-11, 0.2987361261932706e-01, 0.2987361261607835e-01, (- 1) * 0.3388096836339433e-11, (- 1) * 0.1690673895793793e+00, 0.2838366762606121e+00, (- 1) * 0.2838366762542546e+00, 0.7045107746587499e+00, (- 1) * 0.5363893133864181e-11, (- 1) * 0.7788044738211666e+00, (- 1) * 0.7788044738130360e+00, 0.5524779104964783e-11, 0.1146003459721775e+01, 0.6930697486173089e+00, (- 1) * 0.6930697486240221e+00, (- 1) * 0.7218270272305891e+00, 0.3633022466839301e-11, 0.3280924142354455e+00, 0.3280924142319602e+00, (- 1) * 0.1472323059106612e-11, (- 1) * 0.2608421334424268e+00, (- 1) * 0.9031397649230536e-01, 0.9031397649339185e-01, 0.5401342784296321e-01, (- 1) * 0.3464095071668884e-12, (- 1) * 0.1377057052946721e-01, (- 1) * 0.1377057052927901e-01, 0.4273263742980154e-13, 0.5877224130705015e-02, 0.1022508471962664e-02, (- 1) * 0.1022508471978459e-02, (- 1) * 0.2789107903871137e-03, 0.2283984571396129e-14, 0.2799719727019427e-04, 0.2799719726970900e-04, (- 1) * 0.3371218242141487e-16, (- 1) * 0.3682310515545645e-05, (- 1) * 0.1191412910090512e-06, 0.1191412910113518e-06}; +__constant double hank103u_c0p2[] = {0.5641895835516786e+00, (- 1) * 0.5641895835516010e+00, (- 1) * 0.3902447089770041e-09, (- 1) * 0.3334441074447365e-11, (- 1) * 0.7052368835911731e-01, (- 1) * 0.7052368821797083e-01, 0.1957299315085370e-08, (- 1) * 0.3126801711815631e-06, (- 1) * 0.3967331737107949e-01, 0.3967327747706934e-01, 0.6902866639752817e-04, 0.3178420816292497e-06, 0.4080457166061280e-01, 0.4080045784614144e-01, (- 1) * 0.2218731025620065e-04, 0.6518438331871517e-02, 0.9798339748600499e-01, (- 1) * 0.9778028374972253e-01, (- 1) * 0.3151825524811773e+00, (- 1) * 0.7995603166188139e-03, 0.1111323666639636e+01, 0.1116791178994330e+01, 0.1635711249533488e-01, (- 1) * 0.8527067497983841e+01, (- 1) * 0.2595553689471247e+02, 0.2586942834408207e+02, 0.1345583522428299e+03, 0.2002017907999571e+00, (- 1) * 0.3086364384881525e+03, (- 1) * 0.3094609382885628e+03, (- 1) * 0.1505974589617013e+01, 0.1250150715797207e+04, 0.2205210257679573e+04, (- 1) * 0.2200328091885836e+04, (- 1) * 0.6724941072552172e+04, (- 1) * 0.7018887749450317e+01, 0.8873498980910335e+04, 0.8891369384353965e+04, 0.2008805099643591e+02, (- 1) * 0.2030681426035686e+05, (- 1) * 0.2010017782384992e+05, 0.2006046282661137e+05, 0.3427941581102808e+05, 0.3432892927181724e+02, (- 1) * 0.2511417407338804e+05, (- 1) * 0.2516567363193558e+05, (- 1) * 0.3318253740485142e+02, 0.3143940826027085e+05, 0.1658466564673543e+05, (- 1) * 0.1654843151976437e+05, (- 1) * 0.1446345041326510e+05, (- 1) * 0.1645433213663233e+02, 0.5094709396573681e+04, 0.5106816671258367e+04, 0.3470692471612145e+01, (- 1) * 0.2797902324245621e+04, (- 1) * 0.5615581955514127e+03, 0.5601021281020627e+03, 0.1463856702925587e+03, 0.1990076422327786e+00, (- 1) * 0.9334741618922085e+01, (- 1) * 0.9361368967669095e+01}; +__constant double hank103u_c1p1[] = {0.4428361927253983e-12, (- 1) * 0.4428361927153559e-12, (- 1) * 0.2575693161635231e-10, (- 1) * 0.2878656317479645e-21, 0.3658696304107867e-09, 0.3658696304188925e-09, 0.7463138750413651e-19, (- 1) * 0.6748894854135266e-08, (- 1) * 0.4530098210372099e-07, 0.4530098210271137e-07, 0.4698787882823243e-06, 0.5343848349451927e-17, (- 1) * 0.1948662942158171e-05, (- 1) * 0.1948662942204214e-05, (- 1) * 0.1658085463182409e-15, 0.1316906100496570e-04, 0.3645368564036497e-04, (- 1) * 0.3645368563934748e-04, (- 1) * 0.1633458547818390e-03, (- 1) * 0.2697770638600506e-14, 0.2816784976551660e-03, 0.2816784976676616e-03, 0.2548673351180060e-13, (- 1) * 0.6106478245116582e-03, 0.2054057459296899e-03, (- 1) * 0.2054057460218446e-03, (- 1) * 0.6254962367291260e-02, 0.1484073406594994e-12, 0.1952900562500057e-01, 0.1952900562457318e-01, (- 1) * 0.5517611343746895e-12, (- 1) * 0.8528074392467523e-01, (- 1) * 0.1495138141086974e+00, 0.1495138141099772e+00, 0.4394907314508377e+00, (- 1) * 0.1334677126491326e-11, (- 1) * 0.1113740586940341e+01, (- 1) * 0.1113740586937837e+01, 0.2113005088866033e-11, 0.1170212831401968e+01, 0.1262152242318805e+01, (- 1) * 0.1262152242322008e+01, (- 1) * 0.1557810619605511e+01, 0.2176383208521897e-11, 0.8560741701626648e+00, 0.8560741701600203e+00, (- 1) * 0.1431161194996653e-11, (- 1) * 0.8386735092525187e+00, (- 1) * 0.3651819176599290e+00, 0.3651819176613019e+00, 0.2811692367666517e+00, (- 1) * 0.5799941348040361e-12, (- 1) * 0.9494630182937280e-01, (- 1) * 0.9494630182894480e-01, 0.1364615527772751e-12, 0.5564896498129176e-01, 0.1395239688792536e-01, (- 1) * 0.1395239688799950e-01, (- 1) * 0.5871314703753967e-02, 0.1683372473682212e-13, 0.1009157100083457e-02, 0.1009157100077235e-02, (- 1) * 0.8997331160162008e-15, (- 1) * 0.2723724213360371e-03, (- 1) * 0.2708696587599713e-04, 0.2708696587618830e-04, 0.3533092798326666e-05, (- 1) * 0.1328028586935163e-16, (- 1) * 0.1134616446885126e-06, (- 1) * 0.1134616446876064e-06}; +__constant double hank103u_c1p2[] = {(- 1) * 0.5641895835446003e+00, (- 1) * 0.5641895835437973e+00, 0.3473016376419171e-10, (- 1) * 0.3710264617214559e-09, 0.2115710836381847e+00, (- 1) * 0.2115710851180242e+00, 0.3132928887334847e-06, 0.2064187785625558e-07, (- 1) * 0.6611954881267806e-01, (- 1) * 0.6611997176900310e-01, (- 1) * 0.3386004893181560e-05, 0.7146557892862998e-04, (- 1) * 0.5728505088320786e-01, 0.5732906930408979e-01, (- 1) * 0.6884187195973806e-02, (- 1) * 0.2383737409286457e-03, 0.1170452203794729e+00, 0.1192356405185651e+00, 0.8652871239920498e-02, (- 1) * 0.3366165876561572e+00, (- 1) * 0.1203989383538728e+01, 0.1144625888281483e+01, 0.9153684260534125e+01, 0.1781426600949249e+00, (- 1) * 0.2740411284066946e+02, (- 1) * 0.2834461441294877e+02, (- 1) * 0.2192611071606340e+01, 0.1445470231392735e+03, 0.3361116314072906e+03, (- 1) * 0.3270584743216529e+03, (- 1) * 0.1339254798224146e+04, (- 1) * 0.1657618537130453e+02, 0.2327097844591252e+04, 0.2380960024514808e+04, 0.7760611776965994e+02, (- 1) * 0.7162513471480693e+04, (- 1) * 0.9520608696419367e+04, 0.9322604506839242e+04, 0.2144033447577134e+05, 0.2230232555182369e+03, (- 1) * 0.2087584364240919e+05, (- 1) * 0.2131762020653283e+05, (- 1) * 0.3825699231499171e+03, 0.3582976792594737e+05, 0.2642632405857713e+05, (- 1) * 0.2585137938787267e+05, (- 1) * 0.3251446505037506e+05, (- 1) * 0.3710875194432116e+03, 0.1683805377643986e+05, 0.1724393921722052e+05, 0.1846128226280221e+03, (- 1) * 0.1479735877145448e+05, (- 1) * 0.5258288893282565e+04, 0.5122237462705988e+04, 0.2831540486197358e+04, 0.3905972651440027e+02, (- 1) * 0.5562781548969544e+03, (- 1) * 0.5726891190727206e+03, (- 1) * 0.2246192560136119e+01, 0.1465347141877978e+03, 0.9456733342595993e+01, (- 1) * 0.9155767836700837e+01}; +void hank103u(cdouble_t z, int *ier, cdouble_t *h0, cdouble_t *h1, int ifexpon) +{ + + + + + cdouble_t ccex; + cdouble_t cd; + double com; + double d; + double done; + cdouble_t ima = cdouble_new(0.0e0, 1.0e0); + int m; + double thresh1; + double thresh2; + double thresh3; + cdouble_t zzz9; + * ier = 0; + com = cdouble_real(z); + if (cdouble_imag(z) >= 0) + goto label_1200; + + * ier = 4; + return; + label_1200: + ; + + done = 1; + thresh1 = 1; + thresh2 = 3.7 * 3.7; + thresh3 = 400; + d = cdouble_real(cdouble_mul(z, cdouble_conj(z))); + if ((d < thresh1) || (d > thresh3)) + goto label_3000; + + if (d > thresh2) + goto label_2000; + + cd = cdouble_rdivide(done, cdouble_sqrt(z)); + ccex = cd; + if (ifexpon == 1) + ccex = cdouble_mul(ccex, cdouble_exp(cdouble_mul(ima, z))); + + zzz9 = cdouble_powr(z, 9); + m = 35; + hank103p((__constant cdouble_t *) (& (* hank103u_c0p1)), m, cd, & (* h0)); + * h0 = cdouble_mul(cdouble_mul(* h0, ccex), zzz9); + hank103p((__constant cdouble_t *) (& (* hank103u_c1p1)), m, cd, & (* h1)); + * h1 = cdouble_mul(cdouble_mul(* h1, ccex), zzz9); + return; + label_2000: + ; + + cd = cdouble_rdivide(done, cdouble_sqrt(z)); + ccex = cd; + if (ifexpon == 1) + ccex = cdouble_mul(ccex, cdouble_exp(cdouble_mul(ima, z))); + + m = 31; + hank103p((__constant cdouble_t *) (& (* hank103u_c0p2)), m, cd, & (* h0)); + * h0 = cdouble_mul(* h0, ccex); + m = 31; + hank103p((__constant cdouble_t *) (& (* hank103u_c1p2)), m, cd, & (* h1)); + * h1 = cdouble_mul(* h1, ccex); + return; + label_3000: + ; + + if (d > 50.e0) + goto label_4000; + + hank103l(z, & (* h0), & (* h1), ifexpon); + return; + label_4000: + ; + + hank103a(z, & (* h0), & (* h1), ifexpon); +} + +void hank103p(__constant cdouble_t *p, int m, cdouble_t z, cdouble_t *f) +{ + int i; + + * f = p[m - 1]; + for (i = m + (- 1); i >= 1; i += - 1) + { + * f = cdouble_add(cdouble_mul(* f, z), p[i - 1]); + label_1200: + ; + + } + +} + +__constant double hank103a_p[] = {0.1000000000000000e+01, (- 1) * 0.7031250000000000e-01, 0.1121520996093750e+00, (- 1) * 0.5725014209747314e+00, 0.6074042001273483e+01, (- 1) * 0.1100171402692467e+03, 0.3038090510922384e+04, (- 1) * 0.1188384262567833e+06, 0.6252951493434797e+07, (- 1) * 0.4259392165047669e+09, 0.3646840080706556e+11, (- 1) * 0.3833534661393944e+13, 0.4854014686852901e+15, (- 1) * 0.7286857349377657e+17, 0.1279721941975975e+20, (- 1) * 0.2599382102726235e+22, 0.6046711487532401e+24, (- 1) * 0.1597065525294211e+27}; +__constant double hank103a_p1[] = {0.1000000000000000e+01, 0.1171875000000000e+00, (- 1) * 0.1441955566406250e+00, 0.6765925884246826e+00, (- 1) * 0.6883914268109947e+01, 0.1215978918765359e+03, (- 1) * 0.3302272294480852e+04, 0.1276412726461746e+06, (- 1) * 0.6656367718817687e+07, 0.4502786003050393e+09, (- 1) * 0.3833857520742789e+11, 0.4011838599133198e+13, (- 1) * 0.5060568503314726e+15, 0.7572616461117957e+17, (- 1) * 0.1326257285320556e+20, 0.2687496750276277e+22, (- 1) * 0.6238670582374700e+24, 0.1644739123064188e+27}; +__constant double hank103a_q[] = {(- 1) * 0.1250000000000000e+00, 0.7324218750000000e-01, (- 1) * 0.2271080017089844e+00, 0.1727727502584457e+01, (- 1) * 0.2438052969955606e+02, 0.5513358961220206e+03, (- 1) * 0.1825775547429317e+05, 0.8328593040162893e+06, (- 1) * 0.5006958953198893e+08, 0.3836255180230434e+10, (- 1) * 0.3649010818849834e+12, 0.4218971570284096e+14, (- 1) * 0.5827244631566907e+16, 0.9476288099260110e+18, (- 1) * 0.1792162323051699e+21, 0.3900121292034000e+23, (- 1) * 0.9677028801069847e+25, 0.2715581773544907e+28}; +__constant double hank103a_q1[] = {0.3750000000000000e+00, (- 1) * 0.1025390625000000e+00, 0.2775764465332031e+00, (- 1) * 0.1993531733751297e+01, 0.2724882731126854e+02, (- 1) * 0.6038440767050702e+03, 0.1971837591223663e+05, (- 1) * 0.8902978767070679e+06, 0.5310411010968522e+08, (- 1) * 0.4043620325107754e+10, 0.3827011346598606e+12, (- 1) * 0.4406481417852279e+14, 0.6065091351222699e+16, (- 1) * 0.9833883876590680e+18, 0.1855045211579829e+21, (- 1) * 0.4027994121281017e+23, 0.9974783533410457e+25, (- 1) * 0.2794294288720121e+28}; +void hank103a(cdouble_t z, cdouble_t *h0, cdouble_t *h1, int ifexpon) +{ + cdouble_t cccexp; + cdouble_t cdd; + cdouble_t cdumb = cdouble_new(0.70710678118654757e+00, (- 1) * 0.70710678118654746e+00); + double done = 1.0e0; + int i; + cdouble_t ima = cdouble_new(0.0e0, 1.0e0); + int m; + + + double pi = 0.31415926535897932e+01; + cdouble_t pp; + cdouble_t pp1; + + + cdouble_t qq; + cdouble_t qq1; + cdouble_t zinv; + cdouble_t zinv22; + m = 10; + zinv = cdouble_rdivide(done, z); + pp = cdouble_fromreal(hank103a_p[m - 1]); + pp1 = cdouble_fromreal(hank103a_p1[m - 1]); + zinv22 = cdouble_mul(zinv, zinv); + qq = cdouble_fromreal(hank103a_q[m - 1]); + qq1 = cdouble_fromreal(hank103a_q1[m - 1]); + for (i = m + (- 1); i >= 1; i += - 1) + { + pp = cdouble_add(cdouble_fromreal(hank103a_p[i - 1]), cdouble_mul(pp, zinv22)); + pp1 = cdouble_add(cdouble_fromreal(hank103a_p1[i - 1]), cdouble_mul(pp1, zinv22)); + qq = cdouble_add(cdouble_fromreal(hank103a_q[i - 1]), cdouble_mul(qq, zinv22)); + qq1 = cdouble_add(cdouble_fromreal(hank103a_q1[i - 1]), cdouble_mul(qq1, zinv22)); + label_1600: + ; + + } + + qq = cdouble_mul(qq, zinv); + qq1 = cdouble_mul(qq1, zinv); + cccexp = cdouble_fromreal(1); + if (ifexpon == 1) + cccexp = cdouble_exp(cdouble_mul(ima, z)); + + cdd = cdouble_sqrt(cdouble_rmul(2 / pi, zinv)); + * h0 = cdouble_add(pp, cdouble_mul(ima, qq)); + * h0 = cdouble_mul(cdouble_mul(cdouble_mul(cdd, cdumb), cccexp), * h0); + * h1 = cdouble_add(pp1, cdouble_mul(ima, qq1)); + * h1 = cdouble_rmul(- 1, cdouble_mul(cdouble_mul(cdouble_mul(cdouble_mul(cdd, cccexp), cdumb), * h1), ima)); +} + +__constant double hank103l_cj0[] = {0.1000000000000000e+01, (- 1) * 0.2500000000000000e+00, 0.1562500000000000e-01, (- 1) * 0.4340277777777778e-03, 0.6781684027777778e-05, (- 1) * 0.6781684027777778e-07, 0.4709502797067901e-09, (- 1) * 0.2402807549524439e-11, 0.9385966990329841e-14, (- 1) * 0.2896903392077112e-16, 0.7242258480192779e-19, (- 1) * 0.1496334396734045e-21, 0.2597802772107717e-24, (- 1) * 0.3842903509035085e-27, 0.4901662639075363e-30, (- 1) * 0.5446291821194848e-33}; +__constant double hank103l_cj1[] = {(- 1) * 0.5000000000000000e+00, 0.6250000000000000e-01, (- 1) * 0.2604166666666667e-02, 0.5425347222222222e-04, (- 1) * 0.6781684027777778e-06, 0.5651403356481481e-08, (- 1) * 0.3363930569334215e-10, 0.1501754718452775e-12, (- 1) * 0.5214426105738801e-15, 0.1448451696038556e-17, (- 1) * 0.3291935672814899e-20, 0.6234726653058522e-23, (- 1) * 0.9991549123491221e-26, 0.1372465538941102e-28, (- 1) * 0.1633887546358454e-31, 0.1701966194123390e-34}; +__constant double hank103l_ser2[] = {0.2500000000000000e+00, (- 1) * 0.2343750000000000e-01, 0.7957175925925926e-03, (- 1) * 0.1412850839120370e-04, 0.1548484519675926e-06, (- 1) * 0.1153828185281636e-08, 0.6230136717695511e-11, (- 1) * 0.2550971742728932e-13, 0.8195247730999099e-16, (- 1) * 0.2121234517551702e-18, 0.4518746345057852e-21, (- 1) * 0.8061529302289970e-24, 0.1222094716680443e-26, (- 1) * 0.1593806157473552e-29, 0.1807204342667468e-32, (- 1) * 0.1798089518115172e-35}; +__constant double hank103l_ser2der[] = {0.5000000000000000e+00, (- 1) * 0.9375000000000000e-01, 0.4774305555555556e-02, (- 1) * 0.1130280671296296e-03, 0.1548484519675926e-05, (- 1) * 0.1384593822337963e-07, 0.8722191404773715e-10, (- 1) * 0.4081554788366291e-12, 0.1475144591579838e-14, (- 1) * 0.4242469035103405e-17, 0.9941241959127275e-20, (- 1) * 0.1934767032549593e-22, 0.3177446263369152e-25, (- 1) * 0.4462657240925946e-28, 0.5421613028002404e-31, (- 1) * 0.5753886457968550e-34}; +void hank103l(cdouble_t z, cdouble_t *h0, cdouble_t *h1, int ifexpon) +{ + cdouble_t cd; + cdouble_t cdddlog; + + + cdouble_t fj0; + cdouble_t fj1; + double gamma = 0.5772156649015328606e+00; + int i; + cdouble_t ima = cdouble_new(0.0e0, 1.0e0); + int m; + double pi = 0.31415926535897932e+01; + + + double two = 2.0e0; + cdouble_t y0; + cdouble_t y1; + cdouble_t z2; + m = 16; + fj0 = cdouble_fromreal(0); + fj1 = cdouble_fromreal(0); + y0 = cdouble_fromreal(0); + y1 = cdouble_fromreal(0); + z2 = cdouble_mul(z, z); + cd = cdouble_fromreal(1); + for (i = 1; i <= m; i += 1) + { + fj0 = cdouble_add(fj0, cdouble_rmul(hank103l_cj0[i - 1], cd)); + fj1 = cdouble_add(fj1, cdouble_rmul(hank103l_cj1[i - 1], cd)); + y1 = cdouble_add(y1, cdouble_rmul(hank103l_ser2der[i - 1], cd)); + cd = cdouble_mul(cd, z2); + y0 = cdouble_add(y0, cdouble_rmul(hank103l_ser2[i - 1], cd)); + label_1800: + ; + + } + + fj1 = cdouble_rmul(- 1, cdouble_mul(fj1, z)); + cdddlog = cdouble_add(cdouble_fromreal(gamma), cdouble_log(cdouble_divider(z, two))); + y0 = cdouble_add(cdouble_mul(cdddlog, fj0), y0); + y0 = cdouble_rmul(two / pi, y0); + y1 = cdouble_mul(y1, z); + y1 = cdouble_add(cdouble_add(cdouble_rmul(- 1, cdouble_mul(cdddlog, fj1)), cdouble_divide(fj0, z)), y1); + y1 = cdouble_divider(cdouble_rmul((- 1) * two, y1), pi); + * h0 = cdouble_add(fj0, cdouble_mul(ima, y0)); + * h1 = cdouble_add(fj1, cdouble_mul(ima, y1)); + if (ifexpon == 1) + return; + + cd = cdouble_exp(cdouble_rmul(- 1, cdouble_mul(ima, z))); + * h0 = cdouble_mul(* h0, cd); + * h1 = cdouble_mul(* h1, cd); +} + +__constant double hank103r_c0p1[] = {(- 1) * 0.4268441995428495e-23, 0.4374027848105921e-23, 0.9876152216238049e-23, (- 1) * 0.1065264808278614e-20, 0.6240598085551175e-19, 0.6658529985490110e-19, (- 1) * 0.5107210870050163e-17, (- 1) * 0.2931746613593983e-18, 0.1611018217758854e-15, (- 1) * 0.1359809022054077e-15, (- 1) * 0.7718746693707326e-15, 0.6759496139812828e-14, (- 1) * 0.1067620915195442e-12, (- 1) * 0.1434699000145826e-12, 0.3868453040754264e-11, 0.7061853392585180e-12, (- 1) * 0.6220133527871203e-10, 0.3957226744337817e-10, 0.3080863675628417e-09, (- 1) * 0.1154618431281900e-08, 0.7793319486868695e-08, 0.1502570745460228e-07, (- 1) * 0.1978090852638430e-06, (- 1) * 0.7396691873499030e-07, 0.2175857247417038e-05, (- 1) * 0.8473534855334919e-06, (- 1) * 0.1053381327609720e-04, 0.2042555121261223e-04, (- 1) * 0.4812568848956982e-04, (- 1) * 0.1961519090873697e-03, 0.1291714391689374e-02, 0.9234422384950050e-03, (- 1) * 0.1113890671502769e-01, 0.9053687375483149e-03, 0.5030666896877862e-01, (- 1) * 0.4923119348218356e-01, 0.5202355973926321e+00, (- 1) * 0.1705244841954454e+00, (- 1) * 0.1134990486611273e+01, (- 1) * 0.1747542851820576e+01, 0.8308174484970718e+01, 0.2952358687641577e+01, (- 1) * 0.3286074510100263e+02, 0.1126542966971545e+02, 0.6576015458463394e+02, (- 1) * 0.1006116996293757e+03, 0.3216834899377392e+02, 0.3614005342307463e+03, (- 1) * 0.6653878500833375e+03, (- 1) * 0.6883582242804924e+03, 0.2193362007156572e+04, 0.2423724600546293e+03, (- 1) * 0.3665925878308203e+04, 0.2474933189642588e+04, 0.1987663383445796e+04, (- 1) * 0.7382586600895061e+04, 0.4991253411017503e+04, 0.1008505017740918e+05, (- 1) * 0.1285284928905621e+05, (- 1) * 0.5153674821668470e+04, 0.1301656757246985e+05, (- 1) * 0.4821250366504323e+04, (- 1) * 0.4982112643422311e+04, 0.9694070195648748e+04, (- 1) * 0.1685723189234701e+04, (- 1) * 0.6065143678129265e+04, 0.2029510635584355e+04, 0.1244402339119502e+04, (- 1) * 0.4336682903961364e+03, 0.8923209875101459e+02}; +__constant double hank103r_c0p2[] = {0.5641895835569398e+00, (- 1) * 0.5641895835321127e+00, (- 1) * 0.7052370223565544e-01, (- 1) * 0.7052369923405479e-01, (- 1) * 0.3966909368581382e-01, 0.3966934297088857e-01, 0.4130698137268744e-01, 0.4136196771522681e-01, 0.6240742346896508e-01, (- 1) * 0.6553556513852438e-01, (- 1) * 0.3258849904760676e-01, (- 1) * 0.7998036854222177e-01, (- 1) * 0.3988006311955270e+01, 0.1327373751674479e+01, 0.6121789346915312e+02, (- 1) * 0.9251865216627577e+02, 0.4247064992018806e+03, 0.2692553333489150e+04, (- 1) * 0.4374691601489926e+05, (- 1) * 0.3625248208112831e+05, 0.1010975818048476e+07, (- 1) * 0.2859360062580096e+05, (- 1) * 0.1138970241206912e+08, 0.1051097979526042e+08, 0.2284038899211195e+08, (- 1) * 0.2038012515235694e+09, 0.1325194353842857e+10, 0.1937443530361381e+10, (- 1) * 0.2245999018652171e+11, (- 1) * 0.5998903865344352e+10, 0.1793237054876609e+12, (- 1) * 0.8625159882306147e+11, (- 1) * 0.5887763042735203e+12, 0.1345331284205280e+13, (- 1) * 0.2743432269370813e+13, (- 1) * 0.8894942160272255e+13, 0.4276463113794564e+14, 0.2665019886647781e+14, (- 1) * 0.2280727423955498e+15, 0.3686908790553973e+14, 0.5639846318168615e+15, (- 1) * 0.6841529051615703e+15, 0.9901426799966038e+14, 0.2798406605978152e+16, (- 1) * 0.4910062244008171e+16, (- 1) * 0.5126937967581805e+16, 0.1387292951936756e+17, 0.1043295727224325e+16, (- 1) * 0.1565204120687265e+17, 0.1215262806973577e+17, 0.3133802397107054e+16, (- 1) * 0.1801394550807078e+17, 0.4427598668012807e+16, 0.6923499968336864e+16}; +__constant double hank103r_c1p1[] = {(- 1) * 0.4019450270734195e-23, (- 1) * 0.4819240943285824e-23, 0.1087220822839791e-20, 0.1219058342725899e-21, (- 1) * 0.7458149572694168e-19, 0.5677825613414602e-19, 0.8351815799518541e-18, (- 1) * 0.5188585543982425e-17, 0.1221075065755962e-15, 0.1789261470637227e-15, (- 1) * 0.6829972121890858e-14, (- 1) * 0.1497462301804588e-14, 0.1579028042950957e-12, (- 1) * 0.9414960303758800e-13, (- 1) * 0.1127570848999746e-11, 0.3883137940932639e-11, (- 1) * 0.3397569083776586e-10, (- 1) * 0.6779059427459179e-10, 0.1149529442506273e-08, 0.4363087909873751e-09, (- 1) * 0.1620182360840298e-07, 0.6404695607668289e-08, 0.9651461037419628e-07, (- 1) * 0.1948572160668177e-06, 0.6397881896749446e-06, 0.2318661930507743e-05, (- 1) * 0.1983192412396578e-04, (- 1) * 0.1294811208715315e-04, 0.2062663873080766e-03, (- 1) * 0.2867633324735777e-04, (- 1) * 0.1084309075952914e-02, 0.1227880935969686e-02, 0.2538406015667726e-03, (- 1) * 0.1153316815955356e-01, 0.4520140008266983e-01, 0.5693944718258218e-01, (- 1) * 0.9640790976658534e+00, (- 1) * 0.6517135574036008e+00, 0.2051491829570049e+01, (- 1) * 0.1124151010077572e+01, (- 1) * 0.3977380460328048e+01, 0.8200665483661009e+01, (- 1) * 0.7950131652215817e+01, (- 1) * 0.3503037697046647e+02, 0.9607320812492044e+02, 0.7894079689858070e+02, (- 1) * 0.3749002890488298e+03, (- 1) * 0.8153831134140778e+01, 0.7824282518763973e+03, (- 1) * 0.6035276543352174e+03, (- 1) * 0.5004685759675768e+03, 0.2219009060854551e+04, (- 1) * 0.2111301101664672e+04, (- 1) * 0.4035632271617418e+04, 0.7319737262526823e+04, 0.2878734389521922e+04, (- 1) * 0.1087404934318719e+05, 0.3945740567322783e+04, 0.6727823761148537e+04, (- 1) * 0.1253555346597302e+05, 0.3440468371829973e+04, 0.1383240926370073e+05, (- 1) * 0.9324927373036743e+04, (- 1) * 0.6181580304530313e+04, 0.6376198146666679e+04, (- 1) * 0.1033615527971958e+04, (- 1) * 0.1497604891055181e+04, 0.1929025541588262e+04, (- 1) * 0.4219760183545219e+02, (- 1) * 0.4521162915353207e+03}; +__constant double hank103r_c1p2[] = {(- 1) * 0.5641895835431980e+00, (- 1) * 0.5641895835508094e+00, 0.2115710934750869e+00, (- 1) * 0.2115710923186134e+00, (- 1) * 0.6611607335011594e-01, (- 1) * 0.6611615414079688e-01, (- 1) * 0.5783289433408652e-01, 0.5785737744023628e-01, 0.8018419623822896e-01, 0.8189816020440689e-01, 0.1821045296781145e+00, (- 1) * 0.2179738973008740e+00, 0.5544705668143094e+00, 0.2224466316444440e+01, (- 1) * 0.8563271248520645e+02, (- 1) * 0.4394325758429441e+02, 0.2720627547071340e+04, (- 1) * 0.6705390850875292e+03, (- 1) * 0.3936221960600770e+05, 0.5791730432605451e+05, (- 1) * 0.1976787738827811e+06, (- 1) * 0.1502498631245144e+07, 0.2155317823990686e+08, 0.1870953796705298e+08, (- 1) * 0.4703995711098311e+09, 0.3716595906453190e+07, 0.5080557859012385e+10, (- 1) * 0.4534199223888966e+10, (- 1) * 0.1064438211647413e+11, 0.8612243893745942e+11, (- 1) * 0.5466017687785078e+12, (- 1) * 0.8070950386640701e+12, 0.9337074941225827e+13, 0.2458379240643264e+13, (- 1) * 0.7548692171244579e+14, 0.3751093169954336e+14, 0.2460677431350039e+15, (- 1) * 0.5991919372881911e+15, 0.1425679408434606e+16, 0.4132221939781502e+16, (- 1) * 0.2247506469468969e+17, (- 1) * 0.1269771078165026e+17, 0.1297336292749026e+18, (- 1) * 0.2802626909791308e+17, (- 1) * 0.3467137222813017e+18, 0.4773955215582192e+18, (- 1) * 0.2347165776580206e+18, (- 1) * 0.2233638097535785e+19, 0.5382350866778548e+19, 0.4820328886922998e+19, (- 1) * 0.1928978948099345e+20, 0.1575498747750907e+18, 0.3049162180215152e+20, (- 1) * 0.2837046201123502e+20, (- 1) * 0.5429391644354291e+19, 0.6974653380104308e+20, (- 1) * 0.5322120857794536e+20, (- 1) * 0.6739879079691706e+20, 0.6780343087166473e+20, 0.1053455984204666e+20, (- 1) * 0.2218784058435737e+20, 0.1505391868530062e+20}; +void hank103r(cdouble_t z, int *ier, cdouble_t *h0, cdouble_t *h1, int ifexpon) +{ + + + + + cdouble_t cccexp; + cdouble_t cd; + cdouble_t cdd; + double d; + double done; + cdouble_t ima = cdouble_new(0.0e0, 1.0e0); + int m; + double thresh1; + double thresh2; + double thresh3; + cdouble_t zz18; + * ier = 0; + if ((cdouble_real(z) >= 0) && (cdouble_imag(z) <= 0)) + goto label_1400; + + * ier = 4; + return; + label_1400: + ; + + done = 1; + thresh1 = 16; + thresh2 = 64; + thresh3 = 400; + d = cdouble_real(cdouble_mul(z, cdouble_conj(z))); + if ((d < thresh1) || (d > thresh3)) + goto label_3000; + + if (d > thresh2) + goto label_2000; + + cccexp = cdouble_fromreal(1); + if (ifexpon == 1) + cccexp = cdouble_exp(cdouble_mul(ima, z)); + + cdd = cdouble_rdivide(done, cdouble_sqrt(z)); + cd = cdouble_rdivide(done, z); + zz18 = cdouble_powr(z, 18); + m = 35; + hank103p((__constant cdouble_t *) (& (* hank103r_c0p1)), m, cd, & (* h0)); + * h0 = cdouble_mul(cdouble_mul(cdouble_mul(* h0, cdd), cccexp), zz18); + hank103p((__constant cdouble_t *) (& (* hank103r_c1p1)), m, cd, & (* h1)); + * h1 = cdouble_mul(cdouble_mul(cdouble_mul(* h1, cdd), cccexp), zz18); + return; + label_2000: + ; + + cd = cdouble_rdivide(done, z); + cdd = cdouble_sqrt(cd); + cccexp = cdouble_fromreal(1); + if (ifexpon == 1) + cccexp = cdouble_exp(cdouble_mul(ima, z)); + + m = 27; + hank103p((__constant cdouble_t *) (& (* hank103r_c0p2)), m, cd, & (* h0)); + * h0 = cdouble_mul(cdouble_mul(* h0, cccexp), cdd); + m = 31; + hank103p((__constant cdouble_t *) (& (* hank103r_c1p2)), m, cd, & (* h1)); + * h1 = cdouble_mul(cdouble_mul(* h1, cccexp), cdd); + return; + label_3000: + ; + + if (d > 50.e0) + goto label_4000; + + hank103l(z, & (* h0), & (* h1), ifexpon); + return; + label_4000: + ; + + hank103a(z, & (* h0), & (* h1), ifexpon); +} + diff --git a/pyopencl/clmath.py b/pyopencl/clmath.py index 1b41ce67d0ce9b7543c6f76944edf29c1a082ff0..d745741d2091264058d7dc1c29275db110e948e6 100644 --- a/pyopencl/clmath.py +++ b/pyopencl/clmath.py @@ -240,13 +240,28 @@ def _bessel_yn(result, n, x): np.dtype(type(n)), x.dtype) +@cl_array.elwise_kernel_runner +def _hankel_01(h0, h1, x): + if h0.dtype != h1.dtype: + raise TypeError("types of h0 and h1 must match") + return elementwise.get_hankel_01_kernel( + h0.context, h0.dtype, x.dtype) + + def bessel_jn(n, x, queue=None): result = x._new_like_me(queue=queue) - _bessel_jn(result, n, x) + _bessel_jn(result, n, x, queue=queue) return result def bessel_yn(n, x, queue=None): result = x._new_like_me(queue=queue) - _bessel_yn(result, n, x) + _bessel_yn(result, n, x, queue=queue) return result + + +def hankel_01(x, queue=None): + h0 = x._new_like_me(queue=queue) + h1 = x._new_like_me(queue=queue) + _hankel_01(h0, h1, x, queue=queue) + return h0, h1 diff --git a/pyopencl/compyte b/pyopencl/compyte index 5d54e1b2b7f28d3e779029ac0b4aa5f957829f23..fb6ba114d9d906403d47b0aaf69e2fe4cef382f2 160000 --- a/pyopencl/compyte +++ b/pyopencl/compyte @@ -1 +1 @@ -Subproject commit 5d54e1b2b7f28d3e779029ac0b4aa5f957829f23 +Subproject commit fb6ba114d9d906403d47b0aaf69e2fe4cef382f2 diff --git a/pyopencl/elementwise.py b/pyopencl/elementwise.py index 398936ff52587b91a48d0febec06a7e172d80019..47fe3d9585b1aebf08cde69624066e5088a081ae 100644 --- a/pyopencl/elementwise.py +++ b/pyopencl/elementwise.py @@ -383,7 +383,7 @@ def get_take_put_kernel(context, dtype, idx_dtype, with_offsets, vec_count=1): args = [ VectorArg(dtype, "dest%d" % i) - for i in range(vec_count) + for i in range(vec_count) ] + [ VectorArg(idx_dtype, "gmem_dest_idx", with_offset=True), VectorArg(idx_dtype, "gmem_src_idx", with_offset=True), @@ -392,7 +392,7 @@ def get_take_put_kernel(context, dtype, idx_dtype, with_offsets, vec_count=1): for i in range(vec_count) ] + [ ScalarArg(idx_dtype, "offset%d" % i) - for i in range(vec_count) if with_offsets + for i in range(vec_count) if with_offsets ] if with_offsets: @@ -421,7 +421,7 @@ def get_put_kernel(context, dtype, idx_dtype, vec_count=1): args = [ VectorArg(dtype, "dest%d" % i, with_offset=True) - for i in range(vec_count) + for i in range(vec_count) ] + [ VectorArg(idx_dtype, "gmem_dest_idx", with_offset=True), ] + [ @@ -525,7 +525,6 @@ def get_axpbyz_kernel(context, dtype_x, dtype_y, dtype_z): x_is_complex = dtype_x.kind == "c" y_is_complex = dtype_y.kind == "c" - z_is_complex = dtype_z.kind == "c" if x_is_complex: ax = "%s_mul(a, x[i])" % complex_dtype_to_name(dtype_x) @@ -539,9 +538,15 @@ def get_axpbyz_kernel(context, dtype_x, dtype_y, dtype_z): if not x_is_complex and y_is_complex: ax = "%s_fromreal(%s)" % (complex_dtype_to_name(dtype_y), ax) - result = "%s + %s" % (ax, by) - if z_is_complex: - result = "%s_cast(%s)" % (complex_dtype_to_name(dtype_z), result) + if x_is_complex or y_is_complex: + result = ( + "{root}_add({root}_cast({ax}), {root}_cast({by}))" + .format( + ax=ax, + by=by, + root=complex_dtype_to_name(dtype_z))) + else: + result = "%s + %s" % (ax, by) return get_elwise_kernel(context, "%(tp_z)s *z, %(tp_x)s a, %(tp_x)s *x, %(tp_y)s b, %(tp_y)s *y" % { @@ -562,24 +567,27 @@ def get_axpbz_kernel(context, dtype_a, dtype_x, dtype_b, dtype_z): z_is_complex = dtype_z.kind == "c" ax = "a*x[i]" - if a_is_complex and x_is_complex: + if x_is_complex: a = "a" x = "x[i]" - if dtype_a != dtype_z: - a = "%s_cast(%s)" % (complex_dtype_to_name(dtype_z), a) if dtype_x != dtype_z: x = "%s_cast(%s)" % (complex_dtype_to_name(dtype_z), x) - ax = "%s_mul(%s, %s)" % (complex_dtype_to_name(dtype_z), a, x) + if a_is_complex: + if dtype_a != dtype_z: + a = "%s_cast(%s)" % (complex_dtype_to_name(dtype_z), a) - # The following two are workarounds for Apple on OS X 10.8. - # They're not really necessary. + ax = "%s_mul(%s, %s)" % (complex_dtype_to_name(dtype_z), a, x) + else: + ax = "%s_rmul(%s, %s)" % (complex_dtype_to_name(dtype_z), a, x) + elif a_is_complex: + a = "a" + x = "x[i]" - elif a_is_complex and not x_is_complex: - ax = "a*((%s) x[i])" % dtype_to_ctype(real_dtype(dtype_a)) - elif not a_is_complex and x_is_complex: - ax = "((%s) a)*x[i]" % dtype_to_ctype(real_dtype(dtype_x)) + if dtype_a != dtype_z: + a = "%s_cast(%s)" % (complex_dtype_to_name(dtype_z), a) + ax = "%s_mulr(%s, %s)" % (complex_dtype_to_name(dtype_z), a, x) b = "b" if z_is_complex and not b_is_complex: @@ -592,6 +600,14 @@ def get_axpbz_kernel(context, dtype_a, dtype_x, dtype_b, dtype_z): ax = "%s_cast(%s)" % (complex_dtype_to_name(dtype_z), ax) b = "%s_cast(%s)" % (complex_dtype_to_name(dtype_z), b) + if a_is_complex or x_is_complex or b_is_complex: + expr = "{root}_add({ax}, {b})".format( + ax=ax, + b=b, + root=complex_dtype_to_name(dtype_z)) + else: + expr = "%s + %s" % (ax, b) + return get_elwise_kernel(context, "%(tp_z)s *z, %(tp_a)s a, %(tp_x)s *x,%(tp_b)s b" % { "tp_a": dtype_to_ctype(dtype_a), @@ -599,7 +615,7 @@ def get_axpbz_kernel(context, dtype_a, dtype_x, dtype_b, dtype_z): "tp_b": dtype_to_ctype(dtype_b), "tp_z": dtype_to_ctype(dtype_z), }, - "z[i] = %s + %s" % (ax, b), + "z[i] = " + expr, name="axpb") @@ -607,7 +623,6 @@ def get_axpbz_kernel(context, dtype_a, dtype_x, dtype_b, dtype_z): def get_multiply_kernel(context, dtype_x, dtype_y, dtype_z): x_is_complex = dtype_x.kind == "c" y_is_complex = dtype_y.kind == "c" - z_is_complex = dtype_z.kind == "c" x = "x[i]" y = "y[i]" @@ -619,13 +634,13 @@ def get_multiply_kernel(context, dtype_x, dtype_y, dtype_z): if x_is_complex and y_is_complex: xy = "%s_mul(%s, %s)" % (complex_dtype_to_name(dtype_z), x, y) - + elif x_is_complex and not y_is_complex: + xy = "%s_mulr(%s, %s)" % (complex_dtype_to_name(dtype_z), x, y) + elif not x_is_complex and y_is_complex: + xy = "%s_rmul(%s, %s)" % (complex_dtype_to_name(dtype_z), x, y) else: xy = "%s * %s" % (x, y) - if z_is_complex: - xy = "%s_cast(%s)" % (complex_dtype_to_name(dtype_z), xy) - return get_elwise_kernel(context, "%(tp_z)s *z, %(tp_x)s *x, %(tp_y)s *y" % { "tp_x": dtype_to_ctype(dtype_x), @@ -654,8 +669,9 @@ def get_divide_kernel(context, dtype_x, dtype_y, dtype_z): if x_is_complex and y_is_complex: xoy = "%s_divide(%s, %s)" % (complex_dtype_to_name(dtype_z), x, y) elif not x_is_complex and y_is_complex: - xoy = "%s_rdivide(%s, %s)" % (complex_dtype_to_name(dtype_z), x, y) + elif x_is_complex and not y_is_complex: + xoy = "%s_divider(%s, %s)" % (complex_dtype_to_name(dtype_z), x, y) else: xoy = "%s / %s" % (x, y) @@ -691,7 +707,9 @@ def get_rdivide_elwise_kernel(context, dtype_x, dtype_y, dtype_z): if x_is_complex and y_is_complex: yox = "%s_divide(%s, %s)" % (complex_dtype_to_name(dtype_z), y, x) elif not y_is_complex and x_is_complex: - yox = "%s_rdivide(%s, %s)" % (complex_dtype_to_name(dtype_x), y, x) + yox = "%s_rdivide(%s, %s)" % (complex_dtype_to_name(dtype_z), y, x) + elif y_is_complex and not x_is_complex: + yox = "%s_divider(%s, %s)" % (complex_dtype_to_name(dtype_z), y, x) else: yox = "%s / %s" % (y, x) @@ -728,16 +746,18 @@ def get_reverse_kernel(context, dtype): @context_dependent_memoize def get_arange_kernel(context, dtype): if dtype.kind == "c": - i = "%s_fromreal(i)" % complex_dtype_to_name(dtype) + expr = ( + "{root}_add(start, {root}_rmul(i, step))" + .format(root=complex_dtype_to_name(dtype))) else: - i = "(%s) i" % dtype_to_ctype(dtype) + expr = "start + ((%s) i)*step" % dtype_to_ctype(dtype) return get_elwise_kernel(context, [ VectorArg(dtype, "z", with_offset=True), ScalarArg(dtype, "start"), ScalarArg(dtype, "step"), ], - "z[i] = start + %s*step" % i, + "z[i] = " + expr, name="arange") @@ -912,20 +932,80 @@ def get_ldexp_kernel(context, out_dtype=np.float32, sig_dtype=np.float32, @context_dependent_memoize def get_bessel_kernel(context, which_func, out_dtype=np.float64, order_dtype=np.int32, x_dtype=np.float64): + if x_dtype.kind != "c": + return get_elwise_kernel(context, [ + VectorArg(out_dtype, "z", with_offset=True), + ScalarArg(order_dtype, "ord_n"), + VectorArg(x_dtype, "x", with_offset=True), + ], + "z[i] = bessel_%sn(ord_n, x[i])" % which_func, + name="bessel_%sn_kernel" % which_func, + preamble=""" + #if __OPENCL_C_VERSION__ < 120 + #pragma OPENCL EXTENSION cl_khr_fp64: enable + #endif + #define PYOPENCL_DEFINE_CDOUBLE + #include <pyopencl-bessel-%s.cl> + """ % which_func) + else: + if which_func != "j": + raise NotImplementedError("complex arguments for Bessel Y") + + if x_dtype != np.complex128: + raise NotImplementedError("non-complex double dtype") + if x_dtype != out_dtype: + raise NotImplementedError("different input/output types") + + return get_elwise_kernel(context, [ + VectorArg(out_dtype, "z", with_offset=True), + ScalarArg(order_dtype, "ord_n"), + VectorArg(x_dtype, "x", with_offset=True), + ], + """ + cdouble_t jv_loc; + cdouble_t jvp1_loc; + bessel_j_complex(ord_n, x[i], &jv_loc, &jvp1_loc); + z[i] = jv_loc; + """, + name="bessel_j_complex_kernel", + preamble=""" + #if __OPENCL_C_VERSION__ < 120 + #pragma OPENCL EXTENSION cl_khr_fp64: enable + #endif + #define PYOPENCL_DEFINE_CDOUBLE + #include <pyopencl-complex.h> + #include <pyopencl-bessel-j-complex.cl> + """) + + +@context_dependent_memoize +def get_hankel_01_kernel(context, out_dtype, x_dtype): + if x_dtype != np.complex128: + raise NotImplementedError("non-complex double dtype") + if x_dtype != out_dtype: + raise NotImplementedError("different input/output types") + return get_elwise_kernel(context, [ - VectorArg(out_dtype, "z", with_offset=True), - ScalarArg(order_dtype, "ord_n"), + VectorArg(out_dtype, "h0", with_offset=True), + VectorArg(out_dtype, "h1", with_offset=True), VectorArg(x_dtype, "x", with_offset=True), ], - "z[i] = bessel_%sn(ord_n, x[i])" % which_func, - name="bessel_%sn_kernel" % which_func, + """ + cdouble_t h0_loc; + cdouble_t h1_loc; + hankel_01_complex(x[i], &h0_loc, &h1_loc, 1); + h0[i] = h0_loc; + h1[i] = h1_loc; + """, + name="hankel_complex_kernel", preamble=""" #if __OPENCL_C_VERSION__ < 120 #pragma OPENCL EXTENSION cl_khr_fp64: enable #endif #define PYOPENCL_DEFINE_CDOUBLE - #include <pyopencl-bessel-%s.cl> - """ % which_func) + #include <pyopencl-complex.h> + #include <pyopencl-hankel-complex.cl> + """) @context_dependent_memoize diff --git a/pyopencl/reduction.py b/pyopencl/reduction.py index ba7c6dc77f69337c8e535284a782e31eb9b613a4..2e432fc5572f9cc37001011d3dbbbef2bf3acdcd 100644 --- a/pyopencl/reduction.py +++ b/pyopencl/reduction.py @@ -41,7 +41,7 @@ import numpy as np # {{{ kernel source -KERNEL = """//CL// +KERNEL = r"""//CL// #define GROUP_SIZE ${group_size} #define READ_AND_MAP(i) (${map_expr}) #define REDUCE(a, b) (${reduce_expr}) @@ -63,7 +63,8 @@ KERNEL = """//CL// __global out_type *out__base, long out__offset, ${arguments}, unsigned int seq_count, unsigned int n) { - __global out_type *out = (__global out_type *) ((__global char *) out__base + out__offset); + __global out_type *out = (__global out_type *) ( + (__global char *) out__base + out__offset); ${arg_prep} __local out_type ldata[GROUP_SIZE]; @@ -88,7 +89,7 @@ KERNEL = """//CL// cur_size = group_size %> - % while cur_size > no_sync_size: + % while cur_size > 1: barrier(CLK_LOCAL_MEM_FENCE); <% @@ -107,40 +108,6 @@ KERNEL = """//CL// % endwhile - % if cur_size > 1: - ## we need to synchronize one last time for entry into the - ## no-sync region. - - barrier(CLK_LOCAL_MEM_FENCE); - - <% - # NB: There's an exact duplicate of this calculation in the - # %while loop below. - - new_size = cur_size // 2 - assert new_size * 2 == cur_size - %> - - if (lid < ${new_size}) - { - __local volatile out_type *lvdata = ldata; - % while cur_size > 1: - <% - new_size = cur_size // 2 - assert new_size * 2 == cur_size - %> - - lvdata[lid] = REDUCE( - lvdata[lid], - lvdata[lid + ${new_size}]); - - <% cur_size = new_size %> - - % endwhile - - } - % endif - if (lid == 0) out[get_group_id(0)] = ldata[0]; } """ @@ -185,24 +152,6 @@ def _get_reduction_source( # }}} - # {{{ compute synchronization-less group size - - def get_dev_no_sync_size(device): - from pyopencl.characterize import get_simd_group_size - result = get_simd_group_size(device, out_type_size) - - if result is None: - from warnings import warn - warn("Reduction might be unnecessarily slow: " - "can't query SIMD group size") - return 1 - - return result - - no_sync_size = min(get_dev_no_sync_size(dev) for dev in devices) - - # }}} - from mako.template import Template from pytools import all from pyopencl.characterize import has_double_support @@ -210,7 +159,6 @@ def _get_reduction_source( out_type=out_type, arguments=", ".join(arg.declarator() for arg in parsed_args), group_size=group_size, - no_sync_size=no_sync_size, neutral=neutral, reduce_expr=_process_code_for_macro(reduce_expr), map_expr=_process_code_for_macro(map_expr), @@ -325,8 +273,8 @@ class ReductionKernel: "that have at least one vector argument" def __call__(self, *args, **kwargs): - MAX_GROUP_COUNT = 1024 - SMALL_SEQ_COUNT = 4 + MAX_GROUP_COUNT = 1024 # noqa + SMALL_SEQ_COUNT = 4 # noqa from pyopencl.array import empty @@ -392,7 +340,8 @@ class ReductionKernel: use_queue, (group_count*stage_inf.group_size,), (stage_inf.group_size,), - *([result.base_data, result.offset] + invocation_args + [seq_count, sz]), + *([result.base_data, result.offset] + + invocation_args + [seq_count, sz]), **dict(wait_for=wait_for)) wait_for = [last_evt] @@ -473,7 +422,15 @@ def get_sum_kernel(ctx, dtype_out, dtype_in): if dtype_out is None: dtype_out = dtype_in - return ReductionKernel(ctx, dtype_out, "0", "a+b", + reduce_expr = "a+b" + neutral_expr = "0" + if dtype_out.kind == "c": + from pyopencl.elementwise import complex_dtype_to_name + dtname = complex_dtype_to_name(dtype_out) + reduce_expr = "%s_add(a, b)" % dtname + neutral_expr = "%s_new(0, 0)" % dtname + + return ReductionKernel(ctx, dtype_out, neutral_expr, reduce_expr, arguments="const %(tp)s *in" % {"tp": dtype_to_ctype(dtype_in)}) @@ -492,49 +449,30 @@ def _get_dot_expr(dtype_out, dtype_a, dtype_b, conjugate_first, dtype_a.type(0), dtype_b.type(0), has_double_support) - a_real_dtype = dtype_a.type(0).real.dtype - b_real_dtype = dtype_b.type(0).real.dtype - out_real_dtype = dtype_out.type(0).real.dtype - a_is_complex = dtype_a.kind == "c" b_is_complex = dtype_b.kind == "c" - out_is_complex = dtype_out.kind == "c" from pyopencl.elementwise import complex_dtype_to_name - if a_is_complex and b_is_complex: - a = "a[%s]" % index_expr - b = "b[%s]" % index_expr - if dtype_a != dtype_out: - a = "%s_cast(%s)" % (complex_dtype_to_name(dtype_out), a) - if dtype_b != dtype_out: - b = "%s_cast(%s)" % (complex_dtype_to_name(dtype_out), b) + a = "a[%s]" % index_expr + b = "b[%s]" % index_expr - if conjugate_first and a_is_complex: - a = "%s_conj(%s)" % ( - complex_dtype_to_name(dtype_out), a) + if a_is_complex and (dtype_a != dtype_out): + a = "%s_cast(%s)" % (complex_dtype_to_name(dtype_out), a) + if b_is_complex and (dtype_b != dtype_out): + b = "%s_cast(%s)" % (complex_dtype_to_name(dtype_out), b) - map_expr = "%s_mul(%s, %s)" % ( - complex_dtype_to_name(dtype_out), a, b) - else: - a = "a[%s]" % index_expr - b = "b[%s]" % index_expr - - if out_is_complex: - if a_is_complex and dtype_a != dtype_out: - a = "%s_cast(%s)" % (complex_dtype_to_name(dtype_out), a) - if b_is_complex and dtype_b != dtype_out: - b = "%s_cast(%s)" % (complex_dtype_to_name(dtype_out), b) - - if not a_is_complex and a_real_dtype != out_real_dtype: - a = "(%s) (%s)" % (dtype_to_ctype(out_real_dtype), a) - if not b_is_complex and b_real_dtype != out_real_dtype: - b = "(%s) (%s)" % (dtype_to_ctype(out_real_dtype), b) - - if conjugate_first and a_is_complex: - a = "%s_conj(%s)" % ( - complex_dtype_to_name(dtype_out), a) + if a_is_complex and conjugate_first and a_is_complex: + a = "%s_conj(%s)" % ( + complex_dtype_to_name(dtype_out), a) + if a_is_complex and not b_is_complex: + map_expr = "%s_mulr(%s, %s)" % (complex_dtype_to_name(dtype_out), a, b) + elif not a_is_complex and b_is_complex: + map_expr = "%s_rmul(%s, %s)" % (complex_dtype_to_name(dtype_out), a, b) + elif a_is_complex and b_is_complex: + map_expr = "%s_mul(%s, %s)" % (complex_dtype_to_name(dtype_out), a, b) + else: map_expr = "%s*%s" % (a, b) return map_expr, dtype_out, dtype_b @@ -548,14 +486,22 @@ def get_dot_kernel(ctx, dtype_out, dtype_a=None, dtype_b=None, dtype_out, dtype_a, dtype_b, conjugate_first, has_double_support=has_double_support(ctx.devices[0])) - return ReductionKernel(ctx, dtype_out, neutral="0", - reduce_expr="a+b", map_expr=map_expr, - arguments= - "const %(tp_a)s *a, " - "const %(tp_b)s *b" % { - "tp_a": dtype_to_ctype(dtype_a), - "tp_b": dtype_to_ctype(dtype_b), - }) + reduce_expr = "a+b" + neutral_expr = "0" + if dtype_out.kind == "c": + from pyopencl.elementwise import complex_dtype_to_name + dtname = complex_dtype_to_name(dtype_out) + reduce_expr = "%s_add(a, b)" % dtname + neutral_expr = "%s_new(0, 0)" % dtname + + return ReductionKernel(ctx, dtype_out, neutral=neutral_expr, + reduce_expr=reduce_expr, map_expr=map_expr, + arguments=( + "const %(tp_a)s *a, " + "const %(tp_b)s *b" % { + "tp_a": dtype_to_ctype(dtype_a), + "tp_b": dtype_to_ctype(dtype_b), + })) @context_dependent_memoize @@ -570,14 +516,14 @@ def get_subset_dot_kernel(ctx, dtype_out, dtype_subset, dtype_a=None, dtype_b=No # important: lookup_tbl must be first--it controls the length return ReductionKernel(ctx, dtype_out, neutral="0", reduce_expr="a+b", map_expr=map_expr, - arguments= - "const %(tp_lut)s *lookup_tbl, " - "const %(tp_a)s *a, " - "const %(tp_b)s *b" % { - "tp_lut": dtype_to_ctype(dtype_subset), - "tp_a": dtype_to_ctype(dtype_a), - "tp_b": dtype_to_ctype(dtype_b), - }) + arguments=( + "const %(tp_lut)s *lookup_tbl, " + "const %(tp_a)s *a, " + "const %(tp_b)s *b" % { + "tp_lut": dtype_to_ctype(dtype_subset), + "tp_a": dtype_to_ctype(dtype_a), + "tp_b": dtype_to_ctype(dtype_b), + })) def get_minmax_neutral(what, dtype): @@ -628,12 +574,13 @@ def get_subset_minmax_kernel(ctx, what, dtype, dtype_subset): neutral=get_minmax_neutral(what, dtype), reduce_expr="%(reduce_expr)s" % {"reduce_expr": reduce_expr}, map_expr="in[lookup_tbl[i]]", - arguments= - "const %(tp_lut)s *lookup_tbl, " - "const %(tp)s *in" % { - "tp": dtype_to_ctype(dtype), - "tp_lut": dtype_to_ctype(dtype_subset), - }, preamble="#define MY_INFINITY (1./0)") + arguments=( + "const %(tp_lut)s *lookup_tbl, " + "const %(tp)s *in" % { + "tp": dtype_to_ctype(dtype), + "tp_lut": dtype_to_ctype(dtype_subset), + }), + preamble="#define MY_INFINITY (1./0)") # }}} diff --git a/pyopencl/scan.py b/pyopencl/scan.py index 43e0d18b1c6e142036e7c30b095d93618b69c689..10a64ee57e8d844723a584ce1795da530902c073 100644 --- a/pyopencl/scan.py +++ b/pyopencl/scan.py @@ -1419,7 +1419,7 @@ void ${name_prefix}_debug_scan( const index_type N) { scan_type item = ${neutral}; - scan_type prev_item; + scan_type last_item; for (index_type i = 0; i < N; ++i) { @@ -1427,7 +1427,7 @@ void ${name_prefix}_debug_scan( ${arg_ctypes[arg_name]} ${name}; %if ife_offset < 0: if (i+${ife_offset} >= 0) - ${name} = ${arg_name}[i+offset]; + ${name} = ${arg_name}[i+${ife_offset}]; %else: ${name} = ${arg_name}[i]; %endif @@ -1435,12 +1435,12 @@ void ${name_prefix}_debug_scan( scan_type my_val = INPUT_EXPR(i); - prev_item = item; + last_item = item; %if is_segmented: bool is_seg_start = IS_SEG_START(i, my_val); %endif - item = SCAN_EXPR(prev_item, my_val, + item = SCAN_EXPR(last_item, my_val, %if is_segmented: is_seg_start %else: diff --git a/pyopencl/version.py b/pyopencl/version.py index 2d917d29fc041929fe4c94362fd212f13cafc685..a0b428c42f2ebf49667fa8e7a9a48f643c23b915 100644 --- a/pyopencl/version.py +++ b/pyopencl/version.py @@ -1,3 +1,3 @@ -VERSION = (2014, 1) +VERSION = (2015, 1) VERSION_STATUS = "" VERSION_TEXT = ".".join(str(x) for x in VERSION) + VERSION_STATUS diff --git a/test/test_clmath.py b/test/test_clmath.py index 586e9e075275aadb11c3067b3127f8c955afb625..53f017e49bfda6b5330b001ef7f9558a3be7cb53 100644 --- a/test/test_clmath.py +++ b/test/test_clmath.py @@ -1,4 +1,4 @@ -from __future__ import division +from __future__ import division, print_function __copyright__ = "Copyright (C) 2009 Andreas Kloeckner" @@ -21,9 +21,12 @@ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. """ + import math import numpy as np +import pytest + def have_cl(): try: @@ -323,6 +326,127 @@ def test_bessel(ctx_factory): pt.show() +@pytest.mark.parametrize("ref_src", ["pyfmmlib", "scipy"]) +def test_complex_bessel(ctx_factory, ref_src): + ctx = ctx_factory() + queue = cl.CommandQueue(ctx) + + if not has_double_support(ctx.devices[0]): + from pytest import skip + skip("no double precision support--cannot test complex bessel function") + + v = 40 + n = 10**5 + + np.random.seed(13) + z = ( + np.logspace(-5, 2, n) + * np.exp(1j * 2 * np.pi * np.random.rand(n))) + + if ref_src == "pyfmmlib": + pyfmmlib = pytest.importorskip("pyfmmlib") + + jv_ref = np.zeros(len(z), 'complex') + + vin = v+1 + + for i in range(len(z)): + ier, fjs, _, _ = pyfmmlib.jfuns2d(vin, z[i], 1, 0, 10000) + assert ier == 0 + jv_ref[i] = fjs[v] + + elif ref_src == "scipy": + spec = pytest.importorskip("scipy.special") + jv_ref = spec.jv(v, z) + + else: + raise ValueError("ref_src") + + z_dev = cl_array.to_device(queue, z) + + jv_dev = clmath.bessel_jn(v, z_dev) + + abs_err_jv = np.abs(jv_dev.get() - jv_ref) + abs_jv_ref = np.abs(jv_ref) + rel_err_jv = abs_err_jv/abs_jv_ref + + # use absolute error instead if the function value itself is too small + tiny = 1e-300 + ind = abs_jv_ref < tiny + rel_err_jv[ind] = abs_err_jv[ind] + + # if the reference value is inf or nan, set the error to zero + ind1 = np.isinf(abs_jv_ref) + ind2 = np.isnan(abs_jv_ref) + + rel_err_jv[ind1] = 0 + rel_err_jv[ind2] = 0 + + if 0: + print(abs(z)) + print(np.abs(jv_ref)) + print(np.abs(jv_dev.get())) + print(rel_err_jv) + + max_err = np.max(rel_err_jv) + assert max_err <= 2e-13, max_err + + print("Jv", np.max(rel_err_jv)) + + if 0: + import matplotlib.pyplot as pt + pt.loglog(np.abs(z), rel_err_jv) + pt.show() + + +@pytest.mark.parametrize("ref_src", ["pyfmmlib", "scipy"]) +def test_hankel_01_complex(ctx_factory, ref_src): + ctx = ctx_factory() + queue = cl.CommandQueue(ctx) + + n = 10**6 + np.random.seed(11) + z = ( + np.logspace(-5, 2, n) + * np.exp(1j * 2 * np.pi * np.random.rand(n))) + + def get_err(check, ref): + return np.max(np.abs(check-ref)) / np.max(np.abs(ref)) + + if ref_src == "pyfmmlib": + pyfmmlib = pytest.importorskip("pyfmmlib") + h0_ref, h1_ref = pyfmmlib.hank103_vec(z, ifexpon=1) + elif ref_src == "scipy": + spec = pytest.importorskip("scipy.special") + h0_ref = spec.hankel1(0, z) + h1_ref = spec.hankel1(1, z) + + else: + raise ValueError("ref_src") + + z_dev = cl_array.to_device(queue, z) + + h0_dev, h1_dev = clmath.hankel_01(z_dev) + + rel_err_h0 = np.abs(h0_dev.get() - h0_ref)/np.abs(h0_ref) + rel_err_h1 = np.abs(h1_dev.get() - h1_ref)/np.abs(h1_ref) + + max_rel_err_h0 = np.max(rel_err_h0) + max_rel_err_h1 = np.max(rel_err_h1) + + print("H0", max_rel_err_h0) + print("H1", max_rel_err_h1) + + assert max_rel_err_h0 < 4e-13 + assert max_rel_err_h1 < 2e-13 + + if 0: + import matplotlib.pyplot as pt + pt.loglog(np.abs(z), rel_err_h0) + pt.loglog(np.abs(z), rel_err_h1) + pt.show() + + if __name__ == "__main__": import sys if len(sys.argv) > 1: diff --git a/test/test_wrapper.py b/test/test_wrapper.py index 971ca27d6f959b9adb91843acc5038e35d305505..870e3c5d946f321cbe93730369f17b26da5d44c4 100644 --- a/test/test_wrapper.py +++ b/test/test_wrapper.py @@ -62,7 +62,7 @@ def test_get_info(ctx_factory): (cl.Program, cl.program_info.KERNEL_NAMES), (cl.Program, cl.program_info.NUM_KERNELS), ]) - CRASH_QUIRKS = [ + CRASH_QUIRKS = [ # noqa (("NVIDIA Corporation", "NVIDIA CUDA", "OpenCL 1.0 CUDA 3.0.1"), [ @@ -92,7 +92,7 @@ def test_get_info(ctx_factory): (cl.Program, cl.program_info.SOURCE), ]), ] - QUIRKS = [] + QUIRKS = [] # noqa plat_quirk_key = ( platform.vendor, @@ -684,7 +684,7 @@ def test_platform_get_devices(platform): getattr(cl.device_type, 'CUSTOM', None)): continue for dev in devs: - assert dev.type == dev_type + assert dev.type & dev_type == dev_type def test_user_event(ctx_factory):