diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml index 48db98503e76d4144e08395df1bfd90f2a8490f2..3ee1a5ccb67aa2e931ebaa5834d09b02e71971ea 100644 --- a/.gitlab-ci.yml +++ b/.gitlab-ci.yml @@ -25,6 +25,7 @@ Python 3.5 Intel CPU: - intel-cl-cpu except: - tags + Python 3.5 AMD CPU: script: - export PY_EXE=python3.5 @@ -37,6 +38,7 @@ Python 3.5 AMD CPU: - amd-cl-cpu except: - tags + Python 2.6 AMD CPU: script: - export PY_EXE=python2.6 @@ -89,6 +91,19 @@ Python 3.5 AMD GPU: except: - tags +Python 3.6 POCL: + script: + - export PY_EXE=python3.6 + - export PYOPENCL_TEST=portable + - export EXTRA_INSTALL="numpy mako" + - curl -L -O -k https://gitlab.tiker.net/inducer/ci-support/raw/master/build-and-test-py-project.sh + - ". ./build-and-test-py-project.sh" + tags: + - python3.6 + - pocl + except: + - tags + Python 3.5 POCL CL 1.1: script: - export PY_EXE=python3.5 @@ -121,6 +136,7 @@ Python 2.7 Apple: - export PY_EXE=python2.7 - export PYOPENCL_TEST=app:cpu - export EXTRA_INSTALL="numpy mako" + - export PKG_CONFIG_PATH=/usr/local/opt/libffi/lib/pkgconfig - curl -L -O -k https://gitlab.tiker.net/inducer/ci-support/raw/master/build-and-test-py-project.sh - ". ./build-and-test-py-project.sh" tags: @@ -134,6 +150,7 @@ PyPy POCL: - export PY_EXE=pypy - export PYOPENCL_TEST=portable - export EXTRA_INSTALL="numpy mako" + - export NO_DOCTESTS=1 - curl -L -O -k https://gitlab.tiker.net/inducer/ci-support/raw/master/build-and-test-py-project.sh - ". ./build-and-test-py-project.sh" tags: diff --git a/doc/algorithm.rst b/doc/algorithm.rst index 3ad9c53edc6c81bc7b6f0b804930f8cf60a73721..954fabe4b803647694dae7f3737b487bc2bd0aa0 100644 --- a/doc/algorithm.rst +++ b/doc/algorithm.rst @@ -225,7 +225,7 @@ Simple / Legacy Interface .. method:: __call__(self, input_ary, output_ary=None, allocator=None, queue=None) -.. class:: InclusiveScanKernel(dtype, scan_expr, neutral=None, name_prefix="scan", options=[], preamble="", devices=None) +.. 
class:: InclusiveScanKernel(ctx, dtype, scan_expr, neutral=None, name_prefix="scan", options=[], preamble="", devices=None) Works like :class:`ExclusiveScanKernel`. diff --git a/doc/array.rst b/doc/array.rst index 6f35c0461023480bd299d5f828ad91d36214a5c0..8c83e6663f007560a744953e0439a875981e56ea 100644 --- a/doc/array.rst +++ b/doc/array.rst @@ -101,7 +101,7 @@ functions defined on them such as `cfloat_mul(a, b)` or `cdouble_log(z)`. Elementwise kernels automatically include the header if your kernel has complex input or output. See the `source file -<https://github.com/pyopencl/pyopencl/blob/master/src/cl/pyopencl-complex.h>`_ +<https://github.com/pyopencl/pyopencl/blob/master/pyopencl/cl/pyopencl-complex.h>`_ for a precise list of what's available. If you need double precision support, please:: diff --git a/doc/howto.rst b/doc/howto.rst index 5ea67f7df91ae40a44e4a4b8fe78957512aa8b4e..92244c43a69ef61ba3b33a581bc4a696ed9e7705 100644 --- a/doc/howto.rst +++ b/doc/howto.rst @@ -65,10 +65,10 @@ the device: >>> ary_host["field2"].fill(1000) >>> ary_host[13]["field2"] = 12 >>> print(ary_host) - [(217, 1000.0) (217, 1000.0) (217, 1000.0) (217, 1000.0) (217, 1000.0) - (217, 1000.0) (217, 1000.0) (217, 1000.0) (217, 1000.0) (217, 1000.0) - (217, 1000.0) (217, 1000.0) (217, 1000.0) (217, 12.0) (217, 1000.0) - (217, 1000.0) (217, 1000.0) (217, 1000.0) (217, 1000.0) (217, 1000.0)] + [(217, 1000.) (217, 1000.) (217, 1000.) (217, 1000.) (217, 1000.) + (217, 1000.) (217, 1000.) (217, 1000.) (217, 1000.) (217, 1000.) + (217, 1000.) (217, 1000.) (217, 1000.) (217, 12.) (217, 1000.) + (217, 1000.) (217, 1000.) (217, 1000.) (217, 1000.) 
(217, 1000.)] >>> ary = cl.array.to_device(queue, ary_host) @@ -85,10 +85,10 @@ We can then operate on the array with our own kernels: >>> evt = prg.set_to_1(queue, ary.shape, None, ary.data) >>> print(ary) - [(1, 1000.0) (1, 1000.0) (1, 1000.0) (1, 1000.0) (1, 1000.0) (1, 1000.0) - (1, 1000.0) (1, 1000.0) (1, 1000.0) (1, 1000.0) (1, 1000.0) (1, 1000.0) - (1, 1000.0) (1, 12.0) (1, 1000.0) (1, 1000.0) (1, 1000.0) (1, 1000.0) - (1, 1000.0) (1, 1000.0)] + [(1, 1000.) (1, 1000.) (1, 1000.) (1, 1000.) (1, 1000.) (1, 1000.) + (1, 1000.) (1, 1000.) (1, 1000.) (1, 1000.) (1, 1000.) (1, 1000.) + (1, 1000.) (1, 12.) (1, 1000.) (1, 1000.) (1, 1000.) (1, 1000.) + (1, 1000.) (1, 1000.)] as well as with PyOpenCL's built-in operations: @@ -99,7 +99,7 @@ as well as with PyOpenCL's built-in operations: ... preamble=my_struct_c_decl) >>> evt = elwise(ary) >>> print(ary) - [(2, 1000.0) (2, 1000.0) (2, 1000.0) (2, 1000.0) (2, 1000.0) (2, 1000.0) - (2, 1000.0) (2, 1000.0) (2, 1000.0) (2, 1000.0) (2, 1000.0) (2, 1000.0) - (2, 1000.0) (2, 12.0) (2, 1000.0) (2, 1000.0) (2, 1000.0) (2, 1000.0) - (2, 1000.0) (2, 1000.0)] + [(2, 1000.) (2, 1000.) (2, 1000.) (2, 1000.) (2, 1000.) (2, 1000.) + (2, 1000.) (2, 1000.) (2, 1000.) (2, 1000.) (2, 1000.) (2, 1000.) + (2, 1000.) (2, 12.) (2, 1000.) (2, 1000.) (2, 1000.) (2, 1000.) + (2, 1000.) 
(2, 1000.)] diff --git a/pyopencl/__init__.py b/pyopencl/__init__.py index afb48d619460943bb2a5245a27298f13da091cf2..a8a694cd3156c931cdfbc094e070914ac0ade510 100644 --- a/pyopencl/__init__.py +++ b/pyopencl/__init__.py @@ -218,7 +218,9 @@ _DEFAULT_BUILD_OPTIONS = [] _DEFAULT_INCLUDE_OPTIONS = ["-I", _find_pyopencl_include_path()] # map of platform.name to build options list -_PLAT_BUILD_OPTIONS = {} +_PLAT_BUILD_OPTIONS = { + "Oclgrind": ["-D", "PYOPENCL_USING_OCLGRIND"], + } def enable_debugging(platform_or_context): diff --git a/pyopencl/array.py b/pyopencl/array.py index ac32eede8d277cd1a5d00c553027eda8e23ecdde..279705c25fffb3447b7757069dfe5a847bacb34a 100644 --- a/pyopencl/array.py +++ b/pyopencl/array.py @@ -1189,6 +1189,8 @@ class Array(object): raise ValueError("The truth value of an array with " "more than one element is ambiguous. Use a.any() or a.all()") + __bool__ = __nonzero__ + def any(self, queue=None, wait_for=None): from pyopencl.reduction import get_any_kernel krnl = get_any_kernel(self.context, self.dtype) diff --git a/pyopencl/cache.py b/pyopencl/cache.py index 95aeb0eba885f997b98b71818cd1a1670d68176d..3d8f8fa98db087e019b531c89413b8ba16104d57 100644 --- a/pyopencl/cache.py +++ b/pyopencl/cache.py @@ -32,6 +32,10 @@ import sys import os from pytools import Record +import logging +logger = logging.getLogger(__name__) + + try: import hashlib new_hash = hashlib.md5 @@ -352,10 +356,14 @@ def _create_built_program_from_source_cached(ctx, src, options_bytes, cache_result = retrieve_from_cache(cache_dir, cache_key) if cache_result is None: + logger.info("build program: binary cache miss (key: %s)" % cache_key) + to_be_built_indices.append(i) binaries.append(None) logs.append(None) else: + logger.debug("build program: binary cache hit (key: %s)" % cache_key) + binary, log = cache_result binaries.append(binary) logs.append(log) @@ -382,9 +390,14 @@ def _create_built_program_from_source_cached(ctx, src, options_bytes, src = src + "\n\n__constant int 
pyopencl_defeat_cache_%s = 0;" % ( uuid4().hex) + logger.info("build program: start building program from source on %s" + % ", ".join(str(devices[i]) for i in to_be_built_indices)) + prg = _cl._Program(ctx, src) prg.build(options_bytes, [devices[i] for i in to_be_built_indices]) + logger.info("build program: from-source build complete") + prg_devs = prg.get_info(_cl.program_info.DEVICES) prg_bins = prg.get_info(_cl.program_info.BINARIES) prg_logs = prg._get_build_logs() diff --git a/pyopencl/cffi_cl.py b/pyopencl/cffi_cl.py index 16f18f1f4e8507ce56e12b344b9e19f5d68ebd13..c2e87ca1035bfa1e6363ea43a2e6c5205c218a72 100644 --- a/pyopencl/cffi_cl.py +++ b/pyopencl/cffi_cl.py @@ -33,6 +33,7 @@ import warnings from warnings import warn import numpy as np import sys +import re from pytools import memoize_method @@ -42,6 +43,9 @@ from .compyte.array import f_contiguous_strides, c_contiguous_strides from pyopencl._cffi import lib as _lib +import logging +logger = logging.getLogger(__name__) + class _CLKernelArg(object): pass @@ -176,6 +180,9 @@ def _generic_info_to_python(info): elif type_ == 'cl_device_topology_amd*': ret = DeviceTopologyAmd( value.pcie.bus, value.pcie.device, value.pcie.function) + elif type_ == 'cl_image_format*': + ret = ImageFormat(value.image_channel_order, + value.image_channel_data_type) elif type_.startswith('char*['): ret = list(map(_ffi_pystr, value)) _lib.free_pointer_array(info.value, len(value)) @@ -643,17 +650,6 @@ class Platform(_Common): def __repr__(self): return "<pyopencl.Platform '%s' at 0x%x>" % (self.name, self.int_ptr) - def _get_cl_version(self): - import re - version_string = self.version - match = re.match(r"^OpenCL ([0-9]+)\.([0-9]+) .*$", version_string) - if match is None: - raise RuntimeError("platform %s returned non-conformant " - "platform version string '%s'" % - (self, version_string)) - - return int(match.group(1)), int(match.group(2)) - def unload_platform_compiler(plat): 
# Matches version strings such as "OpenCL 1.2 <vendor-specific text>" and
# captures the major and minor version numbers.
_OPENCL_VERSION_STRING_RE = re.compile(r"^OpenCL ([0-9]+)\.([0-9]+) .*$")


def _platdev_get_cl_version(self):
    """Return the OpenCL version reported by *self* as ``(major, minor)``.

    Written as a free function so the same implementation can be attached
    to more than one class; it only relies on *self* having a ``version``
    string attribute.

    :returns: a tuple of two :class:`int` values parsed from
        ``self.version``.
    :raises RuntimeError: if ``self.version`` does not start with
        ``"OpenCL <major>.<minor> "``.
    """
    version_string = self.version
    match = _OPENCL_VERSION_STRING_RE.match(version_string)
    if match is None:
        # Worded without "platform": the receiver may just as well be a
        # device, and the old text mislabelled it in that case.
        raise RuntimeError("%s returned non-conformant "
                "version string '%s'"
                % (self, version_string))

    return int(match.group(1)), int(match.group(2))
+ + logger.debug("build program: completed, error") raise err + logger.debug("build program: completed, success") + message = (75*"="+"\n").join( "Build on %s succeeded, but said:\n\n%s" % (dev, log) for dev, log in self._get_build_logs() @@ -1839,7 +1862,7 @@ class Kernel(_Common): from pyopencl.characterize import has_struct_arg_count_bug count_bug_per_dev = [ - has_struct_arg_count_bug(dev) + has_struct_arg_count_bug(dev, self.context) for dev in self.context.devices] from pytools import single_valued diff --git a/pyopencl/characterize/__init__.py b/pyopencl/characterize/__init__.py index b6ea1c53f4d812dd79389e17a11051d88a4be173..d03051897b8e35a1a114fa5e8f4cebd145bb1589 100644 --- a/pyopencl/characterize/__init__.py +++ b/pyopencl/characterize/__init__.py @@ -1,8 +1,4 @@ -from __future__ import division -from __future__ import absolute_import -import six -from six.moves import range -from six.moves import zip +from __future__ import division, absolute_import __copyright__ = "Copyright (C) 2009 Andreas Kloeckner" @@ -28,6 +24,8 @@ THE SOFTWARE. 
def get_pocl_version(platform, fallback_value=None):
    """Return the pocl release behind *platform* as ``(major, minor)``.

    :arg platform: an object with ``name`` and ``version`` string
        attributes.
    :arg fallback_value: returned (after emitting a warning) when the
        version string does not have the expected format. If *None*, a
        :exc:`ValueError` is raised instead.
    :returns: *None* if ``platform.name`` is not
        ``"Portable Computing Language"``, otherwise a tuple of two ints
        (or *fallback_value*, see above).
    :raises ValueError: if the version string cannot be parsed and no
        *fallback_value* was supplied.
    """
    if platform.name != "Portable Computing Language":
        return None

    import re
    ver_match = re.match(
            r"^OpenCL [0-9.]+ pocl ([0-9]+)\.([0-9]+)", platform.version)
    if ver_match is not None:
        return (int(ver_match.group(1)), int(ver_match.group(2)))

    msg = ("pocl version number did not have expected format: '%s'"
            % platform.version)
    if fallback_value is None:
        raise ValueError(msg)

    from warnings import warn
    warn(msg)
    return fallback_value


# Maps a device to the (boolean) outcome of the build probe below, so that
# the test program is compiled at most once per device.
_CHECK_FOR_POCL_ARG_COUNT_BUG_CACHE = {}


def _check_for_pocl_arg_count_bug(dev, ctx=None):
    """Probe *dev* for the struct argument counting bug by building a kernel.

    A kernel taking a single two-member struct by value is built; the bug is
    considered present if the kernel reports two arguments.

    :arg dev: the device to probe; also used as the cache key.
    :arg ctx: context to build in. If *None*, a new context containing only
        *dev* is created.
    :returns: *True* if the kernel reports ``num_args == 2``.
    """
    try:
        return _CHECK_FOR_POCL_ARG_COUNT_BUG_CACHE[dev]
    except KeyError:
        pass

    if ctx is None:
        build_ctx = cl.Context([dev])
    else:
        build_ctx = ctx

    prg = cl.Program(build_ctx, """
            struct two_things
            {
                long a;
                long b;
            };

            __kernel void test_knl(struct two_things x)
            {
            }
            """).build()

    result = prg.test_knl.num_args == 2
    _CHECK_FOR_POCL_ARG_COUNT_BUG_CACHE[dev] = result

    return result


def has_struct_arg_count_bug(dev, ctx=None):
    """Checks whether the device is expected to have the
    `argument counting bug <https://github.com/pocl/pocl/issues/197>`_.

    :arg dev: the device to check.
    :arg ctx: optional context handed to the build probe used for
        pocl 0.14.
    :returns: ``"apple"`` or ``"pocl"`` (naming the affected
        implementation) if the bug is expected, *False* otherwise.
    """

    if dev.platform.name == "Apple" and dev.type & cl.device_type.CPU:
        return "apple"
    if dev.platform.name == "Portable Computing Language":
        # The fallback has to be a (major, minor) tuple: a bare float such
        # as (0.14) cannot be ordered against the tuples below.
        pocl_version = get_pocl_version(dev.platform, fallback_value=(0, 14))
        if pocl_version <= (0, 13):
            return "pocl"
        elif pocl_version <= (0, 14) and _check_for_pocl_arg_count_bug(dev, ctx):
            return "pocl"

    return False
a, TP b) \ + inline TP TPROOT##_add(TP a, TP b) \ { \ return TPROOT##_new(a.real + b.real, a.imag + b.imag); \ ; \ } \ - TP TPROOT##_addr(TP a, REAL_TP b) \ + inline TP TPROOT##_addr(TP a, REAL_TP b) \ { \ return TPROOT##_new(b+a.real, a.imag); \ } \ - TP TPROOT##_radd(REAL_TP a, TP b) \ + inline TP TPROOT##_radd(REAL_TP a, TP b) \ { \ return TPROOT##_new(a+b.real, b.imag); \ } \ \ - TP TPROOT##_sub(TP a, TP b) \ + inline TP TPROOT##_sub(TP a, TP b) \ { \ return TPROOT##_new(a.real - b.real, a.imag - b.imag); \ ; \ } \ \ - TP TPROOT##_mul(TP a, TP b) \ + inline TP TPROOT##_mul(TP a, TP b) \ { \ return TPROOT##_new( \ a.real*b.real - a.imag*b.imag, \ a.real*b.imag + a.imag*b.real); \ } \ \ - TP TPROOT##_mulr(TP a, REAL_TP b) \ + inline TP TPROOT##_mulr(TP a, REAL_TP b) \ { \ return TPROOT##_new(a.real*b, a.imag*b); \ } \ \ - TP TPROOT##_rmul(REAL_TP a, TP b) \ + inline TP TPROOT##_rmul(REAL_TP a, TP b) \ { \ return TPROOT##_new(a*b.real, a*b.imag); \ } \ \ - TP TPROOT##_rdivide(REAL_TP z1, TP z2) \ + inline TP TPROOT##_rdivide(REAL_TP z1, TP z2) \ { \ if (fabs(z2.real) <= fabs(z2.imag)) { \ REAL_TP ratio = z2.real / z2.imag; \ @@ -108,7 +108,7 @@ } \ } \ \ - TP TPROOT##_divide(TP z1, TP z2) \ + inline TP TPROOT##_divide(TP z1, TP z2) \ { \ REAL_TP ratio, denom, a, b, c, d; \ \ @@ -134,12 +134,12 @@ (c + d * ratio) / denom); \ } \ \ - TP TPROOT##_divider(TP a, REAL_TP b) \ + inline TP TPROOT##_divider(TP a, REAL_TP b) \ { \ return TPROOT##_new(a.real/b, a.imag/b); \ } \ \ - TP TPROOT##_pow(TP a, TP b) \ + inline TP TPROOT##_pow(TP a, TP b) \ { \ REAL_TP logr = log(hypot(a.real, a.imag)); \ REAL_TP logi = atan2(a.imag, a.real); \ @@ -151,7 +151,7 @@ return TPROOT##_new(x*cosy, x*siny); \ } \ \ - TP TPROOT##_powr(TP a, REAL_TP b) \ + inline TP TPROOT##_powr(TP a, REAL_TP b) \ { \ REAL_TP logr = log(hypot(a.real, a.imag)); \ REAL_TP logi = atan2(a.imag, a.real); \ @@ -164,7 +164,7 @@ return TPROOT##_new(x * cosy, x*siny); \ } \ \ - TP TPROOT##_rpow(REAL_TP a, TP b) \ + 
inline TP TPROOT##_rpow(REAL_TP a, TP b) \ { \ REAL_TP logr = log(a); \ REAL_TP x = exp(logr * b.real); \ @@ -175,7 +175,7 @@ return TPROOT##_new(x * cosy, x * siny); \ } \ \ - TP TPROOT##_sqrt(TP a) \ + inline TP TPROOT##_sqrt(TP a) \ { \ REAL_TP re = a.real; \ REAL_TP im = a.imag; \ @@ -196,7 +196,7 @@ return result; \ } \ \ - TP TPROOT##_exp(TP a) \ + inline TP TPROOT##_exp(TP a) \ { \ REAL_TP expr = exp(a.real); \ REAL_TP cosi; \ @@ -204,24 +204,24 @@ return TPROOT##_new(expr * cosi, expr * sini); \ } \ \ - TP TPROOT##_log(TP a) \ + inline TP TPROOT##_log(TP a) \ { return TPROOT##_new(log(hypot(a.real, a.imag)), atan2(a.imag, a.real)); } \ \ - TP TPROOT##_sin(TP a) \ + inline TP TPROOT##_sin(TP a) \ { \ REAL_TP cosr; \ REAL_TP sinr = sincos(a.real, &cosr); \ return TPROOT##_new(sinr*cosh(a.imag), cosr*sinh(a.imag)); \ } \ \ - TP TPROOT##_cos(TP a) \ + inline TP TPROOT##_cos(TP a) \ { \ REAL_TP cosr; \ REAL_TP sinr = sincos(a.real, &cosr); \ return TPROOT##_new(cosr*cosh(a.imag), -sinr*sinh(a.imag)); \ } \ \ - TP TPROOT##_tan(TP a) \ + inline TP TPROOT##_tan(TP a) \ { \ REAL_TP re2 = 2.f * a.real; \ REAL_TP im2 = 2.f * a.imag; \ @@ -237,21 +237,21 @@ } \ } \ \ - TP TPROOT##_sinh(TP a) \ + inline TP TPROOT##_sinh(TP a) \ { \ REAL_TP cosi; \ REAL_TP sini = sincos(a.imag, &cosi); \ return TPROOT##_new(sinh(a.real)*cosi, cosh(a.real)*sini); \ } \ \ - TP TPROOT##_cosh(TP a) \ + inline TP TPROOT##_cosh(TP a) \ { \ REAL_TP cosi; \ REAL_TP sini = sincos(a.imag, &cosi); \ return TPROOT##_new(cosh(a.real)*cosi, sinh(a.real)*sini); \ } \ \ - TP TPROOT##_tanh(TP a) \ + inline TP TPROOT##_tanh(TP a) \ { \ REAL_TP re2 = 2.f * a.real; \ REAL_TP im2 = 2.f * a.imag; \ diff --git a/pyopencl/cl/pyopencl-random123/openclfeatures.h b/pyopencl/cl/pyopencl-random123/openclfeatures.h index af03d3092318c6c27f1a65ce8104c1609b1e66e1..8403706f9d5b16c3ac2a44a7c122c531e20bc9ec 100644 --- a/pyopencl/cl/pyopencl-random123/openclfeatures.h +++ b/pyopencl/cl/pyopencl-random123/openclfeatures.h 
@@ -69,8 +69,12 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #endif #ifndef R123_USE_MULHILO64_OPENCL_INTRIN +#ifdef PYOPENCL_USING_OCLGRIND +#define R123_USE_MULHILO64_OPENCL_INTRIN 0 +#else #define R123_USE_MULHILO64_OPENCL_INTRIN 1 #endif +#endif #ifndef R123_USE_AES_NI #define R123_USE_AES_NI 0 diff --git a/pyopencl/mempool.py b/pyopencl/mempool.py index 1139edab691e86fdf955fc68c86e266fbb9bb8e6..071bdb98ffba1b1fc6462530c6739847d84cf8ee 100644 --- a/pyopencl/mempool.py +++ b/pyopencl/mempool.py @@ -248,7 +248,6 @@ class MemoryPool(object): for bin_nr, bin_list in six.iteritems(self.bin_nr_to_bin): while bin_list: self.allocator.free(bin_list.pop()) - self.held_blocks -= 1 yield diff --git a/pyopencl/scan.py b/pyopencl/scan.py index 0ea9e01e28fdbeca4058290ca0352639f4597880..1f8d5d2ca79b9ed2638c18bc60dc38934deba26c 100644 --- a/pyopencl/scan.py +++ b/pyopencl/scan.py @@ -1,10 +1,6 @@ """Scan primitive.""" -from __future__ import division -from __future__ import absolute_import -import six -from six.moves import range -from six.moves import zip +from __future__ import division, absolute_import __copyright__ = """ Copyright 2011-2012 Andreas Kloeckner @@ -28,6 +24,9 @@ Derived from code within the Thrust project, https://github.com/thrust/thrust/ """ +import six +from six.moves import range, zip + import numpy as np import pyopencl as cl @@ -296,8 +295,11 @@ void ${kernel_name}( if (LID_0 == 0 && unit_base != interval_begin) { + scan_type tmp = ldata[K][WG_SIZE - 1].value; + scan_type tmp_aux = ldata[0][0].value; + ldata[0][0].value = SCAN_EXPR( - ldata[K][WG_SIZE - 1].value, ldata[0][0].value, + tmp, tmp_aux, %if is_segmented: (l_segment_start_flags[0][0]) %else: @@ -761,7 +763,7 @@ _PREFIX_WORDS = set(""" group_base seg_end my_val DEBUG ARGS ints_to_store ints_per_wg scan_types_per_int linear_index linear_scan_data_idx dest src store_base wrapped_scan_type - dummy scan_tmp + dummy scan_tmp tmp_aux LID_2 LID_1 LID_0 LDIM_0 LDIM_1 LDIM_2 
@@ -1051,7 +1053,7 @@ class GenericScanKernel(_GenericScanKernelBase): output_statement="ary[i+1] = item;") a = cl.array.arange(queue, 10000, dtype=np.int32) - scan_kernel(a, queue=queue) + knl(a, queue=queue) """ diff --git a/pyopencl/tools.py b/pyopencl/tools.py index 36fafbc381441139b0bc62bf185005678ba2856c..c7bd5ed00280840f78d751f895e6013ea2154601 100644 --- a/pyopencl/tools.py +++ b/pyopencl/tools.py @@ -458,7 +458,7 @@ class _CDeclList: if dtype in vec.type_to_scalar_and_count: return - for name, field_data in six.iteritems(dtype.fields): + for name, field_data in sorted(six.iteritems(dtype.fields)): field_dtype, offset = field_data[:2] self.add_dtype(field_dtype) @@ -846,12 +846,12 @@ class _TemplateRenderer(object): if arguments is not None: cdl.visit_arguments(arguments) - for tv in six.itervalues(self.type_aliases): + for _, tv in sorted(six.iteritems(self.type_aliases)): cdl.add_dtype(tv) type_alias_decls = [ "typedef %s %s;" % (dtype_to_ctype(val), name) - for name, val in six.iteritems(self.type_aliases) + for name, val in sorted(six.iteritems(self.type_aliases)) ] return cdl.get_declarations() + "\n" + "\n".join(type_alias_decls) diff --git a/pyopencl/version.py b/pyopencl/version.py index 598079b12416759bd8e51e2af4ced9f48d52a096..29cfe8d37511d59834a25a29676ab6d79ac86d48 100644 --- a/pyopencl/version.py +++ b/pyopencl/version.py @@ -1,3 +1,3 @@ -VERSION = (2016, 2) +VERSION = (2016, 2, 1) VERSION_STATUS = "" VERSION_TEXT = ".".join(str(x) for x in VERSION) + VERSION_STATUS diff --git a/setup.py b/setup.py index 1d735904cc533f3f39405bd1a9d8285b3f2a59e3..3d2ddd0e9d359374abebb2ee2330381000dfa738 100644 --- a/setup.py +++ b/setup.py @@ -38,7 +38,7 @@ def get_config_schema(): IncludeDir, LibraryDir, Libraries, \ Switch, StringListOption - default_cxxflags = ['-std=c++0x'] + default_cxxflags = ['-std=gnu++11'] if 'darwin' in sys.platform: import platform diff --git a/src/c_wrapper/clinfo_ext.h b/src/c_wrapper/clinfo_ext.h index 
6094c52c18057b81e09526f3576c11042163e942..9263981e743a947b895cfd1344f2432cbebdbbd6 100644 --- a/src/c_wrapper/clinfo_ext.h +++ b/src/c_wrapper/clinfo_ext.h @@ -4,7 +4,7 @@ #ifndef _EXT_H #define _EXT_H -#ifdef __APPLE__ +#if (defined(__APPLE__) && !defined(PYOPENCL_APPLE_USE_CL_H)) #include <OpenCL/opencl.h> #else #include <CL/cl.h> diff --git a/src/c_wrapper/context.cpp b/src/c_wrapper/context.cpp index 0a453d0ba797d396e1cdd2cd2ede6195f3d498e6..f2478fd73a54c11dbed830f6eace3e57f9ace215 100644 --- a/src/c_wrapper/context.cpp +++ b/src/c_wrapper/context.cpp @@ -68,7 +68,7 @@ context::get_info(cl_uint param_name) const break; #if defined(PYOPENCL_GL_SHARING_VERSION) && (PYOPENCL_GL_SHARING_VERSION >= 1) -#if defined(__APPLE__) && defined(HAVE_GL) +#if defined(__APPLE__) && defined(HAVE_GL) && !defined(PYOPENCL_APPLE_USE_CL_H) case CL_CONTEXT_PROPERTY_USE_CGL_SHAREGROUP_APPLE: #else case CL_GL_CONTEXT_KHR: diff --git a/src/c_wrapper/gl_obj.cpp b/src/c_wrapper/gl_obj.cpp index 6dfbec8c5ea11b78072964fb088d83849aa04763..bd7edf31d8ce772adae21047ab34e3eb925f1482 100644 --- a/src/c_wrapper/gl_obj.cpp +++ b/src/c_wrapper/gl_obj.cpp @@ -137,7 +137,7 @@ have_gl() cl_context_properties get_apple_cgl_share_group() { -#ifdef __APPLE__ +#if (defined(__APPLE__) && !defined(PYOPENCL_APPLE_USE_CL_H)) #ifdef HAVE_GL CGLContextObj kCGLContext = CGLGetCurrentContext(); CGLShareGroupObj kCGLShareGroup = CGLGetShareGroup(kCGLContext); diff --git a/src/c_wrapper/pyopencl_ext.h b/src/c_wrapper/pyopencl_ext.h index 4b5e7871e57d7c26a89830e5bc5bec4bb1c8667c..cd5d7112ea6325d1770f2fbf4431c33ef2edec0d 100644 --- a/src/c_wrapper/pyopencl_ext.h +++ b/src/c_wrapper/pyopencl_ext.h @@ -7,7 +7,7 @@ #else -#ifdef __APPLE__ +#if (defined(__APPLE__) && !defined(PYOPENCL_APPLE_USE_CL_H)) #include <OpenCL/opencl.h> diff --git a/src/c_wrapper/wrap_cl.h b/src/c_wrapper/wrap_cl.h index b097d12d9ebb32c092333721eef32ea847ba8ac5..21ff9c086805056e701186adf00070ed1eee48ed 100644 --- a/src/c_wrapper/wrap_cl.h +++ 
b/src/c_wrapper/wrap_cl.h @@ -13,7 +13,7 @@ #define CL_USE_DEPRECATED_OPENCL_1_1_APIS -#ifdef __APPLE__ +#if (defined(__APPLE__) && !defined(PYOPENCL_APPLE_USE_CL_H)) // {{{ Mac diff --git a/src/c_wrapper/wrap_constants.cpp b/src/c_wrapper/wrap_constants.cpp index 701638b98b9eb415bd7da4cfe50ea45707d23b90..16b1d1f49ba156f10c5b19b63e36f9915b00b77d 100644 --- a/src/c_wrapper/wrap_constants.cpp +++ b/src/c_wrapper/wrap_constants.cpp @@ -421,7 +421,7 @@ void populate_constants(void(*add)(const char*, const char*, int64_t value)) ADD_ATTR("context_properties", ,WGL_HDC_KHR); ADD_ATTR("context_properties", ,CGL_SHAREGROUP_KHR); #endif -#if defined(__APPLE__) && defined(HAVE_GL) +#if defined(__APPLE__) && defined(HAVE_GL) && !defined(PYOPENCL_APPLE_USE_CL_H) ADD_ATTR("context_properties", ,CONTEXT_PROPERTY_USE_CGL_SHAREGROUP_APPLE); #endif /* __APPLE__ */ #ifdef CL_CONTEXT_OFFLINE_DEVICES_AMD diff --git a/test/test_algorithm.py b/test/test_algorithm.py index 374381ede72ad66951d5146613f4f7a00cc9311a..676390bd751116c334aa3a9425b92f7a32500fe3 100644 --- a/test/test_algorithm.py +++ b/test/test_algorithm.py @@ -386,7 +386,8 @@ def test_dot(ctx_factory): vdot_ab_gpu = cl_array.vdot(a_gpu, b_gpu).get() - assert abs(vdot_ab_gpu - vdot_ab) / abs(vdot_ab) < 1e-4 + rel_err = abs(vdot_ab_gpu - vdot_ab) / abs(vdot_ab) + assert rel_err < 1e-4, rel_err @memoize @@ -939,6 +940,9 @@ def test_bitonic_argsort(ctx_factory, size, dtype): queue = cl.CommandQueue(ctx) dev = ctx.devices[0] + if (dev.platform.name == "Portable Computing Language" + and sys.platform == "darwin"): + pytest.xfail("Bitonic sort crashes on Apple POCL") if (dev.platform.name == "Apple" and dev.type & cl.device_type.CPU): pytest.xfail("Bitonic sort won't work on Apple CPU: no workgroup " "parallelism") diff --git a/test/test_enqueue_copy.py b/test/test_enqueue_copy.py index 14f0bc7df3191531099b6077754c0c77870ffda8..564e833a4bc167644bd057c5fa2117d17bda2cda 100644 --- a/test/test_enqueue_copy.py +++ 
b/test/test_enqueue_copy.py @@ -29,6 +29,7 @@ import pytest from pyopencl.tools import ( # noqa pytest_generate_tests_for_pyopencl as pytest_generate_tests) +from pyopencl.characterize import get_pocl_version def generate_slice(start, shape): @@ -42,7 +43,9 @@ def test_enqueue_copy_rect_2d(ctx_factory, honor_skip=True): ctx = ctx_factory() queue = cl.CommandQueue(ctx) - if honor_skip and ctx.devices[0].platform.name == "Portable Computing Language": + if (honor_skip + and ctx.devices[0].platform.name == "Portable Computing Language" + and get_pocl_version(ctx.devices[0].platform) <= (0, 13)): # https://github.com/pocl/pocl/issues/353 pytest.skip("POCL's rectangular copies crash") @@ -127,7 +130,9 @@ def test_enqueue_copy_rect_3d(ctx_factory, honor_skip=True): ctx = ctx_factory() queue = cl.CommandQueue(ctx) - if honor_skip and ctx.devices[0].platform.name == "Portable Computing Language": + if (honor_skip + and ctx.devices[0].platform.name == "Portable Computing Language" + and get_pocl_version(ctx.devices[0].platform) <= (0, 13)): # https://github.com/pocl/pocl/issues/353 pytest.skip("POCL's rectangular copies crash") diff --git a/test/test_wrapper.py b/test/test_wrapper.py index b1a23c2a00eb2289fd98dd5a228953109bf3ffef..a281e3105c5d17941821de6c103609a298d99a86 100644 --- a/test/test_wrapper.py +++ b/test/test_wrapper.py @@ -33,6 +33,7 @@ import pyopencl.array as cl_array import pyopencl.clrandom from pyopencl.tools import ( # noqa pytest_generate_tests_for_pyopencl as pytest_generate_tests) +from pyopencl.characterize import get_pocl_version # Are CL implementations crashy? You be the judge. 
def _skip_if_pocl(plat, up_to_version, msg='unsupported by pocl'):
    """Skip the running test if *plat* is an affected pocl platform.

    :arg plat: the platform under test; identified as pocl by its
        ``vendor`` string.
    :arg up_to_version: newest ``(major, minor)`` pocl version for which
        the skip applies, or *None* to skip on every pocl version.
    :arg msg: reason passed to ``pytest.skip``.
    """
    if plat.vendor != "The pocl project":
        return

    # The version is only looked up when a bound was actually given.
    affected = (
            up_to_version is None
            or get_pocl_version(plat) <= up_to_version)
    if not affected:
        return

    import pytest
    pytest.skip(msg)
+ _skip_if_pocl(ctx.devices[0].platform, None, 'pocl crashes on enqueue_barrier') queue = cl.CommandQueue(ctx) cl.enqueue_barrier(queue) evt1 = cl.enqueue_marker(queue) @@ -647,7 +650,7 @@ def test_unload_compiler(platform): cl.get_cl_header_version() < (1, 2)): from pytest import skip skip("clUnloadPlatformCompiler is only available in OpenCL 1.2") - _skip_if_pocl(platform, 'pocl does not support unloading compiler') + _skip_if_pocl(platform, (0, 13), 'pocl does not support unloading compiler') if platform.vendor == "Intel(R) Corporation": from pytest import skip skip("Intel proprietary driver does not support unloading compiler") @@ -954,7 +957,8 @@ def test_coarse_grain_svm(ctx_factory): # https://bitbucket.org/pypy/numpy/issues/52 assert isinstance(svm_ary.mem.base, cl.SVMAllocation) - if dev.platform.name != "Portable Computing Language": + if (dev.platform.name != "Portable Computing Language" + or get_pocl_version(dev.platform) >= (0, 14)): # pocl 0.13 has a bug misinterpreting the size parameter cl.enqueue_svm_memfill(queue, svm_ary, np.zeros((), svm_ary.mem.dtype)) @@ -980,7 +984,7 @@ def test_coarse_grain_svm(ctx_factory): if ctx.devices[0].platform.name != "Portable Computing Language": # "Blocking memcpy is unimplemented (clEnqueueSVMMemcpy.c:61)" - # in pocl 0.13. + # in pocl 0.13 and 0.14-pre. cl.enqueue_copy(queue, new_ary, svm_ary) assert np.array_equal(orig_ary*2, new_ary)