From 602db4a829abfb7a9f813dae55ceb52b5611b04c Mon Sep 17 00:00:00 2001 From: Andreas Kloeckner <inform@tiker.net> Date: Sun, 5 Jul 2015 00:29:34 -0500 Subject: [PATCH] Code generation for kernel enqueue and work around pocl complex arg split issue --- pyopencl/__init__.py | 342 ++++++++++++++++++++++++++++++++++------- pyopencl/cffi_cl.py | 30 ++-- setup.py | 2 +- test/test_algorithm.py | 3 - test/test_array.py | 8 - test/test_clmath.py | 13 +- 6 files changed, 305 insertions(+), 93 deletions(-) diff --git a/pyopencl/__init__.py b/pyopencl/__init__.py index aa8abdd7..0e7412b1 100644 --- a/pyopencl/__init__.py +++ b/pyopencl/__init__.py @@ -42,6 +42,8 @@ except ImportError: "its source directory. This likely won't work.") raise +_CPY2 = _cl._CPY2 +_CPY26 = _cl._CPY2 and sys.version_info < (2, 7) import numpy as np @@ -167,6 +169,8 @@ CONSTANT_CLASSES = [ and name[0].islower() and name not in ["zip", "map", "range"]] +# {{{ diagnostics + class CompilerWarning(UserWarning): pass @@ -185,6 +189,25 @@ def compiler_output(text): class _ErrorRecord(_Record): pass +# }}} + + +# {{{ arg packing helpers + +_size_t_char = ({ + 8: 'Q', + 4: 'L', + 2: 'H', + 1: 'B', +})[_cl._ffi.sizeof('size_t')] +_type_char_map = { + 'n': _size_t_char.lower(), + 'N': _size_t_char +} +del _size_t_char + +# }}} + # {{{ find pyopencl shipped source code @@ -609,72 +632,260 @@ def _add_functionality(): kernel_old_init(self, prg, name) self._source = getattr(prg, "_source", None) - def kernel_call(self, queue, global_size, local_size, *args, **kwargs): - global_offset = kwargs.pop("global_offset", None) - g_times_l = kwargs.pop("g_times_l", False) - wait_for = kwargs.pop("wait_for", None) + self._generate_naive_call() - if kwargs: - raise TypeError( - "Kernel.__call__ recived unexpected keyword arguments: %s" - % ", ".join(list(kwargs.keys()))) + # {{{ code generation for __call__, set_args - self.set_args(*args) + def kernel__set_set_args_body(self, body, num_passed_args): + from pytools.py_codegen import ( + PythonFunctionGenerator, + PythonCodeGenerator, + Indentation) - return enqueue_nd_range_kernel(queue, self, global_size, local_size, - global_offset, wait_for, g_times_l=g_times_l) + arg_names = ["arg%d" % i for i in xrange(num_passed_args)] - def kernel_set_scalar_arg_dtypes(self, arg_dtypes): - assert len(arg_dtypes) == self.num_args, ( - "length of argument type array (%d) and " - "CL-generated number of arguments (%d) do not agree" - % (len(arg_dtypes), self.num_args)) + # {{{ wrap in error handler - arg_type_chars = [] + err_gen = PythonCodeGenerator() - for arg_dtype in arg_dtypes: - if arg_dtype is None: - arg_type_chars.append(None) + err_gen("try:") + with Indentation(err_gen): + err_gen.extend(body) + err_gen("except TypeError as e:") + with Indentation(err_gen): + err_gen(""" + if current_arg is not None: + args = [{args}] + advice = "" + from pyopencl.array import Array + if isinstance(args[current_arg], Array): + advice = " (perhaps you meant to pass 'array.data' " \ + "instead of the array itself?)" + + raise _cl.LogicError( + "when processing argument #%d (1-based): %s%s" + % (current_arg+1, str(e), advice)) + else: + raise + """ + .format(args=", ".join(arg_names))) + err_gen("") + + # }}} + + def add_preamble(gen): + gen.add_to_preamble( + "import numpy as np") + gen.add_to_preamble( + "import pyopencl.cffi_cl as _cl") + gen.add_to_preamble( + "from pyopencl.cffi_cl import _lib, " + "_ffi, _handle_error, _CLKernelArg") + gen.add_to_preamble("from pyopencl import status_code") + gen.add_to_preamble("from struct import pack") + gen.add_to_preamble("") + + # {{{ generate _enqueue + + gen = PythonFunctionGenerator("enqueue_knl_%s" % self.function_name, + ["self", "queue", "global_size", "local_size"] + + arg_names + + ["global_offset=None", "g_times_l=None", "wait_for=None"]) + + add_preamble(gen) + gen.extend(err_gen) + + gen(""" + return _cl.enqueue_nd_range_kernel(queue, self, global_size, local_size, + global_offset, wait_for, g_times_l=g_times_l) + """) + + self._enqueue = gen.get_function() + + # }}} + + # {{{ generate set_args + + gen = PythonFunctionGenerator("_set_args", ["self"] + arg_names) + + add_preamble(gen) + gen.extend(err_gen) + + self._set_args = gen.get_function() + + # }}} + + def kernel__generate_buffer_arg_setter(self, gen, arg_idx, buf_var): + from pytools.py_codegen import Indentation + + if _CPY2: + # https://github.com/numpy/numpy/issues/5381 + gen("if isinstance({buf_var}, np.generic):".format(buf_var=buf_var)) + with Indentation(gen): + gen("{buf_var} = np.getbuffer({buf_var})".format(buf_var=buf_var)) + + gen(""" + c_buf, sz, _ = _cl._c_buffer_from_obj({buf_var}) + status = _lib.kernel__set_arg_buf(self.ptr, {arg_idx}, c_buf, sz) + if status != _ffi.NULL: + _handle_error(status) + """ + .format(arg_idx=arg_idx, buf_var=buf_var)) + + def kernel__generate_generic_arg_handler(self, gen, arg_idx, arg_var): + from pytools.py_codegen import Indentation + + gen(""" + if {arg_var} is None: + status = _lib.kernel__set_arg_null(self.ptr, {arg_idx}) + if status != _ffi.NULL: + _handle_error(status) + elif isinstance({arg_var}, _CLKernelArg): + self.set_arg({arg_idx}, {arg_var}) + """ + .format(arg_idx=arg_idx, arg_var=arg_var)) + + gen("else:") + with Indentation(gen): + self._generate_buffer_arg_setter(gen, arg_idx, arg_var) + + def kernel__generate_naive_call(self): + num_args = self.num_args + + from pytools.py_codegen import PythonCodeGenerator + gen = PythonCodeGenerator() + + for i in range(num_args): + gen("# process argument {arg_idx}".format(arg_idx=i)) + gen("") + gen("current_arg = {arg_idx}".format(arg_idx=i)) + self._generate_generic_arg_handler(gen, i, "arg%d" % i) + gen("") + + self._set_set_args_body(gen, num_args) + + def kernel_set_scalar_arg_dtypes(self, scalar_arg_dtypes): + # {{{ arg counting bug handling + + # For example: + # https://github.com/pocl/pocl/issues/197 + # (but Apple CPU has a similar bug) + + work_around_arg_count_bug = False + warn_about_arg_count_bug = False + + from pyopencl.characterize import has_struct_arg_count_bug + + count_bug_per_dev = [ + has_struct_arg_count_bug(dev) + for dev in self.context.devices] + + if any(count_bug_per_dev): + if all(count_bug_per_dev): + work_around_arg_count_bug = True else: - arg_type_chars.append(np.dtype(arg_dtype).char) + warn_about_arg_count_bug = True + + # }}} + + cl_arg_idx = 0 + + from pytools.py_codegen import PythonCodeGenerator + gen = PythonCodeGenerator() + + for arg_idx, arg_dtype in enumerate(scalar_arg_dtypes): + gen("# process argument {arg_idx}".format(arg_idx=arg_idx)) + gen("") + gen("current_arg = {arg_idx}".format(arg_idx=arg_idx)) + arg_var = "arg%d" % arg_idx + + if arg_dtype is None: + self._generate_generic_arg_handler(gen, cl_arg_idx, arg_var) + cl_arg_idx += 1 + gen("") + continue + + arg_dtype = np.dtype(arg_dtype) - self._arg_type_chars = arg_type_chars + if arg_dtype.char == "V": + self._generate_generic_arg_handler(gen, cl_arg_idx, arg_var) + cl_arg_idx += 1 - def kernel_set_args(self, *args): - assert len(args) == self.num_args, ( + elif arg_dtype.kind == "c": + if warn_about_arg_count_bug: + warn("{knl_name}: arguments include complex numbers, and " + "some (but not all) of the target devices mishandle " + "struct kernel arguments (hence the workaround is " + "disabled".format( + knl_name=self.function_name, stacklevel=2)) + + if arg_dtype == np.complex64: + arg_char = "f" + elif arg_dtype == np.complex128: + arg_char = "d" + else: + raise TypeError("unexpected complex type: %s" % arg_dtype) + + if work_around_arg_count_bug and arg_dtype == np.complex128: + gen( + "buf = pack('{arg_char}', {arg_var}.real)" + .format(arg_char=arg_char, arg_var=arg_var)) + self._generate_buffer_arg_setter(gen, cl_arg_idx, "buf") + cl_arg_idx += 1 + gen( + "buf = pack('{arg_char}', {arg_var}.imag)" + .format(arg_char=arg_char, arg_var=arg_var)) + self._generate_buffer_arg_setter(gen, cl_arg_idx, "buf") + cl_arg_idx += 1 + else: + gen( + "buf = pack('{arg_char}{arg_char}', " + "{arg_var}.real, {arg_var}.imag)" + .format(arg_char=arg_char, arg_var=arg_var)) + self._generate_buffer_arg_setter(gen, cl_arg_idx, "buf") + cl_arg_idx += 1 + + elif arg_dtype.char in "IL" and _CPY26: + # Prevent SystemError: ../Objects/longobject.c:336: bad + # argument to internal function + + gen( + "buf = pack('{arg_char}', long({arg_var})" + .format(arg_char=arg_dtype.char, arg_var=arg_var)) + self._generate_buffer_arg_setter(gen, cl_arg_idx, "buf") + cl_arg_idx += 1 + + else: + arg_char = arg_dtype.char + arg_char = _type_char_map.get(arg_char, arg_char) + gen( + "buf = pack('{arg_char}', {arg_var})" + .format( + arg_char=arg_char, + arg_var=arg_var)) + self._generate_buffer_arg_setter(gen, cl_arg_idx, "buf") + cl_arg_idx += 1 + + gen("") + + if cl_arg_idx != self.num_args: + raise TypeError( "length of argument list (%d) and " "CL-generated number of arguments (%d) do not agree" - % (len(args), self.num_args)) + % (cl_arg_idx, self.num_args)) - i = None - try: - try: - arg_type_chars = self.__dict__["_arg_type_chars"] - except KeyError: - for i, arg in enumerate(args): - self.set_arg(i, arg) - else: - from pyopencl._pvt_struct import pack + self._set_set_args_body(gen, len(scalar_arg_dtypes)) - for i, (arg, arg_type_char) in enumerate( - zip(args, arg_type_chars)): - if arg_type_char and arg_type_char != "V": - self.set_arg(i, pack(arg_type_char, arg)) - else: - self.set_arg(i, arg) - except TypeError as e: - if i is not None: - advice = "" - from pyopencl.array import Array - if isinstance(args[i], Array): - advice = " (perhaps you meant to pass 'array.data' " \ - "instead of the array itself?)" - - raise LogicError( - "when processing argument #%d (1-based): %s%s" - % (i+1, str(e), advice)) - else: - raise + # }}} + + def kernel_set_args(self, *args, **kwargs): + # Need to dupicate the 'self' argument for dynamically generated method + return self._set_args(self, *args, **kwargs) + + def kernel_call(self, queue, global_size, local_size, *args, **kwargs): + # __call__ can't be overridden directly, so we need this + # trampoline hack. + return self._enqueue(self, queue, global_size, local_size, *args, **kwargs) def kernel_capture_call(self, filename, queue, global_size, local_size, *args, **kwargs): @@ -683,9 +894,13 @@ def _add_functionality(): *args, **kwargs) Kernel.__init__ = kernel_init - Kernel.__call__ = kernel_call + Kernel._set_set_args_body = kernel__set_set_args_body + Kernel._generate_buffer_arg_setter = kernel__generate_buffer_arg_setter + Kernel._generate_generic_arg_handler = kernel__generate_generic_arg_handler + Kernel._generate_naive_call = kernel__generate_naive_call Kernel.set_scalar_arg_dtypes = kernel_set_scalar_arg_dtypes Kernel.set_args = kernel_set_args + Kernel.__call__ = kernel_call Kernel.capture_call = kernel_capture_call # }}} @@ -842,11 +1057,20 @@ def _add_functionality(): except AttributeError: return str(val) else: - result = "%s failed: %s" % (val.routine(), - status_code.to_string(val.code(), "<unknown error %d>") - .lower().replace("_", " ")) - if val.what(): - result += " - " + val.what() + result = "" + if val.code() != status_code.SUCCESS: + result = status_code.to_string( + val.code(), "<unknown error %d>") + routine = val.routine() + if routine: + result = "%s failed: %s" % ( + routine.lower().replace("_", " "), + result) + what = val.what() + if what: + if result: + result += " - " + result += what return result def error_code(self): diff --git a/pyopencl/cffi_cl.py b/pyopencl/cffi_cl.py index 2f92250f..fbd2a0cc 100644 --- a/pyopencl/cffi_cl.py +++ b/pyopencl/cffi_cl.py @@ -37,6 +37,11 @@ from .compyte.array import f_contiguous_strides, c_contiguous_strides _lib = _ffi.dlopen(None) + +class _CLKernelArg(object): + pass + + # {{{ hook up connections between the wrapper and the interperter import gc @@ -713,7 +718,7 @@ class cffi_array(np.ndarray): # noqa return self.__base -class LocalMemory(object): +class LocalMemory(_CLKernelArg): __slots__ = ('_size',) def __init__(self, size): @@ -724,7 +729,7 @@ class LocalMemory(object): return self._size -class MemoryObjectHolder(_Common): +class MemoryObjectHolder(_Common, _CLKernelArg): def get_host_array(self, shape, dtype, order="C"): dtype, shape, strides = _norm_shape_dtype( shape, dtype, order, None, 'MemoryObjectHolder.get_host_array') @@ -1030,17 +1035,20 @@ class Kernel(_Common): self.ptr = ptr_kernel[0] def set_arg(self, arg_index, arg): + # If you change this, also change the kernel call generation logic. if arg is None: _handle_error(_lib.kernel__set_arg_null(self.ptr, arg_index)) - elif isinstance(arg, MemoryObjectHolder): - _handle_error(_lib.kernel__set_arg_mem(self.ptr, arg_index, arg.ptr)) - elif isinstance(arg, Sampler): - _handle_error(_lib.kernel__set_arg_sampler(self.ptr, arg_index, - arg.ptr)) - elif isinstance(arg, LocalMemory): - _handle_error(_lib.kernel__set_arg_buf(self.ptr, arg_index, - _ffi.NULL, arg.size)) + elif isinstance(arg, _CLKernelArg): + if isinstance(arg, MemoryObjectHolder): + _handle_error(_lib.kernel__set_arg_mem(self.ptr, arg_index, arg.ptr)) + elif isinstance(arg, Sampler): + _handle_error(_lib.kernel__set_arg_sampler(self.ptr, arg_index, + arg.ptr)) + elif isinstance(arg, LocalMemory): + _handle_error(_lib.kernel__set_arg_buf(self.ptr, arg_index, + _ffi.NULL, arg.size)) elif _CPY2 and isinstance(arg, np.generic): + # https://github.com/numpy/numpy/issues/5381 c_buf, size, _ = _c_buffer_from_obj(np.getbuffer(arg)) _handle_error(_lib.kernel__set_arg_buf(self.ptr, arg_index, c_buf, size)) @@ -1869,7 +1877,7 @@ class Image(MemoryObject): # {{{ Sampler -class Sampler(_Common): +class Sampler(_Common, _CLKernelArg): _id = 'sampler' def __init__(self, context, normalized_coords, addressing_mode, filter_mode): diff --git a/setup.py b/setup.py index 0c19b0f7..4ae6a052 100644 --- a/setup.py +++ b/setup.py @@ -207,7 +207,7 @@ def main(): install_requires=[ "numpy", - "pytools>=2014.2", + "pytools>=2015.1.1", "pytest>=2", "decorator>=3.2.0", "cffi>=1.1.0", diff --git a/test/test_algorithm.py b/test/test_algorithm.py index 5518d508..b55c850e 100644 --- a/test/test_algorithm.py +++ b/test/test_algorithm.py @@ -328,9 +328,6 @@ def test_dot(ctx_factory): queue = cl.CommandQueue(context) dev = context.devices[0] - from pyopencl.characterize import has_struct_arg_count_bug - if has_struct_arg_count_bug(dev): - pytest.xfail("device has struct arg counting bug") dtypes = [np.float32, np.complex64] if has_double_support(dev): diff --git a/test/test_array.py b/test/test_array.py index adb2f744..ecfd3ba9 100644 --- a/test/test_array.py +++ b/test/test_array.py @@ -95,11 +95,6 @@ def test_mix_complex(ctx_factory): context = ctx_factory() queue = cl.CommandQueue(context) - dev = context.devices[0] - from pyopencl.characterize import has_struct_arg_count_bug - if has_struct_arg_count_bug(dev): - pytest.xfail("device has struct arg counting bug") - size = 10 dtypes = [ @@ -174,9 +169,6 @@ def test_pow_neg1_vs_inv(ctx_factory): if not has_double_support(device): from pytest import skip skip("double precision not supported on %s" % device) - from pyopencl.characterize import has_struct_arg_count_bug - if has_struct_arg_count_bug(device): - pytest.xfail("device has struct arg counting bug") a_dev = make_random_array(queue, np.complex128, 20000) diff --git a/test/test_clmath.py b/test/test_clmath.py index e0e07764..4dea8f91 100644 --- a/test/test_clmath.py +++ b/test/test_clmath.py @@ -76,11 +76,6 @@ def make_unary_function_test(name, limits=(0, 1), threshold=0, use_complex=False gpu_func = getattr(clmath, name) cpu_func = getattr(np, numpy_func_names.get(name, name)) - dev = context.devices[0] - from pyopencl.characterize import has_struct_arg_count_bug - if use_complex and has_struct_arg_count_bug(dev): - pytest.xfail("device has struct arg counting bug") - if has_double_support(context.devices[0]): if use_complex: dtypes = [np.float32, np.float64, np.complex64, np.complex128] @@ -132,8 +127,8 @@ if have_cl(): (-math.pi/2 + 0.1, math.pi/2 - 0.1), 4e-5, use_complex=True) test_atan = make_unary_function_test("atan", (-10, 10), 2e-7) - test_sinh = make_unary_function_test("sinh", (-3, 3), 2e-6, use_complex=2e-3) - test_cosh = make_unary_function_test("cosh", (-3, 3), 2e-6, use_complex=2e-3) + test_sinh = make_unary_function_test("sinh", (-3, 3), 3e-6, use_complex=2e-3) + test_cosh = make_unary_function_test("cosh", (-3, 3), 3e-6, use_complex=2e-3) test_tanh = make_unary_function_test("tanh", (-3, 3), 2e-6, use_complex=True) @@ -228,10 +223,6 @@ def test_frexp(ctx_factory): context = ctx_factory() queue = cl.CommandQueue(context) - if context.devices[0].platform.name == "Portable Computing Language": - # https://github.com/pocl/pocl/issues/202 - pytest.xfail("POCL's frexp seems to have issues") - for s in sizes: a = cl_array.arange(queue, s, dtype=np.float32)/10 significands, exponents = clmath.frexp(a) -- GitLab