From a26b5b9ce5516de8cb30dcbc02e3b00b534f905f Mon Sep 17 00:00:00 2001 From: Andreas Kloeckner Date: Mon, 3 Jun 2013 13:15:58 -0400 Subject: [PATCH] Support for slices and offsets in elementwise computations. --- pyopencl/algorithm.py | 2 ++ pyopencl/array.py | 15 +++++--- pyopencl/compyte | 2 +- pyopencl/elementwise.py | 55 +++++++++++++++++----------- pyopencl/tools.py | 79 +++++++++++++++++++++++++++++++++++------ test/test_array.py | 57 +++++++++++++++++------------ 6 files changed, 150 insertions(+), 60 deletions(-) diff --git a/pyopencl/algorithm.py b/pyopencl/algorithm.py index beddbb2c..737666f4 100644 --- a/pyopencl/algorithm.py +++ b/pyopencl/algorithm.py @@ -113,6 +113,7 @@ def remove_if(ary, predicate, extra_args=[], preamble="", queue=None, wait_for=N # }}} + # {{{ partition _partition_template = ScanTemplate( @@ -523,6 +524,7 @@ class RadixSort(object): # }}} + # {{{ generic parallel list builder # {{{ kernel template diff --git a/pyopencl/array.py b/pyopencl/array.py index 7f4dd64d..22f49cf0 100644 --- a/pyopencl/array.py +++ b/pyopencl/array.py @@ -176,7 +176,8 @@ def elwise_kernel_runner(kernel_getter): if not arg.flags.forc: raise RuntimeError("only contiguous arrays may " "be used as arguments to this operation") - actual_args.append(arg.data) + actual_args.append(arg.base_data) + actual_args.append(arg.offset) wait_for.extend(arg.events) else: actual_args.append(arg) @@ -220,7 +221,7 @@ class ArrayHasOffsetError(ValueError): .. versionadded:: 2013.1 """ - def __init__(self, val="The operation you are attempting does not (yet?) " + def __init__(self, val="The operation you are attempting does not yet " "support arrays that start at an offset from the beginning " "of their buffer."): ValueError.__init__(self, val) @@ -1477,10 +1478,16 @@ def multi_take_put(arrays, dest_indices, src_indices, dest_shape=None, cl.kernel_work_group_info.WORK_GROUP_SIZE, queue.device)) + from pytools import flatten knl(queue, gs, ls, *([o.data for o in out[chunk_slice]] - + [dest_indices.data, src_indices.data] - + [i.data for i in arrays[chunk_slice]] + + [dest_indices.base_data, + dest_indices.offset, + src_indices.base_data, + src_indices.offset] + + list(flatten( + (i.base_data, i.offset) + for i in arrays[chunk_slice])) + src_offsets_list[chunk_slice] + [src_indices.size])) diff --git a/pyopencl/compyte b/pyopencl/compyte index fc06f44a..479c0c6e 160000 --- a/pyopencl/compyte +++ b/pyopencl/compyte @@ -1 +1 @@ -Subproject commit fc06f44a14dc9658a460d001a4b4a5c8fd05b993 +Subproject commit 479c0c6e1daed38ca9153b1fd42e1e9fffb0a256 diff --git a/pyopencl/elementwise.py b/pyopencl/elementwise.py index 896b2c3e..22535990 100644 --- a/pyopencl/elementwise.py +++ b/pyopencl/elementwise.py @@ -112,8 +112,8 @@ def get_elwise_kernel_and_types(context, arguments, operation, name="elwise_kernel", options=[], preamble="", use_range=False, **kwargs): - from pyopencl.tools import parse_arg_list - parsed_args = parse_arg_list(arguments) + from pyopencl.tools import parse_arg_list, get_arg_offset_adjuster_code + parsed_args = parse_arg_list(arguments, with_offset=True) auto_preamble = kwargs.pop("auto_preamble", True) @@ -147,10 +147,12 @@ def get_elwise_kernel_and_types(context, arguments, operation, else: parsed_args.append(ScalarArg(np.intp, "n")) + loop_prep = kwargs.pop("loop_prep", "") + loop_prep = get_arg_offset_adjuster_code(parsed_args) + loop_prep prg = get_elwise_program( context, parsed_args, operation, name=name, options=options, preamble=preamble, - use_range=use_range, **kwargs) + use_range=use_range, loop_prep=loop_prep, **kwargs) from pyopencl.tools import get_arg_list_scalar_arg_dtypes @@ -215,6 +217,15 @@ class ElementwiseKernel: name=self.name, options=self.options, use_range=use_range, **self.kwargs) + for arg in arg_descrs: + if isinstance(arg, VectorArg) and not arg.with_offset: + from warnings import warn + warn("ElementwiseKernel '%s' used with VectorArgs that do not " + "have offset support enabled. This usage is deprecated. " + "Just pass with_offset=True to VectorArg, everything should " + "sort itself out automatically." % self.name, + DeprecationWarning) + if not [i for i, arg in enumerate(arg_descrs) if isinstance(arg, VectorArg)]: raise RuntimeError( @@ -244,7 +255,9 @@ class ElementwiseKernel: if repr_vec is None: repr_vec = arg - invocation_args.append(arg.data) + invocation_args.append(arg.base_data) + if arg_descr.with_offset: + invocation_args.append(arg.offset) else: invocation_args.append(arg) @@ -319,7 +332,7 @@ class ElementwiseTemplate(KernelTemplateBase): type_aliases, var_values, context, options) arg_list = renderer.render_argument_list( - self.arguments, more_arguments) + self.arguments, more_arguments, with_offset=True) type_decl_preamble = renderer.get_type_decl_preamble( context.devices[0], declare_types, arg_list) @@ -344,11 +357,11 @@ def get_take_kernel(context, dtype, idx_dtype, vec_count=1): "tp": dtype_to_ctype(dtype), } - args = ([VectorArg(dtype, "dest" + str(i)) + args = ([VectorArg(dtype, "dest" + str(i), with_offset=True) for i in range(vec_count)] - + [VectorArg(dtype, "src" + str(i)) + + [VectorArg(dtype, "src" + str(i), with_offset=True) for i in range(vec_count)] - + [VectorArg(idx_dtype, "idx")]) + + [VectorArg(idx_dtype, "idx", with_offset=True)]) body = ( ("%(idx_tp)s src_idx = idx[i];\n" % ctx) + "\n".join( @@ -369,10 +382,10 @@ def get_take_put_kernel(context, dtype, idx_dtype, with_offsets, vec_count=1): VectorArg(dtype, "dest%d" % i) for i in range(vec_count) ] + [ - VectorArg(idx_dtype, "gmem_dest_idx"), - VectorArg(idx_dtype, "gmem_src_idx"), + VectorArg(idx_dtype, "gmem_dest_idx", with_offset=True), + VectorArg(idx_dtype, "gmem_src_idx", with_offset=True), ] + [ - VectorArg(dtype, "src%d" % i) + VectorArg(dtype, "src%d" % i, with_offset=True) for i in range(vec_count) ] + [ ScalarArg(idx_dtype, "offset%d" % i) @@ -404,12 +417,12 @@ def get_put_kernel(context, dtype, idx_dtype, vec_count=1): } args = [ - VectorArg(dtype, "dest%d" % i) + VectorArg(dtype, "dest%d" % i, with_offset=True) for i in range(vec_count) ] + [ - VectorArg(idx_dtype, "gmem_dest_idx"), + VectorArg(idx_dtype, "gmem_dest_idx", with_offset=True), ] + [ - VectorArg(dtype, "src%d" % i) + VectorArg(dtype, "src%d" % i, with_offset=True) for i in range(vec_count) ] @@ -458,18 +471,18 @@ def get_linear_combination_kernel(summand_descriptors, preamble.append( "texture <%s, 1, cudaReadModeElementType> tex_a%d;" % (dtype_to_ctype(scalar_dtype, with_fp_tex_hack=True), i)) - args.append(VectorArg(vector_dtype, "x%d" % i)) + args.append(VectorArg(vector_dtype, "x%d" % i, with_offset=True)) tex_names.append("tex_a%d" % i) loop_prep.append( "%s a%d = fp_tex1Dfetch(tex_a%d, 0)" % (dtype_to_ctype(scalar_dtype), i, i)) else: args.append(ScalarArg(scalar_dtype, "a%d" % i)) - args.append(VectorArg(vector_dtype, "x%d" % i)) + args.append(VectorArg(vector_dtype, "x%d" % i, with_offset=True)) summands.append("a%d*x%d[i]" % (i, i)) - args.append(VectorArg(dtype_z, "z")) + args.append(VectorArg(dtype_z, "z", with_offset=True)) args.append(ScalarArg(np.uintp, "n")) mod = get_elwise_module(args, @@ -831,10 +844,10 @@ def get_unary_func_kernel(context, func_name, in_dtype, out_dtype=None): @context_dependent_memoize def get_if_positive_kernel(context, crit_dtype, dtype): return get_elwise_kernel(context, [ - VectorArg(dtype, "result"), - VectorArg(crit_dtype, "crit"), - VectorArg(dtype, "then_"), - VectorArg(dtype, "else_"), + VectorArg(dtype, "result", with_offset=True), + VectorArg(crit_dtype, "crit", with_offset=True), + VectorArg(dtype, "then_", with_offset=True), + VectorArg(dtype, "else_", with_offset=True), ], "result[i] = crit[i] > 0 ? then_[i] : else_[i]", name="if_positive") diff --git a/pyopencl/tools.py b/pyopencl/tools.py index bc935b2a..47c00140 100644 --- a/pyopencl/tools.py +++ b/pyopencl/tools.py @@ -313,8 +313,19 @@ class DtypedArgument(Argument): class VectorArg(DtypedArgument): + def __init__(self, dtype, name, with_offset=False): + DtypedArgument.__init__(self, dtype, name) + self.with_offset = with_offset + def declarator(self): - return "__global %s *%s" % (dtype_to_ctype(self.dtype), self.name) + if self.with_offset: + # Two underscores -> less likelihood of a name clash. + return "__global %s *%s__base, long %s__offset" % ( + dtype_to_ctype(self.dtype), self.name, self.name) + else: + result = "__global %s *%s" % (dtype_to_ctype(self.dtype), self.name) + + return result class ScalarArg(DtypedArgument): @@ -331,7 +342,7 @@ class OtherArg(Argument): return self.decl -def parse_c_arg(c_arg): +def parse_c_arg(c_arg, with_offset=False): for aspace in ["__local", "__constant"]: if aspace in c_arg: raise RuntimeError("cannot deal with local or constant " @@ -339,11 +350,17 @@ def parse_c_arg(c_arg): c_arg = c_arg.replace("__global", "") + if with_offset: + vec_arg_factory = lambda dtype, name: \ + VectorArg(dtype, name, with_offset=True) + else: + vec_arg_factory = VectorArg + from pyopencl.compyte.dtypes import parse_c_arg_backend - return parse_c_arg_backend(c_arg, ScalarArg, VectorArg) + return parse_c_arg_backend(c_arg, ScalarArg, vec_arg_factory) -def parse_arg_list(arguments): +def parse_arg_list(arguments, with_offset=False): """Parse a list of kernel arguments. *arguments* may be a comma-separate list of C declarators in a string, a list of strings representing C declarators, or :class:`Argument` objects. @@ -355,7 +372,7 @@ def parse_arg_list(arguments): def parse_single_arg(obj): if isinstance(obj, str): from pyopencl.tools import parse_c_arg - return parse_c_arg(obj) + return parse_c_arg(obj, with_offset=with_offset) else: return obj @@ -365,15 +382,34 @@ def parse_arg_list(arguments): def get_arg_list_scalar_arg_dtypes(arg_types): result = [] - from pyopencl.tools import ScalarArg for arg_type in arg_types: if isinstance(arg_type, ScalarArg): result.append(arg_type.dtype) - else: + elif isinstance(arg_type, VectorArg): result.append(None) + if arg_type.with_offset: + result.append(np.int64) + else: + raise RuntimeError("arg type not understood: %s" % type(arg_type)) return result + +def get_arg_offset_adjuster_code(arg_types): + result = [] + + for arg_type in arg_types: + if isinstance(arg_type, VectorArg) and arg_type.with_offset: + result.append("__global %(type)s *%(name)s = " + "(__global %(type)s *) " + "((__global char *) %(name)s__base + %(name)s__offset);" + % dict( + type=dtype_to_ctype(arg_type.dtype), + name=arg_type.name)) + + return "\n".join(result) + + # }}} @@ -663,9 +699,19 @@ class _MakoTextTemplate: class _ArgumentPlaceholder: - def __init__(self, typename, name): + """A placeholder for subclasses of :class:`DtypedArgument`. This is needed + because the concrete dtype of the argument is not known at template + creation time--it may be a type alias that will only be filled in + at run time. These types take the place of these proto-arguments until + all types are known. + + See also :class:`_TemplateRenderer.render_arg`. + """ + + def __init__(self, typename, name, **extra_kwargs): self.typename = typename self.name = name + self.extra_kwargs = extra_kwargs class _VectorArgPlaceholder(_ArgumentPlaceholder): @@ -721,11 +767,16 @@ class _TemplateRenderer(object): def render_arg(self, arg_placeholder): return arg_placeholder.target_class( self.parse_type(arg_placeholder.typename), - arg_placeholder.name) + arg_placeholder.name, + **arg_placeholder.extra_kwargs) _C_COMMENT_FINDER = re.compile(r"/\*.*?\*/") - def render_argument_list(self, *arg_lists): + def render_argument_list(self, *arg_lists, **kwargs): + with_offset = kwargs.pop("with_offset", False) + if kwargs: + raise TypeError("unrecognized kwargs: " + ", ".join(kwargs)) + all_args = [] for arg_list in arg_lists: @@ -740,6 +791,12 @@ class _TemplateRenderer(object): else: all_args.extend(arg_list) + if with_offset: + vec_arg_factory = lambda typename, name: \ + _VectorArgPlaceholder(typename, name, with_offset=True) + else: + vec_arg_factory = _VectorArgPlaceholder + from pyopencl.compyte.dtypes import parse_c_arg_backend parsed_args = [] for arg in all_args: @@ -749,7 +806,7 @@ class _TemplateRenderer(object): continue ph = parse_c_arg_backend(arg, - _ScalarArgPlaceholder, _VectorArgPlaceholder, + _ScalarArgPlaceholder, vec_arg_factory, name_to_dtype=lambda x: x) parsed_arg = self.render_arg(ph) diff --git a/test/test_array.py b/test/test_array.py index b292b132..05b6d8c8 100644 --- a/test/test_array.py +++ b/test/test_array.py @@ -30,13 +30,11 @@ import pytools.test import pyopencl as cl import pyopencl.array as cl_array import pyopencl.tools as cl_tools -from pyopencl.tools import pytest_generate_tests_for_pyopencl \ - as pytest_generate_tests +from pyopencl.tools import ( # noqa + pytest_generate_tests_for_pyopencl as pytest_generate_tests) from pyopencl.characterize import has_double_support - - # {{{ helpers TO_REAL = { @@ -44,6 +42,7 @@ TO_REAL = { np.dtype(np.complex128): np.float64 } + def general_clrand(queue, shape, dtype): from pyopencl.clrandom import rand as clrand @@ -69,6 +68,7 @@ def make_random_array(queue, dtype, size): # }}} + # {{{ dtype-related @pytools.test.mark_test.opencl @@ -80,13 +80,14 @@ def test_basic_complex(ctx_factory): size = 500 - ary = (rand(queue, shape=(size,), dtype=np.float32).astype(np.complex64) + ary = (rand(queue, shape=(size,), dtype=np.float32).astype(np.complex64) + rand(queue, shape=(size,), dtype=np.float32).astype(np.complex64) * 1j) c = np.complex64(5+7j) host_ary = ary.get() assert la.norm((ary*c).get() - c*host_ary) < 1e-5 * la.norm(host_ary) + @pytools.test.mark_test.opencl def test_mix_complex(ctx_factory): context = ctx_factory() @@ -157,6 +158,7 @@ def test_mix_complex(ctx_factory): assert correct + @pytools.test.mark_test.opencl def test_pow_neg1_vs_inv(ctx_factory): ctx = ctx_factory() @@ -189,14 +191,20 @@ def test_vector_fill(ctx_factory): a_gpu = cl_array.zeros(queue, 100, dtype=cl_array.vec.float4) + @pytools.test.mark_test.opencl def test_absrealimag(ctx_factory): context = ctx_factory() queue = cl.CommandQueue(context) - def real(x): return x.real - def imag(x): return x.imag - def conj(x): return x.conj() + def real(x): + return x.real + + def imag(x): + return x.imag + + def conj(x): + return x.conj() n = 111 for func in [abs, real, imag, conj]: @@ -216,7 +224,8 @@ def test_absrealimag(ctx_factory): # }}} -# {{{ operands + +# {{{ operators @pytools.test.mark_test.opencl def test_pow_array(ctx_factory): @@ -254,9 +263,9 @@ def test_multiply(ctx_factory): for sz in [10, 50000]: for dtype, scalars in [ - (np.float32, [2]), - (np.complex64, [2j]), - ]: + (np.float32, [2]), + (np.complex64, [2j]), + ]: for scalar in scalars: a_gpu = make_random_array(queue, dtype, sz) a = a_gpu.get() @@ -390,6 +399,7 @@ def test_divide_array(ctx_factory): # }}} + # {{{ RNG @pytools.test.mark_test.opencl @@ -433,6 +443,7 @@ def test_random(ctx_factory): # }}} + # {{{ misc @pytools.test.mark_test.opencl @@ -443,6 +454,7 @@ def test_numpy_integer_shape(ctx_factory): cl_array.empty(queue, np.int32(17), np.float32) cl_array.empty(queue, (np.int32(17), np.int32(17)), np.float32) + @pytools.test.mark_test.opencl def test_len(ctx_factory): context = ctx_factory() @@ -452,6 +464,7 @@ def test_len(ctx_factory): a_cpu = cl_array.to_device(queue, a) assert len(a_cpu) == 10 + @pytools.test.mark_test.opencl def test_stride_preservation(ctx_factory): context = ctx_factory() @@ -464,6 +477,7 @@ def test_stride_preservation(ctx_factory): print(AT_GPU.flags.f_contiguous, AT_GPU.flags.c_contiguous) assert np.allclose(AT_GPU.get(), AT) + @pytools.test.mark_test.opencl def test_nan_arithmetic(ctx_factory): context = ctx_factory() @@ -489,6 +503,7 @@ def test_nan_arithmetic(ctx_factory): assert (np.isnan(ab) == np.isnan(ab_gpu)).all() + @pytools.test.mark_test.opencl def test_mem_pool_with_arrays(ctx_factory): context = ctx_factory() @@ -501,6 +516,7 @@ def test_mem_pool_with_arrays(ctx_factory): assert a_dev.allocator is mem_pool assert b_dev.allocator is mem_pool + @pytools.test.mark_test.opencl def test_view(ctx_factory): context = ctx_factory() @@ -523,37 +539,32 @@ def test_view(ctx_factory): # }}} + @pytools.test.mark_test.opencl -def no_test_slice(ctx_factory): +def test_slice(ctx_factory): context = ctx_factory() queue = cl.CommandQueue(context) from pyopencl.clrandom import rand as clrand l = 20000 - a_gpu = clrand(queue, (l,)) + a_gpu = clrand(queue, (l,), dtype=np.float32) a = a_gpu.get() from random import randrange - for i in range(200): + for i in range(20): start = randrange(l) end = randrange(start, l) - a_gpu_slice = a_gpu[start:end] - a_slice = a[start:end] + a_gpu_slice = 2*a_gpu[start:end] + a_slice = 2*a[start:end] assert la.norm(a_gpu_slice.get() - a_slice) == 0 - - - if __name__ == "__main__": # make sure that import failures get reported, instead of skipping the # tests. - import pyopencl as cl - - import sys if len(sys.argv) > 1: exec(sys.argv[1]) else: -- GitLab