From eca4cfbfe756c0c02b211829a40b4903cfdb0b14 Mon Sep 17 00:00:00 2001 From: Andreas Kloeckner Date: Sun, 22 May 2016 22:46:36 +0200 Subject: [PATCH] Add range, slice, and data-less reductions --- doc/misc.rst | 2 + pyopencl/array.py | 20 +++--- pyopencl/reduction.py | 141 ++++++++++++++++++++++++++++++----------- test/test_algorithm.py | 30 ++++++++- 4 files changed, 144 insertions(+), 49 deletions(-) diff --git a/doc/misc.rst b/doc/misc.rst index eef15c1c..610152c2 100644 --- a/doc/misc.rst +++ b/doc/misc.rst @@ -121,6 +121,8 @@ Version 2016.2 * Deprecate RANLUXCL. It will be removed in the 2018.x series of PyOpenCL. * Introduce Random123 random number generators. See :mod:`pyopencl.clrandom` for more information. +* Add support for **range** and **slice** kwargs and data-less reductions + to :class:`pyopencl.reduction.ReductionKernel`. Version 2016.1 -------------- diff --git a/pyopencl/array.py b/pyopencl/array.py index 996fd19a..905a9610 100644 --- a/pyopencl/array.py +++ b/pyopencl/array.py @@ -2299,25 +2299,25 @@ _builtin_min = min _builtin_max = max -def sum(a, dtype=None, queue=None): +def sum(a, dtype=None, queue=None, slice=None): """ .. versionadded:: 2011.1 """ from pyopencl.reduction import get_sum_kernel krnl = get_sum_kernel(a.context, dtype, a.dtype) - return krnl(a, queue=queue) + return krnl(a, queue=queue, slice=slice) -def dot(a, b, dtype=None, queue=None): +def dot(a, b, dtype=None, queue=None, slice=None): """ .. versionadded:: 2011.1 """ from pyopencl.reduction import get_dot_kernel krnl = get_dot_kernel(a.context, dtype, a.dtype, b.dtype) - return krnl(a, b, queue=queue) + return krnl(a, b, queue=queue, slice=slice) -def vdot(a, b, dtype=None, queue=None): +def vdot(a, b, dtype=None, queue=None, slice=None): """Like :func:`numpy.vdot`. .. versionadded:: 2013.1 @@ -2325,17 +2325,17 @@ def vdot(a, b, dtype=None, queue=None): from pyopencl.reduction import get_dot_kernel krnl = get_dot_kernel(a.context, dtype, a.dtype, b.dtype, conjugate_first=True) - return krnl(a, b, queue=queue) + return krnl(a, b, queue=queue, slice=slice) -def subset_dot(subset, a, b, dtype=None, queue=None): +def subset_dot(subset, a, b, dtype=None, queue=None, slice=None): """ .. versionadded:: 2011.1 """ from pyopencl.reduction import get_subset_dot_kernel krnl = get_subset_dot_kernel( a.context, dtype, subset.dtype, a.dtype, b.dtype) - return krnl(subset, a, b, queue=queue) + return krnl(subset, a, b, queue=queue, slice=slice) def _make_minmax_kernel(what): @@ -2358,10 +2358,10 @@ max.__doc__ = """ def _make_subset_minmax_kernel(what): - def f(subset, a, queue=None): + def f(subset, a, queue=None, slice=None): from pyopencl.reduction import get_subset_minmax_kernel krnl = get_subset_minmax_kernel(a.context, what, a.dtype, subset.dtype) - return krnl(subset, a, queue=queue) + return krnl(subset, a, queue=queue, slice=slice) return f diff --git a/pyopencl/reduction.py b/pyopencl/reduction.py index 689763f1..3284d75c 100644 --- a/pyopencl/reduction.py +++ b/pyopencl/reduction.py @@ -44,9 +44,9 @@ import numpy as np # {{{ kernel source KERNEL = r"""//CL// - #define GROUP_SIZE ${group_size} - #define READ_AND_MAP(i) (${map_expr}) - #define REDUCE(a, b) (${reduce_expr}) + #define PCL_GROUP_SIZE ${group_size} + #define PCL_READ_AND_MAP(i) (${map_expr}) + #define PCL_REDUCE(a, b) (${reduce_expr}) % if double_support: #if __OPENCL_C_VERSION__ < 120 @@ -59,33 +59,37 @@ KERNEL = r"""//CL// ${preamble} - typedef ${out_type} out_type; + typedef ${out_type} pcl_out_type; __kernel void ${name}( - __global out_type *out__base, long out__offset, ${arguments}, - unsigned int seq_count, unsigned int n) + __global pcl_out_type *pcl_out__base, long pcl_out__offset, + ${arguments} + long pcl_start, long pcl_step, long pcl_stop, + unsigned int pcl_seq_count, long n) { - __global out_type *out = (__global out_type *) ( - (__global char *) out__base + out__offset); + __global pcl_out_type *pcl_out = (__global pcl_out_type *) ( + (__global char *) pcl_out__base + pcl_out__offset); ${arg_prep} - __local out_type ldata[GROUP_SIZE]; + __local pcl_out_type pcl_ldata[PCL_GROUP_SIZE]; - unsigned int lid = get_local_id(0); + unsigned int pcl_lid = get_local_id(0); - unsigned int i = get_group_id(0)*GROUP_SIZE*seq_count + lid; + const long pcl_base_idx = + get_group_id(0)*PCL_GROUP_SIZE*pcl_seq_count + pcl_lid; + long i = pcl_start + pcl_base_idx * pcl_step; - out_type acc = ${neutral}; - for (unsigned s = 0; s < seq_count; ++s) + pcl_out_type pcl_acc = ${neutral}; + for (unsigned pcl_s = 0; pcl_s < pcl_seq_count; ++pcl_s) { - if (i >= n) + if (i >= pcl_stop) break; - acc = REDUCE(acc, READ_AND_MAP(i)); + pcl_acc = PCL_REDUCE(pcl_acc, PCL_READ_AND_MAP(i)); - i += GROUP_SIZE; + i += PCL_GROUP_SIZE*pcl_step; } - ldata[lid] = acc; + pcl_ldata[pcl_lid] = pcl_acc; <% cur_size = group_size @@ -99,18 +103,18 @@ KERNEL = r"""//CL// assert new_size * 2 == cur_size %> - if (lid < ${new_size}) + if (pcl_lid < ${new_size}) { - ldata[lid] = REDUCE( - ldata[lid], - ldata[lid + ${new_size}]); + pcl_ldata[pcl_lid] = PCL_REDUCE( + pcl_ldata[pcl_lid], + pcl_ldata[pcl_lid + ${new_size}]); } <% cur_size = new_size %> % endwhile - if (lid == 0) out[get_group_id(0)] = ldata[0]; + if (pcl_lid == 0) pcl_out[get_group_id(0)] = pcl_ldata[0]; } """ @@ -157,10 +161,15 @@ def _get_reduction_source( from mako.template import Template from pytools import all from pyopencl.characterize import has_double_support + + arguments = ", ".join(arg.declarator() for arg in parsed_args) + if parsed_args: + arguments += ", " + src = str(Template(KERNEL).render( out_type=out_type, - arguments=", ".join(arg.declarator() for arg in parsed_args), group_size=group_size, + arguments=arguments, neutral=neutral, reduce_expr=_process_code_for_macro(reduce_expr), map_expr=_process_code_for_macro(map_expr), @@ -222,7 +231,7 @@ def get_reduction_kernel(stage, inf.kernel.set_scalar_arg_dtypes( [None, np.int64] + get_arg_list_scalar_arg_dtypes(inf.arg_types) - + [np.uint32]*2) + + [np.int64]*5) return inf @@ -266,15 +275,21 @@ class ReductionKernel: name=name+"_stage2", options=options, preamble=preamble, max_group_size=max_group_size) - from pytools import any - from pyopencl.tools import VectorArg - assert any( - isinstance(arg_tp, VectorArg) - for arg_tp in self.stage_1_inf.arg_types), \ - "ReductionKernel can only be used with functions " \ - "that have at least one vector argument" - def __call__(self, *args, **kwargs): + """ + :arg range: A :class:`slice` object. Specifies the range of indices on which + the kernel will be executed. May not be given at the same time + as *slice*. + :arg slice: A :class:`slice` object. + Specifies the range of indices on which the kernel will be + executed, relative to the first vector-like argument. + May not be given at the same time as *range*. + :arg allocator: + + .. versionchanged:: 2016.2 + + *range_* and *slice_* added. + """ MAX_GROUP_COUNT = 1024 # noqa SMALL_SEQ_COUNT = 4 # noqa @@ -283,10 +298,14 @@ class ReductionKernel: stage_inf = self.stage_1_inf queue = kwargs.pop("queue", None) + allocator = kwargs.pop("allocator", None) wait_for = kwargs.pop("wait_for", None) return_event = kwargs.pop("return_event", False) out = kwargs.pop("out", None) + range_ = kwargs.pop("range", None) + slice_ = kwargs.pop("slice", None) + if kwargs: raise TypeError("invalid keyword argument to reduction kernel") @@ -310,14 +329,58 @@ class ReductionKernel: else: invocation_args.append(arg) - repr_vec = vectors[0] - sz = repr_vec.size + if vectors: + repr_vec = vectors[0] + else: + repr_vec = None + + # {{{ range/slice processing + + if range_ is not None: + if slice_ is not None: + raise TypeError("may not specify both range and slice " + "keyword arguments") + + else: + if slice_ is None: + slice_ = slice(None) + + if repr_vec is None: + raise TypeError( + "must have vector argument when range is not specified") + + range_ = slice(*slice_.indices(repr_vec.size)) + + assert range_ is not None + + start = range_.start + if start is None: + start = 0 + if range_.step is None: + step = 1 + else: + step = range_.step + sz = abs(range_.stop - start)//step + + # }}} if queue is not None: use_queue = queue else: + if repr_vec is None: + raise TypeError( + "must specify queue argument when no vector argument " + "present") + use_queue = repr_vec.queue + if allocator is None: + if repr_vec is None: + from pyopencl.tools import DeferredAllocator + allocator = DeferredAllocator(queue.context) + else: + allocator = repr_vec.allocator + if sz <= stage_inf.group_size*SMALL_SEQ_COUNT*MAX_GROUP_COUNT: total_group_size = SMALL_SEQ_COUNT*stage_inf.group_size group_count = (sz + total_group_size - 1) // total_group_size @@ -327,23 +390,25 @@ class ReductionKernel: macrogroup_size = group_count*stage_inf.group_size seq_count = (sz + macrogroup_size - 1) // macrogroup_size + size_args = [start, step, range_.stop, seq_count, sz] + if group_count == 1 and out is not None: result = out elif group_count == 1: result = empty(use_queue, (), self.dtype_out, - allocator=repr_vec.allocator) + allocator=allocator) else: result = empty(use_queue, (group_count,), self.dtype_out, - allocator=repr_vec.allocator) + allocator=allocator) last_evt = stage_inf.kernel( use_queue, (group_count*stage_inf.group_size,), (stage_inf.group_size,), *([result.base_data, result.offset] - + invocation_args + [seq_count, sz]), + + invocation_args + size_args), **dict(wait_for=wait_for)) wait_for = [last_evt] @@ -356,6 +421,8 @@ class ReductionKernel: stage_inf = self.stage_2_inf args = (result,) + stage1_args + range_ = slice_ = None + # }}} diff --git a/test/test_algorithm.py b/test/test_algorithm.py index 952a38dd..ffd85676 100644 --- a/test/test_algorithm.py +++ b/test/test_algorithm.py @@ -244,11 +244,37 @@ def test_sum(ctx_factory): slice(1000, 3000), slice(1000, -3000), slice(1000, None), + slice(1000, None, 3), ]: sum_a = np.sum(a[slc]) - sum_a_gpu = cl_array.sum(a_gpu[slc]).get() - assert abs(sum_a_gpu - sum_a) / abs(sum_a) < 1e-4 + if slc.step is None: + sum_a_gpu = cl_array.sum(a_gpu[slc]).get() + assert abs(sum_a_gpu - sum_a) / abs(sum_a) < 1e-4 + + sum_a_gpu_2 = cl_array.sum(a_gpu, slice=slc).get() + assert abs(sum_a_gpu_2 - sum_a) / abs(sum_a) < 1e-4 + + +def test_sum_without_data(ctx_factory): + from pytest import importorskip + importorskip("mako") + + context = ctx_factory() + queue = cl.CommandQueue(context) + + n = 2000 + + from pyopencl.reduction import ReductionKernel + red = ReductionKernel(context, np.int32, + neutral="0", + reduce_expr="a+b", map_expr="i", + arguments=[]) + + result_dev = red(range=range(n), queue=queue).get() + result_ref = n*(n-1)//2 + + assert result_dev == result_ref def test_minmax(ctx_factory): -- GitLab