diff --git a/pyopencl/reduction.py b/pyopencl/reduction.py index ac07fbd4e3d324e66132b5e1dc8f98e9bf36cce3..9a105d1da3c5c7dbc84e64efc8e1ec021d7cc5e9 100644 --- a/pyopencl/reduction.py +++ b/pyopencl/reduction.py @@ -35,7 +35,7 @@ import pyopencl as cl from pyopencl.tools import ( context_dependent_memoize, dtype_to_ctype, KernelTemplateBase, - _process_code_for_macro, VectorArg) + _process_code_for_macro) import numpy as np @@ -61,6 +61,8 @@ KERNEL = """//CL// __global out_type *out, ${arguments}, unsigned int seq_count, unsigned int n) { + ${arg_prep} + __local out_type ldata[GROUP_SIZE]; unsigned int lid = get_local_id(0); @@ -140,7 +142,7 @@ KERNEL = """//CL// def _get_reduction_source( ctx, out_type, out_type_size, neutral, reduce_expr, map_expr, parsed_args, - name="reduce_kernel", preamble="", + name="reduce_kernel", preamble="", arg_prep="", device=None, max_group_size=None): if device is not None: @@ -203,6 +205,7 @@ def _get_reduction_source( map_expr=_process_code_for_macro(map_expr), name=name, preamble=preamble, + arg_prep=arg_prep, double_support=all(has_double_support(dev) for dev in devices), )) @@ -222,6 +225,7 @@ def get_reduction_kernel(stage, neutral, reduce_expr, map_expr=None, arguments=None, name="reduce_kernel", preamble="", device=None, options=[], max_group_size=None): + if map_expr is None: if stage == 2: map_expr = "pyopencl_reduction_inp[i]" @@ -229,12 +233,16 @@ def get_reduction_kernel(stage, map_expr = "in[i]" from pyopencl.tools import ( - parse_arg_list, get_arg_list_scalar_arg_dtypes) + parse_arg_list, get_arg_list_scalar_arg_dtypes, + get_arg_offset_adjuster_code, VectorArg) - if arguments is not None: - arguments = parse_arg_list(arguments) + arg_prep = "" + if stage == 1 and arguments is not None: + arguments = parse_arg_list(arguments, with_offset=True) + arg_prep = get_arg_offset_adjuster_code(arguments) if stage == 2 and arguments is not None: + arguments = parse_arg_list(arguments) arguments = ( [VectorArg(dtype_out, "pyopencl_reduction_inp")] + arguments) @@ -242,7 +250,7 @@ def get_reduction_kernel(stage, inf = _get_reduction_source( ctx, dtype_to_ctype(dtype_out), dtype_out.itemsize, neutral, reduce_expr, map_expr, arguments, - name, preamble, device, max_group_size) + name, preamble, arg_prep, device, max_group_size) inf.program = cl.Program(ctx, inf.source) inf.program.build(options) @@ -298,6 +306,7 @@ class ReductionKernel: max_group_size=max_group_size) from pytools import any + from pyopencl.tools import VectorArg assert any( isinstance(arg_tp, VectorArg) for arg_tp in self.stage_1_inf.arg_types), \ @@ -325,6 +334,7 @@ class ReductionKernel: invocation_args = [] vectors = [] + from pyopencl.tools import VectorArg for arg, arg_tp in zip(args, stage_inf.arg_types): if isinstance(arg_tp, VectorArg): if not arg.flags.forc: @@ -332,7 +342,9 @@ class ReductionKernel: "deal with non-contiguous arrays") vectors.append(arg) - invocation_args.append(arg.data) + invocation_args.append(arg.base_data) + if arg_tp.with_offset: + invocation_args.append(arg.offset) else: invocation_args.append(arg)