diff --git a/pyopencl/array.py b/pyopencl/array.py index c6d19cc06b50d5724784714ca85cab0c9557dad5..761226253018b1bf70347da7c188b02c861e8970 100644 --- a/pyopencl/array.py +++ b/pyopencl/array.py @@ -1175,7 +1175,7 @@ def multi_put(arrays, dest_indices, dest_shape=None, out=None, queue=None): raise ValueError("out and arrays must have the same length") if len(dest_indices.shape) != 1: - raise ValueError("src_indices must be 1D") + raise ValueError("dest_indices must be 1D") chunk_size = _builtin_min(vec_count, 10) diff --git a/pyopencl/elementwise.py b/pyopencl/elementwise.py index 694a735c4d70a8058dc0d6663a1113e9aeecd775..947df7cb8e9d0dc769920dfb30db223cac2669ce 100644 --- a/pyopencl/elementwise.py +++ b/pyopencl/elementwise.py @@ -312,7 +312,7 @@ class ElementwiseTemplate(KernelTemplateBase): self.name = name self.preamble = preamble - def build_inner(self, context, type_aliases, var_values, + def build_inner(self, context, type_aliases=(), var_values=(), more_preamble="", more_arguments=(), declare_types=(), options=()): renderer = self.get_renderer(type_aliases, var_values, context, options) diff --git a/pyopencl/reduction.py b/pyopencl/reduction.py index c2f12cea44dbc5ef44b9b9fe1a5d4d8164121788..abf3c354b2402a4280d44fd7d5fce29975a8505f 100644 --- a/pyopencl/reduction.py +++ b/pyopencl/reduction.py @@ -36,13 +36,15 @@ None of the original source code remains. import pyopencl as cl from pyopencl.tools import ( context_dependent_memoize, - dtype_to_ctype) + dtype_to_ctype, KernelTemplateBase, + _process_code_for_macro) import numpy as np -import pyopencl._mymako as mako +# {{{ kernel source + KERNEL = """//CL// #define GROUP_SIZE ${group_size} #define READ_AND_MAP(i) (${map_expr}) @@ -134,8 +136,9 @@ KERNEL = """//CL// } """ +# }}} - +# {{{ internal codegen frontends def _get_reduction_source( ctx, out_type, out_type_size, @@ -199,8 +202,8 @@ def _get_reduction_source( group_size=group_size, no_sync_size=no_sync_size, neutral=neutral, - reduce_expr=reduce_expr, - map_expr=map_expr, + reduce_expr=_process_code_for_macro(reduce_expr), + map_expr=_process_code_for_macro(map_expr), name=name, preamble=preamble, double_support=all(has_double_support(dev) for dev in devices), @@ -219,7 +222,7 @@ def _get_reduction_source( def get_reduction_kernel(stage, - ctx, out_type, out_type_size, + ctx, dtype_out, neutral, reduce_expr, map_expr=None, arguments=None, name="reduce_kernel", preamble="", device=None, options=[], max_group_size=None): @@ -229,26 +232,26 @@ def get_reduction_kernel(stage, else: map_expr = "in[i]" - if stage == 2: - in_arg = "const %s *pyopencl_reduction_inp" % out_type - if arguments: - arguments = in_arg + ", " + arguments - else: - arguments = in_arg + from pyopencl.tools import ( + parse_arg_list, get_arg_list_scalar_arg_dtypes, + VectorArg) - from pyopencl.tools import parse_arg_list, get_arg_list_scalar_arg_dtypes - parsed_args = parse_arg_list(arguments) + if arguments is not None: + arguments = parse_arg_list(arguments) + + if stage == 2 and arguments is not None: + arguments = [VectorArg(dtype_out, "pyopencl_reduction_inp")] + arguments inf = _get_reduction_source( - ctx, out_type, out_type_size, - neutral, reduce_expr, map_expr, parsed_args, + ctx, dtype_to_ctype(dtype_out), dtype_out.itemsize, + neutral, reduce_expr, map_expr, arguments, name, preamble, device, max_group_size) inf.program = cl.Program(ctx, inf.source) inf.program.build(options) inf.kernel = getattr(inf.program, name) - inf.arg_types = parsed_args + inf.arg_types = arguments inf.kernel.set_scalar_arg_dtypes( [None] @@ -257,8 +260,9 @@ def get_reduction_kernel(stage, return inf +# }}} - +# {{{ main reduction kernel class ReductionKernel: def __init__(self, ctx, dtype_out, @@ -272,7 +276,7 @@ class ReductionKernel: while True: self.stage_1_inf = get_reduction_kernel(1, ctx, - dtype_to_ctype(dtype_out), dtype_out.itemsize, + dtype_out, neutral, reduce_expr, map_expr, arguments, name=name+"_stage1", options=options, preamble=preamble, max_group_size=max_group_size) @@ -290,7 +294,7 @@ class ReductionKernel: assert trip_count <= 2 self.stage_2_inf = get_reduction_kernel(2, ctx, - dtype_to_ctype(dtype_out), dtype_out.itemsize, + dtype_out, neutral, reduce_expr, arguments=arguments, name=name+"_stage2", options=options, preamble=preamble, max_group_size=max_group_size) @@ -372,8 +376,47 @@ class ReductionKernel: stage_inf = self.stage_2_inf args = (result,) + stage1_args +# }}} + +# {{{ template +class ReductionTemplate(KernelTemplateBase): + def __init__(self, + arguments, neutral, reduce_expr, map_expr=None, + is_segment_start_expr=None, input_fetch_exprs=[], + name_prefix="reduce", preamble="", template_processor=None): + KernelTemplateBase.__init__(self, template_processor=template_processor) + self.arguments = arguments + self.reduce_expr = reduce_expr + self.neutral = neutral + self.map_expr = map_expr + self.name_prefix = name_prefix + self.preamble = preamble + + def build_inner(self, context, type_aliases=(), var_values=(), + more_preamble="", more_arguments=(), declare_types=(), + options=(), devices=None): + renderer = self.get_renderer(type_aliases, var_values, context, options) + + arg_list = renderer.render_argument_list(self.arguments, more_arguments) + + type_decl_preamble = renderer.get_type_decl_preamble( + context.devices[0], declare_types, arg_list) + + return ReductionKernel(context, renderer.type_aliases["reduction_t"], + renderer(self.neutral), renderer(self.reduce_expr), + renderer(self.map_expr), + renderer.render_argument_list(self.arguments, more_arguments), + name=renderer(self.name_prefix), options=list(options), + preamble=( + type_decl_preamble + + "\n" + + renderer(self.preamble + "\n" + more_preamble))) + +# }}} + +# {{{ array reduction kernel getters @context_dependent_memoize def get_sum_kernel(ctx, dtype_out, dtype_in): @@ -538,4 +581,6 @@ def get_subset_minmax_kernel(ctx, what, dtype, dtype_subset): "tp_lut": dtype_to_ctype(dtype_subset), }, preamble="#define MY_INFINITY (1./0)") +# }}} + # vim: filetype=pyopencl:fdm=marker diff --git a/pyopencl/scan.py b/pyopencl/scan.py index d0781b313143c2742184ab61f67b2016f6e7b4f6..0d0d5ef7d45637f27463503e7c80648da2d97d4b 100644 --- a/pyopencl/scan.py +++ b/pyopencl/scan.py @@ -1542,7 +1542,7 @@ class ScanTemplate(KernelTemplateBase): self.name_prefix = name_prefix self.preamble = preamble - def build_inner(self, context, type_aliases, var_values, + def build_inner(self, context, type_aliases=(), var_values=(), more_preamble="", more_arguments=(), declare_types=(), options=(), devices=None, scan_cls=GenericScanKernel): renderer = self.get_renderer(type_aliases, var_values, context, options)