diff --git a/doc/source/array.rst b/doc/source/array.rst
index c6586eb0cc2c8f4e007df83381591fdeb7ef4242..f9b2cedbc370e6fffe2ebedad6cf4fea0abf8a95 100644
--- a/doc/source/array.rst
+++ b/doc/source/array.rst
@@ -421,7 +421,7 @@ Custom Reductions
 
 .. module:: pyopencl.reduction
 
-.. class:: ReductionKernel(ctx, dtype_out, neutral, reduce_expr, map_expr=None, arguments=None, name="reduce_kernel", options="", preamble="")
+.. class:: ReductionKernel(ctx, dtype_out, neutral, reduce_expr, map_expr=None, arguments=None, name="reduce_kernel", options=[], preamble="")
 
     Generate a kernel that takes a number of scalar or vector *arguments*
     (at least one vector argument), performs the *map_expr* on each entry of
@@ -437,15 +437,15 @@
     assumed.
 
     *dtype_out* specifies the :class:`numpy.dtype` in which the reduction is
-    performed and in which the result is returned. *neutral* is
-    specified as float or integer formatted as string. *reduce_expr* and
-    *map_expr* are specified as string formatted operations and *arguments*
-    is specified as a string formatted as a C argument list. *name* specifies
-    the name as which the kernel is compiled. *options* are passed
-    unmodified to :meth:`pyopencl.Program.build`. *preamble* is specified
-    as a string of code.
+    performed and in which the result is returned. *neutral* is specified as
+    a float or integer formatted as a string. *reduce_expr* and *map_expr* are
+    specified as string-formatted operations and *arguments* is specified as a
+    string formatted as a C argument list. *name* specifies the name under
+    which the kernel is compiled. *options* are passed unmodified to
+    :meth:`pyopencl.Program.build`. *preamble* specifies a string of code that
+    is inserted before the actual kernels.
 
-    .. method __call__(*args, queue=None)
+    .. method:: __call__(*args, queue=None)
 
     .. versionadded: 2011.1
 
@@ -460,6 +460,46 @@ Here's a usage example::
 
     my_dot_prod = krnl(a, b).get()
 
+Parallel Scan / Prefix Sum
+--------------------------
+
+.. module:: pyopencl.scan
+
+.. class:: ExclusiveScanKernel(ctx, dtype, scan_expr, neutral, name_prefix="scan", options=[], preamble="", devices=None)
+
+    Generates a kernel that can compute a `prefix sum <https://secure.wikimedia.org/wikipedia/en/wiki/Prefix_sum>`_
+    using any associative operation given as *scan_expr*.
+    *scan_expr* uses the formal values "a" and "b" to indicate two operands of
+    an associative binary operation. *neutral* is the neutral element
+    of *scan_expr*, obeying *scan_expr(a, neutral) == a*.
+
+    *dtype* specifies the type of the arrays being operated on.
+    *name_prefix* is used for kernel names to ensure recognizability
+    in profiles and logs. *options* is a list of compiler options to use
+    when building. *preamble* specifies a string of code that is
+    inserted before the actual kernels. *devices* may be used to restrict
+    the set of devices on which the kernel is meant to run (defaults
+    to all devices in the context *ctx*).
+
+    .. method:: __call__(self, input_ary, output_ary=None, allocator=None, queue=None)
+
+.. class:: InclusiveScanKernel(ctx, dtype, scan_expr, neutral=None, name_prefix="scan", options=[], preamble="", devices=None)
+
+    Works like :class:`ExclusiveScanKernel`. Unlike the exclusive case,
+    *neutral* is not required.
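+
+As a minimal sketch (assuming an existing ``context`` and ``queue``, and the
+``np``/``cl_array`` imports used in the example below), an exclusive scan
+might be invoked like this::
+
+    from pyopencl.scan import ExclusiveScanKernel
+
+    knl = ExclusiveScanKernel(context, np.int32, "a+b", "0")
+
+    host_data = np.random.randint(0, 10, 5000).astype(np.int32)
+    dev_data = cl_array.to_device(queue, host_data)
+    knl(dev_data)
+    # dev_data[i] now holds the sum of host_data[:i] (exclusive prefix sum)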
+ +Here's a usage example:: + + knl = InclusiveScanKernel(context, np.int32, "a+b") + + n = 2**20-2**18+5 + host_data = np.random.randint(0, 10, n).astype(np.int32) + dev_data = cl_array.to_device(queue, host_data) + + knl(dev_data) + assert (dev_data.get() == np.cumsum(host_data, axis=0)).all() + + Fast Fourier Transforms ----------------------- diff --git a/doc/source/misc.rst b/doc/source/misc.rst index e18d293e9860ddbc19e61f33a834807d7428b256..4cf441e74e2d748e55bd63c59fb700e82e3f8af3 100644 --- a/doc/source/misc.rst +++ b/doc/source/misc.rst @@ -86,6 +86,7 @@ Version 2011.1 :func:`pyopencl.enqueue_map_image`. * Add :mod:`pyopencl.reduction`. * Add :ref:`reductions`. +* Add :mod:`pyopencl.scan`. * Add :meth:`pyopencl.MemoryObject.get_host_array`. * Deprecate context arguments of :func:`pyopencl.array.to_device`, @@ -104,7 +105,7 @@ Version 0.92 extension, leading to working GL interoperability. * Add :meth:`pyopencl.Kernel.set_args`. * The call signature of :meth:`pyopencl.Kernel.__call__` changed to - emphasize the importance of *loccal_size*. + emphasize the importance of *local_size*. * Add :meth:`pyopencl.Kernel.set_scalar_arg_dtypes`. * Add support for the `cl_nv_device_attribute_query <http://www.khronos.org/registry/cl/extensions/khr/cl_nv_device_attribute_query.txt>`_ @@ -221,7 +222,7 @@ Licensing PyOpenCL is licensed to you under the MIT/X Consortium license: -Copyright (c) 2009 Andreas Klöckner and Contributors. +Copyright (c) 2009-11 Andreas Klöckner and Contributors. Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation @@ -244,6 +245,30 @@ WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. +PyOpenCL includes derivatives of parts of the `Thrust +<https://code.google.com/p/thrust/>`_ computing package (in particular the scan +implementation). These parts are licensed as follows: + + Copyright 2008-2011 NVIDIA Corporation + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + <http://www.apache.org/licenses/LICENSE-2.0> + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. + +.. note:: + + If you use Apache-licensed parts, be aware that these may be incompatible + with software licensed exclusively under GPL2. (Most software is licensed + as GPL2 or later, in which case this is not an issue.) + Frequently Asked Questions ========================== diff --git a/doc/source/runtime.rst b/doc/source/runtime.rst index 357740222ebcb1563d2265eecf1b0613dbfc7220..aab26ef2108c2cb6ce2060143a67a4a91acae1de 100644 --- a/doc/source/runtime.rst +++ b/doc/source/runtime.rst @@ -571,11 +571,15 @@ Programs and Kernels See :class:`program_build_info` for values of *param*. - .. method:: build(options="", devices=None) + .. method:: build(options=[], devices=None) *options* is a string of compiler flags. Returns *self*. + .. versionchanged:: 2011.1 + + *options* may now also be a :class:`list` of :class:`str`. + .. 
attribute:: kernel_name :class:`Kernel` objects can be produced from a built diff --git a/pyopencl/__init__.py b/pyopencl/__init__.py index 83f9d3a79fcdd25d3ac6d901ae5ab9b5729c3a4e..feab73477196958a5e7a9f63d3c19590a693f4a0 100644 --- a/pyopencl/__init__.py +++ b/pyopencl/__init__.py @@ -136,7 +136,10 @@ def _add_functionality(): else: return self.get_info(pi_attr) - def program_build(self, options="", devices=None): + def program_build(self, options=[], devices=None): + if isinstance(options, list): + options = " ".join(options) + try: self._build(options=options, devices=devices) except Exception, e: diff --git a/pyopencl/_cluda.py b/pyopencl/_cluda.py new file mode 100644 index 0000000000000000000000000000000000000000..957a8348bbf5b1e27f0e7ebd859a00c06e054955 --- /dev/null +++ b/pyopencl/_cluda.py @@ -0,0 +1,26 @@ +CLUDA_PREAMBLE = """ +#define local_barrier() barrier(CLK_LOCAL_MEM_FENCE); + +#define WITHIN_KERNEL /* empty */ +#define KERNEL __kernel +#define GLOBAL_MEM __global +#define LOCAL_MEM __local +#define LOCAL_MEM_ARG __local +#define REQD_WG_SIZE(X,Y,Z) __attribute__((reqd_work_group_size(X, Y, Z))) + +#define LID_0 get_local_id(0) +#define LID_1 get_local_id(1) +#define LID_2 get_local_id(2) + +#define GID_0 get_group_id(0) +#define GID_1 get_group_id(1) +#define GID_2 get_group_id(2) + +% if double_support: + #pragma OPENCL EXTENSION cl_khr_fp64: enable +% endif +""" + + + + diff --git a/pyopencl/_mymako.py b/pyopencl/_mymako.py new file mode 100644 index 0000000000000000000000000000000000000000..a4e4b46505c013d7d28058afeace5834e56f53d5 --- /dev/null +++ b/pyopencl/_mymako.py @@ -0,0 +1,14 @@ +try: + import mako.template +except ImportError: + raise ImportError( + "Some of PyOpenCL's facilities require the Mako templating engine.\n" + "You or a piece of software you have used has tried to call such a\n" + "part of PyOpenCL, but there was a problem importing Mako.\n\n" + "You may install mako now by typing one of:\n" + "- easy_install Mako\n" + "- pip install Mako\n" + "- aptitude install python-mako\n" + "\nor whatever else is appropriate for your system.") + +from mako import * diff --git a/pyopencl/characterize.py b/pyopencl/characterize.py new file mode 100644 index 0000000000000000000000000000000000000000..b323047132b5754a718bd783d127bca8d0b41192 --- /dev/null +++ b/pyopencl/characterize.py @@ -0,0 +1,5 @@ +def has_double_support(dev): + for ext in dev.extensions.split(" "): + if ext == "cl_khr_fp64": + return True + return False diff --git a/pyopencl/elementwise.py b/pyopencl/elementwise.py index f3994ed621d2fe282269a1ce16c026943f110097..6bcdc004e8ad7d4e3368d02d58228599d94b1ced 100644 --- a/pyopencl/elementwise.py +++ b/pyopencl/elementwise.py @@ -70,7 +70,7 @@ def get_elwise_program(context, arguments, operation, "after_loop": after_loop, }) - return Program(context, source).build(options=" ".join(options)) + return Program(context, source).build(options) diff --git a/pyopencl/reduction.py b/pyopencl/reduction.py index 9348253d7fefe1b657bf69c8ef9872ce075c555d..ebde742dcc9bd1f72376a0709aaabf30e6910e62 100644 --- a/pyopencl/reduction.py +++ b/pyopencl/reduction.py @@ -39,20 +39,7 @@ from pyopencl.tools import ( dtype_to_ctype) from pytools import memoize_method import numpy as np - -try: - import mako -except ImportError: - raise ImportError( - "PyOpenCL's reduction facility requires the Mako templating engine.\n" - "You or a piece of software you have used has tried to call PyOpenCL's\n" - "reduction code, but there was a problem importing Mako.\n\n" - "You may install 
mako now by typing one of:\n" - "- easy_install Mako\n" - "- pip install Mako\n" - "- aptitude install python-mako\n" - "\nor whatever else is appropriate for your system.") - +import pyopencl._mymako as mako @@ -193,7 +180,7 @@ def get_reduction_source( from mako.template import Template from pytools import all - from pyopencl.tools import has_double_support + from pyopencl.characterize import has_double_support src = str(Template(KERNEL).render( out_type=out_type, arguments=arguments, @@ -224,7 +211,7 @@ def get_reduction_kernel( ctx, out_type, out_type_size, neutral, reduce_expr, map_expr=None, arguments=None, name="reduce_kernel", preamble="", - device=None, options="", max_group_size=None): + device=None, options=[], max_group_size=None): if map_expr is None: map_expr = "in[i]" @@ -237,7 +224,7 @@ def get_reduction_kernel( name, preamble, device, max_group_size) inf.program = cl.Program(ctx, inf.source) - inf.program.build() + inf.program.build(options) inf.kernel = getattr(inf.program, name) from pyopencl.tools import parse_c_arg, ScalarArg @@ -261,7 +248,7 @@ def get_reduction_kernel( class ReductionKernel: def __init__(self, ctx, dtype_out, neutral, reduce_expr, map_expr=None, arguments=None, - name="reduce_kernel", options="", preamble=""): + name="reduce_kernel", options=[], preamble=""): dtype_out = self.dtype_out = np.dtype(dtype_out) diff --git a/pyopencl/scan.py b/pyopencl/scan.py new file mode 100644 index 0000000000000000000000000000000000000000..02679a3122f9662b67175a48db948b23329aa2a1 --- /dev/null +++ b/pyopencl/scan.py @@ -0,0 +1,616 @@ +# WARNING! +# If you update this file, make sure to also update the sister copy in +# PyCUDA or PyOpenCL--both files should always be exactly identical. + +"""Scan primitive.""" + +from __future__ import division + +__copyright__ = """ +Copyright 2011 Andreas Kloeckner +Copyright 2008-2011 NVIDIA Corporation +""" + +__license__ = """ +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+ +Derived from thrust/detail/backend/cuda/detail/fast_scan.h +within the Thrust project, https://code.google.com/p/thrust/ +""" + + + + +import numpy as np + +_CL_MODE = "pyopencl" in __name__ + +if _CL_MODE: + import pyopencl as cl + import pyopencl.array as cl_array + from pyopencl.tools import dtype_to_ctype + import pyopencl._mymako as mako + from pyopencl._cluda import CLUDA_PREAMBLE +else: + import pycuda.driver as driver + import pycuda.gpuarray as gpuarray + from pycuda.compiler import SourceModule + from pycuda.tools import dtype_to_ctype + import pycuda._mymako as mako + from pycuda._cluda import CLUDA_PREAMBLE + + + + +SHARED_PREAMBLE = CLUDA_PREAMBLE + """ +#define WG_SIZE ${wg_size} +#define SCAN_EXPR(a, b) ${scan_expr} + +${preamble} + +typedef ${scan_type} scan_type; +""" + + + + +SCAN_INTERVALS_SOURCE = mako.template.Template(SHARED_PREAMBLE + """ +#define K ${wg_seq_batches} + +<%def name="make_group_scan(name, with_bounds_check)"> + WITHIN_KERNEL + void ${name}(LOCAL_MEM_ARG scan_type *array + % if with_bounds_check: + , const unsigned n + % endif + ) + { + scan_type val = array[LID_0]; + + <% offset = 1 %> + + % while offset <= wg_size: + if (LID_0 >= ${offset} + % if with_bounds_check: + && LID_0 < n + % endif + ) + { + scan_type tmp = array[LID_0 - ${offset}]; + val = SCAN_EXPR(tmp, val); + } + + local_barrier(); + array[LID_0] = val; + local_barrier(); + + <% offset *= 2 %> + % endwhile + } +</%def> + +${make_group_scan("scan_group", False)} +${make_group_scan("scan_group_n", True)} + +KERNEL +REQD_WG_SIZE(WG_SIZE, 1, 1) +void ${name_prefix}_scan_intervals( + GLOBAL_MEM scan_type *input, + const unsigned int N, + const unsigned int interval_size, + GLOBAL_MEM scan_type *output, + GLOBAL_MEM scan_type *group_results) +{ + LOCAL_MEM scan_type sdata[K + 1][WG_SIZE + 1]; // padded to avoid bank conflicts + + local_barrier(); // TODO figure out why this seems necessary now + + const unsigned int interval_begin = interval_size * GID_0; + const unsigned int interval_end = min(interval_begin + interval_size, N); + + const unsigned int unit_size = K * WG_SIZE; + + unsigned int base = interval_begin; + + // process full units + for(; base + unit_size <= interval_end; base += unit_size) + { + // read data + for(unsigned int k = 0; k < K; k++) + { + const unsigned int offset = k*WG_SIZE + LID_0; + + GLOBAL_MEM scan_type *temp = input + (base + offset); + sdata[offset % K][offset / K] = *temp; + } + + // carry in + if (LID_0 == 0 && base != interval_begin) + sdata[0][0] = SCAN_EXPR(sdata[K][WG_SIZE - 1], sdata[0][0]); + + local_barrier(); + + // scan local values + scan_type sum = sdata[0][LID_0]; + + for(unsigned int k = 1; k < K; k++) + { + scan_type tmp = sdata[k][LID_0]; + sum = SCAN_EXPR(sum, tmp); + sdata[k][LID_0] = sum; + } + + // second level scan + sdata[K][LID_0] = sum; + local_barrier(); + scan_group(&sdata[K][0]); + + // update local values + if (LID_0 > 0) + { + sum = sdata[K][LID_0 - 1]; + + for(unsigned int k = 0; k < K; k++) + { + scan_type tmp = sdata[k][LID_0]; + sdata[k][LID_0] = SCAN_EXPR(sum, tmp); + } + } + + local_barrier(); + + // write data + for(unsigned int k = 0; k < K; k++) + { + const unsigned int offset = k*WG_SIZE + LID_0; + + GLOBAL_MEM scan_type *temp = output + (base + offset); + *temp = sdata[offset % K][offset / K]; + } + + local_barrier(); + } + + // process partially full unit at end of input (if necessary) + if (base < interval_end) + { + // read data + for(unsigned int k = 0; k < K; k++) + { + const unsigned int offset = k*WG_SIZE + 
LID_0; + + if (base + offset < interval_end) + { + GLOBAL_MEM scan_type *temp = input + (base + offset); + sdata[offset % K][offset / K] = *temp; + } + } + + // carry in + if (LID_0 == 0 && base != interval_begin) + sdata[0][0] = SCAN_EXPR(sdata[K][WG_SIZE - 1], sdata[0][0]); + + local_barrier(); + + // scan local values + scan_type sum = sdata[0][LID_0]; + + const unsigned int offset_end = interval_end - base; + + for(unsigned int k = 1; k < K; k++) + { + if (K * LID_0 + k < offset_end) + { + scan_type tmp = sdata[k][LID_0]; + sum = SCAN_EXPR(sum, tmp); + sdata[k][LID_0] = sum; + } + } + + // second level scan + sdata[K][LID_0] = sum; + local_barrier(); + scan_group_n(&sdata[K][0], offset_end / K); + + // update local values + if (LID_0 > 0) + { + sum = sdata[K][LID_0 - 1]; + + for(unsigned int k = 0; k < K; k++) + { + if (K * LID_0 + k < offset_end) + { + scan_type tmp = sdata[k][LID_0]; + sdata[k][LID_0] = SCAN_EXPR(sum, tmp); + } + } + } + + local_barrier(); + + // write data + for(unsigned int k = 0; k < K; k++) + { + const unsigned int offset = k*WG_SIZE + LID_0; + + if (base + offset < interval_end) + { + GLOBAL_MEM scan_type *temp = output + (base + offset); + *temp = sdata[offset % K][offset / K]; + } + } + + } + + local_barrier(); + + // write interval sum + if (LID_0 == 0) + { + GLOBAL_MEM scan_type *temp = output + (interval_end - 1); + group_results[GID_0] = *temp; + } +} +""") + + + + +INCLUSIVE_UPDATE_SOURCE = mako.template.Template(SHARED_PREAMBLE + """ +KERNEL +REQD_WG_SIZE(WG_SIZE, 1, 1) +void ${name_prefix}_final_update( + GLOBAL_MEM scan_type *output, + const unsigned int N, + const unsigned int interval_size, + GLOBAL_MEM scan_type *group_results) +{ + const unsigned int interval_begin = interval_size * GID_0; + const unsigned int interval_end = min(interval_begin + interval_size, N); + + if (GID_0 == 0) + return; + + // value to add to this segment + scan_type sum = group_results[GID_0 - 1]; + + // advance result pointer + output += interval_begin + LID_0; + + for(unsigned int base = interval_begin; base < interval_end; base += WG_SIZE, output += WG_SIZE) + { + const unsigned int i = base + LID_0; + + if(i < interval_end) + { + scan_type tmp = *output; + *output = SCAN_EXPR(sum, tmp); + } + + local_barrier(); + } +} +""") + + + + +EXCLUSIVE_UPDATE_SOURCE = mako.template.Template(SHARED_PREAMBLE + """ +KERNEL +REQD_WG_SIZE(WG_SIZE, 1, 1) +void ${name_prefix}_final_update( + GLOBAL_MEM scan_type *output, + const unsigned int N, + const unsigned int interval_size, + GLOBAL_MEM scan_type *group_results) +{ + LOCAL_MEM scan_type sdata[WG_SIZE]; + + local_barrier(); // TODO figure out why this seems necessary now + + const unsigned int interval_begin = interval_size * GID_0; + const unsigned int interval_end = min(interval_begin + interval_size, N); + + // value to add to this segment + scan_type carry = ${neutral}; + if(GID_0 != 0) + { + scan_type tmp = group_results[GID_0 - 1]; + carry = SCAN_EXPR(carry, tmp); + } + + scan_type val = carry; + + // advance result pointer + output += interval_begin + LID_0; + + for(unsigned int base = interval_begin; base < interval_end; base += WG_SIZE, output += WG_SIZE) + { + const unsigned int i = base + LID_0; + + if(i < interval_end) + { + scan_type tmp = *output; + sdata[LID_0] = SCAN_EXPR(carry, tmp); + } + + local_barrier(); + + if (LID_0 != 0) + val = sdata[LID_0 - 1]; + + if (i < interval_end) + *output = val; + + if(LID_0 == 0) + val = sdata[WG_SIZE - 1]; + + local_barrier(); + } +} +""") + + + + +def _div_ceil(nr, dr): + return 
(nr + dr -1) // dr + + +def _uniform_interval_splitting(n, granularity, max_intervals): + grains = _div_ceil(n, granularity) + + # one grain per interval + if grains <= max_intervals: + return granularity, grains + + # ensures that: + # num_intervals * interval_size is >= n + # and + # (num_intervals - 1) * interval_size is < n + + grains_per_interval = _div_ceil(grains, max_intervals) + interval_size = grains_per_interval * granularity + num_intervals = _div_ceil(n, interval_size) + + return interval_size, num_intervals + + + + +if _CL_MODE: + class _ScanKernelBase(object): + def __init__(self, ctx, dtype, + scan_expr, neutral=None, + name_prefix="scan", options=[], preamble="", devices=None): + + if isinstance(self, ExclusiveScanKernel) and neutral is None: + raise ValueError("neutral element is required for exclusive scan") + + self.context = ctx + dtype = self.dtype = np.dtype(dtype) + self.neutral = neutral + + if devices is None: + devices = ctx.devices + self.devices = devices + + max_wg_size = min(dev.max_work_group_size for dev in self.devices) + + # Thrust says these are good for GT200 + self.scan_wg_size = min(max_wg_size, 128) + self.update_wg_size = min(max_wg_size, 256) + + if self.scan_wg_size < 16: + # Hello, Apple CPU. Nice to see you. + self.scan_wg_seq_batches = 128 # FIXME: guesswork + else: + self.scan_wg_seq_batches = 6 + + from pytools import all + from pyopencl.characterize import has_double_support + + kw_values = dict( + preamble=preamble, + name_prefix=name_prefix, + scan_type=dtype_to_ctype(dtype), + scan_expr=scan_expr, + neutral=neutral, + double_support=all( + has_double_support(dev) for dev in devices) + ) + + scan_intervals_src = str(SCAN_INTERVALS_SOURCE.render( + wg_size=self.scan_wg_size, + wg_seq_batches=self.scan_wg_seq_batches, + **kw_values)) + scan_intervals_prg = cl.Program(ctx, scan_intervals_src).build(options) + self.scan_intervals_knl = getattr( + scan_intervals_prg, + name_prefix+"_scan_intervals") + self.scan_intervals_knl.set_scalar_arg_dtypes( + (None, np.uint32, np.uint32, None, None)) + + final_update_src = str(self.final_update_tp.render( + wg_size=self.update_wg_size, + **kw_values)) + + final_update_prg = cl.Program(self.context, final_update_src).build(options) + self.final_update_knl = getattr( + final_update_prg, + name_prefix+"_final_update") + self.final_update_knl.set_scalar_arg_dtypes( + (None, np.uint32, np.uint32, None)) + + def __call__(self, input_ary, output_ary=None, allocator=None, + queue=None): + allocator = allocator or input_ary.allocator + queue = queue or input_ary.queue or output_ary.queue + + if output_ary is None: + output_ary = input_ary + + if isinstance(output_ary, (str, unicode)) and output_ary == "new": + output_ary = cl_array.empty_like(input_ary, allocator=allocator) + + if input_ary.shape != output_ary.shape: + raise ValueError("input and output must have the same shape") + + n, = input_ary.shape + + if not n: + return output_ary + + unit_size = self.scan_wg_size * self.scan_wg_seq_batches + max_groups = 3*max(dev.max_compute_units for dev in self.devices) + + interval_size, num_groups = _uniform_interval_splitting( + n, unit_size, max_groups); + + block_results = allocator(self.dtype.itemsize*num_groups) + dummy_results = allocator(self.dtype.itemsize) + + # first level scan of interval (one interval per block) + self.scan_intervals_knl( + queue, (num_groups*self.scan_wg_size,), (self.scan_wg_size,), + input_ary.data, + n, interval_size, + output_ary.data, + block_results) + + # second level inclusive 
scan of per-block results + self.scan_intervals_knl( + queue, (self.scan_wg_size,), (self.scan_wg_size,), + block_results, + num_groups, interval_size, + block_results, + dummy_results) + + # update intervals with result of second level scan + self.final_update_knl( + queue, (num_groups*self.update_wg_size,), (self.update_wg_size,), + output_ary.data, + n, interval_size, + block_results) + + return output_ary + + + + +else: + class _ScanKernelBase(object): + def __init__(self, dtype, + scan_expr, neutral=None, + name_prefix="scan", options=[], preamble="", devices=None): + + if isinstance(self, ExclusiveScanKernel) and neutral is None: + raise ValueError("neutral element is required for exclusive scan") + + dtype = self.dtype = np.dtype(dtype) + self.neutral = neutral + + # Thrust says these are good for GT200 + self.scan_wg_size = 128 + self.update_wg_size = 256 + self.scan_wg_seq_batches = 6 + + kw_values = dict( + preamble=preamble, + name_prefix=name_prefix, + scan_type=dtype_to_ctype(dtype), + scan_expr=scan_expr, + neutral=neutral) + + scan_intervals_src = str(SCAN_INTERVALS_SOURCE.render( + wg_size=self.scan_wg_size, + wg_seq_batches=self.scan_wg_seq_batches, + **kw_values)) + scan_intervals_prg = SourceModule( + scan_intervals_src, options=options, no_extern_c=True) + self.scan_intervals_knl = scan_intervals_prg.get_function( + name_prefix+"_scan_intervals") + self.scan_intervals_knl.prepare("PIIPP", (self.scan_wg_size, 1, 1)) + + final_update_src = str(self.final_update_tp.render( + wg_size=self.update_wg_size, + **kw_values)) + + final_update_prg = SourceModule( + final_update_src, options=options, no_extern_c=True) + self.final_update_knl = final_update_prg.get_function( + name_prefix+"_final_update") + self.final_update_knl.prepare("PIIP", (self.update_wg_size, 1, 1)) + + def __call__(self, input_ary, output_ary=None, allocator=None, + stream=None): + allocator = allocator or input_ary.allocator + + if output_ary is None: + output_ary = input_ary + + if isinstance(output_ary, (str, unicode)) and output_ary == "new": + output_ary = cl_array.empty_like(input_ary, allocator=allocator) + + if input_ary.shape != output_ary.shape: + raise ValueError("input and output must have the same shape") + + n, = input_ary.shape + + if not n: + return output_ary + + unit_size = self.scan_wg_size * self.scan_wg_seq_batches + dev = driver.Context.get_device() + max_groups = 3*dev.get_attribute( + driver.device_attribute.MULTIPROCESSOR_COUNT) + + interval_size, num_groups = _uniform_interval_splitting( + n, unit_size, max_groups); + + block_results = allocator(self.dtype.itemsize*num_groups) + dummy_results = allocator(self.dtype.itemsize) + + # first level scan of interval (one interval per block) + self.scan_intervals_knl.prepared_async_call( + (num_groups, 1), stream, + input_ary.gpudata, + n, interval_size, + output_ary.gpudata, + block_results) + + # second level inclusive scan of per-block results + self.scan_intervals_knl.prepared_async_call( + (1, 1), stream, + block_results, + num_groups, interval_size, + block_results, + dummy_results) + + # update intervals with result of second level scan + self.final_update_knl.prepared_async_call( + (num_groups, 1,), stream, + output_ary.gpudata, + n, interval_size, + block_results) + + return output_ary + + + +class InclusiveScanKernel(_ScanKernelBase): + final_update_tp = INCLUSIVE_UPDATE_SOURCE + +class ExclusiveScanKernel(_ScanKernelBase): + final_update_tp = EXCLUSIVE_UPDATE_SOURCE diff --git a/pyopencl/tools.py b/pyopencl/tools.py index 
41b355e950cf5a22804a75650db0ae0368c89b4e..7b59d4fcd7fb3468bd2b6f96e9212530a520f274 100644 --- a/pyopencl/tools.py +++ b/pyopencl/tools.py @@ -249,15 +249,6 @@ def parse_c_arg(c_arg): -def has_double_support(dev): - for ext in dev.extensions.split(" "): - if ext == "cl_khr_fp64": - return True - return False - - - - def get_gl_sharing_context_properties(): ctx_props = cl.context_properties diff --git a/test/test_array.py b/test/test_array.py index adabdcfc8780e655b83faafa14b233b409e673c4..636a53b97bf7ce55259bad7bdaf7f250085ded1d 100644 --- a/test/test_array.py +++ b/test/test_array.py @@ -19,7 +19,7 @@ if have_cl(): import pyopencl as cl from pyopencl.tools import pytest_generate_tests_for_pyopencl \ as pytest_generate_tests - from pyopencl.tools import has_double_support + from pyopencl.characterize import has_double_support @@ -556,6 +556,40 @@ def test_astype(ctx_getter): +@pytools.test.mark_test.opencl +def test_scan(ctx_getter): + context = ctx_getter() + queue = cl.CommandQueue(context) + + from pyopencl.scan import InclusiveScanKernel, ExclusiveScanKernel + + dtype = np.int32 + for cls in [InclusiveScanKernel, ExclusiveScanKernel]: + knl = cls(context, dtype, "a+b", "0") + + for n in [ + 10, 2**10-5, 2**10, + 2**20-2**18, + 2**20-2**18+5, + 2**10+5, + 2**20+5, + 2**20, 2**24 + ]: + host_data = np.random.randint(0, 10, n).astype(dtype) + dev_data = cl_array.to_device(queue, host_data) + + knl(dev_data) + + desired_result = np.cumsum(host_data, axis=0) + if cls is ExclusiveScanKernel: + desired_result -= host_data + + assert (dev_data.get() == desired_result).all() + + + + + if __name__ == "__main__": # make sure that import failures get reported, instead of skipping the tests. import pyopencl as cl diff --git a/test/test_clmath.py b/test/test_clmath.py index b8f648de08e83a5711ed372f7ba6250a92176ec9..8e4a60f5b33a83e7156e9ecc7fb5f2e53edfcd82 100644 --- a/test/test_clmath.py +++ b/test/test_clmath.py @@ -16,6 +16,7 @@ if have_cl(): import pyopencl.clmath as clmath from pyopencl.tools import pytest_generate_tests_for_pyopencl \ as pytest_generate_tests + from pyopencl.characterize import has_double_support @@ -26,15 +27,6 @@ sizes = [10, 128, 1<<10, 1<<11, 1<<13] -def has_double_support(dev): - for ext in dev.extensions.split(" "): - if ext == "cl_khr_fp64": - return True - return False - - - - numpy_func_names = { "asin": "arcsin", "acos": "arccos",
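
As a quick, self-contained illustration of the host-side work decomposition
used by ``pyopencl/scan.py`` above (``_div_ceil`` and
``_uniform_interval_splitting`` are reproduced verbatim from the diff; the
unit size mirrors the default ``scan_wg_size=128`` times
``scan_wg_seq_batches=6``, and the figure of 30 groups is a made-up stand-in
for ``3 * max_compute_units``)::

    def _div_ceil(nr, dr):
        # integer ceiling division
        return (nr + dr - 1) // dr

    def _uniform_interval_splitting(n, granularity, max_intervals):
        grains = _div_ceil(n, granularity)

        # one grain per interval
        if grains <= max_intervals:
            return granularity, grains

        grains_per_interval = _div_ceil(grains, max_intervals)
        interval_size = grains_per_interval * granularity
        num_intervals = _div_ceil(n, interval_size)
        return interval_size, num_intervals

    n = 2**20 - 2**18 + 5
    unit_size = 128 * 6          # scan_wg_size * scan_wg_seq_batches
    max_groups = 30              # assumed: 3 * number of compute units

    interval_size, num_groups = _uniform_interval_splitting(
            n, unit_size, max_groups)

    # the properties the scan kernels rely on
    assert num_groups * interval_size >= n
    assert (num_groups - 1) * interval_size < n
    assert interval_size % unit_size == 0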