diff --git a/.gitmodules b/.gitmodules index 5e351df4f2432947d16f3137a38bd029e86c1372..0d69f48bebd0c79dfcf38c2228d61e11e65629e0 100644 --- a/.gitmodules +++ b/.gitmodules @@ -1,6 +1,6 @@ [submodule "bpl-subset"] path = bpl-subset - url = git://github.com/inducer/bpl-subset.git + url = ../bpl-subset.git [submodule "pyopencl/compyte"] path = pyopencl/compyte - url = git://github.com/inducer/compyte.git + url = ../compyte.git diff --git a/doc/source/algorithm.rst b/doc/source/algorithm.rst new file mode 100644 index 0000000000000000000000000000000000000000..94758f7129d7e302e64e6251c645f99b791ed95a --- /dev/null +++ b/doc/source/algorithm.rst @@ -0,0 +1,281 @@ +Parallel Algorithms +=================== + +Element-wise expression evalution ("map") +----------------------------------------- + +.. module:: pyopencl.elementwise + +Evaluating involved expressions on :class:`pyopencl.array.Array` instances by +using overloaded operators can be somewhat inefficient, because a new temporary +is created for each intermediate result. The functionality in the module +:mod:`pyopencl.elementwise` contains tools to help generate kernels that +evaluate multi-stage expressions on one or several operands in a single pass. + +.. class:: ElementwiseKernel(context, arguments, operation, name="kernel", preamble="", options=[]) + + Generate a kernel that takes a number of scalar or vector *arguments* + and performs the scalar *operation* on each entry of its arguments, if that + argument is a vector. + + *arguments* is specified as a string formatted as a C argument list. + *operation* is specified as a C assignment statement, without a semicolon. + Vectors in *operation* should be indexed by the variable *i*. + + *name* specifies the name as which the kernel is compiled, + and *options* are passed unmodified to :meth:`pyopencl.Program.build`. + + *preamble* is a piece of C source code that gets inserted outside of the + function context in the elementwise operation's kernel source code. + + .. method:: __call__(*args, wait_for=None) + + Invoke the generated scalar kernel. The arguments may either be scalars or + :class:`GPUArray` instances. + +Here's a usage example:: + + import pyopencl as cl + import pyopencl.array as cl_array + import numpy + + ctx = cl.create_some_context() + queue = cl.CommandQueue(ctx) + + n = 10 + a_gpu = cl_array.to_device( + ctx, queue, numpy.random.randn(n).astype(numpy.float32)) + b_gpu = cl_array.to_device( + ctx, queue, numpy.random.randn(n).astype(numpy.float32)) + + from pyopencl.elementwise import ElementwiseKernel + lin_comb = ElementwiseKernel(ctx, + "float a, float *x, " + "float b, float *y, " + "float *z", + "z[i] = a*x[i] + b*y[i]", + "linear_combination") + + c_gpu = cl_array.empty_like(a_gpu) + lin_comb(5, a_gpu, 6, b_gpu, c_gpu) + + import numpy.linalg as la + assert la.norm((c_gpu - (5*a_gpu+6*b_gpu)).get()) < 1e-5 + +(You can find this example as :file:`examples/demo_elementwise.py` in the PyOpenCL +distribution.) + +.. _custom-reductions: + +Sums and counts ("reduce") +-------------------------- + +.. module:: pyopencl.reduction + +.. class:: ReductionKernel(ctx, dtype_out, neutral, reduce_expr, map_expr=None, arguments=None, name="reduce_kernel", options=[], preamble="") + + Generate a kernel that takes a number of scalar or vector *arguments* + (at least one vector argument), performs the *map_expr* on each entry of + the vector argument and then the *reduce_expr* on the outcome of that. + *neutral* serves as an initial value. *preamble* offers the possibility + to add preprocessor directives and other code (such as helper functions) + to be added before the actual reduction kernel code. + + Vectors in *map_expr* should be indexed by the variable *i*. *reduce_expr* + uses the formal values "a" and "b" to indicate two operands of a binary + reduction operation. If you do not specify a *map_expr*, "in[i]" -- and + therefore the presence of only one input argument -- is automatically + assumed. + + *dtype_out* specifies the :class:`numpy.dtype` in which the reduction is + performed and in which the result is returned. *neutral* is specified as + float or integer formatted as string. *reduce_expr* and *map_expr* are + specified as string formatted operations and *arguments* is specified as a + string formatted as a C argument list. *name* specifies the name as which + the kernel is compiled. *options* are passed unmodified to + :meth:`pyopencl.Program.build`. *preamble* specifies a string of code that + is inserted before the actual kernels. + + .. method:: __call__(*args, queue=None) + + .. versionadded: 2011.1 + +Here's a usage example:: + + a = pyopencl.array.arange(queue, 400, dtype=numpy.float32) + b = pyopencl.array.arange(queue, 400, dtype=numpy.float32) + + krnl = ReductionKernel(ctx, numpy.float32, neutral="0", + reduce_expr="a+b", map_expr="x[i]*y[i]", + arguments="__global float *x, __global float *y") + + my_dot_prod = krnl(a, b).get() + +.. _custom-scan: + +Prefix Sums ("scan") +-------------------- + +.. module:: pyopencl.scan + +.. |scan_extra_args| replace:: a list of tuples *(name, value)* specifying + extra arguments to pass to the scan procedure. *value* must be :mod:`numpy` + sized type. +.. |preamble| replace:: A snippet of C that is inserted into the compiled kernel + before the actual kernel function. May be used for, e.g. type definitions + or include statements. + +A prefix sum is a running sum of an array, as provided by +e.g. :mod:`numpy.cumsum`:: + + >>> import numpy as np + >>> a = [1,1,1,1,1,2,2,2,2,2] + >>> np.cumsum(a) + array([ 1, 2, 3, 4, 5, 7, 9, 11, 13, 15]) + +This is a very simple example of what a scan can do. It turns out that scans +are significantly more versatile. They are a basic building block of many +non-trivial parallel algorithms. Many of the operations enabled by scans seem +difficult to parallelize because of loop-carried dependencies. + +.. seealso:: + + `Prefix sums and their applications `_, by Guy Blelloch. + This article gives an overview of some surprising applications of scans. + + :ref:`predefined-scans` + These operations built into PyOpenCL are realized using :class:`GenericScanKernel`. + +Usage Example +^^^^^^^^^^^^^ + +This example illustrates the implementation of a simplified version of :func:`copy_if`, +which copies integers from an array into the (variable-size) output if they are +greater than 300:: + + knl = GenericScanKernel( + ctx, np.int32, + arguments="__global int *ary, __global int *out", + input_expr="(ary[i] > 300) ? 1 : 0", + scan_expr="a+b", neutral="0", + output_statement=""" + if (prev_item != item) out[item-1] = ary[i]; + """) + + out = a.copy() + knl(a, out) + + a_host = a.get() + out_host = a_host[a_host > 300] + + assert (out_host == out.get()[:len(out_host)]).all() + +The value being scanned over is a number of flags indicating whether each array +element is greater than 300. These flags are computed by *input_expr*. The +prefix sum over this array gives a running count of array items greater than +300. The *output_statement* the compares `prev_item` (the previous item's scan +result, i.e. index) to `item` (the current item's scan result, i.e. +index). If they differ, i.e. if the predicate was satisfied at this +position, then the item is stored in the output at the computed index. + +This example does not make use of the following advanced features also available +in PyOpenCL: + +* Segmented scans + +* Access to the previous item in *input_expr* (e.g. for comparisons) + See the `implementation `_ of :func:`unique` for an example. + +Making Custom Scan Kernels +^^^^^^^^^^^^^^^^^^^^^^^^^^ + +.. versionadded: 2012.2 + +.. autoclass:: GenericScanKernel + + .. method:: __call__(*args, allocator=None, queue=None) + + *queue* and *allocator* default to the ones provided on the first + :class:`pyopencl.array.Array` in *args*. + +Debugging aids +~~~~~~~~~~~~~~ + +.. class:: GenericDebugScanKernel + + Performs the same function and has the same interface as + :class:`GenericScanKernel`, but uses a dead-simple, sequential scan. Works + best on CPU platforms, and helps isolate bugs in scans by removing the + potential for issues originating in parallel execution. + +.. _predefined-scans: + +Simple / Legacy Interface +^^^^^^^^^^^^^^^^^^^^^^^^^ + +.. class:: ExclusiveScanKernel(ctx, dtype, scan_expr, neutral, name_prefix="scan", options=[], preamble="", devices=None) + + Generates a kernel that can compute a `prefix sum `_ + using any associative operation given as *scan_expr*. + *scan_expr* uses the formal values "a" and "b" to indicate two operands of + an associative binary operation. *neutral* is the neutral element + of *scan_expr*, obeying *scan_expr(a, neutral) == a*. + + *dtype* specifies the type of the arrays being operated on. + *name_prefix* is used for kernel names to ensure recognizability + in profiles and logs. *options* is a list of compiler options to use + when building. *preamble* specifies a string of code that is + inserted before the actual kernels. *devices* may be used to restrict + the set of devices on which the kernel is meant to run. (defaults + to all devices in the context *ctx*. + + .. method:: __call__(self, input_ary, output_ary=None, allocator=None, queue=None) + +.. class:: InclusiveScanKernel(dtype, scan_expr, neutral=None, name_prefix="scan", options=[], preamble="", devices=None) + + Works like :class:`ExclusiveScanKernel`. + + .. versionchanged:: 2012.2 + *neutral* is now always required. + +For the array `[1,2,3]`, inclusive scan results in `[1,3,6]`, and exclusive +scan results in `[0,1,3]`. + +Here's a usage example:: + + knl = InclusiveScanKernel(context, np.int32, "a+b") + + n = 2**20-2**18+5 + host_data = np.random.randint(0, 10, n).astype(np.int32) + dev_data = cl_array.to_device(queue, host_data) + + knl(dev_data) + assert (dev_data.get() == np.cumsum(host_data, axis=0)).all() + +Predicated copies ("partition", "unique", ...) +---------------------------------------------- + +.. module:: pyopencl.algorithm + +.. autofunction:: copy_if + +.. autofunction:: remove_if + +.. autofunction:: partition + +.. autofunction:: unique + +Sorting (radix sort) +-------------------- + +.. autoclass:: RadixSort + + .. automethod:: __call__ + +Building many variable-size lists +--------------------------------- + +.. autoclass:: ListOfListsBuilder + + .. automethod:: __call__ + diff --git a/doc/source/array.rst b/doc/source/array.rst index 752a1b8f22f0958d7ba929789c5b16dcac1612a0..6a4eda2e04dfd2990fa38837508b0e98fc68ccd0 100644 --- a/doc/source/array.rst +++ b/doc/source/array.rst @@ -552,278 +552,6 @@ generator for scientific purposes, please consider citing them: number generator of Lüscher, `Computer Physics Communications 79 (1994) 111-114 `_ - -Single-pass Custom Expression Evaluation ----------------------------------------- - -.. module:: pyopencl.elementwise - -Evaluating involved expressions on :class:`pyopencl.array.Array` instances can be -somewhat inefficient, because a new temporary is created for each -intermediate result. The functionality in the module :mod:`pyopencl.elementwise` -contains tools to help generate kernels that evaluate multi-stage expressions -on one or several operands in a single pass. - -.. class:: ElementwiseKernel(context, arguments, operation, name="kernel", preamble="", options=[]) - - Generate a kernel that takes a number of scalar or vector *arguments* - and performs the scalar *operation* on each entry of its arguments, if that - argument is a vector. - - *arguments* is specified as a string formatted as a C argument list. - *operation* is specified as a C assignment statement, without a semicolon. - Vectors in *operation* should be indexed by the variable *i*. - - *name* specifies the name as which the kernel is compiled, - and *options* are passed unmodified to :meth:`pyopencl.Program.build`. - - *preamble* is a piece of C source code that gets inserted outside of the - function context in the elementwise operation's kernel source code. - - .. method:: __call__(*args, wait_for=None) - - Invoke the generated scalar kernel. The arguments may either be scalars or - :class:`GPUArray` instances. - -Here's a usage example:: - - import pyopencl as cl - import pyopencl.array as cl_array - import numpy - - ctx = cl.create_some_context() - queue = cl.CommandQueue(ctx) - - n = 10 - a_gpu = cl_array.to_device( - ctx, queue, numpy.random.randn(n).astype(numpy.float32)) - b_gpu = cl_array.to_device( - ctx, queue, numpy.random.randn(n).astype(numpy.float32)) - - from pyopencl.elementwise import ElementwiseKernel - lin_comb = ElementwiseKernel(ctx, - "float a, float *x, " - "float b, float *y, " - "float *z", - "z[i] = a*x[i] + b*y[i]", - "linear_combination") - - c_gpu = cl_array.empty_like(a_gpu) - lin_comb(5, a_gpu, 6, b_gpu, c_gpu) - - import numpy.linalg as la - assert la.norm((c_gpu - (5*a_gpu+6*b_gpu)).get()) < 1e-5 - -(You can find this example as :file:`examples/demo_elementwise.py` in the PyOpenCL -distribution.) - -.. _custom-reductions: - -Custom Reductions ------------------ - -.. module:: pyopencl.reduction - -.. class:: ReductionKernel(ctx, dtype_out, neutral, reduce_expr, map_expr=None, arguments=None, name="reduce_kernel", options=[], preamble="") - - Generate a kernel that takes a number of scalar or vector *arguments* - (at least one vector argument), performs the *map_expr* on each entry of - the vector argument and then the *reduce_expr* on the outcome of that. - *neutral* serves as an initial value. *preamble* offers the possibility - to add preprocessor directives and other code (such as helper functions) - to be added before the actual reduction kernel code. - - Vectors in *map_expr* should be indexed by the variable *i*. *reduce_expr* - uses the formal values "a" and "b" to indicate two operands of a binary - reduction operation. If you do not specify a *map_expr*, "in[i]" -- and - therefore the presence of only one input argument -- is automatically - assumed. - - *dtype_out* specifies the :class:`numpy.dtype` in which the reduction is - performed and in which the result is returned. *neutral* is specified as - float or integer formatted as string. *reduce_expr* and *map_expr* are - specified as string formatted operations and *arguments* is specified as a - string formatted as a C argument list. *name* specifies the name as which - the kernel is compiled. *options* are passed unmodified to - :meth:`pyopencl.Program.build`. *preamble* specifies a string of code that - is inserted before the actual kernels. - - .. method:: __call__(*args, queue=None) - - .. versionadded: 2011.1 - -Here's a usage example:: - - a = pyopencl.array.arange(queue, 400, dtype=numpy.float32) - b = pyopencl.array.arange(queue, 400, dtype=numpy.float32) - - krnl = ReductionKernel(ctx, numpy.float32, neutral="0", - reduce_expr="a+b", map_expr="x[i]*y[i]", - arguments="__global float *x, __global float *y") - - my_dot_prod = krnl(a, b).get() - -.. _custom-scan: - -Parallel Scan / Prefix Sum --------------------------- - -.. module:: pyopencl.scan - -.. |scan_extra_args| replace:: a list of tuples *(name, value)* specifying - extra arguments to pass to the scan procedure. *value* must be :mod:`numpy` - sized type. -.. |preamble| replace:: A snippet of C that is inserted into the compiled kernel - before the actual kernel function. May be used for, e.g. type definitions - or include statements. - -A prefix sum is a running sum of an array, as provided by -e.g. :mod:`numpy.cumsum`:: - - >>> import numpy as np - >>> a = [1,1,1,1,1,2,2,2,2,2] - >>> np.cumsum(a) - array([ 1, 2, 3, 4, 5, 7, 9, 11, 13, 15]) - -This is a very simple example of what a scan can do. It turns out that scans -are significantly more versatile. They are a basic building block of many -non-trivial parallel algorithms. Many of the operations enabled by scans seem -difficult to parallelize because of loop-carried dependencies. - -.. seealso:: - - `Prefix sums and their applications `_, by Guy Blelloch. - This article gives an overview of some surprising applications of scans. - - :ref:`predefined-scans` - These operations built into PyOpenCL are realized using :class:`GenericScanKernel`. - -Usage Example -^^^^^^^^^^^^^ - -This example illustrates the implementation of a simplified version of :func:`copy_if`, -which copies integers from an array into the (variable-size) output if they are -greater than 300:: - - knl = GenericScanKernel( - ctx, np.int32, - arguments="__global int *ary, __global int *out", - input_expr="(ary[i] > 300) ? 1 : 0", - scan_expr="a+b", neutral="0", - output_statement=""" - if (prev_item != item) out[item-1] = ary[i]; - """) - - out = a.copy() - knl(a, out) - - a_host = a.get() - out_host = a_host[a_host > 300] - - assert (out_host == out.get()[:len(out_host)]).all() - -The value being scanned over is a number of flags indicating whether each array -element is greater than 300. These flags are computed by *input_expr*. The -prefix sum over this array gives a running count of array items greater than -300. The *output_statement* the compares `prev_item` (the previous item's scan -result, i.e. index) to `item` (the current item's scan result, i.e. -index). If they differ, i.e. if the predicate was satisfied at this -position, then the item is stored in the output at the computed index. - -This example does not make use of the following advanced features also available -in PyOpenCL: - -* Segmented scans - -* Access to the previous item in *input_expr* (e.g. for comparisons) - See the `implementation `_ of :func:`unique` for an example. - -Making Custom Scan Kernels -^^^^^^^^^^^^^^^^^^^^^^^^^^ - -.. versionadded: 2012.2 - -.. autoclass:: GenericScanKernel - - .. method:: __call__(*args, allocator=None, queue=None) - - *queue* and *allocator* default to the ones provided on the first - :class:`pyopencl.array.Array` in *args*. - -Debugging aids -~~~~~~~~~~~~~~ - -.. class:: GenericDebugScanKernel - - Performs the same function and has the same interface as - :class:`GenericScanKernel`, but uses a dead-simple, sequential scan. Works - best on CPU platforms, and helps isolate bugs in scans by removing the - potential for issues originating in parallel execution. - -.. _predefined-scans: - -Simple / Legacy Interface -^^^^^^^^^^^^^^^^^^^^^^^^^ - -.. class:: ExclusiveScanKernel(ctx, dtype, scan_expr, neutral, name_prefix="scan", options=[], preamble="", devices=None) - - Generates a kernel that can compute a `prefix sum `_ - using any associative operation given as *scan_expr*. - *scan_expr* uses the formal values "a" and "b" to indicate two operands of - an associative binary operation. *neutral* is the neutral element - of *scan_expr*, obeying *scan_expr(a, neutral) == a*. - - *dtype* specifies the type of the arrays being operated on. - *name_prefix* is used for kernel names to ensure recognizability - in profiles and logs. *options* is a list of compiler options to use - when building. *preamble* specifies a string of code that is - inserted before the actual kernels. *devices* may be used to restrict - the set of devices on which the kernel is meant to run. (defaults - to all devices in the context *ctx*. - - .. method:: __call__(self, input_ary, output_ary=None, allocator=None, queue=None) - -.. class:: InclusiveScanKernel(dtype, scan_expr, neutral=None, name_prefix="scan", options=[], preamble="", devices=None) - - Works like :class:`ExclusiveScanKernel`. - - .. versionchanged:: 2012.2 - *neutral* is now always required. - -For the array `[1,2,3]`, inclusive scan results in `[1,3,6]`, and exclusive -scan results in `[0,1,3]`. - -Here's a usage example:: - - knl = InclusiveScanKernel(context, np.int32, "a+b") - - n = 2**20-2**18+5 - host_data = np.random.randint(0, 10, n).astype(np.int32) - dev_data = cl_array.to_device(queue, host_data) - - knl(dev_data) - assert (dev_data.get() == np.cumsum(host_data, axis=0)).all() - -Higher-level algorithms ------------------------ - -.. module:: pyopencl.algorithm - -.. autofunction:: copy_if - -.. autofunction:: remove_if - -.. autofunction:: partition - -.. autofunction:: unique - -Sorting -^^^^^^^ - -.. autoclass:: RadixSort - - .. automethod:: __call__ - Fast Fourier Transforms ----------------------- diff --git a/doc/source/conf.py b/doc/source/conf.py index 1a77c22a21e4ad32339e80cca94d8b55f55f01ec..1554d69588b75de3d8e8fbe6786c5eac1e57560b 100644 --- a/doc/source/conf.py +++ b/doc/source/conf.py @@ -168,6 +168,7 @@ latex_documents = [ intersphinx_mapping = { 'http://docs.python.org/dev': None, 'http://docs.scipy.org/doc/numpy/': None, + 'http://docs.makotemplates.org/en/latest/': None, } autoclass_content = "both" diff --git a/doc/source/index.rst b/doc/source/index.rst index 387bd7cd0be922e4b07a8548a56cce1abe133db1..607b9da53cf074ae51af88e40f66c8a722f20f01 100644 --- a/doc/source/index.rst +++ b/doc/source/index.rst @@ -74,6 +74,7 @@ is on the web, thanks to Ian Johnson. runtime array + algorithm tools misc diff --git a/doc/source/runtime.rst b/doc/source/runtime.rst index 4dd2c14f9ebaa8e54a6e4709d8deada6534306b7..547ce01876393cc7650b8fc4520e01fa331aa2cc 100644 --- a/doc/source/runtime.rst +++ b/doc/source/runtime.rst @@ -601,7 +601,7 @@ Mapping Memory into Host Address Space .. method:: release(queue=None, wait_for=None) -.. function:: enqueue_map_buffer(queue, buf, flags, offset, shape, dtype, order, wait_for=None, is_blocking=True) +.. function:: enqueue_map_buffer(queue, buf, flags, offset, shape, dtype, order="C", wait_for=None, is_blocking=True) |explain-waitfor| *shape*, *dtype*, and *order* have the same meaning @@ -616,7 +616,10 @@ Mapping Memory into Host Address Space .. versionchanged:: 2011.1 *is_blocking* now defaults to True. -.. function:: enqueue_map_image(queue, buf, flags, origin, region, shape, dtype, order, wait_for=None, is_blocking=True) + .. versionchanged:: 2012.2 + *order* now defaults to "C". + +.. function:: enqueue_map_image(queue, buf, flags, origin, region, shape, dtype, order="C", wait_for=None, is_blocking=True) |explain-waitfor| *shape*, *dtype*, and *order* have the same meaning @@ -631,6 +634,8 @@ Mapping Memory into Host Address Space .. versionchanged:: 2011.1 *is_blocking* now defaults to True. + .. versionchanged:: 2012.2 + *order* now defaults to "C". Samplers ^^^^^^^^ @@ -830,6 +835,19 @@ Programs and Kernels *global_size* and *local_size* also do not have to have the same number of dimensions. + .. note:: + + :meth:`__call__` is *not* thread-safe. It sets the arguments using :meth:`set_args` + and then runs :func:`enqueue_nd_range_kernel`. Another thread could race it + in doing the same things, with undefined outcome. This issue is inherited + from the C-level OpenCL API. The recommended solution is to make a kernel + (i.e. access `prg.kernel_name`, which corresponds to making a new kernel) + for every thread that may enqueue calls to the kernel. + + A solution involving implicit locks was discussed and decided against on the + mailing list in `October 2012 + `_. + .. versionchanged:: 0.92 *local_size* was promoted to third positional argument from being a keyword argument. The old keyword argument usage will continue to diff --git a/pyopencl/algorithm.py b/pyopencl/algorithm.py index 4e72a6a5aea38db696edb904c13b65303d96a652..a036c028b89db816a98ab574ea947399a419b851 100644 --- a/pyopencl/algorithm.py +++ b/pyopencl/algorithm.py @@ -30,10 +30,10 @@ OTHER DEALINGS IN THE SOFTWARE. import numpy as np import pyopencl as cl import pyopencl.array -from pyopencl.scan import GenericScanKernel, ScanTemplate +from pyopencl.scan import ScanTemplate from pyopencl.tools import dtype_to_ctype -from pyopencl.tools import context_dependent_memoize -from pytools import memoize +from pytools import memoize, memoize_method, Record +import pyopencl._mymako as mako from mako.template import Template @@ -63,6 +63,8 @@ def copy_if(ary, predicate, extra_args=[], queue=None, preamble=""): :returns: a tuple *(out, count)* where *out* is the output array and *count* is an on-device scalar (fetch to host with `count.get()`) indicating how many elements satisfied *predicate*. + + .. versionadded:: 2012.2 """ if len(ary) > np.iinfo(np.int32).max: scan_dtype = np.int64 @@ -96,6 +98,8 @@ def remove_if(ary, predicate, extra_args=[], queue=None, preamble=""): :returns: a tuple *(out, count)* where *out* is the output array and *count* is an on-device scalar (fetch to host with `count.get()`) indicating how many elements did not satisfy *predicate*. + + .. versionadded:: 2012.2 """ return copy_if(ary, "!(%s)" % predicate, extra_args=extra_args, queue=queue, preamble=preamble) @@ -132,6 +136,8 @@ def partition(ary, predicate, extra_args=[], queue=None, preamble=""): :returns: a tuple *(out_true, out_false, count)* where *count* is an on-device scalar (fetch to host with `count.get()`) indicating how many elements satisfied the predicate. + + .. versionadded:: 2012.2 """ if len(ary) > np.iinfo(np.uint32).max: scan_dtype = np.uint64 @@ -188,6 +194,8 @@ def unique(ary, is_equal_expr="a == b", extra_args=[], queue=None, preamble=""): :returns: a tuple *(out, count)* where *out* is the output array and *count* is an on-device scalar (fetch to host with `count.get()`) indicating how many elements satisfied the predicate. + + .. versionadded:: 2012.2 """ if len(ary) > np.iinfo(np.uint32).max: @@ -337,6 +345,8 @@ RADIX_SORT_OUTPUT_STMT_TPL = Template(r"""//CL// class RadixSort(object): """Provides a general `radix sort `_ on the compute device. + + .. versionadded:: 2012.2 """ def __init__(self, context, arguments, key_expr, sort_arg_names, bits_at_a_time=2, index_dtype=np.int32, key_dtype=np.uint32, @@ -356,8 +366,8 @@ class RadixSort(object): # {{{ arg processing - from pyopencl.scan import _parse_args - self.arguments = _parse_args(arguments) + from pyopencl.tools import parse_arg_list + self.arguments = parse_arg_list(arguments) del arguments self.sort_arg_names = sort_arg_names @@ -485,4 +495,450 @@ class RadixSort(object): # }}} +# {{{ generic parallel list builder + +# {{{ kernel template + +_LIST_BUILDER_TEMPLATE = Template("""//CL// +${preamble} + +// {{{ declare helper macros for user interface + +typedef ${index_type} index_type; + +%if is_count_stage: + %for name, dtype in list_names_and_dtypes: + %if name in count_sharing: + #define APPEND_${name}(value) /* nothing */ + %else: + #define APPEND_${name}(value) ++(*plb_loc_${name}_count); + %endif + %endfor +%else: + %for name, dtype in list_names_and_dtypes: + %if name in count_sharing: + #define APPEND_${name}(value) \ + plb_${name}_list[(*plb_${count_sharing[name]}_index) - 1] = value; + %else: + #define APPEND_${name}(value) \ + plb_${name}_list[(*plb_${name}_index)++] = value; + %endif + %endfor +%endif + +#define LIST_ARG_DECL ${user_list_arg_decl} +#define LIST_ARGS ${user_list_args} +#define USER_ARG_DECL ${user_arg_decl} +#define USER_ARGS ${user_args} + +// }}} + +${generate_template} + +// {{{ kernel entry point + +__kernel void ${kernel_name}(${kernel_list_arg_decl} USER_ARG_DECL index_type n) +{ + int lid = get_local_id(0); + index_type gsize = get_global_size(0); + index_type work_group_start = get_local_size(0)*get_group_id(0); + for (index_type i = work_group_start + lid; i < n; i += gsize) + { + %if is_count_stage: + %for name, dtype in list_names_and_dtypes: + %if name not in count_sharing: + index_type plb_loc_${name}_count = 0; + %endif + %endfor + %else: + %for name, dtype in list_names_and_dtypes: + %if name not in count_sharing: + index_type plb_${name}_index = + plb_${name}_start_index[i]; + %endif + %endfor + %endif + + generate(${kernel_list_arg_values} USER_ARGS i); + + %if is_count_stage: + %for name, dtype in list_names_and_dtypes: + %if name not in count_sharing: + plb_${name}_count[i] = plb_loc_${name}_count; + %endif + %endfor + %endif + } +} + +// }}} + +""", strict_undefined=True) + +# }}} + +def _get_arg_decl(arg_list): + result = "" + for arg in arg_list: + result += arg.declarator() + ", " + + return result + +def _get_arg_list(arg_list, prefix=""): + result = "" + for arg in arg_list: + result += prefix + arg.name + ", " + + return result + + + +class BuiltList(Record): + pass + + + +class ListOfListsBuilder: + """Generates and executes code to produce a large number of variable-size + lists, simply. + + .. note:: This functionality is provided as a preview. Its interface + is subject to change until this notice is removed. + + .. versionadded:: 2012.2 + + Here's a usage example:: + + from pyopencl.algorithm import ListOfListsBuilder + builder = ListOfListsBuilder(context, [("mylist", np.int32)], \"\"\" + void generate(USER_ARG_DECL LIST_ARG_DECL index_type i) + { + int count = i % 4; + for (int j = 0; j < count; ++j) + { + APPEND_mylist(count); + } + } + \"\"\", arg_decls=[]) + + result = builder(queue, 2000) + + inf = result["mylist"] + assert inf.count == 3000 + assert (inf.list.get()[-6:] == [1, 2, 2, 3, 3, 3]).all() + + The function `generate` above is called once for each "input object". + Each input object can then generate zero or more list entries. + The number of these input objects is given to :meth:`__call__` as *n_objects*. + List entries are generated by calls to `APPEND_(value)`. + Multiple lists may be generated at once. + + """ + def __init__(self, context, list_names_and_dtypes, generate_template, + arg_decls, count_sharing=None, devices=None, name_prefix="plb_build_list", + options=[], preamble=""): + """ + :arg context: A :class:`pyopencl.Context`. + :arg list_names_and_dtypes: a list of `(name, dtype)` tuples + indicating the lists to be built. + :arg generate_template: a snippet of C as described below + :arg arg_decls: A string of comma-separated C argument declarations. + :arg count_sharing: A mapping consisting of `(child, mother)` + indicating that `mother` and `child` will always have the + same number of indices, and the `APPEND` to `mother` + will always happen *before* the `APPEND` to the child. + :arg name_prefix: the name prefix to use for the compiled kernels + :arg options: OpenCL compilation options for kernels using + *generate_template*. + + *generate_template* may use the following C macros/identifiers: + + * `index_type`: expands to C identifier for the index type used + for the calculation + * `USER_ARG_DECL`: expands to the C declarator for `arg_decls` + * `USER_ARGS`: a list of C argument values corresponding to + `user_arg_decl` + * `LIST_ARG_DECL`: expands to a C argument list representing the + data for the output lists. These are escaped prefixed with + `"plg_"` so as to not interfere with user-provided names. + * `LIST_ARGS`: a list of C argument values corresponding to + `LIST_ARG_DECL` + * `APPEND_name(entry)`: inserts `entry` into the list `name`. + Both arguments are Python strings, the latter representing + a valid C expression of the correct dtype. + + All argument-list related macros have a trailing comma included + if they are non-empty. + + *generate_template* must supply a function: + + .. code-block:: c + + void generate(USER_ARG_DECL LIST_ARG_DECL index_type i) + { + APPEND(mylist, 5); + } + + Internally, the `kernel_template` is expanded (at least) twice. Once, + for a 'counting' stage where the size of all the lists is determined, + and a second time, for a 'generation' stage where the lists are + actually filled. A `generate` function that has side effects beyond + calling `append` is therefore ill-formed. + """ + + if devices is None: + devices = context.devices + + if count_sharing is None: + count_sharing = {} + + self.context = context + self.devices = devices + + self.list_names_and_dtypes = list_names_and_dtypes + self.generate_template = generate_template + + from pyopencl.tools import parse_arg_list + self.arg_decls = parse_arg_list(arg_decls) + + self.count_sharing = count_sharing + + self.name_prefix = name_prefix + self.preamble = preamble + self.options = options + + # {{{ kernel generators + + @memoize_method + def get_scan_kernel(self, index_dtype): + from pyopencl.scan import GenericScanKernel + return GenericScanKernel( + self.context, index_dtype, + arguments="__global %s *ary" % dtype_to_ctype(index_dtype), + input_expr="ary[i]", + scan_expr="a+b", neutral="0", + output_statement="ary[i+1] = item;", + devices=self.devices) + + @memoize_method + def get_count_kernel(self, index_dtype): + index_ctype = dtype_to_ctype(index_dtype) + from pyopencl.tools import VectorArg, OtherArg + kernel_list_args = [ + VectorArg(index_dtype, "plb_%s_count" % name) + for name, dtype in self.list_names_and_dtypes + if name not in self.count_sharing] + + user_list_args = [] + for name, dtype in self.list_names_and_dtypes: + if name in self.count_sharing: + continue + + name = "plb_loc_%s_count" % name + user_list_args.append(OtherArg("%s *%s" % ( + index_ctype, name), name)) + + kernel_name = self.name_prefix+"_count" + src = _LIST_BUILDER_TEMPLATE.render( + is_count_stage=True, + kernel_name=kernel_name, + + kernel_list_arg_decl=_get_arg_decl(kernel_list_args), + kernel_list_arg_values=_get_arg_list(user_list_args, prefix="&"), + user_list_arg_decl=_get_arg_decl(user_list_args), + user_list_args=_get_arg_list(user_list_args), + user_arg_decl=_get_arg_decl(self.arg_decls), + user_args=_get_arg_list(self.arg_decls), + + list_names_and_dtypes=self.list_names_and_dtypes, + count_sharing=self.count_sharing, + name_prefix=self.name_prefix, + generate_template=self.generate_template, + preamble=self.preamble, + + index_type=index_ctype, + ) + + src = str(src) + + prg = cl.Program(self.context, src).build(self.options) + knl = getattr(prg, kernel_name) + + from pyopencl.tools import get_arg_list_scalar_arg_dtypes + knl.set_scalar_arg_dtypes(get_arg_list_scalar_arg_dtypes( + kernel_list_args+self.arg_decls) + [index_dtype]) + + return knl + + @memoize_method + def get_write_kernel(self, index_dtype): + index_ctype = dtype_to_ctype(index_dtype) + from pyopencl.tools import VectorArg, OtherArg + kernel_list_args = [] + kernel_list_arg_values = "" + user_list_args = [] + + for name, dtype in self.list_names_and_dtypes: + list_name = "plb_%s_list" % name + list_arg = VectorArg(dtype, list_name) + + kernel_list_args.append(list_arg) + + if name in self.count_sharing: + continue + + kernel_list_args.append( + VectorArg(index_dtype, "plb_%s_start_index" % name)) + + user_list_args.append(list_arg) + index_name = "plb_%s_index" % name + user_list_args.append(OtherArg("%s *%s" % ( + index_ctype, index_name), index_name)) + + kernel_list_arg_values += "%s, &%s, " % (list_name, index_name) + + kernel_name = self.name_prefix+"_write" + src = _LIST_BUILDER_TEMPLATE.render( + is_count_stage=False, + kernel_name=kernel_name, + + kernel_list_arg_decl=_get_arg_decl(kernel_list_args), + kernel_list_arg_values=kernel_list_arg_values, + user_list_arg_decl=_get_arg_decl(user_list_args), + user_list_args=_get_arg_list(user_list_args), + user_arg_decl=_get_arg_decl(self.arg_decls), + user_args=_get_arg_list(self.arg_decls), + + list_names_and_dtypes=self.list_names_and_dtypes, + count_sharing=self.count_sharing, + name_prefix=self.name_prefix, + generate_template=self.generate_template, + preamble=self.preamble, + + index_type=index_ctype, + ) + + src = str(src) + + prg = cl.Program(self.context, src).build(self.options) + knl = getattr(prg, kernel_name) + + from pyopencl.tools import get_arg_list_scalar_arg_dtypes + knl.set_scalar_arg_dtypes(get_arg_list_scalar_arg_dtypes( + kernel_list_args+self.arg_decls) + [index_dtype]) + + return knl + + # }}} + + # {{{ driver + + def __call__(self, queue, n_objects, *args, **kwargs): + """ + :arg args: arguments corresponding to arg_decls in the constructor. + :class:`pyopencl.array.Array` are not allowed directly and should + be passed as their :attr:`pyopencl.array.Array.data` attribute instead. + :arg allocator: optionally, the allocator to use to allocate new + arrays. + :returns: a mapping from names to objects which have attributes + + * `lists` for the array containing all lists + * `starts` for the array of starting indices in `lists`. + `starts` is built so that it has n+1 entries, so that + the *i*'th entry is the start of the *i*'th list, and the + *i*'th entry is the index one past the *i*'th list's end, + even for the last list. + """ + if n_objects >= int(np.iinfo(np.int32).max): + index_dtype = np.int64 + else: + index_dtype = np.int32 + index_dtype = np.dtype(index_dtype) + + allocator = kwargs.pop("allocator", None) + if kwargs: + raise TypeError("invalid keyword arguments: '%s'" % ", ".join(kwargs)) + + result = {} + count_list_args = [] + + count_kernel = self.get_count_kernel(index_dtype) + write_kernel = self.get_write_kernel(index_dtype) + scan_kernel = self.get_scan_kernel(index_dtype) + + # {{{ allocate memory for counts + + for name, dtype in self.list_names_and_dtypes: + if name in self.count_sharing: + continue + + counts = cl.array.empty(queue, + (n_objects + 1), index_dtype, allocator=allocator) + + # The scan will turn the "counts" array into the "starts" array + # in-place. + result[name] = BuiltList(starts=counts) + count_list_args.append(counts.data) + + # }}} + + count_kernel(queue, (n_objects,), None, + *(tuple(count_list_args) + args + (n_objects,))) + + # {{{ run scans + + for name, dtype in self.list_names_and_dtypes: + if name in self.count_sharing: + continue + + info_record = result[name] + starts_ary = info_record.starts + scan_kernel(starts_ary) + + # set first entry to zero + cl.enqueue_copy(queue, starts_ary.data, index_dtype.type(0)) + + # retrieve count + count = np.array(1, index_dtype) + cl.enqueue_copy(queue, count, starts_ary.data, + device_offset=index_dtype.itemsize*n_objects) + + info_record.count = int(count) + + # }}} + + # {{{ deal with count-sharing lists, allocate memory for lists + + write_list_args = [] + for name, dtype in self.list_names_and_dtypes: + if name in self.count_sharing: + sharing_from = self.count_sharing[name] + + info_record = result[name] = BuiltList( + count=result[sharing_from].count, + starts=result[sharing_from].starts, + ) + + else: + info_record = result[name] + + info_record.list = cl.array.empty(queue, + info_record.count, dtype, allocator=allocator) + write_list_args.append(info_record.list.data) + + if name not in self.count_sharing: + write_list_args.append(info_record.starts.data) + + # }}} + + + write_kernel(queue, (n_objects,), None, + *(tuple(write_list_args) + args + (n_objects,))) + + return result + + # }}} + +# }}} + # vim: filetype=pyopencl:fdm=marker diff --git a/pyopencl/compyte b/pyopencl/compyte index 331ce0d1771183e4d9f2cc40e23e61659daf3ab3..66591495cf0abd4c670300c0f9f72bef99b48ec8 160000 --- a/pyopencl/compyte +++ b/pyopencl/compyte @@ -1 +1 @@ -Subproject commit 331ce0d1771183e4d9f2cc40e23e61659daf3ab3 +Subproject commit 66591495cf0abd4c670300c0f9f72bef99b48ec8 diff --git a/pyopencl/elementwise.py b/pyopencl/elementwise.py index 75baa22eb7763a3c76fabdb3a59c0ff828f31dee..9f83609577fd129aa8c68f19363ab405185a275d 100644 --- a/pyopencl/elementwise.py +++ b/pyopencl/elementwise.py @@ -47,7 +47,7 @@ def get_elwise_program(context, arguments, operation, body = r"""//CL// if (step < 0) { - for (i = start + (work_item_start + lid)*step; + for (i = start + (work_group_start + lid)*step; i > stop; i += gsize*step) { %(operation)s; @@ -55,7 +55,7 @@ def get_elwise_program(context, arguments, operation, } else { - for (i = start + (work_item_start + lid)*step; + for (i = start + (work_group_start + lid)*step; i < stop; i += gsize*step) { %(operation)s; @@ -64,7 +64,7 @@ def get_elwise_program(context, arguments, operation, """ else: body = """//CL// - for (i = work_item_start + lid; i < n; i += gsize) + for (i = work_group_start + lid; i < n; i += gsize) { %(operation)s; } @@ -77,7 +77,7 @@ def get_elwise_program(context, arguments, operation, { int lid = get_local_id(0); int gsize = get_global_size(0); - int work_item_start = get_local_size(0)*get_group_id(0); + int work_group_start = get_local_size(0)*get_group_id(0); long i; %(loop_prep)s; @@ -100,11 +100,9 @@ def get_elwise_program(context, arguments, operation, def get_elwise_kernel_and_types(context, arguments, operation, name="elwise_kernel", options=[], preamble="", use_range=False, **kwargs): - if isinstance(arguments, str): - from pyopencl.tools import parse_c_arg - parsed_args = [parse_c_arg(arg) for arg in arguments.split(",")] - else: - parsed_args = arguments + + from pyopencl.tools import parse_arg_list + parsed_args = parse_arg_list(arguments) auto_preamble = kwargs.pop("auto_preamble", True) @@ -143,15 +141,10 @@ def get_elwise_kernel_and_types(context, arguments, operation, name=name, options=options, preamble=preamble, use_range=use_range, **kwargs) - scalar_arg_dtypes = [] - for arg in parsed_args: - if isinstance(arg, ScalarArg): - scalar_arg_dtypes.append(arg.dtype) - else: - scalar_arg_dtypes.append(None) + from pyopencl.tools import get_arg_list_scalar_arg_dtypes kernel = getattr(prg, name) - kernel.set_scalar_arg_dtypes(scalar_arg_dtypes) + kernel.set_scalar_arg_dtypes(get_arg_list_scalar_arg_dtypes(parsed_args)) return kernel, parsed_args diff --git a/pyopencl/reduction.py b/pyopencl/reduction.py index 68562b1127838db6fcadf7a5f233b4c5d94543e5..c0f6b6d3ae7cf1e1daaa1a9bba6ebf778135c028 100644 --- a/pyopencl/reduction.py +++ b/pyopencl/reduction.py @@ -140,9 +140,9 @@ KERNEL = """//CL// -def get_reduction_source( +def _get_reduction_source( ctx, out_type, out_type_size, - neutral, reduce_expr, map_expr, arguments, + neutral, reduce_expr, map_expr, parsed_args, name="reduce_kernel", preamble="", device=None, max_group_size=None): @@ -198,7 +198,7 @@ def get_reduction_source( from pyopencl.characterize import has_double_support, has_amd_double_support src = str(Template(KERNEL).render( out_type=out_type, - arguments=arguments, + arguments=", ".join(arg.declarator() for arg in parsed_args), group_size=group_size, no_sync_size=no_sync_size, neutral=neutral, @@ -236,33 +236,30 @@ def get_reduction_kernel(stage, map_expr = "in[i]" if stage == 2: - in_arg = "__global const %s *pyopencl_reduction_inp" % out_type + in_arg = "const %s *pyopencl_reduction_inp" % out_type if arguments: arguments = in_arg + ", " + arguments else: arguments = in_arg - inf = get_reduction_source( + from pyopencl.tools import parse_arg_list, get_arg_list_scalar_arg_dtypes + parsed_args = parse_arg_list(arguments) + + inf = _get_reduction_source( ctx, out_type, out_type_size, - neutral, reduce_expr, map_expr, arguments, + neutral, reduce_expr, map_expr, parsed_args, name, preamble, device, max_group_size) inf.program = cl.Program(ctx, inf.source) inf.program.build(options) inf.kernel = getattr(inf.program, name) - from pyopencl.tools import parse_c_arg, ScalarArg - - inf.arg_types = [parse_c_arg(arg) for arg in arguments.split(",")] - scalar_arg_dtypes = [None] - for arg_type in inf.arg_types: - if isinstance(arg_type, ScalarArg): - scalar_arg_dtypes.append(arg_type.dtype) - else: - scalar_arg_dtypes.append(None) - scalar_arg_dtypes.extend([np.uint32]*2) + inf.arg_types = parsed_args - inf.kernel.set_scalar_arg_dtypes(scalar_arg_dtypes) + inf.kernel.set_scalar_arg_dtypes( + [None] + + get_arg_list_scalar_arg_dtypes(inf.arg_types) + + [np.uint32]*2) return inf @@ -390,7 +387,7 @@ def get_sum_kernel(ctx, dtype_out, dtype_in): dtype_out = dtype_in return ReductionKernel(ctx, dtype_out, "0", "a+b", - arguments="__global const %(tp)s *in" + arguments="const %(tp)s *in" % {"tp": dtype_to_ctype(dtype_in)}) @@ -450,8 +447,8 @@ def get_dot_kernel(ctx, dtype_out, dtype_a=None, dtype_b=None): return ReductionKernel(ctx, dtype_out, neutral="0", reduce_expr="a+b", map_expr=map_expr, arguments= - "__global const %(tp_a)s *a, " - "__global const %(tp_b)s *b" % { + "const %(tp_a)s *a, " + "const %(tp_b)s *b" % { "tp_a": dtype_to_ctype(dtype_a), "tp_b": dtype_to_ctype(dtype_b), }) @@ -477,9 +474,9 @@ def get_subset_dot_kernel(ctx, dtype_out, dtype_subset, dtype_a=None, dtype_b=No return ReductionKernel(ctx, dtype_out, neutral="0", reduce_expr="a+b", map_expr="a[lookup_tbl[i]]*b[lookup_tbl[i]]", arguments= - "__global const %(tp_lut)s *lookup_tbl, " - "__global const %(tp_a)s *a, " - "__global const %(tp_b)s *b" % { + "const %(tp_lut)s *lookup_tbl, " + "const %(tp_a)s *a, " + "const %(tp_b)s *b" % { "tp_lut": dtype_to_ctype(dtype_subset), "tp_a": dtype_to_ctype(dtype_a), "tp_b": dtype_to_ctype(dtype_b), @@ -520,7 +517,7 @@ def get_minmax_kernel(ctx, what, dtype): return ReductionKernel(ctx, dtype, neutral=get_minmax_neutral(what, dtype), reduce_expr="%(reduce_expr)s" % {"reduce_expr": reduce_expr}, - arguments="__global const %(tp)s *in" % { + arguments="const %(tp)s *in" % { "tp": dtype_to_ctype(dtype), }, preamble="#define MY_INFINITY (1./0)") @@ -541,8 +538,8 @@ def get_subset_minmax_kernel(ctx, what, dtype, dtype_subset): reduce_expr="%(reduce_expr)s" % {"reduce_expr": reduce_expr}, map_expr="in[lookup_tbl[i]]", arguments= - "__global const %(tp_lut)s *lookup_tbl, " - "__global const %(tp)s *in" % { + "const %(tp_lut)s *lookup_tbl, " + "const %(tp)s *in" % { "tp": dtype_to_ctype(dtype), "tp_lut": dtype_to_ctype(dtype_subset), }, preamble="#define MY_INFINITY (1./0)") diff --git a/pyopencl/scan.py b/pyopencl/scan.py index 6d22f0c78591fa8f0b3c9a348c045178b1d64e0a..17d2de3f213edfb74f20fd001dc34496490fa05e 100644 --- a/pyopencl/scan.py +++ b/pyopencl/scan.py @@ -34,7 +34,8 @@ import numpy as np import pyopencl as cl import pyopencl.array from pyopencl.tools import (dtype_to_ctype, bitlog2, - KernelTemplateBase, _process_code_for_macro) + KernelTemplateBase, _process_code_for_macro, + get_arg_list_scalar_arg_dtypes) import pyopencl._mymako as mako from pyopencl._cluda import CLUDA_PREAMBLE @@ -144,7 +145,7 @@ void ${name_prefix}_scan_intervals( ) { // index K in first dimension used for carry storage - %if scan_dtype.itemsize > 4 and scan_dtype.itemsize % 8 == 0: + %if scan_dtype.itemsize > 4 and scan_dtype.itemsize % 8 == 0 and is_gpu: // Avoid bank conflicts by adding a single 32-bit value to the size of // the scan type. struct __attribute__ ((__packed__)) wrapped_scan_type @@ -500,8 +501,9 @@ void ${name_prefix}_scan_intervals( // {{{ write data - // work hard with index math to achieve contiguous 32-bit stores + %if is_gpu: { + // work hard with index math to achieve contiguous 32-bit stores __global int *dest = (__global int *) (partial_scan_buffer + unit_base); <% @@ -539,6 +541,21 @@ void ${name_prefix}_scan_intervals( } %endfor } + %else: + for (index_type k = 0; k < K; k++) + { + const index_type offset = k*WG_SIZE + LID_0; + + %if is_tail: + if (unit_base + offset < interval_end) + %endif + { + pycl_printf(("write: %d\n", unit_base + offset)); + partial_scan_buffer[unit_base + offset] = + ldata[offset % K][offset / K].value; + } + } + %endif pycl_printf(("after write\n")); @@ -714,31 +731,6 @@ def _round_down_to_power_of_2(val): assert result <= val return result -def _parse_args(arguments): - if isinstance(arguments, str): - arguments = arguments.split(",") - - def parse_single_arg(obj): - if isinstance(obj, str): - from pyopencl.tools import parse_c_arg - return parse_c_arg(obj) - else: - return obj - - return [parse_single_arg(arg) for arg in arguments] - -def _get_scalar_arg_dtypes(arg_types): - result = [] - - from pyopencl.tools import ScalarArg - for arg_type in arg_types: - if isinstance(arg_type, ScalarArg): - result.append(arg_type.dtype) - else: - result.append(None) - - return result - _PREFIX_WORDS = set(""" ldata partial_scan_buffer global scan_offset segment_start_in_k_group carry @@ -756,7 +748,8 @@ _PREFIX_WORDS = set(""" first_seg_start_in_interval g_segment_start_flags group_base seg_end my_val DEBUG ARGS ints_to_store ints_per_wg scan_types_per_int linear_index - linear_scan_data_idx dest src store_base + linear_scan_data_idx dest src store_base wrapped_scan_type + dummy LID_2 LID_1 LID_0 LDIM_0 LDIM_1 LDIM_2 @@ -765,11 +758,11 @@ _PREFIX_WORDS = set(""" """.split()) _IGNORED_WORDS = set(""" - 4 32 + 4 8 32 typedef for endfor if void while endwhile endfor endif else const printf None return bool n char true false ifdef pycl_printf str xrange assert - np iinfo max itemsize + np iinfo max itemsize __packed__ struct set iteritems len setdefault @@ -802,7 +795,7 @@ _IGNORED_WORDS = set(""" branch workgroup complicated granularity phase remainder than simpler We smaller look ifs lots self behind allow barriers whole loop after already Observe achieve contiguous stores hard go with by math - size won t way divisible bit so + size won t way divisible bit so Avoid declare adding single type is_tail is_first_level input_expr argument_signature preamble double_support neutral output_statement @@ -813,6 +806,7 @@ _IGNORED_WORDS = set(""" update_loop_lookbehind update_loop_plain update_loop use_lookbehind_update store_segment_start_flags update_loop first_seg scan_dtype dtype_to_ctype + is_gpu a b prev_item i last_item prev_value N NO_SEG_BOUNDARY across_seg_boundary @@ -949,7 +943,7 @@ class _GenericScanKernelBase(object): from warnings import warn warn("not specifying 'neutral' is deprecated and will lead to " "wrong results if your scan is not in-place or your " - "'output_statement' otherwise does something non-trivial", + "'output_statement' does something otherwise non-trivial", stacklevel=2) if dtype.itemsize % 4 != 0: @@ -964,7 +958,8 @@ class _GenericScanKernelBase(object): self.devices = devices self.options = options - self.parsed_args = _parse_args(arguments) + from pyopencl.tools import parse_arg_list + self.parsed_args = parse_arg_list(arguments) from pyopencl.tools import VectorArg self.first_array_idx = [ i for i, arg in enumerate(self.parsed_args) @@ -1010,6 +1005,7 @@ class _GenericScanKernelBase(object): arg_ctypes=arg_ctypes, scan_expr=_process_code_for_macro(scan_expr), neutral=_process_code_for_macro(neutral), + is_gpu=self.devices[0].type == cl.device_type.GPU, double_support=all( has_double_support(dev) for dev in devices), ) @@ -1033,30 +1029,30 @@ class GenericScanKernel(_GenericScanKernelBase): trip_count = 0 + avail_local_mem = min( + dev.local_mem_size + for dev in self.devices) + if self.devices[0].type == cl.device_type.CPU: # (about the widest vector a CPU can support, also taking # into account that CPUs don't hide latency by large work groups max_scan_wg_size = 16 - wg_size_multiples = 16 + wg_size_multiples = 4 else: max_scan_wg_size = min(dev.max_work_group_size for dev in self.devices) wg_size_multiples = 64 - avail_local_mem = min( - dev.local_mem_size - for dev in self.devices) - # k_group_size should be a power of two because of in-kernel # division by that number. solutions = [] - for k_exp in range(0, 7): + for k_exp in range(0, 9): for wg_size in range(wg_size_multiples, max_scan_wg_size+1, wg_size_multiples): k_group_size = 2**k_exp - if (self.get_local_mem_use( - wg_size, k_group_size) + 256 <= avail_local_mem): + lmem_use = self.get_local_mem_use(wg_size, k_group_size) + if lmem_use + 256 <= avail_local_mem: solutions.append((wg_size*k_group_size, k_group_size, wg_size)) if self.devices[0].type == cl.device_type.GPU: @@ -1164,7 +1160,7 @@ class GenericScanKernel(_GenericScanKernelBase): final_update_prg, self.name_prefix+"_final_update") update_scalar_arg_dtypes = ( - _get_scalar_arg_dtypes(self.parsed_args) + get_arg_list_scalar_arg_dtypes(self.parsed_args) + [self.index_dtype, self.index_dtype, None, None]) if self.is_segmented: update_scalar_arg_dtypes.append(None) # g_first_segment_start_in_interval @@ -1204,7 +1200,7 @@ class GenericScanKernel(_GenericScanKernelBase): def build_scan_kernel(self, max_wg_size, arguments, input_expr, is_segment_start_expr, input_fetch_exprs, is_first_level, store_segment_start_flags, k_group_size): - scalar_arg_dtypes = _get_scalar_arg_dtypes(arguments) + scalar_arg_dtypes = get_arg_list_scalar_arg_dtypes(arguments) # Empirically found on Nv hardware: no need to be bigger than this size wg_size = _round_down_to_power_of_2( @@ -1418,7 +1414,7 @@ class GenericDebugScanKernel(_GenericScanKernelBase): self.kernel = getattr( scan_prg, self.name_prefix+"_debug_scan") scalar_arg_dtypes = ( - _get_scalar_arg_dtypes(self.parsed_args) + get_arg_list_scalar_arg_dtypes(self.parsed_args) + [self.index_dtype]) self.kernel.set_scalar_arg_dtypes(scalar_arg_dtypes) diff --git a/pyopencl/tools.py b/pyopencl/tools.py index 22b782e61ef8b48ba989957f172b3cedc08bd12e..6a8bf4e78be3308afca5029d5f832289cece1f4e 100644 --- a/pyopencl/tools.py +++ b/pyopencl/tools.py @@ -236,7 +236,10 @@ def pytest_generate_tests_for_pyopencl(metafunc): # {{{ C argument lists -class Argument: +class Argument(object): + pass + +class DtypedArgument(Argument): def __init__(self, dtype, name): self.dtype = np.dtype(dtype) self.name = name @@ -247,27 +250,67 @@ class Argument: self.name, self.dtype) -class VectorArg(Argument): +class VectorArg(DtypedArgument): def declarator(self): return "__global %s *%s" % (dtype_to_ctype(self.dtype), self.name) -class ScalarArg(Argument): +class ScalarArg(DtypedArgument): def declarator(self): return "%s %s" % (dtype_to_ctype(self.dtype), self.name) +class OtherArg(Argument): + def __init__(self, declarator, name): + self.decl = declarator + self.name = name + + def declarator(self): + return self.decl + def parse_c_arg(c_arg): - c_arg = (c_arg - .replace("__global", "") - .replace("__local", "") - .replace("__constant", "")) + for aspace in ["__local", "__constant"]: + if aspace in c_arg: + raise RuntimeError("cannot deal with local or constant " + "OpenCL address spaces in C argument lists ") + + c_arg = c_arg.replace("__global", "") from pyopencl.compyte.dtypes import parse_c_arg_backend return parse_c_arg_backend(c_arg, ScalarArg, VectorArg) +def parse_arg_list(arguments): + """Parse a list of kernel arguments. *arguments* may be a comma-separate list + of C declarators in a string, a list of strings representing C declarators, + or :class:`Argument` objects. + """ + + if isinstance(arguments, str): + arguments = arguments.split(",") + + def parse_single_arg(obj): + if isinstance(obj, str): + from pyopencl.tools import parse_c_arg + return parse_c_arg(obj) + else: + return obj + + return [parse_single_arg(arg) for arg in arguments] + +def get_arg_list_scalar_arg_dtypes(arg_types): + result = [] + + from pyopencl.tools import ScalarArg + for arg_type in arg_types: + if isinstance(arg_type, ScalarArg): + result.append(arg_type.dtype) + else: + result.append(None) + + return result + # }}} diff --git a/src/wrapper/wrap_cl_part_2.cpp b/src/wrapper/wrap_cl_part_2.cpp index cc7942c8b6af64cdb5aaf9fe44419c46f3185ae1..c48ee7aef39c50bff349caa918fab64d90617d1c 100644 --- a/src/wrapper/wrap_cl_part_2.cpp +++ b/src/wrapper/wrap_cl_part_2.cpp @@ -151,13 +151,15 @@ void pyopencl_expose_part_2() py::def("enqueue_map_buffer", enqueue_map_buffer, (py::args("queue", "buf", "flags", "offset", - "shape", "dtype", "order"), + "shape", "dtype"), + py::arg("order")="C", py::arg("wait_for")=py::object(), py::arg("is_blocking")=true)); py::def("enqueue_map_image", enqueue_map_image, (py::args("queue", "img", "flags", "origin", "region", - "shape", "dtype", "order"), + "shape", "dtype"), + py::arg("order")="C", py::arg("wait_for")=py::object(), py::arg("is_blocking")=true)); diff --git a/test/test_array.py b/test/test_array.py index 57ded681ce86f16554a7ca826b4d68b26b51290a..fbc11cf95791b84c90989fe6b80314fd335f69a3 100644 --- a/test/test_array.py +++ b/test/test_array.py @@ -1108,6 +1108,28 @@ def test_sort(ctx_factory): 1e-6*n/dev_elapsed, 1e-6*n/numpy_elapsed, numpy_elapsed/dev_elapsed)) assert (a_dev_sorted.get() == a_sorted).all() +@pytools.test.mark_test.opencl +def test_list_builder(ctx_factory): + context = ctx_factory() + queue = cl.CommandQueue(context) + + from pyopencl.algorithm import ListOfListsBuilder + builder = ListOfListsBuilder(context, [("mylist", np.int32)], """//CL// + void generate(USER_ARG_DECL LIST_ARG_DECL index_type i) + { + int count = i % 4; + for (int j = 0; j < count; ++j) + { + APPEND_mylist(count); + } + } + """, arg_decls=[]) + + result = builder(queue, 2000) + + inf = result["mylist"] + assert inf.count == 3000 + assert (inf.list.get()[-6:] == [1, 2, 2, 3, 3, 3]).all() # }}} @@ -1171,7 +1193,7 @@ def test_nan_arithmetic(ctx_factory): def test_mem_pool_with_arrays(ctx_factory): context = ctx_factory() queue = cl.CommandQueue(context) - mem_pool = cl_tools.MemoryPool(cl_tools.CLAllocator(context)) + mem_pool = cl_tools.MemoryPool(cl_tools.ImmediateAllocator(queue)) a_dev = cl_array.arange(queue, 2000, dtype=np.float32, allocator=mem_pool) b_dev = cl_array.to_device(queue, np.arange(2000), allocator=mem_pool) + 4000