diff --git a/.gitmodules b/.gitmodules
index 5e351df4f2432947d16f3137a38bd029e86c1372..0d69f48bebd0c79dfcf38c2228d61e11e65629e0 100644
--- a/.gitmodules
+++ b/.gitmodules
@@ -1,6 +1,6 @@
 [submodule "bpl-subset"]
 	path = bpl-subset
-	url = git://github.com/inducer/bpl-subset.git
+	url = ../bpl-subset.git
 [submodule "pyopencl/compyte"]
 	path = pyopencl/compyte
-	url = git://github.com/inducer/compyte.git
+	url = ../compyte.git
diff --git a/doc/source/algorithm.rst b/doc/source/algorithm.rst
new file mode 100644
index 0000000000000000000000000000000000000000..94758f7129d7e302e64e6251c645f99b791ed95a
--- /dev/null
+++ b/doc/source/algorithm.rst
@@ -0,0 +1,281 @@
+Parallel Algorithms
+===================
+
+Element-wise expression evalution ("map")
+-----------------------------------------
+
+.. module:: pyopencl.elementwise
+
+Evaluating involved expressions on :class:`pyopencl.array.Array` instances by
+using overloaded operators can be somewhat inefficient, because a new temporary
+is created for each intermediate result. The functionality in the module
+:mod:`pyopencl.elementwise` contains tools to help generate kernels that
+evaluate multi-stage expressions on one or several operands in a single pass.
+
+.. class:: ElementwiseKernel(context, arguments, operation, name="kernel", preamble="", options=[])
+
+    Generate a kernel that takes a number of scalar or vector *arguments*
+    and performs the scalar *operation* on each entry of its arguments, if that
+    argument is a vector.
+
+    *arguments* is specified as a string formatted as a C argument list.
+    *operation* is specified as a C assignment statement, without a semicolon.
+    Vectors in *operation* should be indexed by the variable *i*.
+
+    *name* specifies the name as which the kernel is compiled, 
+    and *options* are passed unmodified to :meth:`pyopencl.Program.build`.
+
+    *preamble* is a piece of C source code that gets inserted outside of the
+    function context in the elementwise operation's kernel source code.
+
+    .. method:: __call__(*args, wait_for=None)
+
+        Invoke the generated scalar kernel. The arguments may either be scalars or
+        :class:`GPUArray` instances.
+
+Here's a usage example::
+
+    import pyopencl as cl
+    import pyopencl.array as cl_array
+    import numpy
+
+    ctx = cl.create_some_context()
+    queue = cl.CommandQueue(ctx)
+
+    n = 10
+    a_gpu = cl_array.to_device(
+            ctx, queue, numpy.random.randn(n).astype(numpy.float32))
+    b_gpu = cl_array.to_device(
+            ctx, queue, numpy.random.randn(n).astype(numpy.float32))
+
+    from pyopencl.elementwise import ElementwiseKernel
+    lin_comb = ElementwiseKernel(ctx,
+            "float a, float *x, "
+            "float b, float *y, "
+            "float *z",
+            "z[i] = a*x[i] + b*y[i]",
+            "linear_combination")
+
+    c_gpu = cl_array.empty_like(a_gpu)
+    lin_comb(5, a_gpu, 6, b_gpu, c_gpu)
+
+    import numpy.linalg as la
+    assert la.norm((c_gpu - (5*a_gpu+6*b_gpu)).get()) < 1e-5
+
+(You can find this example as :file:`examples/demo_elementwise.py` in the PyOpenCL
+distribution.)
+
+.. _custom-reductions:
+
+Sums and counts ("reduce")
+--------------------------
+
+.. module:: pyopencl.reduction
+
+.. class:: ReductionKernel(ctx, dtype_out, neutral, reduce_expr, map_expr=None, arguments=None, name="reduce_kernel", options=[], preamble="")
+
+    Generate a kernel that takes a number of scalar or vector *arguments*
+    (at least one vector argument), performs the *map_expr* on each entry of
+    the vector argument and then the *reduce_expr* on the outcome of that.
+    *neutral* serves as an initial value. *preamble* offers the possibility
+    to add preprocessor directives and other code (such as helper functions)
+    to be added before the actual reduction kernel code.
+
+    Vectors in *map_expr* should be indexed by the variable *i*. *reduce_expr*
+    uses the formal values "a" and "b" to indicate two operands of a binary
+    reduction operation. If you do not specify a *map_expr*, "in[i]" -- and
+    therefore the presence of only one input argument -- is automatically
+    assumed.
+
+    *dtype_out* specifies the :class:`numpy.dtype` in which the reduction is
+    performed and in which the result is returned. *neutral* is specified as
+    float or integer formatted as string. *reduce_expr* and *map_expr* are
+    specified as string formatted operations and *arguments* is specified as a
+    string formatted as a C argument list. *name* specifies the name as which
+    the kernel is compiled. *options* are passed unmodified to
+    :meth:`pyopencl.Program.build`. *preamble* specifies a string of code that
+    is inserted before the actual kernels.
+
+    .. method:: __call__(*args, queue=None)
+
+    .. versionadded: 2011.1
+
+Here's a usage example::
+
+    a = pyopencl.array.arange(queue, 400, dtype=numpy.float32)
+    b = pyopencl.array.arange(queue, 400, dtype=numpy.float32)
+
+    krnl = ReductionKernel(ctx, numpy.float32, neutral="0",
+            reduce_expr="a+b", map_expr="x[i]*y[i]",
+            arguments="__global float *x, __global float *y")
+
+    my_dot_prod = krnl(a, b).get()
+
+.. _custom-scan:
+
+Prefix Sums ("scan")
+--------------------
+
+.. module:: pyopencl.scan
+
+.. |scan_extra_args| replace:: a list of tuples *(name, value)* specifying
+    extra arguments to pass to the scan procedure. *value* must be :mod:`numpy`
+    sized type.
+.. |preamble| replace:: A snippet of C that is inserted into the compiled kernel
+    before the actual kernel function. May be used for, e.g. type definitions
+    or include statements.
+
+A prefix sum is a running sum of an array, as provided by
+e.g. :mod:`numpy.cumsum`::
+
+    >>> import numpy as np
+    >>> a = [1,1,1,1,1,2,2,2,2,2]
+    >>> np.cumsum(a)
+    array([ 1,  2,  3,  4,  5,  7,  9, 11, 13, 15])
+
+This is a very simple example of what a scan can do. It turns out that scans
+are significantly more versatile. They are a basic building block of many
+non-trivial parallel algorithms. Many of the operations enabled by scans seem
+difficult to parallelize because of loop-carried dependencies.
+
+.. seealso::
+
+    `Prefix sums and their applications <http://citeseerx.ist.psu.edu/viewdoc/summary?doi=10.1.1.128.6230>`_, by Guy Blelloch.
+        This article gives an overview of some surprising applications of scans.
+
+    :ref:`predefined-scans`
+        These operations built into PyOpenCL are realized using :class:`GenericScanKernel`.
+
+Usage Example
+^^^^^^^^^^^^^
+
+This example illustrates the implementation of a simplified version of :func:`copy_if`,
+which copies integers from an array into the (variable-size) output if they are
+greater than 300::
+
+    knl = GenericScanKernel(
+            ctx, np.int32,
+            arguments="__global int *ary, __global int *out",
+            input_expr="(ary[i] > 300) ? 1 : 0",
+            scan_expr="a+b", neutral="0",
+            output_statement="""
+                if (prev_item != item) out[item-1] = ary[i];
+                """)
+
+    out = a.copy()
+    knl(a, out)
+
+    a_host = a.get()
+    out_host = a_host[a_host > 300]
+
+    assert (out_host == out.get()[:len(out_host)]).all()
+
+The value being scanned over is a number of flags indicating whether each array
+element is greater than 300. These flags are computed by *input_expr*. The
+prefix sum over this array gives a running count of array items greater than
+300. The *output_statement* the compares `prev_item` (the previous item's scan
+result, i.e. index) to `item` (the current item's scan result, i.e.
+index). If they differ, i.e. if the predicate was satisfied at this
+position, then the item is stored in the output at the computed index.
+
+This example does not make use of the following advanced features also available
+in PyOpenCL:
+
+* Segmented scans
+
+* Access to the previous item in *input_expr* (e.g. for comparisons)
+  See the `implementation <https://github.com/inducer/pyopencl/blob/master/pyopencl/scan.py#L1353>`_ of :func:`unique` for an example.
+
+Making Custom Scan Kernels
+^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+.. versionadded: 2012.2
+
+.. autoclass:: GenericScanKernel
+
+    .. method:: __call__(*args, allocator=None, queue=None)
+
+        *queue* and *allocator* default to the ones provided on the first
+        :class:`pyopencl.array.Array` in *args*.
+
+Debugging aids
+~~~~~~~~~~~~~~
+
+.. class:: GenericDebugScanKernel
+
+    Performs the same function and has the same interface as
+    :class:`GenericScanKernel`, but uses a dead-simple, sequential scan.  Works
+    best on CPU platforms, and helps isolate bugs in scans by removing the
+    potential for issues originating in parallel execution.
+
+.. _predefined-scans:
+
+Simple / Legacy Interface
+^^^^^^^^^^^^^^^^^^^^^^^^^
+
+.. class:: ExclusiveScanKernel(ctx, dtype, scan_expr, neutral, name_prefix="scan", options=[], preamble="", devices=None)
+
+    Generates a kernel that can compute a `prefix sum <https://secure.wikimedia.org/wikipedia/en/wiki/Prefix_sum>`_
+    using any associative operation given as *scan_expr*.
+    *scan_expr* uses the formal values "a" and "b" to indicate two operands of
+    an associative binary operation. *neutral* is the neutral element
+    of *scan_expr*, obeying *scan_expr(a, neutral) == a*.
+
+    *dtype* specifies the type of the arrays being operated on. 
+    *name_prefix* is used for kernel names to ensure recognizability
+    in profiles and logs. *options* is a list of compiler options to use
+    when building. *preamble* specifies a string of code that is
+    inserted before the actual kernels. *devices* may be used to restrict
+    the set of devices on which the kernel is meant to run. (defaults
+    to all devices in the context *ctx*.
+
+    .. method:: __call__(self, input_ary, output_ary=None, allocator=None, queue=None)
+
+.. class:: InclusiveScanKernel(dtype, scan_expr, neutral=None, name_prefix="scan", options=[], preamble="", devices=None)
+
+    Works like :class:`ExclusiveScanKernel`.
+
+    .. versionchanged:: 2012.2
+        *neutral* is now always required.
+
+For the array `[1,2,3]`, inclusive scan results in `[1,3,6]`, and exclusive
+scan results in `[0,1,3]`.
+
+Here's a usage example::
+
+    knl = InclusiveScanKernel(context, np.int32, "a+b")
+
+    n = 2**20-2**18+5
+    host_data = np.random.randint(0, 10, n).astype(np.int32)
+    dev_data = cl_array.to_device(queue, host_data)
+
+    knl(dev_data)
+    assert (dev_data.get() == np.cumsum(host_data, axis=0)).all()
+
+Predicated copies ("partition", "unique", ...)
+----------------------------------------------
+
+.. module:: pyopencl.algorithm
+
+.. autofunction:: copy_if
+
+.. autofunction:: remove_if
+
+.. autofunction:: partition
+
+.. autofunction:: unique
+
+Sorting (radix sort)
+--------------------
+
+.. autoclass:: RadixSort
+
+    .. automethod:: __call__
+
+Building many variable-size lists
+---------------------------------
+
+.. autoclass:: ListOfListsBuilder
+
+    .. automethod:: __call__
+
diff --git a/doc/source/array.rst b/doc/source/array.rst
index 752a1b8f22f0958d7ba929789c5b16dcac1612a0..6a4eda2e04dfd2990fa38837508b0e98fc68ccd0 100644
--- a/doc/source/array.rst
+++ b/doc/source/array.rst
@@ -552,278 +552,6 @@ generator for scientific purposes, please consider citing them:
   number generator of Lüscher, `Computer Physics Communications 79 (1994) 111-114
   <http://dx.doi.org/10.1016/0010-4655(94)90233-X>`_
 
-
-Single-pass Custom Expression Evaluation
-----------------------------------------
-
-.. module:: pyopencl.elementwise
-
-Evaluating involved expressions on :class:`pyopencl.array.Array` instances can be
-somewhat inefficient, because a new temporary is created for each
-intermediate result. The functionality in the module :mod:`pyopencl.elementwise`
-contains tools to help generate kernels that evaluate multi-stage expressions
-on one or several operands in a single pass.
-
-.. class:: ElementwiseKernel(context, arguments, operation, name="kernel", preamble="", options=[])
-
-    Generate a kernel that takes a number of scalar or vector *arguments*
-    and performs the scalar *operation* on each entry of its arguments, if that
-    argument is a vector.
-
-    *arguments* is specified as a string formatted as a C argument list.
-    *operation* is specified as a C assignment statement, without a semicolon.
-    Vectors in *operation* should be indexed by the variable *i*.
-
-    *name* specifies the name as which the kernel is compiled, 
-    and *options* are passed unmodified to :meth:`pyopencl.Program.build`.
-
-    *preamble* is a piece of C source code that gets inserted outside of the
-    function context in the elementwise operation's kernel source code.
-
-    .. method:: __call__(*args, wait_for=None)
-
-        Invoke the generated scalar kernel. The arguments may either be scalars or
-        :class:`GPUArray` instances.
-
-Here's a usage example::
-
-    import pyopencl as cl
-    import pyopencl.array as cl_array
-    import numpy
-
-    ctx = cl.create_some_context()
-    queue = cl.CommandQueue(ctx)
-
-    n = 10
-    a_gpu = cl_array.to_device(
-            ctx, queue, numpy.random.randn(n).astype(numpy.float32))
-    b_gpu = cl_array.to_device(
-            ctx, queue, numpy.random.randn(n).astype(numpy.float32))
-
-    from pyopencl.elementwise import ElementwiseKernel
-    lin_comb = ElementwiseKernel(ctx,
-            "float a, float *x, "
-            "float b, float *y, "
-            "float *z",
-            "z[i] = a*x[i] + b*y[i]",
-            "linear_combination")
-
-    c_gpu = cl_array.empty_like(a_gpu)
-    lin_comb(5, a_gpu, 6, b_gpu, c_gpu)
-
-    import numpy.linalg as la
-    assert la.norm((c_gpu - (5*a_gpu+6*b_gpu)).get()) < 1e-5
-
-(You can find this example as :file:`examples/demo_elementwise.py` in the PyOpenCL
-distribution.)
-
-.. _custom-reductions:
-
-Custom Reductions
------------------
-
-.. module:: pyopencl.reduction
-
-.. class:: ReductionKernel(ctx, dtype_out, neutral, reduce_expr, map_expr=None, arguments=None, name="reduce_kernel", options=[], preamble="")
-
-    Generate a kernel that takes a number of scalar or vector *arguments*
-    (at least one vector argument), performs the *map_expr* on each entry of
-    the vector argument and then the *reduce_expr* on the outcome of that.
-    *neutral* serves as an initial value. *preamble* offers the possibility
-    to add preprocessor directives and other code (such as helper functions)
-    to be added before the actual reduction kernel code.
-
-    Vectors in *map_expr* should be indexed by the variable *i*. *reduce_expr*
-    uses the formal values "a" and "b" to indicate two operands of a binary
-    reduction operation. If you do not specify a *map_expr*, "in[i]" -- and
-    therefore the presence of only one input argument -- is automatically
-    assumed.
-
-    *dtype_out* specifies the :class:`numpy.dtype` in which the reduction is
-    performed and in which the result is returned. *neutral* is specified as
-    float or integer formatted as string. *reduce_expr* and *map_expr* are
-    specified as string formatted operations and *arguments* is specified as a
-    string formatted as a C argument list. *name* specifies the name as which
-    the kernel is compiled. *options* are passed unmodified to
-    :meth:`pyopencl.Program.build`. *preamble* specifies a string of code that
-    is inserted before the actual kernels.
-
-    .. method:: __call__(*args, queue=None)
-
-    .. versionadded: 2011.1
-
-Here's a usage example::
-
-    a = pyopencl.array.arange(queue, 400, dtype=numpy.float32)
-    b = pyopencl.array.arange(queue, 400, dtype=numpy.float32)
-
-    krnl = ReductionKernel(ctx, numpy.float32, neutral="0",
-            reduce_expr="a+b", map_expr="x[i]*y[i]",
-            arguments="__global float *x, __global float *y")
-
-    my_dot_prod = krnl(a, b).get()
-
-.. _custom-scan:
-
-Parallel Scan / Prefix Sum
---------------------------
-
-.. module:: pyopencl.scan
-
-.. |scan_extra_args| replace:: a list of tuples *(name, value)* specifying
-    extra arguments to pass to the scan procedure. *value* must be :mod:`numpy`
-    sized type.
-.. |preamble| replace:: A snippet of C that is inserted into the compiled kernel
-    before the actual kernel function. May be used for, e.g. type definitions
-    or include statements.
-
-A prefix sum is a running sum of an array, as provided by
-e.g. :mod:`numpy.cumsum`::
-
-    >>> import numpy as np
-    >>> a = [1,1,1,1,1,2,2,2,2,2]
-    >>> np.cumsum(a)
-    array([ 1,  2,  3,  4,  5,  7,  9, 11, 13, 15])
-
-This is a very simple example of what a scan can do. It turns out that scans
-are significantly more versatile. They are a basic building block of many
-non-trivial parallel algorithms. Many of the operations enabled by scans seem
-difficult to parallelize because of loop-carried dependencies.
-
-.. seealso::
-
-    `Prefix sums and their applications <http://citeseerx.ist.psu.edu/viewdoc/summary?doi=10.1.1.128.6230>`_, by Guy Blelloch.
-        This article gives an overview of some surprising applications of scans.
-
-    :ref:`predefined-scans`
-        These operations built into PyOpenCL are realized using :class:`GenericScanKernel`.
-
-Usage Example
-^^^^^^^^^^^^^
-
-This example illustrates the implementation of a simplified version of :func:`copy_if`,
-which copies integers from an array into the (variable-size) output if they are
-greater than 300::
-
-    knl = GenericScanKernel(
-            ctx, np.int32,
-            arguments="__global int *ary, __global int *out",
-            input_expr="(ary[i] > 300) ? 1 : 0",
-            scan_expr="a+b", neutral="0",
-            output_statement="""
-                if (prev_item != item) out[item-1] = ary[i];
-                """)
-
-    out = a.copy()
-    knl(a, out)
-
-    a_host = a.get()
-    out_host = a_host[a_host > 300]
-
-    assert (out_host == out.get()[:len(out_host)]).all()
-
-The value being scanned over is a number of flags indicating whether each array
-element is greater than 300. These flags are computed by *input_expr*. The
-prefix sum over this array gives a running count of array items greater than
-300. The *output_statement* the compares `prev_item` (the previous item's scan
-result, i.e. index) to `item` (the current item's scan result, i.e.
-index). If they differ, i.e. if the predicate was satisfied at this
-position, then the item is stored in the output at the computed index.
-
-This example does not make use of the following advanced features also available
-in PyOpenCL:
-
-* Segmented scans
-
-* Access to the previous item in *input_expr* (e.g. for comparisons)
-  See the `implementation <https://github.com/inducer/pyopencl/blob/master/pyopencl/scan.py#L1353>`_ of :func:`unique` for an example.
-
-Making Custom Scan Kernels
-^^^^^^^^^^^^^^^^^^^^^^^^^^
-
-.. versionadded: 2012.2
-
-.. autoclass:: GenericScanKernel
-
-    .. method:: __call__(*args, allocator=None, queue=None)
-
-        *queue* and *allocator* default to the ones provided on the first
-        :class:`pyopencl.array.Array` in *args*.
-
-Debugging aids
-~~~~~~~~~~~~~~
-
-.. class:: GenericDebugScanKernel
-
-    Performs the same function and has the same interface as
-    :class:`GenericScanKernel`, but uses a dead-simple, sequential scan.  Works
-    best on CPU platforms, and helps isolate bugs in scans by removing the
-    potential for issues originating in parallel execution.
-
-.. _predefined-scans:
-
-Simple / Legacy Interface
-^^^^^^^^^^^^^^^^^^^^^^^^^
-
-.. class:: ExclusiveScanKernel(ctx, dtype, scan_expr, neutral, name_prefix="scan", options=[], preamble="", devices=None)
-
-    Generates a kernel that can compute a `prefix sum <https://secure.wikimedia.org/wikipedia/en/wiki/Prefix_sum>`_
-    using any associative operation given as *scan_expr*.
-    *scan_expr* uses the formal values "a" and "b" to indicate two operands of
-    an associative binary operation. *neutral* is the neutral element
-    of *scan_expr*, obeying *scan_expr(a, neutral) == a*.
-
-    *dtype* specifies the type of the arrays being operated on. 
-    *name_prefix* is used for kernel names to ensure recognizability
-    in profiles and logs. *options* is a list of compiler options to use
-    when building. *preamble* specifies a string of code that is
-    inserted before the actual kernels. *devices* may be used to restrict
-    the set of devices on which the kernel is meant to run. (defaults
-    to all devices in the context *ctx*.
-
-    .. method:: __call__(self, input_ary, output_ary=None, allocator=None, queue=None)
-
-.. class:: InclusiveScanKernel(dtype, scan_expr, neutral=None, name_prefix="scan", options=[], preamble="", devices=None)
-
-    Works like :class:`ExclusiveScanKernel`.
-
-    .. versionchanged:: 2012.2
-        *neutral* is now always required.
-
-For the array `[1,2,3]`, inclusive scan results in `[1,3,6]`, and exclusive
-scan results in `[0,1,3]`.
-
-Here's a usage example::
-
-    knl = InclusiveScanKernel(context, np.int32, "a+b")
-
-    n = 2**20-2**18+5
-    host_data = np.random.randint(0, 10, n).astype(np.int32)
-    dev_data = cl_array.to_device(queue, host_data)
-
-    knl(dev_data)
-    assert (dev_data.get() == np.cumsum(host_data, axis=0)).all()
-
-Higher-level algorithms
------------------------
-
-.. module:: pyopencl.algorithm
-
-.. autofunction:: copy_if
-
-.. autofunction:: remove_if
-
-.. autofunction:: partition
-
-.. autofunction:: unique
-
-Sorting
-^^^^^^^
-
-.. autoclass:: RadixSort
-
-    .. automethod:: __call__
-
 Fast Fourier Transforms
 -----------------------
 
diff --git a/doc/source/conf.py b/doc/source/conf.py
index 1a77c22a21e4ad32339e80cca94d8b55f55f01ec..1554d69588b75de3d8e8fbe6786c5eac1e57560b 100644
--- a/doc/source/conf.py
+++ b/doc/source/conf.py
@@ -168,6 +168,7 @@ latex_documents = [
 intersphinx_mapping = {
         'http://docs.python.org/dev': None,
         'http://docs.scipy.org/doc/numpy/': None,
+        'http://docs.makotemplates.org/en/latest/': None,
         }
 
 autoclass_content = "both"
diff --git a/doc/source/index.rst b/doc/source/index.rst
index 387bd7cd0be922e4b07a8548a56cce1abe133db1..607b9da53cf074ae51af88e40f66c8a722f20f01 100644
--- a/doc/source/index.rst
+++ b/doc/source/index.rst
@@ -74,6 +74,7 @@ is on the web, thanks to Ian Johnson.
 
     runtime
     array
+    algorithm
     tools
     misc
 
diff --git a/doc/source/runtime.rst b/doc/source/runtime.rst
index 4dd2c14f9ebaa8e54a6e4709d8deada6534306b7..547ce01876393cc7650b8fc4520e01fa331aa2cc 100644
--- a/doc/source/runtime.rst
+++ b/doc/source/runtime.rst
@@ -601,7 +601,7 @@ Mapping Memory into Host Address Space
 
     .. method:: release(queue=None, wait_for=None)
 
-.. function:: enqueue_map_buffer(queue, buf, flags, offset, shape, dtype, order, wait_for=None, is_blocking=True)
+.. function:: enqueue_map_buffer(queue, buf, flags, offset, shape, dtype, order="C", wait_for=None, is_blocking=True)
 
     |explain-waitfor|
     *shape*, *dtype*, and *order* have the same meaning
@@ -616,7 +616,10 @@ Mapping Memory into Host Address Space
     .. versionchanged:: 2011.1
         *is_blocking* now defaults to True.
 
-.. function:: enqueue_map_image(queue, buf, flags, origin, region, shape, dtype, order, wait_for=None, is_blocking=True)
+    .. versionchanged:: 2012.2
+        *order* now defaults to "C".
+
+.. function:: enqueue_map_image(queue, buf, flags, origin, region, shape, dtype, order="C", wait_for=None, is_blocking=True)
 
     |explain-waitfor|
     *shape*, *dtype*, and *order* have the same meaning
@@ -631,6 +634,8 @@ Mapping Memory into Host Address Space
     .. versionchanged:: 2011.1
         *is_blocking* now defaults to True.
 
+    .. versionchanged:: 2012.2
+        *order* now defaults to "C".
 
 Samplers
 ^^^^^^^^
@@ -830,6 +835,19 @@ Programs and Kernels
         *global_size* and *local_size* also do not have to have the same number
         of dimensions.
 
+        .. note::
+
+            :meth:`__call__` is *not* thread-safe. It sets the arguments using :meth:`set_args`
+            and then runs :func:`enqueue_nd_range_kernel`. Another thread could race it
+            in doing the same things, with undefined outcome. This issue is inherited
+            from the C-level OpenCL API. The recommended solution is to make a kernel
+            (i.e. access `prg.kernel_name`, which corresponds to making a new kernel)
+            for every thread that may enqueue calls to the kernel.
+
+            A solution involving implicit locks was discussed and decided against on the
+            mailing list in `October 2012
+            <http://lists.tiker.net/pipermail/pyopencl/2012-October/001311.html>`_.
+
         .. versionchanged:: 0.92
             *local_size* was promoted to third positional argument from being a
             keyword argument. The old keyword argument usage will continue to
diff --git a/pyopencl/algorithm.py b/pyopencl/algorithm.py
index 4e72a6a5aea38db696edb904c13b65303d96a652..a036c028b89db816a98ab574ea947399a419b851 100644
--- a/pyopencl/algorithm.py
+++ b/pyopencl/algorithm.py
@@ -30,10 +30,10 @@ OTHER DEALINGS IN THE SOFTWARE.
 import numpy as np
 import pyopencl as cl
 import pyopencl.array
-from pyopencl.scan import GenericScanKernel, ScanTemplate
+from pyopencl.scan import ScanTemplate
 from pyopencl.tools import dtype_to_ctype
-from pyopencl.tools import context_dependent_memoize
-from pytools import memoize
+from pytools import memoize, memoize_method, Record
+import pyopencl._mymako as mako
 from mako.template import Template
 
 
@@ -63,6 +63,8 @@ def copy_if(ary, predicate, extra_args=[], queue=None, preamble=""):
     :returns: a tuple *(out, count)* where *out* is the output array and *count*
         is an on-device scalar (fetch to host with `count.get()`) indicating
         how many elements satisfied *predicate*.
+
+    .. versionadded:: 2012.2
     """
     if len(ary) > np.iinfo(np.int32).max:
         scan_dtype = np.int64
@@ -96,6 +98,8 @@ def remove_if(ary, predicate, extra_args=[], queue=None, preamble=""):
     :returns: a tuple *(out, count)* where *out* is the output array and *count*
         is an on-device scalar (fetch to host with `count.get()`) indicating
         how many elements did not satisfy *predicate*.
+
+    .. versionadded:: 2012.2
     """
     return copy_if(ary, "!(%s)" % predicate, extra_args=extra_args, queue=queue,
             preamble=preamble)
@@ -132,6 +136,8 @@ def partition(ary, predicate, extra_args=[], queue=None, preamble=""):
     :returns: a tuple *(out_true, out_false, count)* where *count*
         is an on-device scalar (fetch to host with `count.get()`) indicating
         how many elements satisfied the predicate.
+
+    .. versionadded:: 2012.2
     """
     if len(ary) > np.iinfo(np.uint32).max:
         scan_dtype = np.uint64
@@ -188,6 +194,8 @@ def unique(ary, is_equal_expr="a == b", extra_args=[], queue=None, preamble=""):
     :returns: a tuple *(out, count)* where *out* is the output array and *count*
         is an on-device scalar (fetch to host with `count.get()`) indicating
         how many elements satisfied the predicate.
+
+    .. versionadded:: 2012.2
     """
 
     if len(ary) > np.iinfo(np.uint32).max:
@@ -337,6 +345,8 @@ RADIX_SORT_OUTPUT_STMT_TPL = Template(r"""//CL//
 class RadixSort(object):
     """Provides a general `radix sort <https://en.wikipedia.org/wiki/Radix_sort>`_
     on the compute device.
+
+    .. versionadded:: 2012.2
     """
     def __init__(self, context, arguments, key_expr, sort_arg_names,
             bits_at_a_time=2, index_dtype=np.int32, key_dtype=np.uint32,
@@ -356,8 +366,8 @@ class RadixSort(object):
 
         # {{{ arg processing
 
-        from pyopencl.scan import _parse_args
-        self.arguments = _parse_args(arguments)
+        from pyopencl.tools import parse_arg_list
+        self.arguments = parse_arg_list(arguments)
         del arguments
 
         self.sort_arg_names = sort_arg_names
@@ -485,4 +495,450 @@ class RadixSort(object):
 
 # }}}
 
+# {{{ generic parallel list builder
+
+# {{{ kernel template
+
+_LIST_BUILDER_TEMPLATE = Template("""//CL//
+${preamble}
+
+// {{{ declare helper macros for user interface
+
+typedef ${index_type} index_type;
+
+%if is_count_stage:
+    %for name, dtype in list_names_and_dtypes:
+        %if name in count_sharing:
+            #define APPEND_${name}(value) /* nothing */
+        %else:
+            #define APPEND_${name}(value) ++(*plb_loc_${name}_count);
+        %endif
+    %endfor
+%else:
+    %for name, dtype in list_names_and_dtypes:
+        %if name in count_sharing:
+            #define APPEND_${name}(value) \
+                plb_${name}_list[(*plb_${count_sharing[name]}_index) - 1] = value;
+        %else:
+            #define APPEND_${name}(value) \
+                plb_${name}_list[(*plb_${name}_index)++] = value;
+        %endif
+    %endfor
+%endif
+
+#define LIST_ARG_DECL ${user_list_arg_decl}
+#define LIST_ARGS ${user_list_args}
+#define USER_ARG_DECL ${user_arg_decl}
+#define USER_ARGS ${user_args}
+
+// }}}
+
+${generate_template}
+
+// {{{ kernel entry point
+
+__kernel void ${kernel_name}(${kernel_list_arg_decl} USER_ARG_DECL index_type n)
+{
+    int lid = get_local_id(0);
+    index_type gsize = get_global_size(0);
+    index_type work_group_start = get_local_size(0)*get_group_id(0);
+    for (index_type i = work_group_start + lid; i < n; i += gsize)
+    {
+        %if is_count_stage:
+            %for name, dtype in list_names_and_dtypes:
+                %if name not in count_sharing:
+                    index_type plb_loc_${name}_count = 0;
+                %endif
+            %endfor
+        %else:
+            %for name, dtype in list_names_and_dtypes:
+                %if name not in count_sharing:
+                    index_type plb_${name}_index =
+                        plb_${name}_start_index[i];
+                %endif
+            %endfor
+        %endif
+
+        generate(${kernel_list_arg_values} USER_ARGS i);
+
+        %if is_count_stage:
+            %for name, dtype in list_names_and_dtypes:
+                %if name not in count_sharing:
+                    plb_${name}_count[i] = plb_loc_${name}_count;
+                %endif
+            %endfor
+        %endif
+    }
+}
+
+// }}}
+
+""", strict_undefined=True)
+
+# }}}
+
+def _get_arg_decl(arg_list):
+    result = ""
+    for arg in arg_list:
+        result += arg.declarator() + ", "
+
+    return result
+
+def _get_arg_list(arg_list, prefix=""):
+    result = ""
+    for arg in arg_list:
+        result += prefix + arg.name + ", "
+
+    return result
+
+
+
+class BuiltList(Record):
+    pass
+
+
+
+class ListOfListsBuilder:
+    """Generates and executes code to produce a large number of variable-size
+    lists, simply.
+
+    .. note:: This functionality is provided as a preview. Its interface
+        is subject to change until this notice is removed.
+
+    .. versionadded:: 2012.2
+
+    Here's a usage example::
+
+        from pyopencl.algorithm import ListOfListsBuilder
+        builder = ListOfListsBuilder(context, [("mylist", np.int32)], \"\"\"
+                void generate(USER_ARG_DECL LIST_ARG_DECL index_type i)
+                {
+                    int count = i % 4;
+                    for (int j = 0; j < count; ++j)
+                    {
+                        APPEND_mylist(count);
+                    }
+                }
+                \"\"\", arg_decls=[])
+
+        result = builder(queue, 2000)
+
+        inf = result["mylist"]
+        assert inf.count == 3000
+        assert (inf.list.get()[-6:] == [1, 2, 2, 3, 3, 3]).all()
+
+    The function `generate` above is called once for each "input object".
+    Each input object can then generate zero or more list entries.
+    The number of these input objects is given to :meth:`__call__` as *n_objects*.
+    List entries are generated by calls to `APPEND_<list name>(value)`.
+    Multiple lists may be generated at once.
+
+    """
+    def __init__(self, context, list_names_and_dtypes, generate_template,
+            arg_decls, count_sharing=None, devices=None, name_prefix="plb_build_list",
+            options=[], preamble=""):
+        """
+        :arg context: A :class:`pyopencl.Context`.
+        :arg list_names_and_dtypes: a list of `(name, dtype)` tuples
+            indicating the lists to be built.
+        :arg generate_template: a snippet of C as described below
+        :arg arg_decls: A string of comma-separated C argument declarations.
+        :arg count_sharing: A mapping consisting of `(child, mother)`
+            indicating that `mother` and `child` will always have the
+            same number of indices, and the `APPEND` to `mother`
+            will always happen *before* the `APPEND` to the child.
+        :arg name_prefix: the name prefix to use for the compiled kernels
+        :arg options: OpenCL compilation options for kernels using
+            *generate_template*.
+
+        *generate_template* may use the following C macros/identifiers:
+
+        * `index_type`: expands to C identifier for the index type used
+          for the calculation
+        * `USER_ARG_DECL`: expands to the C declarator for `arg_decls`
+        * `USER_ARGS`: a list of C argument values corresponding to
+          `user_arg_decl`
+        * `LIST_ARG_DECL`: expands to a C argument list representing the
+          data for the output lists. These are escaped prefixed with
+          `"plg_"` so as to not interfere with user-provided names.
+        * `LIST_ARGS`: a list of C argument values corresponding to
+          `LIST_ARG_DECL`
+        * `APPEND_name(entry)`: inserts `entry` into the list `name`.
+          Both arguments are Python strings, the latter representing
+          a valid C expression of the correct dtype.
+
+        All argument-list related macros have a trailing comma included
+        if they are non-empty.
+
+        *generate_template* must supply a function:
+
+        .. code-block:: c
+
+            void generate(USER_ARG_DECL LIST_ARG_DECL index_type i)
+            {
+                APPEND(mylist, 5);
+            }
+
+        Internally, the `kernel_template` is expanded (at least) twice. Once,
+        for a 'counting' stage where the size of all the lists is determined,
+        and a second time, for a 'generation' stage where the lists are
+        actually filled. A `generate` function that has side effects beyond
+        calling `append` is therefore ill-formed.
+        """
+
+        if devices is None:
+            devices = context.devices
+
+        if count_sharing is None:
+            count_sharing = {}
+
+        self.context = context
+        self.devices = devices
+
+        self.list_names_and_dtypes = list_names_and_dtypes
+        self.generate_template = generate_template
+
+        from pyopencl.tools import parse_arg_list
+        self.arg_decls = parse_arg_list(arg_decls)
+
+        self.count_sharing = count_sharing
+
+        self.name_prefix = name_prefix
+        self.preamble = preamble
+        self.options = options
+
+    # {{{ kernel generators
+
+    @memoize_method
+    def get_scan_kernel(self, index_dtype):
+        from pyopencl.scan import GenericScanKernel
+        return GenericScanKernel(
+                self.context, index_dtype,
+                arguments="__global %s *ary" % dtype_to_ctype(index_dtype),
+                input_expr="ary[i]",
+                scan_expr="a+b", neutral="0",
+                output_statement="ary[i+1] = item;",
+                devices=self.devices)
+
+    @memoize_method
+    def get_count_kernel(self, index_dtype):
+        index_ctype = dtype_to_ctype(index_dtype)
+        from pyopencl.tools import VectorArg, OtherArg
+        kernel_list_args = [
+                VectorArg(index_dtype, "plb_%s_count" % name)
+                    for name, dtype in self.list_names_and_dtypes
+                    if name not in self.count_sharing]
+
+        user_list_args = []
+        for name, dtype in self.list_names_and_dtypes:
+            if name in self.count_sharing:
+                continue
+
+            name = "plb_loc_%s_count" % name
+            user_list_args.append(OtherArg("%s *%s" % (
+                index_ctype, name), name))
+
+        kernel_name = self.name_prefix+"_count"
+        src = _LIST_BUILDER_TEMPLATE.render(
+                is_count_stage=True,
+                kernel_name=kernel_name,
+
+                kernel_list_arg_decl=_get_arg_decl(kernel_list_args),
+                kernel_list_arg_values=_get_arg_list(user_list_args, prefix="&"),
+                user_list_arg_decl=_get_arg_decl(user_list_args),
+                user_list_args=_get_arg_list(user_list_args),
+                user_arg_decl=_get_arg_decl(self.arg_decls),
+                user_args=_get_arg_list(self.arg_decls),
+
+                list_names_and_dtypes=self.list_names_and_dtypes,
+                count_sharing=self.count_sharing,
+                name_prefix=self.name_prefix,
+                generate_template=self.generate_template,
+                preamble=self.preamble,
+
+                index_type=index_ctype,
+                )
+
+        src = str(src)
+
+        prg = cl.Program(self.context, src).build(self.options)
+        knl = getattr(prg, kernel_name)
+
+        from pyopencl.tools import get_arg_list_scalar_arg_dtypes
+        knl.set_scalar_arg_dtypes(get_arg_list_scalar_arg_dtypes(
+            kernel_list_args+self.arg_decls) + [index_dtype])
+
+        return knl
+
+    @memoize_method
+    def get_write_kernel(self, index_dtype):
+        index_ctype = dtype_to_ctype(index_dtype)
+        from pyopencl.tools import VectorArg, OtherArg
+        kernel_list_args = []
+        kernel_list_arg_values = ""
+        user_list_args = []
+
+        for name, dtype in self.list_names_and_dtypes:
+            list_name = "plb_%s_list" % name
+            list_arg =  VectorArg(dtype, list_name)
+
+            kernel_list_args.append(list_arg)
+
+            if name in self.count_sharing:
+                continue
+
+            kernel_list_args.append(
+                    VectorArg(index_dtype, "plb_%s_start_index" % name))
+
+            user_list_args.append(list_arg)
+            index_name = "plb_%s_index" % name
+            user_list_args.append(OtherArg("%s *%s" % (
+                index_ctype, index_name), index_name))
+
+            kernel_list_arg_values += "%s, &%s, " % (list_name, index_name)
+
+        kernel_name = self.name_prefix+"_write"
+        src = _LIST_BUILDER_TEMPLATE.render(
+                is_count_stage=False,
+                kernel_name=kernel_name,
+
+                kernel_list_arg_decl=_get_arg_decl(kernel_list_args),
+                kernel_list_arg_values=kernel_list_arg_values,
+                user_list_arg_decl=_get_arg_decl(user_list_args),
+                user_list_args=_get_arg_list(user_list_args),
+                user_arg_decl=_get_arg_decl(self.arg_decls),
+                user_args=_get_arg_list(self.arg_decls),
+
+                list_names_and_dtypes=self.list_names_and_dtypes,
+                count_sharing=self.count_sharing,
+                name_prefix=self.name_prefix,
+                generate_template=self.generate_template,
+                preamble=self.preamble,
+
+                index_type=index_ctype,
+                )
+
+        src = str(src)
+
+        prg = cl.Program(self.context, src).build(self.options)
+        knl = getattr(prg, kernel_name)
+
+        from pyopencl.tools import get_arg_list_scalar_arg_dtypes
+        knl.set_scalar_arg_dtypes(get_arg_list_scalar_arg_dtypes(
+            kernel_list_args+self.arg_decls) + [index_dtype])
+
+        return knl
+
+    # }}}
+
+    # {{{ driver
+
+    def __call__(self, queue, n_objects, *args, **kwargs):
+        """
+        :arg args: arguments corresponding to arg_decls in the constructor.
+            :class:`pyopencl.array.Array` are not allowed directly and should
+            be passed as their :attr:`pyopencl.array.Array.data` attribute instead.
+        :arg allocator: optionally, the allocator to use to allocate new
+            arrays.
+        :returns: a mapping from names to objects which have attributes
+
+            * `lists` for the array containing all lists
+            * `starts` for the array of starting indices in `lists`.
+              `starts` is built so that it has n+1 entries, so that
+              the *i*'th entry is the start of the *i*'th list, and the
+              *i*'th entry is the index one past the *i*'th list's end,
+              even for the last list.
+        """
+        if n_objects >= int(np.iinfo(np.int32).max):
+            index_dtype = np.int64
+        else:
+            index_dtype = np.int32
+        index_dtype = np.dtype(index_dtype)
+
+        allocator = kwargs.pop("allocator", None)
+        if kwargs:
+            raise TypeError("invalid keyword arguments: '%s'" % ", ".join(kwargs))
+
+        result = {}
+        count_list_args = []
+
+        count_kernel = self.get_count_kernel(index_dtype)
+        write_kernel = self.get_write_kernel(index_dtype)
+        scan_kernel = self.get_scan_kernel(index_dtype)
+
+        # {{{ allocate memory for counts
+
+        for name, dtype in self.list_names_and_dtypes:
+            if name in self.count_sharing:
+                continue
+
+            counts = cl.array.empty(queue,
+                    (n_objects + 1), index_dtype, allocator=allocator)
+
+            # The scan will turn the "counts" array into the "starts" array
+            # in-place.
+            result[name] = BuiltList(starts=counts)
+            count_list_args.append(counts.data)
+
+        # }}}
+
+        count_kernel(queue, (n_objects,), None,
+                *(tuple(count_list_args) + args + (n_objects,)))
+
+        # {{{ run scans
+
+        for name, dtype in self.list_names_and_dtypes:
+            if name in self.count_sharing:
+                continue
+
+            info_record = result[name]
+            starts_ary = info_record.starts
+            scan_kernel(starts_ary)
+
+            # set first entry to zero
+            cl.enqueue_copy(queue, starts_ary.data, index_dtype.type(0))
+
+            # retrieve count
+            count = np.array(1, index_dtype)
+            cl.enqueue_copy(queue, count, starts_ary.data,
+                    device_offset=index_dtype.itemsize*n_objects)
+
+            info_record.count = int(count)
+
+        # }}}
+
+        # {{{ deal with count-sharing lists, allocate memory for lists
+
+        write_list_args = []
+        for name, dtype in self.list_names_and_dtypes:
+            if name in self.count_sharing:
+                sharing_from = self.count_sharing[name]
+
+                info_record = result[name] = BuiltList(
+                        count=result[sharing_from].count,
+                        starts=result[sharing_from].starts,
+                        )
+
+            else:
+                info_record = result[name]
+
+            info_record.list = cl.array.empty(queue,
+                    info_record.count, dtype, allocator=allocator)
+            write_list_args.append(info_record.list.data)
+
+            if name not in self.count_sharing:
+                write_list_args.append(info_record.starts.data)
+
+        # }}}
+
+
+        write_kernel(queue, (n_objects,), None,
+                *(tuple(write_list_args) + args + (n_objects,)))
+
+        return result
+
+    # }}}
+
+# }}}
+
 # vim: filetype=pyopencl:fdm=marker
diff --git a/pyopencl/compyte b/pyopencl/compyte
index 331ce0d1771183e4d9f2cc40e23e61659daf3ab3..66591495cf0abd4c670300c0f9f72bef99b48ec8 160000
--- a/pyopencl/compyte
+++ b/pyopencl/compyte
@@ -1 +1 @@
-Subproject commit 331ce0d1771183e4d9f2cc40e23e61659daf3ab3
+Subproject commit 66591495cf0abd4c670300c0f9f72bef99b48ec8
diff --git a/pyopencl/elementwise.py b/pyopencl/elementwise.py
index 75baa22eb7763a3c76fabdb3a59c0ff828f31dee..9f83609577fd129aa8c68f19363ab405185a275d 100644
--- a/pyopencl/elementwise.py
+++ b/pyopencl/elementwise.py
@@ -47,7 +47,7 @@ def get_elwise_program(context, arguments, operation,
         body = r"""//CL//
           if (step < 0)
           {
-            for (i = start + (work_item_start + lid)*step;
+            for (i = start + (work_group_start + lid)*step;
               i > stop; i += gsize*step)
             {
               %(operation)s;
@@ -55,7 +55,7 @@ def get_elwise_program(context, arguments, operation,
           }
           else
           {
-            for (i = start + (work_item_start + lid)*step;
+            for (i = start + (work_group_start + lid)*step;
               i < stop; i += gsize*step)
             {
               %(operation)s;
@@ -64,7 +64,7 @@ def get_elwise_program(context, arguments, operation,
           """
     else:
         body = """//CL//
-          for (i = work_item_start + lid; i < n; i += gsize)
+          for (i = work_group_start + lid; i < n; i += gsize)
           {
             %(operation)s;
           }
@@ -77,7 +77,7 @@ def get_elwise_program(context, arguments, operation,
         {
           int lid = get_local_id(0);
           int gsize = get_global_size(0);
-          int work_item_start = get_local_size(0)*get_group_id(0);
+          int work_group_start = get_local_size(0)*get_group_id(0);
           long i;
 
           %(loop_prep)s;
@@ -100,11 +100,9 @@ def get_elwise_program(context, arguments, operation,
 def get_elwise_kernel_and_types(context, arguments, operation,
         name="elwise_kernel", options=[], preamble="", use_range=False,
         **kwargs):
-    if isinstance(arguments, str):
-        from pyopencl.tools import parse_c_arg
-        parsed_args = [parse_c_arg(arg) for arg in arguments.split(",")]
-    else:
-        parsed_args = arguments
+
+    from pyopencl.tools import parse_arg_list
+    parsed_args = parse_arg_list(arguments)
 
     auto_preamble = kwargs.pop("auto_preamble", True)
 
@@ -143,15 +141,10 @@ def get_elwise_kernel_and_types(context, arguments, operation,
         name=name, options=options, preamble=preamble,
         use_range=use_range, **kwargs)
 
-    scalar_arg_dtypes = []
-    for arg in parsed_args:
-        if isinstance(arg, ScalarArg):
-            scalar_arg_dtypes.append(arg.dtype)
-        else:
-            scalar_arg_dtypes.append(None)
+    from pyopencl.tools import get_arg_list_scalar_arg_dtypes
 
     kernel = getattr(prg, name)
-    kernel.set_scalar_arg_dtypes(scalar_arg_dtypes)
+    kernel.set_scalar_arg_dtypes(get_arg_list_scalar_arg_dtypes(parsed_args))
 
     return kernel, parsed_args
 
diff --git a/pyopencl/reduction.py b/pyopencl/reduction.py
index 68562b1127838db6fcadf7a5f233b4c5d94543e5..c0f6b6d3ae7cf1e1daaa1a9bba6ebf778135c028 100644
--- a/pyopencl/reduction.py
+++ b/pyopencl/reduction.py
@@ -140,9 +140,9 @@ KERNEL = """//CL//
 
 
 
-def  get_reduction_source(
+def  _get_reduction_source(
          ctx, out_type, out_type_size,
-         neutral, reduce_expr, map_expr, arguments,
+         neutral, reduce_expr, map_expr, parsed_args,
          name="reduce_kernel", preamble="",
          device=None, max_group_size=None):
 
@@ -198,7 +198,7 @@ def  get_reduction_source(
     from pyopencl.characterize import has_double_support, has_amd_double_support
     src = str(Template(KERNEL).render(
         out_type=out_type,
-        arguments=arguments,
+        arguments=", ".join(arg.declarator() for arg in parsed_args),
         group_size=group_size,
         no_sync_size=no_sync_size,
         neutral=neutral,
@@ -236,33 +236,30 @@ def get_reduction_kernel(stage,
             map_expr = "in[i]"
 
     if stage == 2:
-        in_arg = "__global const %s *pyopencl_reduction_inp" % out_type
+        in_arg = "const %s *pyopencl_reduction_inp" % out_type
         if arguments:
             arguments = in_arg + ", " + arguments
         else:
             arguments = in_arg
 
-    inf = get_reduction_source(
+    from pyopencl.tools import parse_arg_list, get_arg_list_scalar_arg_dtypes
+    parsed_args = parse_arg_list(arguments)
+
+    inf = _get_reduction_source(
             ctx, out_type, out_type_size,
-            neutral, reduce_expr, map_expr, arguments,
+            neutral, reduce_expr, map_expr, parsed_args,
             name, preamble, device, max_group_size)
 
     inf.program = cl.Program(ctx, inf.source)
     inf.program.build(options)
     inf.kernel = getattr(inf.program, name)
 
-    from pyopencl.tools import parse_c_arg, ScalarArg
-
-    inf.arg_types = [parse_c_arg(arg) for arg in arguments.split(",")]
-    scalar_arg_dtypes = [None]
-    for arg_type in inf.arg_types:
-        if isinstance(arg_type, ScalarArg):
-            scalar_arg_dtypes.append(arg_type.dtype)
-        else:
-            scalar_arg_dtypes.append(None)
-    scalar_arg_dtypes.extend([np.uint32]*2)
+    inf.arg_types = parsed_args
 
-    inf.kernel.set_scalar_arg_dtypes(scalar_arg_dtypes)
+    inf.kernel.set_scalar_arg_dtypes(
+            [None]
+            + get_arg_list_scalar_arg_dtypes(inf.arg_types)
+            + [np.uint32]*2)
 
     return inf
 
@@ -390,7 +387,7 @@ def get_sum_kernel(ctx, dtype_out, dtype_in):
         dtype_out = dtype_in
 
     return ReductionKernel(ctx, dtype_out, "0", "a+b",
-            arguments="__global const %(tp)s *in"
+            arguments="const %(tp)s *in"
             % {"tp": dtype_to_ctype(dtype_in)})
 
 
@@ -450,8 +447,8 @@ def get_dot_kernel(ctx, dtype_out, dtype_a=None, dtype_b=None):
     return ReductionKernel(ctx, dtype_out, neutral="0",
             reduce_expr="a+b", map_expr=map_expr,
             arguments=
-            "__global const %(tp_a)s *a, "
-            "__global const %(tp_b)s *b" % {
+            "const %(tp_a)s *a, "
+            "const %(tp_b)s *b" % {
                 "tp_a": dtype_to_ctype(dtype_a),
                 "tp_b": dtype_to_ctype(dtype_b),
                 })
@@ -477,9 +474,9 @@ def get_subset_dot_kernel(ctx, dtype_out, dtype_subset, dtype_a=None, dtype_b=No
     return ReductionKernel(ctx, dtype_out, neutral="0",
             reduce_expr="a+b", map_expr="a[lookup_tbl[i]]*b[lookup_tbl[i]]",
             arguments=
-            "__global const %(tp_lut)s *lookup_tbl, "
-            "__global const %(tp_a)s *a, "
-            "__global const %(tp_b)s *b" % {
+            "const %(tp_lut)s *lookup_tbl, "
+            "const %(tp_a)s *a, "
+            "const %(tp_b)s *b" % {
             "tp_lut": dtype_to_ctype(dtype_subset),
             "tp_a": dtype_to_ctype(dtype_a),
             "tp_b": dtype_to_ctype(dtype_b),
@@ -520,7 +517,7 @@ def get_minmax_kernel(ctx, what, dtype):
     return ReductionKernel(ctx, dtype,
             neutral=get_minmax_neutral(what, dtype),
             reduce_expr="%(reduce_expr)s" % {"reduce_expr": reduce_expr},
-            arguments="__global const %(tp)s *in" % {
+            arguments="const %(tp)s *in" % {
                 "tp": dtype_to_ctype(dtype),
                 }, preamble="#define MY_INFINITY (1./0)")
 
@@ -541,8 +538,8 @@ def get_subset_minmax_kernel(ctx, what, dtype, dtype_subset):
             reduce_expr="%(reduce_expr)s" % {"reduce_expr": reduce_expr},
             map_expr="in[lookup_tbl[i]]",
             arguments=
-            "__global const %(tp_lut)s *lookup_tbl, "
-            "__global const %(tp)s *in"  % {
+            "const %(tp_lut)s *lookup_tbl, "
+            "const %(tp)s *in"  % {
             "tp": dtype_to_ctype(dtype),
             "tp_lut": dtype_to_ctype(dtype_subset),
             }, preamble="#define MY_INFINITY (1./0)")
diff --git a/pyopencl/scan.py b/pyopencl/scan.py
index 6d22f0c78591fa8f0b3c9a348c045178b1d64e0a..17d2de3f213edfb74f20fd001dc34496490fa05e 100644
--- a/pyopencl/scan.py
+++ b/pyopencl/scan.py
@@ -34,7 +34,8 @@ import numpy as np
 import pyopencl as cl
 import pyopencl.array
 from pyopencl.tools import (dtype_to_ctype, bitlog2,
-        KernelTemplateBase, _process_code_for_macro)
+        KernelTemplateBase, _process_code_for_macro,
+        get_arg_list_scalar_arg_dtypes)
 import pyopencl._mymako as mako
 from pyopencl._cluda import CLUDA_PREAMBLE
 
@@ -144,7 +145,7 @@ void ${name_prefix}_scan_intervals(
     )
 {
     // index K in first dimension used for carry storage
-    %if scan_dtype.itemsize > 4 and scan_dtype.itemsize % 8 == 0:
+    %if scan_dtype.itemsize > 4 and scan_dtype.itemsize % 8 == 0 and is_gpu:
         // Avoid bank conflicts by adding a single 32-bit value to the size of
         // the scan type.
         struct __attribute__ ((__packed__)) wrapped_scan_type
@@ -500,8 +501,9 @@ void ${name_prefix}_scan_intervals(
 
             // {{{ write data
 
-            // work hard with index math to achieve contiguous 32-bit stores
+            %if is_gpu:
             {
+                // work hard with index math to achieve contiguous 32-bit stores
                 __global int *dest = (__global int *) (partial_scan_buffer + unit_base);
 
                 <%
@@ -539,6 +541,21 @@ void ${name_prefix}_scan_intervals(
                     }
                 %endfor
             }
+            %else:
+            for (index_type k = 0; k < K; k++)
+            {
+                const index_type offset = k*WG_SIZE + LID_0;
+
+                %if is_tail:
+                if (unit_base + offset < interval_end)
+                %endif
+                {
+                    pycl_printf(("write: %d\n", unit_base + offset));
+                    partial_scan_buffer[unit_base + offset] =
+                        ldata[offset % K][offset / K].value;
+                }
+            }
+            %endif
 
             pycl_printf(("after write\n"));
 
@@ -714,31 +731,6 @@ def _round_down_to_power_of_2(val):
     assert result <= val
     return result
 
-def _parse_args(arguments):
-    if isinstance(arguments, str):
-        arguments = arguments.split(",")
-
-    def parse_single_arg(obj):
-        if isinstance(obj, str):
-            from pyopencl.tools import parse_c_arg
-            return parse_c_arg(obj)
-        else:
-            return obj
-
-    return [parse_single_arg(arg) for arg in arguments]
-
-def _get_scalar_arg_dtypes(arg_types):
-    result = []
-
-    from pyopencl.tools import ScalarArg
-    for arg_type in arg_types:
-        if isinstance(arg_type, ScalarArg):
-            result.append(arg_type.dtype)
-        else:
-            result.append(None)
-
-    return result
-
 _PREFIX_WORDS = set("""
         ldata partial_scan_buffer global scan_offset
         segment_start_in_k_group carry
@@ -756,7 +748,8 @@ _PREFIX_WORDS = set("""
         first_seg_start_in_interval g_segment_start_flags
         group_base seg_end my_val DEBUG ARGS
         ints_to_store ints_per_wg scan_types_per_int linear_index
-        linear_scan_data_idx dest src store_base
+        linear_scan_data_idx dest src store_base wrapped_scan_type
+        dummy
 
         LID_2 LID_1 LID_0
         LDIM_0 LDIM_1 LDIM_2
@@ -765,11 +758,11 @@ _PREFIX_WORDS = set("""
         """.split())
 
 _IGNORED_WORDS = set("""
-        4 32
+        4 8 32
 
         typedef for endfor if void while endwhile endfor endif else const printf
         None return bool n char true false ifdef pycl_printf str xrange assert
-        np iinfo max itemsize
+        np iinfo max itemsize __packed__ struct
 
         set iteritems len setdefault
 
@@ -802,7 +795,7 @@ _IGNORED_WORDS = set("""
         branch workgroup complicated granularity phase remainder than simpler
         We smaller look ifs lots self behind allow barriers whole loop
         after already Observe achieve contiguous stores hard go with by math
-        size won t way divisible bit so
+        size won t way divisible bit so Avoid declare adding single type
 
         is_tail is_first_level input_expr argument_signature preamble
         double_support neutral output_statement
@@ -813,6 +806,7 @@ _IGNORED_WORDS = set("""
         update_loop_lookbehind update_loop_plain update_loop
         use_lookbehind_update store_segment_start_flags
         update_loop first_seg scan_dtype dtype_to_ctype
+        is_gpu
 
         a b prev_item i last_item prev_value
         N NO_SEG_BOUNDARY across_seg_boundary
@@ -949,7 +943,7 @@ class _GenericScanKernelBase(object):
             from warnings import warn
             warn("not specifying 'neutral' is deprecated and will lead to "
                     "wrong results if your scan is not in-place or your "
-                    "'output_statement' otherwise does something non-trivial",
+                    "'output_statement' does something otherwise non-trivial",
                     stacklevel=2)
 
         if dtype.itemsize % 4 != 0:
@@ -964,7 +958,8 @@ class _GenericScanKernelBase(object):
         self.devices = devices
         self.options = options
 
-        self.parsed_args = _parse_args(arguments)
+        from pyopencl.tools import parse_arg_list
+        self.parsed_args = parse_arg_list(arguments)
         from pyopencl.tools import VectorArg
         self.first_array_idx = [
                 i for i, arg in enumerate(self.parsed_args)
@@ -1010,6 +1005,7 @@ class _GenericScanKernelBase(object):
             arg_ctypes=arg_ctypes,
             scan_expr=_process_code_for_macro(scan_expr),
             neutral=_process_code_for_macro(neutral),
+            is_gpu=self.devices[0].type == cl.device_type.GPU,
             double_support=all(
                 has_double_support(dev) for dev in devices),
             )
@@ -1033,30 +1029,30 @@ class GenericScanKernel(_GenericScanKernelBase):
 
         trip_count = 0
 
+        avail_local_mem = min(
+                dev.local_mem_size
+                for dev in self.devices)
+
         if self.devices[0].type == cl.device_type.CPU:
             # (about the widest vector a CPU can support, also taking
             # into account that CPUs don't hide latency by large work groups
             max_scan_wg_size = 16
-            wg_size_multiples = 16
+            wg_size_multiples = 4
         else:
             max_scan_wg_size = min(dev.max_work_group_size for dev in self.devices)
             wg_size_multiples = 64
 
-        avail_local_mem = min(
-                dev.local_mem_size
-                for dev in self.devices)
-
         # k_group_size should be a power of two because of in-kernel
         # division by that number.
 
         solutions = []
-        for k_exp in range(0, 7):
+        for k_exp in range(0, 9):
             for wg_size in range(wg_size_multiples, max_scan_wg_size+1,
                     wg_size_multiples):
 
                 k_group_size = 2**k_exp
-                if (self.get_local_mem_use(
-                    wg_size, k_group_size) + 256  <= avail_local_mem):
+                lmem_use = self.get_local_mem_use(wg_size, k_group_size)
+                if lmem_use + 256  <= avail_local_mem:
                     solutions.append((wg_size*k_group_size, k_group_size, wg_size))
 
         if self.devices[0].type == cl.device_type.GPU:
@@ -1164,7 +1160,7 @@ class GenericScanKernel(_GenericScanKernelBase):
                 final_update_prg,
                 self.name_prefix+"_final_update")
         update_scalar_arg_dtypes = (
-                _get_scalar_arg_dtypes(self.parsed_args)
+                get_arg_list_scalar_arg_dtypes(self.parsed_args)
                 + [self.index_dtype, self.index_dtype, None, None])
         if self.is_segmented:
             update_scalar_arg_dtypes.append(None) # g_first_segment_start_in_interval
@@ -1204,7 +1200,7 @@ class GenericScanKernel(_GenericScanKernelBase):
     def build_scan_kernel(self, max_wg_size, arguments, input_expr,
             is_segment_start_expr, input_fetch_exprs, is_first_level,
             store_segment_start_flags, k_group_size):
-        scalar_arg_dtypes = _get_scalar_arg_dtypes(arguments)
+        scalar_arg_dtypes = get_arg_list_scalar_arg_dtypes(arguments)
 
         # Empirically found on Nv hardware: no need to be bigger than this size
         wg_size = _round_down_to_power_of_2(
@@ -1418,7 +1414,7 @@ class GenericDebugScanKernel(_GenericScanKernelBase):
         self.kernel = getattr(
                 scan_prg, self.name_prefix+"_debug_scan")
         scalar_arg_dtypes = (
-                _get_scalar_arg_dtypes(self.parsed_args)
+                get_arg_list_scalar_arg_dtypes(self.parsed_args)
                 + [self.index_dtype])
         self.kernel.set_scalar_arg_dtypes(scalar_arg_dtypes)
 
diff --git a/pyopencl/tools.py b/pyopencl/tools.py
index 22b782e61ef8b48ba989957f172b3cedc08bd12e..6a8bf4e78be3308afca5029d5f832289cece1f4e 100644
--- a/pyopencl/tools.py
+++ b/pyopencl/tools.py
@@ -236,7 +236,10 @@ def pytest_generate_tests_for_pyopencl(metafunc):
 
 # {{{ C argument lists
 
-class Argument:
+class Argument(object):
+    pass
+
+class DtypedArgument(Argument):
     def __init__(self, dtype, name):
         self.dtype = np.dtype(dtype)
         self.name = name
@@ -247,27 +250,67 @@ class Argument:
                 self.name,
                 self.dtype)
 
-class VectorArg(Argument):
+class VectorArg(DtypedArgument):
     def declarator(self):
         return "__global %s *%s" % (dtype_to_ctype(self.dtype), self.name)
 
-class ScalarArg(Argument):
+class ScalarArg(DtypedArgument):
     def declarator(self):
         return "%s %s" % (dtype_to_ctype(self.dtype), self.name)
 
+class OtherArg(Argument):
+    def __init__(self, declarator, name):
+        self.decl = declarator
+        self.name = name
+
+    def declarator(self):
+        return self.decl
+
 
 
 
 
 def parse_c_arg(c_arg):
-    c_arg = (c_arg
-            .replace("__global", "")
-            .replace("__local", "")
-            .replace("__constant", ""))
+    for aspace in ["__local", "__constant"]:
+        if aspace in c_arg:
+            raise RuntimeError("cannot deal with local or constant "
+                    "OpenCL address spaces in C argument lists ")
+
+    c_arg = c_arg.replace("__global", "")
 
     from pyopencl.compyte.dtypes import parse_c_arg_backend
     return parse_c_arg_backend(c_arg, ScalarArg, VectorArg)
 
+def parse_arg_list(arguments):
+    """Parse a list of kernel arguments. *arguments* may be a comma-separate list
+    of C declarators in a string, a list of strings representing C declarators,
+    or :class:`Argument` objects.
+    """
+
+    if isinstance(arguments, str):
+        arguments = arguments.split(",")
+
+    def parse_single_arg(obj):
+        if isinstance(obj, str):
+            from pyopencl.tools import parse_c_arg
+            return parse_c_arg(obj)
+        else:
+            return obj
+
+    return [parse_single_arg(arg) for arg in arguments]
+
+def get_arg_list_scalar_arg_dtypes(arg_types):
+    result = []
+
+    from pyopencl.tools import ScalarArg
+    for arg_type in arg_types:
+        if isinstance(arg_type, ScalarArg):
+            result.append(arg_type.dtype)
+        else:
+            result.append(None)
+
+    return result
+
 # }}}
 
 
diff --git a/src/wrapper/wrap_cl_part_2.cpp b/src/wrapper/wrap_cl_part_2.cpp
index cc7942c8b6af64cdb5aaf9fe44419c46f3185ae1..c48ee7aef39c50bff349caa918fab64d90617d1c 100644
--- a/src/wrapper/wrap_cl_part_2.cpp
+++ b/src/wrapper/wrap_cl_part_2.cpp
@@ -151,13 +151,15 @@ void pyopencl_expose_part_2()
   py::def("enqueue_map_buffer", enqueue_map_buffer,
       (py::args("queue", "buf", "flags",
                 "offset",
-                "shape", "dtype", "order"),
+                "shape", "dtype"),
+       py::arg("order")="C",
        py::arg("wait_for")=py::object(),
        py::arg("is_blocking")=true));
   py::def("enqueue_map_image", enqueue_map_image,
       (py::args("queue", "img", "flags",
                 "origin", "region",
-                "shape", "dtype", "order"),
+                "shape", "dtype"),
+       py::arg("order")="C",
        py::arg("wait_for")=py::object(),
        py::arg("is_blocking")=true));
 
diff --git a/test/test_array.py b/test/test_array.py
index 57ded681ce86f16554a7ca826b4d68b26b51290a..fbc11cf95791b84c90989fe6b80314fd335f69a3 100644
--- a/test/test_array.py
+++ b/test/test_array.py
@@ -1108,6 +1108,28 @@ def test_sort(ctx_factory):
                 1e-6*n/dev_elapsed, 1e-6*n/numpy_elapsed, numpy_elapsed/dev_elapsed))
         assert (a_dev_sorted.get() == a_sorted).all()
 
+@pytools.test.mark_test.opencl
+def test_list_builder(ctx_factory):
+    context = ctx_factory()
+    queue = cl.CommandQueue(context)
+
+    from pyopencl.algorithm import ListOfListsBuilder
+    builder = ListOfListsBuilder(context, [("mylist", np.int32)], """//CL//
+            void generate(USER_ARG_DECL LIST_ARG_DECL index_type i)
+            {
+                int count = i % 4;
+                for (int j = 0; j < count; ++j)
+                {
+                    APPEND_mylist(count);
+                }
+            }
+            """, arg_decls=[])
+
+    result = builder(queue, 2000)
+
+    inf = result["mylist"]
+    assert inf.count == 3000
+    assert (inf.list.get()[-6:] == [1, 2, 2, 3, 3, 3]).all()
 
 # }}}
 
@@ -1171,7 +1193,7 @@ def test_nan_arithmetic(ctx_factory):
 def test_mem_pool_with_arrays(ctx_factory):
     context = ctx_factory()
     queue = cl.CommandQueue(context)
-    mem_pool = cl_tools.MemoryPool(cl_tools.CLAllocator(context))
+    mem_pool = cl_tools.MemoryPool(cl_tools.ImmediateAllocator(queue))
 
     a_dev = cl_array.arange(queue, 2000, dtype=np.float32, allocator=mem_pool)
     b_dev = cl_array.to_device(queue, np.arange(2000), allocator=mem_pool) + 4000