diff --git a/doc/source/array.rst b/doc/source/array.rst
index c6586eb0cc2c8f4e007df83381591fdeb7ef4242..f9b2cedbc370e6fffe2ebedad6cf4fea0abf8a95 100644
--- a/doc/source/array.rst
+++ b/doc/source/array.rst
@@ -421,7 +421,7 @@ Custom Reductions
 
 .. module:: pyopencl.reduction
 
-.. class:: ReductionKernel(ctx, dtype_out, neutral, reduce_expr, map_expr=None, arguments=None, name="reduce_kernel", options="", preamble="")
+.. class:: ReductionKernel(ctx, dtype_out, neutral, reduce_expr, map_expr=None, arguments=None, name="reduce_kernel", options=[], preamble="")
 
     Generate a kernel that takes a number of scalar or vector *arguments*
     (at least one vector argument), performs the *map_expr* on each entry of
@@ -437,15 +437,15 @@ Custom Reductions
     assumed.
 
     *dtype_out* specifies the :class:`numpy.dtype` in which the reduction is
-    performed and in which the result is returned. *neutral* is
-    specified as float or integer formatted as string. *reduce_expr* and
-    *map_expr* are specified as string formatted operations and *arguments*
-    is specified as a string formatted as a C argument list. *name* specifies
-    the name as which the kernel is compiled. *options* are passed
-    unmodified to :meth:`pyopencl.Program.build`. *preamble* is specified
-    as a string of code.
+    performed and in which the result is returned. *neutral* is specified as
+    float or integer formatted as string. *reduce_expr* and *map_expr* are
+    specified as string formatted operations and *arguments* is specified as a
+    string formatted as a C argument list. *name* specifies the name as which
+    the kernel is compiled. *options* are passed unmodified to
+    :meth:`pyopencl.Program.build`. *preamble* specifies a string of code that
+    is inserted before the actual kernels.
 
-    .. method __call__(*args, queue=None)
+    .. method:: __call__(*args, queue=None)
 
     .. versionadded: 2011.1
 
@@ -460,6 +460,46 @@ Here's a usage example::
 
     my_dot_prod = krnl(a, b).get()
 
+Parallel Scan / Prefix Sum
+--------------------------
+
+.. module:: pyopencl.scan
+
+.. class:: ExclusiveScanKernel(ctx, dtype, scan_expr, neutral, name_prefix="scan", options=[], preamble="", devices=None)
+
+    Generates a kernel that can compute a `prefix sum <https://secure.wikimedia.org/wikipedia/en/wiki/Prefix_sum>`_
+    using any associative operation given as *scan_expr*.
+    *scan_expr* uses the formal values "a" and "b" to indicate two operands of
+    an associative binary operation. *neutral* is the neutral element
+    of *scan_expr*, obeying *scan_expr(a, neutral) == a*.
+
+    *dtype* specifies the type of the arrays being operated on.
+    *name_prefix* is used for kernel names to ensure recognizability
+    in profiles and logs. *options* is a list of compiler options to use
+    when building. *preamble* specifies a string of code that is
+    inserted before the actual kernels. *devices* may be used to restrict
+    the set of devices on which the kernel is meant to run. (defaults
+    to all devices in the context *ctx*.)
+
+    .. method:: __call__(self, input_ary, output_ary=None, allocator=None, queue=None)
+
+.. class:: InclusiveScanKernel(ctx, dtype, scan_expr, neutral=None, name_prefix="scan", options=[], preamble="", devices=None)
+
+    Works like :class:`ExclusiveScanKernel`. Unlike the exclusive case,
+    *neutral* is not required.
+
+Here's a usage example::
+
+    knl = InclusiveScanKernel(context, np.int32, "a+b")
+
+    n = 2**20-2**18+5
+    host_data = np.random.randint(0, 10, n).astype(np.int32)
+    dev_data = cl_array.to_device(queue, host_data)
+
+    knl(dev_data)
+    assert (dev_data.get() == np.cumsum(host_data, axis=0)).all()
+
+
 Fast Fourier Transforms
 -----------------------
 
diff --git a/doc/source/misc.rst b/doc/source/misc.rst
index e18d293e9860ddbc19e61f33a834807d7428b256..4cf441e74e2d748e55bd63c59fb700e82e3f8af3 100644
--- a/doc/source/misc.rst
+++ b/doc/source/misc.rst
@@ -86,6 +86,7 @@ Version 2011.1
   :func:`pyopencl.enqueue_map_image`.
 * Add :mod:`pyopencl.reduction`.
 * Add :ref:`reductions`.
+* Add :mod:`pyopencl.scan`.
 * Add :meth:`pyopencl.MemoryObject.get_host_array`.
 * Deprecate context arguments of 
   :func:`pyopencl.array.to_device`,
@@ -104,7 +105,7 @@ Version 0.92
   extension, leading to working GL interoperability.
 * Add :meth:`pyopencl.Kernel.set_args`.
 * The call signature of :meth:`pyopencl.Kernel.__call__` changed to
-  emphasize the importance of *loccal_size*.
+  emphasize the importance of *local_size*.
 * Add :meth:`pyopencl.Kernel.set_scalar_arg_dtypes`.
 * Add support for the
   `cl_nv_device_attribute_query <http://www.khronos.org/registry/cl/extensions/khr/cl_nv_device_attribute_query.txt>`_
@@ -221,7 +222,7 @@ Licensing
 
 PyOpenCL is licensed to you under the MIT/X Consortium license:
 
-Copyright (c) 2009 Andreas Klöckner and Contributors.
+Copyright (c) 2009-11 Andreas Klöckner and Contributors.
 
 Permission is hereby granted, free of charge, to any person
 obtaining a copy of this software and associated documentation
@@ -244,6 +245,30 @@ WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
 OTHER DEALINGS IN THE SOFTWARE.
 
+PyOpenCL includes derivatives of parts of the `Thrust
+<https://code.google.com/p/thrust/>`_ computing package (in particular the scan
+implementation). These parts are licensed as follows:
+
+    Copyright 2008-2011 NVIDIA Corporation
+
+    Licensed under the Apache License, Version 2.0 (the "License");
+    you may not use this file except in compliance with the License.
+    You may obtain a copy of the License at
+
+        <http://www.apache.org/licenses/LICENSE-2.0>
+
+    Unless required by applicable law or agreed to in writing, software
+    distributed under the License is distributed on an "AS IS" BASIS,
+    WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+    See the License for the specific language governing permissions and
+    limitations under the License.
+
+.. note::
+
+    If you use Apache-licensed parts, be aware that these may be incompatible
+    with software licensed exclusively under GPL2.  (Most software is licensed
+    as GPL2 or later, in which case this is not an issue.)
+
 Frequently Asked Questions
 ==========================
 
diff --git a/doc/source/runtime.rst b/doc/source/runtime.rst
index 357740222ebcb1563d2265eecf1b0613dbfc7220..aab26ef2108c2cb6ce2060143a67a4a91acae1de 100644
--- a/doc/source/runtime.rst
+++ b/doc/source/runtime.rst
@@ -571,11 +571,15 @@ Programs and Kernels
 
         See :class:`program_build_info` for values of *param*.
 
-    .. method:: build(options="", devices=None)
+    .. method:: build(options=[], devices=None)
 
         *options* is a string of compiler flags.
         Returns *self*.
 
+        .. versionchanged:: 2011.1
+
+            *options* may now also be a :class:`list` of :class:`str`.
+
     .. attribute:: kernel_name
 
         :class:`Kernel` objects can be produced from a built
diff --git a/pyopencl/__init__.py b/pyopencl/__init__.py
index 83f9d3a79fcdd25d3ac6d901ae5ab9b5729c3a4e..feab73477196958a5e7a9f63d3c19590a693f4a0 100644
--- a/pyopencl/__init__.py
+++ b/pyopencl/__init__.py
@@ -136,7 +136,10 @@ def _add_functionality():
         else:
             return self.get_info(pi_attr)
 
-    def program_build(self, options="", devices=None):
+    def program_build(self, options=[], devices=None):
+        if isinstance(options, list):
+            options = " ".join(options)
+
         try:
             self._build(options=options, devices=devices)
         except Exception, e:
diff --git a/pyopencl/_cluda.py b/pyopencl/_cluda.py
new file mode 100644
index 0000000000000000000000000000000000000000..957a8348bbf5b1e27f0e7ebd859a00c06e054955
--- /dev/null
+++ b/pyopencl/_cluda.py
@@ -0,0 +1,26 @@
+CLUDA_PREAMBLE = """
+#define local_barrier() barrier(CLK_LOCAL_MEM_FENCE);
+
+#define WITHIN_KERNEL /* empty */
+#define KERNEL __kernel
+#define GLOBAL_MEM __global
+#define LOCAL_MEM __local
+#define LOCAL_MEM_ARG __local
+#define REQD_WG_SIZE(X,Y,Z) __attribute__((reqd_work_group_size(X, Y, Z)))
+
+#define LID_0 get_local_id(0)
+#define LID_1 get_local_id(1)
+#define LID_2 get_local_id(2)
+
+#define GID_0 get_group_id(0)
+#define GID_1 get_group_id(1)
+#define GID_2 get_group_id(2)
+
+% if double_support:
+    #pragma OPENCL EXTENSION cl_khr_fp64: enable
+% endif
+"""
+
+
+
+
diff --git a/pyopencl/_mymako.py b/pyopencl/_mymako.py
new file mode 100644
index 0000000000000000000000000000000000000000..a4e4b46505c013d7d28058afeace5834e56f53d5
--- /dev/null
+++ b/pyopencl/_mymako.py
@@ -0,0 +1,14 @@
+try:
+    import mako.template
+except ImportError:
+    raise ImportError(
+            "Some of PyOpenCL's facilities require the Mako templating engine.\n"
+            "You or a piece of software you have used has tried to call such a\n"
+            "part of PyOpenCL, but there was a problem importing Mako.\n\n"
+            "You may install mako now by typing one of:\n"
+            "- easy_install Mako\n"
+            "- pip install Mako\n"
+            "- aptitude install python-mako\n"
+            "\nor whatever else is appropriate for your system.")
+
+from mako import *
diff --git a/pyopencl/characterize.py b/pyopencl/characterize.py
new file mode 100644
index 0000000000000000000000000000000000000000..b323047132b5754a718bd783d127bca8d0b41192
--- /dev/null
+++ b/pyopencl/characterize.py
@@ -0,0 +1,5 @@
+def has_double_support(dev):
+    for ext in dev.extensions.split(" "):
+        if ext == "cl_khr_fp64":
+            return True
+    return False
diff --git a/pyopencl/elementwise.py b/pyopencl/elementwise.py
index f3994ed621d2fe282269a1ce16c026943f110097..6bcdc004e8ad7d4e3368d02d58228599d94b1ced 100644
--- a/pyopencl/elementwise.py
+++ b/pyopencl/elementwise.py
@@ -70,7 +70,7 @@ def get_elwise_program(context, arguments, operation,
             "after_loop": after_loop,
             })
 
-    return Program(context, source).build(options=" ".join(options))
+    return Program(context, source).build(options)
 
 
 
diff --git a/pyopencl/reduction.py b/pyopencl/reduction.py
index 9348253d7fefe1b657bf69c8ef9872ce075c555d..ebde742dcc9bd1f72376a0709aaabf30e6910e62 100644
--- a/pyopencl/reduction.py
+++ b/pyopencl/reduction.py
@@ -39,20 +39,7 @@ from pyopencl.tools import (
         dtype_to_ctype)
 from pytools import memoize_method
 import numpy as np
-
-try:
-    import mako
-except ImportError:
-    raise ImportError(
-            "PyOpenCL's reduction facility requires the Mako templating engine.\n"
-            "You or a piece of software you have used has tried to call PyOpenCL's\n"
-            "reduction code, but there was a problem importing Mako.\n\n"
-            "You may install mako now by typing one of:\n"
-            "- easy_install Mako\n"
-            "- pip install Mako\n"
-            "- aptitude install python-mako\n"
-            "\nor whatever else is appropriate for your system.")
-
+import pyopencl._mymako as mako
 
 
 
@@ -193,7 +180,7 @@ def  get_reduction_source(
 
     from mako.template import Template
     from pytools import all
-    from pyopencl.tools import has_double_support
+    from pyopencl.characterize import has_double_support
     src = str(Template(KERNEL).render(
         out_type=out_type,
         arguments=arguments,
@@ -224,7 +211,7 @@ def get_reduction_kernel(
          ctx, out_type, out_type_size,
          neutral, reduce_expr, map_expr=None, arguments=None,
          name="reduce_kernel", preamble="",
-         device=None, options="", max_group_size=None):
+         device=None, options=[], max_group_size=None):
     if map_expr is None:
         map_expr = "in[i]"
 
@@ -237,7 +224,7 @@ def get_reduction_kernel(
             name, preamble, device, max_group_size)
 
     inf.program = cl.Program(ctx, inf.source)
-    inf.program.build()
+    inf.program.build(options)
     inf.kernel = getattr(inf.program, name)
 
     from pyopencl.tools import parse_c_arg, ScalarArg
@@ -261,7 +248,7 @@ def get_reduction_kernel(
 class ReductionKernel:
     def __init__(self, ctx, dtype_out,
             neutral, reduce_expr, map_expr=None, arguments=None,
-            name="reduce_kernel", options="", preamble=""):
+            name="reduce_kernel", options=[], preamble=""):
 
         dtype_out = self.dtype_out = np.dtype(dtype_out)
 
diff --git a/pyopencl/scan.py b/pyopencl/scan.py
new file mode 100644
index 0000000000000000000000000000000000000000..02679a3122f9662b67175a48db948b23329aa2a1
--- /dev/null
+++ b/pyopencl/scan.py
@@ -0,0 +1,616 @@
+# WARNING!
+# If you update this file, make sure to also update the sister copy in
+# PyCUDA or PyOpenCL--both files should always be exactly identical.
+
+"""Scan primitive."""
+
+from __future__ import division
+
+__copyright__ = """
+Copyright 2011 Andreas Kloeckner
+Copyright 2008-2011 NVIDIA Corporation
+"""
+
+__license__ = """
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+
+Derived from thrust/detail/backend/cuda/detail/fast_scan.h
+within the Thrust project, https://code.google.com/p/thrust/
+"""
+
+
+
+
+import numpy as np
+
+_CL_MODE = "pyopencl" in __name__
+
+if _CL_MODE:
+    import pyopencl as cl
+    import pyopencl.array as cl_array
+    from pyopencl.tools import dtype_to_ctype
+    import pyopencl._mymako as mako
+    from pyopencl._cluda import CLUDA_PREAMBLE
+else:
+    import pycuda.driver as driver
+    import pycuda.gpuarray as gpuarray
+    from pycuda.compiler import SourceModule
+    from pycuda.tools import dtype_to_ctype
+    import pycuda._mymako as mako
+    from pycuda._cluda import CLUDA_PREAMBLE
+
+
+
+
+SHARED_PREAMBLE = CLUDA_PREAMBLE + """
+#define WG_SIZE ${wg_size}
+#define SCAN_EXPR(a, b) ${scan_expr}
+
+${preamble}
+
+typedef ${scan_type} scan_type;
+"""
+
+
+
+
+SCAN_INTERVALS_SOURCE = mako.template.Template(SHARED_PREAMBLE + """
+#define K ${wg_seq_batches}
+
+<%def name="make_group_scan(name, with_bounds_check)">
+    WITHIN_KERNEL
+    void ${name}(LOCAL_MEM_ARG scan_type *array
+    % if with_bounds_check:
+      , const unsigned n
+    % endif
+    )
+    {
+        scan_type val = array[LID_0];
+
+        <% offset = 1 %>
+
+        % while offset <= wg_size:
+            if (LID_0 >= ${offset}
+            % if with_bounds_check:
+              && LID_0 < n
+            % endif
+            )
+            {
+                scan_type tmp = array[LID_0 - ${offset}];
+                val = SCAN_EXPR(tmp, val);
+            }
+
+            local_barrier();
+            array[LID_0] = val;
+            local_barrier();
+
+            <% offset *= 2 %>
+        % endwhile
+    }
+</%def>
+
+${make_group_scan("scan_group", False)}
+${make_group_scan("scan_group_n", True)}
+
+KERNEL
+REQD_WG_SIZE(WG_SIZE, 1, 1)
+void ${name_prefix}_scan_intervals(
+    GLOBAL_MEM scan_type *input,
+    const unsigned int N,
+    const unsigned int interval_size,
+    GLOBAL_MEM scan_type *output,
+    GLOBAL_MEM scan_type *group_results)
+{
+    LOCAL_MEM scan_type sdata[K + 1][WG_SIZE + 1];  // padded to avoid bank conflicts
+
+    local_barrier(); // TODO figure out why this seems necessary now
+
+    const unsigned int interval_begin = interval_size * GID_0;
+    const unsigned int interval_end   = min(interval_begin + interval_size, N);
+
+    const unsigned int unit_size  = K * WG_SIZE;
+
+    unsigned int base = interval_begin;
+
+    // process full units
+    for(; base + unit_size <= interval_end; base += unit_size)
+    {
+        // read data
+        for(unsigned int k = 0; k < K; k++)
+        {
+            const unsigned int offset = k*WG_SIZE + LID_0;
+
+            GLOBAL_MEM scan_type *temp = input + (base + offset);
+            sdata[offset % K][offset / K] = *temp;
+        }
+
+        // carry in
+        if (LID_0 == 0 && base != interval_begin)
+            sdata[0][0] = SCAN_EXPR(sdata[K][WG_SIZE - 1], sdata[0][0]);
+
+        local_barrier();
+
+        // scan local values
+        scan_type sum = sdata[0][LID_0];
+
+        for(unsigned int k = 1; k < K; k++)
+        {
+            scan_type tmp = sdata[k][LID_0];
+            sum = SCAN_EXPR(sum, tmp);
+            sdata[k][LID_0] = sum;
+        }
+
+        // second level scan
+        sdata[K][LID_0] = sum;
+        local_barrier();
+        scan_group(&sdata[K][0]);
+
+        // update local values
+        if (LID_0 > 0)
+        {
+            sum = sdata[K][LID_0 - 1];
+
+            for(unsigned int k = 0; k < K; k++)
+            {
+                scan_type tmp = sdata[k][LID_0];
+                sdata[k][LID_0] = SCAN_EXPR(sum, tmp);
+            }
+        }
+
+        local_barrier();
+
+        // write data
+        for(unsigned int k = 0; k < K; k++)
+        {
+            const unsigned int offset = k*WG_SIZE + LID_0;
+
+            GLOBAL_MEM scan_type *temp = output + (base + offset);
+            *temp = sdata[offset % K][offset / K];
+        }
+
+        local_barrier();
+    }
+
+    // process partially full unit at end of input (if necessary)
+    if (base < interval_end)
+    {
+        // read data
+        for(unsigned int k = 0; k < K; k++)
+        {
+            const unsigned int offset = k*WG_SIZE + LID_0;
+
+            if (base + offset < interval_end)
+            {
+                GLOBAL_MEM scan_type *temp = input + (base + offset);
+                sdata[offset % K][offset / K] = *temp;
+            }
+        }
+
+        // carry in
+        if (LID_0 == 0 && base != interval_begin)
+            sdata[0][0] = SCAN_EXPR(sdata[K][WG_SIZE - 1], sdata[0][0]);
+
+        local_barrier();
+
+        // scan local values
+        scan_type sum = sdata[0][LID_0];
+
+        const unsigned int offset_end = interval_end - base;
+
+        for(unsigned int k = 1; k < K; k++)
+        {
+            if (K * LID_0 + k < offset_end)
+            {
+                scan_type tmp = sdata[k][LID_0];
+                sum = SCAN_EXPR(sum, tmp);
+                sdata[k][LID_0] = sum;
+            }
+        }
+
+        // second level scan
+        sdata[K][LID_0] = sum;
+        local_barrier();
+        scan_group_n(&sdata[K][0], offset_end / K);
+
+        // update local values
+        if (LID_0 > 0)
+        {
+            sum = sdata[K][LID_0 - 1];
+
+            for(unsigned int k = 0; k < K; k++)
+            {
+                if (K * LID_0 + k < offset_end)
+                {
+                    scan_type tmp = sdata[k][LID_0];
+                    sdata[k][LID_0] = SCAN_EXPR(sum, tmp);
+                }
+            }
+        }
+
+        local_barrier();
+
+        // write data
+        for(unsigned int k = 0; k < K; k++)
+        {
+            const unsigned int offset = k*WG_SIZE + LID_0;
+
+            if (base + offset < interval_end)
+            {
+                GLOBAL_MEM scan_type *temp = output + (base + offset);
+                *temp = sdata[offset % K][offset / K];
+            }
+        }
+
+    }
+
+    local_barrier();
+
+    // write interval sum
+    if (LID_0 == 0)
+    {
+        GLOBAL_MEM scan_type *temp = output + (interval_end - 1);
+        group_results[GID_0] = *temp;
+    }
+}
+""")
+
+
+
+
+INCLUSIVE_UPDATE_SOURCE = mako.template.Template(SHARED_PREAMBLE + """
+KERNEL
+REQD_WG_SIZE(WG_SIZE, 1, 1)
+void ${name_prefix}_final_update(
+    GLOBAL_MEM scan_type *output,
+    const unsigned int N,
+    const unsigned int interval_size,
+    GLOBAL_MEM scan_type *group_results)
+{
+    const unsigned int interval_begin = interval_size * GID_0;
+    const unsigned int interval_end   = min(interval_begin + interval_size, N);
+
+    if (GID_0 == 0)
+        return;
+
+    // value to add to this segment
+    scan_type sum = group_results[GID_0 - 1];
+
+    // advance result pointer
+    output += interval_begin + LID_0;
+
+    for(unsigned int base = interval_begin; base < interval_end; base += WG_SIZE, output += WG_SIZE)
+    {
+        const unsigned int i = base + LID_0;
+
+        if(i < interval_end)
+        {
+            scan_type tmp = *output;
+            *output = SCAN_EXPR(sum, tmp);
+        }
+
+        local_barrier();
+    }
+}
+""")
+
+
+
+
+EXCLUSIVE_UPDATE_SOURCE = mako.template.Template(SHARED_PREAMBLE + """
+KERNEL
+REQD_WG_SIZE(WG_SIZE, 1, 1)
+void ${name_prefix}_final_update(
+    GLOBAL_MEM scan_type *output,
+    const unsigned int N,
+    const unsigned int interval_size,
+    GLOBAL_MEM scan_type *group_results)
+{
+    LOCAL_MEM scan_type sdata[WG_SIZE];
+
+    local_barrier(); // TODO figure out why this seems necessary now
+
+    const unsigned int interval_begin = interval_size * GID_0;
+    const unsigned int interval_end   = min(interval_begin + interval_size, N);
+
+    // value to add to this segment
+    scan_type carry = ${neutral};
+    if(GID_0 != 0)
+    {
+        scan_type tmp = group_results[GID_0 - 1];
+        carry = SCAN_EXPR(carry, tmp);
+    }
+
+    scan_type val   = carry;
+
+    // advance result pointer
+    output += interval_begin + LID_0;
+
+    for(unsigned int base = interval_begin; base < interval_end; base += WG_SIZE, output += WG_SIZE)
+    {
+        const unsigned int i = base + LID_0;
+
+        if(i < interval_end)
+        {
+            scan_type tmp = *output;
+            sdata[LID_0] = SCAN_EXPR(carry, tmp);
+        }
+
+        local_barrier();
+
+        if (LID_0 != 0)
+            val = sdata[LID_0 - 1];
+
+        if (i < interval_end)
+            *output = val;
+
+        if(LID_0 == 0)
+            val = sdata[WG_SIZE - 1];
+
+        local_barrier();
+    }
+}
+""")
+
+
+
+
+def _div_ceil(nr, dr):
+    return (nr + dr -1) // dr
+
+
+def _uniform_interval_splitting(n, granularity, max_intervals):
+    grains  = _div_ceil(n, granularity)
+
+    # one grain per interval
+    if grains <= max_intervals:
+        return granularity, grains
+
+    # ensures that:
+    #     num_intervals * interval_size is >= n
+    #   and
+    #     (num_intervals - 1) * interval_size is < n
+
+    grains_per_interval = _div_ceil(grains, max_intervals)
+    interval_size = grains_per_interval * granularity
+    num_intervals = _div_ceil(n, interval_size)
+
+    return interval_size, num_intervals
+
+
+
+
+if _CL_MODE:
+    class _ScanKernelBase(object):
+        def __init__(self, ctx, dtype,
+                scan_expr, neutral=None,
+                name_prefix="scan", options=[], preamble="", devices=None):
+
+            if isinstance(self, ExclusiveScanKernel) and neutral is None:
+                raise ValueError("neutral element is required for exclusive scan")
+
+            self.context = ctx
+            dtype = self.dtype = np.dtype(dtype)
+            self.neutral = neutral
+
+            if devices is None:
+                devices = ctx.devices
+            self.devices = devices
+
+            max_wg_size = min(dev.max_work_group_size for dev in self.devices)
+
+            # Thrust says these are good for GT200
+            self.scan_wg_size = min(max_wg_size, 128)
+            self.update_wg_size = min(max_wg_size, 256)
+
+            if self.scan_wg_size < 16:
+                # Hello, Apple CPU. Nice to see you.
+                self.scan_wg_seq_batches = 128 # FIXME: guesswork
+            else:
+                self.scan_wg_seq_batches = 6
+
+            from pytools import all
+            from pyopencl.characterize import has_double_support
+
+            kw_values = dict(
+                preamble=preamble,
+                name_prefix=name_prefix,
+                scan_type=dtype_to_ctype(dtype),
+                scan_expr=scan_expr,
+                neutral=neutral,
+                double_support=all(
+                    has_double_support(dev) for dev in devices)
+                )
+
+            scan_intervals_src = str(SCAN_INTERVALS_SOURCE.render(
+                wg_size=self.scan_wg_size,
+                wg_seq_batches=self.scan_wg_seq_batches,
+                **kw_values))
+            scan_intervals_prg = cl.Program(ctx, scan_intervals_src).build(options)
+            self.scan_intervals_knl = getattr(
+                    scan_intervals_prg,
+                    name_prefix+"_scan_intervals")
+            self.scan_intervals_knl.set_scalar_arg_dtypes(
+                    (None, np.uint32, np.uint32, None, None))
+
+            final_update_src = str(self.final_update_tp.render(
+                wg_size=self.update_wg_size,
+                **kw_values))
+
+            final_update_prg = cl.Program(self.context, final_update_src).build(options)
+            self.final_update_knl = getattr(
+                    final_update_prg,
+                    name_prefix+"_final_update")
+            self.final_update_knl.set_scalar_arg_dtypes(
+                    (None, np.uint32, np.uint32, None))
+
+        def __call__(self, input_ary, output_ary=None, allocator=None,
+                queue=None):
+            allocator = allocator or input_ary.allocator
+            queue = queue or input_ary.queue or output_ary.queue
+
+            if output_ary is None:
+                output_ary = input_ary
+
+            if isinstance(output_ary, (str, unicode)) and output_ary == "new":
+                output_ary = cl_array.empty_like(input_ary, allocator=allocator)
+
+            if input_ary.shape != output_ary.shape:
+                raise ValueError("input and output must have the same shape")
+
+            n, = input_ary.shape
+
+            if not n:
+                return output_ary
+
+            unit_size  = self.scan_wg_size * self.scan_wg_seq_batches
+            max_groups = 3*max(dev.max_compute_units for dev in self.devices)
+
+            interval_size, num_groups = _uniform_interval_splitting(
+                    n, unit_size, max_groups);
+
+            block_results = allocator(self.dtype.itemsize*num_groups)
+            dummy_results = allocator(self.dtype.itemsize)
+
+            # first level scan of interval (one interval per block)
+            self.scan_intervals_knl(
+                    queue, (num_groups*self.scan_wg_size,), (self.scan_wg_size,),
+                    input_ary.data,
+                    n, interval_size,
+                    output_ary.data,
+                    block_results)
+
+            # second level inclusive scan of per-block results
+            self.scan_intervals_knl(
+                    queue, (self.scan_wg_size,), (self.scan_wg_size,),
+                    block_results,
+                    num_groups, interval_size,
+                    block_results,
+                    dummy_results)
+
+            # update intervals with result of second level scan
+            self.final_update_knl(
+                    queue, (num_groups*self.update_wg_size,), (self.update_wg_size,),
+                    output_ary.data,
+                    n, interval_size,
+                    block_results)
+
+            return output_ary
+
+
+
+
+else:
+    class _ScanKernelBase(object):
+        def __init__(self, dtype,
+                scan_expr, neutral=None,
+                name_prefix="scan", options=[], preamble="", devices=None):
+
+            if isinstance(self, ExclusiveScanKernel) and neutral is None:
+                raise ValueError("neutral element is required for exclusive scan")
+
+            dtype = self.dtype = np.dtype(dtype)
+            self.neutral = neutral
+
+            # Thrust says these are good for GT200
+            self.scan_wg_size = 128
+            self.update_wg_size = 256
+            self.scan_wg_seq_batches = 6
+
+            kw_values = dict(
+                preamble=preamble,
+                name_prefix=name_prefix,
+                scan_type=dtype_to_ctype(dtype),
+                scan_expr=scan_expr,
+                neutral=neutral)
+
+            scan_intervals_src = str(SCAN_INTERVALS_SOURCE.render(
+                wg_size=self.scan_wg_size,
+                wg_seq_batches=self.scan_wg_seq_batches,
+                **kw_values))
+            scan_intervals_prg = SourceModule(
+                    scan_intervals_src, options=options, no_extern_c=True)
+            self.scan_intervals_knl = scan_intervals_prg.get_function(
+                    name_prefix+"_scan_intervals")
+            self.scan_intervals_knl.prepare("PIIPP", (self.scan_wg_size, 1, 1))
+
+            final_update_src = str(self.final_update_tp.render(
+                wg_size=self.update_wg_size,
+                **kw_values))
+
+            final_update_prg = SourceModule(
+                    final_update_src, options=options, no_extern_c=True)
+            self.final_update_knl = final_update_prg.get_function(
+                    name_prefix+"_final_update")
+            self.final_update_knl.prepare("PIIP", (self.update_wg_size, 1, 1))
+
+        def __call__(self, input_ary, output_ary=None, allocator=None,
+                stream=None):
+            allocator = allocator or input_ary.allocator
+
+            if output_ary is None:
+                output_ary = input_ary
+
+            if isinstance(output_ary, (str, unicode)) and output_ary == "new":
+                output_ary = cl_array.empty_like(input_ary, allocator=allocator)
+
+            if input_ary.shape != output_ary.shape:
+                raise ValueError("input and output must have the same shape")
+
+            n, = input_ary.shape
+
+            if not n:
+                return output_ary
+
+            unit_size  = self.scan_wg_size * self.scan_wg_seq_batches
+            dev = driver.Context.get_device()
+            max_groups = 3*dev.get_attribute(
+                    driver.device_attribute.MULTIPROCESSOR_COUNT)
+
+            interval_size, num_groups = _uniform_interval_splitting(
+                    n, unit_size, max_groups);
+
+            block_results = allocator(self.dtype.itemsize*num_groups)
+            dummy_results = allocator(self.dtype.itemsize)
+
+            # first level scan of interval (one interval per block)
+            self.scan_intervals_knl.prepared_async_call(
+                    (num_groups, 1), stream,
+                    input_ary.gpudata,
+                    n, interval_size,
+                    output_ary.gpudata,
+                    block_results)
+
+            # second level inclusive scan of per-block results
+            self.scan_intervals_knl.prepared_async_call(
+                    (1, 1), stream,
+                    block_results,
+                    num_groups, interval_size,
+                    block_results,
+                    dummy_results)
+
+            # update intervals with result of second level scan
+            self.final_update_knl.prepared_async_call(
+                    (num_groups, 1,), stream,
+                    output_ary.gpudata,
+                    n, interval_size,
+                    block_results)
+
+            return output_ary
+
+
+
+class InclusiveScanKernel(_ScanKernelBase):
+    final_update_tp = INCLUSIVE_UPDATE_SOURCE
+
+class ExclusiveScanKernel(_ScanKernelBase):
+    final_update_tp = EXCLUSIVE_UPDATE_SOURCE
diff --git a/pyopencl/tools.py b/pyopencl/tools.py
index 41b355e950cf5a22804a75650db0ae0368c89b4e..7b59d4fcd7fb3468bd2b6f96e9212530a520f274 100644
--- a/pyopencl/tools.py
+++ b/pyopencl/tools.py
@@ -249,15 +249,6 @@ def parse_c_arg(c_arg):
 
 
 
-def has_double_support(dev):
-    for ext in dev.extensions.split(" "):
-        if ext == "cl_khr_fp64":
-            return True
-    return False
-
-
-
-
 def get_gl_sharing_context_properties():
     ctx_props = cl.context_properties
 
diff --git a/test/test_array.py b/test/test_array.py
index adabdcfc8780e655b83faafa14b233b409e673c4..636a53b97bf7ce55259bad7bdaf7f250085ded1d 100644
--- a/test/test_array.py
+++ b/test/test_array.py
@@ -19,7 +19,7 @@ if have_cl():
     import pyopencl as cl
     from pyopencl.tools import pytest_generate_tests_for_pyopencl \
             as pytest_generate_tests
-    from pyopencl.tools import has_double_support
+    from pyopencl.characterize import has_double_support
 
 
 
@@ -556,6 +556,40 @@ def test_astype(ctx_getter):
 
 
 
+@pytools.test.mark_test.opencl
+def test_scan(ctx_getter):
+    context = ctx_getter()
+    queue = cl.CommandQueue(context)
+
+    from pyopencl.scan import InclusiveScanKernel, ExclusiveScanKernel
+
+    dtype = np.int32
+    for cls in [InclusiveScanKernel, ExclusiveScanKernel]:
+        knl = cls(context, dtype, "a+b", "0")
+
+        for n in [
+                10, 2**10-5, 2**10, 
+                2**20-2**18, 
+                2**20-2**18+5, 
+                2**10+5,
+                2**20+5,
+                2**20, 2**24
+                ]:
+            host_data = np.random.randint(0, 10, n).astype(dtype)
+            dev_data = cl_array.to_device(queue, host_data)
+
+            knl(dev_data)
+
+            desired_result = np.cumsum(host_data, axis=0)
+            if cls is ExclusiveScanKernel:
+                desired_result -= host_data
+
+            assert (dev_data.get() == desired_result).all()
+
+
+
+
+
 if __name__ == "__main__":
     # make sure that import failures get reported, instead of skipping the tests.
     import pyopencl as cl
diff --git a/test/test_clmath.py b/test/test_clmath.py
index b8f648de08e83a5711ed372f7ba6250a92176ec9..8e4a60f5b33a83e7156e9ecc7fb5f2e53edfcd82 100644
--- a/test/test_clmath.py
+++ b/test/test_clmath.py
@@ -16,6 +16,7 @@ if have_cl():
     import pyopencl.clmath as clmath
     from pyopencl.tools import pytest_generate_tests_for_pyopencl \
             as pytest_generate_tests
+    from pyopencl.characterize import has_double_support
 
 
 
@@ -26,15 +27,6 @@ sizes = [10, 128, 1<<10, 1<<11, 1<<13]
 
 
 
-def has_double_support(dev):
-    for ext in dev.extensions.split(" "):
-        if ext == "cl_khr_fp64":
-            return True
-    return False
-
-
-
-
 numpy_func_names = {
         "asin": "arcsin",
         "acos": "arccos",