diff --git a/pyopencl/algorithm.py b/pyopencl/algorithm.py
index e3813b4d169903fe5d02d7443ec7d85d36036634..a78d7dfeaa8d9393d2000d57ce8a78973d177916 100644
--- a/pyopencl/algorithm.py
+++ b/pyopencl/algorithm.py
@@ -541,12 +541,25 @@ ${generate_template}
 
 // {{{ kernel entry point
 
-__kernel void ${kernel_name}(${kernel_list_arg_decl} USER_ARG_DECL index_type n)
+__kernel
+%if do_not_vectorize:
+__attribute__((reqd_work_group_size(1, 1, 1)))
+%endif
+void ${kernel_name}(${kernel_list_arg_decl} USER_ARG_DECL index_type n)
+
 {
-    int lid = get_local_id(0);
-    index_type gsize = get_global_size(0);
-    index_type work_group_start = get_local_size(0)*get_group_id(0);
-    for (index_type i = work_group_start + lid; i < n; i += gsize)
+    %if not do_not_vectorize:
+        int lid = get_local_id(0);
+        index_type gsize = get_global_size(0);
+        index_type work_group_start = get_local_size(0)*get_group_id(0);
+        for (index_type i = work_group_start + lid; i < n; i += gsize)
+    %else:
+        const int chunk_size = 128;
+        index_type chunk_base = get_global_id(0)*chunk_size;
+        index_type gsize = get_global_size(0);
+        for (; chunk_base < n; chunk_base += gsize*chunk_size)
+        for (index_type i = chunk_base; i < min(n, chunk_base+chunk_size); ++i)
+    %endif
     {
         %if is_count_stage:
             %for name, dtype in list_names_and_dtypes:
@@ -640,7 +653,7 @@ class ListOfListsBuilder:
     """
     def __init__(self, context, list_names_and_dtypes, generate_template,
             arg_decls, count_sharing=None, devices=None, name_prefix="plb_build_list",
-            options=[], preamble=""):
+            options=[], preamble="", debug=False, complex_kernel=False):
         """
         :arg context: A :class:`pyopencl.Context`.
         :arg list_names_and_dtypes: a list of `(name, dtype)` tuples
@@ -654,6 +667,7 @@ class ListOfListsBuilder:
         :arg name_prefix: the name prefix to use for the compiled kernels
         :arg options: OpenCL compilation options for kernels using
             *generate_template*.
+        :arg complex_kernel: If `True`, prevents vectorization on CPUs.
 
         *generate_template* may use the following C macros/identifiers:
 
@@ -711,6 +725,10 @@ class ListOfListsBuilder:
         self.preamble = preamble
         self.options = options
 
+        self.debug = debug
+
+        self.complex_kernel = complex_kernel
+
     # {{{ kernel generators
 
     @memoize_method
@@ -724,6 +742,12 @@ class ListOfListsBuilder:
                 output_statement="ary[i+1] = item;",
                 devices=self.devices)
 
+    def do_not_vectorize(self):
+        from pytools import any
+        return (self.complex_kernel
+                and any(dev.type == cl.device_type.CPU
+                    for dev in self.context.devices))
+
     @memoize_method
     def get_count_kernel(self, index_dtype):
         index_ctype = dtype_to_ctype(index_dtype)
@@ -743,9 +767,15 @@ class ListOfListsBuilder:
                 index_ctype, name), name))
 
         kernel_name = self.name_prefix+"_count"
+
+        from pyopencl.characterize import has_double_support
         src = _LIST_BUILDER_TEMPLATE.render(
                 is_count_stage=True,
                 kernel_name=kernel_name,
+                double_support=all(has_double_support(dev) for dev in
+                    self.context.devices),
+                debug=self.debug,
+                do_not_vectorize=self.do_not_vectorize(),
 
                 kernel_list_arg_decl=_get_arg_decl(kernel_list_args),
                 kernel_list_arg_values=_get_arg_list(user_list_args, prefix="&"),
@@ -802,9 +832,15 @@ class ListOfListsBuilder:
             kernel_list_arg_values += "%s, &%s, " % (list_name, index_name)
 
         kernel_name = self.name_prefix+"_write"
+
+        from pyopencl.characterize import has_double_support
         src = _LIST_BUILDER_TEMPLATE.render(
                 is_count_stage=False,
                 kernel_name=kernel_name,
+                double_support=all(has_double_support(dev) for dev in
+                    self.context.devices),
+                debug=self.debug,
+                do_not_vectorize=self.do_not_vectorize(),
 
                 kernel_list_arg_decl=_get_arg_decl(kernel_list_args),
                 kernel_list_arg_values=kernel_list_arg_values,
@@ -846,12 +882,15 @@ class ListOfListsBuilder:
             arrays.
         :returns: a mapping from names to objects which have attributes
 
-            * `lists` for the array containing all lists
+            * `count` for the total number of entries in all lists combined
+            * `lists` for the array containing all lists.
             * `starts` for the array of starting indices in `lists`.
               `starts` is built so that it has n+1 entries, so that
               the *i*'th entry is the start of the *i*'th list, and the
               *i*'th entry is the index one past the *i*'th list's end,
               even for the last list.
+
+              This implies that all lists are contiguous.
         """
         if n_objects >= int(np.iinfo(np.int32).max):
             index_dtype = np.int64
@@ -886,7 +925,17 @@ class ListOfListsBuilder:
 
         # }}}
 
-        count_kernel(queue, (n_objects,), None,
+        if self.debug:
+            gsize = (1,)
+            lsize = (1,)
+        elif self.complex_kernel and queue.device.type == cl.device_type.CPU:
+            gsize = (4*queue.device.max_compute_units,)
+            lsize = (1,)
+        else:
+            from pyopencl.array import splay
+            gsize, lsize = splay(queue, n_objects)
+
+        count_kernel(queue, gsize, lsize,
                 *(tuple(count_list_args) + args + (n_objects,)))
 
         # {{{ run scans
@@ -935,8 +984,7 @@ class ListOfListsBuilder:
 
         # }}}
 
-
-        write_kernel(queue, (n_objects,), None,
+        write_kernel(queue, gsize, lsize,
                 *(tuple(write_list_args) + args + (n_objects,)))
 
         return result