diff --git a/pyopencl/algorithm.py b/pyopencl/algorithm.py
index 3dfdab556ca13fc9cc13a26f441617a34f39edc3..01a02b14993584a017dc8bfdb2e21098cefbf31f 100644
--- a/pyopencl/algorithm.py
+++ b/pyopencl/algorithm.py
@@ -79,7 +79,9 @@ def copy_if(ary, predicate, extra_args=[], preamble="", queue=None, wait_for=Non
     :returns: a tuple *(out, count, event)* where *out* is the output array, *count*
         is an on-device scalar (fetch to host with `count.get()`) indicating
         how many elements satisfied *predicate*, and *event* is a
-        :class:`pyopencl.Event` for dependency management.
+        :class:`pyopencl.Event` for dependency management. *out* is allocated
+        to the same length as *ary*, but only the first *count* entries carry
+        meaning.
 
     .. versionadded:: 2013.1
     """
@@ -808,7 +810,7 @@ class ListOfListsBuilder:
     def do_not_vectorize(self):
         from pytools import any
         return (self.complex_kernel
-                and any(dev.type == cl.device_type.CPU
+                and any(dev.type & cl.device_type.CPU
                     for dev in self.context.devices))
 
     @memoize_method
@@ -975,6 +977,9 @@ class ListOfListsBuilder:
         result = {}
         count_list_args = []
 
+        if wait_for is None:
+            wait_for = []
+
         count_kernel = self.get_count_kernel(index_dtype)
         write_kernel = self.get_write_kernel(index_dtype)
         scan_kernel = self.get_scan_kernel(index_dtype)
@@ -987,6 +992,8 @@ class ListOfListsBuilder:
 
             counts = cl.array.empty(queue,
                     (n_objects + 1), index_dtype, allocator=allocator)
+            counts[-1] = 0
+            wait_for = wait_for + counts.events
 
             # The scan will turn the "counts" array into the "starts" array
             # in-place.
@@ -1027,11 +1034,7 @@ class ListOfListsBuilder:
             scan_events.append(evt)
 
             # retrieve count
-            count = np.array(1, index_dtype)
-            cl.enqueue_copy(queue, count, starts_ary.data,
-                    device_offset=index_dtype.itemsize*n_objects)
-
-            info_record.count = int(count)
+            info_record.count = int(starts_ary[-1].get())
 
         # }}}