From 98654721ed47e67d31b85ac7f07b0cfdb5572de7 Mon Sep 17 00:00:00 2001
From: Andreas Kloeckner <inform@tiker.net>
Date: Sun, 6 Jan 2013 02:27:52 +0100
Subject: [PATCH] Split test_algorithm and test_array.

---
 test/test_algorithm.py | 779 +++++++++++++++++++++++++++++++++++++++++
 test/test_array.py     | 712 -------------------------------------
 2 files changed, 779 insertions(+), 712 deletions(-)
 create mode 100644 test/test_algorithm.py

diff --git a/test/test_algorithm.py b/test/test_algorithm.py
new file mode 100644
index 00000000..0294eddd
--- /dev/null
+++ b/test/test_algorithm.py
@@ -0,0 +1,779 @@
+#! /usr/bin/env python
+
+__copyright__ = "Copyright (C) 2013 Andreas Kloeckner"
+
+__license__ = """
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+THE SOFTWARE.
+"""
+
+import numpy as np
+import numpy.linalg as la
+import sys
+import pytools.test
+from pytools import memoize
+from test_array import general_clrand
+
+
+def have_cl():  # True iff pyopencl is importable; only ImportError is absorbed
+    try:
+        import pyopencl  # noqa -- imported solely to probe availability
+        return True
+    except ImportError:  # was a bare except: let Ctrl-C/SystemExit propagate
+        return False
+
+if have_cl():
+    import pyopencl as cl
+    import pyopencl.array as cl_array
+    import pyopencl.tools as cl_tools
+    from pyopencl.tools import pytest_generate_tests_for_pyopencl \
+            as pytest_generate_tests
+    from pyopencl.characterize import has_double_support
+
+
+
+
+
+# {{{ elementwise
+
+@pytools.test.mark_test.opencl
+def test_elwise_kernel(ctx_factory):
+    context = ctx_factory()
+    queue = cl.CommandQueue(context)
+
+    from pyopencl.clrandom import rand as clrand
+
+    a_gpu = clrand(queue, (50,), np.float32)
+    b_gpu = clrand(queue, (50,), np.float32)
+
+    from pyopencl.elementwise import ElementwiseKernel
+    lin_comb = ElementwiseKernel(context,
+            "float a, float *x, float b, float *y, float *z",
+            "z[i] = a*x[i] + b*y[i]",
+            "linear_combination")
+
+    c_gpu = cl_array.empty_like(a_gpu)
+    lin_comb(5, a_gpu, 6, b_gpu, c_gpu)
+
+    assert la.norm((c_gpu - (5 * a_gpu + 6 * b_gpu)).get()) < 1e-5
+
+
+@pytools.test.mark_test.opencl
+def test_elwise_kernel_with_options(ctx_factory):
+    from pyopencl.clrandom import rand as clrand
+    from pyopencl.elementwise import ElementwiseKernel
+
+    context = ctx_factory()
+    queue = cl.CommandQueue(context)
+
+    in_gpu = clrand(queue, (50,), np.float32)
+
+    options = ['-D', 'ADD_ONE']
+    add_one = ElementwiseKernel(
+        context,
+        "float* out, const float *in",
+        """
+        out[i] = in[i]
+        #ifdef ADD_ONE
+            +1
+        #endif
+        ;
+        """,
+        options=options,
+        )
+
+    out_gpu = cl_array.empty_like(in_gpu)
+    add_one(out_gpu, in_gpu)
+
+    gt = in_gpu.get() + 1
+    gv = out_gpu.get()
+    assert la.norm(gv - gt) < 1e-5
+
+
+@pytools.test.mark_test.opencl
+def test_ranged_elwise_kernel(ctx_factory):
+    context = ctx_factory()
+    queue = cl.CommandQueue(context)
+
+    from pyopencl.elementwise import ElementwiseKernel
+    set_to_seven = ElementwiseKernel(context,
+            "float *z", "z[i] = 7", "set_to_seven")
+
+    for i, slc in enumerate([
+            slice(5, 20000),
+            slice(5, 20000, 17),
+            slice(3000, 5, -1),
+            slice(1000, -1),
+            ]):
+
+        a_gpu = cl_array.zeros(queue, (50000,), dtype=np.float32)
+        a_cpu = np.zeros(a_gpu.shape, a_gpu.dtype)
+
+        a_cpu[slc] = 7
+        set_to_seven(a_gpu, slice=slc)
+
+        assert (a_cpu == a_gpu.get()).all()
+
+@pytools.test.mark_test.opencl
+def test_take(ctx_factory):
+    context = ctx_factory()
+    queue = cl.CommandQueue(context)
+
+    idx = cl_array.arange(queue, 0, 200000, 2, dtype=np.uint32)
+    a = cl_array.arange(queue, 0, 600000, 3, dtype=np.float32)
+    result = cl_array.take(a, idx)
+    assert ((3 * idx).get() == result.get()).all()
+
+
+@pytools.test.mark_test.opencl
+def test_arange(ctx_factory):
+    context = ctx_factory()
+    queue = cl.CommandQueue(context)
+
+    n = 5000
+    a = cl_array.arange(queue, n, dtype=np.float32)
+    assert (np.arange(n, dtype=np.float32) == a.get()).all()
+
+
+@pytools.test.mark_test.opencl
+def test_reverse(ctx_factory):
+    context = ctx_factory()
+    queue = cl.CommandQueue(context)
+
+    n = 5000
+    a = np.arange(n).astype(np.float32)
+    a_gpu = cl_array.to_device(queue, a)
+
+    a_gpu = a_gpu.reverse()
+
+    assert (a[::-1] == a_gpu.get()).all()
+
+@pytools.test.mark_test.opencl
+def test_if_positive(ctx_factory):
+    context = ctx_factory()
+    queue = cl.CommandQueue(context)
+
+    from pyopencl.clrandom import rand as clrand
+
+    l = 20000
+    a_gpu = clrand(queue, (l,), np.float32)
+    b_gpu = clrand(queue, (l,), np.float32)
+    a = a_gpu.get()
+    b = b_gpu.get()
+
+    max_a_b_gpu = cl_array.maximum(a_gpu, b_gpu)
+    min_a_b_gpu = cl_array.minimum(a_gpu, b_gpu)
+
+    print(max_a_b_gpu)
+    print(np.maximum(a, b))
+
+    assert la.norm(max_a_b_gpu.get() - np.maximum(a, b)) == 0
+    assert la.norm(min_a_b_gpu.get() - np.minimum(a, b)) == 0
+
+
+@pytools.test.mark_test.opencl
+def test_take_put(ctx_factory):
+    context = ctx_factory()
+    queue = cl.CommandQueue(context)
+
+    for n in [5, 17, 333]:
+        one_field_size = 8
+        buf_gpu = cl_array.zeros(queue,
+                n * one_field_size, dtype=np.float32)
+        dest_indices = cl_array.to_device(queue,
+                np.array([0, 1, 2,  3, 32, 33, 34, 35], dtype=np.uint32))
+        read_map = cl_array.to_device(queue,
+                np.array([7, 6, 5, 4, 3, 2, 1, 0], dtype=np.uint32))
+
+        cl_array.multi_take_put(
+                arrays=[buf_gpu for i in range(n)],
+                dest_indices=dest_indices,
+                src_indices=read_map,
+                src_offsets=[i * one_field_size for i in range(n)],
+                dest_shape=(96,))
+
+
+@pytools.test.mark_test.opencl
+def test_astype(ctx_factory):
+    context = ctx_factory()
+    queue = cl.CommandQueue(context)
+
+    from pyopencl.clrandom import rand as clrand
+
+    if not has_double_support(context.devices[0]):
+        from py.test import skip
+        skip("double precision not supported on %s" % context.devices[0])
+
+    a_gpu = clrand(queue, (2000,), dtype=np.float32)
+
+    a = a_gpu.get().astype(np.float64)
+    a2 = a_gpu.astype(np.float64).get()
+
+    assert a2.dtype == np.float64
+    assert la.norm(a - a2) == 0, (a, a2)
+
+    a_gpu = clrand(queue, (2000,), dtype=np.float64)
+
+    a = a_gpu.get().astype(np.float32)
+    a2 = a_gpu.astype(np.float32).get()
+
+    assert a2.dtype == np.float32
+    assert la.norm(a - a2) / la.norm(a) < 1e-7
+
+# }}}
+
+# {{{ reduction
+
+@pytools.test.mark_test.opencl
+def test_sum(ctx_factory):
+    context = ctx_factory()
+    queue = cl.CommandQueue(context)
+
+    n = 200000
+    for dtype in [np.float32, np.complex64]:
+        a_gpu = general_clrand(queue, (n,), dtype)
+
+        a = a_gpu.get()
+
+        sum_a = np.sum(a)
+        sum_a_gpu = cl_array.sum(a_gpu).get()
+
+        assert abs(sum_a_gpu - sum_a) / abs(sum_a) < 1e-4
+
+
+@pytools.test.mark_test.opencl
+def test_minmax(ctx_factory):
+    context = ctx_factory()
+    queue = cl.CommandQueue(context)
+
+    from pyopencl.clrandom import rand as clrand
+
+    if has_double_support(context.devices[0]):
+        dtypes = [np.float64, np.float32, np.int32]
+    else:
+        dtypes = [np.float32, np.int32]
+
+    for what in ["min", "max"]:
+        for dtype in dtypes:
+            a_gpu = clrand(queue, (200000,), dtype)
+            a = a_gpu.get()
+
+            op_a = getattr(np, what)(a)
+            op_a_gpu = getattr(cl_array, what)(a_gpu).get()
+
+            assert op_a_gpu == op_a, (op_a_gpu, op_a, dtype, what)
+
+
+@pytools.test.mark_test.opencl
+def test_subset_minmax(ctx_factory):
+    context = ctx_factory()
+    queue = cl.CommandQueue(context)
+
+    from pyopencl.clrandom import rand as clrand
+
+    l_a = 200000
+    gran = 5
+    l_m = l_a - l_a // gran + 1
+
+    if has_double_support(context.devices[0]):
+        dtypes = [np.float64, np.float32, np.int32]
+    else:
+        dtypes = [np.float32, np.int32]
+
+    for dtype in dtypes:
+        a_gpu = clrand(queue, (l_a,), dtype)
+        a = a_gpu.get()
+
+        meaningful_indices_gpu = cl_array.zeros(
+                queue, l_m, dtype=np.int32)
+        meaningful_indices = meaningful_indices_gpu.get()
+        j = 0
+        for i in range(len(meaningful_indices)):
+            meaningful_indices[i] = j
+            j = j + 1
+            if j % gran == 0:
+                j = j + 1
+
+        meaningful_indices_gpu = cl_array.to_device(
+                queue, meaningful_indices)
+        b = a[meaningful_indices]
+
+        min_a = np.min(b)
+        min_a_gpu = cl_array.subset_min(meaningful_indices_gpu, a_gpu).get()
+
+        assert min_a_gpu == min_a
+
+
+@pytools.test.mark_test.opencl
+def test_dot(ctx_factory):
+    context = ctx_factory()
+    queue = cl.CommandQueue(context)
+
+    dtypes = [np.float32, np.complex64]
+    if has_double_support(context.devices[0]):
+        dtypes.extend([np.float64, np.complex128])
+
+    for a_dtype in dtypes:
+        for b_dtype in dtypes:
+            print(a_dtype, b_dtype)
+            a_gpu = general_clrand(queue, (200000,), a_dtype)
+            a = a_gpu.get()
+            b_gpu = general_clrand(queue, (200000,), b_dtype)
+            b = b_gpu.get()
+
+            dot_ab = np.dot(a, b)
+
+            dot_ab_gpu = cl_array.dot(a_gpu, b_gpu).get()
+
+            assert abs(dot_ab_gpu - dot_ab) / abs(dot_ab) < 1e-4
+
+@memoize
+def make_mmc_dtype(device):
+    dtype = np.dtype([
+        ("cur_min", np.int32),
+        ("cur_max", np.int32),
+        ("pad", np.int32),
+        ])
+
+    name = "minmax_collector"
+    from pyopencl.tools import get_or_register_dtype, match_dtype_to_c_struct
+
+    dtype, c_decl = match_dtype_to_c_struct(device, name, dtype)
+    dtype = get_or_register_dtype(name, dtype)
+
+    return dtype, c_decl
+
+@pytools.test.mark_test.opencl
+def test_struct_reduce(ctx_factory):
+    context = ctx_factory()
+    queue = cl.CommandQueue(context)
+
+    mmc_dtype, mmc_c_decl = make_mmc_dtype(context.devices[0])
+
+    preamble = mmc_c_decl + r"""//CL//
+
+    minmax_collector mmc_neutral()
+    {
+        // FIXME: needs infinity literal in real use, ok here
+        minmax_collector result;
+        result.cur_min = 1<<30;
+        result.cur_max = -(1<<30);
+        return result;
+    }
+
+    minmax_collector mmc_from_scalar(float x)
+    {
+        minmax_collector result;
+        result.cur_min = x;
+        result.cur_max = x;
+        return result;
+    }
+
+    minmax_collector agg_mmc(minmax_collector a, minmax_collector b)
+    {
+        minmax_collector result = a;
+        if (b.cur_min < result.cur_min)
+            result.cur_min = b.cur_min;
+        if (b.cur_max > result.cur_max)
+            result.cur_max = b.cur_max;
+        return result;
+    }
+
+    """
+
+    from pyopencl.clrandom import rand as clrand
+    a_gpu = clrand(queue, (20000,), dtype=np.int32, a=0, b=10**6)
+    a = a_gpu.get()
+
+    from pyopencl.reduction import ReductionKernel
+    red = ReductionKernel(context, mmc_dtype,
+            neutral="mmc_neutral()",
+            reduce_expr="agg_mmc(a, b)", map_expr="mmc_from_scalar(x[i])",
+            arguments="__global int *x", preamble=preamble)
+
+    minmax = red(a_gpu).get()
+    #print minmax["cur_min"], minmax["cur_max"]
+    #print np.min(a), np.max(a)
+
+    assert abs(minmax["cur_min"] - np.min(a)) < 1e-5
+    assert abs(minmax["cur_max"] - np.max(a)) < 1e-5
+
+# }}}
+
+# {{{ scan-related
+
+def summarize_error(obtained, desired, orig, thresh=1e-5):
+    """Return a one-line summary of where *obtained* deviates from *desired*.
+
+    Runs of entries within *thresh* collapse to ``<N ok>``; each bad run lists
+    its first 200 entries with their index, the rest become ``<N more bad>``.
+    """
+    err = obtained - desired
+    ok_count = 0
+    bad_count = 0
+    bad_limit = 200
+
+    def summarize_counts():
+        # flush the counters of the run that just ended into `entries`
+        if ok_count:
+            entries.append("<%d ok>" % ok_count)
+        if bad_count > bad_limit:
+            entries.append("<%d more bad>" % (bad_count-bad_limit))
+
+    entries = []
+    for i, val in enumerate(err):
+        if abs(val) > thresh:
+            if ok_count:
+                summarize_counts()
+                ok_count = 0
+            bad_count += 1
+            if bad_count <= bad_limit:
+                entries.append("%r (want: %r, got: %r, orig: %r)" % (
+                    i, desired[i], obtained[i], orig[i]))
+        else:
+            if bad_count:
+                summarize_counts()
+                bad_count = 0
+            ok_count += 1
+
+    summarize_counts()
+    return " ".join(entries)
+
+scan_test_counts = [
+    10,
+    2 ** 8 - 1,
+    2 ** 8,
+    2 ** 8 + 1,
+    2 ** 10 - 5,
+    2 ** 10,
+    2 ** 10 + 5,
+    2 ** 12 - 5,
+    2 ** 12,
+    2 ** 12 + 5,
+    2 ** 20 - 2 ** 18,
+    2 ** 20 - 2 ** 18 + 5,
+    2 ** 20 + 1,
+    2 ** 20,
+    2 ** 23 + 3,
+    2 ** 24 + 5
+    ]
+
+@pytools.test.mark_test.opencl
+def test_scan(ctx_factory):
+    context = ctx_factory()
+    queue = cl.CommandQueue(context)
+
+    from pyopencl.scan import InclusiveScanKernel, ExclusiveScanKernel
+
+    dtype = np.int32
+    for cls in [
+            InclusiveScanKernel,
+            ExclusiveScanKernel
+            ]:
+        knl = cls(context, dtype, "a+b", "0")
+
+        for n in scan_test_counts:
+            host_data = np.random.randint(0, 10, n).astype(dtype)
+            dev_data = cl_array.to_device(queue, host_data)
+
+            assert (host_data == dev_data.get()).all() # /!\ fails on Nv GT2?? for some drivers
+
+            knl(dev_data)
+
+            desired_result = np.cumsum(host_data, axis=0)
+            if cls is ExclusiveScanKernel:
+                desired_result -= host_data
+
+            is_ok = (dev_data.get() == desired_result).all()
+            if 1 and not is_ok:
+                print("something went wrong, summarizing error...")
+                print(summarize_error(dev_data.get(), desired_result, host_data))
+
+            print("n:%d %s worked:%s" % (n, cls, is_ok))
+            assert is_ok
+            from gc import collect
+            collect()
+
+@pytools.test.mark_test.opencl
+def test_copy_if(ctx_factory):
+    context = ctx_factory()
+    queue = cl.CommandQueue(context)
+
+    from pyopencl.clrandom import rand as clrand
+    for n in scan_test_counts:
+        a_dev = clrand(queue, (n,), dtype=np.int32, a=0, b=1000)
+        a = a_dev.get()
+
+        from pyopencl.algorithm import copy_if
+
+        crit = a_dev.dtype.type(300)
+        selected = a[a>crit]
+        selected_dev, count_dev = copy_if(a_dev, "ary[i] > myval", [("myval", crit)])
+
+        assert (selected_dev.get()[:count_dev.get()] == selected).all()
+        from gc import collect
+        collect()
+
+@pytools.test.mark_test.opencl
+def test_partition(ctx_factory):
+    context = ctx_factory()
+    queue = cl.CommandQueue(context)
+
+    from pyopencl.clrandom import rand as clrand
+    for n in scan_test_counts:
+        a_dev = clrand(queue, (n,), dtype=np.int32, a=0, b=1000)
+        a = a_dev.get()
+
+        crit = a_dev.dtype.type(300)
+        true_host = a[a>crit]
+        false_host = a[a<=crit]
+
+        from pyopencl.algorithm import partition
+        true_dev, false_dev, count_true_dev = partition(a_dev, "ary[i] > myval", [("myval", crit)])
+
+        count_true_dev = count_true_dev.get()
+
+        assert (true_dev.get()[:count_true_dev] == true_host).all()
+        assert (false_dev.get()[:n-count_true_dev] == false_host).all()
+
+@pytools.test.mark_test.opencl
+def test_unique(ctx_factory):
+    context = ctx_factory()
+    queue = cl.CommandQueue(context)
+
+    from pyopencl.clrandom import rand as clrand
+    for n in scan_test_counts:
+        a_dev = clrand(queue, (n,), dtype=np.int32, a=0, b=1000)
+        a = a_dev.get()
+        a = np.sort(a)
+        a_dev = cl_array.to_device(queue, a)
+
+        a_unique_host = np.unique(a)
+
+        from pyopencl.algorithm import unique
+        a_unique_dev, count_unique_dev = unique(a_dev)
+
+        count_unique_dev = count_unique_dev.get()
+
+        assert (a_unique_dev.get()[:count_unique_dev] == a_unique_host).all()
+        from gc import collect
+        collect()
+
+@pytools.test.mark_test.opencl
+def test_index_preservation(ctx_factory):
+    context = ctx_factory()
+    queue = cl.CommandQueue(context)
+
+    from pyopencl.scan import GenericScanKernel, GenericDebugScanKernel
+    classes = [GenericScanKernel]
+
+    dev = context.devices[0]
+    if dev.type == cl.device_type.CPU:
+        classes.append(GenericDebugScanKernel)
+
+    for cls in classes:
+        for n in scan_test_counts:
+            knl = cls(
+                    context, np.int32,
+                    arguments="__global int *out",
+                    input_expr="i",
+                    scan_expr="b", neutral="0",
+                    output_statement="""
+                        out[i] = item;
+                        """)
+
+            out = cl_array.empty(queue, n, dtype=np.int32)
+            knl(out)
+
+            assert (out.get() == np.arange(n)).all()
+            from gc import collect
+            collect()
+
+@pytools.test.mark_test.opencl
+def test_segmented_scan(ctx_factory):
+    context = ctx_factory()
+    queue = cl.CommandQueue(context)
+
+    from pyopencl.tools import dtype_to_ctype
+    dtype = np.int32
+    ctype = dtype_to_ctype(dtype)
+
+    #for is_exclusive in [False, True]:
+    for is_exclusive in [True, False]:
+        if is_exclusive:
+            output_statement = "out[i] = prev_item"
+        else:
+            output_statement = "out[i] = item"
+
+        from pyopencl.scan import GenericScanKernel
+        knl = GenericScanKernel(context, dtype,
+                arguments="__global %s *ary, __global char *segflags, __global %s *out"
+                    % (ctype, ctype),
+                input_expr="ary[i]",
+                scan_expr="across_seg_boundary ? b : (a+b)", neutral="0",
+                is_segment_start_expr="segflags[i]",
+                output_statement=output_statement,
+                options=[])
+
+        np.set_printoptions(threshold=2000)
+        from random import randrange
+        from pyopencl.clrandom import rand as clrand
+        for n in scan_test_counts:
+            a_dev = clrand(queue, (n,), dtype=dtype, a=0, b=10)
+            a = a_dev.get()
+
+            if 10 <= n < 20:
+                seg_boundaries_values = [
+                        [0, 9],
+                        [0, 3],
+                        [4, 6],
+                        ]
+            else:
+                seg_boundaries_values = []
+                for i in range(10):
+                    seg_boundary_count = max(2, min(100, randrange(0, int(0.4*n))))
+                    seg_boundaries = [randrange(n) for i in range(seg_boundary_count)]
+                    if n >= 1029:
+                        seg_boundaries.insert(0, 1028)
+                    seg_boundaries.sort()
+                    seg_boundaries_values.append(seg_boundaries)
+
+            for seg_boundaries in seg_boundaries_values:
+                #print "BOUNDARIES", seg_boundaries
+                #print a
+
+                seg_boundary_flags = np.zeros(n, dtype=np.uint8)
+                seg_boundary_flags[seg_boundaries] = 1
+                seg_boundary_flags_dev = cl_array.to_device(queue, seg_boundary_flags)
+
+                seg_boundaries.insert(0, 0)
+
+                result_host = a.copy()
+                for i, seg_start in enumerate(seg_boundaries):
+                    if i+1 < len(seg_boundaries):
+                        seg_end = seg_boundaries[i+1]
+                    else:
+                        seg_end = None
+
+                    if is_exclusive:
+                        result_host[seg_start+1:seg_end] = np.cumsum(
+                                a[seg_start:seg_end][:-1])
+                        result_host[seg_start] = 0
+                    else:
+                        result_host[seg_start:seg_end] = np.cumsum(
+                                a[seg_start:seg_end])
+
+                #print "REF", result_host
+
+                result_dev = cl_array.empty_like(a_dev)
+                knl(a_dev, seg_boundary_flags_dev, result_dev)
+
+                #print "RES", result_dev
+                is_correct = (result_dev.get() == result_host).all()
+                if not is_correct:
+                    diff = result_dev.get() - result_host
+                    print("RES-REF", diff)
+                    print("ERRWHERE", np.where(diff))
+                    print(n, list(seg_boundaries))
+
+                assert is_correct
+                from gc import collect
+                collect()
+
+            print("%d excl:%s done" % (n, is_exclusive))
+
+
+@pytools.test.mark_test.opencl
+def test_sort(ctx_factory):
+    context = ctx_factory()
+    queue = cl.CommandQueue(context)
+
+    dtype = np.int32
+
+    from pyopencl.algorithm import RadixSort
+    sort = RadixSort(context, "int *ary", key_expr="ary[i]",
+            sort_arg_names=["ary"])
+
+    from pyopencl.clrandom import RanluxGenerator
+    rng = RanluxGenerator(queue, seed=15)
+
+    from time import time
+
+    for n in scan_test_counts:
+        print(n)
+
+        print("  rng")
+        a_dev = rng.uniform(queue, (n,), dtype=dtype, a=0, b=2**16)
+        a = a_dev.get()
+
+        dev_start = time()
+        print("  device")
+        a_dev_sorted, = sort(a_dev, key_bits=16)
+        queue.finish()
+        dev_end = time()
+        print("  numpy")
+        a_sorted = np.sort(a)
+        numpy_end = time()
+
+        numpy_elapsed = numpy_end-dev_end
+        dev_elapsed = dev_end-dev_start
+        print ("  dev: %.2f MKeys/s numpy: %.2f MKeys/s ratio: %.2fx" % (
+                1e-6*n/dev_elapsed, 1e-6*n/numpy_elapsed, numpy_elapsed/dev_elapsed))
+        assert (a_dev_sorted.get() == a_sorted).all()
+
+@pytools.test.mark_test.opencl
+def test_list_builder(ctx_factory):
+    context = ctx_factory()
+    queue = cl.CommandQueue(context)
+
+    from pyopencl.algorithm import ListOfListsBuilder
+    builder = ListOfListsBuilder(context, [("mylist", np.int32)], """//CL//
+            void generate(LIST_ARG_DECL USER_ARG_DECL index_type i)
+            {
+                int count = i % 4;
+                for (int j = 0; j < count; ++j)
+                {
+                    APPEND_mylist(count);
+                }
+            }
+            """, arg_decls=[])
+
+    result = builder(queue, 2000)
+
+    inf = result["mylist"]
+    assert inf.count == 3000
+    assert (inf.lists.get()[-6:] == [1, 2, 2, 3, 3, 3]).all()
+
+# }}}
+
+
+
+
+if __name__ == "__main__":
+    # make sure that import failures get reported, instead of skipping the
+    # tests.
+    import pyopencl as cl
+
+    import sys
+    if len(sys.argv) > 1:
+        exec(sys.argv[1])
+    else:
+        from py.test.cmdline import main
+        main([__file__])
+
+# vim: filetype=pyopencl:fdm=marker
diff --git a/test/test_array.py b/test/test_array.py
index 14768af8..8e88f73d 100644
--- a/test/test_array.py
+++ b/test/test_array.py
@@ -443,718 +443,6 @@ def test_random(ctx_factory):
 
 # }}}
 
-# {{{ elementwise
-
-@pytools.test.mark_test.opencl
-def test_elwise_kernel(ctx_factory):
-    context = ctx_factory()
-    queue = cl.CommandQueue(context)
-
-    from pyopencl.clrandom import rand as clrand
-
-    a_gpu = clrand(queue, (50,), np.float32)
-    b_gpu = clrand(queue, (50,), np.float32)
-
-    from pyopencl.elementwise import ElementwiseKernel
-    lin_comb = ElementwiseKernel(context,
-            "float a, float *x, float b, float *y, float *z",
-            "z[i] = a*x[i] + b*y[i]",
-            "linear_combination")
-
-    c_gpu = cl_array.empty_like(a_gpu)
-    lin_comb(5, a_gpu, 6, b_gpu, c_gpu)
-
-    assert la.norm((c_gpu - (5 * a_gpu + 6 * b_gpu)).get()) < 1e-5
-
-
-@pytools.test.mark_test.opencl
-def test_elwise_kernel_with_options(ctx_factory):
-    from pyopencl.clrandom import rand as clrand
-    from pyopencl.elementwise import ElementwiseKernel
-
-    context = ctx_factory()
-    queue = cl.CommandQueue(context)
-
-    in_gpu = clrand(queue, (50,), np.float32)
-
-    options = ['-D', 'ADD_ONE']
-    add_one = ElementwiseKernel(
-        context,
-        "float* out, const float *in",
-        """
-        out[i] = in[i]
-        #ifdef ADD_ONE
-            +1
-        #endif
-        ;
-        """,
-        options=options,
-        )
-
-    out_gpu = cl_array.empty_like(in_gpu)
-    add_one(out_gpu, in_gpu)
-
-    gt = in_gpu.get() + 1
-    gv = out_gpu.get()
-    assert la.norm(gv - gt) < 1e-5
-
-
-@pytools.test.mark_test.opencl
-def test_ranged_elwise_kernel(ctx_factory):
-    context = ctx_factory()
-    queue = cl.CommandQueue(context)
-
-    from pyopencl.elementwise import ElementwiseKernel
-    set_to_seven = ElementwiseKernel(context,
-            "float *z", "z[i] = 7", "set_to_seven")
-
-    for i, slc in enumerate([
-            slice(5, 20000),
-            slice(5, 20000, 17),
-            slice(3000, 5, -1),
-            slice(1000, -1),
-            ]):
-
-        a_gpu = cl_array.zeros(queue, (50000,), dtype=np.float32)
-        a_cpu = np.zeros(a_gpu.shape, a_gpu.dtype)
-
-        a_cpu[slc] = 7
-        set_to_seven(a_gpu, slice=slc)
-
-        assert (a_cpu == a_gpu.get()).all()
-
-@pytools.test.mark_test.opencl
-def test_take(ctx_factory):
-    context = ctx_factory()
-    queue = cl.CommandQueue(context)
-
-    idx = cl_array.arange(queue, 0, 200000, 2, dtype=np.uint32)
-    a = cl_array.arange(queue, 0, 600000, 3, dtype=np.float32)
-    result = cl_array.take(a, idx)
-    assert ((3 * idx).get() == result.get()).all()
-
-
-@pytools.test.mark_test.opencl
-def test_arange(ctx_factory):
-    context = ctx_factory()
-    queue = cl.CommandQueue(context)
-
-    n = 5000
-    a = cl_array.arange(queue, n, dtype=np.float32)
-    assert (np.arange(n, dtype=np.float32) == a.get()).all()
-
-
-@pytools.test.mark_test.opencl
-def test_reverse(ctx_factory):
-    context = ctx_factory()
-    queue = cl.CommandQueue(context)
-
-    n = 5000
-    a = np.arange(n).astype(np.float32)
-    a_gpu = cl_array.to_device(queue, a)
-
-    a_gpu = a_gpu.reverse()
-
-    assert (a[::-1] == a_gpu.get()).all()
-
-@pytools.test.mark_test.opencl
-def test_if_positive(ctx_factory):
-    context = ctx_factory()
-    queue = cl.CommandQueue(context)
-
-    from pyopencl.clrandom import rand as clrand
-
-    l = 20000
-    a_gpu = clrand(queue, (l,), np.float32)
-    b_gpu = clrand(queue, (l,), np.float32)
-    a = a_gpu.get()
-    b = b_gpu.get()
-
-    max_a_b_gpu = cl_array.maximum(a_gpu, b_gpu)
-    min_a_b_gpu = cl_array.minimum(a_gpu, b_gpu)
-
-    print(max_a_b_gpu)
-    print(np.maximum(a, b))
-
-    assert la.norm(max_a_b_gpu.get() - np.maximum(a, b)) == 0
-    assert la.norm(min_a_b_gpu.get() - np.minimum(a, b)) == 0
-
-
-@pytools.test.mark_test.opencl
-def test_take_put(ctx_factory):
-    context = ctx_factory()
-    queue = cl.CommandQueue(context)
-
-    for n in [5, 17, 333]:
-        one_field_size = 8
-        buf_gpu = cl_array.zeros(queue,
-                n * one_field_size, dtype=np.float32)
-        dest_indices = cl_array.to_device(queue,
-                np.array([0, 1, 2,  3, 32, 33, 34, 35], dtype=np.uint32))
-        read_map = cl_array.to_device(queue,
-                np.array([7, 6, 5, 4, 3, 2, 1, 0], dtype=np.uint32))
-
-        cl_array.multi_take_put(
-                arrays=[buf_gpu for i in range(n)],
-                dest_indices=dest_indices,
-                src_indices=read_map,
-                src_offsets=[i * one_field_size for i in range(n)],
-                dest_shape=(96,))
-
-
-@pytools.test.mark_test.opencl
-def test_astype(ctx_factory):
-    context = ctx_factory()
-    queue = cl.CommandQueue(context)
-
-    from pyopencl.clrandom import rand as clrand
-
-    if not has_double_support(context.devices[0]):
-        from py.test import skip
-        skip("double precision not supported on %s" % context.devices[0])
-
-    a_gpu = clrand(queue, (2000,), dtype=np.float32)
-
-    a = a_gpu.get().astype(np.float64)
-    a2 = a_gpu.astype(np.float64).get()
-
-    assert a2.dtype == np.float64
-    assert la.norm(a - a2) == 0, (a, a2)
-
-    a_gpu = clrand(queue, (2000,), dtype=np.float64)
-
-    a = a_gpu.get().astype(np.float32)
-    a2 = a_gpu.astype(np.float32).get()
-
-    assert a2.dtype == np.float32
-    assert la.norm(a - a2) / la.norm(a) < 1e-7
-
-# }}}
-
-# {{{ reduction
-
-@pytools.test.mark_test.opencl
-def test_sum(ctx_factory):
-    context = ctx_factory()
-    queue = cl.CommandQueue(context)
-
-    n = 200000
-    for dtype in [np.float32, np.complex64]:
-        a_gpu = general_clrand(queue, (n,), dtype)
-
-        a = a_gpu.get()
-
-        sum_a = np.sum(a)
-        sum_a_gpu = cl_array.sum(a_gpu).get()
-
-        assert abs(sum_a_gpu - sum_a) / abs(sum_a) < 1e-4
-
-
-@pytools.test.mark_test.opencl
-def test_minmax(ctx_factory):
-    context = ctx_factory()
-    queue = cl.CommandQueue(context)
-
-    from pyopencl.clrandom import rand as clrand
-
-    if has_double_support(context.devices[0]):
-        dtypes = [np.float64, np.float32, np.int32]
-    else:
-        dtypes = [np.float32, np.int32]
-
-    for what in ["min", "max"]:
-        for dtype in dtypes:
-            a_gpu = clrand(queue, (200000,), dtype)
-            a = a_gpu.get()
-
-            op_a = getattr(np, what)(a)
-            op_a_gpu = getattr(cl_array, what)(a_gpu).get()
-
-            assert op_a_gpu == op_a, (op_a_gpu, op_a, dtype, what)
-
-
-@pytools.test.mark_test.opencl
-def test_subset_minmax(ctx_factory):
-    context = ctx_factory()
-    queue = cl.CommandQueue(context)
-
-    from pyopencl.clrandom import rand as clrand
-
-    l_a = 200000
-    gran = 5
-    l_m = l_a - l_a // gran + 1
-
-    if has_double_support(context.devices[0]):
-        dtypes = [np.float64, np.float32, np.int32]
-    else:
-        dtypes = [np.float32, np.int32]
-
-    for dtype in dtypes:
-        a_gpu = clrand(queue, (l_a,), dtype)
-        a = a_gpu.get()
-
-        meaningful_indices_gpu = cl_array.zeros(
-                queue, l_m, dtype=np.int32)
-        meaningful_indices = meaningful_indices_gpu.get()
-        j = 0
-        for i in range(len(meaningful_indices)):
-            meaningful_indices[i] = j
-            j = j + 1
-            if j % gran == 0:
-                j = j + 1
-
-        meaningful_indices_gpu = cl_array.to_device(
-                queue, meaningful_indices)
-        b = a[meaningful_indices]
-
-        min_a = np.min(b)
-        min_a_gpu = cl_array.subset_min(meaningful_indices_gpu, a_gpu).get()
-
-        assert min_a_gpu == min_a
-
-
-@pytools.test.mark_test.opencl
-def test_dot(ctx_factory):
-    context = ctx_factory()
-    queue = cl.CommandQueue(context)
-
-    dtypes = [np.float32, np.complex64]
-    if has_double_support(context.devices[0]):
-        dtypes.extend([np.float64, np.complex128])
-
-    for a_dtype in dtypes:
-        for b_dtype in dtypes:
-            print(a_dtype, b_dtype)
-            a_gpu = general_clrand(queue, (200000,), a_dtype)
-            a = a_gpu.get()
-            b_gpu = general_clrand(queue, (200000,), b_dtype)
-            b = b_gpu.get()
-
-            dot_ab = np.dot(a, b)
-
-            dot_ab_gpu = cl_array.dot(a_gpu, b_gpu).get()
-
-            assert abs(dot_ab_gpu - dot_ab) / abs(dot_ab) < 1e-4
-
-@memoize
-def make_mmc_dtype(device):
-    dtype = np.dtype([
-        ("cur_min", np.int32),
-        ("cur_max", np.int32),
-        ("pad", np.int32),
-        ])
-
-    name = "minmax_collector"
-    from pyopencl.tools import get_or_register_dtype, match_dtype_to_c_struct
-
-    dtype, c_decl = match_dtype_to_c_struct(device, name, dtype)
-    dtype = get_or_register_dtype(name, dtype)
-
-    return dtype, c_decl
-
-@pytools.test.mark_test.opencl
-def test_struct_reduce(ctx_factory):
-    context = ctx_factory()
-    queue = cl.CommandQueue(context)
-
-    mmc_dtype, mmc_c_decl = make_mmc_dtype(context.devices[0])
-
-    preamble = mmc_c_decl + r"""//CL//
-
-    minmax_collector mmc_neutral()
-    {
-        // FIXME: needs infinity literal in real use, ok here
-        minmax_collector result;
-        result.cur_min = 1<<30;
-        result.cur_max = -(1<<30);
-        return result;
-    }
-
-    minmax_collector mmc_from_scalar(float x)
-    {
-        minmax_collector result;
-        result.cur_min = x;
-        result.cur_max = x;
-        return result;
-    }
-
-    minmax_collector agg_mmc(minmax_collector a, minmax_collector b)
-    {
-        minmax_collector result = a;
-        if (b.cur_min < result.cur_min)
-            result.cur_min = b.cur_min;
-        if (b.cur_max > result.cur_max)
-            result.cur_max = b.cur_max;
-        return result;
-    }
-
-    """
-
-    from pyopencl.clrandom import rand as clrand
-    a_gpu = clrand(queue, (20000,), dtype=np.int32, a=0, b=10**6)
-    a = a_gpu.get()
-
-    from pyopencl.reduction import ReductionKernel
-    red = ReductionKernel(context, mmc_dtype,
-            neutral="mmc_neutral()",
-            reduce_expr="agg_mmc(a, b)", map_expr="mmc_from_scalar(x[i])",
-            arguments="__global int *x", preamble=preamble)
-
-    minmax = red(a_gpu).get()
-    #print minmax["cur_min"], minmax["cur_max"]
-    #print np.min(a), np.max(a)
-
-    assert abs(minmax["cur_min"] - np.min(a)) < 1e-5
-    assert abs(minmax["cur_max"] - np.max(a)) < 1e-5
-
-# }}}
-
-# {{{ scan-related
-
-def summarize_error(obtained, desired, orig, thresh=1e-5):
-    err = obtained - desired
-    ok_count = 0
-    bad_count = 0
-
-    bad_limit = 200
-
-    def summarize_counts():
-        if ok_count:
-            entries.append("<%d ok>" % ok_count)
-        if bad_count >= bad_limit:
-            entries.append("<%d more bad>" % (bad_count-bad_limit))
-
-    entries = []
-    for i, val in enumerate(err):
-        if abs(val) > thresh:
-            if ok_count:
-                summarize_counts()
-                ok_count = 0
-
-            bad_count += 1
-
-            if bad_count < bad_limit:
-                entries.append("%r (want: %r, got: %r, orig: %r)" % (obtained[i], desired[i],
-                    obtained[i], orig[i]))
-        else:
-            if bad_count:
-                summarize_counts()
-                bad_count = 0
-
-            ok_count += 1
-
-
-    summarize_counts()
-
-    return " ".join(entries)
-
-scan_test_counts = [
-    10,
-    2 ** 8 - 1,
-    2 ** 8,
-    2 ** 8 + 1,
-    2 ** 10 - 5,
-    2 ** 10,
-    2 ** 10 + 5,
-    2 ** 12 - 5,
-    2 ** 12,
-    2 ** 12 + 5,
-    2 ** 20 - 2 ** 18,
-    2 ** 20 - 2 ** 18 + 5,
-    2 ** 20 + 1,
-    2 ** 20,
-    2 ** 23 + 3,
-    2 ** 24 + 5
-    ]
-
-@pytools.test.mark_test.opencl
-def test_scan(ctx_factory):
-    context = ctx_factory()
-    queue = cl.CommandQueue(context)
-
-    from pyopencl.scan import InclusiveScanKernel, ExclusiveScanKernel
-
-    dtype = np.int32
-    for cls in [
-            InclusiveScanKernel,
-            ExclusiveScanKernel
-            ]:
-        knl = cls(context, dtype, "a+b", "0")
-
-        for n in scan_test_counts:
-            host_data = np.random.randint(0, 10, n).astype(dtype)
-            dev_data = cl_array.to_device(queue, host_data)
-
-            assert (host_data == dev_data.get()).all() # /!\ fails on Nv GT2?? for some drivers
-
-            knl(dev_data)
-
-            desired_result = np.cumsum(host_data, axis=0)
-            if cls is ExclusiveScanKernel:
-                desired_result -= host_data
-
-            is_ok = (dev_data.get() == desired_result).all()
-            if 1 and not is_ok:
-                print("something went wrong, summarizing error...")
-                print(summarize_error(dev_data.get(), desired_result, host_data))
-
-            print("n:%d %s worked:%s" % (n, cls, is_ok))
-            assert is_ok
-            from gc import collect
-            collect()
-
-@pytools.test.mark_test.opencl
-def test_copy_if(ctx_factory):
-    context = ctx_factory()
-    queue = cl.CommandQueue(context)
-
-    from pyopencl.clrandom import rand as clrand
-    for n in scan_test_counts:
-        a_dev = clrand(queue, (n,), dtype=np.int32, a=0, b=1000)
-        a = a_dev.get()
-
-        from pyopencl.algorithm import copy_if
-
-        crit = a_dev.dtype.type(300)
-        selected = a[a>crit]
-        selected_dev, count_dev = copy_if(a_dev, "ary[i] > myval", [("myval", crit)])
-
-        assert (selected_dev.get()[:count_dev.get()] == selected).all()
-        from gc import collect
-        collect()
-
-@pytools.test.mark_test.opencl
-def test_partition(ctx_factory):
-    context = ctx_factory()
-    queue = cl.CommandQueue(context)
-
-    from pyopencl.clrandom import rand as clrand
-    for n in scan_test_counts:
-        a_dev = clrand(queue, (n,), dtype=np.int32, a=0, b=1000)
-        a = a_dev.get()
-
-        crit = a_dev.dtype.type(300)
-        true_host = a[a>crit]
-        false_host = a[a<=crit]
-
-        from pyopencl.algorithm import partition
-        true_dev, false_dev, count_true_dev = partition(a_dev, "ary[i] > myval", [("myval", crit)])
-
-        count_true_dev = count_true_dev.get()
-
-        assert (true_dev.get()[:count_true_dev] == true_host).all()
-        assert (false_dev.get()[:n-count_true_dev] == false_host).all()
-
-@pytools.test.mark_test.opencl
-def test_unique(ctx_factory):
-    context = ctx_factory()
-    queue = cl.CommandQueue(context)
-
-    from pyopencl.clrandom import rand as clrand
-    for n in scan_test_counts:
-        a_dev = clrand(queue, (n,), dtype=np.int32, a=0, b=1000)
-        a = a_dev.get()
-        a = np.sort(a)
-        a_dev = cl_array.to_device(queue, a)
-
-        a_unique_host = np.unique(a)
-
-        from pyopencl.algorithm import unique
-        a_unique_dev, count_unique_dev = unique(a_dev)
-
-        count_unique_dev = count_unique_dev.get()
-
-        assert (a_unique_dev.get()[:count_unique_dev] == a_unique_host).all()
-        from gc import collect
-        collect()
-
-@pytools.test.mark_test.opencl
-def test_index_preservation(ctx_factory):
-    context = ctx_factory()
-    queue = cl.CommandQueue(context)
-
-    from pyopencl.scan import GenericScanKernel, GenericDebugScanKernel
-    classes = [GenericScanKernel]
-
-    dev = context.devices[0]
-    if dev.type == cl.device_type.CPU:
-        classes.append(GenericDebugScanKernel)
-
-    for cls in classes:
-        for n in scan_test_counts:
-            knl = cls(
-                    context, np.int32,
-                    arguments="__global int *out",
-                    input_expr="i",
-                    scan_expr="b", neutral="0",
-                    output_statement="""
-                        out[i] = item;
-                        """)
-
-            out = cl_array.empty(queue, n, dtype=np.int32)
-            knl(out)
-
-            assert (out.get() == np.arange(n)).all()
-            from gc import collect
-            collect()
-
-@pytools.test.mark_test.opencl
-def test_segmented_scan(ctx_factory):
-    context = ctx_factory()
-    queue = cl.CommandQueue(context)
-
-    from pyopencl.tools import dtype_to_ctype
-    dtype = np.int32
-    ctype = dtype_to_ctype(dtype)
-
-    #for is_exclusive in [False, True]:
-    for is_exclusive in [True, False]:
-        if is_exclusive:
-            output_statement = "out[i] = prev_item"
-        else:
-            output_statement = "out[i] = item"
-
-        from pyopencl.scan import GenericScanKernel
-        knl = GenericScanKernel(context, dtype,
-                arguments="__global %s *ary, __global char *segflags, __global %s *out"
-                    % (ctype, ctype),
-                input_expr="ary[i]",
-                scan_expr="across_seg_boundary ? b : (a+b)", neutral="0",
-                is_segment_start_expr="segflags[i]",
-                output_statement=output_statement,
-                options=[])
-
-        np.set_printoptions(threshold=2000)
-        from random import randrange
-        from pyopencl.clrandom import rand as clrand
-        for n in scan_test_counts:
-            a_dev = clrand(queue, (n,), dtype=dtype, a=0, b=10)
-            a = a_dev.get()
-
-            if 10 <= n < 20:
-                seg_boundaries_values = [
-                        [0, 9],
-                        [0, 3],
-                        [4, 6],
-                        ]
-            else:
-                seg_boundaries_values = []
-                for i in range(10):
-                    seg_boundary_count = max(2, min(100, randrange(0, int(0.4*n))))
-                    seg_boundaries = [randrange(n) for i in range(seg_boundary_count)]
-                    if n >= 1029:
-                        seg_boundaries.insert(0, 1028)
-                    seg_boundaries.sort()
-                    seg_boundaries_values.append(seg_boundaries)
-
-            for seg_boundaries in seg_boundaries_values:
-                #print "BOUNDARIES", seg_boundaries
-                #print a
-
-                seg_boundary_flags = np.zeros(n, dtype=np.uint8)
-                seg_boundary_flags[seg_boundaries] = 1
-                seg_boundary_flags_dev = cl_array.to_device(queue, seg_boundary_flags)
-
-                seg_boundaries.insert(0, 0)
-
-                result_host = a.copy()
-                for i, seg_start in enumerate(seg_boundaries):
-                    if i+1 < len(seg_boundaries):
-                        seg_end = seg_boundaries[i+1]
-                    else:
-                        seg_end = None
-
-                    if is_exclusive:
-                        result_host[seg_start+1:seg_end] = np.cumsum(
-                                a[seg_start:seg_end][:-1])
-                        result_host[seg_start] = 0
-                    else:
-                        result_host[seg_start:seg_end] = np.cumsum(
-                                a[seg_start:seg_end])
-
-                #print "REF", result_host
-
-                result_dev = cl_array.empty_like(a_dev)
-                knl(a_dev, seg_boundary_flags_dev, result_dev)
-
-                #print "RES", result_dev
-                is_correct = (result_dev.get() == result_host).all()
-                if not is_correct:
-                    diff = result_dev.get() - result_host
-                    print("RES-REF", diff)
-                    print("ERRWHERE", np.where(diff))
-                    print(n, list(seg_boundaries))
-
-                assert is_correct
-                from gc import collect
-                collect()
-
-            print("%d excl:%s done" % (n, is_exclusive))
-
-
-@pytools.test.mark_test.opencl
-def test_sort(ctx_factory):
-    context = ctx_factory()
-    queue = cl.CommandQueue(context)
-
-    dtype = np.int32
-
-    from pyopencl.algorithm import RadixSort
-    sort = RadixSort(context, "int *ary", key_expr="ary[i]",
-            sort_arg_names=["ary"])
-
-    from pyopencl.clrandom import RanluxGenerator
-    rng = RanluxGenerator(queue, seed=15)
-
-    from time import time
-
-    for n in scan_test_counts:
-        print(n)
-
-        print("  rng")
-        a_dev = rng.uniform(queue, (n,), dtype=dtype, a=0, b=2**16)
-        a = a_dev.get()
-
-        dev_start = time()
-        print("  device")
-        a_dev_sorted, = sort(a_dev, key_bits=16)
-        queue.finish()
-        dev_end = time()
-        print("  numpy")
-        a_sorted = np.sort(a)
-        numpy_end = time()
-
-        numpy_elapsed = numpy_end-dev_end
-        dev_elapsed = dev_end-dev_start
-        print ("  dev: %.2f MKeys/s numpy: %.2f MKeys/s ratio: %.2fx" % (
-                1e-6*n/dev_elapsed, 1e-6*n/numpy_elapsed, numpy_elapsed/dev_elapsed))
-        assert (a_dev_sorted.get() == a_sorted).all()
-
-@pytools.test.mark_test.opencl
-def test_list_builder(ctx_factory):
-    context = ctx_factory()
-    queue = cl.CommandQueue(context)
-
-    from pyopencl.algorithm import ListOfListsBuilder
-    builder = ListOfListsBuilder(context, [("mylist", np.int32)], """//CL//
-            void generate(LIST_ARG_DECL USER_ARG_DECL index_type i)
-            {
-                int count = i % 4;
-                for (int j = 0; j < count; ++j)
-                {
-                    APPEND_mylist(count);
-                }
-            }
-            """, arg_decls=[])
-
-    result = builder(queue, 2000)
-
-    inf = result["mylist"]
-    assert inf.count == 3000
-    assert (inf.lists.get()[-6:] == [1, 2, 2, 3, 3, 3]).all()
-
-# }}}
-
 # {{{ misc
 
 @pytools.test.mark_test.opencl
-- 
GitLab