diff --git a/pyopencl/scan.py b/pyopencl/scan.py
index ae380ff29a8567558a9220519412ca393b25caad..41dd7912e3912a467274d6d0213331d899ffc083 100644
--- a/pyopencl/scan.py
+++ b/pyopencl/scan.py
@@ -143,7 +143,7 @@ void ${name_prefix}_scan_intervals(
     )
 {
     // index K in first dimension used for carry storage
-    %if scan_dtype.itemsize > 4 and scan_dtype.itemsize % 8 == 0 and is_gpu:
+    %if use_bank_conflict_avoidance:
         // Avoid bank conflicts by adding a single 32-bit value to the size of
         // the scan type.
         struct __attribute__ ((__packed__)) wrapped_scan_type
@@ -1064,7 +1064,10 @@ class GenericScanKernel(_GenericScanKernelBase):
                 dev.local_mem_size
                 for dev in self.devices)
 
-        if self.devices[0].type == cl.device_type.CPU:
+        is_cpu = self.devices[0].type & cl.device_type.CPU
+        is_gpu = self.devices[0].type & cl.device_type.GPU
+
+        if is_cpu:
             # (about the widest vector a CPU can support, also taking
             # into account that CPUs don't hide latency by large work groups
             max_scan_wg_size = 16
@@ -1073,6 +1076,9 @@ class GenericScanKernel(_GenericScanKernelBase):
             max_scan_wg_size = min(dev.max_work_group_size for dev in self.devices)
             wg_size_multiples = 64
 
+        use_bank_conflict_avoidance = bool(
+                self.dtype.itemsize > 4 and self.dtype.itemsize % 8 == 0 and is_gpu)
+
         # k_group_size should be a power of two because of in-kernel
         # division by that number.
 
@@ -1082,11 +1088,12 @@ class GenericScanKernel(_GenericScanKernelBase):
                     wg_size_multiples):
 
                 k_group_size = 2**k_exp
-                lmem_use = self.get_local_mem_use(wg_size, k_group_size)
+                lmem_use = self.get_local_mem_use(k_group_size, wg_size,
+                        use_bank_conflict_avoidance)
                 if lmem_use + 256 <= avail_local_mem:
                     solutions.append((wg_size*k_group_size, k_group_size, wg_size))
 
-        if self.devices[0].type & cl.device_type.GPU:
+        if is_gpu:
             from pytools import any
             for wg_size_floor in [256, 192, 128]:
                 have_sol_above_floor = any(wg_size >= wg_size_floor
@@ -1109,7 +1116,8 @@ class GenericScanKernel(_GenericScanKernelBase):
                     input_fetch_exprs=self.input_fetch_exprs,
                     is_first_level=True,
                     store_segment_start_flags=self.store_segment_start_flags,
-                    k_group_size=k_group_size)
+                    k_group_size=k_group_size,
+                    use_bank_conflict_avoidance=use_bank_conflict_avoidance)
 
             # Will this device actually let us execute this kernel
             # at the desired work group size? Building it is the
@@ -1164,6 +1172,7 @@ class GenericScanKernel(_GenericScanKernelBase):
                 is_first_level=False,
                 store_segment_start_flags=False,
                 k_group_size=k_group_size,
+                use_bank_conflict_avoidance=use_bank_conflict_avoidance,
                 **second_level_build_kwargs)
 
         # }}}
@@ -1202,7 +1211,7 @@ class GenericScanKernel(_GenericScanKernelBase):
 
     # {{{ scan kernel build/properties
 
-    def get_local_mem_use(self, k_group_size, wg_size):
+    def get_local_mem_use(self, k_group_size, wg_size, use_bank_conflict_avoidance):
         arg_dtypes = {}
         for arg in self.parsed_args:
             arg_dtypes[arg.name] = arg.dtype
@@ -1211,9 +1220,13 @@ class GenericScanKernel(_GenericScanKernelBase):
         for name, arg_name, ife_offset in self.input_fetch_exprs:
             fetch_expr_offsets.setdefault(arg_name, set()).add(ife_offset)
 
+        itemsize = self.dtype.itemsize
+        if use_bank_conflict_avoidance:
+            itemsize += 4
+
         return (
                 # ldata
-                self.dtype.itemsize*(k_group_size+1)*(wg_size+1)
+                itemsize*(k_group_size+1)*(wg_size+1)
 
                 # l_segment_start_flags
                 + k_group_size*wg_size
@@ -1228,7 +1241,8 @@ class GenericScanKernel(_GenericScanKernelBase):
 
     def build_scan_kernel(self, max_wg_size, arguments, input_expr,
             is_segment_start_expr, input_fetch_exprs, is_first_level,
-            store_segment_start_flags, k_group_size):
+            store_segment_start_flags, k_group_size,
+            use_bank_conflict_avoidance):
         scalar_arg_dtypes = get_arg_list_scalar_arg_dtypes(arguments)
 
         # Empirically found on Nv hardware: no need to be bigger than this size
@@ -1245,6 +1259,7 @@ class GenericScanKernel(_GenericScanKernelBase):
             input_fetch_exprs=input_fetch_exprs,
             is_first_level=is_first_level,
             store_segment_start_flags=store_segment_start_flags,
+            use_bank_conflict_avoidance=use_bank_conflict_avoidance,
             **self.code_variables))
 
         prg = cl.Program(self.context, scan_src).build(self.options)
diff --git a/test/test_algorithm.py b/test/test_algorithm.py
index 59fa60f03adb6068247a4fe95678164b81fa88df..7f0f9f4c71b8ab4b73f79d4fd7afbe4537677201 100644
--- a/test/test_algorithm.py
+++ b/test/test_algorithm.py
@@ -25,7 +25,6 @@ THE SOFTWARE.
 import numpy as np
 import numpy.linalg as la
 import sys
-import pytools.test
 from pytools import memoize
 from test_array import general_clrand
 
@@ -36,11 +35,11 @@ import pyopencl.array as cl_array  # noqa
 from pyopencl.tools import (  # noqa
         pytest_generate_tests_for_pyopencl as pytest_generate_tests)
 from pyopencl.characterize import has_double_support
+from pyopencl.scan import InclusiveScanKernel, ExclusiveScanKernel
 
 
 # {{{ elementwise
 
-@pytools.test.mark_test.opencl
 def test_elwise_kernel(ctx_factory):
     context = ctx_factory()
     queue = cl.CommandQueue(context)
@@ -62,7 +61,6 @@ def test_elwise_kernel(ctx_factory):
     assert la.norm((c_gpu - (5 * a_gpu + 6 * b_gpu)).get()) < 1e-5
 
 
-@pytools.test.mark_test.opencl
 def test_elwise_kernel_with_options(ctx_factory):
     from pyopencl.clrandom import rand as clrand
     from pyopencl.elementwise import ElementwiseKernel
@@ -94,7 +92,6 @@ def test_elwise_kernel_with_options(ctx_factory):
     assert la.norm(gv - gt) < 1e-5
 
 
-@pytools.test.mark_test.opencl
 def test_ranged_elwise_kernel(ctx_factory):
     context = ctx_factory()
     queue = cl.CommandQueue(context)
@@ -119,7 +116,6 @@ def test_ranged_elwise_kernel(ctx_factory):
         assert (a_cpu == a_gpu.get()).all()
 
 
-@pytools.test.mark_test.opencl
 def test_take(ctx_factory):
     context = ctx_factory()
     queue = cl.CommandQueue(context)
@@ -130,7 +126,6 @@ def test_take(ctx_factory):
     assert ((3 * idx).get() == result.get()).all()
 
 
-@pytools.test.mark_test.opencl
 def test_arange(ctx_factory):
     context = ctx_factory()
     queue = cl.CommandQueue(context)
@@ -140,7 +135,6 @@ def test_arange(ctx_factory):
     assert (np.arange(n, dtype=np.float32) == a.get()).all()
 
 
-@pytools.test.mark_test.opencl
 def test_reverse(ctx_factory):
     context = ctx_factory()
     queue = cl.CommandQueue(context)
@@ -154,7 +148,6 @@ def test_reverse(ctx_factory):
     assert (a[::-1] == a_gpu.get()).all()
 
 
-@pytools.test.mark_test.opencl
 def test_if_positive(ctx_factory):
     context = ctx_factory()
     queue = cl.CommandQueue(context)
@@ -177,7 +170,6 @@ def test_if_positive(ctx_factory):
     assert la.norm(min_a_b_gpu.get() - np.minimum(a, b)) == 0
 
 
-@pytools.test.mark_test.opencl
 def test_take_put(ctx_factory):
     context = ctx_factory()
     queue = cl.CommandQueue(context)
@@ -199,7 +191,6 @@ def test_take_put(ctx_factory):
                 dest_shape=(96,))
 
 
-@pytools.test.mark_test.opencl
 def test_astype(ctx_factory):
     context = ctx_factory()
     queue = cl.CommandQueue(context)
@@ -231,7 +222,6 @@ def test_astype(ctx_factory):
 
 # {{{ reduction
 
-@pytools.test.mark_test.opencl
 def test_sum(ctx_factory):
     from pytest import importorskip
     importorskip("mako")
@@ -257,7 +247,6 @@ def test_sum(ctx_factory):
             assert abs(sum_a_gpu - sum_a) / abs(sum_a) < 1e-4
 
 
-@pytools.test.mark_test.opencl
 def test_minmax(ctx_factory):
     from pytest import importorskip
     importorskip("mako")
@@ -283,7 +272,6 @@ def test_minmax(ctx_factory):
             assert op_a_gpu == op_a, (op_a_gpu, op_a, dtype, what)
 
 
-@pytools.test.mark_test.opencl
 def test_subset_minmax(ctx_factory):
     from pytest import importorskip
     importorskip("mako")
@@ -326,7 +314,6 @@ def test_subset_minmax(ctx_factory):
         assert min_a_gpu == min_a
 
 
-@pytools.test.mark_test.opencl
 def test_dot(ctx_factory):
     from pytest import importorskip
     importorskip("mako")
@@ -374,7 +361,6 @@ def make_mmc_dtype(device):
     return dtype, c_decl
 
 
-@pytools.test.mark_test.opencl
 def test_struct_reduce(ctx_factory):
     pytest.importorskip("mako")
 
@@ -500,48 +486,41 @@ scan_test_counts = [
     ]
 
 
-@pytools.test.mark_test.opencl
-def test_scan(ctx_factory):
+@pytest.mark.parametrize("dtype", [np.int32, np.int64])
+@pytest.mark.parametrize("scan_cls", [InclusiveScanKernel, ExclusiveScanKernel])
+def test_scan(ctx_factory, dtype, scan_cls):
     from pytest import importorskip
     importorskip("mako")
 
     context = ctx_factory()
     queue = cl.CommandQueue(context)
 
-    from pyopencl.scan import InclusiveScanKernel, ExclusiveScanKernel
+    knl = scan_cls(context, dtype, "a+b", "0")
 
-    dtype = np.int32
-    for cls in [
-            InclusiveScanKernel,
-            ExclusiveScanKernel
-            ]:
-        knl = cls(context, dtype, "a+b", "0")
-
-        for n in scan_test_counts:
-            host_data = np.random.randint(0, 10, n).astype(dtype)
-            dev_data = cl_array.to_device(queue, host_data)
+    for n in scan_test_counts:
+        host_data = np.random.randint(0, 10, n).astype(dtype)
+        dev_data = cl_array.to_device(queue, host_data)
 
-            # /!\ fails on Nv GT2?? for some drivers
-            assert (host_data == dev_data.get()).all()
+        # /!\ fails on Nv GT2?? for some drivers
+        assert (host_data == dev_data.get()).all()
 
-            knl(dev_data)
+        knl(dev_data)
 
-            desired_result = np.cumsum(host_data, axis=0)
-            if cls is ExclusiveScanKernel:
-                desired_result -= host_data
+        desired_result = np.cumsum(host_data, axis=0)
+        if scan_cls is ExclusiveScanKernel:
+            desired_result -= host_data
 
-            is_ok = (dev_data.get() == desired_result).all()
-            if 1 and not is_ok:
-                print("something went wrong, summarizing error...")
-                print(summarize_error(dev_data.get(), desired_result, host_data))
+        is_ok = (dev_data.get() == desired_result).all()
+        if not is_ok:
+            print("something went wrong, summarizing error...")
+            print(summarize_error(dev_data.get(), desired_result, host_data))
 
-            print("n:%d %s worked:%s" % (n, cls, is_ok))
-            assert is_ok
-            from gc import collect
-            collect()
+        print("dtype:%s n:%d %s worked:%s" % (dtype, n, scan_cls, is_ok))
+        assert is_ok
+        from gc import collect
+        collect()
 
 
-@pytools.test.mark_test.opencl
 def test_copy_if(ctx_factory):
     from pytest import importorskip
     importorskip("mako")
@@ -566,7 +545,6 @@ def test_copy_if(ctx_factory):
         collect()
 
 
-@pytools.test.mark_test.opencl
 def test_partition(ctx_factory):
     from pytest import importorskip
     importorskip("mako")
@@ -595,7 +573,6 @@ def test_partition(ctx_factory):
         assert (false_dev.get()[:n-count_true_dev] == false_host).all()
 
 
-@pytools.test.mark_test.opencl
 def test_unique(ctx_factory):
     from pytest import importorskip
     importorskip("mako")
@@ -622,7 +599,6 @@ def test_unique(ctx_factory):
         collect()
 
 
-@pytools.test.mark_test.opencl
 def test_index_preservation(ctx_factory):
     from pytest import importorskip
     importorskip("mako")
@@ -656,7 +632,6 @@ def test_index_preservation(ctx_factory):
             collect()
 
 
-@pytools.test.mark_test.opencl
 def test_segmented_scan(ctx_factory):
     from pytest import importorskip
     importorskip("mako")
@@ -755,7 +730,6 @@ def test_segmented_scan(ctx_factory):
             print("%d excl:%s done" % (n, is_exclusive))
 
 
-@pytools.test.mark_test.opencl
 def test_sort(ctx_factory):
     from pytest import importorskip
     importorskip("mako")
@@ -798,7 +772,6 @@ def test_sort(ctx_factory):
         assert (a_dev_sorted.get() == a_sorted).all()
 
 
-@pytools.test.mark_test.opencl
 def test_list_builder(ctx_factory):
     from pytest import importorskip
     importorskip("mako")
@@ -825,7 +798,6 @@ def test_list_builder(ctx_factory):
     assert (inf.lists.get()[-6:] == [1, 2, 2, 3, 3, 3]).all()
 
 
-@pytools.test.mark_test.opencl
 def test_key_value_sorter(ctx_factory):
     from pytest import importorskip
     importorskip("mako")
diff --git a/test/test_array.py b/test/test_array.py
index c147dfaba7333d37c58dd1577e1139721f55f49f..e3258fb9ae01751f93d438b52d039885013b7ecd 100644
--- a/test/test_array.py
+++ b/test/test_array.py
@@ -25,7 +25,6 @@ THE SOFTWARE.
 import numpy as np
 import numpy.linalg as la
 import sys
-import pytools.test
 
 import pyopencl as cl
 import pyopencl.array as cl_array
@@ -71,7 +70,6 @@ def make_random_array(queue, dtype, size):
 
 # {{{ dtype-related
 
-@pytools.test.mark_test.opencl
 def test_basic_complex(ctx_factory):
     context = ctx_factory()
     queue = cl.CommandQueue(context)
@@ -88,7 +86,6 @@ def test_basic_complex(ctx_factory):
     assert la.norm((ary*c).get() - c*host_ary) < 1e-5 * la.norm(host_ary)
 
 
-@pytools.test.mark_test.opencl
 def test_mix_complex(ctx_factory):
     context = ctx_factory()
     queue = cl.CommandQueue(context)
@@ -159,7 +156,6 @@ def test_mix_complex(ctx_factory):
                     assert correct
 
 
-@pytools.test.mark_test.opencl
 def test_pow_neg1_vs_inv(ctx_factory):
     ctx = ctx_factory()
     queue = cl.CommandQueue(ctx)
@@ -179,7 +175,6 @@ def test_pow_neg1_vs_inv(ctx_factory):
     assert la.norm(res2-ref, np.inf) / la.norm(ref) < 1e-13
 
 
-@pytools.test.mark_test.opencl
 def test_vector_fill(ctx_factory):
     context = ctx_factory()
     queue = cl.CommandQueue(context)
@@ -192,7 +187,6 @@ def test_vector_fill(ctx_factory):
     a_gpu = cl_array.zeros(queue, 100, dtype=cl_array.vec.float4)
 
 
-@pytools.test.mark_test.opencl
 def test_absrealimag(ctx_factory):
     context = ctx_factory()
     queue = cl.CommandQueue(context)
@@ -227,7 +221,6 @@ def test_absrealimag(ctx_factory):
 
 # {{{ operators
 
-@pytools.test.mark_test.opencl
 def test_rmul_yields_right_type(ctx_factory):
     context = ctx_factory()
     queue = cl.CommandQueue(context)
@@ -242,7 +235,6 @@ def test_rmul_yields_right_type(ctx_factory):
     assert isinstance(two_a, cl_array.Array)
 
 
-@pytools.test.mark_test.opencl
 def test_pow_array(ctx_factory):
     context = ctx_factory()
     queue = cl.CommandQueue(context)
@@ -257,7 +249,6 @@ def test_pow_array(ctx_factory):
     assert (np.abs(pow(a, a) - result) < 1e-3).all()
 
 
-@pytools.test.mark_test.opencl
 def test_pow_number(ctx_factory):
     context = ctx_factory()
     queue = cl.CommandQueue(context)
@@ -269,7 +260,6 @@ def test_pow_number(ctx_factory):
     assert (np.abs(a ** 2 - result) < 1e-3).all()
 
 
-@pytools.test.mark_test.opencl
 def test_multiply(ctx_factory):
     """Test the muliplication of an array with a scalar. """
 
@@ -289,7 +279,6 @@ def test_multiply(ctx_factory):
                 assert (a * scalar == a_mult).all()
 
 
-@pytools.test.mark_test.opencl
 def test_multiply_array(ctx_factory):
     """Test the multiplication of two arrays."""
 
@@ -306,7 +295,6 @@ def test_multiply_array(ctx_factory):
     assert (a * a == a_squared).all()
 
 
-@pytools.test.mark_test.opencl
 def test_addition_array(ctx_factory):
     """Test the addition of two arrays."""
 
@@ -320,7 +308,6 @@ def test_addition_array(ctx_factory):
     assert (a + a == a_added).all()
 
 
-@pytools.test.mark_test.opencl
 def test_addition_scalar(ctx_factory):
     """Test the addition of an array and a scalar."""
 
@@ -334,7 +321,6 @@ def test_addition_scalar(ctx_factory):
     assert (7 + a == a_added).all()
 
 
-@pytools.test.mark_test.opencl
 def test_substract_array(ctx_factory):
     """Test the substraction of two arrays."""
     #test data
@@ -355,7 +341,6 @@ def test_substract_array(ctx_factory):
     assert (b - a == result).all()
 
 
-@pytools.test.mark_test.opencl
 def test_substract_scalar(ctx_factory):
     """Test the substraction of an array and a scalar."""
 
@@ -375,7 +360,6 @@ def test_substract_scalar(ctx_factory):
     assert (7 - a == result).all()
 
 
-@pytools.test.mark_test.opencl
 def test_divide_scalar(ctx_factory):
     """Test the division of an array and a scalar."""
 
@@ -392,7 +376,6 @@ def test_divide_scalar(ctx_factory):
     assert (np.abs(2 / a - result) < 1e-5).all()
 
 
-@pytools.test.mark_test.opencl
 def test_divide_array(ctx_factory):
     """Test the division of an array and a scalar. """
 
@@ -417,7 +400,6 @@ def test_divide_array(ctx_factory):
 
 # {{{ RNG
 
-@pytools.test.mark_test.opencl
 def test_random(ctx_factory):
     context = ctx_factory()
     queue = cl.CommandQueue(context)
@@ -461,7 +443,6 @@ def test_random(ctx_factory):
 
 # {{{ misc
 
-@pytools.test.mark_test.opencl
 def test_numpy_integer_shape(ctx_factory):
     context = ctx_factory()
     queue = cl.CommandQueue(context)
@@ -470,7 +451,6 @@ def test_numpy_integer_shape(ctx_factory):
     cl_array.empty(queue, (np.int32(17), np.int32(17)), np.float32)
 
 
-@pytools.test.mark_test.opencl
 def test_len(ctx_factory):
     context = ctx_factory()
     queue = cl.CommandQueue(context)
@@ -480,7 +460,6 @@ def test_len(ctx_factory):
     assert len(a_cpu) == 10
 
 
-@pytools.test.mark_test.opencl
 def test_stride_preservation(ctx_factory):
     context = ctx_factory()
     queue = cl.CommandQueue(context)
@@ -493,7 +472,6 @@ def test_stride_preservation(ctx_factory):
     assert np.allclose(AT_GPU.get(), AT)
 
 
-@pytools.test.mark_test.opencl
 def test_nan_arithmetic(ctx_factory):
     context = ctx_factory()
     queue = cl.CommandQueue(context)
@@ -519,7 +497,6 @@ def test_nan_arithmetic(ctx_factory):
     assert (np.isnan(ab) == np.isnan(ab_gpu)).all()
 
 
-@pytools.test.mark_test.opencl
 def test_mem_pool_with_arrays(ctx_factory):
     context = ctx_factory()
     queue = cl.CommandQueue(context)
@@ -532,7 +509,6 @@ def test_mem_pool_with_arrays(ctx_factory):
     assert b_dev.allocator is mem_pool
 
 
-@pytools.test.mark_test.opencl
 def test_view(ctx_factory):
     context = ctx_factory()
     queue = cl.CommandQueue(context)
@@ -557,7 +533,6 @@ def test_view(ctx_factory):
 
 # {{{ slices, concatenation
 
-@pytools.test.mark_test.opencl
 def test_slice(ctx_factory):
     context = ctx_factory()
     queue = cl.CommandQueue(context)
@@ -599,7 +574,6 @@ def test_slice(ctx_factory):
         assert la.norm(a_gpu.get() - a) == 0
 
 
-@pytools.test.mark_test.opencl
 def test_concatenate(ctx_factory):
     context = ctx_factory()
     queue = cl.CommandQueue(context)
@@ -623,7 +597,6 @@ def test_concatenate(ctx_factory):
 
 # {{{ conditionals, any, all
 
-@pytools.test.mark_test.opencl
 def test_comparisons(ctx_factory):
     context = ctx_factory()
     queue = cl.CommandQueue(context)
@@ -655,7 +628,6 @@ def test_comparisons(ctx_factory):
         assert (res_dev.get() == res).all()
 
 
-@pytools.test.mark_test.opencl
 def test_any_all(ctx_factory):
     context = ctx_factory()
     queue = cl.CommandQueue(context)
@@ -679,7 +651,6 @@ def test_any_all(ctx_factory):
 # }}}
 
 
-@pytools.test.mark_test.opencl
 def test_map_to_host(ctx_factory):
     context = ctx_factory()
     queue = cl.CommandQueue(context)
diff --git a/test/test_clmath.py b/test/test_clmath.py
index 1425cc46184ecd243e2305c967740058163c5e61..190dd6c43acc0184d955536c15ce5318ac105346 100644
--- a/test/test_clmath.py
+++ b/test/test_clmath.py
@@ -23,7 +23,6 @@ THE SOFTWARE.
 """
 import math
 import numpy as np
-import pytools.test
 
 def have_cl():
     try:
@@ -41,14 +40,9 @@ if have_cl():
     from pyopencl.characterize import has_double_support
 
 
-
-
-
 sizes = [10, 128, 1<<10, 1<<11, 1<<13]
 
 
-
-
 numpy_func_names = {
         "asin": "arcsin",
         "acos": "arccos",
@@ -56,8 +50,6 @@ numpy_func_names = {
         }
 
 
-
-
 def make_unary_function_test(name, limits=(0, 1), threshold=0, use_complex=False):
     (a, b) = limits
     a = float(a)
@@ -100,7 +92,7 @@ def make_unary_function_test(name, limits=(0, 1), threshold=0, use_complex=False
                 assert (max_err <= my_threshold).all(), \
                         (max_err, name, dtype)
 
-    return pytools.test.mark_test.opencl(test)
+    return test
 
 
 
@@ -129,7 +121,6 @@ if have_cl():
 
 
 
-@pytools.test.mark_test.opencl
 def test_fmod(ctx_factory):
     context = ctx_factory()
     queue = cl.CommandQueue(context)
@@ -146,7 +137,6 @@ def test_fmod(ctx_factory):
         for i in range(s):
             assert math.fmod(a[i], a2[i]) == b[i]
 
-@pytools.test.mark_test.opencl
 def test_ldexp(ctx_factory):
     context = ctx_factory()
     queue = cl.CommandQueue(context)
@@ -163,7 +153,6 @@ def test_ldexp(ctx_factory):
         for i in range(s):
             assert math.ldexp(a[i], int(a2[i])) == b[i]
 
-@pytools.test.mark_test.opencl
 def test_modf(ctx_factory):
     context = ctx_factory()
     queue = cl.CommandQueue(context)
@@ -182,7 +171,6 @@ def test_modf(ctx_factory):
             assert intpart_true == intpart[i]
             assert abs(fracpart_true - fracpart[i]) < 1e-4
 
-@pytools.test.mark_test.opencl
 def test_frexp(ctx_factory):
     context = ctx_factory()
     queue = cl.CommandQueue(context)
@@ -201,7 +189,6 @@ def test_frexp(ctx_factory):
             assert sig_true == significands[i]
             assert ex_true == exponents[i]
 
-@pytools.test.mark_test.opencl
 def test_bessel(ctx_factory):
     try:
         import scipy.special as spec
diff --git a/test/test_wrapper.py b/test/test_wrapper.py
index 7b5a65e23685c1d21ed5c716490b3d8812086caa..3a7a52d9be6edf848e16fa4affa7b767bbd0e8d5 100644
--- a/test/test_wrapper.py
+++ b/test/test_wrapper.py
@@ -24,8 +24,6 @@ THE SOFTWARE.
 
 import numpy as np
 import numpy.linalg as la
-import pytools.test
-
 
 import pyopencl as cl
 import pyopencl.array as cl_array
@@ -41,7 +39,6 @@ else:
     faulthandler.enable()
 
 
-@pytools.test.mark_test.opencl
 def test_get_info(ctx_factory):
     ctx = ctx_factory()
     device, = ctx.devices
@@ -189,7 +186,6 @@ def test_get_info(ctx_factory):
                 lambda info: img.get_image_info(info))
 
 
-@pytools.test.mark_test.opencl
 def test_int_ptr(ctx_factory):
     def do_test(obj):
         new_obj = type(obj).from_int_ptr(obj.int_ptr)
@@ -238,7 +234,6 @@ def test_int_ptr(ctx_factory):
         do_test(img)
 
 
-@pytools.test.mark_test.opencl
 def test_invalid_kernel_names_cause_failures(ctx_factory):
     ctx = ctx_factory()
     device = ctx.devices[0]
@@ -268,7 +263,6 @@ def test_invalid_kernel_names_cause_failures(ctx_factory):
             raise
 
 
-@pytools.test.mark_test.opencl
 def test_image_format_constructor():
     # doesn't need image support to succeed
     iform = cl.ImageFormat(cl.channel_order.RGBA, cl.channel_type.FLOAT)
@@ -278,7 +272,6 @@ def test_image_format_constructor():
     assert not iform.__dict__
 
 
-@pytools.test.mark_test.opencl
 def test_nonempty_supported_image_formats(ctx_factory):
     context = ctx_factory()
 
@@ -292,7 +285,6 @@ def test_nonempty_supported_image_formats(ctx_factory):
         skip("images not supported on %s" % device.name)
 
 
-@pytools.test.mark_test.opencl
 def test_that_python_args_fail(ctx_factory):
     context = ctx_factory()
 
@@ -325,7 +317,6 @@ def test_that_python_args_fail(ctx_factory):
     cl.enqueue_read_buffer(queue, a_buf, a_result).wait()
 
 
-@pytools.test.mark_test.opencl
 def test_image_2d(ctx_factory):
     context = ctx_factory()
 
@@ -402,7 +393,6 @@ def test_image_2d(ctx_factory):
             assert good
 
 
-@pytools.test.mark_test.opencl
 def test_image_3d(ctx_factory):
     #test for image_from_array for 3d image of float2
     context = ctx_factory()
@@ -478,7 +468,6 @@ def test_image_3d(ctx_factory):
             assert good
 
 
-@pytools.test.mark_test.opencl
 def test_copy_buffer(ctx_factory):
     context = ctx_factory()
 
@@ -497,7 +486,6 @@ def test_copy_buffer(ctx_factory):
     assert la.norm(a - b) == 0
 
 
-@pytools.test.mark_test.opencl
 def test_mempool(ctx_factory):
     from pyopencl.tools import MemoryPool, CLAllocator
 
@@ -517,7 +505,6 @@ def test_mempool(ctx_factory):
     pool.stop_holding()
 
 
-@pytools.test.mark_test.opencl
 def test_mempool_2():
     from pyopencl.tools import MemoryPool
     from random import randrange
@@ -532,7 +519,6 @@ def test_mempool_2():
         assert asize < asize*(1+1/8)
 
 
-@pytools.test.mark_test.opencl
 def test_vector_args(ctx_factory):
     context = ctx_factory()
     queue = cl.CommandQueue(context)
@@ -554,7 +540,6 @@ def test_vector_args(ctx_factory):
     assert (dest == x).all()
 
 
-@pytools.test.mark_test.opencl
 def test_header_dep_handling(ctx_factory):
     context = ctx_factory()
 
@@ -575,7 +560,6 @@ def test_header_dep_handling(ctx_factory):
     cl.Program(context, kernel_src).build(["-I", os.getcwd()])
 
 
-@pytools.test.mark_test.opencl
 def test_context_dep_memoize(ctx_factory):
     context = ctx_factory()
 
@@ -593,7 +577,6 @@ def test_context_dep_memoize(ctx_factory):
     assert counter[0] == 1
 
 
-@pytools.test.mark_test.opencl
 def test_can_build_binary(ctx_factory):
     ctx = ctx_factory()
     device, = ctx.devices