diff --git a/pyopencl/elementwise.py b/pyopencl/elementwise.py
index 8e77fb80023bd0c4d913674eae610d29358a6e29..eeb6dde0e8283b0304884ff48d97b78b65f1e6a5 100644
--- a/pyopencl/elementwise.py
+++ b/pyopencl/elementwise.py
@@ -28,18 +28,14 @@ OTHER DEALINGS IN THE SOFTWARE.
 """
 
 
-
-
 from pyopencl.tools import context_dependent_memoize
 import numpy as np
 import pyopencl as cl
 from pyopencl.tools import dtype_to_ctype, VectorArg, ScalarArg
 
 
-
-
 def get_elwise_program(context, arguments, operation,
-        name="elwise_kernel", keep=False, options=[],
+        name="elwise_kernel", options=[],
         preamble="", loop_prep="", after_loop=""):
     from pyopencl import Program
     source = ("""
@@ -73,10 +69,8 @@ def get_elwise_program(context, arguments, operation,
     return Program(context, source).build(options)
 
 
-
-
 def get_elwise_kernel_and_types(context, arguments, operation,
-        name="elwise_kernel", keep=False, options=[], preamble="", **kwargs):
+        name="elwise_kernel", options=[], preamble="", **kwargs):
     if isinstance(arguments, str):
         from pyopencl.tools import parse_c_arg
         parsed_args = [parse_c_arg(arg) for arg in arguments.split(",")]
@@ -84,7 +78,7 @@ def get_elwise_kernel_and_types(context, arguments, operation,
         parsed_args = arguments
 
     for arg in parsed_args:
-        if np.float64  == arg.dtype:
+        if np.float64 == arg.dtype:
             preamble = (
                     "#pragma OPENCL EXTENSION cl_khr_fp64: enable\n\n\n"
                     + preamble)
@@ -92,8 +86,9 @@ def get_elwise_kernel_and_types(context, arguments, operation,
 
     parsed_args.append(ScalarArg(np.uintp, "n"))
 
-    prg = get_elwise_program(context, parsed_args, operation, name,
-            keep, options, preamble, **kwargs)
+    prg = get_elwise_program(
+        context, parsed_args, operation,
+        name=name, options=options, preamble=preamble, **kwargs)
 
     scalar_arg_dtypes = []
     for arg in parsed_args:
@@ -108,26 +103,27 @@ def get_elwise_kernel_and_types(context, arguments, operation,
     return kernel, parsed_args
 
 
-
-
 def get_elwise_kernel(context, arguments, operation,
         name="elwise_kernel", options=[], **kwargs):
     """Return a L{pyopencl.Kernel} that performs the same scalar operation
     on one or several vectors.
     """
-    func, arguments = get_elwise_kernel_and_types(context,
-            arguments, operation, name, options, **kwargs)
+    func, arguments = get_elwise_kernel_and_types(
+        context, arguments, operation,
+        name=name, options=options, **kwargs)
 
     return func
 
 
-
-
 class ElementwiseKernel:
+
     def __init__(self, context, arguments, operation,
             name="elwise_kernel", options=[], **kwargs):
-        self.kernel, self.arguments = get_elwise_kernel_and_types(context,
-            arguments, operation, name, options, **kwargs)
+
+        self.kernel, self.arguments = get_elwise_kernel_and_types(
+            context, arguments, operation,
+            name=name, options=options,
+            **kwargs)
 
         if not [i for i, arg in enumerate(self.arguments)
                 if isinstance(arg, VectorArg)]:
@@ -165,8 +161,6 @@ class ElementwiseKernel:
         return cl.enqueue_nd_range_kernel(queue, self.kernel, gs, ls)
 
 
-
-
 @context_dependent_memoize
 def get_take_kernel(context, dtype, idx_dtype, vec_count=1):
     ctx = {
@@ -174,8 +168,10 @@ def get_take_kernel(context, dtype, idx_dtype, vec_count=1):
             "tp": dtype_to_ctype(dtype),
             }
 
-    args = ([VectorArg(dtype, "dest"+str(i))for i in range(vec_count)]
-            + [VectorArg(dtype, "src"+str(i))for i in range(vec_count)] 
+    args = ([VectorArg(dtype, "dest" + str(i))
+             for i in range(vec_count)]
+            + [VectorArg(dtype, "src" + str(i))
+               for i in range(vec_count)]
             + [VectorArg(idx_dtype, "idx")])
     body = (
             ("%(idx_tp)s src_idx = idx[i];\n" % ctx)
@@ -183,9 +179,7 @@ def get_take_kernel(context, dtype, idx_dtype, vec_count=1):
             "dest%d[i] = src%d[src_idx];" % (i, i)
             for i in range(vec_count)))
 
-    return get_elwise_kernel(context, args, body, "take")
-
-
+    return get_elwise_kernel(context, args, body, name="take")
 
 
 @context_dependent_memoize
@@ -223,9 +217,7 @@ def get_take_put_kernel(context, dtype, idx_dtype, with_offsets, vec_count=1):
                 "%(idx_tp)s dest_idx = gmem_dest_idx[i];\n" % ctx)
             + "\n".join(get_copy_insn(i) for i in range(vec_count)))
 
-    return get_elwise_kernel(context, args, body, "take_put")
-
-
+    return get_elwise_kernel(context, args, body, name="take_put")
 
 
 @context_dependent_memoize
@@ -240,7 +232,7 @@ def get_put_kernel(context, dtype, idx_dtype, vec_count=1):
                 for i in range(vec_count)
             ] + [
             VectorArg(idx_dtype, "gmem_dest_idx"),
-            ]  + [
+            ] + [
             VectorArg(dtype, "src%d" % i)
                 for i in range(vec_count)
             ]
@@ -250,9 +242,7 @@ def get_put_kernel(context, dtype, idx_dtype, vec_count=1):
             + "\n".join("dest%d[dest_idx] = src%d[i];" % (i, i)
                 for i in range(vec_count)))
 
-    return get_elwise_kernel(args, body, "put")
-
-
+    return get_elwise_kernel(context, args, body, name="put")
 
 
 @context_dependent_memoize
@@ -263,8 +253,7 @@ def get_copy_kernel(context, dtype_dest, dtype_src):
                 "tp_src": dtype_to_ctype(dtype_src),
                 },
             "dest[i] = src[i]",
-            "copy")
-
+            name="copy")
 
 
 @context_dependent_memoize
@@ -273,12 +262,12 @@ def get_linear_combination_kernel(summand_descriptors,
     # TODO: Port this!
     raise NotImplementedError
 
-    from pycuda.tools import dtype_to_ctype
-    from pycuda.elementwise import \
+    from pyopencl.tools import dtype_to_ctype
+    from pyopencl.elementwise import \
             VectorArg, ScalarArg, get_elwise_module
 
     args = []
-    preamble = [ "#include <pycuda-helpers.hpp>\n\n" ]
+    preamble = []
     loop_prep = []
     summands = []
     tex_names = []
@@ -312,13 +301,11 @@ def get_linear_combination_kernel(summand_descriptors,
     func = mod.get_function("linear_combination")
     tex_src = [mod.get_texref(tn) for tn in tex_names]
     func.prepare("".join(arg.struct_char for arg in args),
-            (1,1,1), texrefs=tex_src)
+            (1, 1, 1), texrefs=tex_src)
 
     return func, tex_src
 
 
-
-
 @context_dependent_memoize
 def get_axpbyz_kernel(context, dtype_x, dtype_y, dtype_z):
     return get_elwise_kernel(context,
@@ -328,7 +315,8 @@ def get_axpbyz_kernel(context, dtype_x, dtype_y, dtype_z):
                 "tp_z": dtype_to_ctype(dtype_z),
                 },
             "z[i] = a*x[i] + b*y[i]",
-            "axpbyz")
+            name="axpbyz")
+
 
 @context_dependent_memoize
 def get_axpbz_kernel(context, dtype):
@@ -336,7 +324,8 @@ def get_axpbz_kernel(context, dtype):
             "%(tp)s *z, %(tp)s a, %(tp)s *x,%(tp)s b" % {
                 "tp": dtype_to_ctype(dtype)},
             "z[i] = a * x[i] + b",
-            "axpb")
+            name="axpb")
+
 
 @context_dependent_memoize
 def get_multiply_kernel(context, dtype_x, dtype_y, dtype_z):
@@ -347,7 +336,8 @@ def get_multiply_kernel(context, dtype_x, dtype_y, dtype_z):
                 "tp_z": dtype_to_ctype(dtype_z),
                 },
             "z[i] = x[i] * y[i]",
-            "multiply")
+            name="multiply")
+
 
 @context_dependent_memoize
 def get_divide_kernel(context, dtype_x, dtype_y, dtype_z):
@@ -358,7 +348,8 @@ def get_divide_kernel(context, dtype_x, dtype_y, dtype_z):
                 "tp_z": dtype_to_ctype(dtype_z),
                 },
             "z[i] = x[i] / y[i]",
-            "divide")
+            name="divide")
+
 
 @context_dependent_memoize
 def get_rdivide_elwise_kernel(context, dtype):
@@ -367,7 +358,8 @@ def get_rdivide_elwise_kernel(context, dtype):
                 "tp": dtype_to_ctype(dtype),
                 },
             "z[i] = y / x[i]",
-            "divide_r")
+            name="divide_r")
+
 
 @context_dependent_memoize
 def get_fill_kernel(context, dtype):
@@ -376,7 +368,8 @@ def get_fill_kernel(context, dtype):
                 "tp": dtype_to_ctype(dtype),
                 },
             "z[i] = a",
-            "fill")
+            name="fill")
+
 
 @context_dependent_memoize
 def get_reverse_kernel(context, dtype):
@@ -385,7 +378,8 @@ def get_reverse_kernel(context, dtype):
                 "tp": dtype_to_ctype(dtype),
                 },
             "z[i] = y[n-1-i]",
-            "reverse")
+            name="reverse")
+
 
 @context_dependent_memoize
 def get_arange_kernel(context, dtype):
@@ -394,7 +388,7 @@ def get_arange_kernel(context, dtype):
                 "tp": dtype_to_ctype(dtype),
                 },
             "z[i] = start + i*step",
-            "arange")
+            name="arange")
 
 
 @context_dependent_memoize
@@ -404,7 +398,8 @@ def get_pow_kernel(context, dtype):
                 "tp": dtype_to_ctype(dtype),
                 },
             "z[i] = pow(y[i], value)",
-            "pow_method")
+            name="pow_method")
+
 
 @context_dependent_memoize
 def get_pow_array_kernel(context, dtype_x, dtype_y, dtype_z):
@@ -415,21 +410,24 @@ def get_pow_array_kernel(context, dtype_x, dtype_y, dtype_z):
                 "tp_z": dtype_to_ctype(dtype_z),
                 },
             "z[i] = pow(x[i], y[i])",
-            "pow_method")
+            name="pow_method")
+
 
 @context_dependent_memoize
 def get_fmod_kernel(context):
     return get_elwise_kernel(context,
             "float *z, float *arg, float *mod",
             "z[i] = fmod(arg[i], mod[i])",
-            "fmod_kernel")
+            name="fmod_kernel")
+
 
 @context_dependent_memoize
 def get_modf_kernel(context):
     return get_elwise_kernel(context,
             "float *intpart ,float *fracpart, float *x",
             "fracpart[i] = modf(x[i], &intpart[i])",
-            "modf_kernel")
+            name="modf_kernel")
+
 
 @context_dependent_memoize
 def get_frexp_kernel(context):
@@ -440,14 +438,16 @@ def get_frexp_kernel(context):
                 significand[i] = frexp(x[i], &expt);
                 exponent[i] = expt;
             """,
-            "frexp_kernel")
+            name="frexp_kernel")
+
 
 @context_dependent_memoize
 def get_ldexp_kernel(context):
     return get_elwise_kernel(context,
             "float *z, float *sig, float *expt",
             "z[i] = ldexp(sig[i], (int) expt[i])",
-            "ldexp_kernel")
+            name="ldexp_kernel")
+
 
 @context_dependent_memoize
 def get_unary_func_kernel(context, func_name, in_dtype, out_dtype=None):
@@ -460,9 +460,7 @@ def get_unary_func_kernel(context, func_name, in_dtype, out_dtype=None):
                 "tp_out": dtype_to_ctype(out_dtype),
                 },
             "z[i] = %s(y[i])" % func_name,
-            "%s_kernel" % func_name)
-
-
+            name="%s_kernel" % func_name)
 
 
 @context_dependent_memoize
@@ -474,4 +472,4 @@ def get_if_positive_kernel(context, crit_dtype, dtype):
             VectorArg(dtype, "else_"),
             ],
             "result[i] = crit[i] > 0 ? then_[i] : else_[i]",
-            "if_positive")
+            name="if_positive")
diff --git a/test/test_array.py b/test/test_array.py
index 4269eb855e38d92ca86fc6c37d8ab11be904e591..591b7e7ea7c0be785d3a631fc337716235c74950 100644
--- a/test/test_array.py
+++ b/test/test_array.py
@@ -5,8 +5,6 @@ import sys
 import pytools.test
 
 
-
-
 def have_cl():
     try:
         import pyopencl
@@ -22,36 +20,31 @@ if have_cl():
     from pyopencl.characterize import has_double_support
 
 
-
-
 @pytools.test.mark_test.opencl
 def test_pow_array(ctx_getter):
     context = ctx_getter()
     queue = cl.CommandQueue(context)
 
-    a = np.array([1,2,3,4,5]).astype(np.float32)
+    a = np.array([1, 2, 3, 4, 5]).astype(np.float32)
     a_gpu = cl_array.to_device(queue, a)
 
-    result = pow(a_gpu,a_gpu).get()
-    assert (np.abs(a**a - result) < 1e-3).all()
+    result = pow(a_gpu, a_gpu).get()
+    assert (np.abs(a ** a - result) < 1e-3).all()
 
-    result = (a_gpu**a_gpu).get()
+    result = (a_gpu ** a_gpu).get()
     assert (np.abs(pow(a, a) - result) < 1e-3).all()
 
 
-
-
 @pytools.test.mark_test.opencl
 def test_pow_number(ctx_getter):
     context = ctx_getter()
     queue = cl.CommandQueue(context)
 
-    a = np.array([1,2,3,4,5,6,7,8,9,10]).astype(np.float32)
+    a = np.array([1, 2, 3, 4, 5, 6, 7, 8, 9, 10]).astype(np.float32)
     a_gpu = cl_array.to_device(queue, a)
 
     result = pow(a_gpu, 2).get()
-    assert (np.abs(a**2 - result) < 1e-3).all()
-
+    assert (np.abs(a ** 2 - result) < 1e-3).all()
 
 
 @pytools.test.mark_test.opencl
@@ -69,7 +62,7 @@ def test_abs(ctx_getter):
 
     res = a.get()
 
-    for i in range (111):
+    for i in range(111):
         assert abs(res[i]) >= 0
         assert res[i] == i
 
@@ -79,13 +72,11 @@ def test_len(ctx_getter):
     context = ctx_getter()
     queue = cl.CommandQueue(context)
 
-    a = np.array([1,2,3,4,5,6,7,8,9,10]).astype(np.float32)
+    a = np.array([1, 2, 3, 4, 5, 6, 7, 8, 9, 10]).astype(np.float32)
     a_cpu = cl_array.to_device(queue, a)
     assert len(a_cpu) == 10
 
 
-
-
 @pytools.test.mark_test.opencl
 def test_multiply(ctx_getter):
     """Test the muliplication of an array with a scalar. """
@@ -93,11 +84,9 @@ def test_multiply(ctx_getter):
     context = ctx_getter()
     queue = cl.CommandQueue(context)
 
-
     for sz in [10, 50000]:
         for dtype, scalars in [
             (np.float32, [2]),
-            #(np.complex64, [2, 2j])
             ]:
             for scalar in scalars:
                 a = np.arange(sz).astype(dtype)
@@ -106,6 +95,7 @@ def test_multiply(ctx_getter):
 
                 assert (a * scalar == a_doubled).all()
 
+
 @pytools.test.mark_test.opencl
 def test_multiply_array(ctx_getter):
     """Test the multiplication of two arrays."""
@@ -113,16 +103,14 @@ def test_multiply_array(ctx_getter):
     context = ctx_getter()
     queue = cl.CommandQueue(context)
 
-    a = np.array([1,2,3,4,5,6,7,8,9,10]).astype(np.float32)
+    a = np.array([1, 2, 3, 4, 5, 6, 7, 8, 9, 10]).astype(np.float32)
 
     a_gpu = cl_array.to_device(queue, a)
     b_gpu = cl_array.to_device(queue, a)
 
-    a_squared = (b_gpu*a_gpu).get()
-
-    assert (a*a == a_squared).all()
-
+    a_squared = (b_gpu * a_gpu).get()
 
+    assert (a * a == a_squared).all()
 
 
 @pytools.test.mark_test.opencl
@@ -132,13 +120,11 @@ def test_addition_array(ctx_getter):
     context = ctx_getter()
     queue = cl.CommandQueue(context)
 
-    a = np.array([1,2,3,4,5,6,7,8,9,10]).astype(np.float32)
+    a = np.array([1, 2, 3, 4, 5, 6, 7, 8, 9, 10]).astype(np.float32)
     a_gpu = cl_array.to_device(queue, a)
-    a_added = (a_gpu+a_gpu).get()
-
-    assert (a+a == a_added).all()
-
+    a_added = (a_gpu + a_gpu).get()
 
+    assert (a + a == a_added).all()
 
 
 @pytools.test.mark_test.opencl
@@ -148,21 +134,20 @@ def test_addition_scalar(ctx_getter):
     context = ctx_getter()
     queue = cl.CommandQueue(context)
 
-    a = np.array([1,2,3,4,5,6,7,8,9,10]).astype(np.float32)
+    a = np.array([1, 2, 3, 4, 5, 6, 7, 8, 9, 10]).astype(np.float32)
     a_gpu = cl_array.to_device(queue, a)
-    a_added = (7+a_gpu).get()
-
-    assert (7+a == a_added).all()
-
+    a_added = (7 + a_gpu).get()
 
+    assert (7 + a == a_added).all()
 
 
 @pytools.test.mark_test.opencl
 def test_substract_array(ctx_getter):
     """Test the substraction of two arrays."""
     #test data
-    a = np.array([1,2,3,4,5,6,7,8,9,10]).astype(np.float32)
-    b = np.array([10,20,30,40,50,60,70,80,90,100]).astype(np.float32)
+    a = np.array([1, 2, 3, 4, 5, 6, 7, 8, 9, 10]).astype(np.float32)
+    b = np.array([10, 20, 30, 40, 50,
+                  60, 70, 80, 90, 100]).astype(np.float32)
 
     context = ctx_getter()
     queue = cl.CommandQueue(context)
@@ -170,13 +155,11 @@ def test_substract_array(ctx_getter):
     a_gpu = cl_array.to_device(queue, a)
     b_gpu = cl_array.to_device(queue, b)
 
-    result = (a_gpu-b_gpu).get()
-    assert (a-b == result).all()
-
-    result = (b_gpu-a_gpu).get()
-    assert (b-a == result).all()
-
+    result = (a_gpu - b_gpu).get()
+    assert (a - b == result).all()
 
+    result = (b_gpu - a_gpu).get()
+    assert (b - a == result).all()
 
 
 @pytools.test.mark_test.opencl
@@ -187,18 +170,16 @@ def test_substract_scalar(ctx_getter):
     queue = cl.CommandQueue(context)
 
     #test data
-    a = np.array([1,2,3,4,5,6,7,8,9,10]).astype(np.float32)
+    a = np.array([1, 2, 3, 4, 5, 6, 7, 8, 9, 10]).astype(np.float32)
 
     #convert a to a gpu object
     a_gpu = cl_array.to_device(queue, a)
 
-    result = (a_gpu-7).get()
-    assert (a-7 == result).all()
-
-    result = (7-a_gpu).get()
-    assert (7-a == result).all()
-
+    result = (a_gpu - 7).get()
+    assert (a - 7 == result).all()
 
+    result = (7 - a_gpu).get()
+    assert (7 - a == result).all()
 
 
 @pytools.test.mark_test.opencl
@@ -208,16 +189,14 @@ def test_divide_scalar(ctx_getter):
     context = ctx_getter()
     queue = cl.CommandQueue(context)
 
-    a = np.array([1,2,3,4,5,6,7,8,9,10]).astype(np.float32)
+    a = np.array([1, 2, 3, 4, 5, 6, 7, 8, 9, 10]).astype(np.float32)
     a_gpu = cl_array.to_device(queue, a)
 
-    result = (a_gpu/2).get()
-    assert (a/2 == result).all()
-
-    result = (2/a_gpu).get()
-    assert (np.abs(2/a - result) < 1e-5).all()
-
+    result = (a_gpu / 2).get()
+    assert (a / 2 == result).all()
 
+    result = (2 / a_gpu).get()
+    assert (np.abs(2 / a - result) < 1e-5).all()
 
 
 @pytools.test.mark_test.opencl
@@ -228,19 +207,17 @@ def test_divide_array(ctx_getter):
     queue = cl.CommandQueue(context)
 
     #test data
-    a = np.array([10,20,30,40,50,60,70,80,90,100]).astype(np.float32)
-    b = np.array([10,10,10,10,10,10,10,10,10,10]).astype(np.float32)
+    a = np.array([10, 20, 30, 40, 50, 60, 70, 80, 90, 100]).astype(np.float32)
+    b = np.array([10, 10, 10, 10, 10, 10, 10, 10, 10, 10]).astype(np.float32)
 
     a_gpu = cl_array.to_device(queue, a)
     b_gpu = cl_array.to_device(queue, b)
 
-    a_divide = (a_gpu/b_gpu).get()
-    assert (np.abs(a/b - a_divide) < 1e-3).all()
-
-    a_divide = (b_gpu/a_gpu).get()
-    assert (np.abs(b/a - a_divide) < 1e-3).all()
-
+    a_divide = (a_gpu / b_gpu).get()
+    assert (np.abs(a / b - a_divide) < 1e-3).all()
 
+    a_divide = (b_gpu / a_gpu).get()
+    assert (np.abs(b / a - a_divide) < 1e-3).all()
 
 
 @pytools.test.mark_test.opencl
@@ -262,8 +239,6 @@ def test_random(ctx_getter):
         assert (a < 1).all()
 
 
-
-
 @pytools.test.mark_test.opencl
 def test_nan_arithmetic(ctx_getter):
     context = ctx_getter()
@@ -272,10 +247,8 @@ def test_nan_arithmetic(ctx_getter):
     def make_nan_contaminated_vector(size):
         shape = (size,)
         a = np.random.randn(*shape).astype(np.float32)
-        #for i in range(0, shape[0], 3):
-            #a[i] = float('nan')
         from random import randrange
-        for i in range(size//10):
+        for i in range(size // 10):
             a[randrange(0, size)] = float('nan')
         return a
 
@@ -286,14 +259,12 @@ def test_nan_arithmetic(ctx_getter):
     b = make_nan_contaminated_vector(size)
     b_gpu = cl_array.to_device(queue, b)
 
-    ab = a*b
-    ab_gpu = (a_gpu*b_gpu).get()
+    ab = a * b
+    ab_gpu = (a_gpu * b_gpu).get()
 
     assert (np.isnan(ab) == np.isnan(ab_gpu)).all()
 
 
-
-
 @pytools.test.mark_test.opencl
 def test_elwise_kernel(ctx_getter):
     context = ctx_getter()
@@ -313,9 +284,38 @@ def test_elwise_kernel(ctx_getter):
     c_gpu = cl_array.empty_like(a_gpu)
     lin_comb(5, a_gpu, 6, b_gpu, c_gpu)
 
-    assert la.norm((c_gpu - (5*a_gpu+6*b_gpu)).get()) < 1e-5
+    assert la.norm((c_gpu - (5 * a_gpu + 6 * b_gpu)).get()) < 1e-5
 
 
+@pytools.test.mark_test.opencl
+def test_elwise_kernel_with_options(ctx_getter):  # build options are honored
+    from pyopencl.clrandom import rand as clrand
+    from pyopencl.elementwise import ElementwiseKernel
+
+    context = ctx_getter()
+    queue = cl.CommandQueue(context)
+
+    in_gpu = clrand(context, queue, (50,), np.float32)
+
+    options = ['-DADD_ONE']  # defines ADD_ONE for the kernel's #ifdef below
+    add_one = ElementwiseKernel(
+        context,
+        "float* out, const float *in",
+        """
+        out[i] = in[i];
+        #ifdef ADD_ONE
+        out[i]++;
+        #endif
+        """,
+        options=options,
+        )
+
+    out_gpu = cl_array.empty_like(in_gpu)
+    add_one(out_gpu, in_gpu)
+
+    gt = in_gpu.get() + 1  # expected: input + 1, i.e. the ADD_ONE branch ran
+    gv = out_gpu.get()
+    assert la.norm(gv - gt) < 1e-5
 
 
 @pytools.test.mark_test.opencl
@@ -326,9 +326,7 @@ def test_take(ctx_getter):
     idx = cl_array.arange(queue, 0, 200000, 2, dtype=np.uint32)
     a = cl_array.arange(queue, 0, 600000, 3, dtype=np.float32)
     result = cl_array.take(a, idx)
-    assert ((3*idx).get() == result.get()).all()
-
-
+    assert ((3 * idx).get() == result.get()).all()
 
 
 @pytools.test.mark_test.opencl
@@ -341,8 +339,6 @@ def test_arange(ctx_getter):
     assert (np.arange(n, dtype=np.float32) == a.get()).all()
 
 
-
-
 @pytools.test.mark_test.opencl
 def test_reverse(ctx_getter):
     context = ctx_getter()
@@ -357,8 +353,6 @@ def test_reverse(ctx_getter):
     assert (a[::-1] == a_gpu.get()).all()
 
 
-
-
 @pytools.test.mark_test.opencl
 def test_sum(ctx_getter):
     context = ctx_getter()
@@ -372,9 +366,7 @@ def test_sum(ctx_getter):
     sum_a = np.sum(a)
     sum_a_gpu = cl_array.sum(a_gpu).get()
 
-    assert abs(sum_a_gpu-sum_a)/abs(sum_a) < 1e-4
-
-
+    assert abs(sum_a_gpu - sum_a) / abs(sum_a) < 1e-4
 
 
 @pytools.test.mark_test.opencl
@@ -400,8 +392,6 @@ def test_minmax(ctx_getter):
             assert op_a_gpu == op_a, (op_a_gpu, op_a, dtype, what)
 
 
-
-
 @pytools.test.mark_test.opencl
 def test_subset_minmax(ctx_getter):
     context = ctx_getter()
@@ -442,8 +432,6 @@ def test_subset_minmax(ctx_getter):
         assert min_a_gpu == min_a
 
 
-
-
 @pytools.test.mark_test.opencl
 def test_dot(ctx_getter):
     context = ctx_getter()
@@ -459,9 +447,7 @@ def test_dot(ctx_getter):
 
     dot_ab_gpu = cl_array.dot(a_gpu, b_gpu).get()
 
-    assert abs(dot_ab_gpu-dot_ab)/abs(dot_ab) < 1e-4
-
-
+    assert abs(dot_ab_gpu - dot_ab) / abs(dot_ab) < 1e-4
 
 
 if False:
@@ -481,7 +467,8 @@ if False:
             a_gpu_slice = a_gpu[start:end]
             a_slice = a[start:end]
 
-            assert la.norm(a_gpu_slice.get()-a_slice) == 0
+            assert la.norm(a_gpu_slice.get() - a_slice) == 0
+
 
 @pytools.test.mark_test.opencl
 def test_if_positive(ctx_getter):
@@ -502,8 +489,9 @@ def test_if_positive(ctx_getter):
     print(max_a_b_gpu)
     print(np.maximum(a, b))
 
-    assert la.norm(max_a_b_gpu.get()- np.maximum(a, b)) == 0
-    assert la.norm(min_a_b_gpu.get()- np.minimum(a, b)) == 0
+    assert la.norm(max_a_b_gpu.get() - np.maximum(a, b)) == 0
+    assert la.norm(min_a_b_gpu.get() - np.minimum(a, b)) == 0
+
 
 @pytools.test.mark_test.opencl
 def test_take_put(ctx_getter):
@@ -513,9 +501,9 @@ def test_take_put(ctx_getter):
     for n in [5, 17, 333]:
         one_field_size = 8
         buf_gpu = cl_array.zeros(queue,
-                n*one_field_size, dtype=np.float32)
+                n * one_field_size, dtype=np.float32)
         dest_indices = cl_array.to_device(queue,
-                np.array([ 0,  1,  2,  3, 32, 33, 34, 35], dtype=np.uint32))
+                np.array([0, 1, 2,  3, 32, 33, 34, 35], dtype=np.uint32))
         read_map = cl_array.to_device(queue,
                 np.array([7, 6, 5, 4, 3, 2, 1, 0], dtype=np.uint32))
 
@@ -523,9 +511,10 @@ def test_take_put(ctx_getter):
                 arrays=[buf_gpu for i in range(n)],
                 dest_indices=dest_indices,
                 src_indices=read_map,
-                src_offsets=[i*one_field_size for i in range(n)],
+                src_offsets=[i * one_field_size for i in range(n)],
                 dest_shape=(96,))
 
+
 @pytools.test.mark_test.opencl
 def test_astype(ctx_getter):
     context = ctx_getter()
@@ -550,9 +539,7 @@ def test_astype(ctx_getter):
     a2 = a_gpu.astype(np.float32).get()
 
     assert a2.dtype == np.float32
-    assert la.norm(a - a2)/la.norm(a) < 1e-7
-
-
+    assert la.norm(a - a2) / la.norm(a) < 1e-7
 
 
 @pytools.test.mark_test.opencl
@@ -567,13 +554,12 @@ def test_scan(ctx_getter):
         knl = cls(context, dtype, "a+b", "0")
 
         for n in [
-                10, 2**10-5, 2**10, 
-                2**20-2**18, 
-                2**20-2**18+5, 
-                2**10+5,
-                2**20+5,
-                2**20, 2**24
-                ]:
+            10, 2 ** 10 - 5, 2 ** 10,
+            2 ** 20 - 2 ** 18,
+            2 ** 20 - 2 ** 18 + 5,
+            2 ** 10 + 5,
+            2 ** 20 + 5,
+            2 ** 20, 2 ** 24]:
             host_data = np.random.randint(0, 10, n).astype(dtype)
             dev_data = cl_array.to_device(queue, host_data)
 
@@ -588,21 +574,17 @@ def test_scan(ctx_getter):
             collect()
 
 
-
-
 @pytools.test.mark_test.opencl
 def test_stride_preservation(ctx_getter):
     context = ctx_getter()
     queue = cl.CommandQueue(context)
 
-    A = np.random.rand(3,3)
+    A = np.random.rand(3, 3)
     AT = A.T
     print(AT.flags.f_contiguous, AT.flags.c_contiguous)
     AT_GPU = cl_array.to_device(queue, AT)
     print(AT_GPU.flags.f_contiguous, AT_GPU.flags.c_contiguous)
-    assert np.allclose(AT_GPU.get(),AT)
-
-
+    assert np.allclose(AT_GPU.get(), AT)
 
 
 @pytools.test.mark_test.opencl
@@ -618,10 +600,9 @@ def test_vector_fill(ctx_getter):
     a_gpu = cl_array.zeros(queue, 100, dtype=cl_array.vec.float4)
 
 
-
-
 if __name__ == "__main__":
-    # make sure that import failures get reported, instead of skipping the tests.
+    # make sure that import failures get reported, instead of skipping the
+    # tests.
     import pyopencl as cl
 
     import sys