diff --git a/.gitmodules b/.gitmodules
index 7014cb83ae9044e411f9249054c406ed86175a2d..2489c1c03910d3c9a60fff87c97799e850c08cd0 100644
--- a/.gitmodules
+++ b/.gitmodules
@@ -1,6 +1,6 @@
 [submodule "bpl-subset"]
 	path = bpl-subset
-	url = git://github.com/inducer/bpl-subset
+	url = https://github.com/inducer/bpl-subset
 [submodule "pyopencl/compyte"]
 	path = pyopencl/compyte
-	url = git://github.com/inducer/compyte
+	url = https://github.com/inducer/compyte
diff --git a/README.rst b/README.rst
index 0b1b9528206cf64089f390b6ea12ba57b886060a..33e52774ebf6512f7d179bc3475857addb5e93aa 100644
--- a/README.rst
+++ b/README.rst
@@ -32,13 +32,13 @@ To use PyOpenCL, you just need `numpy <http://numpy.org>`_ and an OpenCL
 implementation.
 (See this `howto <http://wiki.tiker.net/OpenCLHowTo>`_ for how to get one.)
 
-Web resources for PyOpenCL:
+Places on the web related to PyOpenCL:
 
-* Python package index (download releases)
+* `Python package index <http://pypi.python.org/pypi/pyopencl>`_ (download releases)
 
   .. image:: https://badge.fury.io/py/pyopencl.png
       :target: http://pypi.python.org/pypi/pyopencl
 * `C. Gohlke's Windows binaries <http://www.lfd.uci.edu/~gohlke/pythonlibs/#pyopencl>`_ (download Windows binaries)
 * `Github <http://github.com/pyopencl/pyopencl>`_ (get latest source code, file bugs)
-* `Documetnation <http://documen.tician.de>`_ (get latest source code, file bugs)
-* `Wiki <http://wiki.tiker.net/PyOpenCL>`_ (get latest source code, file bugs)
+* `Documentation <http://documen.tician.de>`_ (read how things work)
+* `Wiki <http://wiki.tiker.net/PyOpenCL>`_ (read installation tips, get examples, read FAQ)
diff --git a/contrib/fortran-to-opencl/translate.py b/contrib/fortran-to-opencl/translate.py
index 24a08dcac9be5b7eab2d5e57319cf7f7701b478f..d373888a36bec0e27a7a87f745efaaffe43864ac 100644
--- a/contrib/fortran-to-opencl/translate.py
+++ b/contrib/fortran-to-opencl/translate.py
@@ -33,14 +33,12 @@ from pymbolic.mapper.c_code import CCodeMapper as CCodeMapperBase
 from warnings import warn
 
 import pytools.lex
-import re
-
-
 
 
 class TranslatorWarning(UserWarning):
     pass
 
+
 class TranslationError(RuntimeError):
     pass
 
@@ -77,9 +75,7 @@ def dtype_to_ctype(dtype):
     elif dtype == np.complex128:
         return "cdouble_t"
     else:
-        raise ValueError, "unable to map dtype '%s'" % dtype
-
-
+        raise ValueError("unable to map dtype '%s'" % dtype)
 
 
 class POD(cgen.POD):
@@ -89,8 +85,6 @@ class POD(cgen.POD):
 # }}}
 
 
-
-
 # {{{ expression parser
 
 _less_than = intern("less_than")
@@ -104,6 +98,7 @@ _not = intern("not")
 _and = intern("and")
 _or = intern("or")
 
+
 class TypedLiteral(pymbolic.primitives.Leaf):
     def __init__(self, value, dtype):
         self.value = value
@@ -114,21 +109,22 @@ class TypedLiteral(pymbolic.primitives.Leaf):
 
     mapper_method = intern("map_literal")
 
+
 class FortranExpressionParser(ExpressionParserBase):
     # FIXME double/single prec literals
 
     lex_table = [
-            (_less_than, pytools.lex.RE(r"\.lt\.", re.I)),
-            (_greater_than, pytools.lex.RE(r"\.gt\.", re.I)),
-            (_less_equal, pytools.lex.RE(r"\.le\.", re.I)),
-            (_greater_equal, pytools.lex.RE(r"\.ge\.", re.I)),
-            (_equal, pytools.lex.RE(r"\.eq\.", re.I)),
-            (_not_equal, pytools.lex.RE(r"\.ne\.", re.I)),
-
-            (_not, pytools.lex.RE(r"\.not\.", re.I)),
-            (_and, pytools.lex.RE(r"\.and\.", re.I)),
-            (_or, pytools.lex.RE(r"\.or\.", re.I)),
-            ] + ExpressionParserBase.lex_table
+        (_less_than, pytools.lex.RE(r"\.lt\.", re.I)),
+        (_greater_than, pytools.lex.RE(r"\.gt\.", re.I)),
+        (_less_equal, pytools.lex.RE(r"\.le\.", re.I)),
+        (_greater_equal, pytools.lex.RE(r"\.ge\.", re.I)),
+        (_equal, pytools.lex.RE(r"\.eq\.", re.I)),
+        (_not_equal, pytools.lex.RE(r"\.ne\.", re.I)),
+
+        (_not, pytools.lex.RE(r"\.not\.", re.I)),
+        (_and, pytools.lex.RE(r"\.and\.", re.I)),
+        (_or, pytools.lex.RE(r"\.or\.", re.I)),
+        ] + ExpressionParserBase.lex_table
 
     def __init__(self, tree_walker):
         self.tree_walker = tree_walker
@@ -140,7 +136,7 @@ class FortranExpressionParser(ExpressionParserBase):
 
         from pymbolic.primitives import Subscript, Call, Variable
         from pymbolic.parser import (
-                _identifier, _openpar, _closepar, _float)
+            _identifier, _openpar, _closepar, _float)
 
         next_tag = pstate.next_tag()
         if next_tag is _float:
@@ -266,6 +262,7 @@ class FortranExpressionParser(ExpressionParserBase):
 
 # }}}
 
+
 # {{{ expression generator
 
 class TypeInferenceMapper(CombineMapper):
@@ -302,8 +299,6 @@ class TypeInferenceMapper(CombineMapper):
             return CombineMapper.map_call(self, expr)
 
 
-
-
 class ComplexCCodeMapper(CCodeMapperBase):
     def __init__(self, infer_type):
         CCodeMapperBase.__init__(self)
@@ -436,8 +431,6 @@ class ComplexCCodeMapper(CCodeMapperBase):
         return CCodeMapperBase.map_power(self, expr, enclosing_prec)
 
 
-
-
 class CCodeMapper(ComplexCCodeMapper):
     # Whatever is needed to mop up after Fortran goes here.
     # Stuff that deals with generating real-valued code
@@ -596,8 +589,6 @@ class Scope(object):
             return name
 
 
-
-
 class FTreeWalkerBase(object):
     def __init__(self):
         self.scope_stack = []
@@ -633,7 +624,7 @@ class FTreeWalkerBase(object):
 
             assert 1 <= len(start_end) <= 2
 
-            return (self.parse_expr(s) for s in start_end)
+            return tuple(self.parse_expr(s) for s in start_end)
 
         for decl in dim_decls:
             entity_match = self.ENTITY_RE.match(decl)
@@ -661,7 +652,6 @@ class FTreeWalkerBase(object):
     # }}}
 
 
-
 class ArgumentAnalayzer(FTreeWalkerBase):
     def __init__(self):
         FTreeWalkerBase.__init__(self)
@@ -768,7 +758,6 @@ class ArgumentAnalayzer(FTreeWalkerBase):
 
         lhs = self.parse_expr(node.variable)
 
-
         from pymbolic.primitives import Subscript, Call
         if isinstance(lhs, Subscript):
             lhs_name = lhs.aggregate.name
@@ -869,9 +858,6 @@ class ArgumentAnalayzer(FTreeWalkerBase):
     # }}}
 
 
-
-
-
 # {{{ translator
 
 class F2CLTranslator(FTreeWalkerBase):
@@ -892,7 +878,6 @@ class F2CLTranslator(FTreeWalkerBase):
         scope = self.scope_stack[-1]
         return POD(scope.get_type(name), name)
 
-
     def get_declarations(self):
         scope = self.scope_stack[-1]
 
@@ -1003,8 +988,7 @@ class F2CLTranslator(FTreeWalkerBase):
 
             return decl
 
-
-        result =  cgen.FunctionBody(
+        result = cgen.FunctionBody(
                 cgen.FunctionDeclaration(
                     cgen.Value("void", node.name),
                     [get_arg_decl(i, arg) for i, arg in enumerate(node.args)]
@@ -1196,7 +1180,7 @@ class F2CLTranslator(FTreeWalkerBase):
 
         return cgen.Statement("%s(%s)" % (
             node.designator,
-            ", ".join(transform_arg(i, arg_str) 
+            ", ".join(transform_arg(i, arg_str)
                 for i, arg_str in enumerate(node.items))))
 
     def map_Return(self, node):
@@ -1278,7 +1262,8 @@ class F2CLTranslator(FTreeWalkerBase):
 
             if not isinstance(step, int):
                 print type(step)
-                raise TranslationError("non-constant steps not yet supported: %s" % step)
+                raise TranslationError(
+                        "non-constant steps not yet supported: %s" % step)
 
             if step < 0:
                 comp_op = ">="
@@ -1327,10 +1312,6 @@ class F2CLTranslator(FTreeWalkerBase):
 # }}}
 
 
-
-
-
-
 def f2cl(source, free_form=False, strict=True,
         addr_space_hints={}, force_casts={},
         do_arg_analysis=True,
@@ -1369,15 +1350,11 @@ def f2cl(source, free_form=False, strict=True,
     return str_mod
 
 
-
-
 def f2cl_files(source_file, target_file, **kwargs):
     mod = f2cl(open(source_file).read(), **kwargs)
     open(target_file, "w").write(mod)
 
 
-
-
 if __name__ == "__main__":
     from cgen.opencl import CLConstant
 
diff --git a/doc/_static/akdoc.css b/doc/_static/akdoc.css
index 1a1ef7a64fe74e3c1eda3d976c5acd38f2a978e2..d8b61e3ff7a358e5d5c0f132b5040ec7c3f43e42 100644
--- a/doc/_static/akdoc.css
+++ b/doc/_static/akdoc.css
@@ -23,17 +23,17 @@ code {
 }
 
 h1 {
-  padding-bottom:5px;
+  padding-bottom:7px;
   border-bottom: 1px solid #ccc;
 }
 
 h2 {
-  padding-bottom:1px;
+  padding-bottom:5px;
   border-bottom: 1px solid #ccc;
 }
 
 h3 {
-  padding-bottom:1px;
+  padding-bottom:5px;
   border-bottom: 1px solid #ccc;
 }
 
diff --git a/doc/algorithm.rst b/doc/algorithm.rst
index 25396c4c55163c8c85eabe3d9de2fbf540fcdd33..6eccc2f86683d39bd0bf719b414e3befdef987da 100644
--- a/doc/algorithm.rst
+++ b/doc/algorithm.rst
@@ -119,8 +119,9 @@ Prefix Sums ("scan")
 .. module:: pyopencl.scan
 
 .. |scan_extra_args| replace:: a list of tuples *(name, value)* specifying
-    extra arguments to pass to the scan procedure. *value* must be :mod:`numpy`
-    sized type.
+    extra arguments to pass to the scan procedure. For version 2013.1,
+    *value* must be of a :mod:`numpy` sized scalar type. As of version 2013.2,
+    *value* may also be a :class:`pyopencl.array.Array`.
 .. |preamble| replace:: A snippet of C that is inserted into the compiled kernel
     before the actual kernel function. May be used for, e.g. type definitions
     or include statements.
@@ -231,7 +232,7 @@ Simple / Legacy Interface
     an associative binary operation. *neutral* is the neutral element
     of *scan_expr*, obeying *scan_expr(a, neutral) == a*.
 
-    *dtype* specifies the type of the arrays being operated on. 
+    *dtype* specifies the type of the arrays being operated on.
     *name_prefix* is used for kernel names to ensure recognizability
     in profiles and logs. *options* is a list of compiler options to use
     when building. *preamble* specifies a string of code that is
diff --git a/doc/array.rst b/doc/array.rst
index 0f3252f64326806ae3716e53e8f0b77cf52f85ea..25d1a421d5f0f1f2f58298de5d81a66d35242bdf 100644
--- a/doc/array.rst
+++ b/doc/array.rst
@@ -99,9 +99,11 @@ autodetected.
 Under the hood, the complex types are simply `float2` and `double2`.
 
 .. warning::
-    Note that addition (real + complex) and multiplication (complex*complex)
-    are defined for e.g. `float2`, but yield wrong results, so that you need to
-    use the corresponding functions.
+    Note that, at the OpenCL source code level, addition (real + complex) and
+    multiplication (complex*complex) are defined for e.g. `float2`, but yield
+    wrong results, so that you need to use the corresponding functions.
+    (The :class:`Array` type implements complex arithmetic as you remember it,
+    without any idiotic quirks like this.)
 
 .. versionadded:: 2012.1
 
diff --git a/doc/conf.py b/doc/conf.py
index 093eeaddfa9505e42ab0e219157e2c8160ee87b8..eb9ba09b61a209216fa9c6d3c09c083199fc9937 100644
--- a/doc/conf.py
+++ b/doc/conf.py
@@ -101,7 +101,6 @@ else:
     # documentation.
     html_theme_options = {
             "navbar_fixed_top": "true",
-            "navbar_class": "navbar navbar-inverse",
             "navbar_site_name": "Contents",
             'bootstrap_version': '3',
             'source_link_position': 'footer',
@@ -155,7 +154,7 @@ html_copy_source = False
 #html_file_suffix = ''
 
 # Output file base name for HTML help builder.
-htmlhelp_basename = 'PyCudadoc'
+htmlhelp_basename = 'PyOpenClDoc'
 
 
 # Options for LaTeX output
diff --git a/pyopencl/algorithm.py b/pyopencl/algorithm.py
index e34bb018029e6f75b81e38af916a159ac1e553ea..153c341f504c83e36327d05a0986484fc31199b2 100644
--- a/pyopencl/algorithm.py
+++ b/pyopencl/algorithm.py
@@ -49,6 +49,24 @@ _copy_if_template = ScanTemplate(
         template_processor="printf")
 
 
+def extract_extra_args_types_values(extra_args):
+    from pyopencl.tools import VectorArg, ScalarArg
+
+    extra_args_types = []  # one VectorArg/ScalarArg descriptor per entry
+    extra_args_values = []  # the values themselves, in the same order
+    for name, val in extra_args:  # each entry is a (name, value) pair
+        if isinstance(val, cl.array.Array):
+            extra_args_types.append(VectorArg(val.dtype, name, with_offset=False))
+            extra_args_values.append(val)
+        elif isinstance(val, np.generic):
+            extra_args_types.append(ScalarArg(val.dtype, name))
+            extra_args_values.append(val)
+        else:
+            raise RuntimeError("argument '%s' not understood" % name)
+
+    return tuple(extra_args_types), extra_args_values  # (types, values)
+
+
 def copy_if(ary, predicate, extra_args=[], preamble="", queue=None, wait_for=None):
     """Copy the elements of *ary* satisfying *predicate* to an output array.
 
@@ -61,7 +79,9 @@ def copy_if(ary, predicate, extra_args=[], preamble="", queue=None, wait_for=Non
     :returns: a tuple *(out, count, event)* where *out* is the output array, *count*
         is an on-device scalar (fetch to host with `count.get()`) indicating
         how many elements satisfied *predicate*, and *event* is a
-        :class:`pyopencl.Event` for dependency management.
+        :class:`pyopencl.Event` for dependency management. *out* is allocated
+        to the same length as *ary*, but only the first *count* entries carry
+        meaning.
 
     .. versionadded:: 2013.1
     """
@@ -70,8 +90,7 @@ def copy_if(ary, predicate, extra_args=[], preamble="", queue=None, wait_for=Non
     else:
         scan_dtype = np.int32
 
-    extra_args_types = tuple((val.dtype, name) for name, val in extra_args)
-    extra_args_values = tuple(val for name, val in extra_args)
+    extra_args_types, extra_args_values = extract_extra_args_types_values(extra_args)
 
     knl = _copy_if_template.build(ary.context,
             type_aliases=(("scan_t", scan_dtype), ("item_t", ary.dtype)),
@@ -153,8 +172,7 @@ def partition(ary, predicate, extra_args=[], preamble="", queue=None, wait_for=N
     else:
         scan_dtype = np.uint32
 
-    extra_args_types = tuple((val.dtype, name) for name, val in extra_args)
-    extra_args_values = tuple(val for name, val in extra_args)
+    extra_args_types, extra_args_values = extract_extra_args_types_values(extra_args)
 
     knl = _partition_template.build(
             ary.context,
@@ -222,8 +240,7 @@ def unique(ary, is_equal_expr="a == b", extra_args=[], preamble="",
     else:
         scan_dtype = np.uint32
 
-    extra_args_types = tuple((val.dtype, name) for name, val in extra_args)
-    extra_args_values = tuple(val for name, val in extra_args)
+    extra_args_types, extra_args_values = extract_extra_args_types_values(extra_args)
 
     knl = _unique_template.build(
             ary.context,
@@ -793,7 +810,7 @@ class ListOfListsBuilder:
     def do_not_vectorize(self):
         from pytools import any
         return (self.complex_kernel
-                and any(dev.type == cl.device_type.CPU
+                and any(dev.type & cl.device_type.CPU
                     for dev in self.context.devices))
 
     @memoize_method
@@ -960,6 +977,9 @@ class ListOfListsBuilder:
         result = {}
         count_list_args = []
 
+        if wait_for is None:
+            wait_for = []
+
         count_kernel = self.get_count_kernel(index_dtype)
         write_kernel = self.get_write_kernel(index_dtype)
         scan_kernel = self.get_scan_kernel(index_dtype)
@@ -972,6 +992,8 @@ class ListOfListsBuilder:
 
             counts = cl.array.empty(queue,
                     (n_objects + 1), index_dtype, allocator=allocator)
+            counts[-1] = 0
+            wait_for = wait_for + counts.events
 
             # The scan will turn the "counts" array into the "starts" array
             # in-place.
@@ -1004,19 +1026,14 @@ class ListOfListsBuilder:
 
             info_record = result[name]
             starts_ary = info_record.starts
-            evt = scan_kernel(starts_ary, wait_for=[count_event])
+            evt = scan_kernel(starts_ary, wait_for=[count_event],
+                    size=n_objects)
 
-            # set first entry to zero
-            evt = cl.enqueue_copy(queue, starts_ary.data, index_dtype.type(0),
-                    wait_for=[evt])
-            scan_events.append(evt)
+            starts_ary.setitem(0, 0, queue=queue, wait_for=[evt])
+            scan_events.extend(starts_ary.events)
 
             # retrieve count
-            count = np.array(1, index_dtype)
-            cl.enqueue_copy(queue, count, starts_ary.data,
-                    device_offset=index_dtype.itemsize*n_objects)
-
-            info_record.count = int(count)
+            info_record.count = int(starts_ary[-1].get())
 
         # }}}
 
diff --git a/pyopencl/array.py b/pyopencl/array.py
index 61ad4f83ace8a4fdf8f0546451fac3fa622ed2f6..2139c8be38cf44139bed800c47558125d5f504e4 100644
--- a/pyopencl/array.py
+++ b/pyopencl/array.py
@@ -1154,6 +1154,12 @@ class Array(object):
 
     # }}}
 
+    def finish(self):
+        # undoc
+        if self.events:  # skip the wait call when no events are recorded
+            cl.wait_for_events(self.events)
+            del self.events[:]  # clear in place so the same list object is kept
+
     # {{{ views
 
     def reshape(self, *shape, **kwargs):
@@ -1214,12 +1220,6 @@ class Array(object):
 
     # }}}
 
-    def finish(self):
-        # undoc
-        if self.events:
-            cl.wait_for_events(self.events)
-            del self.events[:]
-
     def map_to_host(self, queue=None, flags=None, is_blocking=True, wait_for=None):
         """If *is_blocking*, return a :class:`numpy.ndarray` corresponding to the
         same memory as *self*.
@@ -1344,11 +1344,15 @@ class Array(object):
                 shape=tuple(new_shape),
                 strides=tuple(new_strides))
 
-    def setitem(self, subscript, value, queue=None):
+    def setitem(self, subscript, value, queue=None, wait_for=None):
         """Like :meth:`__setitem__`, but with the ability to specify
-        a *queue* for execution.
+        a *queue* and *wait_for*.
 
         .. versionadded:: 2013.1
+
+        .. versionchanged:: 2013.2
+
+            Added *wait_for*.
         """
 
         if isinstance(subscript, Array):
@@ -1362,7 +1366,8 @@ class Array(object):
                 raise NotImplementedError(
                         "fancy indexing into a multi-d array is supported")
 
-            multi_put([value], subscript, out=[self], queue=self.queue)
+            multi_put([value], subscript, out=[self], queue=self.queue,
+                    wait_for=wait_for)
             return
 
         queue = queue or self.queue or value.queue
@@ -1373,7 +1378,7 @@ class Array(object):
             if subarray.shape == value.shape and subarray.strides == value.strides:
                 self.events.append(
                         cl.enqueue_copy(queue, subarray.base_data,
-                            value, device_offset=subarray.offset))
+                            value, device_offset=subarray.offset, wait_for=wait_for))
                 return
             else:
                 value = to_device(queue, value, self.allocator)
@@ -1389,11 +1394,11 @@ class Array(object):
                 raise ValueError("cannot assign between arrays of "
                         "differing strides")
 
-            self._copy(subarray, value, queue=queue)
+            self._copy(subarray, value, queue=queue, wait_for=wait_for)
 
         else:
             # Let's assume it's a scalar
-            subarray.fill(value, queue=queue)
+            subarray.fill(value, queue=queue, wait_for=wait_for)
 
     def __setitem__(self, subscript, value):
         """Set the slice of *self* identified *subscript* to *value*.
@@ -1750,7 +1755,8 @@ def multi_take_put(arrays, dest_indices, src_indices, dest_shape=None,
     return out
 
 
-def multi_put(arrays, dest_indices, dest_shape=None, out=None, queue=None):
+def multi_put(arrays, dest_indices, dest_shape=None, out=None, queue=None,
+        wait_for=None):
     if not len(arrays):
         return []
 
@@ -1797,7 +1803,7 @@ def multi_put(arrays, dest_indices, dest_shape=None, out=None, queue=None):
                     queue.device))
 
         from pytools import flatten
-        knl(queue, gs, ls,
+        evt = knl(queue, gs, ls,
                 *(
                     list(flatten(
                         (o.base_data, o.offset)
@@ -1806,7 +1812,13 @@ def multi_put(arrays, dest_indices, dest_shape=None, out=None, queue=None):
                     + list(flatten(
                         (i.base_data, i.offset)
                         for i in arrays[chunk_slice]))
-                    + [dest_indices.size]))
+                    + [dest_indices.size]),
+                **dict(wait_for=wait_for))
+
+        # FIXME should wait on incoming events
+
+        for o in out[chunk_slice]:
+            o.events.append(evt)
 
     return out
 
diff --git a/pyopencl/characterize/__init__.py b/pyopencl/characterize/__init__.py
index 8fd33a86942d7991cae0951156c46c11809a986e..0a6aed652b9b3a67613e7be349d422f9fb6faa01 100644
--- a/pyopencl/characterize/__init__.py
+++ b/pyopencl/characterize/__init__.py
@@ -261,8 +261,11 @@ def get_fast_inaccurate_build_options(dev):
     """Return a list of flags valid on device *dev* that enable fast, but
     potentially inaccurate floating point math.
     """
-    return ["-cl-mad-enable", "-cl-fast-relaxed-math",
-        "-cl-no-signed-zeros", "-cl-strict-aliasing"]
+    result = ["-cl-mad-enable", "-cl-fast-relaxed-math",
+        "-cl-no-signed-zeros", ]
+    if dev.vendor.startswith("Advanced Micro") or dev.vendor.startswith("NVIDIA"):
+        result.append("-cl-strict-aliasing")
+    return result
 
 
 def get_simd_group_size(dev, type_size):
diff --git a/src/wrapper/wrap_cl.hpp b/src/wrapper/wrap_cl.hpp
index ea1cea822cf3efaf2ff8ed73f5febb8db3fed7a0..38667949518d76d3a1ffb2d2a258234ea8a92d47 100644
--- a/src/wrapper/wrap_cl.hpp
+++ b/src/wrapper/wrap_cl.hpp
@@ -1725,7 +1725,13 @@ namespace pyopencl
         PYOPENCL_CALL_GUARDED(clGetMemObjectInfo,
             (data(), CL_MEM_FLAGS, sizeof(my_flags), &my_flags, 0));
 
-        return get_sub_region(start, end, my_flags);
+        my_flags &= ~CL_MEM_COPY_HOST_PTR;
+
+        if (end <= start)
+          throw pyopencl::error("Buffer.__getitem__", CL_INVALID_VALUE,
+              "Buffer slice have end > start");
+
+        return get_sub_region(start, end-start, my_flags);
       }
 #endif
   };
diff --git a/test/test_clmath.py b/test/test_clmath.py
index 190dd6c43acc0184d955536c15ce5318ac105346..5778860b260eb25f488d00507fd11adf9bbc14e0 100644
--- a/test/test_clmath.py
+++ b/test/test_clmath.py
@@ -24,9 +24,10 @@ THE SOFTWARE.
 import math
 import numpy as np
 
+
 def have_cl():
     try:
-        import pyopencl
+        import pyopencl  # noqa
         return True
     except:
         return False
@@ -35,12 +36,20 @@ if have_cl():
     import pyopencl.array as cl_array
     import pyopencl as cl
     import pyopencl.clmath as clmath
-    from pyopencl.tools import pytest_generate_tests_for_pyopencl \
-            as pytest_generate_tests
+    from pyopencl.tools import (  # noqa
+            pytest_generate_tests_for_pyopencl
+            as pytest_generate_tests)
     from pyopencl.characterize import has_double_support
 
+try:
+    import faulthandler
+except ImportError:
+    pass
+else:
+    faulthandler.enable()
+
 
-sizes = [10, 128, 1<<10, 1<<11, 1<<13]
+sizes = [10, 128, 1 << 10, 1 << 11, 1 << 13]
 
 
 numpy_func_names = {
@@ -95,8 +104,6 @@ def make_unary_function_test(name, limits=(0, 1), threshold=0, use_complex=False
     return test
 
 
-
-
 if have_cl():
     test_ceil = make_unary_function_test("ceil", (-10, 10))
     test_floor = make_unary_function_test("ceil", (-10, 10))
@@ -119,8 +126,6 @@ if have_cl():
     test_tanh = make_unary_function_test("tanh", (-3, 3), 2e-6, use_complex=True)
 
 
-
-
 def test_fmod(ctx_factory):
     context = ctx_factory()
     queue = cl.CommandQueue(context)
@@ -137,6 +142,7 @@ def test_fmod(ctx_factory):
         for i in range(s):
             assert math.fmod(a[i], a2[i]) == b[i]
 
+
 def test_ldexp(ctx_factory):
     context = ctx_factory()
     queue = cl.CommandQueue(context)
@@ -144,7 +150,7 @@ def test_ldexp(ctx_factory):
     for s in sizes:
         a = cl_array.arange(queue, s, dtype=np.float32)
         a2 = cl_array.arange(queue, s, dtype=np.float32)*1e-3
-        b = clmath.ldexp(a,a2)
+        b = clmath.ldexp(a, a2)
 
         a = a.get()
         a2 = a2.get()
@@ -153,6 +159,7 @@ def test_ldexp(ctx_factory):
         for i in range(s):
             assert math.ldexp(a[i], int(a2[i])) == b[i]
 
+
 def test_modf(ctx_factory):
     context = ctx_factory()
     queue = cl.CommandQueue(context)
@@ -171,6 +178,7 @@ def test_modf(ctx_factory):
             assert intpart_true == intpart[i]
             assert abs(fracpart_true - fracpart[i]) < 1e-4
 
+
 def test_frexp(ctx_factory):
     context = ctx_factory()
     queue = cl.CommandQueue(context)
@@ -189,6 +197,7 @@ def test_frexp(ctx_factory):
             assert sig_true == significands[i]
             assert ex_true == exponents[i]
 
+
 def test_bessel(ctx_factory):
     try:
         import scipy.special as spec
@@ -196,7 +205,6 @@ def test_bessel(ctx_factory):
         from pytest import skip
         skip("scipy not present--cannot test Bessel function")
 
-
     ctx = ctx_factory()
     queue = cl.CommandQueue(ctx)
 
@@ -275,13 +283,11 @@ def test_bessel(ctx_factory):
 
                 pt.loglog(a, np.abs(cl_bessel-scipy_bessel), label="vs scipy")
                 if use_pyfmmlib:
-                    pt.loglog(a, np.abs(cl_bessel-hk_bessel), label="vs pyfmmlib")
+                    pt.loglog(a, np.abs(cl_bessel-pyfmm_bessel), label="vs pyfmmlib")
                 pt.legend()
                 pt.show()
 
 
-
-
 if __name__ == "__main__":
     import sys
     if len(sys.argv) > 1: