diff --git a/examples/dump-performance.py b/examples/dump-performance.py
index 42946a791d7ef876885e7efcabb830b1d906e567..d8b00142b52afc49635f5f52f061701274abea4d 100644
--- a/examples/dump-performance.py
+++ b/examples/dump-performance.py
@@ -1,13 +1,9 @@
-from __future__ import division
-from __future__ import absolute_import
-from __future__ import print_function
+from __future__ import division, absolute_import, print_function
 import pyopencl as cl
 import pyopencl.characterize.performance as perf
 from six.moves import range
 
 
-
-
 def main():
     ctx = cl.create_some_context()
 
@@ -15,7 +11,8 @@ def main():
     print("command latency: %g s" % latency)
     print("profiling overhead: %g s -> %.1f %%" % (
             prof_overhead, 100*prof_overhead/latency))
-    queue = cl.CommandQueue(ctx, properties=cl.command_queue_properties.PROFILING_ENABLE)
+    queue = cl.CommandQueue(
+            ctx, properties=cl.command_queue_properties.PROFILING_ENABLE)
 
     print("empty kernel: %g s" % perf.get_empty_kernel_time(queue))
     print("float32 add: %g GOps/s" % (perf.get_add_rate(queue)/1e9))
@@ -29,13 +26,11 @@ def main():
         print("----------------------------------------")
 
         print("latency: %g s" % perf.transfer_latency(queue, tx_type))
-        for i in range(6, 28, 2):
-            bs = 1<<i
+        for i in range(6, 31, 2):
+            bs = 1 << i
             print("bandwidth @ %d bytes: %g GB/s" % (
                     bs, perf.transfer_bandwidth(queue, tx_type, bs)/1e9))
 
 
-
-
 if __name__ == "__main__":
     main()
diff --git a/pyopencl/characterize/performance.py b/pyopencl/characterize/performance.py
index d2cbf791ecde610c5a5e6fc69bad7d0dff37d213..f3f2d8947c6ada90356e228b23961a9fb369c375 100644
--- a/pyopencl/characterize/performance.py
+++ b/pyopencl/characterize/performance.py
@@ -1,7 +1,4 @@
-from __future__ import division
-from __future__ import absolute_import
-from __future__ import print_function
-from six.moves import range
+from __future__ import division, absolute_import, print_function
 
 __copyright__ = "Copyright (C) 2009 Andreas Kloeckner"
 
@@ -25,12 +22,11 @@ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
 THE SOFTWARE.
 """
 
+from six.moves import range
 import pyopencl as cl
 import numpy as np
 
 
-
-
 # {{{ timing helpers
 
 class Timer:
@@ -50,8 +46,6 @@ class Timer:
         pass
 
 
-
-
 class WallTimer(Timer):
     def start(self):
         from time import time
@@ -67,8 +61,6 @@ class WallTimer(Timer):
         return self.end-self.start
 
 
-
-
 def _get_time(queue, f, timer_factory=None, desired_duration=0.1,
         warmup_rounds=3):
 
@@ -106,8 +98,6 @@ def _get_time(queue, f, timer_factory=None, desired_duration=0.1,
 # }}}
 
 
-
-
 # {{{ transfer measurements
 
 class HostDeviceTransferBase(object):
@@ -116,32 +106,33 @@ class HostDeviceTransferBase(object):
         self.host_buf = np.empty(block_size, dtype=np.uint8)
         self.dev_buf = cl.Buffer(queue.context, cl.mem_flags.READ_WRITE, block_size)
 
+
 class HostToDeviceTransfer(HostDeviceTransferBase):
     def do(self):
         return cl.enqueue_copy(self. queue, self.dev_buf, self.host_buf)
 
+
 class DeviceToHostTransfer(HostDeviceTransferBase):
     def do(self):
         return cl.enqueue_copy(self. queue, self.host_buf, self.dev_buf)
 
+
 class DeviceToDeviceTransfer(object):
     def __init__(self, queue, block_size):
         self.queue = queue
-        self.dev_buf_1 = cl.Buffer(queue.context, cl.mem_flags.READ_WRITE, block_size)
-        self.dev_buf_2 = cl.Buffer(queue.context, cl.mem_flags.READ_WRITE, block_size)
+        mf = cl.mem_flags
+        self.dev_buf_1 = cl.Buffer(queue.context, mf.READ_WRITE, block_size)
+        self.dev_buf_2 = cl.Buffer(queue.context, mf.READ_WRITE, block_size)
 
     def do(self):
         return cl.enqueue_copy(self. queue, self.dev_buf_2, self.dev_buf_1)
 
-class HostToDeviceTransfer(HostDeviceTransferBase):
-    def do(self):
-        return cl.enqueue_copy(self. queue, self.dev_buf, self.host_buf)
-
 
 def transfer_latency(queue, transfer_type, timer_factory=None):
     transfer = transfer_type(queue, 1)
     return _get_time(queue, transfer.do, timer_factory=timer_factory)
 
+
 def transfer_bandwidth(queue, transfer_type, block_size, timer_factory=None):
     """Measures one-sided bandwidth."""
 
@@ -151,8 +142,6 @@ def transfer_bandwidth(queue, transfer_type, block_size, timer_factory=None):
 # }}}
 
 
-
-
 def get_profiling_overhead(ctx, timer_factory=None):
     no_prof_queue = cl.CommandQueue(ctx)
     transfer = DeviceToDeviceTransfer(no_prof_queue, 1)
@@ -165,6 +154,7 @@ def get_profiling_overhead(ctx, timer_factory=None):
 
     return prof_time - no_prof_time, prof_time
 
+
 def get_empty_kernel_time(queue, timer_factory=None):
     prg = cl.Program(queue.context, """
         __kernel void empty()
@@ -178,13 +168,16 @@ def get_empty_kernel_time(queue, timer_factory=None):
 
     return _get_time(queue, f, timer_factory=timer_factory)
 
-def _get_full_machine_kernel_rate(queue, src, args, name="benchmark", timer_factory=None):
+
+def _get_full_machine_kernel_rate(queue, src, args, name="benchmark",
+        timer_factory=None):
     prg = cl.Program(queue.context, src).build()
 
     knl = getattr(prg, name)
 
     dev = queue.device
     global_size = 4 * dev.max_compute_units
+
     def f():
         knl(queue, (global_size,), None, *args)
 
@@ -198,22 +191,22 @@ def _get_full_machine_kernel_rate(queue, src, args, name="benchmark", timer_fact
 
         keep_trying = not rates
 
-        if rates and rate > 1.05*max(rates): # big improvement
+        if rates and rate > 1.05*max(rates):  # big improvement
             keep_trying = True
             num_dips = 0
 
-        if rates and rate < 0.9*max(rates) and num_dips < 3: # big dip
+        if rates and rate < 0.9*max(rates) and num_dips < 3:  # big dip
             keep_trying = True
             num_dips += 1
 
         if keep_trying:
             global_size *= 2
-            last_rate = rate
             rates.append(rate)
         else:
             rates.append(rate)
             return max(rates)
 
+
 def get_add_rate(queue, type="float", timer_factory=None):
     return 50*10*_get_full_machine_kernel_rate(queue, """
         typedef %(op_t)s op_t;
@@ -244,6 +237,4 @@ def get_add_rate(queue, type="float", timer_factory=None):
         """ % dict(op_t=type), ())
 
 
-
-
 # vim: foldmethod=marker:filetype=pyopencl